├── README.md ├── .gitignore ├── submission.ipynb ├── [Analyse] LB Detect.ipynb ├── [Model] MPCNN.ipynb ├── [Feature] V1.ipynb ├── [Model] TextRNN.ipynb ├── [Model] TextRCNN.ipynb ├── [Model] CNN.ipynb ├── [Model] Multi LSTM CNN v0 word.ipynb ├── [Model] TextCNN.ipynb ├── [Model] Multi LSTM CNN v4.ipynb ├── [Model] Multi LSTM CNN v2.ipynb └── [Model] Multi LSTM CNN v5.ipynb /README.md: -------------------------------------------------------------------------------- 1 | # paipaidai_comp 2 | 3 | [The 3rd PPDai Magic Mirror Cup (魔镜杯) competition](https://ai.ppdai.com/mirror/goToMirrorDetail?mirrorId=1): real data from an intelligent customer-service chatbot, used to improve the recognition ability and service quality of intelligent customer service 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Python 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # Jupyter Notebook 6 | .ipynb_checkpoints 7 | 8 | # folder 9 | data/ 10 | log/ 11 | result/ 12 | models/ -------------------------------------------------------------------------------- /submission.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import glob\n", 12 | "import pandas as pd" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 2, 18 | "metadata": { 19 | "collapsed": true 20 | }, 21 | "outputs": [], 22 | "source": [ 23 | "word_pred = pd.read_csv(\"./result/20180715-101519-Multi_LSTM_CNN_v3_word_best.csv\")\n", 24 | "char_pred = pd.read_csv(\"./result/20180715-132714-Multi_LSTM_CNN_v5_char_best.csv\")\n", 25 | "final_pred = (word_pred + char_pred) / 2\n", 26 | "final_pred.to_csv(\"./result/prediction_v5.csv\", index=False)" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": { 33 | "collapsed": true 34 | }, 35 | "outputs": [], 36 | "source": [] 37 | } 38 | ], 39 | "metadata": { 40 | "kernelspec": { 41 | "display_name": "Python 3", 42 | "language": "python", 43 | "name": "python3" 44 | }, 45 | "language_info": { 46 | "codemirror_mode": { 47 | "name": "ipython", 48 | "version": 3 49 | }, 50 | "file_extension": ".py", 51 | "mimetype": "text/x-python", 52 | "name": "python", 53 | "nbconvert_exporter": "python", 54 | "pygments_lexer": "ipython3", 55 | "version": "3.6.2" 56 | } 57 | }, 58 | "nbformat": 4, 59 | "nbformat_minor": 2 60 | } 61 | -------------------------------------------------------------------------------- /[Analyse] LB Detect.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np\n", 12 | "import pandas as pd" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 2, 18 | "metadata": { 19 | "collapsed": true 20 | }, 21 | "outputs": [], 22 | "source": [ 23 | "DATA_PATH = \"./ppd_data/\"\n", 24 | "TRAIN_PATH = DATA_PATH + \"train.csv\"\n", 25 | "TEST_PATH = DATA_PATH + \"test.csv\"" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 3, 31 | "metadata": { 32 | "collapsed": true 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "train_data = pd.read_csv(TRAIN_PATH)\n", 37 | "test_data = pd.read_csv(TEST_PATH)" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 4, 43 | "metadata": {}, 44 | "outputs": [ 45 | { 46 | "data":
{ 47 | "text/plain": [ 48 | "(254386, 172956)" 49 | ] 50 | }, 51 | "execution_count": 4, 52 | "metadata": {}, 53 | "output_type": "execute_result" 54 | } 55 | ], 56 | "source": [ 57 | "num_train, num_test = len(train_data), len(test_data)\n", 58 | "num_train, num_test" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 5, 64 | "metadata": {}, 65 | "outputs": [ 66 | { 67 | "data": { 68 | "text/plain": [ 69 | "0.5191087559849992" 70 | ] 71 | }, 72 | "execution_count": 5, 73 | "metadata": {}, 74 | "output_type": "execute_result" 75 | } 76 | ], 77 | "source": [ 78 | "pos_rate = train_data[\"label\"].sum() / num_train\n", 79 | "pos_rate" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 6, 85 | "metadata": { 86 | "collapsed": true 87 | }, 88 | "outputs": [], 89 | "source": [ 90 | "pd.DataFrame(np.ones(shape=(num_test,)) * pos_rate, columns=[\"y_pre\"]).to_csv(\"./pred_const.csv\", index=False)" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 7, 96 | "metadata": {}, 97 | "outputs": [ 98 | { 99 | "data": { 100 | "text/plain": [ 101 | "0.5029553655657454" 102 | ] 103 | }, 104 | "execution_count": 7, 105 | "metadata": {}, 106 | "output_type": "execute_result" 107 | } 108 | ], 109 | "source": [ 110 | "log_loss = 0.693652\n", 111 | "\n", 112 | "r = (log_loss + np.log(1 - pos_rate)) / (np.log((1 - pos_rate) / pos_rate))\n", 113 | "r" 114 | ] 115 | } 116 | ], 117 | "metadata": { 118 | "kernelspec": { 119 | "display_name": "Python 3", 120 | "language": "python", 121 | "name": "python3" 122 | }, 123 | "language_info": { 124 | "codemirror_mode": { 125 | "name": "ipython", 126 | "version": 3 127 | }, 128 | "file_extension": ".py", 129 | "mimetype": "text/x-python", 130 | "name": "python", 131 | "nbconvert_exporter": "python", 132 | "pygments_lexer": "ipython3", 133 | "version": "3.6.2" 134 | } 135 | }, 136 | "nbformat": 4, 137 | "nbformat_minor": 2 138 | } 139 | -------------------------------------------------------------------------------- /[Model] MPCNN.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Multi-Perspective Sentence Similarity Modeling with Convolutional Neural Networks" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": { 14 | "collapsed": true 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "import warnings\n", 19 | "warnings.filterwarnings(\"ignore\")\n", 20 | "\n", 21 | "import numpy as np\n", 22 | "import pandas as pd" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "metadata": { 29 | "collapsed": true 30 | }, 31 | "outputs": [], 32 | "source": [ 33 | "DATA_PATH = \"./data/\"\n", 34 | "TRAIN_PATH = DATA_PATH + \"train.csv\"\n", 35 | "TEST_PATH = DATA_PATH + \"test.csv\"\n", 36 | "WORD_EMBED_PATH = DATA_PATH + \"word_embed.txt\"\n", 37 | "CHAR_EMBED_PATH = DATA_PATH + \"char_embed.txt\"\n", 38 | "QUEST_PATH = DATA_PATH + \"question.csv\"\n", 39 | "\n", 40 | "train_data = pd.read_csv(TRAIN_PATH)\n", 41 | "test_data = pd.read_csv(TEST_PATH)\n", 42 | "question_data = pd.read_csv(QUEST_PATH)\n", 43 | "word_embedding_data = pd.read_csv(WORD_EMBED_PATH, delimiter=\" \", header=None, index_col=0)\n", 44 | "char_embedding_data = pd.read_csv(CHAR_EMBED_PATH, delimiter=\" \", header=None, index_col=0)\n", 45 | "\n", 46 | "question_data[\"words\"] = question_data[\"words\"].str.split(\" \")\n", 47 | 
"question_data[\"chars\"] = question_data[\"chars\"].str.split(\" \")\n", 48 | "\n", 49 | "label = train_data[\"label\"].values\n", 50 | "\n", 51 | "from keras.preprocessing.text import Tokenizer\n", 52 | "\n", 53 | "MAX_COUNT = 10000\n", 54 | "\n", 55 | "word_tokenizer = Tokenizer(MAX_COUNT)\n", 56 | "word_tokenizer.fit_on_texts(question_data[\"words\"])\n", 57 | "\n", 58 | "word_embedding_data = np.concatenate(\n", 59 | " (\n", 60 | " np.zeros(shape=(1, word_embedding_data.shape[1]), dtype=np.float64),\n", 61 | " word_embedding_data.loc[list(word_tokenizer.word_index.keys())[:MAX_COUNT]].values\n", 62 | " ),\n", 63 | " axis=0\n", 64 | ")\n", 65 | "\n", 66 | "char_tokenizer = Tokenizer(MAX_COUNT)\n", 67 | "char_tokenizer.fit_on_texts(question_data[\"chars\"])\n", 68 | "\n", 69 | "char_embedding_data = np.concatenate(\n", 70 | " (\n", 71 | " np.zeros(shape=(1, char_embedding_data.shape[1]), dtype=np.float64),\n", 72 | " char_embedding_data.loc[list(char_tokenizer.word_index.keys())[:MAX_COUNT]].values\n", 73 | " ),\n", 74 | " axis=0\n", 75 | ")\n", 76 | "\n", 77 | "word_embedding_data.shape, char_embedding_data.shape" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": null, 83 | "metadata": { 84 | "collapsed": true 85 | }, 86 | "outputs": [], 87 | "source": [ 88 | "from keras.preprocessing.sequence import pad_sequences\n", 89 | "\n", 90 | "SEQ_LEN = 30\n", 91 | "\n", 92 | "def gen_word_data(data):\n", 93 | " seq_word1 = word_tokenizer.texts_to_sequences(data.merge(question_data, how=\"left\", left_on=\"q1\", right_on=\"qid\")[\"words\"])\n", 94 | " seq_word2 = word_tokenizer.texts_to_sequences(data.merge(question_data, how=\"left\", left_on=\"q2\", right_on=\"qid\")[\"words\"])\n", 95 | " return pad_sequences(seq_word1, maxlen=SEQ_LEN, padding=\"pre\",truncating=\"pre\"), \\\n", 96 | " pad_sequences(seq_word2, maxlen=SEQ_LEN, padding=\"pre\",truncating=\"pre\")\n", 97 | " \n", 98 | "def gen_char_data(data):\n", 99 | " seq_char1 = char_tokenizer.texts_to_sequences(data.merge(question_data, how=\"left\", left_on=\"q1\", right_on=\"qid\")[\"chars\"])\n", 100 | " seq_char2 = char_tokenizer.texts_to_sequences(data.merge(question_data, how=\"left\", left_on=\"q2\", right_on=\"qid\")[\"chars\"])\n", 101 | " return pad_sequences(seq_char1, maxlen=SEQ_LEN, padding=\"pre\",truncating=\"pre\"), \\\n", 102 | " pad_sequences(seq_char2, maxlen=SEQ_LEN, padding=\"pre\",truncating=\"pre\")\n", 103 | "\n", 104 | "word1, word2 = gen_word_data(train_data)\n", 105 | "char1, char2 = gen_char_data(train_data)\n", 106 | "test_word1, test_word2 = gen_word_data(test_data)\n", 107 | "test_char1, test_char2 = gen_char_data(test_data)\n", 108 | "\n", 109 | "word1.shape, word2.shape, test_word1.shape, test_word2.shape, char1.shape, char2.shape, test_char1.shape, test_char2.shape" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "metadata": { 116 | "collapsed": true 117 | }, 118 | "outputs": [], 119 | "source": [ 120 | "from keras.models import Model\n", 121 | "from keras.layers.merge import concatenate\n", 122 | "from keras.callbacks import EarlyStopping, ModelCheckpoint\n", 123 | "from keras.layers import LSTM, Bidirectional, TimeDistributed\n", 124 | "from keras.layers import Conv1D, MaxPool1D, GlobalAveragePooling1D\n", 125 | "from keras.layers import Input, Embedding, Dropout, BatchNormalization, Dense, Flatten, Lambda, K, Activationation" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "metadata": { 132 | 
"collapsed": true 133 | }, 134 | "outputs": [], 135 | "source": [ 136 | "from sklearn.model_selection import train_test_split\n", 137 | "\n", 138 | "train_word1, dev_word1, train_word2, dev_word2, train_y, dev_y = train_test_split(\n", 139 | " word1, word2, train_data[\"label\"].values,\n", 140 | " test_size=0.2\n", 141 | ")" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": null, 147 | "metadata": { 148 | "collapsed": true 149 | }, 150 | "outputs": [], 151 | "source": [ 152 | "embedding_layer = Embedding(\n", 153 | " input_dim=word_embedding_data.shape[0], # word/char switch\n", 154 | " output_dim=word_embedding_data.shape[1], # word/char switch\n", 155 | " weights=[word_embedding_data], # word/char switch\n", 156 | " input_length=SEQ_LEN,\n", 157 | " trainable=False\n", 158 | " )\n", 159 | "\n", 160 | "vector1 = embedding_layer(input1)\n", 161 | "vector2 = embedding_layer(input2)\n", 162 | "\n" 163 | ] 164 | } 165 | ], 166 | "metadata": { 167 | "kernelspec": { 168 | "display_name": "Python 3", 169 | "language": "python", 170 | "name": "python3" 171 | }, 172 | "language_info": { 173 | "codemirror_mode": { 174 | "name": "ipython", 175 | "version": 3 176 | }, 177 | "file_extension": ".py", 178 | "mimetype": "text/x-python", 179 | "name": "python", 180 | "nbconvert_exporter": "python", 181 | "pygments_lexer": "ipython3", 182 | "version": "3.6.2" 183 | } 184 | }, 185 | "nbformat": 4, 186 | "nbformat_minor": 2 187 | } 188 | -------------------------------------------------------------------------------- /[Feature] V1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 所有特征\n", 8 | "\n", 9 | "## 问题特征\n", 10 | "\n", 11 | "- 问题出现次数: 1\n", 12 | "- 问题单词数量: 2\n", 13 | "- 问题字符数量: 2\n", 14 | "- 问题Hash值: 2\n", 15 | "\n", 16 | "## 问题对特征\n", 17 | "\n", 18 | "- 问题对重复单词数量: 1\n", 19 | "- 问题对重复字符数量: 1\n", 20 | "\n", 21 | "## 图特征\n", 22 | "\n", 23 | "- Clique Size, 与此问题对相互毗邻结点组成的子图中结点的数量: 1\n", 24 | "- K-core, 每个点最大的K-core值: 2" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "metadata": { 31 | "collapsed": true 32 | }, 33 | "outputs": [], 34 | "source": [ 35 | "import networkx as nx\n", 36 | "import numpy as np\n", 37 | "import pandas as pd\n", 38 | "from itertools import combinations" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": { 45 | "collapsed": true 46 | }, 47 | "outputs": [], 48 | "source": [ 49 | "DATA_PATH = \"./data/\"\n", 50 | "TRAIN_PATH = DATA_PATH + \"train.csv\"\n", 51 | "TEST_PATH = DATA_PATH + \"test.csv\"\n", 52 | "WORD_EMBED_PATH = DATA_PATH + \"word_embed.txt\"\n", 53 | "CHAR_EMBED_PATH = DATA_PATH + \"char_embed.txt\"\n", 54 | "QUEST_PATH = DATA_PATH + \"question.csv\"" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": { 61 | "collapsed": true 62 | }, 63 | "outputs": [], 64 | "source": [ 65 | "train_data = pd.read_csv(TRAIN_PATH)\n", 66 | "test_data = pd.read_csv(TEST_PATH)\n", 67 | "question_data = pd.read_csv(QUEST_PATH)\n", 68 | "word_embedding_data = pd.read_csv(WORD_EMBED_PATH, delimiter=\" \", header=None, index_col=0)\n", 69 | "char_embedding_data = pd.read_csv(CHAR_EMBED_PATH, delimiter=\" \", header=None, index_col=0)\n", 70 | "\n", 71 | "question_data[\"words\"] = question_data[\"words\"].str.split(\" \")\n", 72 | "question_data[\"chars\"] = question_data[\"chars\"].str.split(\" \")\n", 73 | 
"\n", 74 | "label = train_data[\"label\"].values.copy()" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": { 81 | "collapsed": true 82 | }, 83 | "outputs": [], 84 | "source": [ 85 | "total_question = pd.concat([train_data[\"q1\"], train_data[\"q2\"], test_data[\"q1\"], test_data[\"q2\"]])\n", 86 | "question_feature = total_question.value_counts().reset_index()\n", 87 | "question_feature.columns = [\"qid\", \"q_count\"]" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": null, 93 | "metadata": { 94 | "collapsed": true 95 | }, 96 | "outputs": [], 97 | "source": [ 98 | "unique_question = total_question.drop_duplicates().reset_index(drop=True)\n", 99 | "question_dict = pd.Series(unique_question.index, unique_question).to_dict()" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "metadata": {}, 106 | "outputs": [], 107 | "source": [ 108 | "from keras.preprocessing.text import Tokenizer\n", 109 | "\n", 110 | "word_tokenizer = Tokenizer()\n", 111 | "word_tokenizer.fit_on_texts(question_data[\"words\"])\n", 112 | "char_tokenizer = Tokenizer()\n", 113 | "char_tokenizer.fit_on_texts(question_data[\"chars\"])" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "metadata": { 120 | "collapsed": true 121 | }, 122 | "outputs": [], 123 | "source": [ 124 | "word_count = sorted(list(word_tokenizer.word_counts.items()), key=lambda x: x[1], reverse=True)\n", 125 | "word_count = pd.DataFrame(word_count, columns=[\"word\", \"word_times\"])\n", 126 | "char_count = sorted(list(char_tokenizer.word_counts.items()), key=lambda x: x[1], reverse=True)\n", 127 | "char_count = pd.DataFrame(char_count, columns=[\"cahr\", \"char_times\"])" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "metadata": { 134 | "collapsed": true 135 | }, 136 | "outputs": [], 137 | "source": [ 138 | "train = train_data.merge(question_data, how=\"left\", left_on=\"q1\", right_on=\"qid\") \\\n", 139 | " .drop([\"qid\", \"label\"], axis=1) \\\n", 140 | " .rename(columns={\"words\": \"words1\", \"chars\": \"chars1\"}) \\\n", 141 | " .merge(question_data, how=\"left\", left_on=\"q2\", right_on=\"qid\") \\\n", 142 | " .drop([\"qid\"], axis=1) \\\n", 143 | " .rename(columns={\"words\": \"words2\", \"chars\": \"chars2\"})\n", 144 | "test = test_data.merge(question_data, how=\"left\", left_on=\"q1\", right_on=\"qid\") \\\n", 145 | " .drop([\"qid\"], axis=1) \\\n", 146 | " .rename(columns={\"words\": \"words1\", \"chars\": \"chars1\"}) \\\n", 147 | " .merge(question_data, how=\"left\", left_on=\"q2\", right_on=\"qid\") \\\n", 148 | " .drop([\"qid\"], axis=1) \\\n", 149 | " .rename(columns={\"words\": \"words2\", \"chars\": \"chars2\"})" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": { 156 | "collapsed": true 157 | }, 158 | "outputs": [], 159 | "source": [ 160 | "train1 = train.merge(question_feature, how=\"left\", left_on=\"q1\", right_on=\"qid\") \\\n", 161 | " .drop(\"qid\", axis=1) \\\n", 162 | " .rename(columns={\"q_count\": \"q1_count\"})\n", 163 | "train1 = train1.merge(question_feature, how=\"left\", left_on=\"q2\", right_on=\"qid\") \\\n", 164 | " .drop(\"qid\", axis=1) \\\n", 165 | " .rename(columns={\"q_count\": \"q2_count\"})" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": null, 171 | "metadata": { 172 | "collapsed": true 173 | }, 174 | "outputs": [], 175 | "source": [ 176 | 
"test1 = test.merge(question_feature, how=\"left\", left_on=\"q1\", right_on=\"qid\") \\\n", 177 | " .drop(\"qid\", axis=1) \\\n", 178 | " .rename(columns={\"q_count\": \"q1_count\"})\n", 179 | "test1 = test1.merge(question_feature, how=\"left\", left_on=\"q2\", right_on=\"qid\") \\\n", 180 | " .drop(\"qid\", axis=1) \\\n", 181 | " .rename(columns={\"q_count\": \"q2_count\"})" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": null, 187 | "metadata": { 188 | "collapsed": true 189 | }, 190 | "outputs": [], 191 | "source": [ 192 | "def question_feature(data):\n", 193 | " data[\"word1_len\"], data[\"word2_len\"] = data[\"words1\"].map(len), data[\"words2\"].map(len)\n", 194 | " data[\"char1_len\"], data[\"char2_len\"] = data[\"chars1\"].map(len), data[\"chars2\"].map(len)\n", 195 | " data[\"word_same\"] = data.apply(lambda x: len(set(x[\"words1\"]).intersection(set(x[\"words2\"]))), axis=1)\n", 196 | " data[\"char_same\"] = data.apply(lambda x: len(set(x[\"chars1\"]).intersection(set(x[\"chars2\"]))), axis=1)\n", 197 | " data[\"q1_hash\"], data[\"q2_hash\"] = data[\"q1\"].map(question_dict), data[\"q2\"].map(question_dict)\n", 198 | " return data\n", 199 | " \n", 200 | "train2, test2 = question_feature(train1), question_feature(test1)" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": null, 206 | "metadata": {}, 207 | "outputs": [], 208 | "source": [ 209 | "train2.head()" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": null, 215 | "metadata": { 216 | "collapsed": true 217 | }, 218 | "outputs": [], 219 | "source": [ 220 | "graph = networkx.Graph()\n", 221 | "edges = [tuple(pair) for pair in pd.concat([train_data[[\"q1\", \"q2\"]], test_data[[\"q1\", \"q2\"]]]).values]\n", 222 | "graph.add_edges_from(edges)\n", 223 | "\n", 224 | "cliques = sorted(list(networkx.find_cliques(graph)), key=lambda x: len(x), reverse=True)\n", 225 | "map_label = dict(((x[0], x[1]), 1) for x in pd.concat([train_data[[\"q1\", \"q2\"]], test_data[[\"q1\", \"q2\"]]]).values)\n", 226 | "\n", 227 | "map_clique_size = {}\n", 228 | "for c in cliques:\n", 229 | " for q1, q2 in combinations(c, 2):\n", 230 | " if (q1, q2) in map_label:\n", 231 | " map_clique_size[q1, q2] = len(c)\n", 232 | " elif (q2, q1) in map_label:\n", 233 | " map_clique_size[q2, q1] = len(c)\n", 234 | "\n", 235 | "train2['clique_size'] = train2.apply(lambda row: map_clique_size.get((row['q1'], row['q2']), -1), axis=1)\n", 236 | "test2['clique_size'] = test2.apply(lambda row: map_clique_size.get((row['q1'], row['q2']), -1), axis=1)" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": null, 242 | "metadata": {}, 243 | "outputs": [], 244 | "source": [ 245 | "max_kcore = pd.DataFrame(list(nx.core_number(graph).items()), columns=[\"qid\", \"kcore\"])\n", 246 | "train3 = train2.merge(max_kcore, how=\"left\", left_on=\"q1\", right_on=\"qid\").drop(\"qid\", axis=1).rename(columns={\"kcore\": \"q1_kcore\"}) \\\n", 247 | " .merge(max_kcore, how=\"left\", left_on=\"q2\", right_on=\"qid\").drop(\"qid\", axis=1).rename(columns={\"kcore\": \"q2_kcore\"})\n", 248 | "test3 = test2.merge(max_kcore, how=\"left\", left_on=\"q1\", right_on=\"qid\").drop(\"qid\", axis=1).rename(columns={\"kcore\": \"q1_kcore\"}) \\\n", 249 | " .merge(max_kcore, how=\"left\", left_on=\"q2\", right_on=\"qid\").drop(\"qid\", axis=1).rename(columns={\"kcore\": \"q2_kcore\"})" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": null, 255 | "metadata": {}, 256 | 
"outputs": [], 257 | "source": [ 258 | "train3.drop([\"q1\", \"q2\", \"words1\", \"chars1\", \"words2\", \"chars2\"], axis=1).to_csv(\"./data/train_feature.csv\", index=False)\n", 259 | "test3.drop([\"q1\", \"q2\", \"words1\", \"chars1\", \"words2\", \"chars2\"], axis=1).to_csv(\"./data/test_feature.csv\", index=False)" 260 | ] 261 | } 262 | ], 263 | "metadata": { 264 | "kernelspec": { 265 | "display_name": "Python 3", 266 | "language": "python", 267 | "name": "python3" 268 | }, 269 | "language_info": { 270 | "codemirror_mode": { 271 | "name": "ipython", 272 | "version": 3 273 | }, 274 | "file_extension": ".py", 275 | "mimetype": "text/x-python", 276 | "name": "python", 277 | "nbconvert_exporter": "python", 278 | "pygments_lexer": "ipython3", 279 | "version": "3.6.2" 280 | } 281 | }, 282 | "nbformat": 4, 283 | "nbformat_minor": 2 284 | } 285 | -------------------------------------------------------------------------------- /[Model] TextRNN.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import warnings\n", 10 | "warnings.filterwarnings(\"ignore\")\n", 11 | "\n", 12 | "import numpy as np\n", 13 | "import pandas as pd\n", 14 | "from glob import glob\n", 15 | "from datetime import datetime\n", 16 | "\n", 17 | "DATA_PATH = \"./data/\"\n", 18 | "TRAIN_PATH = DATA_PATH + \"train.csv\"\n", 19 | "TEST_PATH = DATA_PATH + \"test.csv\"\n", 20 | "WORD_EMBED_PATH = DATA_PATH + \"word_embed.txt\"\n", 21 | "CHAR_EMBED_PATH = DATA_PATH + \"char_embed.txt\"\n", 22 | "QUEST_PATH = DATA_PATH + \"question.csv\"\n", 23 | "\n", 24 | "train_data = pd.read_csv(TRAIN_PATH)\n", 25 | "test_data = pd.read_csv(TEST_PATH)\n", 26 | "question_data = pd.read_csv(QUEST_PATH)\n", 27 | "word_embedding_data = pd.read_csv(WORD_EMBED_PATH, delimiter=\" \", header=None, index_col=0)\n", 28 | "char_embedding_data = pd.read_csv(CHAR_EMBED_PATH, delimiter=\" \", header=None, index_col=0)\n", 29 | "\n", 30 | "question_data[\"words\"] = question_data[\"words\"].str.split(\" \")\n", 31 | "question_data[\"chars\"] = question_data[\"chars\"].str.split(\" \")\n", 32 | "\n", 33 | "label = train_data[\"label\"].values" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "from keras.preprocessing.text import Tokenizer\n", 43 | "\n", 44 | "MAX_COUNT = 10000\n", 45 | "\n", 46 | "word_tokenizer = Tokenizer(MAX_COUNT)\n", 47 | "word_tokenizer.fit_on_texts(question_data[\"words\"])\n", 48 | "\n", 49 | "word_embedding_data = np.concatenate(\n", 50 | " (\n", 51 | " np.zeros(shape=(1, word_embedding_data.shape[1]), dtype=np.float64),\n", 52 | " word_embedding_data.loc[list(word_tokenizer.word_index.keys())[:MAX_COUNT]].values\n", 53 | " ),\n", 54 | " axis=0\n", 55 | ")\n", 56 | "\n", 57 | "char_tokenizer = Tokenizer(MAX_COUNT)\n", 58 | "char_tokenizer.fit_on_texts(question_data[\"chars\"])\n", 59 | "\n", 60 | "char_embedding_data = np.concatenate(\n", 61 | " (\n", 62 | " np.zeros(shape=(1, char_embedding_data.shape[1]), dtype=np.float64),\n", 63 | " char_embedding_data.loc[list(char_tokenizer.word_index.keys())[:MAX_COUNT]].values\n", 64 | " ),\n", 65 | " axis=0\n", 66 | ")\n", 67 | "\n", 68 | "word_embedding_data.shape, char_embedding_data.shape" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 
| "from keras.preprocessing.sequence import pad_sequences\n", 78 | "\n", 79 | "SEQ_LEN = 25\n", 80 | "\n", 81 | "def gen_word_data(data):\n", 82 | " seq_word1 = word_tokenizer.texts_to_sequences(data.merge(question_data, how=\"left\", left_on=\"q1\", right_on=\"qid\")[\"words\"])\n", 83 | " seq_word2 = word_tokenizer.texts_to_sequences(data.merge(question_data, how=\"left\", left_on=\"q2\", right_on=\"qid\")[\"words\"])\n", 84 | " return pad_sequences(seq_word1, maxlen=SEQ_LEN, padding=\"pre\",truncating=\"pre\"), \\\n", 85 | " pad_sequences(seq_word2, maxlen=SEQ_LEN, padding=\"pre\",truncating=\"pre\")\n", 86 | " \n", 87 | "def gen_char_data(data):\n", 88 | " seq_char1 = char_tokenizer.texts_to_sequences(data.merge(question_data, how=\"left\", left_on=\"q1\", right_on=\"qid\")[\"chars\"])\n", 89 | " seq_char2 = char_tokenizer.texts_to_sequences(data.merge(question_data, how=\"left\", left_on=\"q2\", right_on=\"qid\")[\"chars\"])\n", 90 | " return pad_sequences(seq_char1, maxlen=SEQ_LEN, padding=\"pre\",truncating=\"pre\"), \\\n", 91 | " pad_sequences(seq_char2, maxlen=SEQ_LEN, padding=\"pre\",truncating=\"pre\")\n", 92 | "\n", 93 | "word1, word2 = gen_word_data(train_data)\n", 94 | "char1, char2 = gen_char_data(train_data)\n", 95 | "test_word1, test_word2 = gen_word_data(test_data)\n", 96 | "test_char1, test_char2 = gen_char_data(test_data)\n", 97 | "\n", 98 | "word1.shape, word2.shape, test_word1.shape, test_word2.shape, char1.shape, char2.shape, test_char1.shape, test_char2.shape" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "metadata": { 105 | "collapsed": true 106 | }, 107 | "outputs": [], 108 | "source": [ 109 | "from keras.models import Model\n", 110 | "from keras.layers.merge import concatenate\n", 111 | "from keras.callbacks import EarlyStopping, ModelCheckpoint\n", 112 | "from keras.optimizers import Adam, Nadam, SGD\n", 113 | "from keras.layers import LSTM, Bidirectional, TimeDistributed, CuDNNLSTM\n", 114 | "from keras.layers import Conv1D, GlobalMaxPool1D, GlobalAveragePooling1D, MaxPool1D\n", 115 | "from keras.layers import Input, Embedding, Dropout, BatchNormalization, Dense, Flatten, Lambda, K, Activation" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "metadata": { 122 | "collapsed": true 123 | }, 124 | "outputs": [], 125 | "source": [ 126 | "BATCH_SIZE = 1024\n", 127 | "NUM_EPOCHES = 50\n", 128 | "DROP_RATE = 0.3\n", 129 | "PATIENCE = 8\n", 130 | "\n", 131 | "LSTM_SIZE1 = 256\n", 132 | "LSTM_SIZE2 = 256\n", 133 | "\n", 134 | "DENSE_SIZE1 = 512\n", 135 | "DENSE_SIZE2 = 256" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "from sklearn.model_selection import StratifiedKFold\n", 145 | "\n", 146 | "best_results = []\n", 147 | "last_results = []\n", 148 | "\n", 149 | "for i, (train_index, dev_index) in enumerate(StratifiedKFold(n_splits=10, shuffle=True).split(X=char1, y=label)):\n", 150 | " train_x1, train_x2, train_y = word1[train_index, :], word2[train_index, :], label[train_index] # word/char switch\n", 151 | " dev_x1, dev_x2, dev_y = word1[dev_index, :], word2[dev_index, :], label[dev_index] # word/char switch\n", 152 | " \n", 153 | " input1 = Input(shape=(SEQ_LEN,), dtype=\"int32\")\n", 154 | " input2 = Input(shape=(SEQ_LEN,), dtype=\"int32\")\n", 155 | "\n", 156 | " embedding_layer = Embedding(\n", 157 | " input_dim=word_embedding_data.shape[0], # word/char switch\n", 158 | " 
output_dim=word_embedding_data.shape[1], # word/char switch\n", 159 | " weights=[word_embedding_data], # word/char switch\n", 160 | " input_length=SEQ_LEN,\n", 161 | " trainable=False\n", 162 | " )\n", 163 | " \n", 164 | " vector1 = embedding_layer(input1)\n", 165 | " vector2 = embedding_layer(input2)\n", 166 | " \n", 167 | " lstm_layer1 = Bidirectional(LSTM(LSTM_SIZE1, dropout=DROP_RATE, recurrent_dropout=DROP_RATE, return_sequences=True))\n", 168 | " layer1a = lstm_layer1(vector1)\n", 169 | " layer1b = lstm_layer1(vector2)\n", 170 | " lstm_layer2 = Bidirectional(LSTM(LSTM_SIZE2, dropout=DROP_RATE, recurrent_dropout=DROP_RATE, return_sequences=True))\n", 171 | " layer2a = lstm_layer2(layer1a)\n", 172 | " layer2b = lstm_layer2(layer1b)\n", 173 | " layer2a = GlobalMaxPool1D()(layer2a)\n", 174 | " layer2b = GlobalMaxPool1D()(layer2b)\n", 175 | " merge = concatenate([layer2a, layer2b])\n", 176 | " \n", 177 | " x = Dropout(DROP_RATE)(merge)\n", 178 | " x = BatchNormalization()(x)\n", 179 | " x = Dense(DENSE_SIZE1, activation=\"relu\")(x)\n", 180 | " x = Dropout(DROP_RATE)(x)\n", 181 | " x = BatchNormalization()(x)\n", 182 | " x = Dense(DENSE_SIZE2, activation=\"relu\")(x)\n", 183 | " x = Dropout(DROP_RATE)(x)\n", 184 | " x = BatchNormalization()(x)\n", 185 | " pred = Dense(1, activation=\"sigmoid\")(x)\n", 186 | " \n", 187 | " model = Model(inputs=[input1, input2], outputs=pred)\n", 188 | " model.compile(\n", 189 | " optimizer=\"nadam\",\n", 190 | " loss=\"binary_crossentropy\",\n", 191 | " metrics=[\"acc\"]\n", 192 | " )\n", 193 | " \n", 194 | " early_stopping = EarlyStopping(\"val_loss\", patience=PATIENCE)\n", 195 | " check_point = ModelCheckpoint(\n", 196 | " \"./log/%s.TextRNN.word.{epoch:03d}.hdf5\" % (datetime.now().strftime(\"%Y%m%d-%H%M%S\")), # word/char switch\n", 197 | " monitor=\"val_loss\",\n", 198 | " save_best_only=True,\n", 199 | " )\n", 200 | " \n", 201 | " fit_res = model.fit(\n", 202 | " x=[train_x1, train_x2],\n", 203 | " y=train_y,\n", 204 | " batch_size=BATCH_SIZE,\n", 205 | " epochs=NUM_EPOCHES,\n", 206 | " validation_data=([dev_x1, dev_x2], dev_y),\n", 207 | " shuffle=True,\n", 208 | " callbacks=[early_stopping, check_point]\n", 209 | " )\n", 210 | " \n", 211 | " pred_last = model.predict([test_word1, test_word2], batch_size=BATCH_SIZE) # word/char switch\n", 212 | " last_results.append(pd.DataFrame(pred_last, columns=[\"y_pre\"]))\n", 213 | " \n", 214 | " print(\"load model %s\" % (glob(\"./log/*.hdf5\")[-1].replace(\"\\\\\", \"/\"),))\n", 215 | " model.load_weights(glob(\"./log/*.hdf5\")[-1].replace(\"\\\\\", \"/\"))\n", 216 | " pred_best = model.predict([test_word1, test_word2], batch_size=BATCH_SIZE) # word/char switch\n", 217 | " best_results.append(pd.DataFrame(pred_best, columns=[\"y_pre\"]))\n", 218 | "\n", 219 | "pd.DataFrame(pd.concat(last_results, axis=1).mean(axis=1), columns=[\"y_pre\"]).to_csv(\n", 220 | " \"./result/%s-TextRNN_word_last.csv\" % (datetime.now().strftime(\"%Y%m%d-%H%M%S\")), # word/char switch\n", 221 | " index=False\n", 222 | ")\n", 223 | "pd.DataFrame(pd.concat(best_results, axis=1).mean(axis=1), columns=[\"y_pre\"]).to_csv(\n", 224 | " \"./result/%s-TextRNN_word_best.csv\" % (datetime.now().strftime(\"%Y%m%d-%H%M%S\")), # word/char switch\n", 225 | " index=False\n", 226 | ")" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": null, 232 | "metadata": { 233 | "collapsed": true 234 | }, 235 | "outputs": [], 236 | "source": [] 237 | } 238 | ], 239 | "metadata": { 240 | "kernelspec": { 241 | "display_name": "Python 
3", 242 | "language": "python", 243 | "name": "python3" 244 | }, 245 | "language_info": { 246 | "codemirror_mode": { 247 | "name": "ipython", 248 | "version": 3 249 | }, 250 | "file_extension": ".py", 251 | "mimetype": "text/x-python", 252 | "name": "python", 253 | "nbconvert_exporter": "python", 254 | "pygments_lexer": "ipython3", 255 | "version": "3.6.2" 256 | } 257 | }, 258 | "nbformat": 4, 259 | "nbformat_minor": 2 260 | } 261 | -------------------------------------------------------------------------------- /[Model] TextRCNN.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import warnings\n", 12 | "warnings.filterwarnings(\"ignore\")\n", 13 | "\n", 14 | "import numpy as np\n", 15 | "import pandas as pd\n", 16 | "from glob import glob\n", 17 | "from datetime import datetime\n", 18 | "\n", 19 | "DATA_PATH = \"./data/\"\n", 20 | "TRAIN_PATH = DATA_PATH + \"train.csv\"\n", 21 | "TEST_PATH = DATA_PATH + \"test.csv\"\n", 22 | "WORD_EMBED_PATH = DATA_PATH + \"word_embed.txt\"\n", 23 | "CHAR_EMBED_PATH = DATA_PATH + \"char_embed.txt\"\n", 24 | "QUEST_PATH = DATA_PATH + \"question.csv\"\n", 25 | "\n", 26 | "train_data = pd.read_csv(TRAIN_PATH)\n", 27 | "test_data = pd.read_csv(TEST_PATH)\n", 28 | "question_data = pd.read_csv(QUEST_PATH)\n", 29 | "word_embedding_data = pd.read_csv(WORD_EMBED_PATH, delimiter=\" \", header=None, index_col=0)\n", 30 | "char_embedding_data = pd.read_csv(CHAR_EMBED_PATH, delimiter=\" \", header=None, index_col=0)\n", 31 | "\n", 32 | "question_data[\"words\"] = question_data[\"words\"].str.split(\" \")\n", 33 | "question_data[\"chars\"] = question_data[\"chars\"].str.split(\" \")\n", 34 | "\n", 35 | "label = train_data[\"label\"].values" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 2, 41 | "metadata": {}, 42 | "outputs": [ 43 | { 44 | "name": "stderr", 45 | "output_type": "stream", 46 | "text": [ 47 | "Using TensorFlow backend.\n" 48 | ] 49 | }, 50 | { 51 | "data": { 52 | "text/plain": [ 53 | "((10001, 300), (3049, 300))" 54 | ] 55 | }, 56 | "execution_count": 2, 57 | "metadata": {}, 58 | "output_type": "execute_result" 59 | } 60 | ], 61 | "source": [ 62 | "from keras.preprocessing.text import Tokenizer\n", 63 | "\n", 64 | "MAX_COUNT = 10000\n", 65 | "\n", 66 | "word_tokenizer = Tokenizer(MAX_COUNT)\n", 67 | "word_tokenizer.fit_on_texts(question_data[\"words\"])\n", 68 | "\n", 69 | "word_embedding_data = np.concatenate(\n", 70 | " (\n", 71 | " np.zeros(shape=(1, word_embedding_data.shape[1]), dtype=np.float64),\n", 72 | " word_embedding_data.loc[list(word_tokenizer.word_index.keys())[:MAX_COUNT]].values\n", 73 | " ),\n", 74 | " axis=0\n", 75 | ")\n", 76 | "\n", 77 | "char_tokenizer = Tokenizer(MAX_COUNT)\n", 78 | "char_tokenizer.fit_on_texts(question_data[\"chars\"])\n", 79 | "\n", 80 | "char_embedding_data = np.concatenate(\n", 81 | " (\n", 82 | " np.zeros(shape=(1, char_embedding_data.shape[1]), dtype=np.float64),\n", 83 | " char_embedding_data.loc[list(char_tokenizer.word_index.keys())[:MAX_COUNT]].values\n", 84 | " ),\n", 85 | " axis=0\n", 86 | ")\n", 87 | "\n", 88 | "word_embedding_data.shape, char_embedding_data.shape" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 3, 94 | "metadata": {}, 95 | "outputs": [ 96 | { 97 | "data": { 98 | "text/plain": [ 99 | "((254386, 25),\n", 100 | " (254386, 25),\n", 101 | " 
(172956, 25),\n", 102 | " (172956, 25),\n", 103 | " (254386, 25),\n", 104 | " (254386, 25),\n", 105 | " (172956, 25),\n", 106 | " (172956, 25))" 107 | ] 108 | }, 109 | "execution_count": 3, 110 | "metadata": {}, 111 | "output_type": "execute_result" 112 | } 113 | ], 114 | "source": [ 115 | "from keras.preprocessing.sequence import pad_sequences\n", 116 | "\n", 117 | "SEQ_LEN = 25\n", 118 | "\n", 119 | "def gen_word_data(data):\n", 120 | " seq_word1 = word_tokenizer.texts_to_sequences(data.merge(question_data, how=\"left\", left_on=\"q1\", right_on=\"qid\")[\"words\"])\n", 121 | " seq_word2 = word_tokenizer.texts_to_sequences(data.merge(question_data, how=\"left\", left_on=\"q2\", right_on=\"qid\")[\"words\"])\n", 122 | " return pad_sequences(seq_word1, maxlen=SEQ_LEN, padding=\"pre\",truncating=\"pre\"), \\\n", 123 | " pad_sequences(seq_word2, maxlen=SEQ_LEN, padding=\"pre\",truncating=\"pre\")\n", 124 | " \n", 125 | "def gen_char_data(data):\n", 126 | " seq_char1 = char_tokenizer.texts_to_sequences(data.merge(question_data, how=\"left\", left_on=\"q1\", right_on=\"qid\")[\"chars\"])\n", 127 | " seq_char2 = char_tokenizer.texts_to_sequences(data.merge(question_data, how=\"left\", left_on=\"q2\", right_on=\"qid\")[\"chars\"])\n", 128 | " return pad_sequences(seq_char1, maxlen=SEQ_LEN, padding=\"pre\",truncating=\"pre\"), \\\n", 129 | " pad_sequences(seq_char2, maxlen=SEQ_LEN, padding=\"pre\",truncating=\"pre\")\n", 130 | "\n", 131 | "word1, word2 = gen_word_data(train_data)\n", 132 | "char1, char2 = gen_char_data(train_data)\n", 133 | "test_word1, test_word2 = gen_word_data(test_data)\n", 134 | "test_char1, test_char2 = gen_char_data(test_data)\n", 135 | "\n", 136 | "word1.shape, word2.shape, test_word1.shape, test_word2.shape, char1.shape, char2.shape, test_char1.shape, test_char2.shape" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 4, 142 | "metadata": { 143 | "collapsed": true 144 | }, 145 | "outputs": [], 146 | "source": [ 147 | "from keras.models import Model\n", 148 | "from keras.layers.merge import concatenate\n", 149 | "from keras.callbacks import EarlyStopping, ModelCheckpoint\n", 150 | "from keras.optimizers import Adam, Nadam, SGD\n", 151 | "from keras.layers import LSTM, Bidirectional, TimeDistributed, CuDNNLSTM\n", 152 | "from keras.layers import Conv1D, GlobalMaxPool1D, GlobalAveragePooling1D, MaxPool1D\n", 153 | "from keras.layers import Input, Embedding, Dropout, BatchNormalization, Dense, Flatten, Lambda, K, Activation" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": 5, 159 | "metadata": { 160 | "collapsed": true 161 | }, 162 | "outputs": [], 163 | "source": [ 164 | "# general\n", 165 | "BATCH_SIZE = 512\n", 166 | "NUM_EPOCHES = 30\n", 167 | "DROP_RATE = 0.3\n", 168 | "PATIENCE = 8\n", 169 | "# cnn\n", 170 | "CONV_LEN_1 = 128\n", 171 | "CONV_LEN_2 = 128\n", 172 | "CONV_LEN_3 = 128\n", 173 | "CONV_LEN_4 = 128\n", 174 | "CONV_LEN_5 = 128\n", 175 | "CONV_LEN_6 = 128\n", 176 | "CONV_LEN = CONV_LEN_1 + CONV_LEN_2 + CONV_LEN_3 + CONV_LEN_4 + CONV_LEN_5 + CONV_LEN_6\n", 177 | "# lstm\n", 178 | "LSTM_SIZE1 = 256\n", 179 | "LSTM_SIZE2 = 256\n", 180 | "LSTM_DROP_RATE = 0.3\n", 181 | "# dense\n", 182 | "DENSE_SIZE1 = 512\n", 183 | "DENSE_SIZE2 = 256" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "metadata": { 190 | "collapsed": true 191 | }, 192 | "outputs": [], 193 | "source": [ 194 | "def cnn_layer1(step_input, filters, kernel_size):\n", 195 | " conv = Conv1D(filters=filters, 
kernel_size=kernel_size, padding=\"same\", activation=\"relu\")\n", 196 | " conv_output = conv(step_input)\n", 197 | " conv_output = GlobalMaxPool1D()(conv_output)\n", 198 | " return conv_output\n", 199 | "\n", 200 | "def cnn_layer2(step_input, filters, kernel_size):\n", 201 | " conv = Conv1D(filters=filters, kernel_size=kernel_size, padding=\"same\", activation=\"relu\")\n", 202 | " conv_output = conv(step_input)\n", 203 | " conv_output = GlobalAveragePooling1D()(conv_output)\n", 204 | " return conv_output\n", 205 | "\n", 206 | "def cnn_layer3(step_input, filters, kernel_size):\n", 207 | " conv = Conv1D(filters=filters, kernel_size=kernel_size, padding=\"same\", activation=\"relu\")\n", 208 | " conv_output = conv(step_input)\n", 209 | " conv_output1 = GlobalMaxPool1D()(conv_output)\n", 210 | " conv_output2 = GlobalAveragePooling1D()(conv_output)\n", 211 | " conv_output = concatenate([conv_output1, conv_output2])\n", 212 | " return conv_output" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": null, 218 | "metadata": {}, 219 | "outputs": [ 220 | { 221 | "name": "stdout", 222 | "output_type": "stream", 223 | "text": [ 224 | "WARNING:tensorflow:From C:\\ProgramData\\Anaconda3\\lib\\site-packages\\tensorflow\\python\\util\\deprecation.py:497: calling conv1d (from tensorflow.python.ops.nn_ops) with data_format=NHWC is deprecated and will be removed in a future version.\n", 225 | "Instructions for updating:\n", 226 | "`NHWC` for data_format is deprecated, use `NWC` instead\n", 227 | "Train on 228946 samples, validate on 25440 samples\n", 228 | "Epoch 1/30\n", 229 | "228946/228946 [==============================] - 304s 1ms/step - loss: 0.3771 - acc: 0.8290 - val_loss: 0.2841 - val_acc: 0.8757\n", 230 | "Epoch 2/30\n", 231 | "228946/228946 [==============================] - 299s 1ms/step - loss: 0.2587 - acc: 0.8887 - val_loss: 0.2429 - val_acc: 0.8976\n", 232 | "Epoch 3/30\n", 233 | "228946/228946 [==============================] - 299s 1ms/step - loss: 0.2155 - acc: 0.9091 - val_loss: 0.2256 - val_acc: 0.9086\n", 234 | "Epoch 4/30\n", 235 | "228946/228946 [==============================] - 297s 1ms/step - loss: 0.1888 - acc: 0.9217 - val_loss: 0.2054 - val_acc: 0.9134\n", 236 | "Epoch 5/30\n", 237 | "228946/228946 [==============================] - 296s 1ms/step - loss: 0.1684 - acc: 0.9307 - val_loss: 0.2144 - val_acc: 0.9143\n", 238 | "Epoch 6/30\n", 239 | "228946/228946 [==============================] - 295s 1ms/step - loss: 0.1524 - acc: 0.9382 - val_loss: 0.2035 - val_acc: 0.9181\n", 240 | "Epoch 7/30\n", 241 | "228946/228946 [==============================] - 296s 1ms/step - loss: 0.1361 - acc: 0.9441 - val_loss: 0.2061 - val_acc: 0.9204\n", 242 | "Epoch 8/30\n", 243 | "228946/228946 [==============================] - 297s 1ms/step - loss: 0.1254 - acc: 0.9490 - val_loss: 0.2181 - val_acc: 0.9162\n", 244 | "Epoch 9/30\n", 245 | " 70144/228946 [========>.....................] 
- ETA: 3:17 - loss: 0.1033 - acc: 0.9583" 246 | ] 247 | } 248 | ], 249 | "source": [ 250 | "from sklearn.model_selection import StratifiedKFold\n", 251 | "\n", 252 | "best_results = []\n", 253 | "last_results = []\n", 254 | "\n", 255 | "for i, (train_index, dev_index) in enumerate(StratifiedKFold(n_splits=10, shuffle=True).split(X=word1, y=label)): # word/char switch\n", 256 | " train_x1, train_x2, train_y = word1[train_index, :], word2[train_index, :], label[train_index] # word/char switch\n", 257 | " dev_x1, dev_x2, dev_y = word1[dev_index, :], word2[dev_index, :], label[dev_index] # word/char switch\n", 258 | " \n", 259 | " input1 = Input(shape=(SEQ_LEN,), dtype=\"int32\")\n", 260 | " input2 = Input(shape=(SEQ_LEN,), dtype=\"int32\")\n", 261 | "\n", 262 | " embedding_layer = Embedding(\n", 263 | " input_dim=word_embedding_data.shape[0], # word/char switch\n", 264 | " output_dim=word_embedding_data.shape[1], # word/char switch\n", 265 | " weights=[word_embedding_data], # word/char switch\n", 266 | " input_length=SEQ_LEN,\n", 267 | " trainable=False\n", 268 | " )\n", 269 | " \n", 270 | " vector1 = embedding_layer(input1)\n", 271 | " vector2 = embedding_layer(input2)\n", 272 | " \n", 273 | " lstm_layer1 = Bidirectional(LSTM(LSTM_SIZE1, dropout=DROP_RATE, recurrent_dropout=DROP_RATE, return_sequences=True))\n", 274 | " layer1a = lstm_layer1(vector1)\n", 275 | " layer1b = lstm_layer1(vector2)\n", 276 | " lstm_layer2 = Bidirectional(LSTM(LSTM_SIZE2, dropout=DROP_RATE, recurrent_dropout=DROP_RATE, return_sequences=True))\n", 277 | " layer2a = lstm_layer2(layer1a)\n", 278 | " layer2b = lstm_layer2(layer1b)\n", 279 | " layer2a = concatenate([vector1, layer2a])\n", 280 | " layer2b = concatenate([vector2, layer2b])\n", 281 | " \n", 282 | " # TODO: 这里还可以添加BatchNorm层\n", 283 | " \n", 284 | " conv1a, conv1b = cnn_layer1(layer2a, filters=CONV_LEN_1, kernel_size=1), cnn_layer1(layer2b, filters=CONV_LEN_1, kernel_size=1)\n", 285 | " conv2a, conv2b = cnn_layer1(layer2a, filters=CONV_LEN_2, kernel_size=2), cnn_layer1(layer2b, filters=CONV_LEN_2, kernel_size=2)\n", 286 | " conv3a, conv3b = cnn_layer1(layer2a, filters=CONV_LEN_3, kernel_size=3), cnn_layer1(layer2b, filters=CONV_LEN_3, kernel_size=3)\n", 287 | " conv4a, conv4b = cnn_layer1(layer2a, filters=CONV_LEN_4, kernel_size=4), cnn_layer1(layer2b, filters=CONV_LEN_4, kernel_size=4)\n", 288 | " conv5a, conv5b = cnn_layer1(layer2a, filters=CONV_LEN_5, kernel_size=5), cnn_layer1(layer2b, filters=CONV_LEN_5, kernel_size=5)\n", 289 | " conv6a, conv6b = cnn_layer1(layer2a, filters=CONV_LEN_6, kernel_size=6), cnn_layer1(layer2b, filters=CONV_LEN_6, kernel_size=6)\n", 290 | " \n", 291 | " merge_a = concatenate([conv1a, conv2a, conv3a, conv4a, conv5a, conv6a])\n", 292 | " merge_b = concatenate([conv1b, conv2b, conv3b, conv4b, conv5b, conv6b])\n", 293 | " diff = Lambda(lambda x: K.abs(x[0] - x[1]))([merge_a, merge_b])\n", 294 | " mult = Lambda(lambda x: x[0] * x[1])([merge_a, merge_b])\n", 295 | " merge = concatenate([diff, mult])\n", 296 | " \n", 297 | " x = Dropout(DROP_RATE)(merge)\n", 298 | " x = BatchNormalization()(x)\n", 299 | " x = Dense(DENSE_SIZE1, activation=\"relu\")(x)\n", 300 | " x = Dropout(DROP_RATE)(x)\n", 301 | " x = BatchNormalization()(x)\n", 302 | " x = Dense(DENSE_SIZE2, activation=\"relu\")(x)\n", 303 | " x = Dropout(DROP_RATE)(x)\n", 304 | " x = BatchNormalization()(x)\n", 305 | " pred = Dense(1, activation=\"sigmoid\")(x)\n", 306 | " \n", 307 | " model = Model(inputs=[input1, input2], outputs=pred)\n", 308 | " model.compile(\n", 309 | " 
optimizer=\"nadam\",\n", 310 | " loss=\"binary_crossentropy\",\n", 311 | " metrics=[\"acc\"]\n", 312 | " )\n", 313 | "\n", 314 | " early_stopping = EarlyStopping(\"val_loss\", patience=PATIENCE)\n", 315 | " check_point = ModelCheckpoint(\n", 316 | " \"./log/%s.TextRCNN.word.{epoch:03d}.hdf5\" % (datetime.now().strftime(\"%Y%m%d-%H%M%S\")), # word/char switch\n", 317 | " monitor=\"val_loss\",\n", 318 | " save_best_only=True,\n", 319 | " )\n", 320 | " \n", 321 | " fit_res = model.fit(\n", 322 | " x=[train_x1, train_x2],\n", 323 | " y=train_y,\n", 324 | " batch_size=BATCH_SIZE,\n", 325 | " epochs=NUM_EPOCHES,\n", 326 | " validation_data=([dev_x1, dev_x2], dev_y),\n", 327 | " shuffle=True,\n", 328 | " callbacks=[early_stopping, check_point]\n", 329 | " )\n", 330 | " \n", 331 | " pred_last = model.predict([test_word1, test_word2], batch_size=BATCH_SIZE) # word/char switch\n", 332 | " last_results.append(pd.DataFrame(pred_last, columns=[\"y_pre\"]))\n", 333 | " \n", 334 | " print(\"load model %s\" % (glob(\"./log/*.hdf5\")[-1].replace(\"\\\\\", \"/\"),))\n", 335 | " model.load_weights(glob(\"./log/*.hdf5\")[-1].replace(\"\\\\\", \"/\"))\n", 336 | " pred_best = model.predict([test_word1, test_word2], batch_size=BATCH_SIZE) # word/char switch\n", 337 | " best_results.append(pd.DataFrame(pred_best, columns=[\"y_pre\"]))\n", 338 | "\n", 339 | "pd.DataFrame(pd.concat(last_results, axis=1).mean(axis=1), columns=[\"y_pre\"]).to_csv(\n", 340 | " \"./result/%s-TextRCNN_word_last.csv\" % (datetime.now().strftime(\"%Y%m%d-%H%M%S\")), # word/char switch\n", 341 | " index=False\n", 342 | ")\n", 343 | "pd.DataFrame(pd.concat(best_results, axis=1).mean(axis=1), columns=[\"y_pre\"]).to_csv(\n", 344 | " \"./result/%s-TextRCNN_word_best.csv\" % (datetime.now().strftime(\"%Y%m%d-%H%M%S\")), # word/char switch\n", 345 | " index=False\n", 346 | ")" 347 | ] 348 | }, 349 | { 350 | "cell_type": "code", 351 | "execution_count": null, 352 | "metadata": { 353 | "collapsed": true 354 | }, 355 | "outputs": [], 356 | "source": [] 357 | } 358 | ], 359 | "metadata": { 360 | "kernelspec": { 361 | "display_name": "Python 3", 362 | "language": "python", 363 | "name": "python3" 364 | }, 365 | "language_info": { 366 | "codemirror_mode": { 367 | "name": "ipython", 368 | "version": 3 369 | }, 370 | "file_extension": ".py", 371 | "mimetype": "text/x-python", 372 | "name": "python", 373 | "nbconvert_exporter": "python", 374 | "pygments_lexer": "ipython3", 375 | "version": "3.6.2" 376 | } 377 | }, 378 | "nbformat": 4, 379 | "nbformat_minor": 2 380 | } 381 | -------------------------------------------------------------------------------- /[Model] CNN.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import warnings\n", 12 | "warnings.filterwarnings(\"ignore\")\n", 13 | "\n", 14 | "import numpy as np\n", 15 | "import pandas as pd\n", 16 | "from glob import glob\n", 17 | "from datetime import datetime" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": null, 23 | "metadata": { 24 | "collapsed": true 25 | }, 26 | "outputs": [], 27 | "source": [ 28 | "DATA_PATH = \"./data/\"\n", 29 | "TRAIN_PATH = DATA_PATH + \"train.csv\"\n", 30 | "TEST_PATH = DATA_PATH + \"test.csv\"\n", 31 | "WORD_EMBED_PATH = DATA_PATH + \"word_embed.txt\"\n", 32 | "CHAR_EMBED_PATH = DATA_PATH + \"char_embed.txt\"\n", 33 | "QUEST_PATH = DATA_PATH 
+ \"question.csv\"" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": { 40 | "collapsed": true 41 | }, 42 | "outputs": [], 43 | "source": [ 44 | "train_data = pd.read_csv(TRAIN_PATH)\n", 45 | "test_data = pd.read_csv(TEST_PATH)\n", 46 | "question_data = pd.read_csv(QUEST_PATH)\n", 47 | "word_embedding_data = pd.read_csv(WORD_EMBED_PATH, delimiter=\" \", header=None, index_col=0)\n", 48 | "char_embedding_data = pd.read_csv(CHAR_EMBED_PATH, delimiter=\" \", header=None, index_col=0)\n", 49 | "\n", 50 | "question_data[\"words\"] = question_data[\"words\"].str.split(\" \")\n", 51 | "question_data[\"chars\"] = question_data[\"chars\"].str.split(\" \")" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "from keras.preprocessing.text import Tokenizer\n", 61 | "\n", 62 | "MAX_WORD_NUMS = 10000\n", 63 | "\n", 64 | "word_tokenizer = Tokenizer(MAX_WORD_NUMS)\n", 65 | "word_tokenizer.fit_on_texts(question_data[\"words\"])" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "word_embedding_data = np.concatenate(\n", 75 | " (\n", 76 | " np.zeros(shape=(1, word_embedding_data.shape[1]), dtype=np.float64),\n", 77 | " word_embedding_data.loc[list(word_tokenizer.word_index.keys())[:MAX_WORD_NUMS]].values\n", 78 | " ),\n", 79 | " axis=0\n", 80 | ")\n", 81 | "word_embedding_data.shape" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "from keras.preprocessing.sequence import pad_sequences\n", 91 | "\n", 92 | "WORD_SEQ_LEN = 30\n", 93 | "\n", 94 | "def gen_data(data):\n", 95 | " seq_word1 = word_tokenizer.texts_to_sequences(data.merge(question_data, how=\"left\", left_on=\"q1\", right_on=\"qid\")[\"words\"])\n", 96 | " seq_word2 = word_tokenizer.texts_to_sequences(data.merge(question_data, how=\"left\", left_on=\"q2\", right_on=\"qid\")[\"words\"])\n", 97 | " return pad_sequences(seq_word1, maxlen=WORD_SEQ_LEN, padding=\"pre\",truncating=\"pre\"), \\\n", 98 | " pad_sequences(seq_word2, maxlen=WORD_SEQ_LEN, padding=\"pre\",truncating=\"pre\")\n", 99 | "\n", 100 | "word1, word2 = gen_data(train_data)\n", 101 | "test_word1, test_word2 = gen_data(test_data)\n", 102 | "\n", 103 | "word1.shape, word2.shape, test_word1.shape, test_word2.shape" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": { 110 | "collapsed": true 111 | }, 112 | "outputs": [], 113 | "source": [ 114 | "label = train_data[\"label\"].values" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": { 121 | "collapsed": true 122 | }, 123 | "outputs": [], 124 | "source": [ 125 | "from keras.layers import Input, Embedding, Conv1D, GlobalAveragePooling1D, MaxPool1D, Lambda, Dropout, BatchNormalization, Dense, Flatten, K\n", 126 | "from keras.models import Model\n", 127 | "from keras.callbacks import EarlyStopping, ModelCheckpoint\n", 128 | "from keras.layers.merge import concatenate\n", 129 | "\n", 130 | "CONV_LEN_1 = 128\n", 131 | "CONV_LEN_2 = 128\n", 132 | "CONV_LEN_3 = 128\n", 133 | "CONV_LEN_4 = 128\n", 134 | "CONV_LEN_5 = 128\n", 135 | "CONV_LEN_6 = 128\n", 136 | "CONV_LEN = CONV_LEN_1 + CONV_LEN_2 + CONV_LEN_3 + CONV_LEN_4 + CONV_LEN_5 + CONV_LEN_6\n", 137 | "DROP_RATE = 0.6\n", 138 | "DENSE_SIZE = 300\n", 139 | "BATCH_SIZE = 2048\n", 140 | 
"NUM_EPOCHES = 50" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "from sklearn.model_selection import train_test_split\n", 150 | "\n", 151 | "train_word1, dev_word1, train_word2, dev_word2, train_y, dev_y = train_test_split(\n", 152 | " word1, word2, train_data[\"label\"].values,\n", 153 | " test_size=0.2\n", 154 | ")\n", 155 | "\n", 156 | "word_input1 = Input(shape=(WORD_SEQ_LEN,), dtype=\"int32\")\n", 157 | "word_input2 = Input(shape=(WORD_SEQ_LEN,), dtype=\"int32\")\n", 158 | "\n", 159 | "embedding_layer = Embedding(\n", 160 | " input_dim=word_embedding_data.shape[0],\n", 161 | " output_dim=word_embedding_data.shape[1],\n", 162 | " weights=[word_embedding_data],\n", 163 | " input_length=WORD_SEQ_LEN,\n", 164 | " trainable=False\n", 165 | ")\n", 166 | "\n", 167 | "word_vector1 = embedding_layer(word_input1)\n", 168 | "word_vector2 = embedding_layer(word_input2)\n", 169 | "\n", 170 | "def cnn_layer(input1, input2, kernel_size, filters):\n", 171 | " conv = Conv1D(filters=filters, kernel_size=kernel_size, padding=\"same\", activation=\"relu\")\n", 172 | " \n", 173 | " conv_a = conv(input1)\n", 174 | " conv_a = MaxPool1D(pool_size=WORD_SEQ_LEN, strides=WORD_SEQ_LEN, padding=\"same\")(conv_a)\n", 175 | " conv_a = Flatten()(conv_a)\n", 176 | " \n", 177 | " conv_b = conv(input2)\n", 178 | " conv_b = MaxPool1D(pool_size=WORD_SEQ_LEN, strides=WORD_SEQ_LEN, padding=\"same\")(conv_b)\n", 179 | " conv_b = Flatten()(conv_b)\n", 180 | " return conv_a, conv_b\n", 181 | "\n", 182 | "conv1a, conv1b = cnn_layer(word_vector1, word_vector2, kernel_size=1, filters=CONV_LEN_1)\n", 183 | "conv2a, conv2b = cnn_layer(word_vector1, word_vector2, kernel_size=2, filters=CONV_LEN_2)\n", 184 | "conv3a, conv3b = cnn_layer(word_vector1, word_vector2, kernel_size=3, filters=CONV_LEN_3)\n", 185 | "conv4a, conv4b = cnn_layer(word_vector1, word_vector2, kernel_size=4, filters=CONV_LEN_4)\n", 186 | "conv5a, conv5b = cnn_layer(word_vector1, word_vector2, kernel_size=5, filters=CONV_LEN_5)\n", 187 | "conv6a, conv6b = cnn_layer(word_vector1, word_vector2, kernel_size=6, filters=CONV_LEN_6)\n", 188 | "\n", 189 | "merge_a = concatenate([conv1a, conv2a, conv3a, conv4a, conv5a, conv6a])\n", 190 | "merge_b = concatenate([conv1b, conv2b, conv3b, conv4b, conv5b, conv6b])\n", 191 | "diff = Lambda(lambda x: K.abs(x[0] - x[1]))([merge_a, merge_b])\n", 192 | "mult = Lambda(lambda x: x[0] * x[1])([merge_a, merge_b])\n", 193 | "merge = concatenate([diff, mult])\n", 194 | "\n", 195 | "x = Dropout(DROP_RATE)(merge)\n", 196 | "x = BatchNormalization()(x)\n", 197 | "x = Dense(DENSE_SIZE, activation=\"relu\")(x)\n", 198 | "\n", 199 | "x = Dropout(DROP_RATE)(x)\n", 200 | "x = BatchNormalization()(x)\n", 201 | "pred = Dense(1, activation=\"sigmoid\")(x)\n", 202 | "\n", 203 | "model = Model(\n", 204 | " inputs = [word_input1, word_input2],\n", 205 | " outputs = pred\n", 206 | ")\n", 207 | "model.compile(\n", 208 | " optimizer=\"adam\",\n", 209 | " loss=\"binary_crossentropy\",\n", 210 | " metrics=[\"acc\"]\n", 211 | ")\n", 212 | "\n", 213 | "early_stop = EarlyStopping(\"val_loss\", patience=10)\n", 214 | "check_point = ModelCheckpoint(\n", 215 | " \"./log/%s.cnn.{epoch:03d}.hdf5\" % (datetime.now().strftime(\"%Y%m%d-%H%M%S\")),\n", 216 | " monitor=\"val_loss\",\n", 217 | " save_best_only=True,\n", 218 | " save_weights_only=True\n", 219 | ")\n", 220 | "\n", 221 | "model_res = model.fit(\n", 222 | " x=[train_word1, train_word2],\n", 223 | " 
y=train_y,\n", 224 | " batch_size=BATCH_SIZE,\n", 225 | " epochs=NUM_EPOCHES,\n", 226 | " validation_data=([dev_word1, dev_word2], dev_y),\n", 227 | " shuffle=True,\n", 228 | " callbacks=[early_stop, check_point]\n", 229 | ")\n", 230 | "\n", 231 | "test_pred = model.predict([test_word1, test_word2], batch_size=BATCH_SIZE)\n", 232 | "pd.DataFrame(test_pred, columns=[\"y_pre\"]).to_csv(\"./result/pred_last.csv\", index=False)\n", 233 | "\n", 234 | "print(\"load model %s\" % (glob(\"./log/*.hdf5\")[-1].replace(\"\\\\\", \"/\"),))\n", 235 | "model.load_weights(glob(\"./log/*.hdf5\")[-1].replace(\"\\\\\", \"/\"))\n", 236 | "test_pred = model.predict([test_word1, test_word2], batch_size=BATCH_SIZE)\n", 237 | "pd.DataFrame(test_pred, columns=[\"y_pre\"]).to_csv(\"./result/pred_best.csv\", index=False)" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": null, 243 | "metadata": { 244 | "collapsed": true 245 | }, 246 | "outputs": [], 247 | "source": [ 248 | "# from sklearn.model_selection import train_test_split\n", 249 | "\n", 250 | "# train_word1, dev_word1, train_word2, dev_word2, train_y, dev_y = train_test_split(\n", 251 | "# word1, word2, train_data[\"label\"].values,\n", 252 | "# test_size=0.2\n", 253 | "# )\n", 254 | "\n", 255 | "# word_input1 = Input(shape=(WORD_SEQ_LEN,), dtype=\"int32\")\n", 256 | "# word_input2 = Input(shape=(WORD_SEQ_LEN,), dtype=\"int32\")\n", 257 | "\n", 258 | "# embedding_layer = Embedding(\n", 259 | "# input_dim=word_embedding_data.shape[0],\n", 260 | "# output_dim=word_embedding_data.shape[1],\n", 261 | "# weights=[word_embedding_data],\n", 262 | "# input_length=WORD_SEQ_LEN,\n", 263 | "# trainable=False\n", 264 | "# )\n", 265 | "\n", 266 | "# word_vector1 = embedding_layer(word_input1)\n", 267 | "# word_vector2 = embedding_layer(word_input2)\n", 268 | "\n", 269 | "# conv1 = Conv1D(filters=CONV_LEN_1, kernel_size=1, padding=\"same\", activation=\"relu\")\n", 270 | "# conv1a = conv1(word_vector1)\n", 271 | "# conv1a = GlobalAveragePooling1D()(conv1a)\n", 272 | "# conv1b = conv1(word_vector2)\n", 273 | "# conv1b = GlobalAveragePooling1D()(conv1b)\n", 274 | "\n", 275 | "# conv2 = Conv1D(filters=CONV_LEN_2, kernel_size=2, padding=\"same\", activation=\"relu\")\n", 276 | "# conv2a = conv2(word_vector1)\n", 277 | "# conv2a = GlobalAveragePooling1D()(conv2a)\n", 278 | "# conv2b = conv2(word_vector2)\n", 279 | "# conv2b = GlobalAveragePooling1D()(conv2b)\n", 280 | "\n", 281 | "# conv3 = Conv1D(filters=CONV_LEN_3, kernel_size=3, padding=\"same\", activation=\"relu\")\n", 282 | "# conv3a = conv3(word_vector1)\n", 283 | "# conv3a = GlobalAveragePooling1D()(conv3a)\n", 284 | "# conv3b = conv3(word_vector2)\n", 285 | "# conv3b = GlobalAveragePooling1D()(conv3b)\n", 286 | "\n", 287 | "# conv4 = Conv1D(filters=CONV_LEN_4, kernel_size=4, padding=\"same\", activation=\"relu\")\n", 288 | "# conv4a = conv4(word_vector1)\n", 289 | "# conv4a = GlobalAveragePooling1D()(conv4a)\n", 290 | "# conv4b = conv4(word_vector2)\n", 291 | "# conv4b = GlobalAveragePooling1D()(conv4b)\n", 292 | "\n", 293 | "# conv5 = Conv1D(filters=CONV_LEN_5, kernel_size=5, padding=\"same\", activation=\"relu\")\n", 294 | "# conv5a = conv5(word_vector1)\n", 295 | "# conv5a = GlobalAveragePooling1D()(conv5a)\n", 296 | "# conv5b = conv5(word_vector2)\n", 297 | "# conv5b = GlobalAveragePooling1D()(conv5b)\n", 298 | "\n", 299 | "# conv6 = Conv1D(filters=CONV_LEN_6, kernel_size=6, padding=\"same\", activation=\"relu\")\n", 300 | "# conv6a = conv6(word_vector1)\n", 301 | "# conv6a = 
GlobalAveragePooling1D()(conv6a)\n", 302 | "# conv6b = conv6(word_vector2)\n", 303 | "# conv6b = GlobalAveragePooling1D()(conv6b)\n", 304 | "\n", 305 | "# merge_a = concatenate([conv1a, conv2a, conv3a, conv4a, conv5a, conv6a])\n", 306 | "# merge_b = concatenate([conv1b, conv2b, conv3b, conv4b, conv5b, conv6b])\n", 307 | "# # merge = concatenate([merge_a, merge_b])\n", 308 | "\n", 309 | "# diff = Lambda(lambda x: K.abs(x[0] - x[1]), output_shape=(CONV_LEN,))([merge_a, merge_b])\n", 310 | "# mult = Lambda(lambda x: x[0] * x[1], output_shape=(CONV_LEN,))([merge_a, merge_b])\n", 311 | "# merge = concatenate([diff, mult])\n", 312 | "\n", 313 | "# x = Dropout(DROP_RATE)(merge)\n", 314 | "# x = BatchNormalization()(x)\n", 315 | "# x = Dense(DENSE_SIZE, activation=\"relu\")(x)\n", 316 | "\n", 317 | "# x = Dropout(DROP_RATE)(x)\n", 318 | "# x = BatchNormalization()(x)\n", 319 | "# pred = Dense(1, activation=\"sigmoid\")(x)\n", 320 | "\n", 321 | "# model = Model(\n", 322 | "# inputs = [word_input1, word_input2],\n", 323 | "# outputs = pred\n", 324 | "# )\n", 325 | "# model.compile(\n", 326 | "# optimizer=\"adam\",\n", 327 | "# loss=\"binary_crossentropy\",\n", 328 | "# metrics=[\"acc\"]\n", 329 | "# )\n", 330 | "\n", 331 | "# early_stop = EarlyStopping(\"val_loss\", patience=10)\n", 332 | "# check_point = ModelCheckpoint(\n", 333 | "# \"./log/%s.cnn.{epoch:03d}.hdf5\" % (datetime.now().strftime(\"%Y%m%d-%H%M%S\")),\n", 334 | "# monitor=\"val_loss\",\n", 335 | "# save_best_only=True,\n", 336 | "# save_weights_only=True\n", 337 | "# )\n", 338 | "\n", 339 | "# model_res = model.fit(\n", 340 | "# x=[train_word1, train_word2],\n", 341 | "# y=train_y,\n", 342 | "# batch_size=BATCH_SIZE,\n", 343 | "# epochs=NUM_EPOCHES,\n", 344 | "# validation_data=([dev_word1, dev_word2], dev_y),\n", 345 | "# shuffle=True,\n", 346 | "# callbacks=[early_stop, check_point]\n", 347 | "# )\n", 348 | "\n", 349 | "# test_pred = model.predict([test_word1, test_word2], batch_size=BATCH_SIZE)\n", 350 | "# pd.DataFrame(test_pred, columns=[\"y_pre\"]).to_csv(\"./result/pred_last.csv\", index=False)\n", 351 | "\n", 352 | "# print(\"load model %s\" % (glob(\"./log/*.hdf5\")[-1].replace(\"\\\\\", \"/\"),))\n", 353 | "# model.load_weights(glob(\"./log/*.hdf5\")[-1].replace(\"\\\\\", \"/\"))\n", 354 | "# test_pred = model.predict([test_word1, test_word2], batch_size=BATCH_SIZE)\n", 355 | "# pd.DataFrame(test_pred, columns=[\"y_pre\"]).to_csv(\"./result/pred_best.csv\", index=False)" 356 | ] 357 | }, 358 | { 359 | "cell_type": "code", 360 | "execution_count": null, 361 | "metadata": { 362 | "collapsed": true 363 | }, 364 | "outputs": [], 365 | "source": [ 366 | "# from sklearn.model_selection import StratifiedKFold\n", 367 | "\n", 368 | "# pred_collect = []\n", 369 | "\n", 370 | "# for i, (train_index, dev_index) in enumerate(StratifiedKFold(n_splits=10).split(X=word1, y=label)):\n", 371 | "# train_word1, train_word2, train_y = word1[train_index, :], word2[train_index, :], label[train_index]\n", 372 | "# dev_word1, dev_word2, dev_y = word1[dev_index, :], word2[dev_index, :], label[dev_index]\n", 373 | " \n", 374 | "# word_input1 = Input(shape=(WORD_SEQ_LEN,), dtype=\"int32\")\n", 375 | "# word_input2 = Input(shape=(WORD_SEQ_LEN,), dtype=\"int32\")\n", 376 | " \n", 377 | "# embedding_layer = Embedding(\n", 378 | "# input_dim=word_embedding_data.shape[0],\n", 379 | "# output_dim=word_embedding_data.shape[1],\n", 380 | "# weights=[word_embedding_data],\n", 381 | "# input_length=WORD_SEQ_LEN,\n", 382 | "# trainable=False\n", 383 | "# )\n", 
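"#     # pre-trained word vectors are loaded as fixed weights (trainable=False), so they are not updated during training\n",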
384 | " \n", 385 | "# word_vector1 = embedding_layer(word_input1)\n", 386 | "# word_vector2 = embedding_layer(word_input2)\n", 387 | " \n", 388 | "# conv1 = Conv1D(filters=128, kernel_size=1, padding=\"same\", activation=\"relu\")\n", 389 | "# conv1a = conv1(word_vector1)\n", 390 | "# conv1a = GlobalAveragePooling1D()(conv1a)\n", 391 | "# conv1b = conv1(word_vector2)\n", 392 | "# conv1b = GlobalAveragePooling1D()(conv1b)\n", 393 | " \n", 394 | "# conv2 = Conv1D(filters=128, kernel_size=2, padding=\"same\", activation=\"relu\")\n", 395 | "# conv2a = conv2(word_vector1)\n", 396 | "# conv2a = GlobalAveragePooling1D()(conv2a)\n", 397 | "# conv2b = conv2(word_vector2)\n", 398 | "# conv2b = GlobalAveragePooling1D()(conv2b)\n", 399 | " \n", 400 | "# conv3 = Conv1D(filters=128, kernel_size=3, padding=\"same\", activation=\"relu\")\n", 401 | "# conv3a = conv3(word_vector1)\n", 402 | "# conv3a = GlobalAveragePooling1D()(conv3a)\n", 403 | "# conv3b = conv3(word_vector2)\n", 404 | "# conv3b = GlobalAveragePooling1D()(conv3b)\n", 405 | " \n", 406 | "# conv4 = Conv1D(filters=128, kernel_size=4, padding=\"same\", activation=\"relu\")\n", 407 | "# conv4a = conv4(word_vector1)\n", 408 | "# conv4a = GlobalAveragePooling1D()(conv4a)\n", 409 | "# conv4b = conv4(word_vector2)\n", 410 | "# conv4b = GlobalAveragePooling1D()(conv4b)\n", 411 | " \n", 412 | "# conv5 = Conv1D(filters=128, kernel_size=5, padding=\"same\", activation=\"relu\")\n", 413 | "# conv5a = conv5(word_vector1)\n", 414 | "# conv5a = GlobalAveragePooling1D()(conv5a)\n", 415 | "# conv5b = conv5(word_vector2)\n", 416 | "# conv5b = GlobalAveragePooling1D()(conv5b)\n", 417 | " \n", 418 | "# conv6 = Conv1D(filters=128, kernel_size=6, padding=\"same\", activation=\"relu\")\n", 419 | "# conv6a = conv6(word_vector1)\n", 420 | "# conv6a = GlobalAveragePooling1D()(conv6a)\n", 421 | "# conv6b = conv6(word_vector2)\n", 422 | "# conv6b = GlobalAveragePooling1D()(conv6b)\n", 423 | " \n", 424 | "# merge_a = concatenate([conv1a, conv2a, conv3a, conv4a, conv5a, conv6a])\n", 425 | "# merge_b = concatenate([conv1b, conv2b, conv3b, conv4b, conv5b, conv6b])\n", 426 | " \n", 427 | "# diff = Lambda(lambda x: x[0] - x[1], output_shape=(CONV_LEN,))([merge_a, merge_b])\n", 428 | "# mult = Lambda(lambda x: x[0] * x[1], output_shape=(CONV_LEN,))([merge_a, merge_b])\n", 429 | " \n", 430 | "# merge = concatenate([diff, mult])\n", 431 | " \n", 432 | "# x = Dropout(DROP_RATE)(merge)\n", 433 | "# x = BatchNormalization()(x)\n", 434 | "# x = Dense(DENSE_SIZE, activation=\"relu\")(x)\n", 435 | " \n", 436 | "# x = Dropout(DROP_RATE)(x)\n", 437 | "# x = BatchNormalization()(x)\n", 438 | "# pred = Dense(1, activation=\"sigmoid\")(x)\n", 439 | " \n", 440 | "# model = Model(\n", 441 | "# inputs = [word_input1, word_input2],\n", 442 | "# outputs = pred\n", 443 | "# )\n", 444 | "# model.compile(\n", 445 | "# optimizer=\"adam\",\n", 446 | "# loss=\"binary_crossentropy\",\n", 447 | "# metrics=[\"acc\"]\n", 448 | "# )\n", 449 | " \n", 450 | "# early_stop = EarlyStopping(\"val_loss\", patience=10)\n", 451 | "# check_point = ModelCheckpoint(\n", 452 | "# \"./log/cnn_%02d.{epoch:02d}_{val_loss:.3f}.hdf5\" % (i + 1),\n", 453 | "# monitor=\"val_loss\",\n", 454 | "# save_best_only=True,\n", 455 | "# save_weights_only=True\n", 456 | "# )\n", 457 | " \n", 458 | "# model_res = model.fit(\n", 459 | "# x=[train_word1, train_word2],\n", 460 | "# y=train_y,\n", 461 | "# batch_size=BATCH_SIZE,\n", 462 | "# epochs=NUM_EPOCHES,\n", 463 | "# validation_data=([dev_word1, dev_word2], dev_y),\n", 464 | "# 
shuffle=True,\n", 465 | "# callbacks=[early_stop, check_point]\n", 466 | "# )\n", 467 | " \n", 468 | "# print(\"load model %s\" % (glob(\"./log/*.hdf5\")[-1].replace(\"\\\\\", \"/\"),))\n", 469 | "# model.load_weights(glob(\"./log/*.hdf5\")[-1].replace(\"\\\\\", \"/\"))\n", 470 | "\n", 471 | "# test_pred = model.predict([test_word1, test_word2], batch_size=BATCH_SIZE)\n", 472 | "# pred_collect.append(pd.DataFrame(test_pred, columns=[\"y_pre\"]))\n", 473 | "\n", 474 | "# pd.DataFrame(pd.concat(pred_collect, axis=1).mean(axis=1), columns=[\"y_pre\"]).to_csv(\"./result/pred.csv\", index=False)" 475 | ] 476 | } 477 | ], 478 | "metadata": { 479 | "kernelspec": { 480 | "display_name": "Python 3", 481 | "language": "python", 482 | "name": "python3" 483 | }, 484 | "language_info": { 485 | "codemirror_mode": { 486 | "name": "ipython", 487 | "version": 3 488 | }, 489 | "file_extension": ".py", 490 | "mimetype": "text/x-python", 491 | "name": "python", 492 | "nbconvert_exporter": "python", 493 | "pygments_lexer": "ipython3", 494 | "version": "3.6.2" 495 | } 496 | }, 497 | "nbformat": 4, 498 | "nbformat_minor": 2 499 | } 500 | -------------------------------------------------------------------------------- /[Model] Multi LSTM CNN v0 word.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import warnings\n", 12 | "warnings.filterwarnings(\"ignore\")\n", 13 | "\n", 14 | "import numpy as np\n", 15 | "import pandas as pd\n", 16 | "from glob import glob\n", 17 | "from datetime import datetime" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": null, 23 | "metadata": { 24 | "collapsed": true 25 | }, 26 | "outputs": [], 27 | "source": [ 28 | "DATA_PATH = \"./data/\"\n", 29 | "TRAIN_PATH = DATA_PATH + \"train.csv\"\n", 30 | "TEST_PATH = DATA_PATH + \"test.csv\"\n", 31 | "WORD_EMBED_PATH = DATA_PATH + \"word_embed.txt\"\n", 32 | "CHAR_EMBED_PATH = DATA_PATH + \"char_embed.txt\"\n", 33 | "QUEST_PATH = DATA_PATH + \"question.csv\"\n", 34 | "\n", 35 | "train_data = pd.read_csv(TRAIN_PATH)\n", 36 | "test_data = pd.read_csv(TEST_PATH)\n", 37 | "question_data = pd.read_csv(QUEST_PATH)\n", 38 | "word_embedding_data = pd.read_csv(WORD_EMBED_PATH, delimiter=\" \", header=None, index_col=0)\n", 39 | "char_embedding_data = pd.read_csv(CHAR_EMBED_PATH, delimiter=\" \", header=None, index_col=0)\n", 40 | "\n", 41 | "question_data[\"words\"] = question_data[\"words\"].str.split(\" \")\n", 42 | "question_data[\"chars\"] = question_data[\"chars\"].str.split(\" \")\n", 43 | "\n", 44 | "label = train_data[\"label\"].values" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "from keras.preprocessing.text import Tokenizer\n", 54 | "\n", 55 | "MAX_WORD_NUMS = 10000\n", 56 | "\n", 57 | "word_tokenizer = Tokenizer(MAX_WORD_NUMS)\n", 58 | "word_tokenizer.fit_on_texts(question_data[\"words\"])\n", 59 | "\n", 60 | "word_embedding_data = np.concatenate(\n", 61 | " (\n", 62 | " np.zeros(shape=(1, word_embedding_data.shape[1]), dtype=np.float64),\n", 63 | " word_embedding_data.loc[list(word_tokenizer.word_index.keys())[:MAX_WORD_NUMS]].values\n", 64 | " ),\n", 65 | " axis=0\n", 66 | ")\n", 67 | "word_embedding_data.shape" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | 
"outputs": [], 75 | "source": [ 76 | "from keras.preprocessing.sequence import pad_sequences\n", 77 | "\n", 78 | "WORD_SEQ_LEN = 30\n", 79 | "\n", 80 | "def gen_data(data):\n", 81 | " seq_word1 = word_tokenizer.texts_to_sequences(data.merge(question_data, how=\"left\", left_on=\"q1\", right_on=\"qid\")[\"words\"])\n", 82 | " seq_word2 = word_tokenizer.texts_to_sequences(data.merge(question_data, how=\"left\", left_on=\"q2\", right_on=\"qid\")[\"words\"])\n", 83 | " return pad_sequences(seq_word1, maxlen=WORD_SEQ_LEN, padding=\"pre\",truncating=\"pre\"), \\\n", 84 | " pad_sequences(seq_word2, maxlen=WORD_SEQ_LEN, padding=\"pre\",truncating=\"pre\")\n", 85 | "\n", 86 | "word1, word2 = gen_data(train_data)\n", 87 | "test_word1, test_word2 = gen_data(test_data)\n", 88 | "\n", 89 | "word1.shape, word2.shape, test_word1.shape, test_word2.shape" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "metadata": { 96 | "collapsed": true 97 | }, 98 | "outputs": [], 99 | "source": [ 100 | "from keras.models import Model\n", 101 | "from keras.layers.merge import concatenate\n", 102 | "from keras.callbacks import EarlyStopping, ModelCheckpoint\n", 103 | "from keras.layers import LSTM, Bidirectional, TimeDistributed\n", 104 | "from keras.layers import Conv1D, MaxPool1D, GlobalAveragePooling1D\n", 105 | "from keras.layers import Input, Embedding, Dropout, BatchNormalization, Dense, Flatten, Lambda, K" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "metadata": { 112 | "collapsed": true 113 | }, 114 | "outputs": [], 115 | "source": [ 116 | "# general\n", 117 | "NUM_EPOCHES = 50\n", 118 | "BATCH_SIZE = 1024\n", 119 | "DENSE_SIZE = 300 # 512\n", 120 | "DROP_RATE = 0.3\n", 121 | "\n", 122 | "# cnn\n", 123 | "CONV_LEN_1 = 128\n", 124 | "CONV_LEN_2 = 128\n", 125 | "CONV_LEN_3 = 128\n", 126 | "CONV_LEN_4 = 128\n", 127 | "CONV_LEN_5 = 128\n", 128 | "CONV_LEN_6 = 128\n", 129 | "CONV_LEN = CONV_LEN_1 + CONV_LEN_2 + CONV_LEN_3 + CONV_LEN_4 + CONV_LEN_5 + CONV_LEN_6\n", 130 | "\n", 131 | "# lstm\n", 132 | "LSTM_SIZE_1 = 256\n", 133 | "LSTM_SIZE_2 = 256\n", 134 | "DROP_RATE_LSTM = 0.3" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": { 141 | "collapsed": true 142 | }, 143 | "outputs": [], 144 | "source": [ 145 | "def cnn_layer_1(input1, input2, kernel_size, filters):\n", 146 | " conv = Conv1D(filters=filters, kernel_size=kernel_size, padding=\"same\", activation=\"relu\")\n", 147 | " \n", 148 | " conv_a = conv(input1)\n", 149 | " conv_a = GlobalAveragePooling1D()(conv_a)\n", 150 | " \n", 151 | " conv_b = conv(input2)\n", 152 | " conv_b = GlobalAveragePooling1D()(conv_b)\n", 153 | " return conv_a, conv_b" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": null, 159 | "metadata": { 160 | "collapsed": true 161 | }, 162 | "outputs": [], 163 | "source": [ 164 | "# from sklearn.model_selection import train_test_split\n", 165 | "\n", 166 | "# train_word1, dev_word1, train_word2, dev_word2, train_y, dev_y = train_test_split(\n", 167 | "# word1, word2, train_data[\"label\"].values,\n", 168 | "# test_size=0.2\n", 169 | "# )\n", 170 | "\n", 171 | "# word_input1 = Input(shape=(WORD_SEQ_LEN,), dtype=\"int32\")\n", 172 | "# word_input2 = Input(shape=(WORD_SEQ_LEN,), dtype=\"int32\")\n", 173 | "\n", 174 | "# embedding_layer = Embedding(\n", 175 | "# input_dim=word_embedding_data.shape[0],\n", 176 | "# output_dim=word_embedding_data.shape[1],\n", 177 | "# 
weights=[word_embedding_data],\n", 178 | "# input_length=WORD_SEQ_LEN,\n", 179 | "# trainable=False\n", 180 | "# )\n", 181 | "\n", 182 | "# word_vector1 = embedding_layer(word_input1)\n", 183 | "# word_vector2 = embedding_layer(word_input2)\n", 184 | "\n", 185 | "# lstm_layer1 = LSTM(LSTM_SIZE_1, dropout=DROP_RATE, recurrent_dropout=DROP_RATE, return_sequences=True)\n", 186 | "# word_first_1 = lstm_layer1(word_vector1)\n", 187 | "# word_first_1 = Dropout(DROP_RATE)(word_first_1)\n", 188 | "# word_first_2 = lstm_layer1(word_vector2)\n", 189 | "# word_first_2 = Dropout(DROP_RATE)(word_first_2)\n", 190 | "\n", 191 | "# lstm_layer2 = LSTM(LSTM_SIZE_2, dropout=DROP_RATE, recurrent_dropout=DROP_RATE, return_sequences=True)\n", 192 | "# word_second_1 = lstm_layer2(word_first_1)\n", 193 | "# word_second_2 = lstm_layer2(word_first_2)\n", 194 | "\n", 195 | "# conv1a, conv1b = cnn_layer_1(word_second_1, word_second_2, kernel_size=1, filters=CONV_LEN_1)\n", 196 | "# conv2a, conv2b = cnn_layer_1(word_second_1, word_second_2, kernel_size=2, filters=CONV_LEN_2)\n", 197 | "# conv3a, conv3b = cnn_layer_1(word_second_1, word_second_2, kernel_size=3, filters=CONV_LEN_3)\n", 198 | "# conv4a, conv4b = cnn_layer_1(word_second_1, word_second_2, kernel_size=4, filters=CONV_LEN_4)\n", 199 | "# conv5a, conv5b = cnn_layer_1(word_second_1, word_second_2, kernel_size=5, filters=CONV_LEN_5)\n", 200 | "# conv6a, conv6b = cnn_layer_1(word_second_1, word_second_2, kernel_size=6, filters=CONV_LEN_6)\n", 201 | "\n", 202 | "# merge_a = concatenate([conv1a, conv2a, conv3a, conv4a, conv5a, conv6a])\n", 203 | "# merge_b = concatenate([conv1b, conv2b, conv3b, conv4b, conv5b, conv6b])\n", 204 | "# diff = Lambda(lambda x: K.abs(x[0] - x[1]))([merge_a, merge_b])\n", 205 | "# mult = Lambda(lambda x: x[0] * x[1])([merge_a, merge_b])\n", 206 | "# merge = concatenate([diff, mult])\n", 207 | "\n", 208 | "# x = Dropout(DROP_RATE)(merge)\n", 209 | "# x = BatchNormalization()(x)\n", 210 | "\n", 211 | "# x = Dense(DENSE_SIZE, activation=\"relu\")(x)\n", 212 | "# x = Dropout(DROP_RATE)(x)\n", 213 | "# x = BatchNormalization()(x)\n", 214 | "\n", 215 | "# pred = Dense(1, activation=\"sigmoid\")(x)\n", 216 | "\n", 217 | "# model = Model(inputs=[word_input1, word_input2], outputs=pred)\n", 218 | "# model.compile(\n", 219 | "# optimizer=\"nadam\",\n", 220 | "# loss=\"binary_crossentropy\",\n", 221 | "# metrics=[\"acc\"]\n", 222 | "# )\n", 223 | "\n", 224 | "# early_stopping = EarlyStopping(\"val_loss\", patience=10)\n", 225 | "# check_point = ModelCheckpoint(\n", 226 | "# \"./log/%s.multi_lstm_cnn.{epoch:03d}.hdf5\" % (datetime.now().strftime(\"%Y%m%d-%H%M%S\")),\n", 227 | "# monitor=\"val_loss\",\n", 228 | "# save_best_only=True,\n", 229 | "# save_weights_only=True\n", 230 | "# )\n", 231 | "\n", 232 | "# train_res = model.fit(\n", 233 | "# x=[train_word1, train_word2],\n", 234 | "# y=train_y,\n", 235 | "# batch_size=BATCH_SIZE,\n", 236 | "# epochs=NUM_EPOCHES,\n", 237 | "# validation_data=([dev_word1, dev_word2], dev_y),\n", 238 | "# shuffle=True,\n", 239 | "# callbacks=[early_stopping, check_point]\n", 240 | "# )\n", 241 | "\n", 242 | "# pred_last = model.predict([test_word1, test_word2], batch_size=BATCH_SIZE)\n", 243 | "# pd.DataFrame(pred_last, columns=[\"y_pre\"]).to_csv(\n", 244 | "# \"./result/%s-multilstm_cnn_pred_last.csv\" % (datetime.now().strftime(\"%Y%m%d-%H%M%S\")),\n", 245 | "# index=False\n", 246 | "# )\n", 247 | "\n", 248 | "# print(\"load model %s\" % (glob(\"./log/*.hdf5\")[-1].replace(\"\\\\\", \"/\"),))\n", 249 | "# 
model.load_weights(glob(\"./log/*.hdf5\")[-1].replace(\"\\\\\", \"/\"))\n", 250 | "# pred_best = model.predict([test_word1, test_word2], batch_size=BATCH_SIZE)\n", 251 | "# pd.DataFrame(pred_best, columns=[\"y_pre\"]).to_csv(\n", 252 | "# \"./result/%s-multilstm_cnn_pred_best.csv\" % (datetime.now().strftime(\"%Y%m%d-%H%M%S\")),\n", 253 | "# index=False\n", 254 | "# )" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": null, 260 | "metadata": {}, 261 | "outputs": [], 262 | "source": [ 263 | "from sklearn.model_selection import StratifiedKFold\n", 264 | "\n", 265 | "best_results = []\n", 266 | "last_results = []\n", 267 | "\n", 268 | "for i, (train_index, dev_index) in enumerate(StratifiedKFold(n_splits=10).split(X=word1, y=label)):\n", 269 | " train_word1, train_word2, train_y = word1[train_index, :], word2[train_index, :], label[train_index]\n", 270 | " dev_word1, dev_word2, dev_y = word1[dev_index, :], word2[dev_index, :], label[dev_index]\n", 271 | " \n", 272 | " word_input1 = Input(shape=(WORD_SEQ_LEN,), dtype=\"int32\")\n", 273 | " word_input2 = Input(shape=(WORD_SEQ_LEN,), dtype=\"int32\")\n", 274 | "\n", 275 | " embedding_layer = Embedding(\n", 276 | " input_dim=word_embedding_data.shape[0],\n", 277 | " output_dim=word_embedding_data.shape[1],\n", 278 | " weights=[word_embedding_data],\n", 279 | " input_length=WORD_SEQ_LEN,\n", 280 | " trainable=False\n", 281 | " )\n", 282 | "\n", 283 | " word_vector1 = embedding_layer(word_input1)\n", 284 | " word_vector2 = embedding_layer(word_input2)\n", 285 | "\n", 286 | " lstm_layer1 = LSTM(LSTM_SIZE_1, dropout=DROP_RATE, recurrent_dropout=DROP_RATE, return_sequences=True)\n", 287 | " word_first_1 = lstm_layer1(word_vector1)\n", 288 | " word_first_1 = Dropout(DROP_RATE)(word_first_1)\n", 289 | " word_first_2 = lstm_layer1(word_vector2)\n", 290 | " word_first_2 = Dropout(DROP_RATE)(word_first_2)\n", 291 | "\n", 292 | " lstm_layer2 = LSTM(LSTM_SIZE_2, dropout=DROP_RATE, recurrent_dropout=DROP_RATE, return_sequences=True)\n", 293 | " word_second_1 = lstm_layer2(word_first_1)\n", 294 | " word_second_2 = lstm_layer2(word_first_2)\n", 295 | "\n", 296 | " conv1a, conv1b = cnn_layer_1(word_second_1, word_second_2, kernel_size=1, filters=CONV_LEN_1)\n", 297 | " conv2a, conv2b = cnn_layer_1(word_second_1, word_second_2, kernel_size=2, filters=CONV_LEN_2)\n", 298 | " conv3a, conv3b = cnn_layer_1(word_second_1, word_second_2, kernel_size=3, filters=CONV_LEN_3)\n", 299 | " conv4a, conv4b = cnn_layer_1(word_second_1, word_second_2, kernel_size=4, filters=CONV_LEN_4)\n", 300 | " conv5a, conv5b = cnn_layer_1(word_second_1, word_second_2, kernel_size=5, filters=CONV_LEN_5)\n", 301 | " conv6a, conv6b = cnn_layer_1(word_second_1, word_second_2, kernel_size=6, filters=CONV_LEN_6)\n", 302 | "\n", 303 | " merge_a = concatenate([conv1a, conv2a, conv3a, conv4a, conv5a, conv6a])\n", 304 | " merge_b = concatenate([conv1b, conv2b, conv3b, conv4b, conv5b, conv6b])\n", 305 | " diff = Lambda(lambda x: K.abs(x[0] - x[1]))([merge_a, merge_b])\n", 306 | " mult = Lambda(lambda x: x[0] * x[1])([merge_a, merge_b])\n", 307 | " merge = concatenate([diff, mult])\n", 308 | "\n", 309 | " x = Dropout(DROP_RATE)(merge)\n", 310 | " x = BatchNormalization()(x)\n", 311 | "\n", 312 | " x = Dense(DENSE_SIZE, activation=\"relu\")(x)\n", 313 | " x = Dropout(DROP_RATE)(x)\n", 314 | " x = BatchNormalization()(x)\n", 315 | "\n", 316 | " pred = Dense(1, activation=\"sigmoid\")(x)\n", 317 | "\n", 318 | " model = Model(inputs=[word_input1, word_input2], outputs=pred)\n", 
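"    # sigmoid output trained with binary cross-entropy on the 0/1 question-match label; the embedding layer is frozen, so only the LSTM, convolution and dense weights are updated\n",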
319 | " model.compile(\n", 320 | " optimizer=\"nadam\",\n", 321 | " loss=\"binary_crossentropy\",\n", 322 | " metrics=[\"acc\"]\n", 323 | " )\n", 324 | "\n", 325 | " early_stopping = EarlyStopping(\"val_loss\", patience=10)\n", 326 | " check_point = ModelCheckpoint(\n", 327 | " \"./log/%s.multi_lstm_cnn.{epoch:03d}.hdf5\" % (datetime.now().strftime(\"%Y%m%d-%H%M%S\")),\n", 328 | " monitor=\"val_loss\",\n", 329 | " save_best_only=True,\n", 330 | " save_weights_only=True\n", 331 | " )\n", 332 | "\n", 333 | " train_res = model.fit(\n", 334 | " x=[train_word1, train_word2],\n", 335 | " y=train_y,\n", 336 | " batch_size=BATCH_SIZE,\n", 337 | " epochs=NUM_EPOCHES,\n", 338 | " validation_data=([dev_word1, dev_word2], dev_y),\n", 339 | " shuffle=True,\n", 340 | " callbacks=[early_stopping, check_point]\n", 341 | " )\n", 342 | "\n", 343 | " pred_last = model.predict([test_word1, test_word2], batch_size=BATCH_SIZE)\n", 344 | " last_results.append(pd.DataFrame(pred_last, columns=[\"y_pre\"]))\n", 345 | " \n", 346 | "\n", 347 | " print(\"load model %s\" % (glob(\"./log/*.hdf5\")[-1].replace(\"\\\\\", \"/\"),))\n", 348 | " model.load_weights(glob(\"./log/*.hdf5\")[-1].replace(\"\\\\\", \"/\"))\n", 349 | " pred_best = model.predict([test_word1, test_word2], batch_size=BATCH_SIZE)\n", 350 | " best_results.append(pd.DataFrame(pred_best, columns=[\"y_pre\"]))\n", 351 | "\n", 352 | "pd.DataFrame(pd.concat(last_results, axis=1).mean(axis=1), columns=[\"y_pre\"]).to_csv(\n", 353 | " \"./result/%s-multilstm_cnn_pred_last.csv\" % (datetime.now().strftime(\"%Y%m%d-%H%M%S\")),\n", 354 | " index=False\n", 355 | ")\n", 356 | "pd.DataFrame(pd.concat(best_results, axis=1).mean(axis=1), columns=[\"y_pre\"]).to_csv(\n", 357 | " \"./result/%s-multilstm_cnn_pred_best.csv\" % (datetime.now().strftime(\"%Y%m%d-%H%M%S\")),\n", 358 | " index=False\n", 359 | ")" 360 | ] 361 | }, 362 | { 363 | "cell_type": "code", 364 | "execution_count": null, 365 | "metadata": { 366 | "collapsed": true 367 | }, 368 | "outputs": [], 369 | "source": [ 370 | "# train_encode = []\n", 371 | "# test_encode = []\n", 372 | "\n", 373 | "# for model_name in glob(\"./models/*.hdf5\"):\n", 374 | "# word_input1 = Input(shape=(WORD_SEQ_LEN,), dtype=\"int32\")\n", 375 | "# word_input2 = Input(shape=(WORD_SEQ_LEN,), dtype=\"int32\")\n", 376 | "\n", 377 | "# embedding_layer = Embedding(\n", 378 | "# input_dim=word_embedding_data.shape[0],\n", 379 | "# output_dim=word_embedding_data.shape[1],\n", 380 | "# weights=[word_embedding_data],\n", 381 | "# input_length=WORD_SEQ_LEN,\n", 382 | "# trainable=False\n", 383 | "# )\n", 384 | "\n", 385 | "# word_vector1 = embedding_layer(word_input1)\n", 386 | "# word_vector2 = embedding_layer(word_input2)\n", 387 | "\n", 388 | "# lstm_layer1 = LSTM(LSTM_SIZE_1, dropout=DROP_RATE, recurrent_dropout=DROP_RATE, return_sequences=True)\n", 389 | "# word_first_1 = lstm_layer1(word_vector1)\n", 390 | "# word_first_1 = Dropout(DROP_RATE)(word_first_1)\n", 391 | "# word_first_2 = lstm_layer1(word_vector2)\n", 392 | "# word_first_2 = Dropout(DROP_RATE)(word_first_2)\n", 393 | "\n", 394 | "# lstm_layer2 = LSTM(LSTM_SIZE_2, dropout=DROP_RATE, recurrent_dropout=DROP_RATE, return_sequences=True)\n", 395 | "# word_second_1 = lstm_layer2(word_first_1)\n", 396 | "# word_second_2 = lstm_layer2(word_first_2)\n", 397 | "\n", 398 | "# conv1a, conv1b = cnn_layer_1(word_second_1, word_second_2, kernel_size=1, filters=CONV_LEN_1)\n", 399 | "# conv2a, conv2b = cnn_layer_1(word_second_1, word_second_2, kernel_size=2, filters=CONV_LEN_2)\n", 400 | 
"# conv3a, conv3b = cnn_layer_1(word_second_1, word_second_2, kernel_size=3, filters=CONV_LEN_3)\n", 401 | "# conv4a, conv4b = cnn_layer_1(word_second_1, word_second_2, kernel_size=4, filters=CONV_LEN_4)\n", 402 | "# conv5a, conv5b = cnn_layer_1(word_second_1, word_second_2, kernel_size=5, filters=CONV_LEN_5)\n", 403 | "# conv6a, conv6b = cnn_layer_1(word_second_1, word_second_2, kernel_size=6, filters=CONV_LEN_6)\n", 404 | "\n", 405 | "# merge_a = concatenate([conv1a, conv2a, conv3a, conv4a, conv5a, conv6a])\n", 406 | "# merge_b = concatenate([conv1b, conv2b, conv3b, conv4b, conv5b, conv6b])\n", 407 | "# diff = Lambda(lambda x: K.abs(x[0] - x[1]))([merge_a, merge_b])\n", 408 | "# mult = Lambda(lambda x: x[0] * x[1])([merge_a, merge_b])\n", 409 | "# merge = concatenate([diff, mult])\n", 410 | "\n", 411 | "# x = Dropout(DROP_RATE)(merge)\n", 412 | "# x = BatchNormalization()(x)\n", 413 | "\n", 414 | "# x = Dense(DENSE_SIZE, activation=\"relu\")(x)\n", 415 | "# x = Dropout(DROP_RATE)(x)\n", 416 | "# x = BatchNormalization()(x)\n", 417 | "\n", 418 | "# pred = Dense(1, activation=\"sigmoid\")(x)\n", 419 | "\n", 420 | "# model = Model(inputs=[word_input1, word_input2], outputs=pred)\n", 421 | "# model.load_weights(model_name.replace(\"\\\\\", \"/\"))\n", 422 | " \n", 423 | "# encode_model = Model(inputs=[word_input1, word_input2], outputs=model.layers[-4].output)\n", 424 | "# train_feature = encode_model.predict([word1, word2], batch_size=BATCH_SIZE)\n", 425 | "# test_feature = encode_model.predict([test_word1, test_word2], batch_size=BATCH_SIZE)\n", 426 | "# train_encode.append(train_feature)\n", 427 | "# test_encode.append(test_feature)\n", 428 | "\n", 429 | "# train_dense = train_encode[0].copy()\n", 430 | "# for t in train_encode[1:]:\n", 431 | "# train_dense += t\n", 432 | "# train_dense = train_dense / 10\n", 433 | "# pd.DataFrame(train_dense).to_csv(\"train_input.csv\", index=False)\n", 434 | "\n", 435 | "# test_dense = test_encode[0].copy()\n", 436 | "# for t in test_encode[1:]:\n", 437 | "# test_dense += t\n", 438 | "# test_dense = test_dense / 10\n", 439 | "# pd.DataFrame(test_dense).to_csv(\"test_input.csv\", index=False)\n", 440 | "\n", 441 | "# import xgboost as xgb\n", 442 | "# from sklearn.model_selection import train_test_split\n", 443 | "\n", 444 | "# xgb_train_x, xgb_dev_x, xgb_train_y, xgb_dev_y = train_test_split(mean_dense, label, test_size=0.2, stratify=label)\n", 445 | "\n", 446 | "# train_data = xgb.DMatrix(xgb_train_x, xgb_train_y)\n", 447 | "# dev_data = xgb.DMatrix(xgb_dev_x, xgb_dev_y)\n", 448 | "\n", 449 | "# params = {\n", 450 | "# \"objective\": \"binary:logistic\",\n", 451 | "# \"eval_metric\": \"logloss\",\n", 452 | "# \"eta\": 0.01,\n", 453 | "# \"max_depth\": 5,\n", 454 | "# \"subsample\": 0.8,\n", 455 | "# \"colsample_bytree\": 0.8,\n", 456 | "# \"lambda\": 1,\n", 457 | "# }\n", 458 | "\n", 459 | "\n", 460 | "# boost = xgb.train(\n", 461 | "# params=params,\n", 462 | "# dtrain=train_data,\n", 463 | "# num_boost_round=200,\n", 464 | "# evals=[(dev_data, \"dev\")],\n", 465 | "# early_stopping_rounds=10,\n", 466 | "# )\n", 467 | "\n", 468 | "# test_pred = boost.predict(xgb.DMatrix(test_dense))\n", 469 | "# pd.DataFrame(test_pred, columns=[\"y_pre\"]).to_csv(\n", 470 | "# \"./result/%s-xgb_multilstm_cnn_pred_best.csv\" % (datetime.now().strftime(\"%Y%m%d-%H%M%S\")),\n", 471 | "# index=False\n", 472 | "# )" 473 | ] 474 | } 475 | ], 476 | "metadata": { 477 | "kernelspec": { 478 | "display_name": "Python 3", 479 | "language": "python", 480 | "name": "python3" 481 | 
}, 482 | "language_info": { 483 | "codemirror_mode": { 484 | "name": "ipython", 485 | "version": 3 486 | }, 487 | "file_extension": ".py", 488 | "mimetype": "text/x-python", 489 | "name": "python", 490 | "nbconvert_exporter": "python", 491 | "pygments_lexer": "ipython3", 492 | "version": "3.6.2" 493 | } 494 | }, 495 | "nbformat": 4, 496 | "nbformat_minor": 2 497 | } 498 | -------------------------------------------------------------------------------- /[Model] TextCNN.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import warnings\n", 12 | "warnings.filterwarnings(\"ignore\")\n", 13 | "\n", 14 | "import numpy as np\n", 15 | "import pandas as pd\n", 16 | "from glob import glob\n", 17 | "from datetime import datetime\n", 18 | "\n", 19 | "DATA_PATH = \"./data/\"\n", 20 | "TRAIN_PATH = DATA_PATH + \"train.csv\"\n", 21 | "TEST_PATH = DATA_PATH + \"test.csv\"\n", 22 | "WORD_EMBED_PATH = DATA_PATH + \"word_embed.txt\"\n", 23 | "CHAR_EMBED_PATH = DATA_PATH + \"char_embed.txt\"\n", 24 | "QUEST_PATH = DATA_PATH + \"question.csv\"\n", 25 | "\n", 26 | "train_data = pd.read_csv(TRAIN_PATH)\n", 27 | "test_data = pd.read_csv(TEST_PATH)\n", 28 | "question_data = pd.read_csv(QUEST_PATH)\n", 29 | "word_embedding_data = pd.read_csv(WORD_EMBED_PATH, delimiter=\" \", header=None, index_col=0)\n", 30 | "char_embedding_data = pd.read_csv(CHAR_EMBED_PATH, delimiter=\" \", header=None, index_col=0)\n", 31 | "\n", 32 | "question_data[\"words\"] = question_data[\"words\"].str.split(\" \")\n", 33 | "question_data[\"chars\"] = question_data[\"chars\"].str.split(\" \")\n", 34 | "\n", 35 | "label = train_data[\"label\"].values" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 2, 41 | "metadata": {}, 42 | "outputs": [ 43 | { 44 | "name": "stderr", 45 | "output_type": "stream", 46 | "text": [ 47 | "Using TensorFlow backend.\n" 48 | ] 49 | }, 50 | { 51 | "data": { 52 | "text/plain": [ 53 | "((10001, 300), (3049, 300))" 54 | ] 55 | }, 56 | "execution_count": 2, 57 | "metadata": {}, 58 | "output_type": "execute_result" 59 | } 60 | ], 61 | "source": [ 62 | "from keras.preprocessing.text import Tokenizer\n", 63 | "\n", 64 | "MAX_COUNT = 10000\n", 65 | "\n", 66 | "word_tokenizer = Tokenizer(MAX_COUNT)\n", 67 | "word_tokenizer.fit_on_texts(question_data[\"words\"])\n", 68 | "\n", 69 | "word_embedding_data = np.concatenate(\n", 70 | " (\n", 71 | " np.zeros(shape=(1, word_embedding_data.shape[1]), dtype=np.float64),\n", 72 | " word_embedding_data.loc[list(word_tokenizer.word_index.keys())[:MAX_COUNT]].values\n", 73 | " ),\n", 74 | " axis=0\n", 75 | ")\n", 76 | "\n", 77 | "char_tokenizer = Tokenizer(MAX_COUNT)\n", 78 | "char_tokenizer.fit_on_texts(question_data[\"chars\"])\n", 79 | "\n", 80 | "char_embedding_data = np.concatenate(\n", 81 | " (\n", 82 | " np.zeros(shape=(1, char_embedding_data.shape[1]), dtype=np.float64),\n", 83 | " char_embedding_data.loc[list(char_tokenizer.word_index.keys())[:MAX_COUNT]].values\n", 84 | " ),\n", 85 | " axis=0\n", 86 | ")\n", 87 | "\n", 88 | "word_embedding_data.shape, char_embedding_data.shape" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 3, 94 | "metadata": {}, 95 | "outputs": [ 96 | { 97 | "data": { 98 | "text/plain": [ 99 | "((254386, 30),\n", 100 | " (254386, 30),\n", 101 | " (172956, 30),\n", 102 | " (172956, 30),\n", 103 | " (254386, 
30),\n", 104 | " (254386, 30),\n", 105 | " (172956, 30),\n", 106 | " (172956, 30))" 107 | ] 108 | }, 109 | "execution_count": 3, 110 | "metadata": {}, 111 | "output_type": "execute_result" 112 | } 113 | ], 114 | "source": [ 115 | "from keras.preprocessing.sequence import pad_sequences\n", 116 | "\n", 117 | "SEQ_LEN = 30\n", 118 | "\n", 119 | "def gen_word_data(data):\n", 120 | " seq_word1 = word_tokenizer.texts_to_sequences(data.merge(question_data, how=\"left\", left_on=\"q1\", right_on=\"qid\")[\"words\"])\n", 121 | " seq_word2 = word_tokenizer.texts_to_sequences(data.merge(question_data, how=\"left\", left_on=\"q2\", right_on=\"qid\")[\"words\"])\n", 122 | " return pad_sequences(seq_word1, maxlen=SEQ_LEN, padding=\"pre\",truncating=\"pre\"), \\\n", 123 | " pad_sequences(seq_word2, maxlen=SEQ_LEN, padding=\"pre\",truncating=\"pre\")\n", 124 | " \n", 125 | "def gen_char_data(data):\n", 126 | " seq_char1 = char_tokenizer.texts_to_sequences(data.merge(question_data, how=\"left\", left_on=\"q1\", right_on=\"qid\")[\"chars\"])\n", 127 | " seq_char2 = char_tokenizer.texts_to_sequences(data.merge(question_data, how=\"left\", left_on=\"q2\", right_on=\"qid\")[\"chars\"])\n", 128 | " return pad_sequences(seq_char1, maxlen=SEQ_LEN, padding=\"pre\",truncating=\"pre\"), \\\n", 129 | " pad_sequences(seq_char2, maxlen=SEQ_LEN, padding=\"pre\",truncating=\"pre\")\n", 130 | "\n", 131 | "word1, word2 = gen_word_data(train_data)\n", 132 | "char1, char2 = gen_char_data(train_data)\n", 133 | "test_word1, test_word2 = gen_word_data(test_data)\n", 134 | "test_char1, test_char2 = gen_char_data(test_data)\n", 135 | "\n", 136 | "word1.shape, word2.shape, test_word1.shape, test_word2.shape, char1.shape, char2.shape, test_char1.shape, test_char2.shape" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 4, 142 | "metadata": { 143 | "collapsed": true 144 | }, 145 | "outputs": [], 146 | "source": [ 147 | "from keras.models import Model\n", 148 | "from keras.layers.merge import concatenate\n", 149 | "from keras.callbacks import EarlyStopping, ModelCheckpoint\n", 150 | "from keras.optimizers import Adam, Nadam, SGD\n", 151 | "from keras.layers import LSTM, Bidirectional, TimeDistributed, CuDNNLSTM\n", 152 | "from keras.layers import Conv1D, GlobalMaxPool1D, GlobalAveragePooling1D\n", 153 | "from keras.layers import Input, Embedding, Dropout, BatchNormalization, Dense, Flatten, Lambda, K, Activation" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": 5, 159 | "metadata": { 160 | "collapsed": true 161 | }, 162 | "outputs": [], 163 | "source": [ 164 | "BATCH_SIZE = 1024\n", 165 | "NUM_EPOCHES = 50\n", 166 | "DROP_RATE = 0.3\n", 167 | "\n", 168 | "CONV_FILTER_LAYER1 = 128\n", 169 | "CONV_FILTER_LAYER2 = 128\n", 170 | "\n", 171 | "DENSE_SIZE1 = 512\n", 172 | "DENSE_SIZE2 = 256" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": 6, 178 | "metadata": { 179 | "collapsed": true 180 | }, 181 | "outputs": [], 182 | "source": [ 183 | "def textcnn_layer(input_tensor, kernel_size):\n", 184 | " conv_1 = Conv1D(filters=CONV_FILTER_LAYER1, kernel_size=kernel_size, padding=\"same\")(input_tensor)\n", 185 | " conv_1 = BatchNormalization()(conv_1)\n", 186 | " conv_1 = Activation(activation=\"relu\")(conv_1)\n", 187 | " conv_2 = Conv1D(filters=CONV_FILTER_LAYER2, kernel_size=kernel_size, padding=\"same\")(conv_1)\n", 188 | " conv_2 = BatchNormalization()(conv_2)\n", 189 | " conv_2 = Activation(activation=\"relu\")(conv_2)\n", 190 | " conv_2_max = 
GlobalMaxPool1D()(conv_2)\n", 191 | " conv_2_avg = GlobalAveragePooling1D()(conv_2)\n", 192 | " conv_2_merge = concatenate([conv_2_max, conv_2_avg])\n", 193 | " return conv_2_max" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": 7, 199 | "metadata": {}, 200 | "outputs": [ 201 | { 202 | "name": "stdout", 203 | "output_type": "stream", 204 | "text": [ 205 | "WARNING:tensorflow:From C:\\ProgramData\\Anaconda3\\lib\\site-packages\\tensorflow\\python\\util\\deprecation.py:497: calling conv1d (from tensorflow.python.ops.nn_ops) with data_format=NHWC is deprecated and will be removed in a future version.\n", 206 | "Instructions for updating:\n", 207 | "`NHWC` for data_format is deprecated, use `NWC` instead\n", 208 | "Train on 228946 samples, validate on 25440 samples\n", 209 | "Epoch 1/50\n", 210 | "228946/228946 [==============================] - 99s 433us/step - loss: 0.4382 - acc: 0.7953 - val_loss: 0.3387 - val_acc: 0.8498\n", 211 | "Epoch 2/50\n", 212 | "228946/228946 [==============================] - 90s 392us/step - loss: 0.3082 - acc: 0.8647 - val_loss: 0.2845 - val_acc: 0.8755\n", 213 | "Epoch 3/50\n", 214 | "228946/228946 [==============================] - 93s 405us/step - loss: 0.2626 - acc: 0.8868 - val_loss: 0.2673 - val_acc: 0.8857\n", 215 | "Epoch 4/50\n", 216 | "228946/228946 [==============================] - 92s 402us/step - loss: 0.2304 - acc: 0.9020 - val_loss: 0.2649 - val_acc: 0.8886\n", 217 | "Epoch 5/50\n", 218 | "228946/228946 [==============================] - 89s 391us/step - loss: 0.2048 - acc: 0.9137 - val_loss: 0.2551 - val_acc: 0.8915\n", 219 | "Epoch 6/50\n", 220 | "228946/228946 [==============================] - 92s 400us/step - loss: 0.1845 - acc: 0.9231 - val_loss: 0.2518 - val_acc: 0.8948\n", 221 | "Epoch 7/50\n", 222 | "228946/228946 [==============================] - 93s 408us/step - loss: 0.1643 - acc: 0.9329 - val_loss: 0.2583 - val_acc: 0.8960\n", 223 | "Epoch 8/50\n", 224 | " 28672/228946 [==>...........................] 
- ETA: 1:18 - loss: 0.1283 - acc: 0.9493" 225 | ] 226 | }, 227 | { 228 | "ename": "KeyboardInterrupt", 229 | "evalue": "", 230 | "output_type": "error", 231 | "traceback": [ 232 | "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", 233 | "\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", 234 | "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 66\u001b[0m \u001b[0mvalidation_data\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mdev_x1\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdev_x2\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdev_y\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 67\u001b[0m \u001b[0mshuffle\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mTrue\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 68\u001b[1;33m \u001b[0mcallbacks\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mearly_stopping\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcheck_point\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 69\u001b[0m )\n\u001b[0;32m 70\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", 235 | "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\keras\\engine\\training.py\u001b[0m in \u001b[0;36mfit\u001b[1;34m(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, **kwargs)\u001b[0m\n\u001b[0;32m 1040\u001b[0m \u001b[0minitial_epoch\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0minitial_epoch\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1041\u001b[0m \u001b[0msteps_per_epoch\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0msteps_per_epoch\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1042\u001b[1;33m validation_steps=validation_steps)\n\u001b[0m\u001b[0;32m 1043\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1044\u001b[0m def evaluate(self, x=None, y=None,\n", 236 | "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\keras\\engine\\training_arrays.py\u001b[0m in \u001b[0;36mfit_loop\u001b[1;34m(model, f, ins, out_labels, batch_size, epochs, verbose, callbacks, val_f, val_ins, shuffle, callback_metrics, initial_epoch, steps_per_epoch, validation_steps)\u001b[0m\n\u001b[0;32m 197\u001b[0m \u001b[0mins_batch\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mi\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mins_batch\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mi\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtoarray\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 198\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 199\u001b[1;33m \u001b[0mouts\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mf\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mins_batch\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 200\u001b[0m \u001b[1;32mif\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0misinstance\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mouts\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mlist\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 201\u001b[0m \u001b[0mouts\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m[\u001b[0m\u001b[0mouts\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 237 | 
"\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\keras\\backend\\tensorflow_backend.py\u001b[0m in \u001b[0;36m__call__\u001b[1;34m(self, inputs)\u001b[0m\n\u001b[0;32m 2665\u001b[0m \u001b[1;34m'In order to feed symbolic tensors to a Keras model '\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2666\u001b[0m 'in TensorFlow, you need tensorflow 1.8 or higher.')\n\u001b[1;32m-> 2667\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_legacy_call\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0minputs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 2668\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2669\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", 238 | "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\keras\\backend\\tensorflow_backend.py\u001b[0m in \u001b[0;36m_legacy_call\u001b[1;34m(self, inputs)\u001b[0m\n\u001b[0;32m 2647\u001b[0m \u001b[0msession\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mget_session\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2648\u001b[0m updated = session.run(fetches=fetches, feed_dict=feed_dict,\n\u001b[1;32m-> 2649\u001b[1;33m **self.session_kwargs)\n\u001b[0m\u001b[0;32m 2650\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mupdated\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;33m:\u001b[0m\u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0moutputs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2651\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", 239 | "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\tensorflow\\python\\client\\session.py\u001b[0m in \u001b[0;36mrun\u001b[1;34m(self, fetches, feed_dict, options, run_metadata)\u001b[0m\n\u001b[0;32m 903\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 904\u001b[0m result = self._run(None, fetches, feed_dict, options_ptr,\n\u001b[1;32m--> 905\u001b[1;33m run_metadata_ptr)\n\u001b[0m\u001b[0;32m 906\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mrun_metadata\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 907\u001b[0m \u001b[0mproto_data\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mtf_session\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mTF_GetBuffer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mrun_metadata_ptr\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 240 | "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\tensorflow\\python\\client\\session.py\u001b[0m in \u001b[0;36m_run\u001b[1;34m(self, handle, fetches, feed_dict, options, run_metadata)\u001b[0m\n\u001b[0;32m 1135\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mfinal_fetches\u001b[0m \u001b[1;32mor\u001b[0m \u001b[0mfinal_targets\u001b[0m \u001b[1;32mor\u001b[0m \u001b[1;33m(\u001b[0m\u001b[0mhandle\u001b[0m \u001b[1;32mand\u001b[0m \u001b[0mfeed_dict_tensor\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1136\u001b[0m results = self._do_run(handle, final_targets, final_fetches,\n\u001b[1;32m-> 1137\u001b[1;33m feed_dict_tensor, options, run_metadata)\n\u001b[0m\u001b[0;32m 1138\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1139\u001b[0m \u001b[0mresults\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m[\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 241 | 
"\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\tensorflow\\python\\client\\session.py\u001b[0m in \u001b[0;36m_do_run\u001b[1;34m(self, handle, target_list, fetch_list, feed_dict, options, run_metadata)\u001b[0m\n\u001b[0;32m 1353\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mhandle\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1354\u001b[0m return self._do_call(_run_fn, self._session, feeds, fetches, targets,\n\u001b[1;32m-> 1355\u001b[1;33m options, run_metadata)\n\u001b[0m\u001b[0;32m 1356\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1357\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_do_call\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0m_prun_fn\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_session\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mhandle\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfeeds\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfetches\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 242 | "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\tensorflow\\python\\client\\session.py\u001b[0m in \u001b[0;36m_do_call\u001b[1;34m(self, fn, *args)\u001b[0m\n\u001b[0;32m 1359\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_do_call\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfn\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1360\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1361\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mfn\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1362\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0merrors\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mOpError\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1363\u001b[0m \u001b[0mmessage\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mcompat\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mas_text\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0me\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmessage\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 243 | "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\tensorflow\\python\\client\\session.py\u001b[0m in \u001b[0;36m_run_fn\u001b[1;34m(session, feed_dict, fetch_list, target_list, options, run_metadata)\u001b[0m\n\u001b[0;32m 1338\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1339\u001b[0m return tf_session.TF_Run(session, options, feed_dict, fetch_list,\n\u001b[1;32m-> 1340\u001b[1;33m target_list, status, run_metadata)\n\u001b[0m\u001b[0;32m 1341\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1342\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_prun_fn\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0msession\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mhandle\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfeed_dict\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfetch_list\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 244 | "\u001b[1;31mKeyboardInterrupt\u001b[0m: " 245 | ] 246 | } 247 | ], 248 | "source": [ 249 | "from sklearn.model_selection import StratifiedKFold\n", 250 | "\n", 251 | 
"best_results = []\n", 252 | "last_results = []\n", 253 | "\n", 254 | "for i, (train_index, dev_index) in enumerate(StratifiedKFold(n_splits=10, shuffle=True).split(X=char1, y=label)):\n", 255 | " train_x1, train_x2, train_y = word1[train_index, :], word2[train_index, :], label[train_index] # word/char switch\n", 256 | " dev_x1, dev_x2, dev_y = word1[dev_index, :], word2[dev_index, :], label[dev_index] # word/char switch\n", 257 | " \n", 258 | " input1 = Input(shape=(SEQ_LEN,), dtype=\"int32\")\n", 259 | " input2 = Input(shape=(SEQ_LEN,), dtype=\"int32\")\n", 260 | "\n", 261 | " embedding_layer = Embedding(\n", 262 | " input_dim=word_embedding_data.shape[0], # word/char switch\n", 263 | " output_dim=word_embedding_data.shape[1], # word/char switch\n", 264 | " weights=[word_embedding_data], # word/char switch\n", 265 | " input_length=SEQ_LEN,\n", 266 | " trainable=False\n", 267 | " )\n", 268 | " \n", 269 | " vector1 = embedding_layer(input1)\n", 270 | " vector2 = embedding_layer(input2)\n", 271 | " \n", 272 | " conv1a, conv1b = textcnn_layer(vector1, kernel_size=1), textcnn_layer(vector2, kernel_size=1)\n", 273 | " conv2a, conv2b = textcnn_layer(vector1, kernel_size=2), textcnn_layer(vector2, kernel_size=2)\n", 274 | " conv3a, conv3b = textcnn_layer(vector1, kernel_size=3), textcnn_layer(vector2, kernel_size=3)\n", 275 | " conv4a, conv4b = textcnn_layer(vector1, kernel_size=4), textcnn_layer(vector2, kernel_size=4)\n", 276 | " conv5a, conv5b = textcnn_layer(vector1, kernel_size=5), textcnn_layer(vector2, kernel_size=5)\n", 277 | " conv6a, conv6b = textcnn_layer(vector1, kernel_size=6), textcnn_layer(vector2, kernel_size=6)\n", 278 | " \n", 279 | " merge_a = concatenate([conv1a, conv2a, conv3a, conv4a, conv5a, conv6a])\n", 280 | " merge_b = concatenate([conv1b, conv2b, conv3b, conv4b, conv5b, conv6b])\n", 281 | " diff = Lambda(lambda x: K.abs(x[0] - x[1]))([merge_a, merge_b])\n", 282 | " mult = Lambda(lambda x: x[0] * x[1])([merge_a, merge_b])\n", 283 | " merge = concatenate([diff, mult])\n", 284 | " \n", 285 | " x = Dropout(DROP_RATE)(merge)\n", 286 | " x = BatchNormalization()(x)\n", 287 | " x = Dense(DENSE_SIZE1, activation=\"relu\")(x)\n", 288 | " x = Dropout(DROP_RATE)(x)\n", 289 | " x = BatchNormalization()(x)\n", 290 | " x = Dense(DENSE_SIZE2, activation=\"relu\")(x)\n", 291 | " x = Dropout(DROP_RATE)(x)\n", 292 | " x = BatchNormalization()(x)\n", 293 | " pred = Dense(1, activation=\"sigmoid\")(x)\n", 294 | " \n", 295 | " model = Model(inputs=[input1, input2], outputs=pred)\n", 296 | " model.compile(\n", 297 | " optimizer=\"nadam\",\n", 298 | " loss=\"binary_crossentropy\",\n", 299 | " metrics=[\"acc\"]\n", 300 | " )\n", 301 | " \n", 302 | " early_stopping = EarlyStopping(\"val_loss\", patience=10)\n", 303 | " check_point = ModelCheckpoint(\n", 304 | " \"./log/%s.TextCNN.word.{epoch:03d}.hdf5\" % (datetime.now().strftime(\"%Y%m%d-%H%M%S\")), # word/char switch\n", 305 | " monitor=\"val_loss\",\n", 306 | " save_best_only=True,\n", 307 | " )\n", 308 | " \n", 309 | " fit_res = model.fit(\n", 310 | " x=[train_x1, train_x2],\n", 311 | " y=train_y,\n", 312 | " batch_size=BATCH_SIZE,\n", 313 | " epochs=NUM_EPOCHES,\n", 314 | " validation_data=([dev_x1, dev_x2], dev_y),\n", 315 | " shuffle=True,\n", 316 | " callbacks=[early_stopping, check_point]\n", 317 | " )\n", 318 | " \n", 319 | " pred_last = model.predict([test_word1, test_word2], batch_size=BATCH_SIZE) # word/char switch\n", 320 | " last_results.append(pd.DataFrame(pred_last, columns=[\"y_pre\"]))\n", 321 | " \n", 322 | " print(\"load 
model %s\" % (glob(\"./log/*.hdf5\")[-1].replace(\"\\\\\", \"/\"),))\n", 323 | " model.load_weights(glob(\"./log/*.hdf5\")[-1].replace(\"\\\\\", \"/\"))\n", 324 | " pred_best = model.predict([test_word1, test_word2], batch_size=BATCH_SIZE) # word/char switch\n", 325 | " best_results.append(pd.DataFrame(pred_best, columns=[\"y_pre\"]))\n", 326 | "\n", 327 | "pd.DataFrame(pd.concat(last_results, axis=1).mean(axis=1), columns=[\"y_pre\"]).to_csv(\n", 328 | " \"./result/%s-TextCNN_word_last.csv\" % (datetime.now().strftime(\"%Y%m%d-%H%M%S\")), # word/char switch\n", 329 | " index=False\n", 330 | ")\n", 331 | "pd.DataFrame(pd.concat(best_results, axis=1).mean(axis=1), columns=[\"y_pre\"]).to_csv(\n", 332 | " \"./result/%s-TextCNN_word_best.csv\" % (datetime.now().strftime(\"%Y%m%d-%H%M%S\")), # word/char switch\n", 333 | " index=False\n", 334 | ")" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": null, 340 | "metadata": { 341 | "collapsed": true 342 | }, 343 | "outputs": [], 344 | "source": [] 345 | } 346 | ], 347 | "metadata": { 348 | "kernelspec": { 349 | "display_name": "Python 3", 350 | "language": "python", 351 | "name": "python3" 352 | }, 353 | "language_info": { 354 | "codemirror_mode": { 355 | "name": "ipython", 356 | "version": 3 357 | }, 358 | "file_extension": ".py", 359 | "mimetype": "text/x-python", 360 | "name": "python", 361 | "nbconvert_exporter": "python", 362 | "pygments_lexer": "ipython3", 363 | "version": "3.6.2" 364 | } 365 | }, 366 | "nbformat": 4, 367 | "nbformat_minor": 2 368 | } 369 | -------------------------------------------------------------------------------- /[Model] Multi LSTM CNN v4.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## 模型结构\n", 8 | "\n", 9 | "- 两层单向LSTM, 输出序列结果, 即(batch_size, step_size, feature_size)\n", 10 | "- 分别输入到1, 2, 3, 4, 5, 6共6个不同长度的卷积层中\n", 11 | "- 卷积层为双层, 最后的池化层有Average和Max两种\n", 12 | "- 对于每个问题, 将所有卷积核结果并起来\n", 13 | "- 将两个问题并起来的结果, 分别[相减并取绝对值], [x]" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": null, 19 | "metadata": { 20 | "collapsed": true 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "import warnings\n", 25 | "warnings.filterwarnings(\"ignore\")\n", 26 | "\n", 27 | "import os\n", 28 | "import shutil\n", 29 | "import numpy as np\n", 30 | "import pandas as pd\n", 31 | "from glob import glob\n", 32 | "from datetime import datetime" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": { 39 | "collapsed": true 40 | }, 41 | "outputs": [], 42 | "source": [ 43 | "try:\n", 44 | " os.mkdir(\"./log/\")\n", 45 | " os.mkdir(\"./result/\")\n", 46 | "except FileExistsError:\n", 47 | " pass" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": null, 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "DATA_PATH = \"./data/\"\n", 57 | "TRAIN_PATH = DATA_PATH + \"train.csv\"\n", 58 | "TEST_PATH = DATA_PATH + \"test.csv\"\n", 59 | "WORD_EMBED_PATH = DATA_PATH + \"word_embed.txt\"\n", 60 | "CHAR_EMBED_PATH = DATA_PATH + \"char_embed.txt\"\n", 61 | "QUEST_PATH = DATA_PATH + \"question.csv\"\n", 62 | "\n", 63 | "train_data = pd.read_csv(TRAIN_PATH)\n", 64 | "test_data = pd.read_csv(TEST_PATH)\n", 65 | "question_data = pd.read_csv(QUEST_PATH)\n", 66 | "word_embedding_data = pd.read_csv(WORD_EMBED_PATH, delimiter=\" \", header=None, index_col=0)\n", 67 | "char_embedding_data = 
pd.read_csv(CHAR_EMBED_PATH, delimiter=\" \", header=None, index_col=0)\n", 68 | "\n", 69 | "question_data[\"words\"] = question_data[\"words\"].str.split(\" \")\n", 70 | "question_data[\"chars\"] = question_data[\"chars\"].str.split(\" \")\n", 71 | "\n", 72 | "label = train_data[\"label\"].values\n", 73 | "\n", 74 | "from keras.preprocessing.text import Tokenizer\n", 75 | "\n", 76 | "MAX_COUNT = 10000\n", 77 | "\n", 78 | "word_tokenizer = Tokenizer(MAX_COUNT)\n", 79 | "word_tokenizer.fit_on_texts(question_data[\"words\"])\n", 80 | "\n", 81 | "word_embedding_data = np.concatenate(\n", 82 | " (\n", 83 | " np.zeros(shape=(1, word_embedding_data.shape[1]), dtype=np.float64),\n", 84 | " word_embedding_data.loc[list(word_tokenizer.word_index.keys())[:MAX_COUNT]].values\n", 85 | " ),\n", 86 | " axis=0\n", 87 | ")\n", 88 | "\n", 89 | "char_tokenizer = Tokenizer(MAX_COUNT)\n", 90 | "char_tokenizer.fit_on_texts(question_data[\"chars\"])\n", 91 | "\n", 92 | "char_embedding_data = np.concatenate(\n", 93 | " (\n", 94 | " np.zeros(shape=(1, char_embedding_data.shape[1]), dtype=np.float64),\n", 95 | " char_embedding_data.loc[list(char_tokenizer.word_index.keys())[:MAX_COUNT]].values\n", 96 | " ),\n", 97 | " axis=0\n", 98 | ")\n", 99 | "\n", 100 | "word_embedding_data.shape, char_embedding_data.shape" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "from keras.preprocessing.sequence import pad_sequences\n", 110 | "\n", 111 | "SEQ_LEN = 25\n", 112 | "\n", 113 | "def gen_word_data(data):\n", 114 | " seq_word1 = word_tokenizer.texts_to_sequences(data.merge(question_data, how=\"left\", left_on=\"q1\", right_on=\"qid\")[\"words\"])\n", 115 | " seq_word2 = word_tokenizer.texts_to_sequences(data.merge(question_data, how=\"left\", left_on=\"q2\", right_on=\"qid\")[\"words\"])\n", 116 | " return pad_sequences(seq_word1, maxlen=SEQ_LEN, padding=\"pre\",truncating=\"pre\"), \\\n", 117 | " pad_sequences(seq_word2, maxlen=SEQ_LEN, padding=\"pre\",truncating=\"pre\")\n", 118 | " \n", 119 | "def gen_char_data(data):\n", 120 | " seq_char1 = char_tokenizer.texts_to_sequences(data.merge(question_data, how=\"left\", left_on=\"q1\", right_on=\"qid\")[\"chars\"])\n", 121 | " seq_char2 = char_tokenizer.texts_to_sequences(data.merge(question_data, how=\"left\", left_on=\"q2\", right_on=\"qid\")[\"chars\"])\n", 122 | " return pad_sequences(seq_char1, maxlen=SEQ_LEN, padding=\"pre\",truncating=\"pre\"), \\\n", 123 | " pad_sequences(seq_char2, maxlen=SEQ_LEN, padding=\"pre\",truncating=\"pre\")\n", 124 | "\n", 125 | "word1, word2 = gen_word_data(train_data)\n", 126 | "char1, char2 = gen_char_data(train_data)\n", 127 | "test_word1, test_word2 = gen_word_data(test_data)\n", 128 | "test_char1, test_char2 = gen_char_data(test_data)\n", 129 | "\n", 130 | "word1.shape, word2.shape, test_word1.shape, test_word2.shape, char1.shape, char2.shape, test_char1.shape, test_char2.shape" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": null, 136 | "metadata": { 137 | "collapsed": true 138 | }, 139 | "outputs": [], 140 | "source": [ 141 | "from keras.models import Model\n", 142 | "from keras.layers.merge import concatenate\n", 143 | "from keras.optimizers import Adam, SGD, Nadam\n", 144 | "from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau\n", 145 | "from keras.layers import LSTM, Bidirectional, TimeDistributed\n", 146 | "from keras.layers import Conv1D, MaxPool1D, GlobalAveragePooling1D, 
GlobalMaxPooling1D\n", 147 | "from keras.layers import Input, Embedding, Dropout, BatchNormalization, Dense, Flatten, Lambda, K, Activation" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": null, 153 | "metadata": { 154 | "collapsed": true 155 | }, 156 | "outputs": [], 157 | "source": [ 158 | "# general\n", 159 | "NUM_EPOCHES = 50\n", 160 | "EPOCHES1 = 5\n", 161 | "EPOCHES2 = 25 # 5\n", 162 | "EPOCHES3 = 22\n", 163 | "BATCH_SIZE = 1024\n", 164 | "DROP_RATE = 0.3\n", 165 | "\n", 166 | "# cnn\n", 167 | "CONV_LEN1 = 128\n", 168 | "CONV_LEN2 = 128\n", 169 | "CONV_LEN3 = 128\n", 170 | "CONV_LEN4 = 128\n", 171 | "CONV_LEN5 = 128\n", 172 | "CONV_LEN6 = 128\n", 173 | "\n", 174 | "# lstm\n", 175 | "LSTM_SIZE1 = 256\n", 176 | "LSTM_SIZE2 = 256\n", 177 | "LSTM_DROP_RATE = 0.3\n", 178 | "\n", 179 | "# dense\n", 180 | "DENSE_INPUT = 300\n", 181 | "DENSE_SIZE1 = 512\n", 182 | "DENSE_SIZE2 = 256" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": null, 188 | "metadata": { 189 | "collapsed": true 190 | }, 191 | "outputs": [], 192 | "source": [ 193 | "def cnn_layer1(inputa, inputb, filters, kernel_size): # with average pooling\n", 194 | " conv = Conv1D(filters=filters, kernel_size=kernel_size, padding=\"same\", activation=\"relu\")\n", 195 | " conv_outputa = conv(inputa)\n", 196 | " conv_outputa = GlobalAveragePooling1D()(conv_outputa)\n", 197 | " conv_outputb = conv(inputb)\n", 198 | " conv_outputb = GlobalAveragePooling1D()(conv_outputb)\n", 199 | " return conv_outputa, conv_outputb\n", 200 | " \n", 201 | "def cnn_layer2(inputa, inputb, filters, kernel_size): # with max pooling\n", 202 | " conv = Conv1D(filters=filters, kernel_size=kernel_size, padding=\"same\", activation=\"relu\")\n", 203 | " conv_outputa = conv(inputa)\n", 204 | " conv_outputa = MaxPool1D(pool_size=SEQ_LEN)(conv_outputa)\n", 205 | " conv_outputa = Flatten()(conv_outputa)\n", 206 | " conv_outputb = conv(inputb)\n", 207 | " conv_outputb = MaxPool1D(pool_size=SEQ_LEN)(conv_outputb)\n", 208 | " conv_outputb = Flatten()(conv_outputb)\n", 209 | " return conv_outputa, conv_outputb\n", 210 | "\n", 211 | "def cnn_layer3(inputa, inputb, filters, kernel_size): # with both max and average poolings\n", 212 | " conv = Conv1D(filters=filters, kernel_size=kernel_size, padding=\"same\", activation=\"relu\")\n", 213 | " \n", 214 | " conv_outputa = conv(inputa)\n", 215 | " conv_outputa1 = Flatten()(MaxPool1D(pool_size=SEQ_LEN)(conv_outputa))\n", 216 | " conv_outputa2 = GlobalAveragePooling1D()(conv_outputa)\n", 217 | " conv_outputa = concatenate([conv_outputa1, conv_outputa2])\n", 218 | " \n", 219 | " conv_outputb = conv(inputb)\n", 220 | " conv_outputb1 = Flatten()(MaxPool1D(pool_size=SEQ_LEN)(conv_outputb))\n", 221 | " conv_outputb2 = GlobalAveragePooling1D()(conv_outputb)\n", 222 | " conv_outputb = concatenate([conv_outputb1, conv_outputb2])\n", 223 | " \n", 224 | " return conv_outputa, conv_outputb\n", 225 | "\n", 226 | "def cnn_double_layer(inputa, inputb, filters, kernel_size):\n", 227 | " conv1 = Conv1D(filters=filters, kernel_size=kernel_size, padding=\"same\")\n", 228 | " conv2 = Conv1D(filters=filters, kernel_size=kernel_size, padding=\"same\")\n", 229 | " \n", 230 | " conv1a = conv1(inputa)\n", 231 | " conv1a = BatchNormalization()(conv1a)\n", 232 | " conv1a = Activation(activation=\"relu\")(conv1a)\n", 233 | " conv2a = conv2(conv1a)\n", 234 | " conv2a = BatchNormalization()(conv2a)\n", 235 | " conv2a = Activation(activation=\"relu\")(conv2a)\n", 236 | " output_avg_a = 
GlobalAveragePooling1D()(conv2a)\n", 237 | " output_max_a = Flatten()(MaxPool1D(pool_size=SEQ_LEN)(conv2a))\n", 238 | " output_min_a = Lambda(lambda x: K.min(x, axis=1))(conv2a)\n", 239 | " output_a = concatenate([output_avg_a, output_max_a, output_min_a])\n", 240 | " \n", 241 | " conv1b = conv1(inputb)\n", 242 | " conv1b = BatchNormalization()(conv1b)\n", 243 | " conv1b = Activation(activation=\"relu\")(conv1b)\n", 244 | " conv2b = conv2(conv1b)\n", 245 | " conv2b = BatchNormalization()(conv2b)\n", 246 | " conv2b = Activation(activation=\"relu\")(conv2b)\n", 247 | " output_avg_b = GlobalAveragePooling1D()(conv2b)\n", 248 | " output_max_b = Flatten()(MaxPool1D(pool_size=SEQ_LEN)(conv2b))\n", 249 | " output_min_b = Lambda(lambda x: K.min(x, axis=1))(conv2b)\n", 250 | " output_b = concatenate([output_avg_b, output_max_b, output_min_b])\n", 251 | " \n", 252 | " return output_a, output_b\n", 253 | "\n", 254 | "\n", 255 | "def sim_l1(v1, v2):\n", 256 | " return Lambda(lambda x: K.sum(K.abs(x[0] - x[1]), axis=1))([v1, v2])\n", 257 | "\n", 258 | "def sim_l2(v1, v2):\n", 259 | " return Lambda(lambda x: K.sqrt(K.sum(K.square(x[0] - x[1]), axis=1)))([v1, v2])\n", 260 | "\n", 261 | "def sim_cos(v1, v2):\n", 262 | " return Lambda(lambda x: K.sum(x[0] * x[1], axis=1) / (K.sqrt(K.sum(x[0] * x[0], axis=1)) * K.sqrt(K.sum(x[1] * x[1], axis=1))))([v1, v2])\n", 263 | "\n", 264 | "def sim_vec(v1, v2):\n", 265 | " l1 = sim_l1(v1, v2)\n", 266 | " l2 = sim_l2(v1, v2)\n", 267 | " cos = sim_cos(v1, v2)\n", 268 | " vec = concatenate([Lambda(lambda x: K.reshape(x, shape=(-1, 1)))(t) for t in [l1, l2, cos]], axis=1)\n", 269 | " return vec\n", 270 | "\n", 271 | "def similarity_mpcnn(s1, s2):\n", 272 | " fea_h, fea_a = [], []\n", 273 | " out1, out2 = [], [] \n", 274 | " for i in range(len(s1)):\n", 275 | " avg1 = GlobalAveragePooling1D()(s1[i])\n", 276 | " max1 = GlobalMaxPooling1D()(s1[i])\n", 277 | " min1 = Lambda(lambda x: K.min(x, axis=1))(s1[i])\n", 278 | " out1.append([avg1, max1, min1])\n", 279 | " \n", 280 | " avg2 = GlobalAveragePooling1D()(s2[i])\n", 281 | " max2 = GlobalMaxPooling1D()(s2[i])\n", 282 | " min2 = Lambda(lambda x: K.min(x, axis=1))(s2[i])\n", 283 | " out2.append([avg2, max2, min2])\n", 284 | " \n", 285 | " output1, output2 = [], [] # pool nums\n", 286 | " for p in range(3):\n", 287 | " output1.append(concatenate([Lambda(lambda x:K.reshape(x, shape=(-1, 1, CONV_LEN1)))(t[p]) for t in out1], axis=1))\n", 288 | " output2.append(concatenate([Lambda(lambda x:K.reshape(x, shape=(-1, 1, CONV_LEN1)))(t[p]) for t in out2], axis=1))\n", 289 | " \n", 290 | " for p in range(3):\n", 291 | " for f in range(CONV_LEN1):\n", 292 | " fea_h.append(sim_vec(Lambda(lambda x: x[:, :, f])(output1[p]), Lambda(lambda x: x[:, :, f])(output2[p])))\n", 293 | " \n", 294 | " for p in range(3):\n", 295 | " for k1 in range(len(s1)):\n", 296 | " for k2 in range(len(s1)):\n", 297 | " fea_a.append(sim_vec(Lambda(lambda x: x[:, k1, :])(output1[p]), Lambda(lambda x: x[:, k2, :])(output2[p])))\n", 298 | " fea = concatenate(fea_h + fea_a, axis=1)\n", 299 | " return fea" 300 | ] 301 | }, 302 | { 303 | "cell_type": "markdown", 304 | "metadata": {}, 305 | "source": [ 306 | "# WORDS" 307 | ] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "execution_count": null, 312 | "metadata": { 313 | "collapsed": true 314 | }, 315 | "outputs": [], 316 | "source": [ 317 | "from sklearn.model_selection import StratifiedKFold\n", 318 | "\n", 319 | "best_results = []\n", 320 | "last_results = []\n", 321 | "best_file_names = []\n", 322 | "\n", 323 | 
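
The `sim_l1` / `sim_l2` / `sim_cos` helpers above each produce one scalar per pair of pooled vectors, `sim_vec` stacks them into a 3-dimensional feature, and `similarity_mpcnn` applies this comparison across pooling types and filter groups. A plain-NumPy sketch of the three scalar similarities, handy for sanity-checking the Lambda layers on a toy batch (the function name is illustrative only):

```python
import numpy as np

def sim_features(v1, v2):
    """Per-row [L1 distance, L2 distance, cosine similarity] for a batch of vector pairs."""
    l1 = np.sum(np.abs(v1 - v2), axis=1)
    l2 = np.sqrt(np.sum((v1 - v2) ** 2, axis=1))
    # Like the Keras sim_cos above, this divides by the product of the norms,
    # so an all-zero vector would yield NaN.
    cos = np.sum(v1 * v2, axis=1) / (
        np.linalg.norm(v1, axis=1) * np.linalg.norm(v2, axis=1)
    )
    return np.stack([l1, l2, cos], axis=1)  # shape: (batch, 3)

rng = np.random.RandomState(0)
a = rng.rand(4, 8)  # toy batch of 4 vectors, dim 8
b = rng.rand(4, 8)
print(sim_features(a, b))
```
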
"for i, (train_index, dev_index) in enumerate(StratifiedKFold(n_splits=10).split(X=word1, y=label)): # word/char switch\n", 324 | " print(\"fold {} start\".format(i + 1))\n", 325 | " train_x1, train_x2, train_y = word1[train_index, :], word2[train_index, :], label[train_index] # word/char switch\n", 326 | " dev_x1, dev_x2, dev_y = word1[dev_index, :], word2[dev_index, :], label[dev_index] # word/char switch\n", 327 | " \n", 328 | " input1 = Input(shape=(SEQ_LEN,), dtype=\"int32\")\n", 329 | " input2 = Input(shape=(SEQ_LEN,), dtype=\"int32\")\n", 330 | "\n", 331 | " embedding_layer = Embedding(\n", 332 | " input_dim=word_embedding_data.shape[0], # word/char switch\n", 333 | " output_dim=word_embedding_data.shape[1], # word/char switch\n", 334 | " weights=[word_embedding_data], # word/char switch\n", 335 | " input_length=SEQ_LEN,\n", 336 | " trainable=False\n", 337 | " )\n", 338 | " \n", 339 | " vector1 = embedding_layer(input1)\n", 340 | " vector2 = embedding_layer(input2)\n", 341 | " \n", 342 | " input_layer = TimeDistributed(Dense(DENSE_INPUT))\n", 343 | " vector1 = input_layer(vector1)\n", 344 | " vector1 = BatchNormalization()(vector1)\n", 345 | " vector2 = input_layer(vector2)\n", 346 | " vector2 = BatchNormalization()(vector2)\n", 347 | " \n", 348 | " lstm_layer1 = LSTM(LSTM_SIZE1, dropout=LSTM_DROP_RATE, recurrent_dropout=LSTM_DROP_RATE, return_sequences=True)\n", 349 | " layer1a = lstm_layer1(vector1)\n", 350 | " layer1a = Dropout(LSTM_DROP_RATE)(layer1a)\n", 351 | " layer1b = lstm_layer1(vector2)\n", 352 | " layer1b = Dropout(LSTM_DROP_RATE)(layer1b)\n", 353 | " lstm_layer2 = LSTM(LSTM_SIZE2, dropout=LSTM_DROP_RATE, recurrent_dropout=LSTM_DROP_RATE, return_sequences=True)\n", 354 | " layer2a = lstm_layer2(layer1a)\n", 355 | " layer2b = lstm_layer2(layer1b)\n", 356 | " \n", 357 | " conv1a, conv1b = cnn_double_layer(layer2a, layer2b, filters=CONV_LEN1, kernel_size=1)\n", 358 | " conv2a, conv2b = cnn_double_layer(layer2a, layer2b, filters=CONV_LEN2, kernel_size=2)\n", 359 | " conv3a, conv3b = cnn_double_layer(layer2a, layer2b, filters=CONV_LEN3, kernel_size=3)\n", 360 | " conv4a, conv4b = cnn_double_layer(layer2a, layer2b, filters=CONV_LEN4, kernel_size=4)\n", 361 | " conv5a, conv5b = cnn_double_layer(layer2a, layer2b, filters=CONV_LEN5, kernel_size=5)\n", 362 | " conv6a, conv6b = cnn_double_layer(layer2a, layer2b, filters=CONV_LEN6, kernel_size=6)\n", 363 | "\n", 364 | " merge_a = concatenate([conv1a, conv2a, conv3a, conv4a, conv5a, conv6a])\n", 365 | " merge_b = concatenate([conv1b, conv2b, conv3b, conv4b, conv5b, conv6b])\n", 366 | " diff = Lambda(lambda x: K.abs(x[0] - x[1]))([merge_a, merge_b])\n", 367 | " mult = Lambda(lambda x: x[0] * x[1])([merge_a, merge_b])\n", 368 | " merge = concatenate([diff, mult])\n", 369 | " \n", 370 | " x = Dropout(DROP_RATE)(merge)\n", 371 | " x = BatchNormalization()(x)\n", 372 | " x = Dense(DENSE_SIZE1, activation=\"relu\")(x)\n", 373 | " x = Dropout(DROP_RATE)(x)\n", 374 | " x = BatchNormalization()(x)\n", 375 | " x = Dense(DENSE_SIZE2, activation=\"relu\")(x)\n", 376 | " x = Dropout(DROP_RATE)(x)\n", 377 | " x = BatchNormalization()(x)\n", 378 | " pred = Dense(1, activation=\"sigmoid\")(x)\n", 379 | " \n", 380 | " model = Model(inputs=[input1, input2], outputs=pred)\n", 381 | " model.compile(\n", 382 | " optimizer=\"nadam\",\n", 383 | " loss=\"binary_crossentropy\",\n", 384 | " metrics=[\"acc\"]\n", 385 | " )\n", 386 | " \n", 387 | " early_stopping = EarlyStopping(\"val_loss\", patience=8)\n", 388 | " lr_reducer = ReduceLROnPlateau(factor=0.5, 
patience=3, min_lr=0.001)\n", 389 | " check_point = ModelCheckpoint(\n", 390 | " \"./log/%s.Multi_LSTM_CNN_v4.word.{epoch:03d}.hdf5\" % (datetime.now().strftime(\"%Y%m%d-%H%M%S\")), # word/char switch\n", 391 | " monitor=\"val_loss\",\n", 392 | " save_best_only=True,\n", 393 | " )\n", 394 | " \n", 395 | " fit_res = model.fit(\n", 396 | " x=[train_x1, train_x2],\n", 397 | " y=train_y,\n", 398 | " batch_size=BATCH_SIZE,\n", 399 | " epochs=NUM_EPOCHES,\n", 400 | " validation_data=([dev_x1, dev_x2], dev_y),\n", 401 | " shuffle=True,\n", 402 | " callbacks=[early_stopping, lr_reducer, check_point]\n", 403 | " )\n", 404 | " \n", 405 | " pred_last = model.predict([test_word1, test_word2], batch_size=BATCH_SIZE) # word/char switch\n", 406 | " last_results.append(pd.DataFrame(pred_last, columns=[\"y_pre\"]))\n", 407 | " \n", 408 | " best_model_file = glob(\"./log/*.hdf5\")[-1].replace(\"\\\\\", \"/\")\n", 409 | " best_file_names.append(best_model_file)\n", 410 | " print(\"load model %s\" % (best_model_file,))\n", 411 | " model.load_weights(best_model_file)\n", 412 | " pred_best = model.predict([test_word1, test_word2], batch_size=BATCH_SIZE) # word/char switch\n", 413 | " best_results.append(pd.DataFrame(pred_best, columns=[\"y_pre\"]))\n", 414 | "\n", 415 | "pd.DataFrame(pd.concat(last_results, axis=1).mean(axis=1), columns=[\"y_pre\"]).to_csv(\n", 416 | " \"./result/%s-Multi_LSTM_CNN_v4_word_last.csv\" % (datetime.now().strftime(\"%Y%m%d-%H%M%S\")), # word/char switch\n", 417 | " index=False\n", 418 | ")\n", 419 | "pd.DataFrame(pd.concat(best_results, axis=1).mean(axis=1), columns=[\"y_pre\"]).to_csv(\n", 420 | " \"./result/%s-Multi_LSTM_CNN_v4_word_best.csv\" % (datetime.now().strftime(\"%Y%m%d-%H%M%S\")), # word/char switch\n", 421 | " index=False\n", 422 | ")\n", 423 | "\n", 424 | "model_path = \"./log/\" + datetime.now().strftime(\"%Y%m%d-%H%M%S\") + \"/\"\n", 425 | "os.mkdir(model_path)\n", 426 | "for model_name in best_file_names:\n", 427 | " abs_name = os.path.split(model_name)[1]\n", 428 | " os.rename(model_name, model_path + abs_name)" 429 | ] 430 | }, 431 | { 432 | "cell_type": "markdown", 433 | "metadata": { 434 | "collapsed": true 435 | }, 436 | "source": [ 437 | "# CHARS" 438 | ] 439 | }, 440 | { 441 | "cell_type": "code", 442 | "execution_count": null, 443 | "metadata": {}, 444 | "outputs": [], 445 | "source": [ 446 | "# from sklearn.model_selection import StratifiedKFold\n", 447 | "\n", 448 | "# best_results = []\n", 449 | "# last_results = []\n", 450 | "# best_file_names = []\n", 451 | "\n", 452 | "# for i, (train_index, dev_index) in enumerate(StratifiedKFold(n_splits=10).split(X=char1, y=label)): # word/char switch\n", 453 | "# print(\"fold {} start\".format(i + 1))\n", 454 | "# train_x1, train_x2, train_y = char1[train_index, :], char2[train_index, :], label[train_index] # word/char switch\n", 455 | "# dev_x1, dev_x2, dev_y = char1[dev_index, :], char2[dev_index, :], label[dev_index] # word/char switch\n", 456 | " \n", 457 | "# input1 = Input(shape=(SEQ_LEN,), dtype=\"int32\")\n", 458 | "# input2 = Input(shape=(SEQ_LEN,), dtype=\"int32\")\n", 459 | "\n", 460 | "# embedding_layer = Embedding(\n", 461 | "# input_dim=char_embedding_data.shape[0], # word/char switch\n", 462 | "# output_dim=char_embedding_data.shape[1], # word/char switch\n", 463 | "# weights=[char_embedding_data], # word/char switch\n", 464 | "# input_length=SEQ_LEN,\n", 465 | "# trainable=False\n", 466 | "# )\n", 467 | " \n", 468 | "# vector1 = embedding_layer(input1)\n", 469 | "# vector2 = 
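
The WORDS training cell above trains one model per StratifiedKFold split, predicts the test set with both the last-epoch weights and the best checkpoint, and averages the per-fold predictions column-wise before writing the submission. A stripped-down sketch of that averaging pattern; `build_and_fit` is a hypothetical callable standing in for the model-construction and fitting code above:

```python
import pandas as pd
from sklearn.model_selection import StratifiedKFold

def kfold_bagged_predictions(x1, x2, y, test_x1, test_x2, build_and_fit, n_splits=10):
    """Train one model per fold and average its test-set predictions.

    `build_and_fit` receives the fold's training and validation arrays and
    returns a fitted model exposing `.predict`.
    """
    fold_preds = []
    skf = StratifiedKFold(n_splits=n_splits)
    for fold, (train_idx, dev_idx) in enumerate(skf.split(X=x1, y=y)):
        model = build_and_fit(
            (x1[train_idx], x2[train_idx], y[train_idx]),
            (x1[dev_idx], x2[dev_idx], y[dev_idx]),
        )
        pred = model.predict([test_x1, test_x2]).ravel()
        fold_preds.append(pd.Series(pred, name="fold_%d" % fold))
    # Same column-wise mean as pd.concat(...).mean(axis=1) in the cell above.
    return pd.concat(fold_preds, axis=1).mean(axis=1).rename("y_pre")
```
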
embedding_layer(input2)\n", 470 | " \n", 471 | "# input_layer = TimeDistributed(Dense(DENSE_INPUT))\n", 472 | "# vector1 = input_layer(vector1)\n", 473 | "# vector1 = BatchNormalization()(vector1)\n", 474 | "# vector2 = input_layer(vector2)\n", 475 | "# vector2 = BatchNormalization()(vector2)\n", 476 | " \n", 477 | "# lstm_layer1 = LSTM(LSTM_SIZE1, dropout=LSTM_DROP_RATE, recurrent_dropout=LSTM_DROP_RATE, return_sequences=True)\n", 478 | "# layer1a = lstm_layer1(vector1)\n", 479 | "# layer1a = Dropout(LSTM_DROP_RATE)(layer1a)\n", 480 | "# layer1b = lstm_layer1(vector2)\n", 481 | "# layer1b = Dropout(LSTM_DROP_RATE)(layer1b)\n", 482 | "# lstm_layer2 = LSTM(LSTM_SIZE2, dropout=LSTM_DROP_RATE, recurrent_dropout=LSTM_DROP_RATE, return_sequences=True)\n", 483 | "# layer2a = lstm_layer2(layer1a)\n", 484 | "# layer2b = lstm_layer2(layer1b)\n", 485 | " \n", 486 | "# conv1a, conv1b = cnn_double_layer(layer2a, layer2b, filters=CONV_LEN1, kernel_size=1)\n", 487 | "# conv2a, conv2b = cnn_double_layer(layer2a, layer2b, filters=CONV_LEN2, kernel_size=2)\n", 488 | "# conv3a, conv3b = cnn_double_layer(layer2a, layer2b, filters=CONV_LEN3, kernel_size=3)\n", 489 | "# conv4a, conv4b = cnn_double_layer(layer2a, layer2b, filters=CONV_LEN4, kernel_size=4)\n", 490 | "# conv5a, conv5b = cnn_double_layer(layer2a, layer2b, filters=CONV_LEN5, kernel_size=5)\n", 491 | "# conv6a, conv6b = cnn_double_layer(layer2a, layer2b, filters=CONV_LEN6, kernel_size=6)\n", 492 | "\n", 493 | "# merge_a = concatenate([conv1a, conv2a, conv3a, conv4a, conv5a, conv6a])\n", 494 | "# merge_b = concatenate([conv1b, conv2b, conv3b, conv4b, conv5b, conv6b])\n", 495 | "# diff = Lambda(lambda x: K.abs(x[0] - x[1]))([merge_a, merge_b])\n", 496 | "# mult = Lambda(lambda x: x[0] * x[1])([merge_a, merge_b])\n", 497 | "# merge = concatenate([diff, mult])\n", 498 | " \n", 499 | "# x = Dropout(DROP_RATE)(merge)\n", 500 | "# x = BatchNormalization()(x)\n", 501 | "# x = Dense(DENSE_SIZE1, activation=\"relu\")(x)\n", 502 | "# x = Dropout(DROP_RATE)(x)\n", 503 | "# x = BatchNormalization()(x)\n", 504 | "# x = Dense(DENSE_SIZE2, activation=\"relu\")(x)\n", 505 | "# x = Dropout(DROP_RATE)(x)\n", 506 | "# x = BatchNormalization()(x)\n", 507 | "# pred = Dense(1, activation=\"sigmoid\")(x)\n", 508 | " \n", 509 | "# model = Model(inputs=[input1, input2], outputs=pred)\n", 510 | "# model.compile(\n", 511 | "# optimizer=\"nadam\",\n", 512 | "# loss=\"binary_crossentropy\",\n", 513 | "# metrics=[\"acc\"]\n", 514 | "# )\n", 515 | " \n", 516 | "# early_stopping = EarlyStopping(\"val_loss\", patience=8)\n", 517 | "# lr_reducer = ReduceLROnPlateau(factor=0.5, patience=3, min_lr=0.001)\n", 518 | "# check_point = ModelCheckpoint(\n", 519 | "# \"./log/%s.Multi_LSTM_CNN_v4.char.{epoch:03d}.hdf5\" % (datetime.now().strftime(\"%Y%m%d-%H%M%S\")), # word/char switch\n", 520 | "# monitor=\"val_loss\",\n", 521 | "# save_best_only=True,\n", 522 | "# )\n", 523 | " \n", 524 | "# fit_res = model.fit(\n", 525 | "# x=[train_x1, train_x2],\n", 526 | "# y=train_y,\n", 527 | "# batch_size=BATCH_SIZE,\n", 528 | "# epochs=NUM_EPOCHES,\n", 529 | "# validation_data=([dev_x1, dev_x2], dev_y),\n", 530 | "# shuffle=True,\n", 531 | "# callbacks=[early_stopping, lr_reducer, check_point]\n", 532 | "# )\n", 533 | " \n", 534 | "# pred_last = model.predict([test_char1, test_char2], batch_size=BATCH_SIZE) # word/char switch\n", 535 | "# last_results.append(pd.DataFrame(pred_last, columns=[\"y_pre\"]))\n", 536 | " \n", 537 | "# best_model_file = glob(\"./log/*.hdf5\")[-1].replace(\"\\\\\", 
\"/\")\n", 538 | "# best_file_names.append(best_model_file)\n", 539 | "# print(\"load model %s\" % (best_model_file,))\n", 540 | "# model.load_weights(best_model_file)\n", 541 | "# pred_best = model.predict([test_char1, test_char2], batch_size=BATCH_SIZE) # word/char switch\n", 542 | "# best_results.append(pd.DataFrame(pred_best, columns=[\"y_pre\"]))\n", 543 | "\n", 544 | "# pd.DataFrame(pd.concat(last_results, axis=1).mean(axis=1), columns=[\"y_pre\"]).to_csv(\n", 545 | "# \"./result/%s-Multi_LSTM_CNN_v4_char_last.csv\" % (datetime.now().strftime(\"%Y%m%d-%H%M%S\")), # word/char switch\n", 546 | "# index=False\n", 547 | "# )\n", 548 | "# pd.DataFrame(pd.concat(best_results, axis=1).mean(axis=1), columns=[\"y_pre\"]).to_csv(\n", 549 | "# \"./result/%s-Multi_LSTM_CNN_v4_char_best.csv\" % (datetime.now().strftime(\"%Y%m%d-%H%M%S\")), # word/char switch\n", 550 | "# index=False\n", 551 | "# )\n", 552 | "\n", 553 | "# model_path = \"./log/\" + datetime.now().strftime(\"%Y%m%d-%H%M%S\") + \"/\"\n", 554 | "# os.mkdir(model_path)\n", 555 | "# for model_name in best_file_names:\n", 556 | "# abs_name = os.path.split(model_name)[1]\n", 557 | "# os.rename(model_name, model_path + abs_name)" 558 | ] 559 | } 560 | ], 561 | "metadata": { 562 | "kernelspec": { 563 | "display_name": "Python 3", 564 | "language": "python", 565 | "name": "python3" 566 | }, 567 | "language_info": { 568 | "codemirror_mode": { 569 | "name": "ipython", 570 | "version": 3 571 | }, 572 | "file_extension": ".py", 573 | "mimetype": "text/x-python", 574 | "name": "python", 575 | "nbconvert_exporter": "python", 576 | "pygments_lexer": "ipython3", 577 | "version": "3.6.2" 578 | } 579 | }, 580 | "nbformat": 4, 581 | "nbformat_minor": 2 582 | } 583 | -------------------------------------------------------------------------------- /[Model] Multi LSTM CNN v2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## 模型结构\n", 8 | "\n", 9 | "- 两层单向LSTM, 输出序列结果, 即(batch_size, step_size, feature_size)\n", 10 | "- 分别输入到1, 2, 3, 4, 5, 6共6个不同长度的卷积层中\n", 11 | "- 卷积层为单层\n", 12 | "- 对于每个问题, 将所有卷积核结果并起来\n", 13 | "- 将两个问题并起来的结果, 分别[相减并取绝对值], [x]\n", 14 | "\n", 15 | "## 训练技巧\n", 16 | "\n", 17 | "- 首先正常训练一定的epoch, 使用Adam方法\n", 18 | "- 待loss降到一定水平后, 开放embedding参数的训练, 继续使用Adam方法训练, 并加入学习率衰减callback # (效果不好, dev loss降不下去)\n", 19 | "- 待loss降到比较低的水平后, 改用SGD方法进行训练, 直至结束 # (貌似不可行, 取消)" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 1, 25 | "metadata": { 26 | "collapsed": true 27 | }, 28 | "outputs": [], 29 | "source": [ 30 | "import warnings\n", 31 | "warnings.filterwarnings(\"ignore\")\n", 32 | "\n", 33 | "import numpy as np\n", 34 | "import pandas as pd\n", 35 | "from glob import glob\n", 36 | "from datetime import datetime" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 2, 42 | "metadata": {}, 43 | "outputs": [ 44 | { 45 | "name": "stderr", 46 | "output_type": "stream", 47 | "text": [ 48 | "Using TensorFlow backend.\n" 49 | ] 50 | }, 51 | { 52 | "data": { 53 | "text/plain": [ 54 | "((10001, 300), (3049, 300))" 55 | ] 56 | }, 57 | "execution_count": 2, 58 | "metadata": {}, 59 | "output_type": "execute_result" 60 | } 61 | ], 62 | "source": [ 63 | "DATA_PATH = \"./data/\"\n", 64 | "TRAIN_PATH = DATA_PATH + \"train.csv\"\n", 65 | "TEST_PATH = DATA_PATH + \"test.csv\"\n", 66 | "WORD_EMBED_PATH = DATA_PATH + \"word_embed.txt\"\n", 67 | "CHAR_EMBED_PATH = DATA_PATH + 
\"char_embed.txt\"\n", 68 | "QUEST_PATH = DATA_PATH + \"question.csv\"\n", 69 | "\n", 70 | "train_data = pd.read_csv(TRAIN_PATH)\n", 71 | "test_data = pd.read_csv(TEST_PATH)\n", 72 | "question_data = pd.read_csv(QUEST_PATH)\n", 73 | "word_embedding_data = pd.read_csv(WORD_EMBED_PATH, delimiter=\" \", header=None, index_col=0)\n", 74 | "char_embedding_data = pd.read_csv(CHAR_EMBED_PATH, delimiter=\" \", header=None, index_col=0)\n", 75 | "\n", 76 | "question_data[\"words\"] = question_data[\"words\"].str.split(\" \")\n", 77 | "question_data[\"chars\"] = question_data[\"chars\"].str.split(\" \")\n", 78 | "\n", 79 | "label = train_data[\"label\"].values\n", 80 | "\n", 81 | "from keras.preprocessing.text import Tokenizer\n", 82 | "\n", 83 | "MAX_COUNT = 10000\n", 84 | "\n", 85 | "word_tokenizer = Tokenizer(MAX_COUNT)\n", 86 | "word_tokenizer.fit_on_texts(question_data[\"words\"])\n", 87 | "\n", 88 | "word_embedding_data = np.concatenate(\n", 89 | " (\n", 90 | " np.zeros(shape=(1, word_embedding_data.shape[1]), dtype=np.float64),\n", 91 | " word_embedding_data.loc[list(word_tokenizer.word_index.keys())[:MAX_COUNT]].values\n", 92 | " ),\n", 93 | " axis=0\n", 94 | ")\n", 95 | "\n", 96 | "char_tokenizer = Tokenizer(MAX_COUNT)\n", 97 | "char_tokenizer.fit_on_texts(question_data[\"chars\"])\n", 98 | "\n", 99 | "char_embedding_data = np.concatenate(\n", 100 | " (\n", 101 | " np.zeros(shape=(1, char_embedding_data.shape[1]), dtype=np.float64),\n", 102 | " char_embedding_data.loc[list(char_tokenizer.word_index.keys())[:MAX_COUNT]].values\n", 103 | " ),\n", 104 | " axis=0\n", 105 | ")\n", 106 | "\n", 107 | "word_embedding_data.shape, char_embedding_data.shape" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 3, 113 | "metadata": {}, 114 | "outputs": [ 115 | { 116 | "data": { 117 | "text/plain": [ 118 | "((254386, 30),\n", 119 | " (254386, 30),\n", 120 | " (172956, 30),\n", 121 | " (172956, 30),\n", 122 | " (254386, 30),\n", 123 | " (254386, 30),\n", 124 | " (172956, 30),\n", 125 | " (172956, 30))" 126 | ] 127 | }, 128 | "execution_count": 3, 129 | "metadata": {}, 130 | "output_type": "execute_result" 131 | } 132 | ], 133 | "source": [ 134 | "from keras.preprocessing.sequence import pad_sequences\n", 135 | "\n", 136 | "SEQ_LEN = 30\n", 137 | "\n", 138 | "def gen_word_data(data):\n", 139 | " seq_word1 = word_tokenizer.texts_to_sequences(data.merge(question_data, how=\"left\", left_on=\"q1\", right_on=\"qid\")[\"words\"])\n", 140 | " seq_word2 = word_tokenizer.texts_to_sequences(data.merge(question_data, how=\"left\", left_on=\"q2\", right_on=\"qid\")[\"words\"])\n", 141 | " return pad_sequences(seq_word1, maxlen=SEQ_LEN, padding=\"pre\",truncating=\"pre\"), \\\n", 142 | " pad_sequences(seq_word2, maxlen=SEQ_LEN, padding=\"pre\",truncating=\"pre\")\n", 143 | " \n", 144 | "def gen_char_data(data):\n", 145 | " seq_char1 = char_tokenizer.texts_to_sequences(data.merge(question_data, how=\"left\", left_on=\"q1\", right_on=\"qid\")[\"chars\"])\n", 146 | " seq_char2 = char_tokenizer.texts_to_sequences(data.merge(question_data, how=\"left\", left_on=\"q2\", right_on=\"qid\")[\"chars\"])\n", 147 | " return pad_sequences(seq_char1, maxlen=SEQ_LEN, padding=\"pre\",truncating=\"pre\"), \\\n", 148 | " pad_sequences(seq_char2, maxlen=SEQ_LEN, padding=\"pre\",truncating=\"pre\")\n", 149 | "\n", 150 | "word1, word2 = gen_word_data(train_data)\n", 151 | "char1, char2 = gen_char_data(train_data)\n", 152 | "test_word1, test_word2 = gen_word_data(test_data)\n", 153 | "test_char1, test_char2 = 
gen_char_data(test_data)\n", 154 | "\n", 155 | "word1.shape, word2.shape, test_word1.shape, test_word2.shape, char1.shape, char2.shape, test_char1.shape, test_char2.shape" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 4, 161 | "metadata": { 162 | "collapsed": true 163 | }, 164 | "outputs": [], 165 | "source": [ 166 | "from keras.models import Model\n", 167 | "from keras.layers.merge import concatenate\n", 168 | "from keras.optimizers import Adam, SGD, Nadam\n", 169 | "from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau\n", 170 | "from keras.layers import LSTM, Bidirectional, TimeDistributed\n", 171 | "from keras.layers import Conv1D, MaxPool1D, GlobalAveragePooling1D\n", 172 | "from keras.layers import Input, Embedding, Dropout, BatchNormalization, Dense, Flatten, Lambda, K" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": 5, 178 | "metadata": { 179 | "collapsed": true 180 | }, 181 | "outputs": [], 182 | "source": [ 183 | "# general\n", 184 | "NUM_EPOCHES = 30\n", 185 | "EPOCHES1 = 5\n", 186 | "EPOCHES2 = 25 # 5\n", 187 | "EPOCHES3 = 22\n", 188 | "BATCH_SIZE = 1024\n", 189 | "DROP_RATE = 0.3\n", 190 | "\n", 191 | "# cnn\n", 192 | "CONV_LEN1 = 128\n", 193 | "CONV_LEN2 = 128\n", 194 | "CONV_LEN3 = 128\n", 195 | "CONV_LEN4 = 128\n", 196 | "CONV_LEN5 = 128\n", 197 | "CONV_LEN6 = 128\n", 198 | "CONV_LEN = CONV_LEN1 + CONV_LEN2 + CONV_LEN3 + CONV_LEN4 + CONV_LEN5 + CONV_LEN6\n", 199 | "\n", 200 | "# lstm\n", 201 | "LSTM_SIZE1 = 256\n", 202 | "LSTM_SIZE2 = 256\n", 203 | "LSTM_DROP_RATE = 0.3\n", 204 | "\n", 205 | "# dense\n", 206 | "DENSE_SIZE1 = 512\n", 207 | "DENSE_SIZE2 = 256" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": 6, 213 | "metadata": { 214 | "collapsed": true 215 | }, 216 | "outputs": [], 217 | "source": [ 218 | "def cnn_layer1(inputa, inputb, filters, kernel_size):\n", 219 | " conv = Conv1D(filters=filters, kernel_size=kernel_size, padding=\"same\", activation=\"relu\")\n", 220 | " conv_outputa = conv(inputa)\n", 221 | " conv_outputa = GlobalAveragePooling1D()(conv_outputa)\n", 222 | " conv_outputb = conv(inputb)\n", 223 | " conv_outputb = GlobalAveragePooling1D()(conv_outputb)\n", 224 | " return conv_outputa, conv_outputb\n", 225 | " \n", 226 | "def cnn_layer2(inputa, inputb, filters, kernel_size):\n", 227 | " conv = Conv1D(filters=filters, kernel_size=kernel_size, padding=\"same\", activation=\"relu\")\n", 228 | " conv_outputa = conv(inputa)\n", 229 | " conv_outputa = MaxPool1D(pool_size=SEQ_LEN)(conv_outputa)\n", 230 | " conv_outputa = Flatten()(conv_outputa)\n", 231 | " conv_outputb = conv(inputb)\n", 232 | " conv_outputb = MaxPool1D(pool_size=SEQ_LEN)(conv_outputb)\n", 233 | " conv_outputb = Flatten()(conv_outputb)\n", 234 | " return conv_outputa, conv_outputb\n", 235 | "\n", 236 | "def cnn_layer3(inputa, inputb, filters, kernel_size):\n", 237 | " conv = Conv1D(filters=filters, kernel_size=kernel_size, padding=\"same\", activation=\"relu\")\n", 238 | " \n", 239 | " conv_outputa = conv(inputa)\n", 240 | " conv_outputa1 = Flatten()(MaxPool1D(pool_size=SEQ_LEN)(conv_outputa))\n", 241 | " conv_outputa2 = GlobalAveragePooling1D()(conv_outputa)\n", 242 | " conv_outputa = concatenate([conv_outputa1, conv_outputa2])\n", 243 | " \n", 244 | " conv_outputb = conv(inputb)\n", 245 | " conv_outputb1 = Flatten()(MaxPool1D(pool_size=SEQ_LEN)(conv_outputb))\n", 246 | " conv_outputb2 = GlobalAveragePooling1D()(conv_outputb)\n", 247 | " conv_outputb = concatenate([conv_outputb1, 
conv_outputb2])\n", 248 | " \n", 249 | " return conv_outputa, conv_outputb" 250 | ] 251 | }, 252 | { 253 | "cell_type": "markdown", 254 | "metadata": {}, 255 | "source": [ 256 | "# WORDS" 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": 7, 262 | "metadata": {}, 263 | "outputs": [ 264 | { 265 | "name": "stdout", 266 | "output_type": "stream", 267 | "text": [ 268 | "WARNING:tensorflow:From C:\\ProgramData\\Anaconda3\\lib\\site-packages\\tensorflow\\python\\util\\deprecation.py:497: calling conv1d (from tensorflow.python.ops.nn_ops) with data_format=NHWC is deprecated and will be removed in a future version.\n", 269 | "Instructions for updating:\n", 270 | "`NHWC` for data_format is deprecated, use `NWC` instead\n", 271 | "Train on 228946 samples, validate on 25440 samples\n", 272 | "Epoch 1/5\n", 273 | "228946/228946 [==============================] - 169s 738us/step - loss: 0.4468 - acc: 0.7899 - val_loss: 0.3382 - val_acc: 0.8574\n", 274 | "Epoch 2/5\n", 275 | "228946/228946 [==============================] - 160s 697us/step - loss: 0.3121 - acc: 0.8629 - val_loss: 0.2594 - val_acc: 0.8870\n", 276 | "Epoch 3/5\n", 277 | "228946/228946 [==============================] - 166s 726us/step - loss: 0.2687 - acc: 0.8839 - val_loss: 0.2528 - val_acc: 0.8978\n", 278 | "Epoch 4/5\n", 279 | "228946/228946 [==============================] - 166s 724us/step - loss: 0.2414 - acc: 0.8968 - val_loss: 0.2279 - val_acc: 0.9031\n", 280 | "Epoch 5/5\n", 281 | "228946/228946 [==============================] - 167s 730us/step - loss: 0.2240 - acc: 0.9047 - val_loss: 0.2136 - val_acc: 0.9123\n", 282 | "Train on 228946 samples, validate on 25440 samples\n", 283 | "Epoch 1/25\n", 284 | "228946/228946 [==============================] - 178s 776us/step - loss: 0.2172 - acc: 0.9079 - val_loss: 0.2190 - val_acc: 0.9129\n", 285 | "Epoch 2/25\n", 286 | "228946/228946 [==============================] - 173s 755us/step - loss: 0.2038 - acc: 0.9146 - val_loss: 0.2058 - val_acc: 0.9172\n", 287 | "Epoch 3/25\n", 288 | "228946/228946 [==============================] - 176s 770us/step - loss: 0.1922 - acc: 0.9196 - val_loss: 0.2084 - val_acc: 0.9175\n", 289 | "Epoch 4/25\n", 290 | "228946/228946 [==============================] - 175s 764us/step - loss: 0.1825 - acc: 0.9239 - val_loss: 0.2094 - val_acc: 0.9196\n", 291 | "Epoch 5/25\n", 292 | "228946/228946 [==============================] - 176s 770us/step - loss: 0.1746 - acc: 0.9277 - val_loss: 0.1939 - val_acc: 0.9237\n", 293 | "Epoch 6/25\n", 294 | "228946/228946 [==============================] - 176s 769us/step - loss: 0.1684 - acc: 0.9303 - val_loss: 0.1912 - val_acc: 0.9258\n", 295 | "Epoch 7/25\n", 296 | "228946/228946 [==============================] - 175s 766us/step - loss: 0.1616 - acc: 0.9342 - val_loss: 0.1897 - val_acc: 0.9267\n", 297 | "Epoch 8/25\n", 298 | "228946/228946 [==============================] - 172s 752us/step - loss: 0.1570 - acc: 0.9352 - val_loss: 0.1979 - val_acc: 0.9271\n", 299 | "Epoch 9/25\n", 300 | "228946/228946 [==============================] - 178s 779us/step - loss: 0.1525 - acc: 0.9376 - val_loss: 0.1896 - val_acc: 0.9278\n", 301 | "Epoch 10/25\n", 302 | "228946/228946 [==============================] - 178s 776us/step - loss: 0.1482 - acc: 0.9391 - val_loss: 0.2019 - val_acc: 0.9228\n", 303 | "Epoch 11/25\n", 304 | "228946/228946 [==============================] - 178s 775us/step - loss: 0.1446 - acc: 0.9411 - val_loss: 0.1884 - val_acc: 0.9289\n", 305 | "Epoch 12/25\n", 306 | "228946/228946 
[==============================] - 176s 771us/step - loss: 0.1411 - acc: 0.9422 - val_loss: 0.1884 - val_acc: 0.9308\n", 307 | "Epoch 13/25\n", 308 | "228946/228946 [==============================] - 176s 768us/step - loss: 0.1387 - acc: 0.9427 - val_loss: 0.1833 - val_acc: 0.9323\n", 309 | "Epoch 14/25\n", 310 | "228946/228946 [==============================] - 176s 771us/step - loss: 0.1354 - acc: 0.9442 - val_loss: 0.1861 - val_acc: 0.9309\n", 311 | "Epoch 15/25\n", 312 | "228946/228946 [==============================] - 176s 771us/step - loss: 0.1327 - acc: 0.9455 - val_loss: 0.1982 - val_acc: 0.9292\n", 313 | "Epoch 16/25\n", 314 | "228946/228946 [==============================] - 177s 774us/step - loss: 0.1305 - acc: 0.9463 - val_loss: 0.2033 - val_acc: 0.9267\n", 315 | "Epoch 17/25\n", 316 | "159744/228946 [===================>..........] - ETA: 52s - loss: 0.1273 - acc: 0.9476" 317 | ] 318 | }, 319 | { 320 | "ename": "KeyboardInterrupt", 321 | "evalue": "", 322 | "output_type": "error", 323 | "traceback": [ 324 | "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", 325 | "\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", 326 | "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 97\u001b[0m \u001b[0mvalidation_data\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mdev_x1\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdev_x2\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdev_y\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 98\u001b[0m \u001b[0mshuffle\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mTrue\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 99\u001b[1;33m \u001b[0mcallbacks\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mearly_stopping\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mlr_reducer\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcheck_point\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 100\u001b[0m )\n\u001b[0;32m 101\u001b[0m \u001b[1;31m# # 第三次训练\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 327 | "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\keras\\engine\\training.py\u001b[0m in \u001b[0;36mfit\u001b[1;34m(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, **kwargs)\u001b[0m\n\u001b[0;32m 1040\u001b[0m \u001b[0minitial_epoch\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0minitial_epoch\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1041\u001b[0m \u001b[0msteps_per_epoch\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0msteps_per_epoch\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1042\u001b[1;33m validation_steps=validation_steps)\n\u001b[0m\u001b[0;32m 1043\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1044\u001b[0m def evaluate(self, x=None, y=None,\n", 328 | "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\keras\\engine\\training_arrays.py\u001b[0m in \u001b[0;36mfit_loop\u001b[1;34m(model, f, ins, out_labels, batch_size, epochs, verbose, callbacks, val_f, val_ins, shuffle, callback_metrics, initial_epoch, steps_per_epoch, validation_steps)\u001b[0m\n\u001b[0;32m 197\u001b[0m \u001b[0mins_batch\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mi\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m 
[... ANSI-coded traceback frames condensed: keras/engine/training_arrays.py (fit_loop) -> keras/backend/tensorflow_backend.py (__call__, _legacy_call) -> tensorflow/python/client/session.py (run, _run, _do_run, _do_call, _run_fn), where the blocking TF_Run call received the KeyboardInterrupt ...]
run_metadata)\u001b[0m\n\u001b[0;32m 1338\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1339\u001b[0m return tf_session.TF_Run(session, options, feed_dict, fetch_list,\n\u001b[1;32m-> 1340\u001b[1;33m target_list, status, run_metadata)\n\u001b[0m\u001b[0;32m 1341\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1342\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_prun_fn\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0msession\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mhandle\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfeed_dict\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfetch_list\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 336 | "\u001b[1;31mKeyboardInterrupt\u001b[0m: " 337 | ] 338 | } 339 | ], 340 | "source": [ 341 | "from sklearn.model_selection import StratifiedKFold\n", 342 | "\n", 343 | "best_results = []\n", 344 | "last_results = []\n", 345 | "best_file_names = []\n", 346 | "\n", 347 | "for i, (train_index, dev_index) in enumerate(StratifiedKFold(n_splits=10).split(X=word1, y=label)): # word/char switch\n", 348 | " train_x1, train_x2, train_y = word1[train_index, :], word2[train_index, :], label[train_index] # word/char switch\n", 349 | " dev_x1, dev_x2, dev_y = word1[dev_index, :], word2[dev_index, :], label[dev_index] # word/char switch\n", 350 | " \n", 351 | " input1 = Input(shape=(SEQ_LEN,), dtype=\"int32\")\n", 352 | " input2 = Input(shape=(SEQ_LEN,), dtype=\"int32\")\n", 353 | "\n", 354 | " embedding_layer = Embedding(\n", 355 | " input_dim=word_embedding_data.shape[0], # word/char switch\n", 356 | " output_dim=word_embedding_data.shape[1], # word/char switch\n", 357 | " weights=[word_embedding_data], # word/char switch\n", 358 | " input_length=SEQ_LEN,\n", 359 | " trainable=False\n", 360 | " )\n", 361 | " \n", 362 | " vector1 = embedding_layer(input1)\n", 363 | " vector2 = embedding_layer(input2)\n", 364 | " \n", 365 | " lstm_layer1 = LSTM(LSTM_SIZE1, dropout=LSTM_DROP_RATE, recurrent_dropout=LSTM_DROP_RATE, return_sequences=True)\n", 366 | " layer1a = lstm_layer1(vector1)\n", 367 | " layer1a = Dropout(LSTM_DROP_RATE)(layer1a)\n", 368 | " layer1b = lstm_layer1(vector2)\n", 369 | " layer1b = Dropout(LSTM_DROP_RATE)(layer1b)\n", 370 | "\n", 371 | " lstm_layer2 = LSTM(LSTM_SIZE2, dropout=LSTM_DROP_RATE, recurrent_dropout=LSTM_DROP_RATE, return_sequences=True)\n", 372 | " layer2a = lstm_layer2(layer1a)\n", 373 | " layer2b = lstm_layer2(layer1b)\n", 374 | "# # 每个序列片拼接对应的原始embedding向量\n", 375 | "# layer2a = concatenate([vector1, layer2a])\n", 376 | "# layer2b = concatenate([vector2, layer2b])\n", 377 | " \n", 378 | " conv1a, conv1b = cnn_layer3(layer2a, layer2b, filters=CONV_LEN1, kernel_size=1)\n", 379 | " conv2a, conv2b = cnn_layer3(layer2a, layer2b, filters=CONV_LEN2, kernel_size=2)\n", 380 | " conv3a, conv3b = cnn_layer3(layer2a, layer2b, filters=CONV_LEN3, kernel_size=3)\n", 381 | " conv4a, conv4b = cnn_layer3(layer2a, layer2b, filters=CONV_LEN4, kernel_size=4)\n", 382 | " conv5a, conv5b = cnn_layer3(layer2a, layer2b, filters=CONV_LEN5, kernel_size=5)\n", 383 | " conv6a, conv6b = cnn_layer3(layer2a, layer2b, filters=CONV_LEN6, kernel_size=6)\n", 384 | " \n", 385 | " merge_a = concatenate([conv1a, conv2a, conv3a, conv4a, conv5a, conv6a])\n", 386 | " merge_b = concatenate([conv1b, conv2b, conv3b, conv4b, conv5b, conv6b])\n", 387 | " diff = Lambda(lambda x: K.abs(x[0] - x[1]))([merge_a, merge_b])\n", 388 | " mult = Lambda(lambda x: x[0] * x[1])([merge_a, merge_b])\n", 389 | " merge = 
concatenate([diff, mult])\n", 390 | " \n", 391 | " x = Dropout(DROP_RATE)(merge)\n", 392 | " x = BatchNormalization()(x)\n", 393 | " x = Dense(DENSE_SIZE1, activation=\"relu\")(x)\n", 394 | " x = Dropout(DROP_RATE)(x)\n", 395 | " x = BatchNormalization()(x)\n", 396 | " x = Dense(DENSE_SIZE2, activation=\"relu\")(x)\n", 397 | " x = Dropout(DROP_RATE)(x)\n", 398 | " x = BatchNormalization()(x)\n", 399 | " pred = Dense(1, activation=\"sigmoid\")(x)\n", 400 | " \n", 401 | " model = Model(inputs=[input1, input2], outputs=pred)\n", 402 | " \n", 403 | " # 第一次训练\n", 404 | " model.compile(\n", 405 | " optimizer=\"nadam\",\n", 406 | " loss=\"binary_crossentropy\",\n", 407 | " metrics=[\"acc\"]\n", 408 | " )\n", 409 | " fit_res1 = model.fit(\n", 410 | " x=[train_x1, train_x2],\n", 411 | " y=train_y,\n", 412 | " batch_size=BATCH_SIZE,\n", 413 | " epochs=EPOCHES1,\n", 414 | " validation_data=([dev_x1, dev_x2], dev_y),\n", 415 | " shuffle=True,\n", 416 | " )\n", 417 | " # 第二次训练\n", 418 | " embedding_layer.trainable = True\n", 419 | " lr_reducer = ReduceLROnPlateau(factor=0.5, patience=4, min_lr=0.0005)\n", 420 | " early_stopping = EarlyStopping(\"val_loss\", patience=8)\n", 421 | " check_point = ModelCheckpoint(\n", 422 | " \"./log/%s.Multi_LSTM_CNN_v2.word.{epoch:03d}.hdf5\" % (datetime.now().strftime(\"%Y%m%d-%H%M%S\")), # word/char switch\n", 423 | " monitor=\"val_loss\",\n", 424 | " save_best_only=True,\n", 425 | " )\n", 426 | " model.compile(\n", 427 | " optimizer=\"nadam\",\n", 428 | " loss=\"binary_crossentropy\",\n", 429 | " metrics=[\"acc\"]\n", 430 | " )\n", 431 | " fit_res2 = model.fit(\n", 432 | " x=[train_x1, train_x2],\n", 433 | " y=train_y,\n", 434 | " batch_size=BATCH_SIZE,\n", 435 | " epochs=EPOCHES2,\n", 436 | " validation_data=([dev_x1, dev_x2], dev_y),\n", 437 | " shuffle=True,\n", 438 | " callbacks=[early_stopping, lr_reducer, check_point]\n", 439 | " )\n", 440 | " \n", 441 | "# # 第三次训练\n", 442 | "# model.compile(\n", 443 | "# optimizer=SGD(lr=0.001),\n", 444 | "# loss=\"binary_crossentropy\",\n", 445 | "# metrics=[\"acc\"]\n", 446 | "# )\n", 447 | "# fit_res3 = model.fit(\n", 448 | "# x=[train_x1, train_x2],\n", 449 | "# y=train_y,\n", 450 | "# batch_size=BATCH_SIZE,\n", 451 | "# epochs=EPOCHES3,\n", 452 | "# validation_data=([dev_x1, dev_x2], dev_y),\n", 453 | "# shuffle=True,\n", 454 | "# callbacks=[early_stopping, lr_reducer, check_point]\n", 455 | "# )\n", 456 | " \n", 457 | " pred_last = model.predict([test_word1, test_word2], batch_size=BATCH_SIZE) # word/char switch\n", 458 | " last_results.append(pd.DataFrame(pred_last, columns=[\"y_pre\"]))\n", 459 | " \n", 460 | " best_model_file = glob(\"./log/*.hdf5\")[-1].replace(\"\\\\\", \"/\")\n", 461 | " best_file_names.append(best_model_file)\n", 462 | " print(\"load model %s\" % (best_model_file,))\n", 463 | " model.load_weights(best_model_file)\n", 464 | " pred_best = model.predict([test_word1, test_word2], batch_size=BATCH_SIZE) # word/char switch\n", 465 | " best_results.append(pd.DataFrame(pred_best, columns=[\"y_pre\"]))\n", 466 | "\n", 467 | "pd.DataFrame(pd.concat(last_results, axis=1).mean(axis=1), columns=[\"y_pre\"]).to_csv(\n", 468 | " \"./result/%s-Multi_LSTM_CNN_v2_word_last.csv\" % (datetime.now().strftime(\"%Y%m%d-%H%M%S\")), # word/char switch\n", 469 | " index=False\n", 470 | ")\n", 471 | "pd.DataFrame(pd.concat(best_results, axis=1).mean(axis=1), columns=[\"y_pre\"]).to_csv(\n", 472 | " \"./result/%s-Multi_LSTM_CNN_v2_word_best.csv\" % (datetime.now().strftime(\"%Y%m%d-%H%M%S\")), # word/char switch\n", 473 | " 
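
The v2 training cell above uses a two-stage schedule: it first fits with the pretrained embedding frozen, then sets `embedding_layer.trainable = True` and compiles again before the second `fit` (the markdown notes that unfreezing did not help validation loss here, and that a final SGD stage was abandoned). A minimal sketch of the unfreeze-and-recompile pattern on a toy model, assuming standard Keras behaviour that a change to `trainable` only takes effect after another `compile`:

```python
import numpy as np
from keras.models import Model
from keras.layers import Input, Embedding, GlobalAveragePooling1D, Dense

VOCAB, DIM, SEQ = 50, 8, 10
pretrained = np.random.rand(VOCAB, DIM)  # toy stand-in for the pretrained vectors

inp = Input(shape=(SEQ,), dtype="int32")
emb = Embedding(VOCAB, DIM, weights=[pretrained], input_length=SEQ, trainable=False)
x = GlobalAveragePooling1D()(emb(inp))
out = Dense(1, activation="sigmoid")(x)
model = Model(inp, out)

x_toy = np.random.randint(0, VOCAB, size=(32, SEQ))
y_toy = np.random.randint(0, 2, size=(32,))

# Stage 1: embeddings frozen.
model.compile(optimizer="nadam", loss="binary_crossentropy")
model.fit(x_toy, y_toy, epochs=1, verbose=0)

# Stage 2: unfreeze the embedding; the change is only picked up by a fresh compile.
emb.trainable = True
model.compile(optimizer="nadam", loss="binary_crossentropy")
model.fit(x_toy, y_toy, epochs=1, verbose=0)
```
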
index=False\n", 474 | ")" 475 | ] 476 | }, 477 | { 478 | "cell_type": "code", 479 | "execution_count": null, 480 | "metadata": { 481 | "collapsed": true 482 | }, 483 | "outputs": [], 484 | "source": [] 485 | } 486 | ], 487 | "metadata": { 488 | "kernelspec": { 489 | "display_name": "Python 3", 490 | "language": "python", 491 | "name": "python3" 492 | }, 493 | "language_info": { 494 | "codemirror_mode": { 495 | "name": "ipython", 496 | "version": 3 497 | }, 498 | "file_extension": ".py", 499 | "mimetype": "text/x-python", 500 | "name": "python", 501 | "nbconvert_exporter": "python", 502 | "pygments_lexer": "ipython3", 503 | "version": "3.6.2" 504 | } 505 | }, 506 | "nbformat": 4, 507 | "nbformat_minor": 2 508 | } 509 | -------------------------------------------------------------------------------- /[Model] Multi LSTM CNN v5.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## 模型结构\n", 8 | "\n", 9 | "- 两层单向LSTM, 输出序列结果, 即(batch_size, step_size, feature_size)\n", 10 | "- 分别输入到1, 2, 3, 4, 5, 6共6个不同长度的卷积层中\n", 11 | "- 卷积层为双层, 最后的池化层有Average和Max两种\n", 12 | "- 对于每个问题, 将所有卷积核结果并起来\n", 13 | "- 将两个问题并起来的结果, 分别[相减并取绝对值], [x]\n", 14 | "- 融入特征" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": { 21 | "collapsed": true 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "import warnings\n", 26 | "warnings.filterwarnings(\"ignore\")\n", 27 | "\n", 28 | "import os\n", 29 | "import time\n", 30 | "import numpy as np\n", 31 | "import pandas as pd\n", 32 | "from glob import glob\n", 33 | "from datetime import datetime" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 2, 39 | "metadata": { 40 | "collapsed": true 41 | }, 42 | "outputs": [], 43 | "source": [ 44 | "try:\n", 45 | " os.mkdir(\"./log/\")\n", 46 | " os.mkdir(\"./result/\")\n", 47 | "except FileExistsError:\n", 48 | " pass" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 3, 54 | "metadata": { 55 | "collapsed": true 56 | }, 57 | "outputs": [], 58 | "source": [ 59 | "DATA_PATH = \"./data/\"\n", 60 | "TRAIN_PATH = DATA_PATH + \"train.csv\"\n", 61 | "TEST_PATH = DATA_PATH + \"test.csv\"\n", 62 | "WORD_EMBED_PATH = DATA_PATH + \"word_embed.txt\"\n", 63 | "CHAR_EMBED_PATH = DATA_PATH + \"char_embed.txt\"\n", 64 | "QUEST_PATH = DATA_PATH + \"question.csv\"\n", 65 | "TRAIN_FEATURE = DATA_PATH + \"train_feature.csv\"\n", 66 | "TEST_FEATURE = DATA_PATH + \"test_feature.csv\"" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 4, 72 | "metadata": {}, 73 | "outputs": [ 74 | { 75 | "name": "stderr", 76 | "output_type": "stream", 77 | "text": [ 78 | "Using TensorFlow backend.\n" 79 | ] 80 | }, 81 | { 82 | "data": { 83 | "text/plain": [ 84 | "((10001, 300), (3049, 300))" 85 | ] 86 | }, 87 | "execution_count": 4, 88 | "metadata": {}, 89 | "output_type": "execute_result" 90 | } 91 | ], 92 | "source": [ 93 | "train_data = pd.read_csv(TRAIN_PATH)\n", 94 | "test_data = pd.read_csv(TEST_PATH)\n", 95 | "question_data = pd.read_csv(QUEST_PATH)\n", 96 | "word_embedding_data = pd.read_csv(WORD_EMBED_PATH, delimiter=\" \", header=None, index_col=0)\n", 97 | "char_embedding_data = pd.read_csv(CHAR_EMBED_PATH, delimiter=\" \", header=None, index_col=0)\n", 98 | "train_feature = pd.read_csv(TRAIN_FEATURE).values\n", 99 | "test_feature = pd.read_csv(TEST_FEATURE).values\n", 100 | "\n", 101 | "question_data[\"words\"] = 
question_data[\"words\"].str.split(\" \")\n", 102 | "question_data[\"chars\"] = question_data[\"chars\"].str.split(\" \")\n", 103 | "\n", 104 | "label = train_data[\"label\"].values\n", 105 | "\n", 106 | "from keras.preprocessing.text import Tokenizer\n", 107 | "\n", 108 | "MAX_COUNT = 10000\n", 109 | "\n", 110 | "word_tokenizer = Tokenizer(MAX_COUNT)\n", 111 | "word_tokenizer.fit_on_texts(question_data[\"words\"])\n", 112 | "\n", 113 | "word_embedding_data = np.concatenate(\n", 114 | " (\n", 115 | " np.zeros(shape=(1, word_embedding_data.shape[1]), dtype=np.float64),\n", 116 | " word_embedding_data.loc[list(word_tokenizer.word_index.keys())[:MAX_COUNT]].values\n", 117 | " ),\n", 118 | " axis=0\n", 119 | ")\n", 120 | "\n", 121 | "char_tokenizer = Tokenizer(MAX_COUNT)\n", 122 | "char_tokenizer.fit_on_texts(question_data[\"chars\"])\n", 123 | "\n", 124 | "char_embedding_data = np.concatenate(\n", 125 | " (\n", 126 | " np.zeros(shape=(1, char_embedding_data.shape[1]), dtype=np.float64),\n", 127 | " char_embedding_data.loc[list(char_tokenizer.word_index.keys())[:MAX_COUNT]].values\n", 128 | " ),\n", 129 | " axis=0\n", 130 | ")\n", 131 | "\n", 132 | "word_embedding_data.shape, char_embedding_data.shape" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 5, 138 | "metadata": {}, 139 | "outputs": [ 140 | { 141 | "data": { 142 | "text/plain": [ 143 | "((254386, 25),\n", 144 | " (254386, 25),\n", 145 | " (172956, 25),\n", 146 | " (172956, 25),\n", 147 | " (254386, 25),\n", 148 | " (254386, 25),\n", 149 | " (172956, 25),\n", 150 | " (172956, 25))" 151 | ] 152 | }, 153 | "execution_count": 5, 154 | "metadata": {}, 155 | "output_type": "execute_result" 156 | } 157 | ], 158 | "source": [ 159 | "from keras.preprocessing.sequence import pad_sequences\n", 160 | "\n", 161 | "SEQ_LEN = 25\n", 162 | "\n", 163 | "def gen_word_data(data):\n", 164 | " seq_word1 = word_tokenizer.texts_to_sequences(data.merge(question_data, how=\"left\", left_on=\"q1\", right_on=\"qid\")[\"words\"])\n", 165 | " seq_word2 = word_tokenizer.texts_to_sequences(data.merge(question_data, how=\"left\", left_on=\"q2\", right_on=\"qid\")[\"words\"])\n", 166 | " return pad_sequences(seq_word1, maxlen=SEQ_LEN, padding=\"pre\",truncating=\"pre\"), \\\n", 167 | " pad_sequences(seq_word2, maxlen=SEQ_LEN, padding=\"pre\",truncating=\"pre\")\n", 168 | " \n", 169 | "def gen_char_data(data):\n", 170 | " seq_char1 = char_tokenizer.texts_to_sequences(data.merge(question_data, how=\"left\", left_on=\"q1\", right_on=\"qid\")[\"chars\"])\n", 171 | " seq_char2 = char_tokenizer.texts_to_sequences(data.merge(question_data, how=\"left\", left_on=\"q2\", right_on=\"qid\")[\"chars\"])\n", 172 | " return pad_sequences(seq_char1, maxlen=SEQ_LEN, padding=\"pre\",truncating=\"pre\"), \\\n", 173 | " pad_sequences(seq_char2, maxlen=SEQ_LEN, padding=\"pre\",truncating=\"pre\")\n", 174 | "\n", 175 | "word1, word2 = gen_word_data(train_data)\n", 176 | "char1, char2 = gen_char_data(train_data)\n", 177 | "test_word1, test_word2 = gen_word_data(test_data)\n", 178 | "test_char1, test_char2 = gen_char_data(test_data)\n", 179 | "\n", 180 | "word1.shape, word2.shape, test_word1.shape, test_word2.shape, char1.shape, char2.shape, test_char1.shape, test_char2.shape" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": 6, 186 | "metadata": { 187 | "collapsed": true 188 | }, 189 | "outputs": [], 190 | "source": [ 191 | "from keras.models import Model\n", 192 | "from keras.layers.merge import concatenate\n", 193 | "from 
keras.optimizers import Adam, SGD, Nadam\n", 194 | "from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau\n", 195 | "from keras.layers import LSTM, Bidirectional, TimeDistributed\n", 196 | "from keras.layers import Conv1D, MaxPool1D, GlobalAveragePooling1D\n", 197 | "from keras.layers import Input, Embedding, Dropout, BatchNormalization, Dense, Flatten, Lambda, K, Activation" 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": 7, 203 | "metadata": { 204 | "collapsed": true 205 | }, 206 | "outputs": [], 207 | "source": [ 208 | "# general\n", 209 | "NUM_EPOCHES = 25\n", 210 | "EPOCHES1 = 5\n", 211 | "EPOCHES2 = 25 # 5\n", 212 | "EPOCHES3 = 22\n", 213 | "BATCH_SIZE = 1024\n", 214 | "DROP_RATE = 0.3\n", 215 | "\n", 216 | "# cnn\n", 217 | "CONV_LEN1 = 128\n", 218 | "CONV_LEN2 = 128\n", 219 | "CONV_LEN3 = 128\n", 220 | "CONV_LEN4 = 128\n", 221 | "CONV_LEN5 = 128\n", 222 | "CONV_LEN6 = 128\n", 223 | "\n", 224 | "# lstm\n", 225 | "LSTM_SIZE1 = 256\n", 226 | "LSTM_SIZE2 = 256\n", 227 | "LSTM_DROP_RATE = 0.3\n", 228 | "\n", 229 | "# dense\n", 230 | "DENSE_SIZE1 = 512\n", 231 | "DENSE_SIZE2 = 256\n", 232 | "DENSE_FEATURE = 32" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": 8, 238 | "metadata": { 239 | "collapsed": true 240 | }, 241 | "outputs": [], 242 | "source": [ 243 | "def cnn_double_layer(inputa, inputb, filters, kernel_size):\n", 244 | " conv1 = Conv1D(filters=filters, kernel_size=kernel_size, padding=\"same\")\n", 245 | " conv2 = Conv1D(filters=filters, kernel_size=kernel_size, padding=\"same\")\n", 246 | " \n", 247 | " conv1a = conv1(inputa)\n", 248 | " conv1a = BatchNormalization()(conv1a)\n", 249 | " conv1a = Activation(activation=\"relu\")(conv1a)\n", 250 | " conv2a = conv2(conv1a)\n", 251 | " conv2a = BatchNormalization()(conv2a)\n", 252 | " conv2a = Activation(activation=\"relu\")(conv2a)\n", 253 | " output_avg_a = GlobalAveragePooling1D()(conv2a)\n", 254 | " output_max_a = Flatten()(MaxPool1D(pool_size=SEQ_LEN)(conv2a))\n", 255 | " output_a = concatenate([output_avg_a, output_max_a])\n", 256 | " \n", 257 | " conv1b = conv1(inputb)\n", 258 | " conv1b = BatchNormalization()(conv1b)\n", 259 | " conv1b = Activation(activation=\"relu\")(conv1b)\n", 260 | " conv2b = conv2(conv1b)\n", 261 | " conv2b = BatchNormalization()(conv2b)\n", 262 | " conv2b = Activation(activation=\"relu\")(conv2b)\n", 263 | " output_avg_b = GlobalAveragePooling1D()(conv2b)\n", 264 | " output_max_b = Flatten()(MaxPool1D(pool_size=SEQ_LEN)(conv2b))\n", 265 | " output_b = concatenate([output_avg_b, output_max_b])\n", 266 | " \n", 267 | " return output_a, output_b" 268 | ] 269 | }, 270 | { 271 | "cell_type": "markdown", 272 | "metadata": {}, 273 | "source": [ 274 | "# WORDS" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": 9, 280 | "metadata": {}, 281 | "outputs": [], 282 | "source": [ 283 | "# from sklearn.model_selection import StratifiedKFold\n", 284 | "\n", 285 | "# best_results = []\n", 286 | "# # last_results = []\n", 287 | "# best_file_names = []\n", 288 | "# dev_predictions = []\n", 289 | "\n", 290 | "# for i, (train_index, dev_index) in enumerate(StratifiedKFold(n_splits=10).split(X=word1, y=label)): # word/char switch\n", 291 | "# print(\"-\" * 60)\n", 292 | "# print(\"Fold {} training start...\".format(i))\n", 293 | " \n", 294 | "# train_x1, train_x2, train_y = word1[train_index, :], word2[train_index, :], label[train_index] # word/char switch\n", 295 | "# dev_x1, dev_x2, dev_y = word1[dev_index, :], 
word2[dev_index, :], label[dev_index] # word/char switch\n", 296 | "# train_f, dev_f = train_feature[train_index, :], train_feature[dev_index, :]\n", 297 | " \n", 298 | "# input1 = Input(shape=(SEQ_LEN,), dtype=\"int32\")\n", 299 | "# input2 = Input(shape=(SEQ_LEN,), dtype=\"int32\")\n", 300 | "# inputf = Input(shape=(train_f.shape[1],), dtype=\"float32\")\n", 301 | "\n", 302 | "# embedding_layer = Embedding(\n", 303 | "# input_dim=word_embedding_data.shape[0], # word/char switch\n", 304 | "# output_dim=word_embedding_data.shape[1], # word/char switch\n", 305 | "# weights=[word_embedding_data], # word/char switch\n", 306 | "# input_length=SEQ_LEN,\n", 307 | "# trainable=False\n", 308 | "# )\n", 309 | " \n", 310 | "# vector1 = embedding_layer(input1)\n", 311 | "# vector2 = embedding_layer(input2)\n", 312 | " \n", 313 | "# lstm_layer1 = LSTM(LSTM_SIZE1, dropout=LSTM_DROP_RATE, recurrent_dropout=LSTM_DROP_RATE, return_sequences=True)\n", 314 | "# layer1a = lstm_layer1(vector1)\n", 315 | "# layer1a = Dropout(LSTM_DROP_RATE)(layer1a)\n", 316 | "# layer1b = lstm_layer1(vector2)\n", 317 | "# layer1b = Dropout(LSTM_DROP_RATE)(layer1b)\n", 318 | "\n", 319 | "# lstm_layer2 = LSTM(LSTM_SIZE2, dropout=LSTM_DROP_RATE, recurrent_dropout=LSTM_DROP_RATE, return_sequences=True)\n", 320 | "# layer2a = lstm_layer2(layer1a)\n", 321 | "# layer2b = lstm_layer2(layer1b)\n", 322 | " \n", 323 | "# conv1a, conv1b = cnn_double_layer(layer2a, layer2b, filters=CONV_LEN1, kernel_size=1)\n", 324 | "# conv2a, conv2b = cnn_double_layer(layer2a, layer2b, filters=CONV_LEN2, kernel_size=2)\n", 325 | "# conv3a, conv3b = cnn_double_layer(layer2a, layer2b, filters=CONV_LEN3, kernel_size=3)\n", 326 | "# conv4a, conv4b = cnn_double_layer(layer2a, layer2b, filters=CONV_LEN4, kernel_size=4)\n", 327 | "# conv5a, conv5b = cnn_double_layer(layer2a, layer2b, filters=CONV_LEN5, kernel_size=5)\n", 328 | "# conv6a, conv6b = cnn_double_layer(layer2a, layer2b, filters=CONV_LEN6, kernel_size=6)\n", 329 | " \n", 330 | "# merge_a = concatenate([conv1a, conv2a, conv3a, conv4a, conv5a, conv6a])\n", 331 | "# merge_b = concatenate([conv1b, conv2b, conv3b, conv4b, conv5b, conv6b])\n", 332 | "# diff = Lambda(lambda x: K.abs(x[0] - x[1]))([merge_a, merge_b])\n", 333 | "# mult = Lambda(lambda x: x[0] * x[1])([merge_a, merge_b])\n", 334 | "# merge = concatenate([diff, mult])\n", 335 | " \n", 336 | "# x = Dropout(DROP_RATE)(merge)\n", 337 | "# x = BatchNormalization()(x)\n", 338 | "# x = Dense(DENSE_SIZE1, activation=\"relu\")(x)\n", 339 | " \n", 340 | "# fe = BatchNormalization()(inputf)\n", 341 | "# fe = Dense(DENSE_FEATURE, activation=\"relu\")(fe)\n", 342 | " \n", 343 | "# x = concatenate([x, fe])\n", 344 | " \n", 345 | "# x = Dropout(DROP_RATE)(x)\n", 346 | "# x = BatchNormalization()(x)\n", 347 | "# x = Dense(DENSE_SIZE2, activation=\"relu\")(x)\n", 348 | "# x = Dropout(DROP_RATE)(x)\n", 349 | "# x = BatchNormalization()(x)\n", 350 | "# pred = Dense(1, activation=\"sigmoid\")(x)\n", 351 | " \n", 352 | "# model = Model(inputs=[input1, input2, inputf], outputs=pred)\n", 353 | "# model.compile(\n", 354 | "# optimizer=\"nadam\",\n", 355 | "# loss=\"binary_crossentropy\",\n", 356 | "# metrics=[\"acc\"]\n", 357 | "# )\n", 358 | " \n", 359 | "# early_stopping = EarlyStopping(\"val_loss\", patience=6)\n", 360 | "# lr_reducer = ReduceLROnPlateau(factor=0.5, patience=3, min_lr=0.0005)\n", 361 | "# check_point = ModelCheckpoint(\n", 362 | "# \"./log/%s.Multi_LSTM_CNN_v3.word.{epoch:03d}.hdf5\" % (datetime.now().strftime(\"%Y%m%d-%H%M%S\")), # word/char 
switch\n", 363 | "# monitor=\"val_loss\",\n", 364 | "# save_best_only=True,\n", 365 | "# )\n", 366 | " \n", 367 | "# fit_res = model.fit(\n", 368 | "# x=[train_x1, train_x2, train_f],\n", 369 | "# y=train_y,\n", 370 | "# batch_size=BATCH_SIZE,\n", 371 | "# epochs=NUM_EPOCHES,\n", 372 | "# validation_data=([dev_x1, dev_x2, dev_f], dev_y),\n", 373 | "# shuffle=True,\n", 374 | "# callbacks=[early_stopping, lr_reducer, check_point]\n", 375 | "# )\n", 376 | " \n", 377 | "# # pred_last = model.predict([test_word1, test_word2], batch_size=BATCH_SIZE) # word/char switch\n", 378 | "# # last_results.append(pd.DataFrame(pred_last, columns=[\"y_pre\"]))\n", 379 | " \n", 380 | "# best_model_file = glob(\"./log/*.hdf5\")[-1].replace(\"\\\\\", \"/\")\n", 381 | "# best_file_names.append(best_model_file)\n", 382 | "# print(\"load model %s\" % (best_model_file,))\n", 383 | "# model.load_weights(best_model_file)\n", 384 | "# pred_best = model.predict([test_word1, test_word2, test_feature], batch_size=BATCH_SIZE) # word/char switch\n", 385 | "# best_results.append(pd.DataFrame(pred_best, columns=[\"y_pre\"]))\n", 386 | " \n", 387 | "# dev_pred = model.predict([dev_x1, dev_x2, dev_f], batch_size=BATCH_SIZE)\n", 388 | "# dev_result = pd.DataFrame({\"pred\": dev_pred.ravel(), \"label\": dev_y})\n", 389 | "# dev_predictions.append(dev_result)\n", 390 | "\n", 391 | "# # pd.DataFrame(pd.concat(last_results, axis=1).mean(axis=1), columns=[\"y_pre\"]).to_csv(\n", 392 | "# # \"./result/%s-Multi_LSTM_CNN_v5_word_last.csv\" % (datetime.now().strftime(\"%Y%m%d-%H%M%S\")), # word/char switch\n", 393 | "# # index=False\n", 394 | "# # )\n", 395 | "# pd.DataFrame(pd.concat(best_results, axis=1).mean(axis=1), columns=[\"y_pre\"]).to_csv(\n", 396 | "# \"./result/%s-Multi_LSTM_CNN_v5_word_best.csv\" % (datetime.now().strftime(\"%Y%m%d-%H%M%S\")), # word/char switch\n", 397 | "# index=False\n", 398 | "# )\n", 399 | "\n", 400 | "# total_dev = pd.concat(dev_predictions, axis=0)\n", 401 | "# total_dev.to_csv(\n", 402 | "# \"./result/%s-Multi_LSTM_CNN_v5_word_dev_result.csv\" % (datetime.now().strftime(\"%Y%m%d-%H%M%S\")),\n", 403 | "# index=False\n", 404 | "# )\n", 405 | "\n", 406 | "# model_path = \"./log/\" + datetime.now().strftime(\"%Y%m%d-%H%M%S\") + \"/\"\n", 407 | "# os.mkdir(model_path)\n", 408 | "# for model_name in best_file_names:\n", 409 | "# abs_name = os.path.split(model_name)[1]\n", 410 | "# os.rename(model_name, model_path + abs_name)" 411 | ] 412 | }, 413 | { 414 | "cell_type": "markdown", 415 | "metadata": {}, 416 | "source": [ 417 | "# CHARS" 418 | ] 419 | }, 420 | { 421 | "cell_type": "code", 422 | "execution_count": 10, 423 | "metadata": {}, 424 | "outputs": [ 425 | { 426 | "name": "stdout", 427 | "output_type": "stream", 428 | "text": [ 429 | "------------------------------------------------------------\n", 430 | "Fold 0 training start...\n", 431 | "WARNING:tensorflow:From C:\\ProgramData\\Anaconda3\\lib\\site-packages\\tensorflow\\python\\util\\deprecation.py:497: calling conv1d (from tensorflow.python.ops.nn_ops) with data_format=NHWC is deprecated and will be removed in a future version.\n", 432 | "Instructions for updating:\n", 433 | "`NHWC` for data_format is deprecated, use `NWC` instead\n", 434 | "Train on 228946 samples, validate on 25440 samples\n", 435 | "Epoch 1/25\n", 436 | "228946/228946 [==============================] - 198s 866us/step - loss: 0.3947 - acc: 0.8071 - val_loss: 0.3225 - val_acc: 0.8560\n", 437 | "Epoch 2/25\n", 438 | "228946/228946 [==============================] - 182s 
797us/step - loss: 0.2892 - acc: 0.8666 - val_loss: 0.2669 - val_acc: 0.8855\n", 439 | "Epoch 3/25\n", 440 | "228946/228946 [==============================] - 187s 818us/step - loss: 0.2522 - acc: 0.8873 - val_loss: 0.2193 - val_acc: 0.9046\n", 441 | "Epoch 4/25\n", 442 | "228946/228946 [==============================] - 193s 843us/step - loss: 0.2297 - acc: 0.8979 - val_loss: 0.2147 - val_acc: 0.9078\n", 443 | "Epoch 5/25\n", 444 | "228946/228946 [==============================] - 194s 848us/step - loss: 0.2145 - acc: 0.9059 - val_loss: 0.1984 - val_acc: 0.9136\n", 445 | "Epoch 6/25\n", 446 | "228946/228946 [==============================] - 182s 797us/step - loss: 0.2030 - acc: 0.9114 - val_loss: 0.1981 - val_acc: 0.9162\n", 447 | "Epoch 7/25\n", 448 | "228946/228946 [==============================] - 188s 823us/step - loss: 0.1934 - acc: 0.9166 - val_loss: 0.1876 - val_acc: 0.9191\n", 449 | "Epoch 8/25\n", 450 | "228946/228946 [==============================] - 185s 809us/step - loss: 0.1857 - acc: 0.9196 - val_loss: 0.1846 - val_acc: 0.9231\n", 451 | "Epoch 9/25\n", 452 | "228946/228946 [==============================] - 192s 839us/step - loss: 0.1798 - acc: 0.9231 - val_loss: 0.1780 - val_acc: 0.9268\n", 453 | "Epoch 10/25\n", 454 | "228946/228946 [==============================] - 187s 818us/step - loss: 0.1743 - acc: 0.9254 - val_loss: 0.1820 - val_acc: 0.9276\n", 455 | "Epoch 11/25\n", 456 | "228946/228946 [==============================] - 192s 838us/step - loss: 0.1693 - acc: 0.9275 - val_loss: 0.1749 - val_acc: 0.9291\n", 457 | "Epoch 12/25\n", 458 | "228946/228946 [==============================] - 193s 842us/step - loss: 0.1663 - acc: 0.9290 - val_loss: 0.1740 - val_acc: 0.9301\n", 459 | "Epoch 13/25\n", 460 | "228946/228946 [==============================] - 193s 844us/step - loss: 0.1614 - acc: 0.9309 - val_loss: 0.1744 - val_acc: 0.9292\n", 461 | "Epoch 14/25\n", 462 | "228946/228946 [==============================] - 193s 844us/step - loss: 0.1577 - acc: 0.9333 - val_loss: 0.1710 - val_acc: 0.9306\n", 463 | "Epoch 15/25\n", 464 | "228946/228946 [==============================] - 194s 846us/step - loss: 0.1543 - acc: 0.9346 - val_loss: 0.1704 - val_acc: 0.9309\n", 465 | "Epoch 16/25\n", 466 | "228946/228946 [==============================] - 192s 839us/step - loss: 0.1524 - acc: 0.9356 - val_loss: 0.1730 - val_acc: 0.9318\n", 467 | "Epoch 17/25\n", 468 | "228946/228946 [==============================] - 193s 842us/step - loss: 0.1477 - acc: 0.9377 - val_loss: 0.1647 - val_acc: 0.9346\n", 469 | "Epoch 18/25\n", 470 | "228946/228946 [==============================] - 183s 799us/step - loss: 0.1451 - acc: 0.9387 - val_loss: 0.1720 - val_acc: 0.9333\n", 471 | "Epoch 19/25\n", 472 | "228946/228946 [==============================] - 183s 799us/step - loss: 0.1433 - acc: 0.9395 - val_loss: 0.1699 - val_acc: 0.9341\n", 473 | "Epoch 20/25\n", 474 | "228946/228946 [==============================] - 190s 831us/step - loss: 0.1409 - acc: 0.9407 - val_loss: 0.1677 - val_acc: 0.9338\n", 475 | "Epoch 21/25\n", 476 | "228946/228946 [==============================] - 190s 831us/step - loss: 0.1315 - acc: 0.9447 - val_loss: 0.1687 - val_acc: 0.9346\n", 477 | "Epoch 22/25\n", 478 | "228946/228946 [==============================] - 193s 842us/step - loss: 0.1256 - acc: 0.9471 - val_loss: 0.1647 - val_acc: 0.9372\n", 479 | "Epoch 23/25\n", 480 | "228946/228946 [==============================] - 193s 845us/step - loss: 0.1222 - acc: 0.9490 - val_loss: 0.1644 - val_acc: 0.9365\n", 481 | "Epoch 
24/25\n", 482 | "228946/228946 [==============================] - 192s 839us/step - loss: 0.1197 - acc: 0.9498 - val_loss: 0.1695 - val_acc: 0.9358\n", 483 | "Epoch 25/25\n", 484 | "228946/228946 [==============================] - 192s 840us/step - loss: 0.1168 - acc: 0.9513 - val_loss: 0.1724 - val_acc: 0.9350\n", 485 | "load model ./log/20180715-102931.Multi_LSTM_CNN_v5.char.023.hdf5\n", 486 | "------------------------------------------------------------\n", 487 | "Fold 1 training start...\n", 488 | "Train on 228946 samples, validate on 25440 samples\n", 489 | "Epoch 1/25\n", 490 | "228946/228946 [==============================] - 198s 866us/step - loss: 0.3974 - acc: 0.8062 - val_loss: 0.3109 - val_acc: 0.8622\n", 491 | "Epoch 2/25\n", 492 | "228946/228946 [==============================] - 185s 807us/step - loss: 0.2894 - acc: 0.8665 - val_loss: 0.2533 - val_acc: 0.8928\n", 493 | "Epoch 3/25\n", 494 | "228946/228946 [==============================] - 190s 828us/step - loss: 0.2506 - acc: 0.8879 - val_loss: 0.2277 - val_acc: 0.9017\n", 495 | "Epoch 4/25\n", 496 | "228946/228946 [==============================] - 187s 815us/step - loss: 0.2285 - acc: 0.8992 - val_loss: 0.2086 - val_acc: 0.9099\n", 497 | "Epoch 5/25\n", 498 | "228946/228946 [==============================] - 183s 798us/step - loss: 0.2128 - acc: 0.9065 - val_loss: 0.2071 - val_acc: 0.9135\n", 499 | "Epoch 6/25\n", 500 | "228946/228946 [==============================] - 183s 798us/step - loss: 0.2018 - acc: 0.9124 - val_loss: 0.1964 - val_acc: 0.9168\n", 501 | "Epoch 7/25\n", 502 | "228946/228946 [==============================] - 189s 827us/step - loss: 0.1925 - acc: 0.9173 - val_loss: 0.1846 - val_acc: 0.9245\n", 503 | "Epoch 8/25\n", 504 | "228946/228946 [==============================] - 183s 799us/step - loss: 0.1860 - acc: 0.9197 - val_loss: 0.1793 - val_acc: 0.9238\n", 505 | "Epoch 9/25\n", 506 | "228946/228946 [==============================] - 188s 822us/step - loss: 0.1799 - acc: 0.9231 - val_loss: 0.1794 - val_acc: 0.9254\n", 507 | "Epoch 10/25\n", 508 | "228946/228946 [==============================] - 192s 839us/step - loss: 0.1736 - acc: 0.9255 - val_loss: 0.1836 - val_acc: 0.9225\n", 509 | "Epoch 11/25\n", 510 | "228946/228946 [==============================] - 188s 820us/step - loss: 0.1688 - acc: 0.9276 - val_loss: 0.1729 - val_acc: 0.9289\n", 511 | "Epoch 12/25\n", 512 | "228946/228946 [==============================] - 190s 831us/step - loss: 0.1657 - acc: 0.9291 - val_loss: 0.1711 - val_acc: 0.9296\n", 513 | "Epoch 13/25\n", 514 | "228946/228946 [==============================] - 191s 836us/step - loss: 0.1609 - acc: 0.9311 - val_loss: 0.1759 - val_acc: 0.9300\n", 515 | "Epoch 14/25\n", 516 | "228946/228946 [==============================] - 191s 835us/step - loss: 0.1578 - acc: 0.9328 - val_loss: 0.1673 - val_acc: 0.9318\n", 517 | "Epoch 15/25\n", 518 | "228946/228946 [==============================] - 191s 833us/step - loss: 0.1544 - acc: 0.9346 - val_loss: 0.1677 - val_acc: 0.9309\n", 519 | "Epoch 16/25\n", 520 | "228946/228946 [==============================] - 192s 840us/step - loss: 0.1521 - acc: 0.9353 - val_loss: 0.1710 - val_acc: 0.9315\n", 521 | "Epoch 17/25\n", 522 | "228946/228946 [==============================] - 193s 845us/step - loss: 0.1481 - acc: 0.9372 - val_loss: 0.1729 - val_acc: 0.9318\n", 523 | "Epoch 18/25\n", 524 | "228946/228946 [==============================] - 188s 820us/step - loss: 0.1380 - acc: 0.9415 - val_loss: 0.1689 - val_acc: 0.9338\n", 525 | "Epoch 19/25\n", 526 | 
"228946/228946 [==============================] - 191s 836us/step - loss: 0.1325 - acc: 0.9440 - val_loss: 0.1628 - val_acc: 0.9357\n", 527 | "Epoch 20/25\n", 528 | "228946/228946 [==============================] - 183s 798us/step - loss: 0.1287 - acc: 0.9464 - val_loss: 0.1655 - val_acc: 0.9359\n", 529 | "Epoch 21/25\n", 530 | "228946/228946 [==============================] - 183s 798us/step - loss: 0.1250 - acc: 0.9476 - val_loss: 0.1651 - val_acc: 0.9364\n", 531 | "Epoch 22/25\n", 532 | "228946/228946 [==============================] - 183s 798us/step - loss: 0.1227 - acc: 0.9490 - val_loss: 0.1689 - val_acc: 0.9354\n", 533 | "Epoch 23/25\n", 534 | "228946/228946 [==============================] - 183s 798us/step - loss: 0.1152 - acc: 0.9512 - val_loss: 0.1671 - val_acc: 0.9377\n", 535 | "Epoch 24/25\n", 536 | "228946/228946 [==============================] - 183s 799us/step - loss: 0.1117 - acc: 0.9535 - val_loss: 0.1682 - val_acc: 0.9373\n", 537 | "Epoch 25/25\n", 538 | "228946/228946 [==============================] - 189s 828us/step - loss: 0.1096 - acc: 0.9542 - val_loss: 0.1683 - val_acc: 0.9369\n", 539 | "load model ./log/20180715-115006.Multi_LSTM_CNN_v5.char.019.hdf5\n", 540 | "------------------------------------------------------------\n", 541 | "Fold 2 training start...\n", 542 | "Train on 228947 samples, validate on 25439 samples\n", 543 | "Epoch 1/25\n", 544 | "228947/228947 [==============================] - 193s 843us/step - loss: 0.3951 - acc: 0.8062 - val_loss: 0.3309 - val_acc: 0.8579\n", 545 | "Epoch 2/25\n" 546 | ] 547 | }, 548 | { 549 | "name": "stdout", 550 | "output_type": "stream", 551 | "text": [ 552 | "228947/228947 [==============================] - 183s 800us/step - loss: 0.2904 - acc: 0.8656 - val_loss: 0.2692 - val_acc: 0.8873\n", 553 | "Epoch 3/25\n", 554 | "228947/228947 [==============================] - 183s 799us/step - loss: 0.2526 - acc: 0.8868 - val_loss: 0.2245 - val_acc: 0.9035\n", 555 | "Epoch 4/25\n", 556 | "228947/228947 [==============================] - 183s 800us/step - loss: 0.2295 - acc: 0.8988 - val_loss: 0.2094 - val_acc: 0.9101\n", 557 | "Epoch 5/25\n", 558 | "228947/228947 [==============================] - 183s 800us/step - loss: 0.2141 - acc: 0.9066 - val_loss: 0.2024 - val_acc: 0.9129\n", 559 | "Epoch 6/25\n", 560 | "228947/228947 [==============================] - 183s 800us/step - loss: 0.2020 - acc: 0.9126 - val_loss: 0.1879 - val_acc: 0.9185\n", 561 | "Epoch 7/25\n", 562 | "228947/228947 [==============================] - 183s 800us/step - loss: 0.1924 - acc: 0.9168 - val_loss: 0.1918 - val_acc: 0.9192\n", 563 | "Epoch 8/25\n", 564 | "228947/228947 [==============================] - 187s 819us/step - loss: 0.1853 - acc: 0.9209 - val_loss: 0.1880 - val_acc: 0.9228\n", 565 | "Epoch 9/25\n", 566 | "228947/228947 [==============================] - 187s 817us/step - loss: 0.1789 - acc: 0.9233 - val_loss: 0.1787 - val_acc: 0.9243\n", 567 | "Epoch 10/25\n", 568 | "228947/228947 [==============================] - 189s 824us/step - loss: 0.1732 - acc: 0.9263 - val_loss: 0.1790 - val_acc: 0.9259\n", 569 | "Epoch 11/25\n", 570 | " 71680/228947 [========>.....................] 
- ETA: 2:04 - loss: 0.1647 - acc: 0.9303" 571 | ] 572 | }, 573 | { 574 | "ename": "KeyboardInterrupt", 575 | "evalue": "", 576 | "output_type": "error", 577 | "traceback": [ 578 | "KeyboardInterrupt: fold 2 training was stopped manually inside model.fit at epoch 11/25 (ANSI-escaped traceback through Keras and the TensorFlow session omitted)" 591 | ] 592 | } 593 | ], 594 | "source": [ 595 |
"from sklearn.model_selection import StratifiedKFold\n", 596 | "\n", 597 | "best_results = []\n", 598 | "# last_results = []\n", 599 | "best_file_names = []\n", 600 | "dev_predictions = []\n", 601 | "\n", 602 | "for i, (train_index, dev_index) in enumerate(StratifiedKFold(n_splits=10).split(X=char1, y=label)): # word/char switch\n", 603 | " print(\"-\" * 60)\n", 604 | " print(\"Fold {} training start...\".format(i))\n", 605 | " \n", 606 | " train_x1, train_x2, train_y = char1[train_index, :], char2[train_index, :], label[train_index] # word/char switch\n", 607 | " dev_x1, dev_x2, dev_y = char1[dev_index, :], char2[dev_index, :], label[dev_index] # word/char switch\n", 608 | " train_f, dev_f = train_feature[train_index, :], train_feature[dev_index, :]\n", 609 | " \n", 610 | " input1 = Input(shape=(SEQ_LEN,), dtype=\"int32\")\n", 611 | " input2 = Input(shape=(SEQ_LEN,), dtype=\"int32\")\n", 612 | " inputf = Input(shape=(train_f.shape[1],), dtype=\"float32\")\n", 613 | "\n", 614 | " embedding_layer = Embedding(\n", 615 | " input_dim=char_embedding_data.shape[0], # word/char switch\n", 616 | " output_dim=char_embedding_data.shape[1], # word/char switch\n", 617 | " weights=[char_embedding_data], # word/char switch\n", 618 | " input_length=SEQ_LEN,\n", 619 | " trainable=False\n", 620 | " )\n", 621 | " \n", 622 | " vector1 = embedding_layer(input1)\n", 623 | " vector2 = embedding_layer(input2)\n", 624 | " \n", 625 | " lstm_layer1 = LSTM(LSTM_SIZE1, dropout=LSTM_DROP_RATE, recurrent_dropout=LSTM_DROP_RATE, return_sequences=True)\n", 626 | " layer1a = lstm_layer1(vector1)\n", 627 | " layer1a = Dropout(LSTM_DROP_RATE)(layer1a)\n", 628 | " layer1b = lstm_layer1(vector2)\n", 629 | " layer1b = Dropout(LSTM_DROP_RATE)(layer1b)\n", 630 | "\n", 631 | " lstm_layer2 = LSTM(LSTM_SIZE2, dropout=LSTM_DROP_RATE, recurrent_dropout=LSTM_DROP_RATE, return_sequences=True)\n", 632 | " layer2a = lstm_layer2(layer1a)\n", 633 | " layer2b = lstm_layer2(layer1b)\n", 634 | " \n", 635 | " conv1a, conv1b = cnn_double_layer(layer2a, layer2b, filters=CONV_LEN1, kernel_size=1)\n", 636 | " conv2a, conv2b = cnn_double_layer(layer2a, layer2b, filters=CONV_LEN2, kernel_size=2)\n", 637 | " conv3a, conv3b = cnn_double_layer(layer2a, layer2b, filters=CONV_LEN3, kernel_size=3)\n", 638 | " conv4a, conv4b = cnn_double_layer(layer2a, layer2b, filters=CONV_LEN4, kernel_size=4)\n", 639 | " conv5a, conv5b = cnn_double_layer(layer2a, layer2b, filters=CONV_LEN5, kernel_size=5)\n", 640 | " conv6a, conv6b = cnn_double_layer(layer2a, layer2b, filters=CONV_LEN6, kernel_size=6)\n", 641 | " \n", 642 | " merge_a = concatenate([conv1a, conv2a, conv3a, conv4a, conv5a, conv6a])\n", 643 | " merge_b = concatenate([conv1b, conv2b, conv3b, conv4b, conv5b, conv6b])\n", 644 | " diff = Lambda(lambda x: K.abs(x[0] - x[1]))([merge_a, merge_b])\n", 645 | " mult = Lambda(lambda x: x[0] * x[1])([merge_a, merge_b])\n", 646 | " merge = concatenate([diff, mult])\n", 647 | " \n", 648 | " x = Dropout(DROP_RATE)(merge)\n", 649 | " x = BatchNormalization()(x)\n", 650 | " x = Dense(DENSE_SIZE1, activation=\"relu\")(x)\n", 651 | " \n", 652 | " fe = BatchNormalization()(inputf)\n", 653 | " fe = Dense(DENSE_FEATURE, activation=\"relu\")(fe)\n", 654 | " \n", 655 | " x = concatenate([x, fe])\n", 656 | " \n", 657 | " x = Dropout(DROP_RATE)(x)\n", 658 | " x = BatchNormalization()(x)\n", 659 | " x = Dense(DENSE_SIZE2, activation=\"relu\")(x)\n", 660 | " x = Dropout(DROP_RATE)(x)\n", 661 | " x = BatchNormalization()(x)\n", 662 | " pred = Dense(1, activation=\"sigmoid\")(x)\n", 663 
| " \n", 664 | " model = Model(inputs=[input1, input2, inputf], outputs=pred)\n", 665 | " model.compile(\n", 666 | " optimizer=\"nadam\",\n", 667 | " loss=\"binary_crossentropy\",\n", 668 | " metrics=[\"acc\"]\n", 669 | " )\n", 670 | " \n", 671 | " early_stopping = EarlyStopping(\"val_loss\", patience=6)\n", 672 | " lr_reducer = ReduceLROnPlateau(factor=0.5, patience=3, min_lr=0.0005)\n", 673 | " check_point = ModelCheckpoint(\n", 674 | " \"./log/%s.Multi_LSTM_CNN_v5.char.{epoch:03d}.hdf5\" % (datetime.now().strftime(\"%Y%m%d-%H%M%S\")), # word/char switch\n", 675 | " monitor=\"val_loss\",\n", 676 | " save_best_only=True,\n", 677 | " )\n", 678 | " \n", 679 | " fit_res = model.fit(\n", 680 | " x=[train_x1, train_x2, train_f],\n", 681 | " y=train_y,\n", 682 | " batch_size=BATCH_SIZE,\n", 683 | " epochs=NUM_EPOCHES,\n", 684 | " validation_data=([dev_x1, dev_x2, dev_f], dev_y),\n", 685 | " shuffle=True,\n", 686 | " callbacks=[early_stopping, lr_reducer, check_point]\n", 687 | " )\n", 688 | " \n", 689 | "# pred_last = model.predict([test_char1, test_char2, test_feature], batch_size=BATCH_SIZE) # word/char switch\n", 690 | "# last_results.append(pd.DataFrame(pred_last, columns=[\"y_pre\"]))\n", 691 | " \n", 692 | " best_model_file = glob(\"./log/*.hdf5\")[-1].replace(\"\\\\\", \"/\")\n", 693 | " best_file_names.append(best_model_file)\n", 694 | " print(\"load model %s\" % (best_model_file,))\n", 695 | " model.load_weights(best_model_file)\n", 696 | " pred_best = model.predict([test_char1, test_char2, test_feature], batch_size=BATCH_SIZE) # word/char switch\n", 697 | " best_results.append(pd.DataFrame(pred_best, columns=[\"y_pre\"]))\n", 698 | " \n", 699 | " dev_pred = model.predict([dev_x1, dev_x2, dev_f], batch_size=BATCH_SIZE)\n", 700 | " dev_result = pd.DataFrame({\"pred\": dev_pred.ravel(), \"label\": dev_y})\n", 701 | " dev_predictions.append(dev_result)\n", 702 | "\n", 703 | "# pd.DataFrame(pd.concat(last_results, axis=1).mean(axis=1), columns=[\"y_pre\"]).to_csv(\n", 704 | "# \"./result/%s-Multi_LSTM_CNN_v5_char_last.csv\" % (datetime.now().strftime(\"%Y%m%d-%H%M%S\")), # word/char switch\n", 705 | "# index=False\n", 706 | "# )\n", 707 | "pd.DataFrame(pd.concat(best_results, axis=1).mean(axis=1), columns=[\"y_pre\"]).to_csv(\n", 708 | " \"./result/%s-Multi_LSTM_CNN_v5_char_best.csv\" % (datetime.now().strftime(\"%Y%m%d-%H%M%S\")), # word/char switch\n", 709 | " index=False\n", 710 | ")\n", 711 | "\n", 712 | "total_dev = pd.concat(dev_predictions, axis=0)\n", 713 | "total_dev.to_csv(\n", 714 | " \"./result/%s-Multi_LSTM_CNN_v5_char_dev_result.csv\" % (datetime.now().strftime(\"%Y%m%d-%H%M%S\")), # word/char switch\n", 715 | " index=False\n", 716 | ")\n", 717 | "\n", 718 | "model_path = \"./log/\" + datetime.now().strftime(\"%Y%m%d-%H%M%S\") + \"/\"\n", 719 | "os.mkdir(model_path)\n", 720 | "for model_name in best_file_names:\n", 721 | " abs_name = os.path.split(model_name)[1]\n", 722 | " os.rename(model_name, model_path + abs_name)" 723 | ] 724 | } 725 | ], 726 | "metadata": { 727 | "kernelspec": { 728 | "display_name": "Python 3", 729 | "language": "python", 730 | "name": "python3" 731 | }, 732 | "language_info": { 733 | "codemirror_mode": { 734 | "name": "ipython", 735 | "version": 3 736 | }, 737 | "file_extension": ".py", 738 | "mimetype": "text/x-python", 739 | "name": "python", 740 | "nbconvert_exporter": "python", 741 | "pygments_lexer": "ipython3", 742 | "version": "3.6.2" 743 | } 744 | }, 745 | "nbformat": 4, 746 | "nbformat_minor": 2 747 | } 748 | 
--------------------------------------------------------------------------------
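For readers skimming the raw notebook JSON above, the preprocessing cell packs two ideas into a few dense lines: a Keras `Tokenizer` is fitted on the pre-split word (or char) sequences, and the pretrained embedding table is re-ordered to match the tokenizer's indices, with an all-zero row prepended at index 0 so the padding token maps to a null vector. The sketch below restates that pattern in isolation; `questions` and `embeddings` are tiny made-up stand-ins for the competition files, not the notebook's actual data.

```python
# Minimal sketch of the embedding-matrix alignment used above (illustrative data).
import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

MAX_COUNT = 10000   # vocabulary cap, as in the notebook
SEQ_LEN = 25        # padded sequence length, as in the notebook

# Hypothetical stand-ins: token lists per question and a pretrained embedding
# table indexed by token (the real notebook loads these from the competition files).
questions = [["w1", "w2", "w3"], ["w2", "w4"]]
embeddings = pd.DataFrame(np.random.rand(4, 300), index=["w1", "w2", "w3", "w4"])

tokenizer = Tokenizer(num_words=MAX_COUNT)
tokenizer.fit_on_texts(questions)  # accepts pre-split token lists

# Row 0 stays all-zero so padding index 0 maps to a null vector; the remaining
# rows follow the tokenizer's frequency-ordered vocabulary, capped at MAX_COUNT.
vocab = list(tokenizer.word_index.keys())[:MAX_COUNT]
embedding_matrix = np.concatenate(
    (np.zeros((1, embeddings.shape[1])), embeddings.loc[vocab].values),
    axis=0
)

# Questions become index sequences, pre-padded/truncated to a fixed length.
padded = pad_sequences(tokenizer.texts_to_sequences(questions),
                       maxlen=SEQ_LEN, padding="pre", truncating="pre")
print(embedding_matrix.shape, padded.shape)
```

Because `Tokenizer` orders `word_index` by frequency, slicing its first `MAX_COUNT` keys lines up with the indices emitted by `texts_to_sequences`, which is why the embedding rows and the padded index sequences stay consistent.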
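The model cells (both the commented-out WORDS block and the active CHARS block) share one architectural idea: a single siamese encoder, a frozen Embedding followed by two stacked LSTMs and six parallel Conv1D branches, is applied to both questions, and the two encodings are compared through an element-wise absolute difference and product before the dense head. The toy sketch below isolates just that comparison pattern with a deliberately small encoder; layer sizes, batch normalization, dropout, and the hand-crafted feature input are omitted, so read it as a shape-level illustration rather than the notebook's full model.

```python
# Toy siamese comparison sketch (small layer sizes; not the notebook's full model).
from keras.models import Model
from keras.layers import (Input, Embedding, LSTM, Conv1D,
                          GlobalAveragePooling1D, Dense, Lambda, concatenate)
from keras import backend as K

SEQ_LEN, VOCAB_SIZE, EMB_DIM = 25, 10001, 300

# Shared layers: instantiating them once and reusing them on both inputs is what
# makes the encoder siamese (identical weights for question 1 and question 2).
embed = Embedding(VOCAB_SIZE, EMB_DIM, input_length=SEQ_LEN)
lstm = LSTM(64, return_sequences=True)
conv = Conv1D(32, kernel_size=3, padding="same", activation="relu")
pool = GlobalAveragePooling1D()

def encode(x):
    return pool(conv(lstm(embed(x))))

q1 = Input(shape=(SEQ_LEN,), dtype="int32")
q2 = Input(shape=(SEQ_LEN,), dtype="int32")
v1, v2 = encode(q1), encode(q2)

# Symmetric comparison features, as in the notebook: |v1 - v2| and v1 * v2.
diff = Lambda(lambda t: K.abs(t[0] - t[1]))([v1, v2])
mult = Lambda(lambda t: t[0] * t[1])([v1, v2])
merged = concatenate([diff, mult])

hidden = Dense(64, activation="relu")(merged)
pred = Dense(1, activation="sigmoid")(hidden)

model = Model(inputs=[q1, q2], outputs=pred)
model.compile(optimizer="nadam", loss="binary_crossentropy", metrics=["acc"])
model.summary()
```

Using |v1 - v2| and v1 * v2 keeps the merged representation symmetric in the two questions, matching the symmetry of the duplicate label (swapping q1 and q2 should not change the prediction).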
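Finally, each training cell runs a 10-fold StratifiedKFold loop: per fold it checkpoints the best model by validation loss, reloads it, predicts on the test set, and stores the held-out (dev) predictions; the fold-level test predictions are then averaged column-wise into the submitted scores. One detail worth flagging is that the best checkpoint is recovered with `glob("./log/*.hdf5")[-1]`, which relies on the timestamped filenames happening to come back last; an explicit `sorted(...)` makes that assumption robust. The sketch below shows only this bookkeeping with made-up prediction frames; the file pattern and fold count mirror the notebook, everything else is illustrative.

```python
# Sketch of the per-fold ensembling bookkeeping (toy predictions; paths illustrative).
from glob import glob
import numpy as np
import pandas as pd

def latest_checkpoint(pattern="./log/*.hdf5"):
    """Pick the newest checkpoint; sorting makes the timestamp assumption explicit."""
    files = sorted(glob(pattern))
    return files[-1].replace("\\", "/") if files else None

print("best checkpoint:", latest_checkpoint())

# Suppose each of the 10 folds produced a one-column frame of test probabilities.
fold_preds = [pd.DataFrame({"y_pre": np.random.rand(5)}) for _ in range(10)]

# Column-wise mean across folds gives the ensembled scores written to ./result/.
submission = pd.DataFrame(pd.concat(fold_preds, axis=1).mean(axis=1),
                          columns=["y_pre"])
print(submission)
```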