├── .ipynb_checkpoints
│   ├── CNN_metafeature-checkpoint.ipynb
│   ├── CNN_metafeature_dilated-checkpoint.ipynb
│   ├── gene_npy-checkpoint.ipynb
│   ├── lgb_meta_features-checkpoint.ipynb
│   ├── main_test-checkpoint.ipynb
│   ├── main_train-checkpoint.ipynb
│   ├── pickle_pre-checkpoint.ipynb
│   └── submit-checkpoint.ipynb
├── CNN_metafeature.ipynb
├── CNN_metafeature_dilated.ipynb
├── README.md
├── gene_npy.ipynb
├── lgb_meta_features.ipynb
├── main_test.ipynb
├── main_train.ipynb
├── pickle_pre.ipynb
├── submit.ipynb
└── 上地西二旗人民.pptx
/.ipynb_checkpoints/CNN_metafeature-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [
8 | {
9 | "name": "stderr",
10 | "output_type": "stream",
11 | "text": [
12 | "/home/enjoy/anaconda3/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88\n",
13 | " return f(*args, **kwds)\n",
14 | "/home/enjoy/anaconda3/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88\n",
15 | " return f(*args, **kwds)\n"
16 | ]
17 | }
18 | ],
19 | "source": [
20 | "import pandas as pd\n",
21 | "import numpy as np\n",
22 | "import os\n",
23 | "from tqdm import tqdm\n",
24 | "from sklearn.preprocessing import LabelBinarizer,LabelEncoder"
25 | ]
26 | },
27 | {
28 | "cell_type": "code",
29 | "execution_count": 2,
30 | "metadata": {},
31 | "outputs": [],
32 | "source": [
33 | "path = '../input/'\n",
34 | "train = pd.read_csv(path + 'final_train.csv')\n",
35 | "test = pd.read_csv(path + 'final_test.csv')"
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": 3,
41 | "metadata": {},
42 | "outputs": [
43 | {
44 | "data": {
45 | "text/plain": [
46 | "((89806693, 5), (79288375, 4))"
47 | ]
48 | },
49 | "execution_count": 3,
50 | "metadata": {},
51 | "output_type": "execute_result"
52 | }
53 | ],
54 | "source": [
55 | "train.shape,test.shape"
56 | ]
57 | },
58 | {
59 | "cell_type": "code",
60 | "execution_count": 4,
61 | "metadata": {},
62 | "outputs": [],
63 | "source": [
64 | "unique_api = train['api'].unique()"
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "execution_count": 5,
70 | "metadata": {},
71 | "outputs": [
72 | {
73 | "data": {
74 | "text/plain": [
75 | "(295,)"
76 | ]
77 | },
78 | "execution_count": 5,
79 | "metadata": {},
80 | "output_type": "execute_result"
81 | }
82 | ],
83 | "source": [
84 | "unique_api.shape"
85 | ]
86 | },
87 | {
88 | "cell_type": "code",
89 | "execution_count": 6,
90 | "metadata": {},
91 | "outputs": [],
92 | "source": [
93 | "api2index = {item:(i+1) for i,item in enumerate(unique_api)}  # start at 1: index 0 is reserved for sequence padding\n",
94 | "index2api = {(i+1):item for i,item in enumerate(unique_api)}"
95 | ]
96 | },
97 | {
98 | "cell_type": "code",
99 | "execution_count": 7,
100 | "metadata": {},
101 | "outputs": [],
102 | "source": [
103 | "train['api_idx'] = train['api'].map(api2index)"
104 | ]
105 | },
106 | {
107 | "cell_type": "code",
108 | "execution_count": 8,
109 | "metadata": {},
110 | "outputs": [],
111 | "source": [
112 | "test['api_idx'] = test['api'].map(api2index)"
113 | ]
114 | },
115 | {
116 | "cell_type": "code",
117 | "execution_count": 9,
118 | "metadata": {},
119 | "outputs": [],
120 | "source": [
121 | "train_period_idx = train.file_id.drop_duplicates(keep='first').index.values\n",
122 | "test_period_idx = test.file_id.drop_duplicates(keep='first').index.values"
123 | ]
124 | },
125 | {
126 | "cell_type": "code",
127 | "execution_count": 10,
128 | "metadata": {},
129 | "outputs": [],
130 | "source": [
131 | "def get_sequence(df,period_idx):  # slice the flat call log into one api_idx sequence per file_id\n",
132 | " seq_list = []\n",
133 | " for _id,begin in enumerate(period_idx[:-1]):\n",
134 | " seq_list.append(df.iloc[begin:period_idx[_id+1]]['api_idx'].values)\n",
135 | " seq_list.append(df.iloc[period_idx[-1]:]['api_idx'].values)\n",
136 | " return seq_list"
137 | ]
138 | },
139 | {
140 | "cell_type": "code",
141 | "execution_count": 11,
142 | "metadata": {},
143 | "outputs": [],
144 | "source": [
145 | "train_df = train[['file_id','label']].drop_duplicates(keep='first')\n",
146 | "test_df = test[['file_id']].drop_duplicates(keep='first')"
147 | ]
148 | },
149 | {
150 | "cell_type": "code",
151 | "execution_count": 12,
152 | "metadata": {},
153 | "outputs": [],
154 | "source": [
155 | "train_df['seq'] = get_sequence(train,train_period_idx)"
156 | ]
157 | },
158 | {
159 | "cell_type": "code",
160 | "execution_count": 13,
161 | "metadata": {},
162 | "outputs": [],
163 | "source": [
164 | "test_df['seq'] = get_sequence(test,test_period_idx)"
165 | ]
166 | },
167 | {
168 | "cell_type": "code",
169 | "execution_count": 14,
170 | "metadata": {},
171 | "outputs": [
172 | {
173 | "data": {
174 | "text/plain": [
175 | "(19350.97816934013, 6466.961402750774, 888204)"
176 | ]
177 | },
178 | "execution_count": 14,
179 | "metadata": {},
180 | "output_type": "execute_result"
181 | }
182 | ],
183 | "source": [
184 | "train_df.seq.map(lambda x: len(x)).std(),train_df.seq.map(lambda x: len(x)).mean(),train_df.seq.map(lambda x: len(x)).max()"
185 | ]
186 | },
187 | {
188 | "cell_type": "code",
189 | "execution_count": 15,
190 | "metadata": {},
191 | "outputs": [
192 | {
193 | "data": {
194 | "text/plain": [
195 | "(15911.676663585444, 6120.291393284446, 769590)"
196 | ]
197 | },
198 | "execution_count": 15,
199 | "metadata": {},
200 | "output_type": "execute_result"
201 | }
202 | ],
203 | "source": [
204 | "test_df.seq.map(lambda x: len(x)).std(),test_df.seq.map(lambda x: len(x)).mean(),test_df.seq.map(lambda x: len(x)).max()"
205 | ]
206 | },
207 | {
208 | "cell_type": "code",
209 | "execution_count": 16,
210 | "metadata": {},
211 | "outputs": [
212 | {
213 | "name": "stderr",
214 | "output_type": "stream",
215 | "text": [
216 | "/home/enjoy/anaconda3/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88\n",
217 | " return f(*args, **kwds)\n",
218 | "/home/enjoy/anaconda3/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n",
219 | " from ._conv import register_converters as _register_converters\n",
220 | "Using TensorFlow backend.\n"
221 | ]
222 | }
223 | ],
224 | "source": [
225 | "from keras.preprocessing.text import Tokenizer\n",
226 | "from keras.preprocessing.sequence import pad_sequences\n",
227 | "from keras.layers import Dense, Input, LSTM, Lambda, Embedding, Dropout, Activation,GRU,Bidirectional\n",
228 | "from keras.layers import Conv1D,Conv2D,MaxPooling2D,GlobalAveragePooling1D,GlobalMaxPooling1D, MaxPooling1D, Flatten\n",
229 | "from keras.layers import CuDNNGRU, CuDNNLSTM, SpatialDropout1D\n",
230 | "from keras.layers.merge import concatenate, Concatenate, Average, Dot, Maximum, Multiply, Subtract, average\n",
231 | "from keras.models import Model\n",
232 | "from keras.optimizers import RMSprop,Adam\n",
233 | "from keras.layers.normalization import BatchNormalization\n",
234 | "from keras.callbacks import EarlyStopping, ModelCheckpoint\n",
235 | "from keras.optimizers import SGD\n",
236 | "from keras import backend as K\n",
237 | "from sklearn.decomposition import TruncatedSVD, NMF, LatentDirichletAllocation\n",
238 | "from keras.layers import SpatialDropout1D\n",
239 | "from keras.layers.wrappers import Bidirectional"
240 | ]
241 | },
242 | {
243 | "cell_type": "code",
244 | "execution_count": 17,
245 | "metadata": {},
246 | "outputs": [],
247 | "source": [
248 | "def TextCNN(max_len,max_cnt,embed_size,\n",
249 | " num_filters,kernel_size,\n",
250 | " conv_action,\n",
251 | " mask_zero):\n",
252 | " _input = Input(shape=(max_len,), dtype='int32')\n",
253 | " _embed = Embedding(max_cnt, embed_size, input_length=max_len, mask_zero=mask_zero)(_input)\n",
254 | " _embed = SpatialDropout1D(0.15)(_embed)\n",
255 | "    wrappers = []\n",
256 | " for _kernel_size in kernel_size:\n",
257 | " conv1d = Conv1D(filters=num_filters, kernel_size=_kernel_size, activation=conv_action)(_embed)\n",
258 | "        wrappers.append(GlobalMaxPooling1D()(conv1d))\n",
259 | " \n",
260 | "    fc = concatenate(wrappers)\n",
261 | " fc = Dropout(0.5)(fc)\n",
262 | " #fc = BatchNormalization()(fc)\n",
263 | " fc = Dense(256, activation='relu')(fc)\n",
264 | " fc = Dropout(0.25)(fc)\n",
265 | " #fc = BatchNormalization()(fc) \n",
266 | " preds = Dense(8, activation = 'softmax')(fc)\n",
267 | " \n",
268 | " model = Model(inputs=_input, outputs=preds)\n",
269 | " \n",
270 | " model.compile(loss='categorical_crossentropy',\n",
271 | " optimizer='adam',\n",
272 | " metrics=['accuracy'])\n",
273 | " return model"
274 | ]
275 | },
276 | {
277 | "cell_type": "code",
278 | "execution_count": 18,
279 | "metadata": {},
280 | "outputs": [],
281 | "source": [
282 | "train_labels = pd.get_dummies(train_df.label).values\n",
283 | "train_seq = pad_sequences(train_df.seq.values, maxlen = 6000)\n",
284 | "test_seq = pad_sequences(test_df.seq.values, maxlen = 6000)"
285 | ]
286 | },
287 | {
288 | "cell_type": "code",
289 | "execution_count": 20,
290 | "metadata": {},
291 | "outputs": [],
292 | "source": [
293 | "from sklearn.cross_validation import StratifiedKFold\nskf = StratifiedKFold(train_df.label.values, n_folds=5, shuffle=True, random_state=42)"
294 | ]
295 | },
296 | {
297 | "cell_type": "code",
298 | "execution_count": 21,
299 | "metadata": {},
300 | "outputs": [],
301 | "source": [
302 | "max_len = 6000\n",
303 | "max_cnt = 295  # embedding vocab size; note api_idx runs 0 (padding) to 295, so 296 would cover the top index\n",
304 | "embed_size = 256\n",
305 | "num_filters = 64\n",
306 | "kernel_size = [2,4,6,8,10,12,14]\n",
307 | "conv_action = 'relu'\n",
308 | "mask_zero = False\n",
309 | "TRAIN = True"
310 | ]
311 | },
312 | {
313 | "cell_type": "code",
314 | "execution_count": 25,
315 | "metadata": {
316 | "scrolled": false
317 | },
318 | "outputs": [
319 | {
320 | "name": "stdout",
321 | "output_type": "stream",
322 | "text": [
323 | "FOLD: \n",
324 | "2780 11107\n",
325 | "2780/2780 [==============================] - 5s 2ms/step\n",
326 | "12955/12955 [==============================] - 18s 1ms/step\n",
327 | "FOLD: \n",
328 | "2779 11108\n",
329 | "2779/2779 [==============================] - 4s 2ms/step\n",
330 | "12955/12955 [==============================] - 18s 1ms/step\n",
331 | "FOLD: \n",
332 | "2777 11110\n",
333 | "2777/2777 [==============================] - 4s 2ms/step\n",
334 | "12955/12955 [==============================] - 18s 1ms/step\n",
335 | "FOLD: \n",
336 | "2776 11111\n",
337 | "2776/2776 [==============================] - 4s 2ms/step\n",
338 | "12955/12955 [==============================] - 18s 1ms/step\n",
339 | "FOLD: \n",
340 | "2775 11112\n",
341 | "2775/2775 [==============================] - 5s 2ms/step\n",
342 | "12955/12955 [==============================] - 19s 1ms/step\n"
343 | ]
344 | }
345 | ],
346 | "source": [
347 | "import os\n",
348 | "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\"\n",
349 | "meta_train = np.zeros(shape = (len(train_seq),8))\n",
350 | "meta_test = np.zeros(shape = (len(test_seq),8))\n",
351 | "FLAG = False  # weights are already trained; skip fitting and only load + predict\n",
352 | "for i,(tr_ind,te_ind) in enumerate(skf):\n",
353 | "    print('FOLD: {}'.format(i))\n",
354 | " print(len(te_ind),len(tr_ind))\n",
355 | " model = TextCNN(max_len,max_cnt,embed_size,num_filters,kernel_size,conv_action,mask_zero)\n",
356 | " model_name = 'benchmark_textcnn_fold_'+str(i)\n",
357 | " X_train,X_train_label = train_seq[tr_ind],train_labels[tr_ind]\n",
358 | " X_val,X_val_label = train_seq[te_ind],train_labels[te_ind]\n",
359 | " \n",
360 | " model = TextCNN(max_len,max_cnt,embed_size,\n",
361 | " num_filters,kernel_size,\n",
362 | " conv_action,\n",
363 | " mask_zero)\n",
364 | " \n",
365 | " model_save_path = '../model_weight_final/%s_%s.hdf5'%(model_name,embed_size)\n",
366 | "    early_stopping = EarlyStopping(monitor='val_loss', patience=3)\n",
367 | " model_checkpoint = ModelCheckpoint(model_save_path, save_best_only=True, save_weights_only=True)\n",
368 | " if TRAIN and FLAG:\n",
369 | " model.fit(X_train,X_train_label,\n",
370 | " validation_data=(X_val,X_val_label),\n",
371 | " epochs=100,batch_size=64,\n",
372 | " shuffle=True,\n",
373 | " callbacks=[early_stopping,model_checkpoint]\n",
374 | " )\n",
375 | " \n",
376 | " model.load_weights(model_save_path)\n",
377 | " pred_val = model.predict(X_val,batch_size=128,verbose=1)\n",
378 | " pred_test = model.predict(test_seq,batch_size=128,verbose=1)\n",
379 | " \n",
380 | " meta_train[te_ind] = pred_val\n",
381 | " meta_test += pred_test\n",
382 | " K.clear_session()\n",
383 | "meta_test /= 5.0\n"
384 | ]
385 | },
386 | {
387 | "cell_type": "code",
388 | "execution_count": 37,
389 | "metadata": {},
390 | "outputs": [],
391 | "source": [
392 | "pd.to_pickle(meta_train,'../train_meta_cnn.pkl')\n",
393 | "pd.to_pickle(meta_test,'../test_meta_cnn.pkl')"
394 | ]
395 | },
396 | {
397 | "cell_type": "code",
398 | "execution_count": 38,
399 | "metadata": {},
400 | "outputs": [
401 | {
402 | "data": {
403 | "text/plain": [
404 | "'/home/enjoy/tianchi/安全赛复赛/src'"
405 | ]
406 | },
407 | "execution_count": 38,
408 | "metadata": {},
409 | "output_type": "execute_result"
410 | }
411 | ],
412 | "source": [
413 | "%pwd"
414 | ]
415 | },
416 | {
417 | "cell_type": "code",
418 | "execution_count": null,
419 | "metadata": {},
420 | "outputs": [],
421 | "source": []
422 | }
423 | ],
424 | "metadata": {
425 | "kernelspec": {
426 | "display_name": "Python 3",
427 | "language": "python",
428 | "name": "python3"
429 | },
430 | "language_info": {
431 | "codemirror_mode": {
432 | "name": "ipython",
433 | "version": 3
434 | },
435 | "file_extension": ".py",
436 | "mimetype": "text/x-python",
437 | "name": "python",
438 | "nbconvert_exporter": "python",
439 | "pygments_lexer": "ipython3",
440 | "version": "3.6.5"
441 | }
442 | },
443 | "nbformat": 4,
444 | "nbformat_minor": 2
445 | }
446 |
--------------------------------------------------------------------------------
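
The notebook above is one producer of stacking meta-features: 5-fold out-of-fold
(OOF) TextCNN class probabilities, saved as train_meta_cnn.pkl / test_meta_cnn.pkl.
A minimal sketch of the same OOF scheme, assuming the modern
sklearn.model_selection API and a hypothetical make_model() factory exposing
fit/predict_proba in place of the Keras-specific code:

    import numpy as np
    from sklearn.model_selection import StratifiedKFold

    def oof_meta_features(make_model, X, y, X_test, n_classes=8, n_splits=5, seed=42):
        # Each training row is predicted by the one fold model that never saw it;
        # test-set probabilities are averaged over the fold models.
        skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
        meta_train = np.zeros((len(X), n_classes))
        meta_test = np.zeros((len(X_test), n_classes))
        for tr_ind, te_ind in skf.split(X, y):
            model = make_model()
            model.fit(X[tr_ind], y[tr_ind])
            meta_train[te_ind] = model.predict_proba(X[te_ind])
            meta_test += model.predict_proba(X_test)
        return meta_train, meta_test / n_splits
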
/.ipynb_checkpoints/CNN_metafeature_dilated-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [
8 | {
9 | "name": "stderr",
10 | "output_type": "stream",
11 | "text": [
12 | "/home/enjoy/anaconda3/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88\n",
13 | " return f(*args, **kwds)\n",
14 | "/home/enjoy/anaconda3/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88\n",
15 | " return f(*args, **kwds)\n"
16 | ]
17 | }
18 | ],
19 | "source": [
20 | "import pandas as pd\n",
21 | "import numpy as np\n",
22 | "import os\n",
23 | "from tqdm import tqdm\n",
24 | "from sklearn.preprocessing import LabelBinarizer,LabelEncoder"
25 | ]
26 | },
27 | {
28 | "cell_type": "code",
29 | "execution_count": 2,
30 | "metadata": {},
31 | "outputs": [],
32 | "source": [
33 | "path = '../input/'\n",
34 | "train = pd.read_csv(path + 'final_train.csv')\n",
35 | "test = pd.read_csv(path + 'final_test.csv')"
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": 3,
41 | "metadata": {},
42 | "outputs": [
43 | {
44 | "data": {
45 | "text/plain": [
46 | "((89806693, 5), (79288375, 4))"
47 | ]
48 | },
49 | "execution_count": 3,
50 | "metadata": {},
51 | "output_type": "execute_result"
52 | }
53 | ],
54 | "source": [
55 | "train.shape,test.shape"
56 | ]
57 | },
58 | {
59 | "cell_type": "code",
60 | "execution_count": 4,
61 | "metadata": {},
62 | "outputs": [],
63 | "source": [
64 | "unique_api = train['api'].unique()"
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "execution_count": 5,
70 | "metadata": {},
71 | "outputs": [
72 | {
73 | "data": {
74 | "text/plain": [
75 | "(295,)"
76 | ]
77 | },
78 | "execution_count": 5,
79 | "metadata": {},
80 | "output_type": "execute_result"
81 | }
82 | ],
83 | "source": [
84 | "unique_api.shape"
85 | ]
86 | },
87 | {
88 | "cell_type": "code",
89 | "execution_count": 6,
90 | "metadata": {},
91 | "outputs": [],
92 | "source": [
93 | "api2index = {item:(i+1) for i,item in enumerate(unique_api)}\n",
94 | "index2api = {(i+1):item for i,item in enumerate(unique_api)}"
95 | ]
96 | },
97 | {
98 | "cell_type": "code",
99 | "execution_count": 7,
100 | "metadata": {},
101 | "outputs": [],
102 | "source": [
103 | "train['api_idx'] = train['api'].map(api2index)"
104 | ]
105 | },
106 | {
107 | "cell_type": "code",
108 | "execution_count": 8,
109 | "metadata": {},
110 | "outputs": [],
111 | "source": [
112 | "test['api_idx'] = test['api'].map(api2index)"
113 | ]
114 | },
115 | {
116 | "cell_type": "code",
117 | "execution_count": 9,
118 | "metadata": {},
119 | "outputs": [],
120 | "source": [
121 | "train_period_idx = train.file_id.drop_duplicates(keep='first').index.values\n",
122 | "test_period_idx = test.file_id.drop_duplicates(keep='first').index.values"
123 | ]
124 | },
125 | {
126 | "cell_type": "code",
127 | "execution_count": 10,
128 | "metadata": {},
129 | "outputs": [],
130 | "source": [
131 | "def get_sequence(df,period_idx):\n",
132 | " seq_list = []\n",
133 | " for _id,begin in enumerate(period_idx[:-1]):\n",
134 | " seq_list.append(df.iloc[begin:period_idx[_id+1]]['api_idx'].values)\n",
135 | " seq_list.append(df.iloc[period_idx[-1]:]['api_idx'].values)\n",
136 | " return seq_list"
137 | ]
138 | },
139 | {
140 | "cell_type": "code",
141 | "execution_count": 11,
142 | "metadata": {},
143 | "outputs": [],
144 | "source": [
145 | "train_df = train[['file_id','label']].drop_duplicates(keep='first')\n",
146 | "test_df = test[['file_id']].drop_duplicates(keep='first')"
147 | ]
148 | },
149 | {
150 | "cell_type": "code",
151 | "execution_count": 12,
152 | "metadata": {},
153 | "outputs": [],
154 | "source": [
155 | "train_df['seq'] = get_sequence(train,train_period_idx)"
156 | ]
157 | },
158 | {
159 | "cell_type": "code",
160 | "execution_count": 13,
161 | "metadata": {},
162 | "outputs": [],
163 | "source": [
164 | "test_df['seq'] = get_sequence(test,test_period_idx)"
165 | ]
166 | },
167 | {
168 | "cell_type": "code",
169 | "execution_count": 14,
170 | "metadata": {},
171 | "outputs": [
172 | {
173 | "data": {
174 | "text/plain": [
175 | "(19350.97816934013, 6466.961402750774, 888204)"
176 | ]
177 | },
178 | "execution_count": 14,
179 | "metadata": {},
180 | "output_type": "execute_result"
181 | }
182 | ],
183 | "source": [
184 | "train_df.seq.map(lambda x: len(x)).std(),train_df.seq.map(lambda x: len(x)).mean(),train_df.seq.map(lambda x: len(x)).max()"
185 | ]
186 | },
187 | {
188 | "cell_type": "code",
189 | "execution_count": 15,
190 | "metadata": {},
191 | "outputs": [
192 | {
193 | "data": {
194 | "text/plain": [
195 | "(15911.676663585444, 6120.291393284446, 769590)"
196 | ]
197 | },
198 | "execution_count": 15,
199 | "metadata": {},
200 | "output_type": "execute_result"
201 | }
202 | ],
203 | "source": [
204 | "test_df.seq.map(lambda x: len(x)).std(),test_df.seq.map(lambda x: len(x)).mean(),test_df.seq.map(lambda x: len(x)).max()"
205 | ]
206 | },
207 | {
208 | "cell_type": "code",
209 | "execution_count": 16,
210 | "metadata": {},
211 | "outputs": [
212 | {
213 | "name": "stderr",
214 | "output_type": "stream",
215 | "text": [
216 | "/home/enjoy/anaconda3/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88\n",
217 | " return f(*args, **kwds)\n",
218 | "/home/enjoy/anaconda3/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n",
219 | " from ._conv import register_converters as _register_converters\n",
220 | "Using TensorFlow backend.\n"
221 | ]
222 | }
223 | ],
224 | "source": [
225 | "from keras.preprocessing.text import Tokenizer\n",
226 | "from keras.preprocessing.sequence import pad_sequences\n",
227 | "from keras.layers import Dense, Input, LSTM, Lambda, Embedding, Dropout, Activation,GRU,Bidirectional\n",
228 | "from keras.layers import Conv1D,Conv2D,MaxPooling2D,GlobalAveragePooling1D,GlobalMaxPooling1D, MaxPooling1D, Flatten\n",
229 | "from keras.layers import CuDNNGRU, CuDNNLSTM, SpatialDropout1D\n",
230 | "from keras.layers.merge import concatenate, Concatenate, Average, Dot, Maximum, Multiply, Subtract, average\n",
231 | "from keras.models import Model\n",
232 | "from keras.optimizers import RMSprop,Adam\n",
233 | "from keras.layers.normalization import BatchNormalization\n",
234 | "from keras.callbacks import EarlyStopping, ModelCheckpoint\n",
235 | "from keras.optimizers import SGD\n",
236 | "from keras import backend as K\n",
237 | "from sklearn.decomposition import TruncatedSVD, NMF, LatentDirichletAllocation\n",
238 | "from keras.layers import SpatialDropout1D\n",
239 | "from keras.layers.wrappers import Bidirectional"
240 | ]
241 | },
242 | {
243 | "cell_type": "code",
244 | "execution_count": 17,
245 | "metadata": {},
246 | "outputs": [],
247 | "source": [
248 | "def TextCNN(max_len,max_cnt,embed_size,\n",
249 | " num_filters,kernel_size,\n",
250 | " conv_action,\n",
251 | " mask_zero):\n",
252 | " _input = Input(shape=(max_len,), dtype='int32')\n",
253 | " _embed = Embedding(max_cnt, embed_size, input_length=max_len, mask_zero=mask_zero)(_input)\n",
254 | " _embed = SpatialDropout1D(0.25)(_embed)\n",
255 | "    wrappers = []\n",
256 | " for _kernel_size in kernel_size:\n",
257 | " for dilated_rate in [1,2,3,4]:\n",
258 | " conv1d = Conv1D(filters=num_filters, kernel_size=_kernel_size, activation=conv_action, dilation_rate=dilated_rate)(_embed)\n",
259 | "            wrappers.append(GlobalMaxPooling1D()(conv1d))\n",
260 | " \n",
261 | "    fc = concatenate(wrappers)\n",
262 | " fc = Dropout(0.5)(fc)\n",
263 | " #fc = BatchNormalization()(fc)\n",
264 | " fc = Dense(256, activation='relu')(fc)\n",
265 | " fc = Dropout(0.25)(fc)\n",
266 | " #fc = BatchNormalization()(fc) \n",
267 | " preds = Dense(8, activation = 'softmax')(fc)\n",
268 | " \n",
269 | " model = Model(inputs=_input, outputs=preds)\n",
270 | " \n",
271 | " model.compile(loss='categorical_crossentropy',\n",
272 | " optimizer='adam',\n",
273 | " metrics=['accuracy'])\n",
274 | " return model"
275 | ]
276 | },
277 | {
278 | "cell_type": "code",
279 | "execution_count": 18,
280 | "metadata": {},
281 | "outputs": [],
282 | "source": [
283 | "train_labels = pd.get_dummies(train_df.label).values\n",
284 | "train_seq = pad_sequences(train_df.seq.values, maxlen = 6000)\n",
285 | "test_seq = pad_sequences(test_df.seq.values, maxlen = 6000)"
286 | ]
287 | },
288 | {
289 | "cell_type": "code",
290 | "execution_count": 20,
291 | "metadata": {},
292 | "outputs": [],
293 | "source": [
294 | "from sklearn.cross_validation import StratifiedKFold\nskf = StratifiedKFold(train_df.label.values, n_folds=5, shuffle=True, random_state=42)"
295 | ]
296 | },
297 | {
298 | "cell_type": "code",
299 | "execution_count": 21,
300 | "metadata": {},
301 | "outputs": [],
302 | "source": [
303 | "max_len = 6000\n",
304 | "max_cnt = 295  # embedding vocab size; note api_idx runs 0 (padding) to 295, so 296 would cover the top index\n",
305 | "embed_size = 256\n",
306 | "num_filters = 64\n",
307 | "kernel_size = [2,3,4,5]\n",
308 | "conv_action = 'relu'\n",
309 | "mask_zero = False\n",
310 | "TRAIN = True"
311 | ]
312 | },
313 | {
314 | "cell_type": "code",
315 | "execution_count": 22,
316 | "metadata": {},
317 | "outputs": [
318 | {
319 | "name": "stdout",
320 | "output_type": "stream",
321 | "text": [
322 | "FOLD: \n",
323 | "2780 11107\n",
324 | "FOLD: \n",
325 | "2779 11108\n",
326 | "FOLD: \n",
327 | "2777 11110\n",
328 | "FOLD: \n",
329 | "2776 11111\n",
330 | "Train on 11111 samples, validate on 2776 samples\n",
331 | "Epoch 1/100\n",
332 | "11111/11111 [==============================] - 142s 13ms/step - loss: 0.9257 - acc: 0.6915 - val_loss: 0.4994 - val_acc: 0.8505\n",
333 | "Epoch 2/100\n",
334 | "11111/11111 [==============================] - 116s 10ms/step - loss: 0.5334 - acc: 0.8335 - val_loss: 0.4226 - val_acc: 0.8689\n",
335 | "Epoch 3/100\n",
336 | "11111/11111 [==============================] - 113s 10ms/step - loss: 0.4632 - acc: 0.8550 - val_loss: 0.3850 - val_acc: 0.8761\n",
337 | "Epoch 4/100\n",
338 | "11111/11111 [==============================] - 113s 10ms/step - loss: 0.4105 - acc: 0.8701 - val_loss: 0.3808 - val_acc: 0.8754\n",
339 | "Epoch 5/100\n",
340 | "11111/11111 [==============================] - 113s 10ms/step - loss: 0.3784 - acc: 0.8763 - val_loss: 0.3663 - val_acc: 0.8829\n",
341 | "Epoch 6/100\n",
342 | "11111/11111 [==============================] - 113s 10ms/step - loss: 0.3536 - acc: 0.8840 - val_loss: 0.3467 - val_acc: 0.8872\n",
343 | "Epoch 7/100\n",
344 | "11111/11111 [==============================] - 113s 10ms/step - loss: 0.3420 - acc: 0.8903 - val_loss: 0.3426 - val_acc: 0.8909\n",
345 | "Epoch 8/100\n",
346 | "11111/11111 [==============================] - 113s 10ms/step - loss: 0.3284 - acc: 0.8941 - val_loss: 0.3377 - val_acc: 0.8945\n",
347 | "Epoch 9/100\n",
348 | "11111/11111 [==============================] - 113s 10ms/step - loss: 0.3133 - acc: 0.8936 - val_loss: 0.3380 - val_acc: 0.8945\n",
349 | "Epoch 10/100\n",
350 | "11111/11111 [==============================] - 113s 10ms/step - loss: 0.3034 - acc: 0.8971 - val_loss: 0.3415 - val_acc: 0.8923\n",
351 | "Epoch 11/100\n",
352 | "11111/11111 [==============================] - 113s 10ms/step - loss: 0.2916 - acc: 0.9007 - val_loss: 0.3232 - val_acc: 0.8995\n",
353 | "Epoch 12/100\n",
354 | "11111/11111 [==============================] - 113s 10ms/step - loss: 0.2765 - acc: 0.9058 - val_loss: 0.3402 - val_acc: 0.8934\n",
355 | "Epoch 13/100\n",
356 | "11111/11111 [==============================] - 113s 10ms/step - loss: 0.2657 - acc: 0.9086 - val_loss: 0.3294 - val_acc: 0.8984\n",
357 | "Epoch 14/100\n",
358 | "11111/11111 [==============================] - 113s 10ms/step - loss: 0.2620 - acc: 0.9079 - val_loss: 0.3411 - val_acc: 0.8977\n",
359 | "FOLD: \n",
360 | "2775 11112\n",
361 | "Train on 11112 samples, validate on 2775 samples\n",
362 | "Epoch 1/100\n",
363 | "11112/11112 [==============================] - 116s 10ms/step - loss: 0.9019 - acc: 0.7001 - val_loss: 0.4956 - val_acc: 0.8436\n",
364 | "Epoch 2/100\n",
365 | "11112/11112 [==============================] - 113s 10ms/step - loss: 0.5189 - acc: 0.8322 - val_loss: 0.4210 - val_acc: 0.8695\n",
366 | "Epoch 3/100\n",
367 | "11112/11112 [==============================] - 113s 10ms/step - loss: 0.4525 - acc: 0.8543 - val_loss: 0.3906 - val_acc: 0.8778\n",
368 | "Epoch 4/100\n",
369 | "11112/11112 [==============================] - 113s 10ms/step - loss: 0.4038 - acc: 0.8721 - val_loss: 0.3832 - val_acc: 0.8674\n",
370 | "Epoch 5/100\n",
371 | "11112/11112 [==============================] - 113s 10ms/step - loss: 0.3802 - acc: 0.8790 - val_loss: 0.3687 - val_acc: 0.8836\n",
372 | "Epoch 6/100\n",
373 | "11112/11112 [==============================] - 113s 10ms/step - loss: 0.3563 - acc: 0.8813 - val_loss: 0.3739 - val_acc: 0.8807\n",
374 | "Epoch 7/100\n",
375 | "11112/11112 [==============================] - 113s 10ms/step - loss: 0.3277 - acc: 0.8909 - val_loss: 0.3597 - val_acc: 0.8840\n",
376 | "Epoch 8/100\n",
377 | "11112/11112 [==============================] - 113s 10ms/step - loss: 0.3239 - acc: 0.8935 - val_loss: 0.3534 - val_acc: 0.8901\n",
378 | "Epoch 9/100\n",
379 | "11112/11112 [==============================] - 113s 10ms/step - loss: 0.3061 - acc: 0.8954 - val_loss: 0.3581 - val_acc: 0.8861\n",
380 | "Epoch 10/100\n",
381 | "11112/11112 [==============================] - 113s 10ms/step - loss: 0.2973 - acc: 0.8994 - val_loss: 0.3528 - val_acc: 0.8901\n",
382 | "Epoch 11/100\n",
383 | "11112/11112 [==============================] - 113s 10ms/step - loss: 0.2875 - acc: 0.9035 - val_loss: 0.3537 - val_acc: 0.8847\n",
384 | "Epoch 12/100\n",
385 | "11112/11112 [==============================] - 113s 10ms/step - loss: 0.2736 - acc: 0.9060 - val_loss: 0.3596 - val_acc: 0.8908\n",
386 | "Epoch 13/100\n",
387 | "11112/11112 [==============================] - 113s 10ms/step - loss: 0.2613 - acc: 0.9078 - val_loss: 0.3521 - val_acc: 0.8908\n",
388 | "Epoch 14/100\n",
389 | "11112/11112 [==============================] - 113s 10ms/step - loss: 0.2639 - acc: 0.9055 - val_loss: 0.3457 - val_acc: 0.8926\n",
390 | "Epoch 15/100\n",
391 | "11112/11112 [==============================] - 113s 10ms/step - loss: 0.2514 - acc: 0.9121 - val_loss: 0.3702 - val_acc: 0.8865\n",
392 | "Epoch 16/100\n",
393 | "11112/11112 [==============================] - 113s 10ms/step - loss: 0.2497 - acc: 0.9112 - val_loss: 0.3684 - val_acc: 0.8905\n",
394 | "Epoch 17/100\n",
395 | "11112/11112 [==============================] - 113s 10ms/step - loss: 0.2366 - acc: 0.9147 - val_loss: 0.3700 - val_acc: 0.8908\n"
396 | ]
397 | }
398 | ],
399 | "source": [
400 | "import os\n",
401 | "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"3\"\n",
402 | "meta_train = np.zeros(shape = (len(train_seq),8))\n",
403 | "meta_test = np.zeros(shape = (len(test_seq),8))\n",
404 | "FLAG = False\n",
405 | "for i,(tr_ind,te_ind) in enumerate(skf):\n",
406 | "    if i in [3,4]:  # resume: only folds 3 and 4 still need training\n",
407 | " FLAG = True\n",
408 | "    print('FOLD: {}'.format(i))\n",
409 | " print(len(te_ind),len(tr_ind))\n",
410 | " model = TextCNN(max_len,max_cnt,embed_size,num_filters,kernel_size,conv_action,mask_zero)\n",
411 | " model_name = 'benchmark_dilated_textcnn_fold_'+str(i)\n",
412 | " X_train,X_train_label = train_seq[tr_ind],train_labels[tr_ind]\n",
413 | " X_val,X_val_label = train_seq[te_ind],train_labels[te_ind]\n",
414 | " \n",
415 | " model = TextCNN(max_len,max_cnt,embed_size,\n",
416 | " num_filters,kernel_size,\n",
417 | " conv_action,\n",
418 | " mask_zero)\n",
419 | " \n",
420 | " model_save_path = '../model_weight_final/%s_%s.hdf5'%(model_name,embed_size)\n",
421 | "    early_stopping = EarlyStopping(monitor='val_loss', patience=3)\n",
422 | " model_checkpoint = ModelCheckpoint(model_save_path, save_best_only=True, save_weights_only=True)\n",
423 | " if TRAIN and FLAG:\n",
424 | " model.fit(X_train,X_train_label,\n",
425 | " validation_data=(X_val,X_val_label),\n",
426 | " epochs=100,batch_size=64,\n",
427 | " shuffle=True,\n",
428 | " callbacks=[early_stopping,model_checkpoint]\n",
429 | " )\n",
430 | " \n",
431 | " #model.load_weights(model_save_path)\n",
432 | " #pred_val = model.predict(X_val,batch_size=128)\n",
433 | " #pred_test = model.predict(test_seq,batch_size=128)\n",
434 | " \n",
435 | " #meta_train[te_ind] = pred_val\n",
436 | " #meta_test += pred_test\n",
437 | " FLAG = False\n",
438 | " #K.clear_session()\n",
439 | "#meta_test /= 5.0\n"
440 | ]
441 | },
442 | {
443 | "cell_type": "code",
444 | "execution_count": null,
445 | "metadata": {},
446 | "outputs": [],
447 | "source": [
448 | "pd.to_pickle(meta_train,'../feature_final/train_meta_dilated_cnn.pkl')\n",
449 | "pd.to_pickle(meta_test,'../feature_final/test_meta_dilated_cnn.pkl')"
450 | ]
451 | },
452 | {
453 | "cell_type": "code",
454 | "execution_count": null,
455 | "metadata": {},
456 | "outputs": [],
457 | "source": [
458 | "print('1322')"
459 | ]
460 | },
461 | {
462 | "cell_type": "code",
463 | "execution_count": null,
464 | "metadata": {},
465 | "outputs": [],
466 | "source": []
467 | }
468 | ],
469 | "metadata": {
470 | "kernelspec": {
471 | "display_name": "Python 3",
472 | "language": "python",
473 | "name": "python3"
474 | },
475 | "language_info": {
476 | "codemirror_mode": {
477 | "name": "ipython",
478 | "version": 3
479 | },
480 | "file_extension": ".py",
481 | "mimetype": "text/x-python",
482 | "name": "python",
483 | "nbconvert_exporter": "python",
484 | "pygments_lexer": "ipython3",
485 | "version": "3.6.5"
486 | }
487 | },
488 | "nbformat": 4,
489 | "nbformat_minor": 2
490 | }
491 |
--------------------------------------------------------------------------------
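
This dilated variant differs from CNN_metafeature only in the inner
dilation_rate loop of TextCNN. The point is receptive field: a Conv1D with
kernel size k and dilation rate d covers d*(k-1)+1 timesteps, so with
kernel_size in [2,3,4,5] and dilation rates 1 to 4 the widest branch sees 17
consecutive API calls instead of 5, at the same parameter count. The
arithmetic, as a quick check:

    def conv1d_span(kernel_size, dilation_rate):
        # A dilated Conv1D taps kernel_size inputs spaced dilation_rate apart.
        return dilation_rate * (kernel_size - 1) + 1

    assert conv1d_span(5, 1) == 5   # ordinary convolution
    assert conv1d_span(5, 4) == 17  # widest dilated branch above
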
/.ipynb_checkpoints/gene_npy-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 3,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import pandas as pd\n",
10 | "import numpy as np"
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": 4,
16 | "metadata": {},
17 | "outputs": [],
18 | "source": [
19 | "test_data_2gram_final = pd.read_csv('./test_data_2gram_final.csv')\n",
20 | "train_data_2gram_final = pd.read_csv('./train_data_2gram_final.csv')"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": 6,
26 | "metadata": {},
27 | "outputs": [],
28 | "source": [
29 | "cols = [item for item in train_data_2gram_final.columns if item not in ['label']]\n",
30 | "np.save('../X_test.npy',test_data_2gram_final[cols].values)\n",
31 | "np.save('../X_train.npy',train_data_2gram_final[cols].values)\n",
32 | "np.save('../labels.npy',train_data_2gram_final['label'].values)"
33 | ]
34 | },
35 | {
36 | "cell_type": "code",
37 | "execution_count": 8,
38 | "metadata": {},
39 | "outputs": [
40 | {
41 | "data": {
42 | "text/plain": [
43 | "((13887, 3252), (12955, 3251))"
44 | ]
45 | },
46 | "execution_count": 8,
47 | "metadata": {},
48 | "output_type": "execute_result"
49 | }
50 | ],
51 | "source": [
52 | "train_data_2gram_final.shape,test_data_2gram_final.shape"
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": null,
58 | "metadata": {},
59 | "outputs": [],
60 | "source": []
61 | }
62 | ],
63 | "metadata": {
64 | "kernelspec": {
65 | "display_name": "Python 3",
66 | "language": "python",
67 | "name": "python3"
68 | },
69 | "language_info": {
70 | "codemirror_mode": {
71 | "name": "ipython",
72 | "version": 3
73 | },
74 | "file_extension": ".py",
75 | "mimetype": "text/x-python",
76 | "name": "python",
77 | "nbconvert_exporter": "python",
78 | "pygments_lexer": "ipython3",
79 | "version": "3.6.5"
80 | }
81 | },
82 | "nbformat": 4,
83 | "nbformat_minor": 2
84 | }
85 |
--------------------------------------------------------------------------------
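
gene_npy only converts the 2-gram feature CSVs to .npy so the other notebooks
can load them quickly. A round-trip sanity check, assuming the paths used above
and the shapes reported there (the train CSV's extra column is the label):

    import numpy as np

    X_train = np.load('../X_train.npy')
    X_test = np.load('../X_test.npy')
    labels = np.load('../labels.npy')

    assert X_train.shape == (13887, 3251)  # 3252 CSV columns minus 'label'
    assert X_test.shape == (12955, 3251)   # same feature columns as train
    assert labels.shape == (13887,)        # one label per training row
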
/.ipynb_checkpoints/lgb_meta_features-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 3,
6 | "metadata": {},
7 | "outputs": [
8 | {
9 | "name": "stderr",
10 | "output_type": "stream",
11 | "text": [
12 | "/home/user/anaconda2/lib/python2.7/site-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n",
13 | " \"This module will be removed in 0.20.\", DeprecationWarning)\n"
14 | ]
15 | }
16 | ],
17 | "source": [
18 | "import pandas as pd\n",
19 | "import numpy as np\n",
20 | "import lightgbm as lgb\n",
21 | "from sklearn.cross_validation import train_test_split\n",
22 | "import gc\n",
23 | "from sklearn.preprocessing import OneHotEncoder\n",
24 | "from sklearn.cross_validation import StratifiedKFold\n",
25 | "import datetime"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 4,
31 | "metadata": {},
32 | "outputs": [
33 | {
34 | "name": "stdout",
35 | "output_type": "stream",
36 | "text": [
37 | "cur time = 2018/09/21 18:54:08\n",
38 | "(13887, 3251) (12955, 3251)\n",
39 | "cur time = 2018/09/21 18:54:08\n"
40 | ]
41 | }
42 | ],
43 | "source": [
44 | "print('cur time = ' + str(datetime.datetime.now().strftime(\"%Y/%m/%d %H:%M:%S\")))\n",
45 | "train = np.load('../X_train.npy')\n",
46 | "test = np.load('../X_test.npy')\n",
47 | "train_labels = np.load('../labels.npy')\n",
48 | "print(train.shape, test.shape)\n",
49 | "print('cur time = ' + str(datetime.datetime.now().strftime(\"%Y/%m/%d %H:%M:%S\")))"
50 | ]
51 | },
52 | {
53 | "cell_type": "code",
54 | "execution_count": null,
55 | "metadata": {},
56 | "outputs": [],
57 | "source": [
58 | "skf = StratifiedKFold(train_labels, n_folds=5, shuffle=True, random_state=42)\n",
59 | "\n",
60 | "meta_train = np.zeros(shape = (len(train),8))\n",
61 | "meta_test = np.zeros(shape = (len(test),8))\n",
62 | "\n",
63 | "for i,(tr_ind,te_ind) in enumerate(skf):\n",
64 | "    print('FOLD: ', i)\n",
65 | "    print(len(te_ind), len(tr_ind))\n",
66 | " X_train,X_train_label = train[tr_ind],train_labels[tr_ind]\n",
67 | " X_val,X_val_label = train[te_ind],train_labels[te_ind]\n",
68 | " dtrain = lgb.Dataset(X_train,X_train_label) \n",
69 | " dval = lgb.Dataset(X_val,X_val_label, reference = dtrain) \n",
70 | " params = {\n",
71 | " 'task':'train', \n",
72 | " 'boosting_type':'gbdt',\n",
73 | " 'num_leaves': 15,\n",
74 | " 'objective': 'multiclass',\n",
75 | " 'num_class':8,\n",
76 | " 'learning_rate': 0.05,\n",
77 | " 'feature_fraction': 0.85,\n",
78 | " 'subsample':0.85,\n",
79 | " 'num_threads': 32,\n",
80 | " 'metric':'multi_logloss',\n",
81 | " 'seed':100\n",
82 | " } \n",
83 | " model = lgb.train(params, dtrain, num_boost_round=100000,valid_sets=[dtrain,dval],verbose_eval=100, early_stopping_rounds=100) \n",
84 | " pred_val = model.predict(X_val)\n",
85 | " pred_test = model.predict(test)\n",
86 | " \n",
87 | " meta_train[te_ind] = pred_val\n",
88 | " meta_test += pred_test\n",
89 | "meta_test /= 5.0"
90 | ]
91 | },
92 | {
93 | "cell_type": "code",
94 | "execution_count": null,
95 | "metadata": {},
96 | "outputs": [],
97 | "source": [
98 | "pd.to_pickle(meta_train,'../train_meta_lgb_1.pkl')\n",
99 | "pd.to_pickle(meta_test,'../test_meta_lgb_1.pkl')"
100 | ]
101 | },
102 | {
103 | "cell_type": "code",
104 | "execution_count": 5,
105 | "metadata": {
106 | "scrolled": false
107 | },
108 | "outputs": [
109 | {
110 | "name": "stdout",
111 | "output_type": "stream",
112 | "text": [
113 | "FOLD: 0\n",
114 | "2780 11107\n",
115 | "Training until validation scores don't improve for 100 rounds.\n",
116 | "[100]\ttraining's multi_logloss: 0.105693\tvalid_1's multi_logloss: 0.290438\n",
117 | "[200]\ttraining's multi_logloss: 0.0243107\tvalid_1's multi_logloss: 0.28446\n",
118 | "Early stopping, best iteration is:\n",
119 | "[145]\ttraining's multi_logloss: 0.0517928\tvalid_1's multi_logloss: 0.277273\n",
120 | "FOLD: 1\n",
121 | "2779 11108\n",
122 | "Training until validation scores don't improve for 100 rounds.\n",
123 | "[100]\ttraining's multi_logloss: 0.108126\tvalid_1's multi_logloss: 0.284527\n",
124 | "[200]\ttraining's multi_logloss: 0.0254294\tvalid_1's multi_logloss: 0.283195\n",
125 | "Early stopping, best iteration is:\n",
126 | "[139]\ttraining's multi_logloss: 0.0583621\tvalid_1's multi_logloss: 0.273231\n",
127 | "FOLD: 2\n",
128 | "2777 11110\n",
129 | "Training until validation scores don't improve for 100 rounds.\n",
130 | "[100]\ttraining's multi_logloss: 0.107591\tvalid_1's multi_logloss: 0.271276\n",
131 | "[200]\ttraining's multi_logloss: 0.0256978\tvalid_1's multi_logloss: 0.267876\n",
132 | "Early stopping, best iteration is:\n",
133 | "[151]\ttraining's multi_logloss: 0.0490566\tvalid_1's multi_logloss: 0.258754\n",
134 | "FOLD: 3\n",
135 | "2776 11111\n",
136 | "Training until validation scores don't improve for 100 rounds.\n",
137 | "[100]\ttraining's multi_logloss: 0.109872\tvalid_1's multi_logloss: 0.2752\n",
138 | "[200]\ttraining's multi_logloss: 0.0267958\tvalid_1's multi_logloss: 0.266528\n",
139 | "Early stopping, best iteration is:\n",
140 | "[153]\ttraining's multi_logloss: 0.0492415\tvalid_1's multi_logloss: 0.260417\n",
141 | "FOLD: 4\n",
142 | "2775 11112\n",
143 | "Training until validation scores don't improve for 100 rounds.\n",
144 | "[100]\ttraining's multi_logloss: 0.108239\tvalid_1's multi_logloss: 0.286993\n",
145 | "[200]\ttraining's multi_logloss: 0.0260953\tvalid_1's multi_logloss: 0.276078\n",
146 | "Early stopping, best iteration is:\n",
147 | "[155]\ttraining's multi_logloss: 0.0471788\tvalid_1's multi_logloss: 0.270497\n"
148 | ]
149 | }
150 | ],
151 | "source": [
152 | "skf = StratifiedKFold(train_labels, n_folds=5, shuffle=True, random_state=42)\n",
153 | "\n",
154 | "meta_train = np.zeros(shape = (len(train),8))\n",
155 | "meta_test = np.zeros(shape = (len(test),8))\n",
156 | "\n",
157 | "for i,(tr_ind,te_ind) in enumerate(skf):\n",
158 | "    print('FOLD: ', i)\n",
159 | "    print(len(te_ind), len(tr_ind))\n",
160 | " X_train,X_train_label = train[tr_ind],train_labels[tr_ind]\n",
161 | " X_val,X_val_label = train[te_ind],train_labels[te_ind]\n",
162 | " dtrain = lgb.Dataset(X_train,X_train_label) \n",
163 | " dval = lgb.Dataset(X_val,X_val_label, reference = dtrain) \n",
164 | " params = {\n",
165 | " 'task':'train', \n",
166 | " 'boosting_type':'gbdt',\n",
167 | " 'num_leaves': 31,\n",
168 | " 'objective': 'multiclass',\n",
169 | " 'num_class':8,\n",
170 | " 'learning_rate': 0.05,\n",
171 | " 'feature_fraction': 0.85,\n",
172 | " 'subsample':0.85,\n",
173 | " 'num_threads': 32,\n",
174 | " 'metric':'multi_logloss',\n",
175 | " 'seed':100\n",
176 | " } \n",
177 | " model = lgb.train(params, dtrain, num_boost_round=100000,valid_sets=[dtrain,dval],verbose_eval=100, early_stopping_rounds=100) \n",
178 | " pred_val = model.predict(X_val)\n",
179 | " pred_test = model.predict(test)\n",
180 | " \n",
181 | " meta_train[te_ind] = pred_val\n",
182 | " meta_test += pred_test\n",
183 | "meta_test /= 5.0"
184 | ]
185 | },
186 | {
187 | "cell_type": "code",
188 | "execution_count": 6,
189 | "metadata": {},
190 | "outputs": [],
191 | "source": [
192 | "pd.to_pickle(meta_train,'../train_meta_lgb_2.pkl')\n",
193 | "pd.to_pickle(meta_test,'../test_meta_lgb_2.pkl')"
194 | ]
195 | },
196 | {
197 | "cell_type": "code",
198 | "execution_count": 7,
199 | "metadata": {},
200 | "outputs": [
201 | {
202 | "name": "stdout",
203 | "output_type": "stream",
204 | "text": [
205 | "FOLD: 0\n",
206 | "2780 11107\n",
207 | "Training until validation scores don't improve for 100 rounds.\n",
208 | "[100]\ttraining's multi_logloss: 0.126813\tvalid_1's multi_logloss: 0.299223\n",
209 | "[200]\ttraining's multi_logloss: 0.0319222\tvalid_1's multi_logloss: 0.278803\n",
210 | "Early stopping, best iteration is:\n",
211 | "[161]\ttraining's multi_logloss: 0.0520005\tvalid_1's multi_logloss: 0.276196\n",
212 | "FOLD: 1\n",
213 | "2779 11108\n",
214 | "Training until validation scores don't improve for 100 rounds.\n",
215 | "[100]\ttraining's multi_logloss: 0.128834\tvalid_1's multi_logloss: 0.292494\n",
216 | "[200]\ttraining's multi_logloss: 0.0332951\tvalid_1's multi_logloss: 0.277843\n",
217 | "Early stopping, best iteration is:\n",
218 | "[153]\ttraining's multi_logloss: 0.0597567\tvalid_1's multi_logloss: 0.272742\n",
219 | "FOLD: 2\n",
220 | "2777 11110\n",
221 | "Training until validation scores don't improve for 100 rounds.\n",
222 | "[100]\ttraining's multi_logloss: 0.128497\tvalid_1's multi_logloss: 0.279648\n",
223 | "[200]\ttraining's multi_logloss: 0.0334364\tvalid_1's multi_logloss: 0.263845\n",
224 | "Early stopping, best iteration is:\n",
225 | "[159]\ttraining's multi_logloss: 0.0551787\tvalid_1's multi_logloss: 0.25859\n",
226 | "FOLD: 3\n",
227 | "2776 11111\n",
228 | "Training until validation scores don't improve for 100 rounds.\n",
229 | "[100]\ttraining's multi_logloss: 0.130386\tvalid_1's multi_logloss: 0.286192\n",
230 | "[200]\ttraining's multi_logloss: 0.0347223\tvalid_1's multi_logloss: 0.263253\n",
231 | "Early stopping, best iteration is:\n",
232 | "[169]\ttraining's multi_logloss: 0.0501232\tvalid_1's multi_logloss: 0.260649\n",
233 | "FOLD: 4\n",
234 | "2775 11112\n",
235 | "Training until validation scores don't improve for 100 rounds.\n",
236 | "[100]\ttraining's multi_logloss: 0.129009\tvalid_1's multi_logloss: 0.296055\n",
237 | "[200]\ttraining's multi_logloss: 0.0340881\tvalid_1's multi_logloss: 0.274158\n",
238 | "Early stopping, best iteration is:\n",
239 | "[173]\ttraining's multi_logloss: 0.0469372\tvalid_1's multi_logloss: 0.272973\n"
240 | ]
241 | }
242 | ],
243 | "source": [
244 | "skf = StratifiedKFold(train_labels, n_folds=5, shuffle=True, random_state=42)\n",
245 | "\n",
246 | "meta_train = np.zeros(shape = (len(train),8))\n",
247 | "meta_test = np.zeros(shape = (len(test),8))\n",
248 | "\n",
249 | "for i,(tr_ind,te_ind) in enumerate(skf):\n",
250 | "    print('FOLD: ', i)\n",
251 | "    print(len(te_ind), len(tr_ind))\n",
252 | " X_train,X_train_label = train[tr_ind],train_labels[tr_ind]\n",
253 | " X_val,X_val_label = train[te_ind],train_labels[te_ind]\n",
254 | " dtrain = lgb.Dataset(X_train,X_train_label) \n",
255 | " dval = lgb.Dataset(X_val,X_val_label, reference = dtrain) \n",
256 | " params = {\n",
257 | " 'task':'train', \n",
258 | " 'boosting_type':'gbdt',\n",
259 | " 'num_leaves': 31,\n",
260 | " 'objective': 'multiclass',\n",
261 | " 'num_class':8,\n",
262 | " 'learning_rate': 0.045,\n",
263 | " 'feature_fraction': 0.8,\n",
264 | " 'subsample':0.8,\n",
265 | " 'num_threads': 32,\n",
266 | " 'metric':'multi_logloss',\n",
267 | " 'seed':100\n",
268 | " } \n",
269 | " model = lgb.train(params, dtrain, num_boost_round=100000,valid_sets=[dtrain,dval],verbose_eval=100, early_stopping_rounds=100) \n",
270 | " pred_val = model.predict(X_val)\n",
271 | " pred_test = model.predict(test)\n",
272 | " \n",
273 | " meta_train[te_ind] = pred_val\n",
274 | " meta_test += pred_test\n",
275 | "meta_test /= 5.0"
276 | ]
277 | },
278 | {
279 | "cell_type": "code",
280 | "execution_count": 8,
281 | "metadata": {},
282 | "outputs": [],
283 | "source": [
284 | "pd.to_pickle(meta_train,'../train_meta_lgb_3.pkl')\n",
285 | "pd.to_pickle(meta_test,'../test_meta_lgb_3.pkl')"
286 | ]
287 | },
288 | {
289 | "cell_type": "code",
290 | "execution_count": null,
291 | "metadata": {},
292 | "outputs": [],
293 | "source": []
294 | },
295 | {
296 | "cell_type": "code",
297 | "execution_count": 9,
298 | "metadata": {},
299 | "outputs": [
300 | {
301 | "name": "stdout",
302 | "output_type": "stream",
303 | "text": [
304 | "FOLD: 0\n",
305 | "2780 11107\n",
306 | "Training until validation scores don't improve for 100 rounds.\n",
307 | "[100]\ttraining's multi_logloss: 0.0771172\tvalid_1's multi_logloss: 0.289138\n",
308 | "[200]\ttraining's multi_logloss: 0.00851115\tvalid_1's multi_logloss: 0.298243\n",
309 | "Early stopping, best iteration is:\n",
310 | "[133]\ttraining's multi_logloss: 0.0357694\tvalid_1's multi_logloss: 0.27818\n",
311 | "FOLD: 1\n",
312 | "2779 11108\n",
313 | "Training until validation scores don't improve for 100 rounds.\n",
314 | "[100]\ttraining's multi_logloss: 0.0780999\tvalid_1's multi_logloss: 0.289059\n",
315 | "[200]\ttraining's multi_logloss: 0.00887645\tvalid_1's multi_logloss: 0.298286\n",
316 | "Early stopping, best iteration is:\n",
317 | "[134]\ttraining's multi_logloss: 0.0357742\tvalid_1's multi_logloss: 0.278663\n",
318 | "FOLD: 2\n",
319 | "2777 11110\n",
320 | "Training until validation scores don't improve for 100 rounds.\n",
321 | "[100]\ttraining's multi_logloss: 0.0784245\tvalid_1's multi_logloss: 0.274011\n",
322 | "[200]\ttraining's multi_logloss: 0.00891692\tvalid_1's multi_logloss: 0.282485\n",
323 | "Early stopping, best iteration is:\n",
324 | "[134]\ttraining's multi_logloss: 0.0356565\tvalid_1's multi_logloss: 0.263027\n",
325 | "FOLD: 3\n",
326 | "2776 11111\n",
327 | "Training until validation scores don't improve for 100 rounds.\n",
328 | "[100]\ttraining's multi_logloss: 0.0795669\tvalid_1's multi_logloss: 0.280272\n",
329 | "[200]\ttraining's multi_logloss: 0.00927117\tvalid_1's multi_logloss: 0.284248\n",
330 | "Early stopping, best iteration is:\n",
331 | "[135]\ttraining's multi_logloss: 0.0357068\tvalid_1's multi_logloss: 0.267277\n",
332 | "FOLD: 4\n",
333 | "2775 11112\n",
334 | "Training until validation scores don't improve for 100 rounds.\n",
335 | "[100]\ttraining's multi_logloss: 0.0782005\tvalid_1's multi_logloss: 0.287082\n",
336 | "[200]\ttraining's multi_logloss: 0.00896856\tvalid_1's multi_logloss: 0.294814\n",
337 | "Early stopping, best iteration is:\n",
338 | "[129]\ttraining's multi_logloss: 0.0400827\tvalid_1's multi_logloss: 0.277252\n"
339 | ]
340 | }
341 | ],
342 | "source": [
343 | "skf = StratifiedKFold(train_labels, n_folds=5, shuffle=True, random_state=42)\n",
344 | "\n",
345 | "meta_train = np.zeros(shape = (len(train),8))\n",
346 | "meta_test = np.zeros(shape = (len(test),8))\n",
347 | "\n",
348 | "for i,(tr_ind,te_ind) in enumerate(skf):\n",
349 | "    print('FOLD: ', i)\n",
350 | "    print(len(te_ind), len(tr_ind))\n",
351 | " X_train,X_train_label = train[tr_ind],train_labels[tr_ind]\n",
352 | " X_val,X_val_label = train[te_ind],train_labels[te_ind]\n",
353 | " dtrain = lgb.Dataset(X_train,X_train_label) \n",
354 | " dval = lgb.Dataset(X_val,X_val_label, reference = dtrain) \n",
355 | " params = {\n",
356 | " 'task':'train', \n",
357 | " 'boosting_type':'gbdt',\n",
358 | " 'num_leaves': 63,\n",
359 | " 'objective': 'multiclass',\n",
360 | " 'num_class':8,\n",
361 | " 'learning_rate': 0.045,\n",
362 | " 'feature_fraction': 0.5,\n",
363 | " 'subsample':0.7,\n",
364 | " 'num_threads': 54,\n",
365 | " 'metric':'multi_logloss',\n",
366 | " 'seed':100\n",
367 | " } \n",
368 | " model = lgb.train(params, dtrain, num_boost_round=100000,valid_sets=[dtrain,dval],verbose_eval=100, early_stopping_rounds=100) \n",
369 | " pred_val = model.predict(X_val)\n",
370 | " pred_test = model.predict(test)\n",
371 | " \n",
372 | " meta_train[te_ind] = pred_val\n",
373 | " meta_test += pred_test\n",
374 | "meta_test /= 5.0\n",
375 | "\n",
376 | "pd.to_pickle(meta_train,'../train_meta_lgb_4.pkl')\n",
377 | "pd.to_pickle(meta_test,'../test_meta_lgb_4.pkl')"
378 | ]
379 | },
380 | {
381 | "cell_type": "code",
382 | "execution_count": null,
383 | "metadata": {},
384 | "outputs": [],
385 | "source": []
386 | }
387 | ],
388 | "metadata": {
389 | "kernelspec": {
390 | "display_name": "Python 3",
391 | "language": "python",
392 | "name": "python3"
393 | },
394 | "language_info": {
395 | "codemirror_mode": {
396 | "name": "ipython",
397 | "version": 3
398 | },
399 | "file_extension": ".py",
400 | "mimetype": "text/x-python",
401 | "name": "python",
402 | "nbconvert_exporter": "python",
403 | "pygments_lexer": "ipython3",
404 | "version": "3.6.5"
405 | }
406 | },
407 | "nbformat": 4,
408 | "nbformat_minor": 2
409 | }
410 |
--------------------------------------------------------------------------------
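
The four LightGBM runs above share the 5-fold OOF loop, early stopping after
100 rounds, and the multi_logloss objective; they differ only in num_leaves,
learning_rate, feature_fraction and subsample (plus a thread-count bump in the
last run). A sketch that factors the shared loop over those configurations,
where lgb_oof is a hypothetical wrapper around the notebook's fold loop:

    import pandas as pd

    base = {'task': 'train', 'boosting_type': 'gbdt', 'objective': 'multiclass',
            'num_class': 8, 'num_threads': 32, 'metric': 'multi_logloss', 'seed': 100}
    grid = [
        dict(num_leaves=15, learning_rate=0.050, feature_fraction=0.85, subsample=0.85),
        dict(num_leaves=31, learning_rate=0.050, feature_fraction=0.85, subsample=0.85),
        dict(num_leaves=31, learning_rate=0.045, feature_fraction=0.80, subsample=0.80),
        dict(num_leaves=63, learning_rate=0.045, feature_fraction=0.50, subsample=0.70),
    ]
    for k, extra in enumerate(grid, start=1):
        meta_train, meta_test = lgb_oof({**base, **extra}, train, train_labels, test)
        pd.to_pickle(meta_train, '../train_meta_lgb_%d.pkl' % k)
        pd.to_pickle(meta_test, '../test_meta_lgb_%d.pkl' % k)
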
/.ipynb_checkpoints/pickle_pre-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 4,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import pandas as pd\n",
10 | "\n",
11 | "meta_train = pd.read_pickle('../meta/train_meta_dilated_cnn.pkl')\n",
12 | "meta_test = pd.read_pickle('../meta/test_meta_dilated_cnn.pkl')\n",
13 | "\n",
14 | "import pickle\n",
15 | "\n",
16 | "f=open('../meta/train_meta_dilated_cnn_a.pkl','wb') \n",
17 | "pickle.dump(meta_train,f,0)  # protocol 0: ASCII pickle, readable from Python 2 and 3\n",
18 | "f.close()\n",
19 | "\n",
20 | "f=open('../meta/test_meta_dilated_cnn_a.pkl','wb') \n",
21 | "pickle.dump(meta_test,f,0) \n",
22 | "f.close()"
23 | ]
24 | },
25 | {
26 | "cell_type": "code",
27 | "execution_count": null,
28 | "metadata": {},
29 | "outputs": [],
30 | "source": [
31 | "meta_train = pd.read_pickle('../meta/train_meta_cnn.pkl')\n",
32 | "meta_test = pd.read_pickle('../meta/test_meta_cnn.pkl')\n",
33 | "\n",
34 | "f=open('../meta/train_meta_cnn_a.pkl','wb') \n",
35 | "pickle.dump(meta_train,f,0) \n",
36 | "f.close()\n",
37 | "\n",
38 | "f=open('../meta/test_meta_cnn_a.pkl','wb') \n",
39 | "pickle.dump(meta_test,f,0) \n",
40 | "f.close()"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": 2,
46 | "metadata": {},
47 | "outputs": [
48 | {
49 | "data": {
50 | "text/plain": [
51 | "'/Users/didi/天池/安全赛复赛/temp'"
52 | ]
53 | },
54 | "execution_count": 2,
55 | "metadata": {},
56 | "output_type": "execute_result"
57 | }
58 | ],
59 | "source": [
60 | "%pwd"
61 | ]
62 | },
63 | {
64 | "cell_type": "code",
65 | "execution_count": null,
66 | "metadata": {},
67 | "outputs": [],
68 | "source": []
69 | }
70 | ],
71 | "metadata": {
72 | "kernelspec": {
73 | "display_name": "Python 3",
74 | "language": "python",
75 | "name": "python3"
76 | },
77 | "language_info": {
78 | "codemirror_mode": {
79 | "name": "ipython",
80 | "version": 3
81 | },
82 | "file_extension": ".py",
83 | "mimetype": "text/x-python",
84 | "name": "python",
85 | "nbconvert_exporter": "python",
86 | "pygments_lexer": "ipython3",
87 | "version": "3.6.5"
88 | }
89 | },
90 | "nbformat": 4,
91 | "nbformat_minor": 2
92 | }
93 |
--------------------------------------------------------------------------------
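
pickle_pre re-dumps the meta-feature pickles with pickle protocol 0, the ASCII
protocol that every Python version can read; presumably this is so the
Python 2 environment running the LightGBM/stacking notebooks (note the
anaconda2 paths in their warnings) can load arrays pickled under Python 3.
The same round-trip as a minimal sketch:

    import pickle
    import pandas as pd

    def redump_ascii(src, dst):
        obj = pd.read_pickle(src)      # handles numpy arrays pickled by pandas
        with open(dst, 'wb') as f:
            pickle.dump(obj, f, 0)     # protocol 0: ASCII, Python-2 compatible

    redump_ascii('../meta/train_meta_cnn.pkl', '../meta/train_meta_cnn_a.pkl')
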
/.ipynb_checkpoints/submit-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 26,
6 | "metadata": {
7 | "scrolled": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "# coding: utf-8\n",
12 | "\n",
13 | "# In[1]:\n",
14 | "\n",
15 | "\n",
16 | "import pandas as pd\n",
17 | "import numpy as np\n",
18 | "import lightgbm as lgb\n",
19 | "from sklearn.cross_validation import train_test_split\n",
20 | "import gc\n",
21 | "from sklearn.preprocessing import OneHotEncoder\n",
22 | "import datetime\n",
23 | "from sklearn.cross_validation import StratifiedKFold\n",
24 | "\n",
25 | "# In[2]:"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 27,
31 | "metadata": {},
32 | "outputs": [
33 | {
34 | "name": "stdout",
35 | "output_type": "stream",
36 | "text": [
37 | "cur time = 2018/09/21 20:10:16\n",
38 | "(13887, 3251) (12955, 3251)\n",
39 | "cur time = 2018/09/21 20:10:16\n"
40 | ]
41 | }
42 | ],
43 | "source": [
44 | "print('cur time = ' + str(datetime.datetime.now().strftime(\"%Y/%m/%d %H:%M:%S\")))\n",
45 | "train = np.load('../X_train.npy')\n",
46 | "test = np.load('../X_test.npy')\n",
47 | "train_labels = np.load('../labels.npy')\n",
48 | "\n",
49 | "print(train.shape, test.shape)\n",
50 | "print('cur time = ' + str(datetime.datetime.now().strftime(\"%Y/%m/%d %H:%M:%S\")))"
51 | ]
52 | },
53 | {
54 | "cell_type": "code",
55 | "execution_count": 28,
56 | "metadata": {},
57 | "outputs": [],
58 | "source": [
59 | "train_cnn_1 = pd.read_pickle('../train_meta_cnn_a.pkl')\n",
60 | "test_cnn_1 = pd.read_pickle('../test_meta_cnn_a.pkl')"
61 | ]
62 | },
63 | {
64 | "cell_type": "code",
65 | "execution_count": 29,
66 | "metadata": {},
67 | "outputs": [],
68 | "source": [
69 | "train_cnn_2 = pd.read_pickle('../train_meta_dilated_cnn_a.pkl')\n",
70 | "test_cnn_2 = pd.read_pickle('../test_meta_dilated_cnn_a.pkl')"
71 | ]
72 | },
73 | {
74 | "cell_type": "code",
75 | "execution_count": 30,
76 | "metadata": {},
77 | "outputs": [],
78 | "source": [
79 | "train_lgb_1 = pd.read_pickle('../train_meta_lgb_1.pkl')\n",
80 | "test_lgb_1 = pd.read_pickle('../test_meta_lgb_1.pkl')\n",
81 | "\n",
82 | "train_lgb_2 = pd.read_pickle('../train_meta_lgb_2.pkl')\n",
83 | "test_lgb_2 = pd.read_pickle('../test_meta_lgb_2.pkl')\n",
84 | "\n",
85 | "train_lgb_3 = pd.read_pickle('../train_meta_lgb_3.pkl')\n",
86 | "test_lgb_3 = pd.read_pickle('../test_meta_lgb_3.pkl')\n",
87 | "\n",
88 | "train_lgb_4 = pd.read_pickle('../train_meta_lgb_4.pkl')\n",
89 | "test_lgb_4 = pd.read_pickle('../test_meta_lgb_4.pkl')"
90 | ]
91 | },
92 | {
93 | "cell_type": "code",
94 | "execution_count": 33,
95 | "metadata": {},
96 | "outputs": [],
97 | "source": [
98 | "train = np.hstack([train,train_cnn_1, train_cnn_2, train_lgb_1, train_lgb_2, train_lgb_3, train_lgb_4])\n",
99 | "test = np.hstack([test,test_cnn_1, test_cnn_2, test_lgb_1, test_lgb_2, test_lgb_3, test_lgb_4])"
100 | ]
101 | },
102 | {
103 | "cell_type": "code",
104 | "execution_count": 36,
105 | "metadata": {},
106 | "outputs": [
107 | {
108 | "name": "stdout",
109 | "output_type": "stream",
110 | "text": [
111 | "Times: 0\n",
112 | "cur time = 2018/09/21 20:12:20\n",
113 | "FOLD: 0\n",
114 | "2780 11107\n",
115 | "Training until validation scores don't improve for 100 rounds.\n",
116 | "[100]\ttraining's multi_logloss: 0.70958\tvalid_1's multi_logloss: 0.755308\n",
117 | "[200]\ttraining's multi_logloss: 0.354997\tvalid_1's multi_logloss: 0.434251\n",
118 | "[300]\ttraining's multi_logloss: 0.217096\tvalid_1's multi_logloss: 0.326171\n",
119 | "[400]\ttraining's multi_logloss: 0.152399\tvalid_1's multi_logloss: 0.289922\n",
120 | "[500]\ttraining's multi_logloss: 0.117076\tvalid_1's multi_logloss: 0.278519\n",
121 | "[600]\ttraining's multi_logloss: 0.094704\tvalid_1's multi_logloss: 0.277246\n",
122 | "Early stopping, best iteration is:\n",
123 | "[598]\ttraining's multi_logloss: 0.0950629\tvalid_1's multi_logloss: 0.277223\n",
124 | "cur time = 2018/09/21 20:13:46\n",
125 | "FOLD: 1\n",
126 | "2779 11108\n",
127 | "Training until validation scores don't improve for 100 rounds.\n",
128 | "[100]\ttraining's multi_logloss: 0.714406\tvalid_1's multi_logloss: 0.746063\n",
129 | "[200]\ttraining's multi_logloss: 0.360927\tvalid_1's multi_logloss: 0.41752\n",
130 | "[300]\ttraining's multi_logloss: 0.223406\tvalid_1's multi_logloss: 0.305014\n",
131 | "[400]\ttraining's multi_logloss: 0.159355\tvalid_1's multi_logloss: 0.265401\n",
132 | "[500]\ttraining's multi_logloss: 0.123548\tvalid_1's multi_logloss: 0.251562\n",
133 | "[600]\ttraining's multi_logloss: 0.100453\tvalid_1's multi_logloss: 0.247619\n",
134 | "[700]\ttraining's multi_logloss: 0.0840086\tvalid_1's multi_logloss: 0.247471\n",
135 | "Early stopping, best iteration is:\n",
136 | "[645]\ttraining's multi_logloss: 0.0925202\tvalid_1's multi_logloss: 0.246913\n",
137 | "cur time = 2018/09/21 20:15:23\n",
138 | "FOLD: 2\n",
139 | "2777 11110\n",
140 | "Training until validation scores don't improve for 100 rounds.\n",
141 | "[100]\ttraining's multi_logloss: 0.710447\tvalid_1's multi_logloss: 0.758826\n",
142 | "[200]\ttraining's multi_logloss: 0.354958\tvalid_1's multi_logloss: 0.436029\n",
143 | "[300]\ttraining's multi_logloss: 0.216709\tvalid_1's multi_logloss: 0.326181\n",
144 | "[400]\ttraining's multi_logloss: 0.15243\tvalid_1's multi_logloss: 0.287969\n",
145 | "[500]\ttraining's multi_logloss: 0.117201\tvalid_1's multi_logloss: 0.275582\n",
146 | "[600]\ttraining's multi_logloss: 0.0948654\tvalid_1's multi_logloss: 0.273565\n",
147 | "Early stopping, best iteration is:\n",
148 | "[578]\ttraining's multi_logloss: 0.0990779\tvalid_1's multi_logloss: 0.273456\n",
149 | "cur time = 2018/09/21 20:16:47\n",
150 | "FOLD: 3\n",
151 | "2776 11111\n",
152 | "Training until validation scores don't improve for 100 rounds.\n",
153 | "[100]\ttraining's multi_logloss: 0.710814\tvalid_1's multi_logloss: 0.757495\n",
154 | "[200]\ttraining's multi_logloss: 0.356598\tvalid_1's multi_logloss: 0.432203\n",
155 | "[300]\ttraining's multi_logloss: 0.219223\tvalid_1's multi_logloss: 0.319802\n",
156 | "[400]\ttraining's multi_logloss: 0.154809\tvalid_1's multi_logloss: 0.280013\n",
157 | "[500]\ttraining's multi_logloss: 0.118818\tvalid_1's multi_logloss: 0.2661\n",
158 | "[600]\ttraining's multi_logloss: 0.0962369\tvalid_1's multi_logloss: 0.262496\n",
159 | "[700]\ttraining's multi_logloss: 0.0801419\tvalid_1's multi_logloss: 0.262299\n",
160 | "Early stopping, best iteration is:\n",
161 | "[660]\ttraining's multi_logloss: 0.0860689\tvalid_1's multi_logloss: 0.261957\n",
162 | "cur time = 2018/09/21 20:18:25\n",
163 | "FOLD: 4\n",
164 | "2775 11112\n",
165 | "Training until validation scores don't improve for 100 rounds.\n",
166 | "[100]\ttraining's multi_logloss: 0.711242\tvalid_1's multi_logloss: 0.757122\n",
167 | "[200]\ttraining's multi_logloss: 0.357074\tvalid_1's multi_logloss: 0.432454\n",
168 | "[300]\ttraining's multi_logloss: 0.219319\tvalid_1's multi_logloss: 0.319717\n",
169 | "[400]\ttraining's multi_logloss: 0.155336\tvalid_1's multi_logloss: 0.27957\n",
170 | "[500]\ttraining's multi_logloss: 0.12014\tvalid_1's multi_logloss: 0.264735\n",
171 | "[600]\ttraining's multi_logloss: 0.0976771\tvalid_1's multi_logloss: 0.260499\n",
172 | "[700]\ttraining's multi_logloss: 0.0816694\tvalid_1's multi_logloss: 0.260613\n",
173 | "Early stopping, best iteration is:\n",
174 | "[625]\ttraining's multi_logloss: 0.0932625\tvalid_1's multi_logloss: 0.260267\n",
175 | "cur time = 2018/09/21 20:20:08\n"
176 | ]
177 | }
178 | ],
179 | "source": [
180 | "\n",
181 | "\n",
182 | "meta_test = np.zeros(shape = (len(test),8))\n",
183 | "\n",
184 | "for seed in range(1):\n",
185 | " print 'Times: ',seed\n",
186 | " print('cur time = ' + str(datetime.datetime.now().strftime(\"%Y/%m/%d %H:%M:%S\")))\n",
187 | " skf = StratifiedKFold(train_labels, n_folds=5, shuffle=True, random_state=seed)\n",
188 | " for i,(tr_ind,te_ind) in enumerate(skf):\n",
189 | " print 'FOLD: ',i\n",
190 | " print len(te_ind),len(tr_ind)\n",
191 | " X_train,X_train_label = train[tr_ind],train_labels[tr_ind]\n",
192 | " X_val,X_val_label = train[te_ind],train_labels[te_ind]\n",
193 | " dtrain = lgb.Dataset(X_train,X_train_label) \n",
194 | " dval = lgb.Dataset(X_val,X_val_label, reference = dtrain) \n",
195 | " params = {\n",
196 | " 'task':'train', \n",
197 | " 'boosting_type':'gbdt',\n",
198 | " 'num_leaves': 15,\n",
199 | " 'objective': 'multiclass',\n",
200 | " 'num_class':8,\n",
201 | " 'learning_rate': 0.01,\n",
202 | " 'feature_fraction': 0.85,\n",
203 | " 'subsample':0.85,\n",
204 | " 'num_threads': 54,\n",
205 | " 'metric':'multi_logloss',\n",
206 | " 'seed':seed\n",
207 | " } \n",
208 | " model = lgb.train(params, dtrain, num_boost_round=100000,valid_sets=[dtrain,dval],verbose_eval=100, early_stopping_rounds=100) \n",
209 | " pred_test = model.predict(test)\n",
210 | "\n",
211 | " #meta_train[te_ind] = pred_val\n",
212 | " meta_test += pred_test\n",
213 | " print('cur time = ' + str(datetime.datetime.now().strftime(\"%Y/%m/%d %H:%M:%S\")))\n",
214 | "\n",
215 | "meta_test/=5.0\n",
216 | "res = pd.DataFrame(meta_test,columns=['prob0','prob1','prob2','prob3','prob4','prob5','prob6','prob7'])\n",
217 | "res.index.name='file_id'\n",
218 | "res.round(7).to_csv('submit.csv', index = True, header=True)\n",
219 | " \n",
220 | " "
221 | ]
222 | },
223 | {
224 | "cell_type": "code",
225 | "execution_count": 74,
226 | "metadata": {},
227 | "outputs": [],
228 | "source": [
229 | "res.shape\n",
230 | "res.index = range(1,res.shape[0]+1)\n",
231 | "res.index.name = 'file_id'"
232 | ]
233 | },
234 | {
235 | "cell_type": "code",
236 | "execution_count": 77,
237 | "metadata": {},
238 | "outputs": [],
239 | "source": [
240 | "en =res.copy()"
241 | ]
242 | },
243 | {
244 | "cell_type": "code",
245 | "execution_count": 79,
246 | "metadata": {},
247 | "outputs": [
248 | {
249 | "data": {
250 | "text/plain": [
251 | "1.0000000000000004"
252 | ]
253 | },
254 | "execution_count": 79,
255 | "metadata": {},
256 | "output_type": "execute_result"
257 | }
258 | ],
259 | "source": [
260 | "en.sum(axis=1).max()"
261 | ]
262 | },
263 | {
264 | "cell_type": "code",
265 | "execution_count": 81,
266 | "metadata": {},
267 | "outputs": [],
268 | "source": [
269 | "en.to_csv('../fuucccccccck.csv',index=True,header=True)"
270 | ]
271 | },
272 | {
273 | "cell_type": "code",
274 | "execution_count": 83,
275 | "metadata": {},
276 | "outputs": [
277 | {
278 | "data": {
279 | "text/html": [
280 | "
\n",
281 | "\n",
294 | "
\n",
295 | " \n",
296 | " \n",
297 | " | \n",
298 | " prob0 | \n",
299 | " prob1 | \n",
300 | " prob2 | \n",
301 | " prob3 | \n",
302 | " prob4 | \n",
303 | " prob5 | \n",
304 | " prob6 | \n",
305 | " prob7 | \n",
306 | "
\n",
307 | " \n",
308 | " file_id | \n",
309 | " | \n",
310 | " | \n",
311 | " | \n",
312 | " | \n",
313 | " | \n",
314 | " | \n",
315 | " | \n",
316 | " | \n",
317 | "
\n",
318 | " \n",
319 | " \n",
320 | " \n",
321 | " 1 | \n",
322 | " 0.002035 | \n",
323 | " 0.002127 | \n",
324 | " 0.949751 | \n",
325 | " 0.009502 | \n",
326 | " 0.001805 | \n",
327 | " 0.002404 | \n",
328 | " 0.005550 | \n",
329 | " 0.026825 | \n",
330 | "
\n",
331 | " \n",
332 | " 2 | \n",
333 | " 0.931129 | \n",
334 | " 0.002137 | \n",
335 | " 0.003289 | \n",
336 | " 0.003913 | \n",
337 | " 0.002060 | \n",
338 | " 0.009254 | \n",
339 | " 0.010101 | \n",
340 | " 0.038117 | \n",
341 | "
\n",
342 | " \n",
343 | " 3 | \n",
344 | " 0.996000 | \n",
345 | " 0.000453 | \n",
346 | " 0.000597 | \n",
347 | " 0.000630 | \n",
348 | " 0.000429 | \n",
349 | " 0.000575 | \n",
350 | " 0.000560 | \n",
351 | " 0.000755 | \n",
352 | "
\n",
353 | " \n",
354 | " 4 | \n",
355 | " 0.013627 | \n",
356 | " 0.008015 | \n",
357 | " 0.018625 | \n",
358 | " 0.098806 | \n",
359 | " 0.054051 | \n",
360 | " 0.092254 | \n",
361 | " 0.180903 | \n",
362 | " 0.533720 | \n",
363 | "
\n",
364 | " \n",
365 | " 5 | \n",
366 | " 0.993833 | \n",
367 | " 0.000578 | \n",
368 | " 0.001065 | \n",
369 | " 0.000852 | \n",
370 | " 0.000608 | \n",
371 | " 0.000776 | \n",
372 | " 0.000779 | \n",
373 | " 0.001510 | \n",
374 | "
\n",
375 | " \n",
376 | "
\n",
377 | "
"
378 | ],
379 | "text/plain": [
380 | " prob0 prob1 prob2 prob3 prob4 prob5 prob6 \\\n",
381 | "file_id \n",
382 | "1 0.002035 0.002127 0.949751 0.009502 0.001805 0.002404 0.005550 \n",
383 | "2 0.931129 0.002137 0.003289 0.003913 0.002060 0.009254 0.010101 \n",
384 | "3 0.996000 0.000453 0.000597 0.000630 0.000429 0.000575 0.000560 \n",
385 | "4 0.013627 0.008015 0.018625 0.098806 0.054051 0.092254 0.180903 \n",
386 | "5 0.993833 0.000578 0.001065 0.000852 0.000608 0.000776 0.000779 \n",
387 | "\n",
388 | " prob7 \n",
389 | "file_id \n",
390 | "1 0.026825 \n",
391 | "2 0.038117 \n",
392 | "3 0.000755 \n",
393 | "4 0.533720 \n",
394 | "5 0.001510 "
395 | ]
396 | },
397 | "execution_count": 83,
398 | "metadata": {},
399 | "output_type": "execute_result"
400 | }
401 | ],
402 | "source": [
403 | "en.head()"
404 | ]
405 | },
406 | {
407 | "cell_type": "code",
408 | "execution_count": null,
409 | "metadata": {},
410 | "outputs": [],
411 | "source": []
412 | }
413 | ],
414 | "metadata": {
415 | "kernelspec": {
416 | "display_name": "Python 3",
417 | "language": "python",
418 | "name": "python3"
419 | },
420 | "language_info": {
421 | "codemirror_mode": {
422 | "name": "ipython",
423 | "version": 3
424 | },
425 | "file_extension": ".py",
426 | "mimetype": "text/x-python",
427 | "name": "python",
428 | "nbconvert_exporter": "python",
429 | "pygments_lexer": "ipython3",
430 | "version": "3.6.5"
431 | }
432 | },
433 | "nbformat": 4,
434 | "nbformat_minor": 2
435 | }
436 |
--------------------------------------------------------------------------------
/CNN_metafeature.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [
8 | {
9 | "name": "stderr",
10 | "output_type": "stream",
11 | "text": [
12 | "/home/enjoy/anaconda3/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88\n",
13 | " return f(*args, **kwds)\n",
14 | "/home/enjoy/anaconda3/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88\n",
15 | " return f(*args, **kwds)\n"
16 | ]
17 | }
18 | ],
19 | "source": [
20 | "import pandas as pd\n",
21 | "import numpy as np\n",
22 | "import os\n",
23 | "from tqdm import tqdm\n",
24 | "from sklearn.preprocessing import LabelBinarizer,LabelEncoder"
25 | ]
26 | },
27 | {
28 | "cell_type": "code",
29 | "execution_count": 2,
30 | "metadata": {},
31 | "outputs": [],
32 | "source": [
33 | "path = '../input/'\n",
34 | "train = pd.read_csv(path + 'final_train.csv')\n",
35 | "test = pd.read_csv(path + 'final_test.csv')"
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": 3,
41 | "metadata": {},
42 | "outputs": [
43 | {
44 | "data": {
45 | "text/plain": [
46 | "((89806693, 5), (79288375, 4))"
47 | ]
48 | },
49 | "execution_count": 3,
50 | "metadata": {},
51 | "output_type": "execute_result"
52 | }
53 | ],
54 | "source": [
55 | "train.shape,test.shape"
56 | ]
57 | },
58 | {
59 | "cell_type": "code",
60 | "execution_count": 4,
61 | "metadata": {},
62 | "outputs": [],
63 | "source": [
64 | "unique_api = train['api'].unique()"
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "execution_count": 5,
70 | "metadata": {},
71 | "outputs": [
72 | {
73 | "data": {
74 | "text/plain": [
75 | "(295,)"
76 | ]
77 | },
78 | "execution_count": 5,
79 | "metadata": {},
80 | "output_type": "execute_result"
81 | }
82 | ],
83 | "source": [
84 | "unique_api.shape"
85 | ]
86 | },
87 | {
88 | "cell_type": "code",
89 | "execution_count": 6,
90 | "metadata": {},
91 | "outputs": [],
92 | "source": [
93 | "api2index = {item:(i+1) for i,item in enumerate(unique_api)}\n",
94 | "index2api = {(i+1):item for i,item in enumerate(unique_api)}"
95 | ]
96 | },
97 | {
98 | "cell_type": "code",
99 | "execution_count": 7,
100 | "metadata": {},
101 | "outputs": [],
102 | "source": [
103 | "train['api_idx'] = train['api'].map(api2index)"
104 | ]
105 | },
106 | {
107 | "cell_type": "code",
108 | "execution_count": 8,
109 | "metadata": {},
110 | "outputs": [],
111 | "source": [
112 | "test['api_idx'] = test['api'].map(api2index)"
113 | ]
114 | },
115 | {
116 | "cell_type": "code",
117 | "execution_count": 9,
118 | "metadata": {},
119 | "outputs": [],
120 | "source": [
121 | "train_period_idx = train.file_id.drop_duplicates(keep='first').index.values\n",
122 | "test_period_idx = test.file_id.drop_duplicates(keep='first').index.values"
123 | ]
124 | },
125 | {
126 | "cell_type": "code",
127 | "execution_count": 10,
128 | "metadata": {},
129 | "outputs": [],
130 | "source": [
131 | "def get_sequence(df,period_idx):\n",
132 | " seq_list = []\n",
133 | " for _id,begin in enumerate(period_idx[:-1]):\n",
134 | " seq_list.append(df.iloc[begin:period_idx[_id+1]]['api_idx'].values)\n",
135 | " seq_list.append(df.iloc[period_idx[-1]:]['api_idx'].values)\n",
136 | " return seq_list"
137 | ]
138 | },
139 | {
140 | "cell_type": "code",
141 | "execution_count": 11,
142 | "metadata": {},
143 | "outputs": [],
144 | "source": [
145 | "train_df = train[['file_id','label']].drop_duplicates(keep='first')\n",
146 | "test_df = test[['file_id']].drop_duplicates(keep='first')"
147 | ]
148 | },
149 | {
150 | "cell_type": "code",
151 | "execution_count": 12,
152 | "metadata": {},
153 | "outputs": [],
154 | "source": [
155 | "train_df['seq'] = get_sequence(train,train_period_idx)"
156 | ]
157 | },
158 | {
159 | "cell_type": "code",
160 | "execution_count": 13,
161 | "metadata": {},
162 | "outputs": [],
163 | "source": [
164 | "test_df['seq'] = get_sequence(test,test_period_idx)"
165 | ]
166 | },
167 | {
168 | "cell_type": "code",
169 | "execution_count": 14,
170 | "metadata": {},
171 | "outputs": [
172 | {
173 | "data": {
174 | "text/plain": [
175 | "(19350.97816934013, 6466.961402750774, 888204)"
176 | ]
177 | },
178 | "execution_count": 14,
179 | "metadata": {},
180 | "output_type": "execute_result"
181 | }
182 | ],
183 | "source": [
184 | "train_df.seq.map(lambda x: len(x)).std(),train_df.seq.map(lambda x: len(x)).mean(),train_df.seq.map(lambda x: len(x)).max()"
185 | ]
186 | },
187 | {
188 | "cell_type": "code",
189 | "execution_count": 15,
190 | "metadata": {},
191 | "outputs": [
192 | {
193 | "data": {
194 | "text/plain": [
195 | "(15911.676663585444, 6120.291393284446, 769590)"
196 | ]
197 | },
198 | "execution_count": 15,
199 | "metadata": {},
200 | "output_type": "execute_result"
201 | }
202 | ],
203 | "source": [
204 | "test_df.seq.map(lambda x: len(x)).std(),test_df.seq.map(lambda x: len(x)).mean(),test_df.seq.map(lambda x: len(x)).max()"
205 | ]
206 | },
207 | {
208 | "cell_type": "code",
209 | "execution_count": 16,
210 | "metadata": {},
211 | "outputs": [
212 | {
213 | "name": "stderr",
214 | "output_type": "stream",
215 | "text": [
216 | "/home/enjoy/anaconda3/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88\n",
217 | " return f(*args, **kwds)\n",
218 | "/home/enjoy/anaconda3/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n",
219 | " from ._conv import register_converters as _register_converters\n",
220 | "Using TensorFlow backend.\n"
221 | ]
222 | }
223 | ],
224 | "source": [
225 | "from keras.preprocessing.text import Tokenizer\n",
226 | "from keras.preprocessing.sequence import pad_sequences\n",
227 | "from keras.layers import Dense, Input, LSTM, Lambda, Embedding, Dropout, Activation,GRU,Bidirectional\n",
228 | "from keras.layers import Conv1D,Conv2D,MaxPooling2D,GlobalAveragePooling1D,GlobalMaxPooling1D, MaxPooling1D, Flatten\n",
229 | "from keras.layers import CuDNNGRU, CuDNNLSTM, SpatialDropout1D\n",
230 | "from keras.layers.merge import concatenate, Concatenate, Average, Dot, Maximum, Multiply, Subtract, average\n",
231 | "from keras.models import Model\n",
232 | "from keras.optimizers import RMSprop,Adam\n",
233 | "from keras.layers.normalization import BatchNormalization\n",
234 | "from keras.callbacks import EarlyStopping, ModelCheckpoint\n",
235 | "from keras.optimizers import SGD\n",
236 | "from keras import backend as K\n",
237 | "from sklearn.decomposition import TruncatedSVD, NMF, LatentDirichletAllocation\n",
238 | "from keras.layers import SpatialDropout1D\n",
239 | "from keras.layers.wrappers import Bidirectional"
240 | ]
241 | },
242 | {
243 | "cell_type": "code",
244 | "execution_count": 17,
245 | "metadata": {},
246 | "outputs": [],
247 | "source": [
248 | "def TextCNN(max_len,max_cnt,embed_size,\n",
249 | " num_filters,kernel_size,\n",
250 | " conv_action,\n",
251 | " mask_zero):\n",
252 | " _input = Input(shape=(max_len,), dtype='int32')\n",
253 | " _embed = Embedding(max_cnt, embed_size, input_length=max_len, mask_zero=mask_zero)(_input)\n",
254 | " _embed = SpatialDropout1D(0.15)(_embed)\n",
255 | " warppers = []\n",
256 | " for _kernel_size in kernel_size:\n",
257 | " conv1d = Conv1D(filters=num_filters, kernel_size=_kernel_size, activation=conv_action)(_embed)\n",
258 | " warppers.append(GlobalMaxPooling1D()(conv1d))\n",
259 | " \n",
260 | " fc = concatenate(warppers)\n",
261 | " fc = Dropout(0.5)(fc)\n",
262 | " #fc = BatchNormalization()(fc)\n",
263 | " fc = Dense(256, activation='relu')(fc)\n",
264 | " fc = Dropout(0.25)(fc)\n",
265 | " #fc = BatchNormalization()(fc) \n",
266 | " preds = Dense(8, activation = 'softmax')(fc)\n",
267 | " \n",
268 | " model = Model(inputs=_input, outputs=preds)\n",
269 | " \n",
270 | " model.compile(loss='categorical_crossentropy',\n",
271 | " optimizer='adam',\n",
272 | " metrics=['accuracy'])\n",
273 | " return model"
274 | ]
275 | },
276 | {
277 | "cell_type": "code",
278 | "execution_count": 18,
279 | "metadata": {},
280 | "outputs": [],
281 | "source": [
282 | "train_labels = pd.get_dummies(train_df.label).values\n",
283 | "train_seq = pad_sequences(train_df.seq.values, maxlen = 6000)\n",
284 | "test_seq = pad_sequences(test_df.seq.values, maxlen = 6000)"
285 | ]
286 | },
287 | {
288 | "cell_type": "code",
289 | "execution_count": 20,
290 | "metadata": {},
291 | "outputs": [],
292 | "source": [
293 | "skf = StratifiedKFold(train_labels, n_folds=5, shuffle=True, random_state=42)"
294 | ]
295 | },
296 | {
297 | "cell_type": "code",
298 | "execution_count": 21,
299 | "metadata": {},
300 | "outputs": [],
301 | "source": [
302 | "max_len = 6000\n",
303 | "max_cnt = 295\n",
304 | "embed_size = 256\n",
305 | "num_filters = 64\n",
306 | "kernel_size = [2,4,6,8,10,12,14]\n",
307 | "conv_action = 'relu'\n",
308 | "mask_zero = False\n",
309 | "TRAIN = True"
310 | ]
311 | },
312 | {
313 | "cell_type": "code",
314 | "execution_count": 25,
315 | "metadata": {
316 | "scrolled": false
317 | },
318 | "outputs": [
319 | {
320 | "name": "stdout",
321 | "output_type": "stream",
322 | "text": [
323 | "FOLD: \n",
324 | "2780 11107\n",
325 | "2780/2780 [==============================] - 5s 2ms/step\n",
326 | "12955/12955 [==============================] - 18s 1ms/step\n",
327 | "FOLD: \n",
328 | "2779 11108\n",
329 | "2779/2779 [==============================] - 4s 2ms/step\n",
330 | "12955/12955 [==============================] - 18s 1ms/step\n",
331 | "FOLD: \n",
332 | "2777 11110\n",
333 | "2777/2777 [==============================] - 4s 2ms/step\n",
334 | "12955/12955 [==============================] - 18s 1ms/step\n",
335 | "FOLD: \n",
336 | "2776 11111\n",
337 | "2776/2776 [==============================] - 4s 2ms/step\n",
338 | "12955/12955 [==============================] - 18s 1ms/step\n",
339 | "FOLD: \n",
340 | "2775 11112\n",
341 | "2775/2775 [==============================] - 5s 2ms/step\n",
342 | "12955/12955 [==============================] - 19s 1ms/step\n"
343 | ]
344 | }
345 | ],
346 | "source": [
347 | "import os\n",
348 | "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\"\n",
349 | "meta_train = np.zeros(shape = (len(train_seq),8))\n",
350 | "meta_test = np.zeros(shape = (len(test_seq),8))\n",
351 | "FLAG = False\n",
352 | "for i,(tr_ind,te_ind) in enumerate(skf):\n",
353 | " print('FOLD: '.format(i))\n",
354 | " print(len(te_ind),len(tr_ind))\n",
355 | " model = TextCNN(max_len,max_cnt,embed_size,num_filters,kernel_size,conv_action,mask_zero)\n",
356 | " model_name = 'benchmark_textcnn_fold_'+str(i)\n",
357 | " X_train,X_train_label = train_seq[tr_ind],train_labels[tr_ind]\n",
358 | " X_val,X_val_label = train_seq[te_ind],train_labels[te_ind]\n",
359 | " \n",
360 | " model = TextCNN(max_len,max_cnt,embed_size,\n",
361 | " num_filters,kernel_size,\n",
362 | " conv_action,\n",
363 | " mask_zero)\n",
364 | " \n",
365 | " model_save_path = '../model_weight_final/%s_%s.hdf5'%(model_name,embed_size)\n",
366 | " early_stopping =EarlyStopping(monitor='val_loss', patience=3)\n",
367 | " model_checkpoint = ModelCheckpoint(model_save_path, save_best_only=True, save_weights_only=True)\n",
368 | " if TRAIN and FLAG:\n",
369 | " model.fit(X_train,X_train_label,\n",
370 | " validation_data=(X_val,X_val_label),\n",
371 | " epochs=100,batch_size=64,\n",
372 | " shuffle=True,\n",
373 | " callbacks=[early_stopping,model_checkpoint]\n",
374 | " )\n",
375 | " \n",
376 | " model.load_weights(model_save_path)\n",
377 | " pred_val = model.predict(X_val,batch_size=128,verbose=1)\n",
378 | " pred_test = model.predict(test_seq,batch_size=128,verbose=1)\n",
379 | " \n",
380 | " meta_train[te_ind] = pred_val\n",
381 | " meta_test += pred_test\n",
382 | " K.clear_session()\n",
383 | "meta_test /= 5.0\n"
384 | ]
385 | },
386 | {
387 | "cell_type": "code",
388 | "execution_count": 37,
389 | "metadata": {},
390 | "outputs": [],
391 | "source": [
392 | "pd.to_pickle(meta_train,'../train_meta_cnn.pkl')\n",
393 | "pd.to_pickle(meta_test,'../test_meta_cnn.pkl')"
394 | ]
395 | },
396 | {
397 | "cell_type": "code",
398 | "execution_count": 38,
399 | "metadata": {},
400 | "outputs": [
401 | {
402 | "data": {
403 | "text/plain": [
404 | "'/home/enjoy/tianchi/安全赛复赛/src'"
405 | ]
406 | },
407 | "execution_count": 38,
408 | "metadata": {},
409 | "output_type": "execute_result"
410 | }
411 | ],
412 | "source": [
413 | "%pwd"
414 | ]
415 | },
416 | {
417 | "cell_type": "code",
418 | "execution_count": null,
419 | "metadata": {},
420 | "outputs": [],
421 | "source": []
422 | }
423 | ],
424 | "metadata": {
425 | "kernelspec": {
426 | "display_name": "Python 3",
427 | "language": "python",
428 | "name": "python3"
429 | },
430 | "language_info": {
431 | "codemirror_mode": {
432 | "name": "ipython",
433 | "version": 3
434 | },
435 | "file_extension": ".py",
436 | "mimetype": "text/x-python",
437 | "name": "python",
438 | "nbconvert_exporter": "python",
439 | "pygments_lexer": "ipython3",
440 | "version": "3.6.5"
441 | }
442 | },
443 | "nbformat": 4,
444 | "nbformat_minor": 2
445 | }
446 |
--------------------------------------------------------------------------------
/CNN_metafeature_dilated.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [
8 | {
9 | "name": "stderr",
10 | "output_type": "stream",
11 | "text": [
12 | "/home/enjoy/anaconda3/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88\n",
13 | " return f(*args, **kwds)\n",
14 | "/home/enjoy/anaconda3/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88\n",
15 | " return f(*args, **kwds)\n"
16 | ]
17 | }
18 | ],
19 | "source": [
20 | "import pandas as pd\n",
21 | "import numpy as np\n",
22 | "import os\n",
23 | "from tqdm import tqdm\n",
24 | "from sklearn.preprocessing import LabelBinarizer,LabelEncoder"
25 | ]
26 | },
27 | {
28 | "cell_type": "code",
29 | "execution_count": 2,
30 | "metadata": {},
31 | "outputs": [],
32 | "source": [
33 | "path = '../input/'\n",
34 | "train = pd.read_csv(path + 'final_train.csv')\n",
35 | "test = pd.read_csv(path + 'final_test.csv')"
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": 3,
41 | "metadata": {},
42 | "outputs": [
43 | {
44 | "data": {
45 | "text/plain": [
46 | "((89806693, 5), (79288375, 4))"
47 | ]
48 | },
49 | "execution_count": 3,
50 | "metadata": {},
51 | "output_type": "execute_result"
52 | }
53 | ],
54 | "source": [
55 | "train.shape,test.shape"
56 | ]
57 | },
58 | {
59 | "cell_type": "code",
60 | "execution_count": 4,
61 | "metadata": {},
62 | "outputs": [],
63 | "source": [
64 | "unique_api = train['api'].unique()"
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "execution_count": 5,
70 | "metadata": {},
71 | "outputs": [
72 | {
73 | "data": {
74 | "text/plain": [
75 | "(295,)"
76 | ]
77 | },
78 | "execution_count": 5,
79 | "metadata": {},
80 | "output_type": "execute_result"
81 | }
82 | ],
83 | "source": [
84 | "unique_api.shape"
85 | ]
86 | },
87 | {
88 | "cell_type": "code",
89 | "execution_count": 6,
90 | "metadata": {},
91 | "outputs": [],
92 | "source": [
93 | "api2index = {item:(i+1) for i,item in enumerate(unique_api)}\n",
94 | "index2api = {(i+1):item for i,item in enumerate(unique_api)}"
95 | ]
96 | },
97 | {
98 | "cell_type": "code",
99 | "execution_count": 7,
100 | "metadata": {},
101 | "outputs": [],
102 | "source": [
103 | "train['api_idx'] = train['api'].map(api2index)"
104 | ]
105 | },
106 | {
107 | "cell_type": "code",
108 | "execution_count": 8,
109 | "metadata": {},
110 | "outputs": [],
111 | "source": [
112 | "test['api_idx'] = test['api'].map(api2index)"
113 | ]
114 | },
115 | {
116 | "cell_type": "code",
117 | "execution_count": 9,
118 | "metadata": {},
119 | "outputs": [],
120 | "source": [
121 | "train_period_idx = train.file_id.drop_duplicates(keep='first').index.values\n",
122 | "test_period_idx = test.file_id.drop_duplicates(keep='first').index.values"
123 | ]
124 | },
125 | {
126 | "cell_type": "code",
127 | "execution_count": 10,
128 | "metadata": {},
129 | "outputs": [],
130 | "source": [
131 | "def get_sequence(df,period_idx):\n",
132 | " seq_list = []\n",
133 | " for _id,begin in enumerate(period_idx[:-1]):\n",
134 | " seq_list.append(df.iloc[begin:period_idx[_id+1]]['api_idx'].values)\n",
135 | " seq_list.append(df.iloc[period_idx[-1]:]['api_idx'].values)\n",
136 | " return seq_list"
137 | ]
138 | },
139 | {
140 | "cell_type": "code",
141 | "execution_count": 11,
142 | "metadata": {},
143 | "outputs": [],
144 | "source": [
145 | "train_df = train[['file_id','label']].drop_duplicates(keep='first')\n",
146 | "test_df = test[['file_id']].drop_duplicates(keep='first')"
147 | ]
148 | },
149 | {
150 | "cell_type": "code",
151 | "execution_count": 12,
152 | "metadata": {},
153 | "outputs": [],
154 | "source": [
155 | "train_df['seq'] = get_sequence(train,train_period_idx)"
156 | ]
157 | },
158 | {
159 | "cell_type": "code",
160 | "execution_count": 13,
161 | "metadata": {},
162 | "outputs": [],
163 | "source": [
164 | "test_df['seq'] = get_sequence(test,test_period_idx)"
165 | ]
166 | },
167 | {
168 | "cell_type": "code",
169 | "execution_count": 14,
170 | "metadata": {},
171 | "outputs": [
172 | {
173 | "data": {
174 | "text/plain": [
175 | "(19350.97816934013, 6466.961402750774, 888204)"
176 | ]
177 | },
178 | "execution_count": 14,
179 | "metadata": {},
180 | "output_type": "execute_result"
181 | }
182 | ],
183 | "source": [
184 | "train_df.seq.map(lambda x: len(x)).std(),train_df.seq.map(lambda x: len(x)).mean(),train_df.seq.map(lambda x: len(x)).max()"
185 | ]
186 | },
187 | {
188 | "cell_type": "code",
189 | "execution_count": 15,
190 | "metadata": {},
191 | "outputs": [
192 | {
193 | "data": {
194 | "text/plain": [
195 | "(15911.676663585444, 6120.291393284446, 769590)"
196 | ]
197 | },
198 | "execution_count": 15,
199 | "metadata": {},
200 | "output_type": "execute_result"
201 | }
202 | ],
203 | "source": [
204 | "test_df.seq.map(lambda x: len(x)).std(),test_df.seq.map(lambda x: len(x)).mean(),test_df.seq.map(lambda x: len(x)).max()"
205 | ]
206 | },
207 | {
208 | "cell_type": "code",
209 | "execution_count": 16,
210 | "metadata": {},
211 | "outputs": [
212 | {
213 | "name": "stderr",
214 | "output_type": "stream",
215 | "text": [
216 | "/home/enjoy/anaconda3/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88\n",
217 | " return f(*args, **kwds)\n",
218 | "/home/enjoy/anaconda3/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n",
219 | " from ._conv import register_converters as _register_converters\n",
220 | "Using TensorFlow backend.\n"
221 | ]
222 | }
223 | ],
224 | "source": [
225 | "from keras.preprocessing.text import Tokenizer\n",
226 | "from keras.preprocessing.sequence import pad_sequences\n",
227 | "from keras.layers import Dense, Input, LSTM, Lambda, Embedding, Dropout, Activation,GRU,Bidirectional\n",
228 | "from keras.layers import Conv1D,Conv2D,MaxPooling2D,GlobalAveragePooling1D,GlobalMaxPooling1D, MaxPooling1D, Flatten\n",
229 | "from keras.layers import CuDNNGRU, CuDNNLSTM, SpatialDropout1D\n",
230 | "from keras.layers.merge import concatenate, Concatenate, Average, Dot, Maximum, Multiply, Subtract, average\n",
231 | "from keras.models import Model\n",
232 | "from keras.optimizers import RMSprop,Adam\n",
233 | "from keras.layers.normalization import BatchNormalization\n",
234 | "from keras.callbacks import EarlyStopping, ModelCheckpoint\n",
235 | "from keras.optimizers import SGD\n",
236 | "from keras import backend as K\n",
237 | "from sklearn.decomposition import TruncatedSVD, NMF, LatentDirichletAllocation\n",
238 | "from keras.layers import SpatialDropout1D\n",
239 | "from keras.layers.wrappers import Bidirectional"
240 | ]
241 | },
242 | {
243 | "cell_type": "code",
244 | "execution_count": 17,
245 | "metadata": {},
246 | "outputs": [],
247 | "source": [
248 | "def TextCNN(max_len,max_cnt,embed_size,\n",
249 | " num_filters,kernel_size,\n",
250 | " conv_action,\n",
251 | " mask_zero):\n",
252 | " _input = Input(shape=(max_len,), dtype='int32')\n",
253 | " _embed = Embedding(max_cnt, embed_size, input_length=max_len, mask_zero=mask_zero)(_input)\n",
254 | " _embed = SpatialDropout1D(0.25)(_embed)\n",
255 | " warppers = []\n",
256 | " for _kernel_size in kernel_size:\n",
257 | " for dilated_rate in [1,2,3,4]:\n",
258 | " conv1d = Conv1D(filters=num_filters, kernel_size=_kernel_size, activation=conv_action, dilation_rate=dilated_rate)(_embed)\n",
259 | " warppers.append(GlobalMaxPooling1D()(conv1d))\n",
260 | " \n",
261 | " fc = concatenate(warppers)\n",
262 | " fc = Dropout(0.5)(fc)\n",
263 | " #fc = BatchNormalization()(fc)\n",
264 | " fc = Dense(256, activation='relu')(fc)\n",
265 | " fc = Dropout(0.25)(fc)\n",
266 | " #fc = BatchNormalization()(fc) \n",
267 | " preds = Dense(8, activation = 'softmax')(fc)\n",
268 | " \n",
269 | " model = Model(inputs=_input, outputs=preds)\n",
270 | " \n",
271 | " model.compile(loss='categorical_crossentropy',\n",
272 | " optimizer='adam',\n",
273 | " metrics=['accuracy'])\n",
274 | " return model"
275 | ]
276 | },
277 | {
278 | "cell_type": "code",
279 | "execution_count": 18,
280 | "metadata": {},
281 | "outputs": [],
282 | "source": [
283 | "train_labels = pd.get_dummies(train_df.label).values\n",
284 | "train_seq = pad_sequences(train_df.seq.values, maxlen = 6000)\n",
285 | "test_seq = pad_sequences(test_df.seq.values, maxlen = 6000)"
286 | ]
287 | },
288 | {
289 | "cell_type": "code",
290 | "execution_count": 20,
291 | "metadata": {},
292 | "outputs": [],
293 | "source": [
294 | "skf = StratifiedKFold(train_labels, n_folds=5, shuffle=True, random_state=42)"
295 | ]
296 | },
297 | {
298 | "cell_type": "code",
299 | "execution_count": 21,
300 | "metadata": {},
301 | "outputs": [],
302 | "source": [
303 | "max_len = 6000\n",
304 | "max_cnt = 295\n",
305 | "embed_size = 256\n",
306 | "num_filters = 64\n",
307 | "kernel_size = [2,3,4,5]\n",
308 | "conv_action = 'relu'\n",
309 | "mask_zero = False\n",
310 | "TRAIN = True"
311 | ]
312 | },
313 | {
314 | "cell_type": "code",
315 | "execution_count": 22,
316 | "metadata": {},
317 | "outputs": [
318 | {
319 | "name": "stdout",
320 | "output_type": "stream",
321 | "text": [
322 | "FOLD: \n",
323 | "2780 11107\n",
324 | "FOLD: \n",
325 | "2779 11108\n",
326 | "FOLD: \n",
327 | "2777 11110\n",
328 | "FOLD: \n",
329 | "2776 11111\n",
330 | "Train on 11111 samples, validate on 2776 samples\n",
331 | "Epoch 1/100\n",
332 | "11111/11111 [==============================] - 142s 13ms/step - loss: 0.9257 - acc: 0.6915 - val_loss: 0.4994 - val_acc: 0.8505\n",
333 | "Epoch 2/100\n",
334 | "11111/11111 [==============================] - 116s 10ms/step - loss: 0.5334 - acc: 0.8335 - val_loss: 0.4226 - val_acc: 0.8689\n",
335 | "Epoch 3/100\n",
336 | "11111/11111 [==============================] - 113s 10ms/step - loss: 0.4632 - acc: 0.8550 - val_loss: 0.3850 - val_acc: 0.8761\n",
337 | "Epoch 4/100\n",
338 | "11111/11111 [==============================] - 113s 10ms/step - loss: 0.4105 - acc: 0.8701 - val_loss: 0.3808 - val_acc: 0.8754\n",
339 | "Epoch 5/100\n",
340 | "11111/11111 [==============================] - 113s 10ms/step - loss: 0.3784 - acc: 0.8763 - val_loss: 0.3663 - val_acc: 0.8829\n",
341 | "Epoch 6/100\n",
342 | "11111/11111 [==============================] - 113s 10ms/step - loss: 0.3536 - acc: 0.8840 - val_loss: 0.3467 - val_acc: 0.8872\n",
343 | "Epoch 7/100\n",
344 | "11111/11111 [==============================] - 113s 10ms/step - loss: 0.3420 - acc: 0.8903 - val_loss: 0.3426 - val_acc: 0.8909\n",
345 | "Epoch 8/100\n",
346 | "11111/11111 [==============================] - 113s 10ms/step - loss: 0.3284 - acc: 0.8941 - val_loss: 0.3377 - val_acc: 0.8945\n",
347 | "Epoch 9/100\n",
348 | "11111/11111 [==============================] - 113s 10ms/step - loss: 0.3133 - acc: 0.8936 - val_loss: 0.3380 - val_acc: 0.8945\n",
349 | "Epoch 10/100\n",
350 | "11111/11111 [==============================] - 113s 10ms/step - loss: 0.3034 - acc: 0.8971 - val_loss: 0.3415 - val_acc: 0.8923\n",
351 | "Epoch 11/100\n",
352 | "11111/11111 [==============================] - 113s 10ms/step - loss: 0.2916 - acc: 0.9007 - val_loss: 0.3232 - val_acc: 0.8995\n",
353 | "Epoch 12/100\n",
354 | "11111/11111 [==============================] - 113s 10ms/step - loss: 0.2765 - acc: 0.9058 - val_loss: 0.3402 - val_acc: 0.8934\n",
355 | "Epoch 13/100\n",
356 | "11111/11111 [==============================] - 113s 10ms/step - loss: 0.2657 - acc: 0.9086 - val_loss: 0.3294 - val_acc: 0.8984\n",
357 | "Epoch 14/100\n",
358 | "11111/11111 [==============================] - 113s 10ms/step - loss: 0.2620 - acc: 0.9079 - val_loss: 0.3411 - val_acc: 0.8977\n",
359 | "FOLD: \n",
360 | "2775 11112\n",
361 | "Train on 11112 samples, validate on 2775 samples\n",
362 | "Epoch 1/100\n",
363 | "11112/11112 [==============================] - 116s 10ms/step - loss: 0.9019 - acc: 0.7001 - val_loss: 0.4956 - val_acc: 0.8436\n",
364 | "Epoch 2/100\n",
365 | "11112/11112 [==============================] - 113s 10ms/step - loss: 0.5189 - acc: 0.8322 - val_loss: 0.4210 - val_acc: 0.8695\n",
366 | "Epoch 3/100\n",
367 | "11112/11112 [==============================] - 113s 10ms/step - loss: 0.4525 - acc: 0.8543 - val_loss: 0.3906 - val_acc: 0.8778\n",
368 | "Epoch 4/100\n",
369 | "11112/11112 [==============================] - 113s 10ms/step - loss: 0.4038 - acc: 0.8721 - val_loss: 0.3832 - val_acc: 0.8674\n",
370 | "Epoch 5/100\n",
371 | "11112/11112 [==============================] - 113s 10ms/step - loss: 0.3802 - acc: 0.8790 - val_loss: 0.3687 - val_acc: 0.8836\n",
372 | "Epoch 6/100\n",
373 | "11112/11112 [==============================] - 113s 10ms/step - loss: 0.3563 - acc: 0.8813 - val_loss: 0.3739 - val_acc: 0.8807\n",
374 | "Epoch 7/100\n",
375 | "11112/11112 [==============================] - 113s 10ms/step - loss: 0.3277 - acc: 0.8909 - val_loss: 0.3597 - val_acc: 0.8840\n",
376 | "Epoch 8/100\n",
377 | "11112/11112 [==============================] - 113s 10ms/step - loss: 0.3239 - acc: 0.8935 - val_loss: 0.3534 - val_acc: 0.8901\n",
378 | "Epoch 9/100\n",
379 | "11112/11112 [==============================] - 113s 10ms/step - loss: 0.3061 - acc: 0.8954 - val_loss: 0.3581 - val_acc: 0.8861\n",
380 | "Epoch 10/100\n",
381 | "11112/11112 [==============================] - 113s 10ms/step - loss: 0.2973 - acc: 0.8994 - val_loss: 0.3528 - val_acc: 0.8901\n",
382 | "Epoch 11/100\n",
383 | "11112/11112 [==============================] - 113s 10ms/step - loss: 0.2875 - acc: 0.9035 - val_loss: 0.3537 - val_acc: 0.8847\n",
384 | "Epoch 12/100\n",
385 | "11112/11112 [==============================] - 113s 10ms/step - loss: 0.2736 - acc: 0.9060 - val_loss: 0.3596 - val_acc: 0.8908\n",
386 | "Epoch 13/100\n",
387 | "11112/11112 [==============================] - 113s 10ms/step - loss: 0.2613 - acc: 0.9078 - val_loss: 0.3521 - val_acc: 0.8908\n",
388 | "Epoch 14/100\n",
389 | "11112/11112 [==============================] - 113s 10ms/step - loss: 0.2639 - acc: 0.9055 - val_loss: 0.3457 - val_acc: 0.8926\n",
390 | "Epoch 15/100\n",
391 | "11112/11112 [==============================] - 113s 10ms/step - loss: 0.2514 - acc: 0.9121 - val_loss: 0.3702 - val_acc: 0.8865\n",
392 | "Epoch 16/100\n",
393 | "11112/11112 [==============================] - 113s 10ms/step - loss: 0.2497 - acc: 0.9112 - val_loss: 0.3684 - val_acc: 0.8905\n",
394 | "Epoch 17/100\n",
395 | "11112/11112 [==============================] - 113s 10ms/step - loss: 0.2366 - acc: 0.9147 - val_loss: 0.3700 - val_acc: 0.8908\n"
396 | ]
397 | }
398 | ],
399 | "source": [
400 | "import os\n",
401 | "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"3\"\n",
402 | "meta_train = np.zeros(shape = (len(train_seq),8))\n",
403 | "meta_test = np.zeros(shape = (len(test_seq),8))\n",
404 | "FLAG = False\n",
405 | "for i,(tr_ind,te_ind) in enumerate(skf):\n",
406 | " if i in [3,4]:\n",
407 | " FLAG = True\n",
408 | " print('FOLD: '.format(i))\n",
409 | " print(len(te_ind),len(tr_ind))\n",
410 | " model = TextCNN(max_len,max_cnt,embed_size,num_filters,kernel_size,conv_action,mask_zero)\n",
411 | " model_name = 'benchmark_dilated_textcnn_fold_'+str(i)\n",
412 | " X_train,X_train_label = train_seq[tr_ind],train_labels[tr_ind]\n",
413 | " X_val,X_val_label = train_seq[te_ind],train_labels[te_ind]\n",
414 | " \n",
415 | " model = TextCNN(max_len,max_cnt,embed_size,\n",
416 | " num_filters,kernel_size,\n",
417 | " conv_action,\n",
418 | " mask_zero)\n",
419 | " \n",
420 | " model_save_path = '../model_weight_final/%s_%s.hdf5'%(model_name,embed_size)\n",
421 | " early_stopping =EarlyStopping(monitor='val_loss', patience=3)\n",
422 | " model_checkpoint = ModelCheckpoint(model_save_path, save_best_only=True, save_weights_only=True)\n",
423 | " if TRAIN and FLAG:\n",
424 | " model.fit(X_train,X_train_label,\n",
425 | " validation_data=(X_val,X_val_label),\n",
426 | " epochs=100,batch_size=64,\n",
427 | " shuffle=True,\n",
428 | " callbacks=[early_stopping,model_checkpoint]\n",
429 | " )\n",
430 | " \n",
431 | " #model.load_weights(model_save_path)\n",
432 | " #pred_val = model.predict(X_val,batch_size=128)\n",
433 | " #pred_test = model.predict(test_seq,batch_size=128)\n",
434 | " \n",
435 | " #meta_train[te_ind] = pred_val\n",
436 | " #meta_test += pred_test\n",
437 | " FLAG = False\n",
438 | " #K.clear_session()\n",
439 | "#meta_test /= 5.0\n"
440 | ]
441 | },
442 | {
443 | "cell_type": "code",
444 | "execution_count": null,
445 | "metadata": {},
446 | "outputs": [],
447 | "source": [
448 | "pd.to_pickle(meta_train,'../feature_final/train_meta_dilated_cnn.pkl')\n",
449 | "pd.to_pickle(meta_test,'../feature_final/test_meta_dilated_cnn.pkl')"
450 | ]
451 | },
452 | {
453 | "cell_type": "code",
454 | "execution_count": null,
455 | "metadata": {},
456 | "outputs": [],
457 | "source": [
458 | "print '1322'"
459 | ]
460 | },
461 | {
462 | "cell_type": "code",
463 | "execution_count": null,
464 | "metadata": {},
465 | "outputs": [],
466 | "source": []
467 | }
468 | ],
469 | "metadata": {
470 | "kernelspec": {
471 | "display_name": "Python 3",
472 | "language": "python",
473 | "name": "python3"
474 | },
475 | "language_info": {
476 | "codemirror_mode": {
477 | "name": "ipython",
478 | "version": 3
479 | },
480 | "file_extension": ".py",
481 | "mimetype": "text/x-python",
482 | "name": "python",
483 | "nbconvert_exporter": "python",
484 | "pygments_lexer": "ipython3",
485 | "version": "3.6.5"
486 | }
487 | },
488 | "nbformat": 4,
489 | "nbformat_minor": 2
490 | }
491 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # First-Place Code for the 3rd Alibaba Cloud Security Algorithm Challenge
2 | ## [Competition link](https://tianchi.aliyun.com/competition/introduction.htm?spm=5176.100066.0.0.6acd33afwZ9hM7&raceId=231668)
3 | 
4 | ## Slides: 上地西二旗人民.pptx
5 | 
6 | ## Run the notebooks in the following order (see the sketches below this list)
7 | * main_train.ipynb: build the features for the train set
8 | * main_test.ipynb: build the features for the test set
9 | * gene_npy.ipynb: convert the feature tables into .npy files
10 | * lgb_meta_features.ipynb: generate the LightGBM meta-features
11 | * CNN_metafeature.ipynb: generate the CNN meta-features
12 | * CNN_metafeature_dilated.ipynb: generate the dilated-CNN meta-features
13 | * pickle_pre.ipynb: work around the .pkl incompatibility between Python 2 and Python 3
14 | * submit.ipynb: stack the meta-features and produce the final submission
15 |
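16 | Three short sketches follow. None of them are part of the original code; they only illustrate how the pieces above fit together.
17 | 
18 | A minimal runner (assumes `jupyter nbconvert` is installed) that executes the notebooks in the order listed above:
19 | 
20 | ```python
21 | # Hypothetical helper script, not in this repo: run the pipeline notebooks in order.
22 | import subprocess
23 | 
24 | NOTEBOOKS = [
25 |     "main_train.ipynb",
26 |     "main_test.ipynb",
27 |     "gene_npy.ipynb",
28 |     "lgb_meta_features.ipynb",
29 |     "CNN_metafeature.ipynb",
30 |     "CNN_metafeature_dilated.ipynb",
31 |     "pickle_pre.ipynb",
32 |     "submit.ipynb",
33 | ]
34 | 
35 | for nb in NOTEBOOKS:
36 |     # --inplace re-executes each notebook and writes its outputs back into the file
37 |     subprocess.run(
38 |         ["jupyter", "nbconvert", "--to", "notebook", "--execute", "--inplace", nb],
39 |         check=True,
40 |     )
41 | ```
42 | 
43 | On the pickle_pre.ipynb step: pickles written under Python 2 often fail to load under Python 3 because of byte-string decoding. A minimal sketch of the usual workaround (an assumption about the problem being solved, not a copy of the notebook):
44 | 
45 | ```python
46 | import pickle
47 | 
48 | import pandas as pd
49 | 
50 | 
51 | def convert_py2_pickle(src, dst):
52 |     # encoding='latin1' lets Python 3 decode byte strings written by Python 2
53 |     with open(src, "rb") as f:
54 |         obj = pickle.load(f, encoding="latin1")
55 |     pd.to_pickle(obj, dst)  # re-save so a plain pd.read_pickle works under Python 3
56 | 
57 | 
58 | # Hypothetical usage:
59 | # convert_py2_pickle("../train_meta_lgb_1.pkl", "../train_meta_lgb_1.pkl")
60 | ```
61 | 
62 | The meta-feature notebooks all follow the same 5-fold stacking pattern. A simplified sketch, where `model_factory` is a hypothetical callable returning a fresh sklearn-style classifier, and `sklearn.model_selection` stands in for the deprecated `sklearn.cross_validation` used in the notebooks:
63 | 
64 | ```python
65 | import numpy as np
66 | from sklearn.model_selection import StratifiedKFold
67 | 
68 | 
69 | def make_meta_features(model_factory, X, y, X_test, n_classes=8, seed=42):
70 |     skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
71 |     meta_train = np.zeros((len(X), n_classes))
72 |     meta_test = np.zeros((len(X_test), n_classes))
73 |     for tr_ind, te_ind in skf.split(X, y):
74 |         model = model_factory()
75 |         model.fit(X[tr_ind], y[tr_ind])
76 |         meta_train[te_ind] = model.predict_proba(X[te_ind])  # out-of-fold predictions
77 |         meta_test += model.predict_proba(X_test)
78 |     return meta_train, meta_test / 5.0  # test meta-features: average over the folds
79 | ```
80 | 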
--------------------------------------------------------------------------------
/gene_npy.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 3,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import pandas as pd\n",
10 | "import numpy as np"
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": 4,
16 | "metadata": {},
17 | "outputs": [],
18 | "source": [
19 | "test_data_2gram_final = pd.read_csv('./test_data_2gram_final.csv')\n",
20 | "train_data_2gram_final = pd.read_csv('./train_data_2gram_final.csv')"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": 6,
26 | "metadata": {},
27 | "outputs": [],
28 | "source": [
29 | "cols = [item for item in train_data_2gram_final.columns if item not in ['label']]\n",
30 | "np.save('../X_test.npy',test_data_2gram_final[cols].values)\n",
31 | "np.save('../X_train.npy',train_data_2gram_final[cols].values)\n",
32 | "np.save('../labels.npy',train_data_2gram_final['label'].values)"
33 | ]
34 | },
35 | {
36 | "cell_type": "code",
37 | "execution_count": 8,
38 | "metadata": {},
39 | "outputs": [
40 | {
41 | "data": {
42 | "text/plain": [
43 | "((13887, 3252), (12955, 3251))"
44 | ]
45 | },
46 | "execution_count": 8,
47 | "metadata": {},
48 | "output_type": "execute_result"
49 | }
50 | ],
51 | "source": [
52 | "train_data_2gram_final.shape,test_data_2gram_final.shape"
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": null,
58 | "metadata": {},
59 | "outputs": [],
60 | "source": []
61 | }
62 | ],
63 | "metadata": {
64 | "kernelspec": {
65 | "display_name": "Python 3",
66 | "language": "python",
67 | "name": "python3"
68 | },
69 | "language_info": {
70 | "codemirror_mode": {
71 | "name": "ipython",
72 | "version": 3
73 | },
74 | "file_extension": ".py",
75 | "mimetype": "text/x-python",
76 | "name": "python",
77 | "nbconvert_exporter": "python",
78 | "pygments_lexer": "ipython3",
79 | "version": "3.6.5"
80 | }
81 | },
82 | "nbformat": 4,
83 | "nbformat_minor": 2
84 | }
85 |
--------------------------------------------------------------------------------
/lgb_meta_features.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 3,
6 | "metadata": {},
7 | "outputs": [
8 | {
9 | "name": "stderr",
10 | "output_type": "stream",
11 | "text": [
12 | "/home/user/anaconda2/lib/python2.7/site-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n",
13 | " \"This module will be removed in 0.20.\", DeprecationWarning)\n"
14 | ]
15 | }
16 | ],
17 | "source": [
18 | "import pandas as pd\n",
19 | "import numpy as np\n",
20 | "import lightgbm as lgb\n",
21 | "from sklearn.cross_validation import train_test_split\n",
22 | "import gc\n",
23 | "from sklearn.preprocessing import OneHotEncoder\n",
24 | "from sklearn.cross_validation import StratifiedKFold\n",
25 | "import datetime"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 4,
31 | "metadata": {},
32 | "outputs": [
33 | {
34 | "name": "stdout",
35 | "output_type": "stream",
36 | "text": [
37 | "cur time = 2018/09/21 18:54:08\n",
38 | "(13887, 3251) (12955, 3251)\n",
39 | "cur time = 2018/09/21 18:54:08\n"
40 | ]
41 | }
42 | ],
43 | "source": [
44 | "print('cur time = ' + str(datetime.datetime.now().strftime(\"%Y/%m/%d %H:%M:%S\")))\n",
45 | "train = np.load('../X_train.npy')\n",
46 | "test = np.load('../X_test.npy')\n",
47 | "train_labels = np.load('../labels.npy')\n",
48 | "print train.shape,test.shape\n",
49 | "print('cur time = ' + str(datetime.datetime.now().strftime(\"%Y/%m/%d %H:%M:%S\")))"
50 | ]
51 | },
52 | {
53 | "cell_type": "code",
54 | "execution_count": null,
55 | "metadata": {},
56 | "outputs": [],
57 | "source": [
58 | "skf = StratifiedKFold(train_labels, n_folds=5, shuffle=True, random_state=42)\n",
59 | "\n",
60 | "meta_train = np.zeros(shape = (len(train),8))\n",
61 | "meta_test = np.zeros(shape = (len(test),8))\n",
62 | "\n",
63 | "for i,(tr_ind,te_ind) in enumerate(skf):\n",
64 | " print 'FOLD: ',i\n",
65 | " print len(te_ind),len(tr_ind)\n",
66 | " X_train,X_train_label = train[tr_ind],train_labels[tr_ind]\n",
67 | " X_val,X_val_label = train[te_ind],train_labels[te_ind]\n",
68 | " dtrain = lgb.Dataset(X_train,X_train_label) \n",
69 | " dval = lgb.Dataset(X_val,X_val_label, reference = dtrain) \n",
70 | " params = {\n",
71 | " 'task':'train', \n",
72 | " 'boosting_type':'gbdt',\n",
73 | " 'num_leaves': 15,\n",
74 | " 'objective': 'multiclass',\n",
75 | " 'num_class':8,\n",
76 | " 'learning_rate': 0.05,\n",
77 | " 'feature_fraction': 0.85,\n",
78 | " 'subsample':0.85,\n",
79 | " 'num_threads': 32,\n",
80 | " 'metric':'multi_logloss',\n",
81 | " 'seed':100\n",
82 | " } \n",
83 | " model = lgb.train(params, dtrain, num_boost_round=100000,valid_sets=[dtrain,dval],verbose_eval=100, early_stopping_rounds=100) \n",
84 | " pred_val = model.predict(X_val)\n",
85 | " pred_test = model.predict(test)\n",
86 | " \n",
87 | " meta_train[te_ind] = pred_val\n",
88 | " meta_test += pred_test\n",
89 | "meta_test /= 5.0"
90 | ]
91 | },
92 | {
93 | "cell_type": "code",
94 | "execution_count": null,
95 | "metadata": {},
96 | "outputs": [],
97 | "source": [
98 | "pd.to_pickle(meta_train,'../train_meta_lgb_1.pkl')\n",
99 | "pd.to_pickle(meta_test,'../test_meta_lgb_1.pkl')"
100 | ]
101 | },
102 | {
103 | "cell_type": "code",
104 | "execution_count": 5,
105 | "metadata": {
106 | "scrolled": false
107 | },
108 | "outputs": [
109 | {
110 | "name": "stdout",
111 | "output_type": "stream",
112 | "text": [
113 | "FOLD: 0\n",
114 | "2780 11107\n",
115 | "Training until validation scores don't improve for 100 rounds.\n",
116 | "[100]\ttraining's multi_logloss: 0.105693\tvalid_1's multi_logloss: 0.290438\n",
117 | "[200]\ttraining's multi_logloss: 0.0243107\tvalid_1's multi_logloss: 0.28446\n",
118 | "Early stopping, best iteration is:\n",
119 | "[145]\ttraining's multi_logloss: 0.0517928\tvalid_1's multi_logloss: 0.277273\n",
120 | "FOLD: 1\n",
121 | "2779 11108\n",
122 | "Training until validation scores don't improve for 100 rounds.\n",
123 | "[100]\ttraining's multi_logloss: 0.108126\tvalid_1's multi_logloss: 0.284527\n",
124 | "[200]\ttraining's multi_logloss: 0.0254294\tvalid_1's multi_logloss: 0.283195\n",
125 | "Early stopping, best iteration is:\n",
126 | "[139]\ttraining's multi_logloss: 0.0583621\tvalid_1's multi_logloss: 0.273231\n",
127 | "FOLD: 2\n",
128 | "2777 11110\n",
129 | "Training until validation scores don't improve for 100 rounds.\n",
130 | "[100]\ttraining's multi_logloss: 0.107591\tvalid_1's multi_logloss: 0.271276\n",
131 | "[200]\ttraining's multi_logloss: 0.0256978\tvalid_1's multi_logloss: 0.267876\n",
132 | "Early stopping, best iteration is:\n",
133 | "[151]\ttraining's multi_logloss: 0.0490566\tvalid_1's multi_logloss: 0.258754\n",
134 | "FOLD: 3\n",
135 | "2776 11111\n",
136 | "Training until validation scores don't improve for 100 rounds.\n",
137 | "[100]\ttraining's multi_logloss: 0.109872\tvalid_1's multi_logloss: 0.2752\n",
138 | "[200]\ttraining's multi_logloss: 0.0267958\tvalid_1's multi_logloss: 0.266528\n",
139 | "Early stopping, best iteration is:\n",
140 | "[153]\ttraining's multi_logloss: 0.0492415\tvalid_1's multi_logloss: 0.260417\n",
141 | "FOLD: 4\n",
142 | "2775 11112\n",
143 | "Training until validation scores don't improve for 100 rounds.\n",
144 | "[100]\ttraining's multi_logloss: 0.108239\tvalid_1's multi_logloss: 0.286993\n",
145 | "[200]\ttraining's multi_logloss: 0.0260953\tvalid_1's multi_logloss: 0.276078\n",
146 | "Early stopping, best iteration is:\n",
147 | "[155]\ttraining's multi_logloss: 0.0471788\tvalid_1's multi_logloss: 0.270497\n"
148 | ]
149 | }
150 | ],
151 | "source": [
152 | "skf = StratifiedKFold(train_labels, n_folds=5, shuffle=True, random_state=42)\n",
153 | "\n",
154 | "meta_train = np.zeros(shape = (len(train),8))\n",
155 | "meta_test = np.zeros(shape = (len(test),8))\n",
156 | "\n",
157 | "for i,(tr_ind,te_ind) in enumerate(skf):\n",
158 | " print 'FOLD: ',i\n",
159 | " print len(te_ind),len(tr_ind)\n",
160 | " X_train,X_train_label = train[tr_ind],train_labels[tr_ind]\n",
161 | " X_val,X_val_label = train[te_ind],train_labels[te_ind]\n",
162 | " dtrain = lgb.Dataset(X_train,X_train_label) \n",
163 | " dval = lgb.Dataset(X_val,X_val_label, reference = dtrain) \n",
164 | " params = {\n",
165 | " 'task':'train', \n",
166 | " 'boosting_type':'gbdt',\n",
167 | " 'num_leaves': 31,\n",
168 | " 'objective': 'multiclass',\n",
169 | " 'num_class':8,\n",
170 | " 'learning_rate': 0.05,\n",
171 | " 'feature_fraction': 0.85,\n",
172 | " 'subsample':0.85,\n",
173 | " 'num_threads': 32,\n",
174 | " 'metric':'multi_logloss',\n",
175 | " 'seed':100\n",
176 | " } \n",
177 | " model = lgb.train(params, dtrain, num_boost_round=100000,valid_sets=[dtrain,dval],verbose_eval=100, early_stopping_rounds=100) \n",
178 | " pred_val = model.predict(X_val)\n",
179 | " pred_test = model.predict(test)\n",
180 | " \n",
181 | " meta_train[te_ind] = pred_val\n",
182 | " meta_test += pred_test\n",
183 | "meta_test /= 5.0"
184 | ]
185 | },
186 | {
187 | "cell_type": "code",
188 | "execution_count": 6,
189 | "metadata": {},
190 | "outputs": [],
191 | "source": [
192 | "pd.to_pickle(meta_train,'../train_meta_lgb_2.pkl')\n",
193 | "pd.to_pickle(meta_test,'../test_meta_lgb_2.pkl')"
194 | ]
195 | },
196 | {
197 | "cell_type": "code",
198 | "execution_count": 7,
199 | "metadata": {},
200 | "outputs": [
201 | {
202 | "name": "stdout",
203 | "output_type": "stream",
204 | "text": [
205 | "FOLD: 0\n",
206 | "2780 11107\n",
207 | "Training until validation scores don't improve for 100 rounds.\n",
208 | "[100]\ttraining's multi_logloss: 0.126813\tvalid_1's multi_logloss: 0.299223\n",
209 | "[200]\ttraining's multi_logloss: 0.0319222\tvalid_1's multi_logloss: 0.278803\n",
210 | "Early stopping, best iteration is:\n",
211 | "[161]\ttraining's multi_logloss: 0.0520005\tvalid_1's multi_logloss: 0.276196\n",
212 | "FOLD: 1\n",
213 | "2779 11108\n",
214 | "Training until validation scores don't improve for 100 rounds.\n",
215 | "[100]\ttraining's multi_logloss: 0.128834\tvalid_1's multi_logloss: 0.292494\n",
216 | "[200]\ttraining's multi_logloss: 0.0332951\tvalid_1's multi_logloss: 0.277843\n",
217 | "Early stopping, best iteration is:\n",
218 | "[153]\ttraining's multi_logloss: 0.0597567\tvalid_1's multi_logloss: 0.272742\n",
219 | "FOLD: 2\n",
220 | "2777 11110\n",
221 | "Training until validation scores don't improve for 100 rounds.\n",
222 | "[100]\ttraining's multi_logloss: 0.128497\tvalid_1's multi_logloss: 0.279648\n",
223 | "[200]\ttraining's multi_logloss: 0.0334364\tvalid_1's multi_logloss: 0.263845\n",
224 | "Early stopping, best iteration is:\n",
225 | "[159]\ttraining's multi_logloss: 0.0551787\tvalid_1's multi_logloss: 0.25859\n",
226 | "FOLD: 3\n",
227 | "2776 11111\n",
228 | "Training until validation scores don't improve for 100 rounds.\n",
229 | "[100]\ttraining's multi_logloss: 0.130386\tvalid_1's multi_logloss: 0.286192\n",
230 | "[200]\ttraining's multi_logloss: 0.0347223\tvalid_1's multi_logloss: 0.263253\n",
231 | "Early stopping, best iteration is:\n",
232 | "[169]\ttraining's multi_logloss: 0.0501232\tvalid_1's multi_logloss: 0.260649\n",
233 | "FOLD: 4\n",
234 | "2775 11112\n",
235 | "Training until validation scores don't improve for 100 rounds.\n",
236 | "[100]\ttraining's multi_logloss: 0.129009\tvalid_1's multi_logloss: 0.296055\n",
237 | "[200]\ttraining's multi_logloss: 0.0340881\tvalid_1's multi_logloss: 0.274158\n",
238 | "Early stopping, best iteration is:\n",
239 | "[173]\ttraining's multi_logloss: 0.0469372\tvalid_1's multi_logloss: 0.272973\n"
240 | ]
241 | }
242 | ],
243 | "source": [
244 | "skf = StratifiedKFold(train_labels, n_folds=5, shuffle=True, random_state=42)\n",
245 | "\n",
246 | "meta_train = np.zeros(shape = (len(train),8))\n",
247 | "meta_test = np.zeros(shape = (len(test),8))\n",
248 | "\n",
249 | "for i,(tr_ind,te_ind) in enumerate(skf):\n",
250 | " print 'FOLD: ',i\n",
251 | " print len(te_ind),len(tr_ind)\n",
252 | " X_train,X_train_label = train[tr_ind],train_labels[tr_ind]\n",
253 | " X_val,X_val_label = train[te_ind],train_labels[te_ind]\n",
254 | " dtrain = lgb.Dataset(X_train,X_train_label) \n",
255 | " dval = lgb.Dataset(X_val,X_val_label, reference = dtrain) \n",
256 | " params = {\n",
257 | " 'task':'train', \n",
258 | " 'boosting_type':'gbdt',\n",
259 | " 'num_leaves': 31,\n",
260 | " 'objective': 'multiclass',\n",
261 | " 'num_class':8,\n",
262 | " 'learning_rate': 0.045,\n",
263 | " 'feature_fraction': 0.8,\n",
264 | " 'subsample':0.8,\n",
265 | " 'num_threads': 32,\n",
266 | " 'metric':'multi_logloss',\n",
267 | " 'seed':100\n",
268 | " } \n",
269 | " model = lgb.train(params, dtrain, num_boost_round=100000,valid_sets=[dtrain,dval],verbose_eval=100, early_stopping_rounds=100) \n",
270 | " pred_val = model.predict(X_val)\n",
271 | " pred_test = model.predict(test)\n",
272 | " \n",
273 | " meta_train[te_ind] = pred_val\n",
274 | " meta_test += pred_test\n",
275 | "meta_test /= 5.0"
276 | ]
277 | },
278 | {
279 | "cell_type": "code",
280 | "execution_count": 8,
281 | "metadata": {},
282 | "outputs": [],
283 | "source": [
284 | "pd.to_pickle(meta_train,'../train_meta_lgb_3.pkl')\n",
285 | "pd.to_pickle(meta_test,'../test_meta_lgb_3.pkl')"
286 | ]
287 | },
288 | {
289 | "cell_type": "code",
290 | "execution_count": null,
291 | "metadata": {},
292 | "outputs": [],
293 | "source": []
294 | },
295 | {
296 | "cell_type": "code",
297 | "execution_count": 9,
298 | "metadata": {},
299 | "outputs": [
300 | {
301 | "name": "stdout",
302 | "output_type": "stream",
303 | "text": [
304 | "FOLD: 0\n",
305 | "2780 11107\n",
306 | "Training until validation scores don't improve for 100 rounds.\n",
307 | "[100]\ttraining's multi_logloss: 0.0771172\tvalid_1's multi_logloss: 0.289138\n",
308 | "[200]\ttraining's multi_logloss: 0.00851115\tvalid_1's multi_logloss: 0.298243\n",
309 | "Early stopping, best iteration is:\n",
310 | "[133]\ttraining's multi_logloss: 0.0357694\tvalid_1's multi_logloss: 0.27818\n",
311 | "FOLD: 1\n",
312 | "2779 11108\n",
313 | "Training until validation scores don't improve for 100 rounds.\n",
314 | "[100]\ttraining's multi_logloss: 0.0780999\tvalid_1's multi_logloss: 0.289059\n",
315 | "[200]\ttraining's multi_logloss: 0.00887645\tvalid_1's multi_logloss: 0.298286\n",
316 | "Early stopping, best iteration is:\n",
317 | "[134]\ttraining's multi_logloss: 0.0357742\tvalid_1's multi_logloss: 0.278663\n",
318 | "FOLD: 2\n",
319 | "2777 11110\n",
320 | "Training until validation scores don't improve for 100 rounds.\n",
321 | "[100]\ttraining's multi_logloss: 0.0784245\tvalid_1's multi_logloss: 0.274011\n",
322 | "[200]\ttraining's multi_logloss: 0.00891692\tvalid_1's multi_logloss: 0.282485\n",
323 | "Early stopping, best iteration is:\n",
324 | "[134]\ttraining's multi_logloss: 0.0356565\tvalid_1's multi_logloss: 0.263027\n",
325 | "FOLD: 3\n",
326 | "2776 11111\n",
327 | "Training until validation scores don't improve for 100 rounds.\n",
328 | "[100]\ttraining's multi_logloss: 0.0795669\tvalid_1's multi_logloss: 0.280272\n",
329 | "[200]\ttraining's multi_logloss: 0.00927117\tvalid_1's multi_logloss: 0.284248\n",
330 | "Early stopping, best iteration is:\n",
331 | "[135]\ttraining's multi_logloss: 0.0357068\tvalid_1's multi_logloss: 0.267277\n",
332 | "FOLD: 4\n",
333 | "2775 11112\n",
334 | "Training until validation scores don't improve for 100 rounds.\n",
335 | "[100]\ttraining's multi_logloss: 0.0782005\tvalid_1's multi_logloss: 0.287082\n",
336 | "[200]\ttraining's multi_logloss: 0.00896856\tvalid_1's multi_logloss: 0.294814\n",
337 | "Early stopping, best iteration is:\n",
338 | "[129]\ttraining's multi_logloss: 0.0400827\tvalid_1's multi_logloss: 0.277252\n"
339 | ]
340 | }
341 | ],
342 | "source": [
343 | "skf = StratifiedKFold(train_labels, n_folds=5, shuffle=True, random_state=42)\n",
344 | "\n",
345 | "meta_train = np.zeros(shape = (len(train),8))\n",
346 | "meta_test = np.zeros(shape = (len(test),8))\n",
347 | "\n",
348 | "for i,(tr_ind,te_ind) in enumerate(skf):\n",
349 | " print 'FOLD: ',i\n",
350 | " print len(te_ind),len(tr_ind)\n",
351 | " X_train,X_train_label = train[tr_ind],train_labels[tr_ind]\n",
352 | " X_val,X_val_label = train[te_ind],train_labels[te_ind]\n",
353 | " dtrain = lgb.Dataset(X_train,X_train_label) \n",
354 | " dval = lgb.Dataset(X_val,X_val_label, reference = dtrain) \n",
355 | " params = {\n",
356 | " 'task':'train', \n",
357 | " 'boosting_type':'gbdt',\n",
358 | " 'num_leaves': 63,\n",
359 | " 'objective': 'multiclass',\n",
360 | " 'num_class':8,\n",
361 | " 'learning_rate': 0.045,\n",
362 | " 'feature_fraction': 0.5,\n",
363 | " 'subsample':0.7,\n",
364 | " 'num_threads': 54,\n",
365 | " 'metric':'multi_logloss',\n",
366 | " 'seed':100\n",
367 | " } \n",
368 | " model = lgb.train(params, dtrain, num_boost_round=100000,valid_sets=[dtrain,dval],verbose_eval=100, early_stopping_rounds=100) \n",
369 | " pred_val = model.predict(X_val)\n",
370 | " pred_test = model.predict(test)\n",
371 | " \n",
372 | " meta_train[te_ind] = pred_val\n",
373 | " meta_test += pred_test\n",
374 | "meta_test /= 5.0\n",
375 | "\n",
376 | "pd.to_pickle(meta_train,'../train_meta_lgb_4.pkl')\n",
377 | "pd.to_pickle(meta_test,'../test_meta_lgb_4.pkl')"
378 | ]
379 | },
380 | {
381 | "cell_type": "code",
382 | "execution_count": null,
383 | "metadata": {},
384 | "outputs": [],
385 | "source": []
386 | }
387 | ],
388 | "metadata": {
389 | "kernelspec": {
390 | "display_name": "Python 3",
391 | "language": "python",
392 | "name": "python3"
393 | },
394 | "language_info": {
395 | "codemirror_mode": {
396 | "name": "ipython",
397 | "version": 3
398 | },
399 | "file_extension": ".py",
400 | "mimetype": "text/x-python",
401 | "name": "python",
402 | "nbconvert_exporter": "python",
403 | "pygments_lexer": "ipython3",
404 | "version": "3.6.5"
405 | }
406 | },
407 | "nbformat": 4,
408 | "nbformat_minor": 2
409 | }
410 |
--------------------------------------------------------------------------------
/main_test.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# 工具包导入&数据读取\n",
8 | "## 工具包导入"
9 | ]
10 | },
11 | {
12 | "cell_type": "code",
13 | "execution_count": 1,
14 | "metadata": {},
15 | "outputs": [
16 | {
17 | "name": "stderr",
18 | "output_type": "stream",
19 | "text": [
20 | "/home/zhongrunxing/anaconda2/lib/python2.7/site-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n",
21 | " \"This module will be removed in 0.20.\", DeprecationWarning)\n"
22 | ]
23 | }
24 | ],
25 | "source": [
26 | "import pandas as pd\n",
27 | "import numpy as np\n",
28 | "import seaborn as sns\n",
29 | "import matplotlib.pyplot as plt\n",
30 | "#import lightgbm as lgb\n",
31 | "from sklearn.cross_validation import train_test_split\n",
32 | "import gc\n",
33 | "from sklearn.preprocessing import OneHotEncoder\n",
34 | "%matplotlib inline "
35 | ]
36 | },
37 | {
38 | "cell_type": "code",
39 | "execution_count": 2,
40 | "metadata": {},
41 | "outputs": [
42 | {
43 | "data": {
44 | "text/plain": [
45 | "u'/mnt/disk0/home/zhongrunxing/jupyter_code/tianchi_safe'"
46 | ]
47 | },
48 | "execution_count": 2,
49 | "metadata": {},
50 | "output_type": "execute_result"
51 | }
52 | ],
53 | "source": [
54 | "%pwd"
55 | ]
56 | },
57 | {
58 | "cell_type": "code",
59 | "execution_count": null,
60 | "metadata": {},
61 | "outputs": [],
62 | "source": []
63 | },
64 | {
65 | "cell_type": "markdown",
66 | "metadata": {},
67 | "source": [
68 | "## 数据读取\n",
69 | "- 为了方便分析,我们读取3000万条数据进行处理"
70 | ]
71 | },
72 | {
73 | "cell_type": "code",
74 | "execution_count": 3,
75 | "metadata": {},
76 | "outputs": [],
77 | "source": [
78 | "path = '/home/zhongrunxing/jupyter_code/tianchi_safe/input/'\n",
79 | "#train = pd.read_csv(path + 'final_train.csv',nrows=1000000)\n",
80 | "#train = pd.read_csv(path + 'final_test.csv',nrows=1000000)\n"
81 | ]
82 | },
83 | {
84 | "cell_type": "code",
85 | "execution_count": 4,
86 | "metadata": {},
87 | "outputs": [
88 | {
89 | "name": "stdout",
90 | "output_type": "stream",
91 | "text": [
92 | "(79288375, 4)\n"
93 | ]
94 | }
95 | ],
96 | "source": [
97 | "train = pd.read_csv(path + 'final_test.csv')\n",
98 | "print(train.shape)"
99 | ]
100 | },
101 | {
102 | "cell_type": "code",
103 | "execution_count": null,
104 | "metadata": {},
105 | "outputs": [],
106 | "source": []
107 | },
108 | {
109 | "cell_type": "markdown",
110 | "metadata": {},
111 | "source": [
112 | "# 特征工程 & 验证结果(1-Gram)"
113 | ]
114 | },
115 | {
116 | "cell_type": "code",
117 | "execution_count": 5,
118 | "metadata": {},
119 | "outputs": [],
120 | "source": [
121 | "# train_data = train[['file_id','label']].drop_duplicates()\n",
122 | "# train_data.head()\n",
123 | "# train_data['label'].value_counts()"
124 | ]
125 | },
126 | {
127 | "cell_type": "code",
128 | "execution_count": 6,
129 | "metadata": {},
130 | "outputs": [
131 | {
132 | "data": {
133 | "text/html": [
134 | "\n",
135 | "\n",
148 | "
\n",
149 | " \n",
150 | " \n",
151 | " | \n",
152 | " file_id | \n",
153 | "
\n",
154 | " \n",
155 | " \n",
156 | " \n",
157 | " 0 | \n",
158 | " 1 | \n",
159 | "
\n",
160 | " \n",
161 | " 97 | \n",
162 | " 2 | \n",
163 | "
\n",
164 | " \n",
165 | " 1458 | \n",
166 | " 3 | \n",
167 | "
\n",
168 | " \n",
169 | " 1474 | \n",
170 | " 4 | \n",
171 | "
\n",
172 | " \n",
173 | " 1667 | \n",
174 | " 5 | \n",
175 | "
\n",
176 | " \n",
177 | "
\n",
178 | "
"
179 | ],
180 | "text/plain": [
181 | " file_id\n",
182 | "0 1\n",
183 | "97 2\n",
184 | "1458 3\n",
185 | "1474 4\n",
186 | "1667 5"
187 | ]
188 | },
189 | "execution_count": 6,
190 | "metadata": {},
191 | "output_type": "execute_result"
192 | }
193 | ],
194 | "source": [
195 | "train_data = train[['file_id']].drop_duplicates()\n",
196 | "train_data.head()"
197 | ]
198 | },
199 | {
200 | "cell_type": "code",
201 | "execution_count": null,
202 | "metadata": {},
203 | "outputs": [],
204 | "source": []
205 | },
206 | {
207 | "cell_type": "markdown",
208 | "metadata": {},
209 | "source": [
210 | "## 全局特征:\n",
211 | "- File_id (Api): count,nunique\n",
212 | "- File_id (Tid): count,nunique,max,min,quantile(20,40,50,60,80),std,range\n",
213 | "- File_id (Return Value): count,nunique,max,min,quantile(20,40,50,60,80),std,range\n",
214 | "- File_id (Index): count,nunique,max,min,quantile(20,40,50,60,80),std,range"
215 | ]
216 | },
217 | {
218 | "cell_type": "markdown",
219 | "metadata": {},
220 | "source": [
221 | "### File_id (Api): count,nunique"
222 | ]
223 | },
224 | {
225 | "cell_type": "code",
226 | "execution_count": 7,
227 | "metadata": {},
228 | "outputs": [
229 | {
230 | "data": {
231 | "text/html": [
232 | "\n",
233 | "\n",
246 | "
\n",
247 | " \n",
248 | " \n",
249 | " | \n",
250 | " file_id | \n",
251 | " api | \n",
252 | " tid | \n",
253 | " index | \n",
254 | "
\n",
255 | " \n",
256 | " \n",
257 | " \n",
258 | " 0 | \n",
259 | " 1 | \n",
260 | " RegOpenKeyExA | \n",
261 | " 2332 | \n",
262 | " 0 | \n",
263 | "
\n",
264 | " \n",
265 | " 1 | \n",
266 | " 1 | \n",
267 | " CopyFileA | \n",
268 | " 2332 | \n",
269 | " 1 | \n",
270 | "
\n",
271 | " \n",
272 | " 2 | \n",
273 | " 1 | \n",
274 | " OpenSCManagerA | \n",
275 | " 2332 | \n",
276 | " 2 | \n",
277 | "
\n",
278 | " \n",
279 | " 3 | \n",
280 | " 1 | \n",
281 | " CreateServiceA | \n",
282 | " 2332 | \n",
283 | " 3 | \n",
284 | "
\n",
285 | " \n",
286 | " 4 | \n",
287 | " 1 | \n",
288 | " RegOpenKeyExA | \n",
289 | " 2468 | \n",
290 | " 0 | \n",
291 | "
\n",
292 | " \n",
293 | "
\n",
294 | "
"
295 | ],
296 | "text/plain": [
297 | " file_id api tid index\n",
298 | "0 1 RegOpenKeyExA 2332 0\n",
299 | "1 1 CopyFileA 2332 1\n",
300 | "2 1 OpenSCManagerA 2332 2\n",
301 | "3 1 CreateServiceA 2332 3\n",
302 | "4 1 RegOpenKeyExA 2468 0"
303 | ]
304 | },
305 | "execution_count": 7,
306 | "metadata": {},
307 | "output_type": "execute_result"
308 | }
309 | ],
310 | "source": [
311 | "train.head()"
312 | ]
313 | },
314 | {
315 | "cell_type": "code",
316 | "execution_count": 8,
317 | "metadata": {},
318 | "outputs": [
319 | {
320 | "name": "stdout",
321 | "output_type": "stream",
322 | "text": [
323 | "count\n"
324 | ]
325 | },
326 | {
327 | "name": "stderr",
328 | "output_type": "stream",
329 | "text": [
330 | "/mnt/disk0/home/zhongrunxing/anaconda2/lib/python2.7/site-packages/ipykernel_launcher.py:4: FutureWarning: using a dict on a Series for aggregation\n",
331 | "is deprecated and will be removed in a future version\n",
332 | " after removing the cwd from sys.path.\n"
333 | ]
334 | },
335 | {
336 | "name": "stdout",
337 | "output_type": "stream",
338 | "text": [
339 | "nunique\n"
340 | ]
341 | }
342 | ],
343 | "source": [
344 | "api_opt = ['count','nunique'] \n",
345 | "for opt in api_opt:\n",
346 | " print(opt)\n",
347 | " tmp = train.groupby(['file_id'])['api'].agg({'fileid_api_' + opt: opt}).reset_index() \n",
348 | " train_data = pd.merge(train_data,tmp,how='left', on='file_id') "
349 | ]
350 | },
351 | {
352 | "cell_type": "code",
353 | "execution_count": 9,
354 | "metadata": {},
355 | "outputs": [
356 | {
357 | "data": {
358 | "text/html": [
359 | "\n",
360 | "\n",
373 | "
\n",
374 | " \n",
375 | " \n",
376 | " | \n",
377 | " file_id | \n",
378 | " fileid_api_count | \n",
379 | " fileid_api_nunique | \n",
380 | "
\n",
381 | " \n",
382 | " \n",
383 | " \n",
384 | " 0 | \n",
385 | " 1 | \n",
386 | " 97 | \n",
387 | " 15 | \n",
388 | "
\n",
389 | " \n",
390 | " 1 | \n",
391 | " 2 | \n",
392 | " 1361 | \n",
393 | " 40 | \n",
394 | "
\n",
395 | " \n",
396 | " 2 | \n",
397 | " 3 | \n",
398 | " 16 | \n",
399 | " 9 | \n",
400 | "
\n",
401 | " \n",
402 | " 3 | \n",
403 | " 4 | \n",
404 | " 193 | \n",
405 | " 34 | \n",
406 | "
\n",
407 | " \n",
408 | " 4 | \n",
409 | " 5 | \n",
410 | " 803 | \n",
411 | " 34 | \n",
412 | "
\n",
413 | " \n",
414 | "
\n",
415 | "
"
416 | ],
417 | "text/plain": [
418 | " file_id fileid_api_count fileid_api_nunique\n",
419 | "0 1 97 15\n",
420 | "1 2 1361 40\n",
421 | "2 3 16 9\n",
422 | "3 4 193 34\n",
423 | "4 5 803 34"
424 | ]
425 | },
426 | "execution_count": 9,
427 | "metadata": {},
428 | "output_type": "execute_result"
429 | }
430 | ],
431 | "source": [
432 | "train_data.head()"
433 | ]
434 | },
435 | {
436 | "cell_type": "markdown",
437 | "metadata": {},
438 | "source": [
439 | "### File_id (Tid): count,nunique,max,min,quantile(20,40,50,60,80),std,range"
440 | ]
441 | },
442 | {
443 | "cell_type": "code",
444 | "execution_count": 10,
445 | "metadata": {},
446 | "outputs": [
447 | {
448 | "name": "stdout",
449 | "output_type": "stream",
450 | "text": [
451 | "count\n"
452 | ]
453 | },
454 | {
455 | "name": "stderr",
456 | "output_type": "stream",
457 | "text": [
458 | "/mnt/disk0/home/zhongrunxing/anaconda2/lib/python2.7/site-packages/ipykernel_launcher.py:4: FutureWarning: using a dict on a Series for aggregation\n",
459 | "is deprecated and will be removed in a future version\n",
460 | " after removing the cwd from sys.path.\n"
461 | ]
462 | },
463 | {
464 | "name": "stdout",
465 | "output_type": "stream",
466 | "text": [
467 | "nunique\n",
468 | "max\n",
469 | "min\n",
470 | "median\n",
471 | "std\n"
472 | ]
473 | }
474 | ],
475 | "source": [
476 | "tid_opt = ['count','nunique','max','min','median','std'] \n",
477 | "for opt in tid_opt:\n",
478 | " print(opt)\n",
479 | " tmp = train.groupby(['file_id'])['tid'].agg({'fileid_tid_' + opt: opt}).reset_index() \n",
480 | " train_data = pd.merge(train_data,tmp,how='left', on='file_id') "
481 | ]
482 | },
483 | {
484 | "cell_type": "code",
485 | "execution_count": 11,
486 | "metadata": {},
487 | "outputs": [],
488 | "source": [
489 | "secs = [0.2,0.4,0.6,0.8]\n",
490 | "for sec in secs: \n",
491 | " train_data['fileid_tid_quantile_' + str(sec * 100)] = train.groupby(['file_id'])['tid'].quantile(sec).values\n",
492 | " \n",
493 | "train_data['fileid_tid_range'] = train.groupby(['file_id'])['tid'].quantile(0.975).values - train.groupby(['file_id'])['tid'].quantile(0.0125).values"
494 | ]
495 | },
496 | {
497 | "cell_type": "code",
498 | "execution_count": null,
499 | "metadata": {},
500 | "outputs": [],
501 | "source": []
502 | },
503 | {
504 | "cell_type": "markdown",
505 | "metadata": {},
506 | "source": [
507 | "### File_id (Index): count,nunique,max,min,quantile(20,40,50,60,80),std,range"
508 | ]
509 | },
510 | {
511 | "cell_type": "code",
512 | "execution_count": 12,
513 | "metadata": {},
514 | "outputs": [
515 | {
516 | "name": "stdout",
517 | "output_type": "stream",
518 | "text": [
519 | "count\n"
520 | ]
521 | },
522 | {
523 | "name": "stderr",
524 | "output_type": "stream",
525 | "text": [
526 | "/mnt/disk0/home/zhongrunxing/anaconda2/lib/python2.7/site-packages/ipykernel_launcher.py:4: FutureWarning: using a dict on a Series for aggregation\n",
527 | "is deprecated and will be removed in a future version\n",
528 | " after removing the cwd from sys.path.\n"
529 | ]
530 | },
531 | {
532 | "name": "stdout",
533 | "output_type": "stream",
534 | "text": [
535 | "nunique\n",
536 | "max\n",
537 | "min\n",
538 | "median\n",
539 | "std\n"
540 | ]
541 | }
542 | ],
543 | "source": [
544 | "index_opt = ['count','nunique','max','min','median','std'] \n",
545 | "for opt in index_opt:\n",
546 | " print(opt)\n",
547 | " tmp = train.groupby(['file_id'])['index'].agg({'fileid_index_' + opt: opt}).reset_index() \n",
548 | " train_data = pd.merge(train_data,tmp,how='left', on='file_id') "
549 | ]
550 | },
551 | {
552 | "cell_type": "code",
553 | "execution_count": 13,
554 | "metadata": {},
555 | "outputs": [],
556 | "source": [
557 | "secs = [0.2,0.4,0.6,0.8]\n",
558 | "for sec in secs: \n",
559 | " train_data['fileid_index_quantile_' + str(sec * 100)] = train.groupby(['file_id'])['index'].quantile(sec).values\n",
560 | " \n",
561 | "train_data['fileid_index_range'] = train.groupby(['file_id'])['index'].quantile(0.975).values - train.groupby(['file_id'])['index'].quantile(0.0125).values"
562 | ]
563 | },
564 | {
565 | "cell_type": "markdown",
566 | "metadata": {},
567 | "source": [
568 | "### 全局特征的线下验证 ( 0.0969482)"
569 | ]
570 | },
571 | {
572 | "cell_type": "markdown",
573 | "metadata": {},
574 | "source": [
575 | "#### 评估指标"
576 | ]
577 | },
578 | {
579 | "cell_type": "code",
580 | "execution_count": 14,
581 | "metadata": {},
582 | "outputs": [],
583 | "source": [
584 | "def lgb_logloss(preds,data):\n",
585 | " labels_ = data.get_label()\n",
586 | " classes_ = np.unique(labels_) \n",
587 | " preds_prob = []\n",
588 | " for i in range(len(classes_)):\n",
589 | " preds_prob.append(preds[i*len(labels_):(i+1) * len(labels_)])\n",
590 | " preds_prob_ = np.vstack(preds_prob) \n",
591 | " \n",
592 | " loss = [] \n",
593 | " for i in range(preds_prob_.shape[1]): # 样本个数\n",
594 | " sum_ = 0 \n",
595 | " for j in range(preds_prob_.shape[0]): #类别个数\n",
596 | " pred = preds_prob_[j,i] # 第i个样本预测为第j类的概率\n",
597 | " if j == labels_[i]:\n",
598 | " sum_ += np.log(pred)\n",
599 | " else:\n",
600 | " sum_ += np.log(1 - pred) \n",
601 | " \n",
602 | " loss.append(sum_) \n",
603 | " \n",
604 | " return 'loss is: ' ,-1 * (np.sum(loss) / preds_prob_.shape[1]),False"
605 | ]
606 | },
607 | {
608 | "cell_type": "markdown",
609 | "metadata": {},
610 | "source": [
611 | "#### 训练特征 & 标签"
612 | ]
613 | },
614 | {
615 | "cell_type": "code",
616 | "execution_count": 15,
617 | "metadata": {},
618 | "outputs": [],
619 | "source": [
620 | "train_features = [col for col in train_data.columns if col!='label' and col!='file_id']\n",
621 | "train_label = 'label'"
622 | ]
623 | },
624 | {
625 | "cell_type": "code",
626 | "execution_count": 16,
627 | "metadata": {},
628 | "outputs": [],
629 | "source": [
630 | "# train_X, test_X, train_Y, test_Y = train_test_split( train_data[train_features],train_data[train_label].values, test_size = 0.33) \n",
631 | "# del _\n",
632 | "# gc.collect()\n",
633 | "\n",
634 | "# train_ind = train_X.index\n",
635 | "# test_ind = test_X.index"
636 | ]
637 | },
638 | {
639 | "cell_type": "code",
640 | "execution_count": null,
641 | "metadata": {},
642 | "outputs": [],
643 | "source": []
644 | },
645 | {
646 | "cell_type": "code",
647 | "execution_count": null,
648 | "metadata": {},
649 | "outputs": [],
650 | "source": []
651 | },
652 | {
653 | "cell_type": "code",
654 | "execution_count": 17,
655 | "metadata": {
656 | "scrolled": true
657 | },
658 | "outputs": [],
659 | "source": [
660 | "# dtrain = lgb.Dataset(train_X,train_Y) \n",
661 | "# dval = lgb.Dataset(test_X,test_Y, reference = dtrain) \n",
662 | "\n",
663 | "# params = {\n",
664 | "# 'task':'train', \n",
665 | "# 'num_leaves': 255,\n",
666 | "# 'objective': 'multiclass',\n",
667 | "# 'num_class':8,\n",
668 | "# #'min_data_in_leaf': 40,\n",
669 | "# 'min_data_in_leaf': 1,\n",
670 | "# 'learning_rate': 0.05,\n",
671 | "# 'feature_fraction': 0.85,\n",
672 | "# 'bagging_fraction': 0.9,\n",
673 | "# 'bagging_freq': 5, \n",
674 | "# 'max_bin':128,\n",
675 | "# 'num_threads': 10,\n",
676 | "# 'random_state':100\n",
677 | "# } \n",
678 | "# lgb_model_0_order = lgb.train(params, dtrain, num_boost_round=500,valid_sets=[dtrain,dval], early_stopping_rounds=50, feval=lgb_logloss) "
679 | ]
680 | },
681 | {
682 | "cell_type": "markdown",
683 | "metadata": {},
684 | "source": [
685 | "### 全局特征扩充\n",
686 | "- File_id + return_value分段:计数"
687 | ]
688 | },
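689 | {
690 | "cell_type": "code",
691 | "execution_count": null,
692 | "metadata": {},
693 | "outputs": [],
694 | "source": [
695 | "# Minimal sketch of the binned return_value counts described above. Illustrative only:\n",
696 | "# final_test.csv carries no return_value column and the bin edges are assumptions,\n",
697 | "# so this cell is a no-op when the column is absent.\n",
698 | "if 'return_value' in train.columns:\n",
699 | "    rv_bins = [-np.inf, -1, 0, 1, 255, np.inf]  # hypothetical bin edges\n",
700 | "    train['rv_bin'] = pd.cut(train['return_value'], bins=rv_bins).astype(str)\n",
701 | "    rv_cnt = train.groupby(['file_id', 'rv_bin'])['api'].count().unstack('rv_bin')\n",
702 | "    rv_cnt.columns = ['fileid_rv_bin_count_' + str(c) for c in rv_cnt.columns]\n",
703 | "    train_data = pd.merge(train_data, rv_cnt.reset_index(), how='left', on='file_id')"
704 | ]
705 | },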
689 | {
690 | "cell_type": "markdown",
691 | "metadata": {},
692 | "source": [
693 | "## 局部组合特征(展开形式)\n",
694 | "### File_id + Api \n",
695 | "- File_id + Api (tid): count,nunique\n",
696 | "- File_id + Api (return value): nunique, max, min, median, std\n",
697 | "- File_id + Api (index): nunique, max, min, median, std\n",
698 | "\n"
699 | ]
700 | },
701 | {
702 | "cell_type": "markdown",
703 | "metadata": {},
704 | "source": [
705 | "#### File_id + Api (tid): count,nunique"
706 | ]
707 | },
708 | {
709 | "cell_type": "code",
710 | "execution_count": 18,
711 | "metadata": {},
712 | "outputs": [],
713 | "source": [
714 | "def groupby_pivot_features(data_merge, data_orig , groupby_features,col1 = None, col2 = None, opts = None):\n",
715 | " for opt in opts:\n",
716 | " print(opt)\n",
717 | " train_split = data_orig.groupby(['file_id',col1])[col2].agg({'fileid_' + col1 + '_'+col2+'_'+ str(opt):opt}).reset_index() \n",
718 | " \n",
719 | " train_split_ = pd.pivot_table(train_split, values = 'fileid_' + col1 + '_'+col2+'_'+ str(opt), index=['file_id'],columns=[col1])\n",
720 | " new_cols = [ 'fileid_' + col1 + '_'+col2+ '_' + opt + '_' + str(col) for col in train_split_.columns]\n",
721 | " \n",
722 | " groupby_features.append(new_cols)\n",
723 | " train_split_.columns = new_cols \n",
724 | "\n",
725 | " train_split_.reset_index(inplace = True)\n",
726 | " \n",
727 | " data_merge = pd.merge(data_merge,train_split_,how='left', on='file_id') \n",
728 | " return data_merge,groupby_features \n",
729 | " "
730 | ]
731 | },
732 | {
733 | "cell_type": "code",
734 | "execution_count": 19,
735 | "metadata": {},
736 | "outputs": [
737 | {
738 | "name": "stdout",
739 | "output_type": "stream",
740 | "text": [
741 | "count\n"
742 | ]
743 | },
744 | {
745 | "name": "stderr",
746 | "output_type": "stream",
747 | "text": [
748 | "/mnt/disk0/home/zhongrunxing/anaconda2/lib/python2.7/site-packages/ipykernel_launcher.py:4: FutureWarning: using a dict on a Series for aggregation\n",
749 | "is deprecated and will be removed in a future version\n",
750 | " after removing the cwd from sys.path.\n"
751 | ]
752 | },
753 | {
754 | "name": "stdout",
755 | "output_type": "stream",
756 | "text": [
757 | "nunique\n"
758 | ]
759 | }
760 | ],
761 | "source": [
762 | "groupby_features = []\n",
763 | "api_opts = ['count', 'nunique']\n",
764 | "train_data_,groupby_features = groupby_pivot_features(train_data, train, groupby_features, col1 = 'api', col2 = 'tid', opts = api_opts)"
765 | ]
766 | },
767 | {
768 | "cell_type": "markdown",
769 | "metadata": {},
770 | "source": [
771 | "#### File_id + Api (return value): nunique, max, min, median, std"
772 | ]
773 | },
774 | {
775 | "cell_type": "code",
776 | "execution_count": 20,
777 | "metadata": {
778 | "scrolled": true
779 | },
780 | "outputs": [],
781 | "source": [
782 | "# api_opts = ['nunique','max','min','median','std']\n",
783 | "# train_data_,groupby_features = groupby_pivot_features(train_data_, train, groupby_features, col1 = 'api', col2 = 'return_value', opts = api_opts) "
784 | ]
785 | },
786 | {
787 | "cell_type": "markdown",
788 | "metadata": {},
789 | "source": [
790 | "#### File_id + Api(index): nunique, max, min, median, std"
791 | ]
792 | },
793 | {
794 | "cell_type": "code",
795 | "execution_count": 21,
796 | "metadata": {},
797 | "outputs": [
798 | {
799 | "name": "stdout",
800 | "output_type": "stream",
801 | "text": [
802 | "nunique\n"
803 | ]
804 | },
805 | {
806 | "name": "stderr",
807 | "output_type": "stream",
808 | "text": [
809 | "/mnt/disk0/home/zhongrunxing/anaconda2/lib/python2.7/site-packages/ipykernel_launcher.py:4: FutureWarning: using a dict on a Series for aggregation\n",
810 | "is deprecated and will be removed in a future version\n",
811 | " after removing the cwd from sys.path.\n"
812 | ]
813 | },
814 | {
815 | "name": "stdout",
816 | "output_type": "stream",
817 | "text": [
818 | "max\n",
819 | "min\n",
820 | "median\n",
821 | "std\n"
822 | ]
823 | }
824 | ],
825 | "source": [
826 | "api_opts = ['nunique','max','min','median','std']\n",
827 | "train_data_,groupby_features = groupby_pivot_features(train_data_, train, groupby_features, col1 = 'api', col2 = 'index', opts = api_opts) "
828 | ]
829 | },
830 | {
831 | "cell_type": "code",
832 | "execution_count": 22,
833 | "metadata": {},
834 | "outputs": [
835 | {
836 | "data": {
837 | "text/html": [
838 | "\n",
839 | "\n",
852 | "
\n",
853 | " \n",
854 | " \n",
855 | " | \n",
856 | " file_id | \n",
857 | " fileid_api_count | \n",
858 | " fileid_api_nunique | \n",
859 | " fileid_tid_count | \n",
860 | " fileid_tid_nunique | \n",
861 | " fileid_tid_max | \n",
862 | " fileid_tid_min | \n",
863 | " fileid_tid_median | \n",
864 | " fileid_tid_std | \n",
865 | " fileid_tid_quantile_20.0 | \n",
866 | " ... | \n",
867 | " fileid_api_index_std_recv | \n",
868 | " fileid_api_index_std_recvfrom | \n",
869 | " fileid_api_index_std_select | \n",
870 | " fileid_api_index_std_send | \n",
871 | " fileid_api_index_std_sendto | \n",
872 | " fileid_api_index_std_setsockopt | \n",
873 | " fileid_api_index_std_shutdown | \n",
874 | " fileid_api_index_std_socket | \n",
875 | " fileid_api_index_std_system | \n",
876 | " fileid_api_index_std_timeGetTime | \n",
877 | "
\n",
878 | " \n",
879 | " \n",
880 | " \n",
881 | " 0 | \n",
882 | " 1 | \n",
883 | " 97 | \n",
884 | " 15 | \n",
885 | " 97 | \n",
886 | " 4 | \n",
887 | " 2568 | \n",
888 | " 2332 | \n",
889 | " 2544 | \n",
890 | " 57.218548 | \n",
891 | " 2468.0 | \n",
892 | " ... | \n",
893 | " NaN | \n",
894 | " NaN | \n",
895 | " NaN | \n",
896 | " NaN | \n",
897 | " NaN | \n",
898 | " NaN | \n",
899 | " NaN | \n",
900 | " NaN | \n",
901 | " NaN | \n",
902 | " NaN | \n",
903 | "
\n",
904 | " \n",
905 | " 1 | \n",
906 | " 2 | \n",
907 | " 1361 | \n",
908 | " 40 | \n",
909 | " 1361 | \n",
910 | " 7 | \n",
911 | " 2748 | \n",
912 | " 2472 | \n",
913 | " 2524 | \n",
914 | " 104.399149 | \n",
915 | " 2472.0 | \n",
916 | " ... | \n",
917 | " NaN | \n",
918 | " NaN | \n",
919 | " NaN | \n",
920 | " NaN | \n",
921 | " NaN | \n",
922 | " NaN | \n",
923 | " NaN | \n",
924 | " NaN | \n",
925 | " NaN | \n",
926 | " NaN | \n",
927 | "
\n",
928 | " \n",
929 | " 2 | \n",
930 | " 3 | \n",
931 | " 16 | \n",
932 | " 9 | \n",
933 | " 16 | \n",
934 | " 1 | \n",
935 | " 2344 | \n",
936 | " 2344 | \n",
937 | " 2344 | \n",
938 | " 0.000000 | \n",
939 | " 2344.0 | \n",
940 | " ... | \n",
941 | " NaN | \n",
942 | " NaN | \n",
943 | " NaN | \n",
944 | " NaN | \n",
945 | " NaN | \n",
946 | " NaN | \n",
947 | " NaN | \n",
948 | " NaN | \n",
949 | " NaN | \n",
950 | " NaN | \n",
951 | "
\n",
952 | " \n",
953 | " 3 | \n",
954 | " 4 | \n",
955 | " 193 | \n",
956 | " 34 | \n",
957 | " 193 | \n",
958 | " 3 | \n",
959 | " 2584 | \n",
960 | " 2452 | \n",
961 | " 2452 | \n",
962 | " 50.951508 | \n",
963 | " 2452.0 | \n",
964 | " ... | \n",
965 | " NaN | \n",
966 | " NaN | \n",
967 | " NaN | \n",
968 | " NaN | \n",
969 | " NaN | \n",
970 | " NaN | \n",
971 | " NaN | \n",
972 | " NaN | \n",
973 | " NaN | \n",
974 | " NaN | \n",
975 | "
\n",
976 | " \n",
977 | " 4 | \n",
978 | " 5 | \n",
979 | " 803 | \n",
980 | " 34 | \n",
981 | " 803 | \n",
982 | " 3 | \n",
983 | " 2780 | \n",
984 | " 2332 | \n",
985 | " 2376 | \n",
986 | " 201.826813 | \n",
987 | " 2332.0 | \n",
988 | " ... | \n",
989 | " NaN | \n",
990 | " NaN | \n",
991 | " NaN | \n",
992 | " NaN | \n",
993 | " NaN | \n",
994 | " NaN | \n",
995 | " NaN | \n",
996 | " NaN | \n",
997 | " NaN | \n",
998 | " NaN | \n",
999 | "
\n",
1000 | " \n",
1001 | "
\n",
1002 | "
5 rows × 2103 columns
\n",
1003 | "
"
1004 | ],
1005 | "text/plain": [
1006 | " file_id fileid_api_count fileid_api_nunique fileid_tid_count \\\n",
1007 | "0 1 97 15 97 \n",
1008 | "1 2 1361 40 1361 \n",
1009 | "2 3 16 9 16 \n",
1010 | "3 4 193 34 193 \n",
1011 | "4 5 803 34 803 \n",
1012 | "\n",
1013 | " fileid_tid_nunique fileid_tid_max fileid_tid_min fileid_tid_median \\\n",
1014 | "0 4 2568 2332 2544 \n",
1015 | "1 7 2748 2472 2524 \n",
1016 | "2 1 2344 2344 2344 \n",
1017 | "3 3 2584 2452 2452 \n",
1018 | "4 3 2780 2332 2376 \n",
1019 | "\n",
1020 | " fileid_tid_std fileid_tid_quantile_20.0 ... \\\n",
1021 | "0 57.218548 2468.0 ... \n",
1022 | "1 104.399149 2472.0 ... \n",
1023 | "2 0.000000 2344.0 ... \n",
1024 | "3 50.951508 2452.0 ... \n",
1025 | "4 201.826813 2332.0 ... \n",
1026 | "\n",
1027 | " fileid_api_index_std_recv fileid_api_index_std_recvfrom \\\n",
1028 | "0 NaN NaN \n",
1029 | "1 NaN NaN \n",
1030 | "2 NaN NaN \n",
1031 | "3 NaN NaN \n",
1032 | "4 NaN NaN \n",
1033 | "\n",
1034 | " fileid_api_index_std_select fileid_api_index_std_send \\\n",
1035 | "0 NaN NaN \n",
1036 | "1 NaN NaN \n",
1037 | "2 NaN NaN \n",
1038 | "3 NaN NaN \n",
1039 | "4 NaN NaN \n",
1040 | "\n",
1041 | " fileid_api_index_std_sendto fileid_api_index_std_setsockopt \\\n",
1042 | "0 NaN NaN \n",
1043 | "1 NaN NaN \n",
1044 | "2 NaN NaN \n",
1045 | "3 NaN NaN \n",
1046 | "4 NaN NaN \n",
1047 | "\n",
1048 | " fileid_api_index_std_shutdown fileid_api_index_std_socket \\\n",
1049 | "0 NaN NaN \n",
1050 | "1 NaN NaN \n",
1051 | "2 NaN NaN \n",
1052 | "3 NaN NaN \n",
1053 | "4 NaN NaN \n",
1054 | "\n",
1055 | " fileid_api_index_std_system fileid_api_index_std_timeGetTime \n",
1056 | "0 NaN NaN \n",
1057 | "1 NaN NaN \n",
1058 | "2 NaN NaN \n",
1059 | "3 NaN NaN \n",
1060 | "4 NaN NaN \n",
1061 | "\n",
1062 | "[5 rows x 2103 columns]"
1063 | ]
1064 | },
1065 | "execution_count": 22,
1066 | "metadata": {},
1067 | "output_type": "execute_result"
1068 | }
1069 | ],
1070 | "source": [
1071 | "train_data_.head()"
1072 | ]
1073 | },
1074 | {
1075 | "cell_type": "markdown",
1076 | "metadata": {},
1077 | "source": [
1078 | "### 1阶特征的线下验证(File_id + Api)(0.0347293)"
1079 | ]
1080 | },
1081 | {
1082 | "cell_type": "markdown",
1083 | "metadata": {},
1084 | "source": [
1085 | "### File_id + Index \n",
1086 | "- File_id + Index (api): count,nunique\n",
1087 | "- File_id + Index (return value): nunique, max, min, median, std(暂时先搁置)\n",
1088 | "- File_id + Index (tid): nunique, max, min, median, std(暂时先搁置)\n"
1089 | ]
1090 | },
1091 | {
1092 | "cell_type": "markdown",
1093 | "metadata": {},
1094 | "source": [
1095 | "#### File_id +Tid (api): count,nunique"
1096 | ]
1097 | },
1098 | {
1099 | "cell_type": "markdown",
1100 | "metadata": {},
1101 | "source": [
1102 | "#### File_id + Index特征过拟合,删除\n"
1103 | ]
1104 | },
1105 | {
1106 | "cell_type": "code",
1107 | "execution_count": 23,
1108 | "metadata": {
1109 | "scrolled": true
1110 | },
1111 | "outputs": [],
1112 | "source": [
1113 | "# delcol = []\n",
1114 | "# for i in range(2):\n",
1115 | "# for item in groupby_features2[i]:\n",
1116 | "# delcol.append(item)"
1117 | ]
1118 | },
1119 | {
1120 | "cell_type": "code",
1121 | "execution_count": 24,
1122 | "metadata": {},
1123 | "outputs": [],
1124 | "source": [
1125 | "# train_data_.drop(delcol,axis=1,inplace=True)"
1126 | ]
1127 | },
1128 | {
1129 | "cell_type": "markdown",
1130 | "metadata": {},
1131 | "source": [
1132 | "## 特征补充(加入index的差值特征)\n",
1133 | "- File_id + Api (index_diff): 'nunique','max','min','median','std'"
1134 | ]
1135 | },
1136 | {
1137 | "cell_type": "code",
1138 | "execution_count": 25,
1139 | "metadata": {},
1140 | "outputs": [],
1141 | "source": [
1142 | "train_diff = train.groupby(['file_id','tid'])['index'].diff().fillna(-999).values"
1143 | ]
1144 | },
1145 | {
1146 | "cell_type": "code",
1147 | "execution_count": 26,
1148 | "metadata": {},
1149 | "outputs": [],
1150 | "source": [
1151 | "train['index_diff'] = train_diff"
1152 | ]
1153 | },
1154 | {
1155 | "cell_type": "code",
1156 | "execution_count": 27,
1157 | "metadata": {},
1158 | "outputs": [],
1159 | "source": [
1160 | "train_diff = train.loc[train.index_diff!=-999] "
1161 | ]
1162 | },
1163 | {
1164 | "cell_type": "code",
1165 | "execution_count": 28,
1166 | "metadata": {},
1167 | "outputs": [
1168 | {
1169 | "name": "stdout",
1170 | "output_type": "stream",
1171 | "text": [
1172 | "nunique\n"
1173 | ]
1174 | },
1175 | {
1176 | "name": "stderr",
1177 | "output_type": "stream",
1178 | "text": [
1179 | "/mnt/disk0/home/zhongrunxing/anaconda2/lib/python2.7/site-packages/ipykernel_launcher.py:4: FutureWarning: using a dict on a Series for aggregation\n",
1180 | "is deprecated and will be removed in a future version\n",
1181 | " after removing the cwd from sys.path.\n"
1182 | ]
1183 | },
1184 | {
1185 | "name": "stdout",
1186 | "output_type": "stream",
1187 | "text": [
1188 | "max\n",
1189 | "min\n",
1190 | "median\n",
1191 | "std\n"
1192 | ]
1193 | }
1194 | ],
1195 | "source": [
1196 | "api_opts = ['nunique','max','min','median','std']\n",
1197 | "train_data_,groupby_features = groupby_pivot_features(train_data_, train_diff, groupby_features, col1 = 'api', col2 = 'index_diff', opts = api_opts) "
1198 | ]
1199 | },
1200 | {
1201 | "cell_type": "markdown",
1202 | "metadata": {},
1203 | "source": [
1204 | "### 线下验证(0.0346954)"
1205 | ]
1206 | },
1207 | {
1208 | "cell_type": "code",
1209 | "execution_count": 29,
1210 | "metadata": {
1211 | "scrolled": true
1212 | },
1213 | "outputs": [],
1214 | "source": [
1215 | "# train_features = [col for col in train_data_.columns if col!='label' and col!='file_id']\n",
1216 | "# train_label = 'label'\n",
1217 | "# print(len(train_features))\n",
1218 | "# dtrain = lgb.Dataset(train_data_.loc[train_ind,train_features],train_data_.loc[train_ind,train_label].values) \n",
1219 | "# dval = lgb.Dataset(train_data_.loc[test_ind,train_features],train_data_.loc[test_ind,train_label].values, reference = dtrain) \n",
1220 | "\n",
1221 | "# params = {\n",
1222 | "# 'task':'train', \n",
1223 | "# 'num_leaves': 255,\n",
1224 | "# 'objective': 'multiclass',\n",
1225 | "# 'num_class':6,\n",
1226 | "# 'min_data_in_leaf': 40,\n",
1227 | "# 'learning_rate': 0.05,\n",
1228 | "# 'feature_fraction': 0.85,\n",
1229 | "# 'bagging_fraction': 0.9,\n",
1230 | "# 'bagging_freq': 5, \n",
1231 | "# 'max_bin':128,\n",
1232 | "# 'num_threads': 64,\n",
1233 | "# 'random_state':100\n",
1234 | "# } \n",
1235 | "# lgb_model_3_order = lgb.train(params, dtrain, num_boost_round=500,valid_sets=[dtrain,dval], early_stopping_rounds=50, feval=lgb_logloss) "
1236 | ]
1237 | },
1238 | {
1239 | "cell_type": "markdown",
1240 | "metadata": {},
1241 | "source": [
1242 | "### 删除quantile,std统计变量之后的验证(0.0350054) "
1243 | ]
1244 | },
1245 | {
1246 | "cell_type": "code",
1247 | "execution_count": 30,
1248 | "metadata": {
1249 | "scrolled": true
1250 | },
1251 | "outputs": [],
1252 | "source": [
1253 | "# train_features = [col for col in train_data_.columns if col!='label' and col!='file_id' and 'std' not in col and 'quantile' not in col]\n",
1254 | "# train_label = 'label'\n",
1255 | "# print(len(train_features))\n",
1256 | "# dtrain = lgb.Dataset(train_data_.loc[train_ind,train_features],train_data_.loc[train_ind,train_label].values) \n",
1257 | "# dval = lgb.Dataset(train_data_.loc[test_ind,train_features],train_data_.loc[test_ind,train_label].values, reference = dtrain) \n",
1258 | "\n",
1259 | "# params = {\n",
1260 | "# 'task':'train', \n",
1261 | "# 'num_leaves': 255,\n",
1262 | "# 'objective': 'multiclass',\n",
1263 | "# 'num_class':6,\n",
1264 | "# 'min_data_in_leaf': 40,\n",
1265 | "# 'learning_rate': 0.05,\n",
1266 | "# 'feature_fraction': 0.85,\n",
1267 | "# 'bagging_fraction': 0.9,\n",
1268 | "# 'bagging_freq': 5, \n",
1269 | "# 'max_bin':128,\n",
1270 | "# 'num_threads': 64,\n",
1271 | "# 'random_state':100\n",
1272 | "# } \n",
1273 | "# lgb_model_3_order = lgb.train(params, dtrain, num_boost_round=500,valid_sets=[dtrain,dval], early_stopping_rounds=50, feval=lgb_logloss) "
1274 | ]
1275 | },
1276 | {
1277 | "cell_type": "code",
1278 | "execution_count": 31,
1279 | "metadata": {},
1280 | "outputs": [],
1281 | "source": [
1282 | "# train_data_.to_csv('/data/Data_JieZhang/TC_SAFE/train_val/train_data.csv',index = None) "
1283 | ]
1284 | },
1285 | {
1286 | "cell_type": "markdown",
1287 | "metadata": {},
1288 | "source": [
1289 | "# 特征工程& 验证结果 2-Gram\n",
1290 | "## 全局特征\n",
1291 | "### File_id(Api_2):count,nunique"
1292 | ]
1293 | },
1294 | {
1295 | "cell_type": "code",
1296 | "execution_count": 32,
1297 | "metadata": {},
1298 | "outputs": [],
1299 | "source": [
1300 | "train['api_shift'] = train['api'].shift(-1)\n",
1301 | "train['api_2'] = train['api'] +'_' + train['api_shift']"
1302 | ]
1303 | },
1304 | {
1305 | "cell_type": "code",
1306 | "execution_count": 33,
1307 | "metadata": {},
1308 | "outputs": [],
1309 | "source": [
1310 | "train.drop(['api_shift'],axis=1,inplace=True)"
1311 | ]
1312 | },
1313 | {
1314 | "cell_type": "code",
1315 | "execution_count": 34,
1316 | "metadata": {
1317 | "scrolled": true
1318 | },
1319 | "outputs": [],
1320 | "source": [
1321 | "api_count = train['api_2'].value_counts()"
1322 | ]
1323 | },
1324 | {
1325 | "cell_type": "code",
1326 | "execution_count": 35,
1327 | "metadata": {},
1328 | "outputs": [
1329 | {
1330 | "name": "stdout",
1331 | "output_type": "stream",
1332 | "text": [
1333 | "count\n"
1334 | ]
1335 | },
1336 | {
1337 | "name": "stderr",
1338 | "output_type": "stream",
1339 | "text": [
1340 | "/mnt/disk0/home/zhongrunxing/anaconda2/lib/python2.7/site-packages/ipykernel_launcher.py:4: FutureWarning: using a dict on a Series for aggregation\n",
1341 | "is deprecated and will be removed in a future version\n",
1342 | " after removing the cwd from sys.path.\n"
1343 | ]
1344 | },
1345 | {
1346 | "name": "stdout",
1347 | "output_type": "stream",
1348 | "text": [
1349 | "nunique\n"
1350 | ]
1351 | }
1352 | ],
1353 | "source": [
1354 | "api_opt = ['count','nunique'] \n",
1355 | "for opt in api_opt:\n",
1356 | " print(opt)\n",
1357 | " tmp = train.groupby(['file_id'])['api_2'].agg({'fileid_api_2_' + opt: opt}).reset_index() \n",
1358 | " train_data_ = pd.merge(train_data_,tmp,how='left', on='file_id') "
1359 | ]
1360 | },
1361 | {
1362 | "cell_type": "markdown",
1363 | "metadata": {},
1364 | "source": [
1365 | "## 局部特征\n",
1366 | "### File_id + tid (Api_2): count特征"
1367 | ]
1368 | },
1369 | {
1370 | "cell_type": "code",
1371 | "execution_count": 36,
1372 | "metadata": {
1373 | "scrolled": true
1374 | },
1375 | "outputs": [],
1376 | "source": [
1377 | "api_value_counts = pd.DataFrame(api_count).reset_index()\n",
1378 | "api_value_counts.columns = ['api_2','api_2_count']"
1379 | ]
1380 | },
1381 | {
1382 | "cell_type": "code",
1383 | "execution_count": 37,
1384 | "metadata": {},
1385 | "outputs": [],
1386 | "source": [
1387 | "train = pd.merge(train, api_value_counts, on ='api_2' , how='left')"
1388 | ]
1389 | },
1390 | {
1391 | "cell_type": "code",
1392 | "execution_count": 38,
1393 | "metadata": {},
1394 | "outputs": [
1395 | {
1396 | "name": "stdout",
1397 | "output_type": "stream",
1398 | "text": [
1399 | "count\n"
1400 | ]
1401 | },
1402 | {
1403 | "name": "stderr",
1404 | "output_type": "stream",
1405 | "text": [
1406 | "/mnt/disk0/home/zhongrunxing/anaconda2/lib/python2.7/site-packages/ipykernel_launcher.py:4: FutureWarning: using a dict on a Series for aggregation\n",
1407 | "is deprecated and will be removed in a future version\n",
1408 | " after removing the cwd from sys.path.\n"
1409 | ]
1410 | }
1411 | ],
1412 | "source": [
1413 | "api_opts = ['count']\n",
1414 | "groupby_features = []\n",
1415 | "train_data_,groupby_features = groupby_pivot_features(train_data_, train.loc[train.api_2_count>=20], groupby_features, col1 = 'api_2', col2 = 'tid', opts = api_opts)"
1416 | ]
1417 | },
1418 | {
1419 | "cell_type": "markdown",
1420 | "metadata": {},
1421 | "source": [
1422 | "### 线下验证( 0.0330886)"
1423 | ]
1424 | },
1425 | {
1426 | "cell_type": "markdown",
1427 | "metadata": {},
1428 | "source": [
1429 | "### File_id + index (Api_2): max,min特征"
1430 | ]
1431 | },
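1432 | {
1433 | "cell_type": "code",
1434 | "execution_count": null,
1435 | "metadata": {},
1436 | "outputs": [],
1437 | "source": [
1438 | "# Minimal sketch of the max/min index features named above; the original notebook left\n",
1439 | "# this section unimplemented, so treat this call as illustrative rather than part of the run.\n",
1440 | "api_opts = ['max','min']\n",
1441 | "train_data_,groupby_features = groupby_pivot_features(train_data_, train.loc[train.api_2_count>=20], groupby_features, col1 = 'api_2', col2 = 'index', opts = api_opts)"
1442 | ]
1443 | },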
1432 | {
1433 | "cell_type": "code",
1434 | "execution_count": null,
1435 | "metadata": {},
1436 | "outputs": [],
1437 | "source": []
1438 | },
1439 | {
1440 | "cell_type": "code",
1441 | "execution_count": null,
1442 | "metadata": {},
1443 | "outputs": [],
1444 | "source": []
1445 | },
1446 | {
1447 | "cell_type": "code",
1448 | "execution_count": null,
1449 | "metadata": {},
1450 | "outputs": [],
1451 | "source": []
1452 | },
1453 | {
1454 | "cell_type": "code",
1455 | "execution_count": 39,
1456 | "metadata": {
1457 | "scrolled": true
1458 | },
1459 | "outputs": [],
1460 | "source": [
1461 | "# train_features = [col for col in train_data_.columns if col!='label' and col!='file_id' and 'std' not in col and 'quantile' not in col]\n",
1462 | "# train_label = 'label'\n",
1463 | "\n",
1464 | "# train_ind = train_X.index\n",
1465 | "# test_ind = test_X.index\n",
1466 | "\n",
1467 | "# dtrain = lgb.Dataset(train_data_.loc[train_ind,train_features],train_data_.loc[train_ind,train_label].values) \n",
1468 | "# dval = lgb.Dataset(train_data_.loc[test_ind,train_features],train_data_.loc[test_ind,train_label].values, reference = dtrain) \n",
1469 | "\n",
1470 | "# params = {\n",
1471 | "# 'task':'train', \n",
1472 | "# 'num_leaves': 255,\n",
1473 | "# 'objective': 'multiclass',\n",
1474 | "# 'num_class':8,\n",
1475 | "# 'min_data_in_leaf': 10,\n",
1476 | "# #'min_data_in_leaf': 1,\n",
1477 | "# 'learning_rate': 0.05,\n",
1478 | "# 'feature_fraction': 0.85,\n",
1479 | "# 'bagging_fraction': 0.9,\n",
1480 | "# 'bagging_freq': 5, \n",
1481 | "# 'max_bin':128,\n",
1482 | "# 'num_threads': 64,\n",
1483 | "# 'random_state':100\n",
1484 | "# } \n",
1485 | "# lgb_model_3_order = lgb.train(params, dtrain, num_boost_round=500,valid_sets=[dtrain,dval], early_stopping_rounds=50, feval=lgb_logloss) "
1486 | ]
1487 | },
1488 | {
1489 | "cell_type": "code",
1490 | "execution_count": 40,
1491 | "metadata": {},
1492 | "outputs": [],
1493 | "source": [
1494 | "# fea_imp = pd.DataFrame({'feature':train_features, 'imp':lgb_model_3_order.feature_importance()}).sort_values('imp')\n",
1495 | "# important_features = fea_imp.loc[fea_imp.imp >=1, 'feature'].values\n",
1496 | "# important_features = list(important_features)\n",
1497 | "\n",
1498 | "# important_features.append('file_id')\n",
1499 | "# important_features.append('label')\n",
1500 | "\n",
1501 | "# train_data_[important_features].to_csv('../feature_final/train_data_2gram.csv',index = None)\n",
1502 | " "
1503 | ]
1504 | },
1505 | {
1506 | "cell_type": "code",
1507 | "execution_count": 41,
1508 | "metadata": {},
1509 | "outputs": [],
1510 | "source": [
1511 | "train_data_.to_csv('input/test_data_2gram.csv',index = None)"
1512 | ]
1513 | },
1514 | {
1515 | "cell_type": "code",
1516 | "execution_count": null,
1517 | "metadata": {},
1518 | "outputs": [],
1519 | "source": [
1520 | "train.shape"
1521 | ]
1522 | },
1523 | {
1524 | "cell_type": "markdown",
1525 | "metadata": {},
1526 | "source": [
1527 | "# 附录\n",
1528 | "tf-idf的1Gram特征可以替换api的次数特征等,加入tf-idf有提升,提升较小"
1529 | ]
1530 | }
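1531 | {
1532 | "cell_type": "code",
1533 | "execution_count": null,
1534 | "metadata": {},
1535 | "outputs": [],
1536 | "source": [
1537 | "# Sketch of the tf-idf 1-Gram idea from the appendix (illustrative; not part of the\n",
1538 | "# original run). Each file's api sequence becomes one document, and the resulting\n",
1539 | "# sparse tf-idf matrix can stand in for the raw per-api count columns.\n",
1540 | "from sklearn.feature_extraction.text import TfidfVectorizer\n",
1541 | "\n",
1542 | "docs = train.groupby('file_id')['api'].apply(lambda x: ' '.join(x))\n",
1543 | "tfidf = TfidfVectorizer(token_pattern=r'[^ ]+')\n",
1544 | "api_tfidf = tfidf.fit_transform(docs)  # (n_files, n_unique_apis), sparse\n",
1545 | "print(api_tfidf.shape)"
1546 | ]
1547 | }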
1531 | ],
1532 | "metadata": {
1533 | "kernelspec": {
1534 | "display_name": "Python 3",
1535 | "language": "python",
1536 | "name": "python3"
1537 | },
1538 | "language_info": {
1539 | "codemirror_mode": {
1540 | "name": "ipython",
1541 | "version": 3
1542 | },
1543 | "file_extension": ".py",
1544 | "mimetype": "text/x-python",
1545 | "name": "python",
1546 | "nbconvert_exporter": "python",
1547 | "pygments_lexer": "ipython3",
1548 | "version": "3.6.5"
1549 | },
1550 | "toc": {
1551 | "nav_menu": {},
1552 | "number_sections": true,
1553 | "sideBar": true,
1554 | "skip_h1_title": false,
1555 | "title_cell": "Table of Contents",
1556 | "title_sidebar": "Contents",
1557 | "toc_cell": false,
1558 | "toc_position": {
1559 | "height": "calc(100% - 180px)",
1560 | "left": "10px",
1561 | "top": "150px",
1562 | "width": "384px"
1563 | },
1564 | "toc_section_display": true,
1565 | "toc_window_display": true
1566 | }
1567 | },
1568 | "nbformat": 4,
1569 | "nbformat_minor": 2
1570 | }
1571 |
--------------------------------------------------------------------------------
/pickle_pre.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 4,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import pandas as pd\n",
10 | "\n",
11 | "meta_train = pd.read_pickle('../meta/train_meta_dilated_cnn.pkl')\n",
12 | "meta_test = pd.read_pickle('../meta/test_meta_dilated_cnn.pkl')\n",
13 | "\n",
14 | "import pickle\n",
15 | "\n",
16 | "f=open('../meta/train_meta_dilated_cnn_a.pkl','wb') \n",
17 | "pickle.dump(meta_train,f,0) \n",
18 | "f.close()\n",
19 | "\n",
20 | "f=open('../meta/test_meta_dilated_cnn_a.pkl','wb') \n",
21 | "pickle.dump(meta_test,f,0) \n",
22 | "f.close()"
23 | ]
24 | },
25 | {
26 | "cell_type": "code",
27 | "execution_count": null,
28 | "metadata": {},
29 | "outputs": [],
30 | "source": [
31 | "meta_train = pd.read_pickle('../meta/train_meta_cnn.pkl')\n",
32 | "meta_test = pd.read_pickle('../meta/test_meta_cnn.pkl')\n",
33 | "\n",
34 | "f=open('../meta/train_meta_cnn_a.pkl','wb') \n",
35 | "pickle.dump(meta_train,f,0) \n",
36 | "f.close()\n",
37 | "\n",
38 | "f=open('../meta/test_meta_cnn_a.pkl','wb') \n",
39 | "pickle.dump(meta_test,f,0) \n",
40 | "f.close()"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": 2,
46 | "metadata": {},
47 | "outputs": [
48 | {
49 | "data": {
50 | "text/plain": [
51 | "'/Users/didi/天池/安全赛复赛/temp'"
52 | ]
53 | },
54 | "execution_count": 2,
55 | "metadata": {},
56 | "output_type": "execute_result"
57 | }
58 | ],
59 | "source": [
60 | "%pwd"
61 | ]
62 | },
63 | {
64 | "cell_type": "code",
65 | "execution_count": null,
66 | "metadata": {},
67 | "outputs": [],
68 | "source": []
69 | }
70 | ],
71 | "metadata": {
72 | "kernelspec": {
73 | "display_name": "Python 3",
74 | "language": "python",
75 | "name": "python3"
76 | },
77 | "language_info": {
78 | "codemirror_mode": {
79 | "name": "ipython",
80 | "version": 3
81 | },
82 | "file_extension": ".py",
83 | "mimetype": "text/x-python",
84 | "name": "python",
85 | "nbconvert_exporter": "python",
86 | "pygments_lexer": "ipython3",
87 | "version": "3.6.5"
88 | }
89 | },
90 | "nbformat": 4,
91 | "nbformat_minor": 2
92 | }
93 |
--------------------------------------------------------------------------------
/submit.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 26,
6 | "metadata": {
7 | "scrolled": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "# coding: utf-8\n",
12 | "\n",
13 | "# In[1]:\n",
14 | "\n",
15 | "\n",
16 | "import pandas as pd\n",
17 | "import numpy as np\n",
18 | "import lightgbm as lgb\n",
19 | "from sklearn.cross_validation import train_test_split\n",
20 | "import gc\n",
21 | "from sklearn.preprocessing import OneHotEncoder\n",
22 | "import datetime\n",
23 | "from sklearn.cross_validation import StratifiedKFold\n",
24 | "\n",
25 | "# In[2]:"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 27,
31 | "metadata": {},
32 | "outputs": [
33 | {
34 | "name": "stdout",
35 | "output_type": "stream",
36 | "text": [
37 | "cur time = 2018/09/21 20:10:16\n",
38 | "(13887, 3251) (12955, 3251)\n",
39 | "cur time = 2018/09/21 20:10:16\n"
40 | ]
41 | }
42 | ],
43 | "source": [
44 | "print('cur time = ' + str(datetime.datetime.now().strftime(\"%Y/%m/%d %H:%M:%S\")))\n",
45 | "train = np.load('../X_train.npy')\n",
46 | "test = np.load('../X_test.npy')\n",
47 | "train_labels = np.load('../labels.npy')\n",
48 | "\n",
49 | "print train.shape,test.shape\n",
50 | "print('cur time = ' + str(datetime.datetime.now().strftime(\"%Y/%m/%d %H:%M:%S\")))"
51 | ]
52 | },
53 | {
54 | "cell_type": "code",
55 | "execution_count": 28,
56 | "metadata": {},
57 | "outputs": [],
58 | "source": [
59 | "train_cnn_1 = pd.read_pickle('../train_meta_cnn_a.pkl')\n",
60 | "test_cnn_1 = pd.read_pickle('../test_meta_cnn_a.pkl')"
61 | ]
62 | },
63 | {
64 | "cell_type": "code",
65 | "execution_count": 29,
66 | "metadata": {},
67 | "outputs": [],
68 | "source": [
69 | "train_cnn_2 = pd.read_pickle('../train_meta_dilated_cnn_a.pkl')\n",
70 | "test_cnn_2 = pd.read_pickle('../test_meta_dilated_cnn_a.pkl')"
71 | ]
72 | },
73 | {
74 | "cell_type": "code",
75 | "execution_count": 30,
76 | "metadata": {},
77 | "outputs": [],
78 | "source": [
79 | "train_lgb_1 = pd.read_pickle('../train_meta_lgb_1.pkl')\n",
80 | "test_lgb_1 = pd.read_pickle('../test_meta_lgb_1.pkl')\n",
81 | "\n",
82 | "train_lgb_2 = pd.read_pickle('../train_meta_lgb_2.pkl')\n",
83 | "test_lgb_2 = pd.read_pickle('../test_meta_lgb_2.pkl')\n",
84 | "\n",
85 | "train_lgb_3 = pd.read_pickle('../train_meta_lgb_3.pkl')\n",
86 | "test_lgb_3 = pd.read_pickle('../test_meta_lgb_3.pkl')\n",
87 | "\n",
88 | "train_lgb_4 = pd.read_pickle('../train_meta_lgb_4.pkl')\n",
89 | "test_lgb_4 = pd.read_pickle('../test_meta_lgb_4.pkl')"
90 | ]
91 | },
92 | {
93 | "cell_type": "code",
94 | "execution_count": 33,
95 | "metadata": {},
96 | "outputs": [],
97 | "source": [
98 | "train = np.hstack([train,train_cnn_1, train_cnn_2, train_lgb_1, train_lgb_2, train_lgb_3, train_lgb_4])\n",
99 | "test = np.hstack([test,test_cnn_1, test_cnn_2, test_lgb_1, test_lgb_2, test_lgb_3, test_lgb_4])"
100 | ]
101 | },
102 | {
103 | "cell_type": "code",
104 | "execution_count": 36,
105 | "metadata": {},
106 | "outputs": [
107 | {
108 | "name": "stdout",
109 | "output_type": "stream",
110 | "text": [
111 | "Times: 0\n",
112 | "cur time = 2018/09/21 20:12:20\n",
113 | "FOLD: 0\n",
114 | "2780 11107\n",
115 | "Training until validation scores don't improve for 100 rounds.\n",
116 | "[100]\ttraining's multi_logloss: 0.70958\tvalid_1's multi_logloss: 0.755308\n",
117 | "[200]\ttraining's multi_logloss: 0.354997\tvalid_1's multi_logloss: 0.434251\n",
118 | "[300]\ttraining's multi_logloss: 0.217096\tvalid_1's multi_logloss: 0.326171\n",
119 | "[400]\ttraining's multi_logloss: 0.152399\tvalid_1's multi_logloss: 0.289922\n",
120 | "[500]\ttraining's multi_logloss: 0.117076\tvalid_1's multi_logloss: 0.278519\n",
121 | "[600]\ttraining's multi_logloss: 0.094704\tvalid_1's multi_logloss: 0.277246\n",
122 | "Early stopping, best iteration is:\n",
123 | "[598]\ttraining's multi_logloss: 0.0950629\tvalid_1's multi_logloss: 0.277223\n",
124 | "cur time = 2018/09/21 20:13:46\n",
125 | "FOLD: 1\n",
126 | "2779 11108\n",
127 | "Training until validation scores don't improve for 100 rounds.\n",
128 | "[100]\ttraining's multi_logloss: 0.714406\tvalid_1's multi_logloss: 0.746063\n",
129 | "[200]\ttraining's multi_logloss: 0.360927\tvalid_1's multi_logloss: 0.41752\n",
130 | "[300]\ttraining's multi_logloss: 0.223406\tvalid_1's multi_logloss: 0.305014\n",
131 | "[400]\ttraining's multi_logloss: 0.159355\tvalid_1's multi_logloss: 0.265401\n",
132 | "[500]\ttraining's multi_logloss: 0.123548\tvalid_1's multi_logloss: 0.251562\n",
133 | "[600]\ttraining's multi_logloss: 0.100453\tvalid_1's multi_logloss: 0.247619\n",
134 | "[700]\ttraining's multi_logloss: 0.0840086\tvalid_1's multi_logloss: 0.247471\n",
135 | "Early stopping, best iteration is:\n",
136 | "[645]\ttraining's multi_logloss: 0.0925202\tvalid_1's multi_logloss: 0.246913\n",
137 | "cur time = 2018/09/21 20:15:23\n",
138 | "FOLD: 2\n",
139 | "2777 11110\n",
140 | "Training until validation scores don't improve for 100 rounds.\n",
141 | "[100]\ttraining's multi_logloss: 0.710447\tvalid_1's multi_logloss: 0.758826\n",
142 | "[200]\ttraining's multi_logloss: 0.354958\tvalid_1's multi_logloss: 0.436029\n",
143 | "[300]\ttraining's multi_logloss: 0.216709\tvalid_1's multi_logloss: 0.326181\n",
144 | "[400]\ttraining's multi_logloss: 0.15243\tvalid_1's multi_logloss: 0.287969\n",
145 | "[500]\ttraining's multi_logloss: 0.117201\tvalid_1's multi_logloss: 0.275582\n",
146 | "[600]\ttraining's multi_logloss: 0.0948654\tvalid_1's multi_logloss: 0.273565\n",
147 | "Early stopping, best iteration is:\n",
148 | "[578]\ttraining's multi_logloss: 0.0990779\tvalid_1's multi_logloss: 0.273456\n",
149 | "cur time = 2018/09/21 20:16:47\n",
150 | "FOLD: 3\n",
151 | "2776 11111\n",
152 | "Training until validation scores don't improve for 100 rounds.\n",
153 | "[100]\ttraining's multi_logloss: 0.710814\tvalid_1's multi_logloss: 0.757495\n",
154 | "[200]\ttraining's multi_logloss: 0.356598\tvalid_1's multi_logloss: 0.432203\n",
155 | "[300]\ttraining's multi_logloss: 0.219223\tvalid_1's multi_logloss: 0.319802\n",
156 | "[400]\ttraining's multi_logloss: 0.154809\tvalid_1's multi_logloss: 0.280013\n",
157 | "[500]\ttraining's multi_logloss: 0.118818\tvalid_1's multi_logloss: 0.2661\n",
158 | "[600]\ttraining's multi_logloss: 0.0962369\tvalid_1's multi_logloss: 0.262496\n",
159 | "[700]\ttraining's multi_logloss: 0.0801419\tvalid_1's multi_logloss: 0.262299\n",
160 | "Early stopping, best iteration is:\n",
161 | "[660]\ttraining's multi_logloss: 0.0860689\tvalid_1's multi_logloss: 0.261957\n",
162 | "cur time = 2018/09/21 20:18:25\n",
163 | "FOLD: 4\n",
164 | "2775 11112\n",
165 | "Training until validation scores don't improve for 100 rounds.\n",
166 | "[100]\ttraining's multi_logloss: 0.711242\tvalid_1's multi_logloss: 0.757122\n",
167 | "[200]\ttraining's multi_logloss: 0.357074\tvalid_1's multi_logloss: 0.432454\n",
168 | "[300]\ttraining's multi_logloss: 0.219319\tvalid_1's multi_logloss: 0.319717\n",
169 | "[400]\ttraining's multi_logloss: 0.155336\tvalid_1's multi_logloss: 0.27957\n",
170 | "[500]\ttraining's multi_logloss: 0.12014\tvalid_1's multi_logloss: 0.264735\n",
171 | "[600]\ttraining's multi_logloss: 0.0976771\tvalid_1's multi_logloss: 0.260499\n",
172 | "[700]\ttraining's multi_logloss: 0.0816694\tvalid_1's multi_logloss: 0.260613\n",
173 | "Early stopping, best iteration is:\n",
174 | "[625]\ttraining's multi_logloss: 0.0932625\tvalid_1's multi_logloss: 0.260267\n",
175 | "cur time = 2018/09/21 20:20:08\n"
176 | ]
177 | }
178 | ],
179 | "source": [
180 | "\n",
181 | "\n",
182 | "meta_test = np.zeros(shape = (len(test),8))\n",
183 | "\n",
184 | "for seed in range(1):\n",
185 | " print 'Times: ',seed\n",
186 | " print('cur time = ' + str(datetime.datetime.now().strftime(\"%Y/%m/%d %H:%M:%S\")))\n",
187 | " skf = StratifiedKFold(train_labels, n_folds=5, shuffle=True, random_state=seed)\n",
188 | " for i,(tr_ind,te_ind) in enumerate(skf):\n",
189 | " print 'FOLD: ',i\n",
190 | " print len(te_ind),len(tr_ind)\n",
191 | " X_train,X_train_label = train[tr_ind],train_labels[tr_ind]\n",
192 | " X_val,X_val_label = train[te_ind],train_labels[te_ind]\n",
193 | " dtrain = lgb.Dataset(X_train,X_train_label) \n",
194 | " dval = lgb.Dataset(X_val,X_val_label, reference = dtrain) \n",
195 | " params = {\n",
196 | " 'task':'train', \n",
197 | " 'boosting_type':'gbdt',\n",
198 | " 'num_leaves': 15,\n",
199 | " 'objective': 'multiclass',\n",
200 | " 'num_class':8,\n",
201 | " 'learning_rate': 0.01,\n",
202 | " 'feature_fraction': 0.85,\n",
203 | " 'subsample':0.85,\n",
204 | " 'num_threads': 54,\n",
205 | " 'metric':'multi_logloss',\n",
206 | " 'seed':seed\n",
207 | " } \n",
208 | " model = lgb.train(params, dtrain, num_boost_round=100000,valid_sets=[dtrain,dval],verbose_eval=100, early_stopping_rounds=100) \n",
209 | " pred_test = model.predict(test)\n",
210 | "\n",
211 | " #meta_train[te_ind] = pred_val\n",
212 | " meta_test += pred_test\n",
213 | " print('cur time = ' + str(datetime.datetime.now().strftime(\"%Y/%m/%d %H:%M:%S\")))\n",
214 | "\n",
215 | "meta_test/=5.0\n",
216 | "res = pd.DataFrame(meta_test,columns=['prob0','prob1','prob2','prob3','prob4','prob5','prob6','prob7'])\n",
217 | "res.index.name='file_id'\n",
218 | "res.round(7).to_csv('submit.csv', index = True, header=True)\n",
219 | " \n",
220 | " "
221 | ]
222 | },
223 | {
224 | "cell_type": "code",
225 | "execution_count": 74,
226 | "metadata": {},
227 | "outputs": [],
228 | "source": [
229 | "res.shape\n",
230 | "res.index = range(1,res.shape[0]+1)\n",
231 | "res.index.name = 'file_id'"
232 | ]
233 | },
234 | {
235 | "cell_type": "code",
236 | "execution_count": 77,
237 | "metadata": {},
238 | "outputs": [],
239 | "source": [
240 | "en =res.copy()"
241 | ]
242 | },
243 | {
244 | "cell_type": "code",
245 | "execution_count": 79,
246 | "metadata": {},
247 | "outputs": [
248 | {
249 | "data": {
250 | "text/plain": [
251 | "1.0000000000000004"
252 | ]
253 | },
254 | "execution_count": 79,
255 | "metadata": {},
256 | "output_type": "execute_result"
257 | }
258 | ],
259 | "source": [
260 | "en.sum(axis=1).max()"
261 | ]
262 | },
263 | {
264 | "cell_type": "code",
265 | "execution_count": 81,
266 | "metadata": {},
267 | "outputs": [],
268 | "source": [
269 | "en.to_csv('../fuucccccccck.csv',index=True,header=True)"
270 | ]
271 | },
272 | {
273 | "cell_type": "code",
274 | "execution_count": 83,
275 | "metadata": {},
276 | "outputs": [
277 | {
278 | "data": {
279 | "text/html": [
280 | "\n",
281 | "\n",
294 | "
\n",
295 | " \n",
296 | " \n",
297 | " | \n",
298 | " prob0 | \n",
299 | " prob1 | \n",
300 | " prob2 | \n",
301 | " prob3 | \n",
302 | " prob4 | \n",
303 | " prob5 | \n",
304 | " prob6 | \n",
305 | " prob7 | \n",
306 | "
\n",
307 | " \n",
308 | " file_id | \n",
309 | " | \n",
310 | " | \n",
311 | " | \n",
312 | " | \n",
313 | " | \n",
314 | " | \n",
315 | " | \n",
316 | " | \n",
317 | "
\n",
318 | " \n",
319 | " \n",
320 | " \n",
321 | " 1 | \n",
322 | " 0.002035 | \n",
323 | " 0.002127 | \n",
324 | " 0.949751 | \n",
325 | " 0.009502 | \n",
326 | " 0.001805 | \n",
327 | " 0.002404 | \n",
328 | " 0.005550 | \n",
329 | " 0.026825 | \n",
330 | "
\n",
331 | " \n",
332 | " 2 | \n",
333 | " 0.931129 | \n",
334 | " 0.002137 | \n",
335 | " 0.003289 | \n",
336 | " 0.003913 | \n",
337 | " 0.002060 | \n",
338 | " 0.009254 | \n",
339 | " 0.010101 | \n",
340 | " 0.038117 | \n",
341 | "
\n",
342 | " \n",
343 | " 3 | \n",
344 | " 0.996000 | \n",
345 | " 0.000453 | \n",
346 | " 0.000597 | \n",
347 | " 0.000630 | \n",
348 | " 0.000429 | \n",
349 | " 0.000575 | \n",
350 | " 0.000560 | \n",
351 | " 0.000755 | \n",
352 | "
\n",
353 | " \n",
354 | " 4 | \n",
355 | " 0.013627 | \n",
356 | " 0.008015 | \n",
357 | " 0.018625 | \n",
358 | " 0.098806 | \n",
359 | " 0.054051 | \n",
360 | " 0.092254 | \n",
361 | " 0.180903 | \n",
362 | " 0.533720 | \n",
363 | "
\n",
364 | " \n",
365 | " 5 | \n",
366 | " 0.993833 | \n",
367 | " 0.000578 | \n",
368 | " 0.001065 | \n",
369 | " 0.000852 | \n",
370 | " 0.000608 | \n",
371 | " 0.000776 | \n",
372 | " 0.000779 | \n",
373 | " 0.001510 | \n",
374 | "
\n",
375 | " \n",
376 | "
\n",
377 | "
"
378 | ],
379 | "text/plain": [
380 | " prob0 prob1 prob2 prob3 prob4 prob5 prob6 \\\n",
381 | "file_id \n",
382 | "1 0.002035 0.002127 0.949751 0.009502 0.001805 0.002404 0.005550 \n",
383 | "2 0.931129 0.002137 0.003289 0.003913 0.002060 0.009254 0.010101 \n",
384 | "3 0.996000 0.000453 0.000597 0.000630 0.000429 0.000575 0.000560 \n",
385 | "4 0.013627 0.008015 0.018625 0.098806 0.054051 0.092254 0.180903 \n",
386 | "5 0.993833 0.000578 0.001065 0.000852 0.000608 0.000776 0.000779 \n",
387 | "\n",
388 | " prob7 \n",
389 | "file_id \n",
390 | "1 0.026825 \n",
391 | "2 0.038117 \n",
392 | "3 0.000755 \n",
393 | "4 0.533720 \n",
394 | "5 0.001510 "
395 | ]
396 | },
397 | "execution_count": 83,
398 | "metadata": {},
399 | "output_type": "execute_result"
400 | }
401 | ],
402 | "source": [
403 | "en.head()"
404 | ]
405 | },
406 | {
407 | "cell_type": "code",
408 | "execution_count": null,
409 | "metadata": {},
410 | "outputs": [],
411 | "source": []
412 | }
413 | ],
414 | "metadata": {
415 | "kernelspec": {
416 | "display_name": "Python 3",
417 | "language": "python",
418 | "name": "python3"
419 | },
420 | "language_info": {
421 | "codemirror_mode": {
422 | "name": "ipython",
423 | "version": 3
424 | },
425 | "file_extension": ".py",
426 | "mimetype": "text/x-python",
427 | "name": "python",
428 | "nbconvert_exporter": "python",
429 | "pygments_lexer": "ipython3",
430 | "version": "3.6.5"
431 | }
432 | },
433 | "nbformat": 4,
434 | "nbformat_minor": 2
435 | }
436 |
--------------------------------------------------------------------------------
/上地西二旗人民.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/enjoysport2022/Alibaba-3rd-Security-Algorithm-Challenge/18a43c25d62e914edb19bdcae11b209813cb8439/上地西二旗人民.pptx
--------------------------------------------------------------------------------