├── .ipynb_checkpoints
│   ├── CNN_metafeature-checkpoint.ipynb
│   ├── CNN_metafeature_dilated-checkpoint.ipynb
│   ├── gene_npy-checkpoint.ipynb
│   ├── lgb_meta_features-checkpoint.ipynb
│   ├── main_test-checkpoint.ipynb
│   ├── main_train-checkpoint.ipynb
│   ├── pickle_pre-checkpoint.ipynb
│   └── submit-checkpoint.ipynb
├── CNN_metafeature.ipynb
├── CNN_metafeature_dilated.ipynb
├── README.md
├── gene_npy.ipynb
├── lgb_meta_features.ipynb
├── main_test.ipynb
├── main_train.ipynb
├── pickle_pre.ipynb
├── submit.ipynb
└── 上地西二旗人民.pptx

/.ipynb_checkpoints/CNN_metafeature-checkpoint.ipynb: --------------------------------------------------------------------------------
1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stderr", 10 | "output_type": "stream", 11 | "text": [ 12 | "/home/enjoy/anaconda3/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88\n", 13 | " return f(*args, **kwds)\n", 14 | "/home/enjoy/anaconda3/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88\n", 15 | " return f(*args, **kwds)\n" 16 | ] 17 | } 18 | ], 19 | "source": [ 20 | "import pandas as pd\n", 21 | "import numpy as np\n", 22 | "import os\n", 23 | "from tqdm import tqdm\n", 24 | "from sklearn.preprocessing import LabelBinarizer,LabelEncoder" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 2, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "path = '../input/'\n", 34 | "train = pd.read_csv(path + 'final_train.csv')\n", 35 | "test = pd.read_csv(path + 'final_test.csv')" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 3, 41 | "metadata": {}, 42 | "outputs": [ 43 | { 44 | "data": { 45 | "text/plain": [ 46 | "((89806693, 5), (79288375, 4))" 47 | ] 48 | }, 49 | "execution_count": 3, 50 | "metadata": {}, 51 | "output_type": "execute_result" 52 | } 53 | ], 54 | "source": [ 55 | "train.shape,test.shape" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 4, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "unique_api = train['api'].unique()" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 5, 70 | "metadata": {}, 71 | "outputs": [ 72 | { 73 | "data": { 74 | "text/plain": [ 75 | "(295,)" 76 | ] 77 | }, 78 | "execution_count": 5, 79 | "metadata": {}, 80 | "output_type": "execute_result" 81 | } 82 | ], 83 | "source": [ 84 | "unique_api.shape" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 6, 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | "api2index = {item:(i+1) for i,item in enumerate(unique_api)}\n", 94 | "index2api = {(i+1):item for i,item in enumerate(unique_api)}" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 7, 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "train['api_idx'] = train['api'].map(api2index)" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 8, 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | "test['api_idx'] = test['api'].map(api2index)" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 9, 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [ 121 | "train_period_idx = train.file_id.drop_duplicates(keep='first').index.values\n", 122 | "test_period_idx = 
test.file_id.drop_duplicates(keep='first').index.values" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": 10, 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [ 131 | "def get_sequence(df,period_idx):\n", 132 | " seq_list = []\n", 133 | " for _id,begin in enumerate(period_idx[:-1]):\n", 134 | " seq_list.append(df.iloc[begin:period_idx[_id+1]]['api_idx'].values)\n", 135 | " seq_list.append(df.iloc[period_idx[-1]:]['api_idx'].values)\n", 136 | " return seq_list" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 11, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "train_df = train[['file_id','label']].drop_duplicates(keep='first')\n", 146 | "test_df = test[['file_id']].drop_duplicates(keep='first')" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": 12, 152 | "metadata": {}, 153 | "outputs": [], 154 | "source": [ 155 | "train_df['seq'] = get_sequence(train,train_period_idx)" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 13, 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "test_df['seq'] = get_sequence(test,test_period_idx)" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": 14, 170 | "metadata": {}, 171 | "outputs": [ 172 | { 173 | "data": { 174 | "text/plain": [ 175 | "(19350.97816934013, 6466.961402750774, 888204)" 176 | ] 177 | }, 178 | "execution_count": 14, 179 | "metadata": {}, 180 | "output_type": "execute_result" 181 | } 182 | ], 183 | "source": [ 184 | "train_df.seq.map(lambda x: len(x)).std(),train_df.seq.map(lambda x: len(x)).mean(),train_df.seq.map(lambda x: len(x)).max()" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": 15, 190 | "metadata": {}, 191 | "outputs": [ 192 | { 193 | "data": { 194 | "text/plain": [ 195 | "(15911.676663585444, 6120.291393284446, 769590)" 196 | ] 197 | }, 198 | "execution_count": 15, 199 | "metadata": {}, 200 | "output_type": "execute_result" 201 | } 202 | ], 203 | "source": [ 204 | "test_df.seq.map(lambda x: len(x)).std(),test_df.seq.map(lambda x: len(x)).mean(),test_df.seq.map(lambda x: len(x)).max()" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": 16, 210 | "metadata": {}, 211 | "outputs": [ 212 | { 213 | "name": "stderr", 214 | "output_type": "stream", 215 | "text": [ 216 | "/home/enjoy/anaconda3/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88\n", 217 | " return f(*args, **kwds)\n", 218 | "/home/enjoy/anaconda3/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. 
In future, it will be treated as `np.float64 == np.dtype(float).type`.\n", 219 | " from ._conv import register_converters as _register_converters\n", 220 | "Using TensorFlow backend.\n" 221 | ] 222 | } 223 | ], 224 | "source": [ 225 | "from keras.preprocessing.text import Tokenizer\n", 226 | "from keras.preprocessing.sequence import pad_sequences\n", 227 | "from keras.layers import Dense, Input, LSTM, Lambda, Embedding, Dropout, Activation,GRU,Bidirectional\n", 228 | "from keras.layers import Conv1D,Conv2D,MaxPooling2D,GlobalAveragePooling1D,GlobalMaxPooling1D, MaxPooling1D, Flatten\n", 229 | "from keras.layers import CuDNNGRU, CuDNNLSTM, SpatialDropout1D\n", 230 | "from keras.layers.merge import concatenate, Concatenate, Average, Dot, Maximum, Multiply, Subtract, average\n", 231 | "from keras.models import Model\n", 232 | "from keras.optimizers import RMSprop,Adam\n", 233 | "from keras.layers.normalization import BatchNormalization\n", 234 | "from keras.callbacks import EarlyStopping, ModelCheckpoint\n", 235 | "from keras.optimizers import SGD\n", 236 | "from keras import backend as K\n", 237 | "from sklearn.decomposition import TruncatedSVD, NMF, LatentDirichletAllocation\n", 238 | "from keras.layers import SpatialDropout1D\n", 239 | "from keras.layers.wrappers import Bidirectional" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": 17, 245 | "metadata": {}, 246 | "outputs": [], 247 | "source": [ 248 | "def TextCNN(max_len,max_cnt,embed_size,\n", 249 | " num_filters,kernel_size,\n", 250 | " conv_action,\n", 251 | " mask_zero):\n", 252 | " _input = Input(shape=(max_len,), dtype='int32')\n", 253 | " _embed = Embedding(max_cnt, embed_size, input_length=max_len, mask_zero=mask_zero)(_input)\n", 254 | " _embed = SpatialDropout1D(0.15)(_embed)\n", 255 | " wrappers = []\n", 256 | " for _kernel_size in kernel_size:\n", 257 | " conv1d = Conv1D(filters=num_filters, kernel_size=_kernel_size, activation=conv_action)(_embed)\n", 258 | " wrappers.append(GlobalMaxPooling1D()(conv1d))\n", 259 | " \n", 260 | " fc = concatenate(wrappers)\n", 261 | " fc = Dropout(0.5)(fc)\n", 262 | " #fc = BatchNormalization()(fc)\n", 263 | " fc = Dense(256, activation='relu')(fc)\n", 264 | " fc = Dropout(0.25)(fc)\n", 265 | " #fc = BatchNormalization()(fc) \n", 266 | " preds = Dense(8, activation = 'softmax')(fc)\n", 267 | " \n", 268 | " model = Model(inputs=_input, outputs=preds)\n", 269 | " \n", 270 | " model.compile(loss='categorical_crossentropy',\n", 271 | " optimizer='adam',\n", 272 | " metrics=['accuracy'])\n", 273 | " return model" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": 18, 279 | "metadata": {}, 280 | "outputs": [], 281 | "source": [ 282 | "train_labels = pd.get_dummies(train_df.label).values\n", 283 | "train_seq = pad_sequences(train_df.seq.values, maxlen = 6000)\n", 284 | "test_seq = pad_sequences(test_df.seq.values, maxlen = 6000)" 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": 20, 290 | "metadata": {}, 291 | "outputs": [], 292 | "source": [ 293 | "from sklearn.cross_validation import StratifiedKFold  # this import was missing in the original notebook\nskf = StratifiedKFold(train_df.label.values, n_folds=5, shuffle=True, random_state=42)  # stratify on 1-D class labels: the old API cannot split on the one-hot matrix" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": 21, 299 | "metadata": {}, 300 | "outputs": [], 301 | "source": [ 302 | "max_len = 6000\n", 303 | "max_cnt = 295\n", 304 | "embed_size = 256\n", 305 | "num_filters = 64\n", 306 | "kernel_size = [2,4,6,8,10,12,14]\n", 307 | "conv_action = 'relu'\n", 308 | "mask_zero = False\n", 309 | "TRAIN = True" 310 | ] 311 | }, 312 | { 
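A note on the `TextCNN` builder above: the `api2index` vocabulary runs from 1 to 295 with 0 reserved for padding, so the embedding table arguably needs `max_cnt + 1` rows, while the notebook passes `max_cnt` (the era's Keras/TF GPU path tolerated the out-of-range lookup; current frameworks generally reject it). The cell also uses Keras 2.1-era import paths such as `keras.layers.merge` and `keras.layers.normalization`. For reference, a minimal sketch of the same multi-kernel architecture on `tensorflow.keras`, assuming TF 2.x is available; the `+1` vocabulary fix and the tiny smoke-test shapes are my additions, not the repo's:

```python
# Hedged sketch: the notebook's multi-kernel TextCNN ported to tf.keras (TF 2.x).
# Defaults mirror the notebook's config; the +1 vocab row is an assumed fix.
import numpy as np
from tensorflow.keras import layers, Model

def text_cnn(max_len=6000, max_cnt=295, embed_size=256,
             num_filters=64, kernel_sizes=(2, 4, 6, 8, 10, 12, 14)):
    inp = layers.Input(shape=(max_len,), dtype='int32')
    x = layers.Embedding(max_cnt + 1, embed_size)(inp)   # +1: index 0 is the pad token
    x = layers.SpatialDropout1D(0.15)(x)
    # One Conv1D branch per kernel size; global max pooling keeps each filter's
    # strongest activation regardless of where it fires in the call sequence.
    pooled = [layers.GlobalMaxPooling1D()(layers.Conv1D(num_filters, k, activation='relu')(x))
              for k in kernel_sizes]
    fc = layers.Concatenate()(pooled)
    fc = layers.Dropout(0.5)(fc)
    fc = layers.Dense(256, activation='relu')(fc)
    fc = layers.Dropout(0.25)(fc)
    out = layers.Dense(8, activation='softmax')(fc)      # 8 malware classes
    model = Model(inp, out)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

model = text_cnn(max_len=100)                            # small max_len just to smoke-test
print(model(np.zeros((2, 100), dtype='int32')).shape)    # (2, 8)
```

The global max pooling is what makes the classifier insensitive to where in the padded 6000-call window a discriminative run of API calls occurs.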
313 | "cell_type": "code", 314 | "execution_count": 25, 315 | "metadata": { 316 | "scrolled": false 317 | }, 318 | "outputs": [ 319 | { 320 | "name": "stdout", 321 | "output_type": "stream", 322 | "text": [ 323 | "FOLD: \n", 324 | "2780 11107\n", 325 | "2780/2780 [==============================] - 5s 2ms/step\n", 326 | "12955/12955 [==============================] - 18s 1ms/step\n", 327 | "FOLD: \n", 328 | "2779 11108\n", 329 | "2779/2779 [==============================] - 4s 2ms/step\n", 330 | "12955/12955 [==============================] - 18s 1ms/step\n", 331 | "FOLD: \n", 332 | "2777 11110\n", 333 | "2777/2777 [==============================] - 4s 2ms/step\n", 334 | "12955/12955 [==============================] - 18s 1ms/step\n", 335 | "FOLD: \n", 336 | "2776 11111\n", 337 | "2776/2776 [==============================] - 4s 2ms/step\n", 338 | "12955/12955 [==============================] - 18s 1ms/step\n", 339 | "FOLD: \n", 340 | "2775 11112\n", 341 | "2775/2775 [==============================] - 5s 2ms/step\n", 342 | "12955/12955 [==============================] - 19s 1ms/step\n" 343 | ] 344 | } 345 | ], 346 | "source": [ 347 | "import os\n", 348 | "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\"\n", 349 | "meta_train = np.zeros(shape = (len(train_seq),8))\n", 350 | "meta_test = np.zeros(shape = (len(test_seq),8))\n", 351 | "FLAG = False\n", 352 | "for i,(tr_ind,te_ind) in enumerate(skf):\n", 353 | " print('FOLD: '.format(i))\n", 354 | " print(len(te_ind),len(tr_ind))\n", 355 | " model = TextCNN(max_len,max_cnt,embed_size,num_filters,kernel_size,conv_action,mask_zero)\n", 356 | " model_name = 'benchmark_textcnn_fold_'+str(i)\n", 357 | " X_train,X_train_label = train_seq[tr_ind],train_labels[tr_ind]\n", 358 | " X_val,X_val_label = train_seq[te_ind],train_labels[te_ind]\n", 359 | " \n", 360 | " model = TextCNN(max_len,max_cnt,embed_size,\n", 361 | " num_filters,kernel_size,\n", 362 | " conv_action,\n", 363 | " mask_zero)\n", 364 | " \n", 365 | " model_save_path = '../model_weight_final/%s_%s.hdf5'%(model_name,embed_size)\n", 366 | " early_stopping =EarlyStopping(monitor='val_loss', patience=3)\n", 367 | " model_checkpoint = ModelCheckpoint(model_save_path, save_best_only=True, save_weights_only=True)\n", 368 | " if TRAIN and FLAG:\n", 369 | " model.fit(X_train,X_train_label,\n", 370 | " validation_data=(X_val,X_val_label),\n", 371 | " epochs=100,batch_size=64,\n", 372 | " shuffle=True,\n", 373 | " callbacks=[early_stopping,model_checkpoint]\n", 374 | " )\n", 375 | " \n", 376 | " model.load_weights(model_save_path)\n", 377 | " pred_val = model.predict(X_val,batch_size=128,verbose=1)\n", 378 | " pred_test = model.predict(test_seq,batch_size=128,verbose=1)\n", 379 | " \n", 380 | " meta_train[te_ind] = pred_val\n", 381 | " meta_test += pred_test\n", 382 | " K.clear_session()\n", 383 | "meta_test /= 5.0\n" 384 | ] 385 | }, 386 | { 387 | "cell_type": "code", 388 | "execution_count": 37, 389 | "metadata": {}, 390 | "outputs": [], 391 | "source": [ 392 | "pd.to_pickle(meta_train,'../train_meta_cnn.pkl')\n", 393 | "pd.to_pickle(meta_test,'../test_meta_cnn.pkl')" 394 | ] 395 | }, 396 | { 397 | "cell_type": "code", 398 | "execution_count": 38, 399 | "metadata": {}, 400 | "outputs": [ 401 | { 402 | "data": { 403 | "text/plain": [ 404 | "'/home/enjoy/tianchi/安全赛复赛/src'" 405 | ] 406 | }, 407 | "execution_count": 38, 408 | "metadata": {}, 409 | "output_type": "execute_result" 410 | } 411 | ], 412 | "source": [ 413 | "%pwd" 414 | ] 415 | }, 416 | { 417 | "cell_type": "code", 418 | "execution_count": null, 
419 | "metadata": {}, 420 | "outputs": [], 421 | "source": [] 422 | } 423 | ], 424 | "metadata": { 425 | "kernelspec": { 426 | "display_name": "Python 3", 427 | "language": "python", 428 | "name": "python3" 429 | }, 430 | "language_info": { 431 | "codemirror_mode": { 432 | "name": "ipython", 433 | "version": 3 434 | }, 435 | "file_extension": ".py", 436 | "mimetype": "text/x-python", 437 | "name": "python", 438 | "nbconvert_exporter": "python", 439 | "pygments_lexer": "ipython3", 440 | "version": "3.6.5" 441 | } 442 | }, 443 | "nbformat": 4, 444 | "nbformat_minor": 2 445 | } 446 | -------------------------------------------------------------------------------- /.ipynb_checkpoints/CNN_metafeature_dilated-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stderr", 10 | "output_type": "stream", 11 | "text": [ 12 | "/home/enjoy/anaconda3/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88\n", 13 | " return f(*args, **kwds)\n", 14 | "/home/enjoy/anaconda3/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88\n", 15 | " return f(*args, **kwds)\n" 16 | ] 17 | } 18 | ], 19 | "source": [ 20 | "import pandas as pd\n", 21 | "import numpy as np\n", 22 | "import os\n", 23 | "from tqdm import tqdm\n", 24 | "from sklearn.preprocessing import LabelBinarizer,LabelEncoder" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 2, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "path = '../input/'\n", 34 | "train = pd.read_csv(path + 'final_train.csv')\n", 35 | "test = pd.read_csv(path + 'final_test.csv')" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 3, 41 | "metadata": {}, 42 | "outputs": [ 43 | { 44 | "data": { 45 | "text/plain": [ 46 | "((89806693, 5), (79288375, 4))" 47 | ] 48 | }, 49 | "execution_count": 3, 50 | "metadata": {}, 51 | "output_type": "execute_result" 52 | } 53 | ], 54 | "source": [ 55 | "train.shape,test.shape" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 4, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "unique_api = train['api'].unique()" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 5, 70 | "metadata": {}, 71 | "outputs": [ 72 | { 73 | "data": { 74 | "text/plain": [ 75 | "(295,)" 76 | ] 77 | }, 78 | "execution_count": 5, 79 | "metadata": {}, 80 | "output_type": "execute_result" 81 | } 82 | ], 83 | "source": [ 84 | "unique_api.shape" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 6, 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | "api2index = {item:(i+1) for i,item in enumerate(unique_api)}\n", 94 | "index2api = {(i+1):item for i,item in enumerate(unique_api)}" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 7, 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "train['api_idx'] = train['api'].map(api2index)" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 8, 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | "test['api_idx'] = test['api'].map(api2index)" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 9, 118 | "metadata": {}, 119 | "outputs": [], 120 | 
"source": [ 121 | "train_period_idx = train.file_id.drop_duplicates(keep='first').index.values\n", 122 | "test_period_idx = test.file_id.drop_duplicates(keep='first').index.values" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": 10, 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [ 131 | "def get_sequence(df,period_idx):\n", 132 | " seq_list = []\n", 133 | " for _id,begin in enumerate(period_idx[:-1]):\n", 134 | " seq_list.append(df.iloc[begin:period_idx[_id+1]]['api_idx'].values)\n", 135 | " seq_list.append(df.iloc[period_idx[-1]:]['api_idx'].values)\n", 136 | " return seq_list" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 11, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "train_df = train[['file_id','label']].drop_duplicates(keep='first')\n", 146 | "test_df = test[['file_id']].drop_duplicates(keep='first')" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": 12, 152 | "metadata": {}, 153 | "outputs": [], 154 | "source": [ 155 | "train_df['seq'] = get_sequence(train,train_period_idx)" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 13, 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "test_df['seq'] = get_sequence(test,test_period_idx)" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": 14, 170 | "metadata": {}, 171 | "outputs": [ 172 | { 173 | "data": { 174 | "text/plain": [ 175 | "(19350.97816934013, 6466.961402750774, 888204)" 176 | ] 177 | }, 178 | "execution_count": 14, 179 | "metadata": {}, 180 | "output_type": "execute_result" 181 | } 182 | ], 183 | "source": [ 184 | "train_df.seq.map(lambda x: len(x)).std(),train_df.seq.map(lambda x: len(x)).mean(),train_df.seq.map(lambda x: len(x)).max()" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": 15, 190 | "metadata": {}, 191 | "outputs": [ 192 | { 193 | "data": { 194 | "text/plain": [ 195 | "(15911.676663585444, 6120.291393284446, 769590)" 196 | ] 197 | }, 198 | "execution_count": 15, 199 | "metadata": {}, 200 | "output_type": "execute_result" 201 | } 202 | ], 203 | "source": [ 204 | "test_df.seq.map(lambda x: len(x)).std(),test_df.seq.map(lambda x: len(x)).mean(),test_df.seq.map(lambda x: len(x)).max()" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": 16, 210 | "metadata": {}, 211 | "outputs": [ 212 | { 213 | "name": "stderr", 214 | "output_type": "stream", 215 | "text": [ 216 | "/home/enjoy/anaconda3/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88\n", 217 | " return f(*args, **kwds)\n", 218 | "/home/enjoy/anaconda3/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. 
In future, it will be treated as `np.float64 == np.dtype(float).type`.\n", 219 | " from ._conv import register_converters as _register_converters\n", 220 | "Using TensorFlow backend.\n" 221 | ] 222 | } 223 | ], 224 | "source": [ 225 | "from keras.preprocessing.text import Tokenizer\n", 226 | "from keras.preprocessing.sequence import pad_sequences\n", 227 | "from keras.layers import Dense, Input, LSTM, Lambda, Embedding, Dropout, Activation,GRU,Bidirectional\n", 228 | "from keras.layers import Conv1D,Conv2D,MaxPooling2D,GlobalAveragePooling1D,GlobalMaxPooling1D, MaxPooling1D, Flatten\n", 229 | "from keras.layers import CuDNNGRU, CuDNNLSTM, SpatialDropout1D\n", 230 | "from keras.layers.merge import concatenate, Concatenate, Average, Dot, Maximum, Multiply, Subtract, average\n", 231 | "from keras.models import Model\n", 232 | "from keras.optimizers import RMSprop,Adam\n", 233 | "from keras.layers.normalization import BatchNormalization\n", 234 | "from keras.callbacks import EarlyStopping, ModelCheckpoint\n", 235 | "from keras.optimizers import SGD\n", 236 | "from keras import backend as K\n", 237 | "from sklearn.decomposition import TruncatedSVD, NMF, LatentDirichletAllocation\n", 238 | "from keras.layers import SpatialDropout1D\n", 239 | "from keras.layers.wrappers import Bidirectional" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": 17, 245 | "metadata": {}, 246 | "outputs": [], 247 | "source": [ 248 | "def TextCNN(max_len,max_cnt,embed_size,\n", 249 | " num_filters,kernel_size,\n", 250 | " conv_action,\n", 251 | " mask_zero):\n", 252 | " _input = Input(shape=(max_len,), dtype='int32')\n", 253 | " _embed = Embedding(max_cnt, embed_size, input_length=max_len, mask_zero=mask_zero)(_input)\n", 254 | " _embed = SpatialDropout1D(0.25)(_embed)\n", 255 | " wrappers = []\n", 256 | " for _kernel_size in kernel_size:\n", 257 | " for dilated_rate in [1,2,3,4]:\n", 258 | " conv1d = Conv1D(filters=num_filters, kernel_size=_kernel_size, activation=conv_action, dilation_rate=dilated_rate)(_embed)\n", 259 | " wrappers.append(GlobalMaxPooling1D()(conv1d))\n", 260 | " \n", 261 | " fc = concatenate(wrappers)\n", 262 | " fc = Dropout(0.5)(fc)\n", 263 | " #fc = BatchNormalization()(fc)\n", 264 | " fc = Dense(256, activation='relu')(fc)\n", 265 | " fc = Dropout(0.25)(fc)\n", 266 | " #fc = BatchNormalization()(fc) \n", 267 | " preds = Dense(8, activation = 'softmax')(fc)\n", 268 | " \n", 269 | " model = Model(inputs=_input, outputs=preds)\n", 270 | " \n", 271 | " model.compile(loss='categorical_crossentropy',\n", 272 | " optimizer='adam',\n", 273 | " metrics=['accuracy'])\n", 274 | " return model" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": 18, 280 | "metadata": {}, 281 | "outputs": [], 282 | "source": [ 283 | "train_labels = pd.get_dummies(train_df.label).values\n", 284 | "train_seq = pad_sequences(train_df.seq.values, maxlen = 6000)\n", 285 | "test_seq = pad_sequences(test_df.seq.values, maxlen = 6000)" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": 20, 291 | "metadata": {}, 292 | "outputs": [], 293 | "source": [ 294 | "from sklearn.cross_validation import StratifiedKFold  # this import was missing in the original notebook\nskf = StratifiedKFold(train_df.label.values, n_folds=5, shuffle=True, random_state=42)  # stratify on 1-D class labels: the old API cannot split on the one-hot matrix" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": 21, 300 | "metadata": {}, 301 | "outputs": [], 302 | "source": [ 303 | "max_len = 6000\n", 304 | "max_cnt = 295\n", 305 | "embed_size = 256\n", 306 | "num_filters = 64\n", 307 | "kernel_size = [2,3,4,5]\n", 308 | "conv_action = 'relu'\n", 309 | 
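This checkpoint differs from `CNN_metafeature-checkpoint.ipynb` only in the convolution block: each kernel size spawns four branches with dilation rates 1 through 4, and a dilated kernel of size k at rate d spans d*(k-1)+1 sequence positions while still using only k weights. A small sketch of that receptive-field arithmetic for the grid used here:

```python
# Hedged sketch: receptive field of a single dilated Conv1D layer,
# span = dilation * (kernel_size - 1) + 1, over this checkpoint's grid.
for k in [2, 3, 4, 5]:          # kernel sizes used in this notebook
    for d in [1, 2, 3, 4]:      # dilation rates used in this notebook
        span = d * (k - 1) + 1
        print('kernel=%d dilation=%d -> sees %d consecutive API calls' % (k, d, span))
```

So the size-5, rate-4 branch sees 17 consecutive API calls, wider than the non-dilated notebook's largest kernel (14) at roughly a third of the weights.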
"mask_zero = False\n", 310 | "TRAIN = True" 311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": 22, 316 | "metadata": {}, 317 | "outputs": [ 318 | { 319 | "name": "stdout", 320 | "output_type": "stream", 321 | "text": [ 322 | "FOLD: \n", 323 | "2780 11107\n", 324 | "FOLD: \n", 325 | "2779 11108\n", 326 | "FOLD: \n", 327 | "2777 11110\n", 328 | "FOLD: \n", 329 | "2776 11111\n", 330 | "Train on 11111 samples, validate on 2776 samples\n", 331 | "Epoch 1/100\n", 332 | "11111/11111 [==============================] - 142s 13ms/step - loss: 0.9257 - acc: 0.6915 - val_loss: 0.4994 - val_acc: 0.8505\n", 333 | "Epoch 2/100\n", 334 | "11111/11111 [==============================] - 116s 10ms/step - loss: 0.5334 - acc: 0.8335 - val_loss: 0.4226 - val_acc: 0.8689\n", 335 | "Epoch 3/100\n", 336 | "11111/11111 [==============================] - 113s 10ms/step - loss: 0.4632 - acc: 0.8550 - val_loss: 0.3850 - val_acc: 0.8761\n", 337 | "Epoch 4/100\n", 338 | "11111/11111 [==============================] - 113s 10ms/step - loss: 0.4105 - acc: 0.8701 - val_loss: 0.3808 - val_acc: 0.8754\n", 339 | "Epoch 5/100\n", 340 | "11111/11111 [==============================] - 113s 10ms/step - loss: 0.3784 - acc: 0.8763 - val_loss: 0.3663 - val_acc: 0.8829\n", 341 | "Epoch 6/100\n", 342 | "11111/11111 [==============================] - 113s 10ms/step - loss: 0.3536 - acc: 0.8840 - val_loss: 0.3467 - val_acc: 0.8872\n", 343 | "Epoch 7/100\n", 344 | "11111/11111 [==============================] - 113s 10ms/step - loss: 0.3420 - acc: 0.8903 - val_loss: 0.3426 - val_acc: 0.8909\n", 345 | "Epoch 8/100\n", 346 | "11111/11111 [==============================] - 113s 10ms/step - loss: 0.3284 - acc: 0.8941 - val_loss: 0.3377 - val_acc: 0.8945\n", 347 | "Epoch 9/100\n", 348 | "11111/11111 [==============================] - 113s 10ms/step - loss: 0.3133 - acc: 0.8936 - val_loss: 0.3380 - val_acc: 0.8945\n", 349 | "Epoch 10/100\n", 350 | "11111/11111 [==============================] - 113s 10ms/step - loss: 0.3034 - acc: 0.8971 - val_loss: 0.3415 - val_acc: 0.8923\n", 351 | "Epoch 11/100\n", 352 | "11111/11111 [==============================] - 113s 10ms/step - loss: 0.2916 - acc: 0.9007 - val_loss: 0.3232 - val_acc: 0.8995\n", 353 | "Epoch 12/100\n", 354 | "11111/11111 [==============================] - 113s 10ms/step - loss: 0.2765 - acc: 0.9058 - val_loss: 0.3402 - val_acc: 0.8934\n", 355 | "Epoch 13/100\n", 356 | "11111/11111 [==============================] - 113s 10ms/step - loss: 0.2657 - acc: 0.9086 - val_loss: 0.3294 - val_acc: 0.8984\n", 357 | "Epoch 14/100\n", 358 | "11111/11111 [==============================] - 113s 10ms/step - loss: 0.2620 - acc: 0.9079 - val_loss: 0.3411 - val_acc: 0.8977\n", 359 | "FOLD: \n", 360 | "2775 11112\n", 361 | "Train on 11112 samples, validate on 2775 samples\n", 362 | "Epoch 1/100\n", 363 | "11112/11112 [==============================] - 116s 10ms/step - loss: 0.9019 - acc: 0.7001 - val_loss: 0.4956 - val_acc: 0.8436\n", 364 | "Epoch 2/100\n", 365 | "11112/11112 [==============================] - 113s 10ms/step - loss: 0.5189 - acc: 0.8322 - val_loss: 0.4210 - val_acc: 0.8695\n", 366 | "Epoch 3/100\n", 367 | "11112/11112 [==============================] - 113s 10ms/step - loss: 0.4525 - acc: 0.8543 - val_loss: 0.3906 - val_acc: 0.8778\n", 368 | "Epoch 4/100\n", 369 | "11112/11112 [==============================] - 113s 10ms/step - loss: 0.4038 - acc: 0.8721 - val_loss: 0.3832 - val_acc: 0.8674\n", 370 | "Epoch 5/100\n", 371 | "11112/11112 
[==============================] - 113s 10ms/step - loss: 0.3802 - acc: 0.8790 - val_loss: 0.3687 - val_acc: 0.8836\n", 372 | "Epoch 6/100\n", 373 | "11112/11112 [==============================] - 113s 10ms/step - loss: 0.3563 - acc: 0.8813 - val_loss: 0.3739 - val_acc: 0.8807\n", 374 | "Epoch 7/100\n", 375 | "11112/11112 [==============================] - 113s 10ms/step - loss: 0.3277 - acc: 0.8909 - val_loss: 0.3597 - val_acc: 0.8840\n", 376 | "Epoch 8/100\n", 377 | "11112/11112 [==============================] - 113s 10ms/step - loss: 0.3239 - acc: 0.8935 - val_loss: 0.3534 - val_acc: 0.8901\n", 378 | "Epoch 9/100\n", 379 | "11112/11112 [==============================] - 113s 10ms/step - loss: 0.3061 - acc: 0.8954 - val_loss: 0.3581 - val_acc: 0.8861\n", 380 | "Epoch 10/100\n", 381 | "11112/11112 [==============================] - 113s 10ms/step - loss: 0.2973 - acc: 0.8994 - val_loss: 0.3528 - val_acc: 0.8901\n", 382 | "Epoch 11/100\n", 383 | "11112/11112 [==============================] - 113s 10ms/step - loss: 0.2875 - acc: 0.9035 - val_loss: 0.3537 - val_acc: 0.8847\n", 384 | "Epoch 12/100\n", 385 | "11112/11112 [==============================] - 113s 10ms/step - loss: 0.2736 - acc: 0.9060 - val_loss: 0.3596 - val_acc: 0.8908\n", 386 | "Epoch 13/100\n", 387 | "11112/11112 [==============================] - 113s 10ms/step - loss: 0.2613 - acc: 0.9078 - val_loss: 0.3521 - val_acc: 0.8908\n", 388 | "Epoch 14/100\n", 389 | "11112/11112 [==============================] - 113s 10ms/step - loss: 0.2639 - acc: 0.9055 - val_loss: 0.3457 - val_acc: 0.8926\n", 390 | "Epoch 15/100\n", 391 | "11112/11112 [==============================] - 113s 10ms/step - loss: 0.2514 - acc: 0.9121 - val_loss: 0.3702 - val_acc: 0.8865\n", 392 | "Epoch 16/100\n", 393 | "11112/11112 [==============================] - 113s 10ms/step - loss: 0.2497 - acc: 0.9112 - val_loss: 0.3684 - val_acc: 0.8905\n", 394 | "Epoch 17/100\n", 395 | "11112/11112 [==============================] - 113s 10ms/step - loss: 0.2366 - acc: 0.9147 - val_loss: 0.3700 - val_acc: 0.8908\n" 396 | ] 397 | } 398 | ], 399 | "source": [ 400 | "import os\n", 401 | "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"3\"\n", 402 | "meta_train = np.zeros(shape = (len(train_seq),8))\n", 403 | "meta_test = np.zeros(shape = (len(test_seq),8))\n", 404 | "FLAG = False\n", 405 | "for i,(tr_ind,te_ind) in enumerate(skf):\n", 406 | " if i in [3,4]:\n", 407 | " FLAG = True\n", 408 | " print('FOLD: {}'.format(i))\n", 409 | " print(len(te_ind),len(tr_ind))\n", 410 | " model = TextCNN(max_len,max_cnt,embed_size,num_filters,kernel_size,conv_action,mask_zero)\n", 411 | " model_name = 'benchmark_dilated_textcnn_fold_'+str(i)\n", 412 | " X_train,X_train_label = train_seq[tr_ind],train_labels[tr_ind]\n", 413 | " X_val,X_val_label = train_seq[te_ind],train_labels[te_ind]\n", 414 | " \n", 415 | " model = TextCNN(max_len,max_cnt,embed_size,\n", 416 | " num_filters,kernel_size,\n", 417 | " conv_action,\n", 418 | " mask_zero)\n", 419 | " \n", 420 | " model_save_path = '../model_weight_final/%s_%s.hdf5'%(model_name,embed_size)\n", 421 | " early_stopping = EarlyStopping(monitor='val_loss', patience=3)\n", 422 | " model_checkpoint = ModelCheckpoint(model_save_path, save_best_only=True, save_weights_only=True)\n", 423 | " if TRAIN and FLAG:\n", 424 | " model.fit(X_train,X_train_label,\n", 425 | " validation_data=(X_val,X_val_label),\n", 426 | " epochs=100,batch_size=64,\n", 427 | " shuffle=True,\n", 428 | " callbacks=[early_stopping,model_checkpoint]\n", 429 | " )\n", 430 | " \n", 431 | " "    
#model.load_weights(model_save_path)\n", 432 | " #pred_val = model.predict(X_val,batch_size=128)\n", 433 | " #pred_test = model.predict(test_seq,batch_size=128)\n", 434 | " \n", 435 | " #meta_train[te_ind] = pred_val\n", 436 | " #meta_test += pred_test\n", 437 | " FLAG = False\n", 438 | " #K.clear_session()\n", 439 | "#meta_test /= 5.0\n" 440 | ] 441 | }, 442 | { 443 | "cell_type": "code", 444 | "execution_count": null, 445 | "metadata": {}, 446 | "outputs": [], 447 | "source": [ 448 | "pd.to_pickle(meta_train,'../feature_final/train_meta_dilated_cnn.pkl')\n", 449 | "pd.to_pickle(meta_test,'../feature_final/test_meta_dilated_cnn.pkl')" 450 | ] 451 | }, 452 | { 453 | "cell_type": "code", 454 | "execution_count": null, 455 | "metadata": {}, 456 | "outputs": [], 457 | "source": [ 458 | "print('1322')" 459 | ] 460 | }, 461 | { 462 | "cell_type": "code", 463 | "execution_count": null, 464 | "metadata": {}, 465 | "outputs": [], 466 | "source": [] 467 | } 468 | ], 469 | "metadata": { 470 | "kernelspec": { 471 | "display_name": "Python 3", 472 | "language": "python", 473 | "name": "python3" 474 | }, 475 | "language_info": { 476 | "codemirror_mode": { 477 | "name": "ipython", 478 | "version": 3 479 | }, 480 | "file_extension": ".py", 481 | "mimetype": "text/x-python", 482 | "name": "python", 483 | "nbconvert_exporter": "python", 484 | "pygments_lexer": "ipython3", 485 | "version": "3.6.5" 486 | } 487 | }, 488 | "nbformat": 4, 489 | "nbformat_minor": 2 490 | } 491 | -------------------------------------------------------------------------------- /.ipynb_checkpoints/gene_npy-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "import numpy as np" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 4, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "test_data_2gram_final = pd.read_csv('./test_data_2gram_final.csv')\n", 20 | "train_data_2gram_final = pd.read_csv('./train_data_2gram_final.csv')" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 6, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "cols = [item for item in train_data_2gram_final.columns if item not in ['label']]\n", 30 | "np.save('../X_test.npy',test_data_2gram_final[cols].values)\n", 31 | "np.save('../X_train.npy',train_data_2gram_final[cols].values)\n", 32 | "np.save('../labels.npy',train_data_2gram_final['label'].values)" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 8, 38 | "metadata": {}, 39 | "outputs": [ 40 | { 41 | "data": { 42 | "text/plain": [ 43 | "((13887, 3252), (12955, 3251))" 44 | ] 45 | }, 46 | "execution_count": 8, 47 | "metadata": {}, 48 | "output_type": "execute_result" 49 | } 50 | ], 51 | "source": [ 52 | "train_data_2gram_final.shape,test_data_2gram_final.shape" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [] 61 | } 62 | ], 63 | "metadata": { 64 | "kernelspec": { 65 | "display_name": "Python 3", 66 | "language": "python", 67 | "name": "python3" 68 | }, 69 | "language_info": { 70 | "codemirror_mode": { 71 | "name": "ipython", 72 | "version": 3 73 | }, 74 | "file_extension": ".py", 75 | "mimetype": "text/x-python", 76 | "name": "python", 77 | "nbconvert_exporter": "python", 78 | "pygments_lexer": "ipython3", 79 | "version": 
"3.6.5" 80 | } 81 | }, 82 | "nbformat": 4, 83 | "nbformat_minor": 2 84 | } 85 | -------------------------------------------------------------------------------- /.ipynb_checkpoints/lgb_meta_features-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stderr", 10 | "output_type": "stream", 11 | "text": [ 12 | "/home/user/anaconda2/lib/python2.7/site-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n", 13 | " \"This module will be removed in 0.20.\", DeprecationWarning)\n" 14 | ] 15 | } 16 | ], 17 | "source": [ 18 | "import pandas as pd\n", 19 | "import numpy as np\n", 20 | "import lightgbm as lgb\n", 21 | "from sklearn.cross_validation import train_test_split\n", 22 | "import gc\n", 23 | "from sklearn.preprocessing import OneHotEncoder\n", 24 | "from sklearn.cross_validation import StratifiedKFold\n", 25 | "import datetime" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 4, 31 | "metadata": {}, 32 | "outputs": [ 33 | { 34 | "name": "stdout", 35 | "output_type": "stream", 36 | "text": [ 37 | "cur time = 2018/09/21 18:54:08\n", 38 | "(13887, 3251) (12955, 3251)\n", 39 | "cur time = 2018/09/21 18:54:08\n" 40 | ] 41 | } 42 | ], 43 | "source": [ 44 | "print('cur time = ' + str(datetime.datetime.now().strftime(\"%Y/%m/%d %H:%M:%S\")))\n", 45 | "train = np.load('../X_train.npy')\n", 46 | "test = np.load('../X_test.npy')\n", 47 | "train_labels = np.load('../labels.npy')\n", 48 | "print train.shape,test.shape\n", 49 | "print('cur time = ' + str(datetime.datetime.now().strftime(\"%Y/%m/%d %H:%M:%S\")))" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "skf = StratifiedKFold(train_labels, n_folds=5, shuffle=True, random_state=42)\n", 59 | "\n", 60 | "meta_train = np.zeros(shape = (len(train),8))\n", 61 | "meta_test = np.zeros(shape = (len(test),8))\n", 62 | "\n", 63 | "for i,(tr_ind,te_ind) in enumerate(skf):\n", 64 | " print 'FOLD: ',i\n", 65 | " print len(te_ind),len(tr_ind)\n", 66 | " X_train,X_train_label = train[tr_ind],train_labels[tr_ind]\n", 67 | " X_val,X_val_label = train[te_ind],train_labels[te_ind]\n", 68 | " dtrain = lgb.Dataset(X_train,X_train_label) \n", 69 | " dval = lgb.Dataset(X_val,X_val_label, reference = dtrain) \n", 70 | " params = {\n", 71 | " 'task':'train', \n", 72 | " 'boosting_type':'gbdt',\n", 73 | " 'num_leaves': 15,\n", 74 | " 'objective': 'multiclass',\n", 75 | " 'num_class':8,\n", 76 | " 'learning_rate': 0.05,\n", 77 | " 'feature_fraction': 0.85,\n", 78 | " 'subsample':0.85,\n", 79 | " 'num_threads': 32,\n", 80 | " 'metric':'multi_logloss',\n", 81 | " 'seed':100\n", 82 | " } \n", 83 | " model = lgb.train(params, dtrain, num_boost_round=100000,valid_sets=[dtrain,dval],verbose_eval=100, early_stopping_rounds=100) \n", 84 | " pred_val = model.predict(X_val)\n", 85 | " pred_test = model.predict(test)\n", 86 | " \n", 87 | " meta_train[te_ind] = pred_val\n", 88 | " meta_test += pred_test\n", 89 | "meta_test /= 5.0" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | 
"metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "pd.to_pickle(meta_train,'../train_meta_lgb_1.pkl')\n", 99 | "pd.to_pickle(meta_test,'../test_meta_lgb_1.pkl')" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 5, 105 | "metadata": { 106 | "scrolled": false 107 | }, 108 | "outputs": [ 109 | { 110 | "name": "stdout", 111 | "output_type": "stream", 112 | "text": [ 113 | "FOLD: 0\n", 114 | "2780 11107\n", 115 | "Training until validation scores don't improve for 100 rounds.\n", 116 | "[100]\ttraining's multi_logloss: 0.105693\tvalid_1's multi_logloss: 0.290438\n", 117 | "[200]\ttraining's multi_logloss: 0.0243107\tvalid_1's multi_logloss: 0.28446\n", 118 | "Early stopping, best iteration is:\n", 119 | "[145]\ttraining's multi_logloss: 0.0517928\tvalid_1's multi_logloss: 0.277273\n", 120 | "FOLD: 1\n", 121 | "2779 11108\n", 122 | "Training until validation scores don't improve for 100 rounds.\n", 123 | "[100]\ttraining's multi_logloss: 0.108126\tvalid_1's multi_logloss: 0.284527\n", 124 | "[200]\ttraining's multi_logloss: 0.0254294\tvalid_1's multi_logloss: 0.283195\n", 125 | "Early stopping, best iteration is:\n", 126 | "[139]\ttraining's multi_logloss: 0.0583621\tvalid_1's multi_logloss: 0.273231\n", 127 | "FOLD: 2\n", 128 | "2777 11110\n", 129 | "Training until validation scores don't improve for 100 rounds.\n", 130 | "[100]\ttraining's multi_logloss: 0.107591\tvalid_1's multi_logloss: 0.271276\n", 131 | "[200]\ttraining's multi_logloss: 0.0256978\tvalid_1's multi_logloss: 0.267876\n", 132 | "Early stopping, best iteration is:\n", 133 | "[151]\ttraining's multi_logloss: 0.0490566\tvalid_1's multi_logloss: 0.258754\n", 134 | "FOLD: 3\n", 135 | "2776 11111\n", 136 | "Training until validation scores don't improve for 100 rounds.\n", 137 | "[100]\ttraining's multi_logloss: 0.109872\tvalid_1's multi_logloss: 0.2752\n", 138 | "[200]\ttraining's multi_logloss: 0.0267958\tvalid_1's multi_logloss: 0.266528\n", 139 | "Early stopping, best iteration is:\n", 140 | "[153]\ttraining's multi_logloss: 0.0492415\tvalid_1's multi_logloss: 0.260417\n", 141 | "FOLD: 4\n", 142 | "2775 11112\n", 143 | "Training until validation scores don't improve for 100 rounds.\n", 144 | "[100]\ttraining's multi_logloss: 0.108239\tvalid_1's multi_logloss: 0.286993\n", 145 | "[200]\ttraining's multi_logloss: 0.0260953\tvalid_1's multi_logloss: 0.276078\n", 146 | "Early stopping, best iteration is:\n", 147 | "[155]\ttraining's multi_logloss: 0.0471788\tvalid_1's multi_logloss: 0.270497\n" 148 | ] 149 | } 150 | ], 151 | "source": [ 152 | "skf = StratifiedKFold(train_labels, n_folds=5, shuffle=True, random_state=42)\n", 153 | "\n", 154 | "meta_train = np.zeros(shape = (len(train),8))\n", 155 | "meta_test = np.zeros(shape = (len(test),8))\n", 156 | "\n", 157 | "for i,(tr_ind,te_ind) in enumerate(skf):\n", 158 | " print 'FOLD: ',i\n", 159 | " print len(te_ind),len(tr_ind)\n", 160 | " X_train,X_train_label = train[tr_ind],train_labels[tr_ind]\n", 161 | " X_val,X_val_label = train[te_ind],train_labels[te_ind]\n", 162 | " dtrain = lgb.Dataset(X_train,X_train_label) \n", 163 | " dval = lgb.Dataset(X_val,X_val_label, reference = dtrain) \n", 164 | " params = {\n", 165 | " 'task':'train', \n", 166 | " 'boosting_type':'gbdt',\n", 167 | " 'num_leaves': 31,\n", 168 | " 'objective': 'multiclass',\n", 169 | " 'num_class':8,\n", 170 | " 'learning_rate': 0.05,\n", 171 | " 'feature_fraction': 0.85,\n", 172 | " 'subsample':0.85,\n", 173 | " 'num_threads': 32,\n", 174 | " 'metric':'multi_logloss',\n", 175 | " 
'seed':100\n", 176 | " } \n", 177 | " model = lgb.train(params, dtrain, num_boost_round=100000,valid_sets=[dtrain,dval],verbose_eval=100, early_stopping_rounds=100) \n", 178 | " pred_val = model.predict(X_val)\n", 179 | " pred_test = model.predict(test)\n", 180 | " \n", 181 | " meta_train[te_ind] = pred_val\n", 182 | " meta_test += pred_test\n", 183 | "meta_test /= 5.0" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": 6, 189 | "metadata": {}, 190 | "outputs": [], 191 | "source": [ 192 | "pd.to_pickle(meta_train,'../train_meta_lgb_2.pkl')\n", 193 | "pd.to_pickle(meta_test,'../test_meta_lgb_2.pkl')" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": 7, 199 | "metadata": {}, 200 | "outputs": [ 201 | { 202 | "name": "stdout", 203 | "output_type": "stream", 204 | "text": [ 205 | "FOLD: 0\n", 206 | "2780 11107\n", 207 | "Training until validation scores don't improve for 100 rounds.\n", 208 | "[100]\ttraining's multi_logloss: 0.126813\tvalid_1's multi_logloss: 0.299223\n", 209 | "[200]\ttraining's multi_logloss: 0.0319222\tvalid_1's multi_logloss: 0.278803\n", 210 | "Early stopping, best iteration is:\n", 211 | "[161]\ttraining's multi_logloss: 0.0520005\tvalid_1's multi_logloss: 0.276196\n", 212 | "FOLD: 1\n", 213 | "2779 11108\n", 214 | "Training until validation scores don't improve for 100 rounds.\n", 215 | "[100]\ttraining's multi_logloss: 0.128834\tvalid_1's multi_logloss: 0.292494\n", 216 | "[200]\ttraining's multi_logloss: 0.0332951\tvalid_1's multi_logloss: 0.277843\n", 217 | "Early stopping, best iteration is:\n", 218 | "[153]\ttraining's multi_logloss: 0.0597567\tvalid_1's multi_logloss: 0.272742\n", 219 | "FOLD: 2\n", 220 | "2777 11110\n", 221 | "Training until validation scores don't improve for 100 rounds.\n", 222 | "[100]\ttraining's multi_logloss: 0.128497\tvalid_1's multi_logloss: 0.279648\n", 223 | "[200]\ttraining's multi_logloss: 0.0334364\tvalid_1's multi_logloss: 0.263845\n", 224 | "Early stopping, best iteration is:\n", 225 | "[159]\ttraining's multi_logloss: 0.0551787\tvalid_1's multi_logloss: 0.25859\n", 226 | "FOLD: 3\n", 227 | "2776 11111\n", 228 | "Training until validation scores don't improve for 100 rounds.\n", 229 | "[100]\ttraining's multi_logloss: 0.130386\tvalid_1's multi_logloss: 0.286192\n", 230 | "[200]\ttraining's multi_logloss: 0.0347223\tvalid_1's multi_logloss: 0.263253\n", 231 | "Early stopping, best iteration is:\n", 232 | "[169]\ttraining's multi_logloss: 0.0501232\tvalid_1's multi_logloss: 0.260649\n", 233 | "FOLD: 4\n", 234 | "2775 11112\n", 235 | "Training until validation scores don't improve for 100 rounds.\n", 236 | "[100]\ttraining's multi_logloss: 0.129009\tvalid_1's multi_logloss: 0.296055\n", 237 | "[200]\ttraining's multi_logloss: 0.0340881\tvalid_1's multi_logloss: 0.274158\n", 238 | "Early stopping, best iteration is:\n", 239 | "[173]\ttraining's multi_logloss: 0.0469372\tvalid_1's multi_logloss: 0.272973\n" 240 | ] 241 | } 242 | ], 243 | "source": [ 244 | "skf = StratifiedKFold(train_labels, n_folds=5, shuffle=True, random_state=42)\n", 245 | "\n", 246 | "meta_train = np.zeros(shape = (len(train),8))\n", 247 | "meta_test = np.zeros(shape = (len(test),8))\n", 248 | "\n", 249 | "for i,(tr_ind,te_ind) in enumerate(skf):\n", 250 | " print 'FOLD: ',i\n", 251 | " print len(te_ind),len(tr_ind)\n", 252 | " X_train,X_train_label = train[tr_ind],train_labels[tr_ind]\n", 253 | " X_val,X_val_label = train[te_ind],train_labels[te_ind]\n", 254 | " dtrain = lgb.Dataset(X_train,X_train_label) \n", 
255 | " dval = lgb.Dataset(X_val,X_val_label, reference = dtrain) \n", 256 | " params = {\n", 257 | " 'task':'train', \n", 258 | " 'boosting_type':'gbdt',\n", 259 | " 'num_leaves': 31,\n", 260 | " 'objective': 'multiclass',\n", 261 | " 'num_class':8,\n", 262 | " 'learning_rate': 0.045,\n", 263 | " 'feature_fraction': 0.8,\n", 264 | " 'subsample':0.8,\n", 265 | " 'num_threads': 32,\n", 266 | " 'metric':'multi_logloss',\n", 267 | " 'seed':100\n", 268 | " } \n", 269 | " model = lgb.train(params, dtrain, num_boost_round=100000,valid_sets=[dtrain,dval],verbose_eval=100, early_stopping_rounds=100) \n", 270 | " pred_val = model.predict(X_val)\n", 271 | " pred_test = model.predict(test)\n", 272 | " \n", 273 | " meta_train[te_ind] = pred_val\n", 274 | " meta_test += pred_test\n", 275 | "meta_test /= 5.0" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": 8, 281 | "metadata": {}, 282 | "outputs": [], 283 | "source": [ 284 | "pd.to_pickle(meta_train,'../train_meta_lgb_3.pkl')\n", 285 | "pd.to_pickle(meta_test,'../test_meta_lgb_3.pkl')" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": null, 291 | "metadata": {}, 292 | "outputs": [], 293 | "source": [] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": 9, 298 | "metadata": {}, 299 | "outputs": [ 300 | { 301 | "name": "stdout", 302 | "output_type": "stream", 303 | "text": [ 304 | "FOLD: 0\n", 305 | "2780 11107\n", 306 | "Training until validation scores don't improve for 100 rounds.\n", 307 | "[100]\ttraining's multi_logloss: 0.0771172\tvalid_1's multi_logloss: 0.289138\n", 308 | "[200]\ttraining's multi_logloss: 0.00851115\tvalid_1's multi_logloss: 0.298243\n", 309 | "Early stopping, best iteration is:\n", 310 | "[133]\ttraining's multi_logloss: 0.0357694\tvalid_1's multi_logloss: 0.27818\n", 311 | "FOLD: 1\n", 312 | "2779 11108\n", 313 | "Training until validation scores don't improve for 100 rounds.\n", 314 | "[100]\ttraining's multi_logloss: 0.0780999\tvalid_1's multi_logloss: 0.289059\n", 315 | "[200]\ttraining's multi_logloss: 0.00887645\tvalid_1's multi_logloss: 0.298286\n", 316 | "Early stopping, best iteration is:\n", 317 | "[134]\ttraining's multi_logloss: 0.0357742\tvalid_1's multi_logloss: 0.278663\n", 318 | "FOLD: 2\n", 319 | "2777 11110\n", 320 | "Training until validation scores don't improve for 100 rounds.\n", 321 | "[100]\ttraining's multi_logloss: 0.0784245\tvalid_1's multi_logloss: 0.274011\n", 322 | "[200]\ttraining's multi_logloss: 0.00891692\tvalid_1's multi_logloss: 0.282485\n", 323 | "Early stopping, best iteration is:\n", 324 | "[134]\ttraining's multi_logloss: 0.0356565\tvalid_1's multi_logloss: 0.263027\n", 325 | "FOLD: 3\n", 326 | "2776 11111\n", 327 | "Training until validation scores don't improve for 100 rounds.\n", 328 | "[100]\ttraining's multi_logloss: 0.0795669\tvalid_1's multi_logloss: 0.280272\n", 329 | "[200]\ttraining's multi_logloss: 0.00927117\tvalid_1's multi_logloss: 0.284248\n", 330 | "Early stopping, best iteration is:\n", 331 | "[135]\ttraining's multi_logloss: 0.0357068\tvalid_1's multi_logloss: 0.267277\n", 332 | "FOLD: 4\n", 333 | "2775 11112\n", 334 | "Training until validation scores don't improve for 100 rounds.\n", 335 | "[100]\ttraining's multi_logloss: 0.0782005\tvalid_1's multi_logloss: 0.287082\n", 336 | "[200]\ttraining's multi_logloss: 0.00896856\tvalid_1's multi_logloss: 0.294814\n", 337 | "Early stopping, best iteration is:\n", 338 | "[129]\ttraining's multi_logloss: 0.0400827\tvalid_1's multi_logloss: 0.277252\n" 339 | ] 
340 | } 341 | ], 342 | "source": [ 343 | "skf = StratifiedKFold(train_labels, n_folds=5, shuffle=True, random_state=42)\n", 344 | "\n", 345 | "meta_train = np.zeros(shape = (len(train),8))\n", 346 | "meta_test = np.zeros(shape = (len(test),8))\n", 347 | "\n", 348 | "for i,(tr_ind,te_ind) in enumerate(skf):\n", 349 | " print 'FOLD: ',i\n", 350 | " print len(te_ind),len(tr_ind)\n", 351 | " X_train,X_train_label = train[tr_ind],train_labels[tr_ind]\n", 352 | " X_val,X_val_label = train[te_ind],train_labels[te_ind]\n", 353 | " dtrain = lgb.Dataset(X_train,X_train_label) \n", 354 | " dval = lgb.Dataset(X_val,X_val_label, reference = dtrain) \n", 355 | " params = {\n", 356 | " 'task':'train', \n", 357 | " 'boosting_type':'gbdt',\n", 358 | " 'num_leaves': 63,\n", 359 | " 'objective': 'multiclass',\n", 360 | " 'num_class':8,\n", 361 | " 'learning_rate': 0.045,\n", 362 | " 'feature_fraction': 0.5,\n", 363 | " 'subsample':0.7,\n", 364 | " 'num_threads': 54,\n", 365 | " 'metric':'multi_logloss',\n", 366 | " 'seed':100\n", 367 | " } \n", 368 | " model = lgb.train(params, dtrain, num_boost_round=100000,valid_sets=[dtrain,dval],verbose_eval=100, early_stopping_rounds=100) \n", 369 | " pred_val = model.predict(X_val)\n", 370 | " pred_test = model.predict(test)\n", 371 | " \n", 372 | " meta_train[te_ind] = pred_val\n", 373 | " meta_test += pred_test\n", 374 | "meta_test /= 5.0\n", 375 | "\n", 376 | "pd.to_pickle(meta_train,'../train_meta_lgb_4.pkl')\n", 377 | "pd.to_pickle(meta_test,'../test_meta_lgb_4.pkl')" 378 | ] 379 | }, 380 | { 381 | "cell_type": "code", 382 | "execution_count": null, 383 | "metadata": {}, 384 | "outputs": [], 385 | "source": [] 386 | } 387 | ], 388 | "metadata": { 389 | "kernelspec": { 390 | "display_name": "Python 3", 391 | "language": "python", 392 | "name": "python3" 393 | }, 394 | "language_info": { 395 | "codemirror_mode": { 396 | "name": "ipython", 397 | "version": 3 398 | }, 399 | "file_extension": ".py", 400 | "mimetype": "text/x-python", 401 | "name": "python", 402 | "nbconvert_exporter": "python", 403 | "pygments_lexer": "ipython3", 404 | "version": "3.6.5" 405 | } 406 | }, 407 | "nbformat": 4, 408 | "nbformat_minor": 2 409 | } 410 | -------------------------------------------------------------------------------- /.ipynb_checkpoints/pickle_pre-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 4, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "\n", 11 | "meta_train = pd.read_pickle('../meta/train_meta_dilated_cnn.pkl')\n", 12 | "meta_test = pd.read_pickle('../meta/test_meta_dilated_cnn.pkl')\n", 13 | "\n", 14 | "import pickle\n", 15 | "\n", 16 | "f=open('../meta/train_meta_dilated_cnn_a.pkl','wb') \n", 17 | "pickle.dump(meta_train,f,0) \n", 18 | "f.close()\n", 19 | "\n", 20 | "f=open('../meta/test_meta_dilated_cnn_a.pkl','wb') \n", 21 | "pickle.dump(meta_test,f,0) \n", 22 | "f.close()" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "meta_train = pd.read_pickle('../meta/train_meta_cnn.pkl')\n", 32 | "meta_test = pd.read_pickle('../meta/test_meta_cnn.pkl')\n", 33 | "\n", 34 | "f=open('../meta/train_meta_cnn_a.pkl','wb') \n", 35 | "pickle.dump(meta_train,f,0) \n", 36 | "f.close()\n", 37 | "\n", 38 | "f=open('../meta/test_meta_cnn_a.pkl','wb') \n", 39 | "pickle.dump(meta_test,f,0) \n", 40 | "f.close()" 41 | ] 42 | }, 43 | 
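`pickle_pre` exists to bridge interpreter versions: the CNN meta-features were pickled by pandas under Python 3, and the stacking environment reads them under Python 2, so each array is re-dumped with `pickle` protocol 0, the ASCII protocol that stays within what Python 2 can parse (Python 2 supports protocols 0-2 only). A minimal sketch of the idea; the filename is illustrative:

```python
# Hedged sketch: re-serialize an object with pickle protocol 0 so that an
# older interpreter (here: the Python 2 stacking environment) can load it.
import pickle
import numpy as np

obj = np.zeros((3, 8))                      # stand-in for a meta-feature matrix
with open('meta_a.pkl', 'wb') as f:
    pickle.dump(obj, f, 0)                  # protocol 0: ASCII, maximally portable

with open('meta_a.pkl', 'rb') as f:
    print(pickle.load(f).shape)             # (3, 8)
```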
{ 44 | "cell_type": "code", 45 | "execution_count": 2, 46 | "metadata": {}, 47 | "outputs": [ 48 | { 49 | "data": { 50 | "text/plain": [ 51 | "'/Users/didi/天池/安全赛复赛/temp'" 52 | ] 53 | }, 54 | "execution_count": 2, 55 | "metadata": {}, 56 | "output_type": "execute_result" 57 | } 58 | ], 59 | "source": [ 60 | "%pwd" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [] 69 | } 70 | ], 71 | "metadata": { 72 | "kernelspec": { 73 | "display_name": "Python 3", 74 | "language": "python", 75 | "name": "python3" 76 | }, 77 | "language_info": { 78 | "codemirror_mode": { 79 | "name": "ipython", 80 | "version": 3 81 | }, 82 | "file_extension": ".py", 83 | "mimetype": "text/x-python", 84 | "name": "python", 85 | "nbconvert_exporter": "python", 86 | "pygments_lexer": "ipython3", 87 | "version": "3.6.5" 88 | } 89 | }, 90 | "nbformat": 4, 91 | "nbformat_minor": 2 92 | } 93 | -------------------------------------------------------------------------------- /.ipynb_checkpoints/submit-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 26, 6 | "metadata": { 7 | "scrolled": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "# coding: utf-8\n", 12 | "\n", 13 | "# In[1]:\n", 14 | "\n", 15 | "\n", 16 | "import pandas as pd\n", 17 | "import numpy as np\n", 18 | "import lightgbm as lgb\n", 19 | "from sklearn.cross_validation import train_test_split\n", 20 | "import gc\n", 21 | "from sklearn.preprocessing import OneHotEncoder\n", 22 | "import datetime\n", 23 | "from sklearn.cross_validation import StratifiedKFold\n", 24 | "\n", 25 | "# In[2]:" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 27, 31 | "metadata": {}, 32 | "outputs": [ 33 | { 34 | "name": "stdout", 35 | "output_type": "stream", 36 | "text": [ 37 | "cur time = 2018/09/21 20:10:16\n", 38 | "(13887, 3251) (12955, 3251)\n", 39 | "cur time = 2018/09/21 20:10:16\n" 40 | ] 41 | } 42 | ], 43 | "source": [ 44 | "print('cur time = ' + str(datetime.datetime.now().strftime(\"%Y/%m/%d %H:%M:%S\")))\n", 45 | "train = np.load('../X_train.npy')\n", 46 | "test = np.load('../X_test.npy')\n", 47 | "train_labels = np.load('../labels.npy')\n", 48 | "\n", 49 | "print train.shape,test.shape\n", 50 | "print('cur time = ' + str(datetime.datetime.now().strftime(\"%Y/%m/%d %H:%M:%S\")))" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 28, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "train_cnn_1 = pd.read_pickle('../train_meta_cnn_a.pkl')\n", 60 | "test_cnn_1 = pd.read_pickle('../test_meta_cnn_a.pkl')" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 29, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "train_cnn_2 = pd.read_pickle('../train_meta_dilated_cnn_a.pkl')\n", 70 | "test_cnn_2 = pd.read_pickle('../test_meta_dilated_cnn_a.pkl')" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 30, 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "train_lgb_1 = pd.read_pickle('../train_meta_lgb_1.pkl')\n", 80 | "test_lgb_1 = pd.read_pickle('../test_meta_lgb_1.pkl')\n", 81 | "\n", 82 | "train_lgb_2 = pd.read_pickle('../train_meta_lgb_2.pkl')\n", 83 | "test_lgb_2 = pd.read_pickle('../test_meta_lgb_2.pkl')\n", 84 | "\n", 85 | "train_lgb_3 = pd.read_pickle('../train_meta_lgb_3.pkl')\n", 86 | "test_lgb_3 = pd.read_pickle('../test_meta_lgb_3.pkl')\n", 
87 | "\n", 88 | "train_lgb_4 = pd.read_pickle('../train_meta_lgb_4.pkl')\n", 89 | "test_lgb_4 = pd.read_pickle('../test_meta_lgb_4.pkl')" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 33, 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "train = np.hstack([train,train_cnn_1, train_cnn_2, train_lgb_1, train_lgb_2, train_lgb_3, train_lgb_4])\n", 99 | "test = np.hstack([test,test_cnn_1, test_cnn_2, test_lgb_1, test_lgb_2, test_lgb_3, test_lgb_4])" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 36, 105 | "metadata": {}, 106 | "outputs": [ 107 | { 108 | "name": "stdout", 109 | "output_type": "stream", 110 | "text": [ 111 | "Times: 0\n", 112 | "cur time = 2018/09/21 20:12:20\n", 113 | "FOLD: 0\n", 114 | "2780 11107\n", 115 | "Training until validation scores don't improve for 100 rounds.\n", 116 | "[100]\ttraining's multi_logloss: 0.70958\tvalid_1's multi_logloss: 0.755308\n", 117 | "[200]\ttraining's multi_logloss: 0.354997\tvalid_1's multi_logloss: 0.434251\n", 118 | "[300]\ttraining's multi_logloss: 0.217096\tvalid_1's multi_logloss: 0.326171\n", 119 | "[400]\ttraining's multi_logloss: 0.152399\tvalid_1's multi_logloss: 0.289922\n", 120 | "[500]\ttraining's multi_logloss: 0.117076\tvalid_1's multi_logloss: 0.278519\n", 121 | "[600]\ttraining's multi_logloss: 0.094704\tvalid_1's multi_logloss: 0.277246\n", 122 | "Early stopping, best iteration is:\n", 123 | "[598]\ttraining's multi_logloss: 0.0950629\tvalid_1's multi_logloss: 0.277223\n", 124 | "cur time = 2018/09/21 20:13:46\n", 125 | "FOLD: 1\n", 126 | "2779 11108\n", 127 | "Training until validation scores don't improve for 100 rounds.\n", 128 | "[100]\ttraining's multi_logloss: 0.714406\tvalid_1's multi_logloss: 0.746063\n", 129 | "[200]\ttraining's multi_logloss: 0.360927\tvalid_1's multi_logloss: 0.41752\n", 130 | "[300]\ttraining's multi_logloss: 0.223406\tvalid_1's multi_logloss: 0.305014\n", 131 | "[400]\ttraining's multi_logloss: 0.159355\tvalid_1's multi_logloss: 0.265401\n", 132 | "[500]\ttraining's multi_logloss: 0.123548\tvalid_1's multi_logloss: 0.251562\n", 133 | "[600]\ttraining's multi_logloss: 0.100453\tvalid_1's multi_logloss: 0.247619\n", 134 | "[700]\ttraining's multi_logloss: 0.0840086\tvalid_1's multi_logloss: 0.247471\n", 135 | "Early stopping, best iteration is:\n", 136 | "[645]\ttraining's multi_logloss: 0.0925202\tvalid_1's multi_logloss: 0.246913\n", 137 | "cur time = 2018/09/21 20:15:23\n", 138 | "FOLD: 2\n", 139 | "2777 11110\n", 140 | "Training until validation scores don't improve for 100 rounds.\n", 141 | "[100]\ttraining's multi_logloss: 0.710447\tvalid_1's multi_logloss: 0.758826\n", 142 | "[200]\ttraining's multi_logloss: 0.354958\tvalid_1's multi_logloss: 0.436029\n", 143 | "[300]\ttraining's multi_logloss: 0.216709\tvalid_1's multi_logloss: 0.326181\n", 144 | "[400]\ttraining's multi_logloss: 0.15243\tvalid_1's multi_logloss: 0.287969\n", 145 | "[500]\ttraining's multi_logloss: 0.117201\tvalid_1's multi_logloss: 0.275582\n", 146 | "[600]\ttraining's multi_logloss: 0.0948654\tvalid_1's multi_logloss: 0.273565\n", 147 | "Early stopping, best iteration is:\n", 148 | "[578]\ttraining's multi_logloss: 0.0990779\tvalid_1's multi_logloss: 0.273456\n", 149 | "cur time = 2018/09/21 20:16:47\n", 150 | "FOLD: 3\n", 151 | "2776 11111\n", 152 | "Training until validation scores don't improve for 100 rounds.\n", 153 | "[100]\ttraining's multi_logloss: 0.710814\tvalid_1's multi_logloss: 0.757495\n", 154 | "[200]\ttraining's multi_logloss: 
0.356598\tvalid_1's multi_logloss: 0.432203\n", 155 | "[300]\ttraining's multi_logloss: 0.219223\tvalid_1's multi_logloss: 0.319802\n", 156 | "[400]\ttraining's multi_logloss: 0.154809\tvalid_1's multi_logloss: 0.280013\n", 157 | "[500]\ttraining's multi_logloss: 0.118818\tvalid_1's multi_logloss: 0.2661\n", 158 | "[600]\ttraining's multi_logloss: 0.0962369\tvalid_1's multi_logloss: 0.262496\n", 159 | "[700]\ttraining's multi_logloss: 0.0801419\tvalid_1's multi_logloss: 0.262299\n", 160 | "Early stopping, best iteration is:\n", 161 | "[660]\ttraining's multi_logloss: 0.0860689\tvalid_1's multi_logloss: 0.261957\n", 162 | "cur time = 2018/09/21 20:18:25\n", 163 | "FOLD: 4\n", 164 | "2775 11112\n", 165 | "Training until validation scores don't improve for 100 rounds.\n", 166 | "[100]\ttraining's multi_logloss: 0.711242\tvalid_1's multi_logloss: 0.757122\n", 167 | "[200]\ttraining's multi_logloss: 0.357074\tvalid_1's multi_logloss: 0.432454\n", 168 | "[300]\ttraining's multi_logloss: 0.219319\tvalid_1's multi_logloss: 0.319717\n", 169 | "[400]\ttraining's multi_logloss: 0.155336\tvalid_1's multi_logloss: 0.27957\n", 170 | "[500]\ttraining's multi_logloss: 0.12014\tvalid_1's multi_logloss: 0.264735\n", 171 | "[600]\ttraining's multi_logloss: 0.0976771\tvalid_1's multi_logloss: 0.260499\n", 172 | "[700]\ttraining's multi_logloss: 0.0816694\tvalid_1's multi_logloss: 0.260613\n", 173 | "Early stopping, best iteration is:\n", 174 | "[625]\ttraining's multi_logloss: 0.0932625\tvalid_1's multi_logloss: 0.260267\n", 175 | "cur time = 2018/09/21 20:20:08\n" 176 | ] 177 | } 178 | ], 179 | "source": [ 180 | "\n", 181 | "\n", 182 | "meta_test = np.zeros(shape = (len(test),8))\n", 183 | "\n", 184 | "for seed in range(1):\n", 185 | " print 'Times: ',seed\n", 186 | " print('cur time = ' + str(datetime.datetime.now().strftime(\"%Y/%m/%d %H:%M:%S\")))\n", 187 | " skf = StratifiedKFold(train_labels, n_folds=5, shuffle=True, random_state=seed)\n", 188 | " for i,(tr_ind,te_ind) in enumerate(skf):\n", 189 | " print 'FOLD: ',i\n", 190 | " print len(te_ind),len(tr_ind)\n", 191 | " X_train,X_train_label = train[tr_ind],train_labels[tr_ind]\n", 192 | " X_val,X_val_label = train[te_ind],train_labels[te_ind]\n", 193 | " dtrain = lgb.Dataset(X_train,X_train_label) \n", 194 | " dval = lgb.Dataset(X_val,X_val_label, reference = dtrain) \n", 195 | " params = {\n", 196 | " 'task':'train', \n", 197 | " 'boosting_type':'gbdt',\n", 198 | " 'num_leaves': 15,\n", 199 | " 'objective': 'multiclass',\n", 200 | " 'num_class':8,\n", 201 | " 'learning_rate': 0.01,\n", 202 | " 'feature_fraction': 0.85,\n", 203 | " 'subsample':0.85,\n", 204 | " 'num_threads': 54,\n", 205 | " 'metric':'multi_logloss',\n", 206 | " 'seed':seed\n", 207 | " } \n", 208 | " model = lgb.train(params, dtrain, num_boost_round=100000,valid_sets=[dtrain,dval],verbose_eval=100, early_stopping_rounds=100) \n", 209 | " pred_test = model.predict(test)\n", 210 | "\n", 211 | " #meta_train[te_ind] = pred_val\n", 212 | " meta_test += pred_test\n", 213 | " print('cur time = ' + str(datetime.datetime.now().strftime(\"%Y/%m/%d %H:%M:%S\")))\n", 214 | "\n", 215 | "meta_test/=5.0\n", 216 | "res = pd.DataFrame(meta_test,columns=['prob0','prob1','prob2','prob3','prob4','prob5','prob6','prob7'])\n", 217 | "res.index.name='file_id'\n", 218 | "res.round(7).to_csv('submit.csv', index = True, header=True)\n", 219 | " \n", 220 | " " 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": 74, 226 | "metadata": {}, 227 | "outputs": [], 228 | "source": [ 229 | 
"res.shape\n", 230 | "res.index = range(1,res.shape[0]+1)\n", 231 | "res.index.name = 'file_id'" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": 77, 237 | "metadata": {}, 238 | "outputs": [], 239 | "source": [ 240 | "en =res.copy()" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": 79, 246 | "metadata": {}, 247 | "outputs": [ 248 | { 249 | "data": { 250 | "text/plain": [ 251 | "1.0000000000000004" 252 | ] 253 | }, 254 | "execution_count": 79, 255 | "metadata": {}, 256 | "output_type": "execute_result" 257 | } 258 | ], 259 | "source": [ 260 | "en.sum(axis=1).max()" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": 81, 266 | "metadata": {}, 267 | "outputs": [], 268 | "source": [ 269 | "en.to_csv('../fuucccccccck.csv',index=True,header=True)" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": 83, 275 | "metadata": {}, 276 | "outputs": [ 277 | { 278 | "data": { 279 | "text/html": [ 280 | "
\n", 281 | "\n", 294 | "\n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | "
prob0prob1prob2prob3prob4prob5prob6prob7
file_id
10.0020350.0021270.9497510.0095020.0018050.0024040.0055500.026825
20.9311290.0021370.0032890.0039130.0020600.0092540.0101010.038117
30.9960000.0004530.0005970.0006300.0004290.0005750.0005600.000755
40.0136270.0080150.0186250.0988060.0540510.0922540.1809030.533720
50.9938330.0005780.0010650.0008520.0006080.0007760.0007790.001510
\n", 377 | "
" 378 | ], 379 | "text/plain": [ 380 | " prob0 prob1 prob2 prob3 prob4 prob5 prob6 \\\n", 381 | "file_id \n", 382 | "1 0.002035 0.002127 0.949751 0.009502 0.001805 0.002404 0.005550 \n", 383 | "2 0.931129 0.002137 0.003289 0.003913 0.002060 0.009254 0.010101 \n", 384 | "3 0.996000 0.000453 0.000597 0.000630 0.000429 0.000575 0.000560 \n", 385 | "4 0.013627 0.008015 0.018625 0.098806 0.054051 0.092254 0.180903 \n", 386 | "5 0.993833 0.000578 0.001065 0.000852 0.000608 0.000776 0.000779 \n", 387 | "\n", 388 | " prob7 \n", 389 | "file_id \n", 390 | "1 0.026825 \n", 391 | "2 0.038117 \n", 392 | "3 0.000755 \n", 393 | "4 0.533720 \n", 394 | "5 0.001510 " 395 | ] 396 | }, 397 | "execution_count": 83, 398 | "metadata": {}, 399 | "output_type": "execute_result" 400 | } 401 | ], 402 | "source": [ 403 | "en.head()" 404 | ] 405 | }, 406 | { 407 | "cell_type": "code", 408 | "execution_count": null, 409 | "metadata": {}, 410 | "outputs": [], 411 | "source": [] 412 | } 413 | ], 414 | "metadata": { 415 | "kernelspec": { 416 | "display_name": "Python 3", 417 | "language": "python", 418 | "name": "python3" 419 | }, 420 | "language_info": { 421 | "codemirror_mode": { 422 | "name": "ipython", 423 | "version": 3 424 | }, 425 | "file_extension": ".py", 426 | "mimetype": "text/x-python", 427 | "name": "python", 428 | "nbconvert_exporter": "python", 429 | "pygments_lexer": "ipython3", 430 | "version": "3.6.5" 431 | } 432 | }, 433 | "nbformat": 4, 434 | "nbformat_minor": 2 435 | } 436 | -------------------------------------------------------------------------------- /CNN_metafeature.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stderr", 10 | "output_type": "stream", 11 | "text": [ 12 | "/home/enjoy/anaconda3/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88\n", 13 | " return f(*args, **kwds)\n", 14 | "/home/enjoy/anaconda3/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. 
Expected 96, got 88\n", 15 | " return f(*args, **kwds)\n" 16 | ] 17 | } 18 | ], 19 | "source": [ 20 | "import pandas as pd\n", 21 | "import numpy as np\n", 22 | "import os\n", 23 | "from tqdm import tqdm\n", 24 | "from sklearn.preprocessing import LabelBinarizer,LabelEncoder" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 2, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "path = '../input/'\n", 34 | "train = pd.read_csv(path + 'final_train.csv')\n", 35 | "test = pd.read_csv(path + 'final_test.csv')" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 3, 41 | "metadata": {}, 42 | "outputs": [ 43 | { 44 | "data": { 45 | "text/plain": [ 46 | "((89806693, 5), (79288375, 4))" 47 | ] 48 | }, 49 | "execution_count": 3, 50 | "metadata": {}, 51 | "output_type": "execute_result" 52 | } 53 | ], 54 | "source": [ 55 | "train.shape,test.shape" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 4, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "unique_api = train['api'].unique()" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 5, 70 | "metadata": {}, 71 | "outputs": [ 72 | { 73 | "data": { 74 | "text/plain": [ 75 | "(295,)" 76 | ] 77 | }, 78 | "execution_count": 5, 79 | "metadata": {}, 80 | "output_type": "execute_result" 81 | } 82 | ], 83 | "source": [ 84 | "unique_api.shape" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 6, 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | "api2index = {item:(i+1) for i,item in enumerate(unique_api)}\n", 94 | "index2api = {(i+1):item for i,item in enumerate(unique_api)}" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 7, 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "train['api_idx'] = train['api'].map(api2index)" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 8, 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | "test['api_idx'] = test['api'].map(api2index)" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 9, 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [ 121 | "train_period_idx = train.file_id.drop_duplicates(keep='first').index.values\n", 122 | "test_period_idx = test.file_id.drop_duplicates(keep='first').index.values" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": 10, 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [ 131 | "def get_sequence(df,period_idx):\n", 132 | " seq_list = []\n", 133 | " for _id,begin in enumerate(period_idx[:-1]):\n", 134 | " seq_list.append(df.iloc[begin:period_idx[_id+1]]['api_idx'].values)\n", 135 | " seq_list.append(df.iloc[period_idx[-1]:]['api_idx'].values)\n", 136 | " return seq_list" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 11, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "train_df = train[['file_id','label']].drop_duplicates(keep='first')\n", 146 | "test_df = test[['file_id']].drop_duplicates(keep='first')" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": 12, 152 | "metadata": {}, 153 | "outputs": [], 154 | "source": [ 155 | "train_df['seq'] = get_sequence(train,train_period_idx)" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 13, 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "test_df['seq'] = get_sequence(test,test_period_idx)" 165 | ] 166 | }, 167 | { 168 | 
"cell_type": "code", 169 | "execution_count": 14, 170 | "metadata": {}, 171 | "outputs": [ 172 | { 173 | "data": { 174 | "text/plain": [ 175 | "(19350.97816934013, 6466.961402750774, 888204)" 176 | ] 177 | }, 178 | "execution_count": 14, 179 | "metadata": {}, 180 | "output_type": "execute_result" 181 | } 182 | ], 183 | "source": [ 184 | "train_df.seq.map(lambda x: len(x)).std(),train_df.seq.map(lambda x: len(x)).mean(),train_df.seq.map(lambda x: len(x)).max()" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": 15, 190 | "metadata": {}, 191 | "outputs": [ 192 | { 193 | "data": { 194 | "text/plain": [ 195 | "(15911.676663585444, 6120.291393284446, 769590)" 196 | ] 197 | }, 198 | "execution_count": 15, 199 | "metadata": {}, 200 | "output_type": "execute_result" 201 | } 202 | ], 203 | "source": [ 204 | "test_df.seq.map(lambda x: len(x)).std(),test_df.seq.map(lambda x: len(x)).mean(),test_df.seq.map(lambda x: len(x)).max()" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": 16, 210 | "metadata": {}, 211 | "outputs": [ 212 | { 213 | "name": "stderr", 214 | "output_type": "stream", 215 | "text": [ 216 | "/home/enjoy/anaconda3/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88\n", 217 | " return f(*args, **kwds)\n", 218 | "/home/enjoy/anaconda3/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n", 219 | " from ._conv import register_converters as _register_converters\n", 220 | "Using TensorFlow backend.\n" 221 | ] 222 | } 223 | ], 224 | "source": [ 225 | "from keras.preprocessing.text import Tokenizer\n", 226 | "from keras.preprocessing.sequence import pad_sequences\n", 227 | "from keras.layers import Dense, Input, LSTM, Lambda, Embedding, Dropout, Activation,GRU,Bidirectional\n", 228 | "from keras.layers import Conv1D,Conv2D,MaxPooling2D,GlobalAveragePooling1D,GlobalMaxPooling1D, MaxPooling1D, Flatten\n", 229 | "from keras.layers import CuDNNGRU, CuDNNLSTM, SpatialDropout1D\n", 230 | "from keras.layers.merge import concatenate, Concatenate, Average, Dot, Maximum, Multiply, Subtract, average\n", 231 | "from keras.models import Model\n", 232 | "from keras.optimizers import RMSprop,Adam\n", 233 | "from keras.layers.normalization import BatchNormalization\n", 234 | "from keras.callbacks import EarlyStopping, ModelCheckpoint\n", 235 | "from keras.optimizers import SGD\n", 236 | "from keras import backend as K\n", 237 | "from sklearn.decomposition import TruncatedSVD, NMF, LatentDirichletAllocation\n", 238 | "from keras.layers import SpatialDropout1D\n", 239 | "from keras.layers.wrappers import Bidirectional" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": 17, 245 | "metadata": {}, 246 | "outputs": [], 247 | "source": [ 248 | "def TextCNN(max_len,max_cnt,embed_size,\n", 249 | " num_filters,kernel_size,\n", 250 | " conv_action,\n", 251 | " mask_zero):\n", 252 | " _input = Input(shape=(max_len,), dtype='int32')\n", 253 | " _embed = Embedding(max_cnt, embed_size, input_length=max_len, mask_zero=mask_zero)(_input)\n", 254 | " _embed = SpatialDropout1D(0.15)(_embed)\n", 255 | " warppers = []\n", 256 | " for _kernel_size in kernel_size:\n", 257 | " conv1d = Conv1D(filters=num_filters, kernel_size=_kernel_size, activation=conv_action)(_embed)\n", 258 | " 
warppers.append(GlobalMaxPooling1D()(conv1d))\n", 259 | "    \n", 260 | "    fc = concatenate(warppers)\n", 261 | "    fc = Dropout(0.5)(fc)\n", 262 | "    #fc = BatchNormalization()(fc)\n", 263 | "    fc = Dense(256, activation='relu')(fc)\n", 264 | "    fc = Dropout(0.25)(fc)\n", 265 | "    #fc = BatchNormalization()(fc) \n", 266 | "    preds = Dense(8, activation = 'softmax')(fc)\n", 267 | "    \n", 268 | "    model = Model(inputs=_input, outputs=preds)\n", 269 | "    \n", 270 | "    model.compile(loss='categorical_crossentropy',\n", 271 | "                  optimizer='adam',\n", 272 | "                  metrics=['accuracy'])\n", 273 | "    return model" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": 18, 279 | "metadata": {}, 280 | "outputs": [], 281 | "source": [ 282 | "train_labels = pd.get_dummies(train_df.label).values\n", 283 | "train_seq = pad_sequences(train_df.seq.values, maxlen = 6000)\n", 284 | "test_seq = pad_sequences(test_df.seq.values, maxlen = 6000)" 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": 20, 290 | "metadata": {}, 291 | "outputs": [], 292 | "source": [ 293 | "from sklearn.cross_validation import StratifiedKFold\nskf = StratifiedKFold(train_labels, n_folds=5, shuffle=True, random_state=42)" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": 21, 299 | "metadata": {}, 300 | "outputs": [], 301 | "source": [ 302 | "max_len = 6000\n", 303 | "max_cnt = 295\n", 304 | "embed_size = 256\n", 305 | "num_filters = 64\n", 306 | "kernel_size = [2,4,6,8,10,12,14]\n", 307 | "conv_action = 'relu'\n", 308 | "mask_zero = False\n", 309 | "TRAIN = True" 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": 25, 315 | "metadata": { 316 | "scrolled": false 317 | }, 318 | "outputs": [ 319 | { 320 | "name": "stdout", 321 | "output_type": "stream", 322 | "text": [ 323 | "FOLD: \n", 324 | "2780 11107\n", 325 | "2780/2780 [==============================] - 5s 2ms/step\n", 326 | "12955/12955 [==============================] - 18s 1ms/step\n", 327 | "FOLD: \n", 328 | "2779 11108\n", 329 | "2779/2779 [==============================] - 4s 2ms/step\n", 330 | "12955/12955 [==============================] - 18s 1ms/step\n", 331 | "FOLD: \n", 332 | "2777 11110\n", 333 | "2777/2777 [==============================] - 4s 2ms/step\n", 334 | "12955/12955 [==============================] - 18s 1ms/step\n", 335 | "FOLD: \n", 336 | "2776 11111\n", 337 | "2776/2776 [==============================] - 4s 2ms/step\n", 338 | "12955/12955 [==============================] - 18s 1ms/step\n", 339 | "FOLD: \n", 340 | "2775 11112\n", 341 | "2775/2775 [==============================] - 5s 2ms/step\n", 342 | "12955/12955 [==============================] - 19s 1ms/step\n" 343 | ] 344 | } 345 | ], 346 | "source": [ 347 | "import os\n", 348 | "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\"\n", 349 | "meta_train = np.zeros(shape = (len(train_seq),8))\n", 350 | "meta_test = np.zeros(shape = (len(test_seq),8))\n", 351 | "FLAG = False\n", 352 | "for i,(tr_ind,te_ind) in enumerate(skf):\n", 353 | "    print('FOLD: {}'.format(i))\n", 354 | "    print(len(te_ind),len(tr_ind))\n", 355 | "    model = TextCNN(max_len,max_cnt,embed_size,num_filters,kernel_size,conv_action,mask_zero)\n", 356 | "    model_name = 'benchmark_textcnn_fold_'+str(i)\n", 357 | "    X_train,X_train_label = train_seq[tr_ind],train_labels[tr_ind]\n", 358 | "    X_val,X_val_label = train_seq[te_ind],train_labels[te_ind]\n", 359 | "    \n", 360 | "    model = TextCNN(max_len,max_cnt,embed_size,\n", 361 | "                    num_filters,kernel_size,\n", 362 | "                    conv_action,\n", 363 | "                    mask_zero)\n", 364 | "    \n", 365 | "    
model_save_path = '../model_weight_final/%s_%s.hdf5'%(model_name,embed_size)\n", 366 | " early_stopping =EarlyStopping(monitor='val_loss', patience=3)\n", 367 | " model_checkpoint = ModelCheckpoint(model_save_path, save_best_only=True, save_weights_only=True)\n", 368 | " if TRAIN and FLAG:\n", 369 | " model.fit(X_train,X_train_label,\n", 370 | " validation_data=(X_val,X_val_label),\n", 371 | " epochs=100,batch_size=64,\n", 372 | " shuffle=True,\n", 373 | " callbacks=[early_stopping,model_checkpoint]\n", 374 | " )\n", 375 | " \n", 376 | " model.load_weights(model_save_path)\n", 377 | " pred_val = model.predict(X_val,batch_size=128,verbose=1)\n", 378 | " pred_test = model.predict(test_seq,batch_size=128,verbose=1)\n", 379 | " \n", 380 | " meta_train[te_ind] = pred_val\n", 381 | " meta_test += pred_test\n", 382 | " K.clear_session()\n", 383 | "meta_test /= 5.0\n" 384 | ] 385 | }, 386 | { 387 | "cell_type": "code", 388 | "execution_count": 37, 389 | "metadata": {}, 390 | "outputs": [], 391 | "source": [ 392 | "pd.to_pickle(meta_train,'../train_meta_cnn.pkl')\n", 393 | "pd.to_pickle(meta_test,'../test_meta_cnn.pkl')" 394 | ] 395 | }, 396 | { 397 | "cell_type": "code", 398 | "execution_count": 38, 399 | "metadata": {}, 400 | "outputs": [ 401 | { 402 | "data": { 403 | "text/plain": [ 404 | "'/home/enjoy/tianchi/安全赛复赛/src'" 405 | ] 406 | }, 407 | "execution_count": 38, 408 | "metadata": {}, 409 | "output_type": "execute_result" 410 | } 411 | ], 412 | "source": [ 413 | "%pwd" 414 | ] 415 | }, 416 | { 417 | "cell_type": "code", 418 | "execution_count": null, 419 | "metadata": {}, 420 | "outputs": [], 421 | "source": [] 422 | } 423 | ], 424 | "metadata": { 425 | "kernelspec": { 426 | "display_name": "Python 3", 427 | "language": "python", 428 | "name": "python3" 429 | }, 430 | "language_info": { 431 | "codemirror_mode": { 432 | "name": "ipython", 433 | "version": 3 434 | }, 435 | "file_extension": ".py", 436 | "mimetype": "text/x-python", 437 | "name": "python", 438 | "nbconvert_exporter": "python", 439 | "pygments_lexer": "ipython3", 440 | "version": "3.6.5" 441 | } 442 | }, 443 | "nbformat": 4, 444 | "nbformat_minor": 2 445 | } 446 | -------------------------------------------------------------------------------- /CNN_metafeature_dilated.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stderr", 10 | "output_type": "stream", 11 | "text": [ 12 | "/home/enjoy/anaconda3/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88\n", 13 | " return f(*args, **kwds)\n", 14 | "/home/enjoy/anaconda3/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. 
Expected 96, got 88\n", 15 | " return f(*args, **kwds)\n" 16 | ] 17 | } 18 | ], 19 | "source": [ 20 | "import pandas as pd\n", 21 | "import numpy as np\n", 22 | "import os\n", 23 | "from tqdm import tqdm\n", 24 | "from sklearn.preprocessing import LabelBinarizer,LabelEncoder" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 2, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "path = '../input/'\n", 34 | "train = pd.read_csv(path + 'final_train.csv')\n", 35 | "test = pd.read_csv(path + 'final_test.csv')" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 3, 41 | "metadata": {}, 42 | "outputs": [ 43 | { 44 | "data": { 45 | "text/plain": [ 46 | "((89806693, 5), (79288375, 4))" 47 | ] 48 | }, 49 | "execution_count": 3, 50 | "metadata": {}, 51 | "output_type": "execute_result" 52 | } 53 | ], 54 | "source": [ 55 | "train.shape,test.shape" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 4, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "unique_api = train['api'].unique()" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 5, 70 | "metadata": {}, 71 | "outputs": [ 72 | { 73 | "data": { 74 | "text/plain": [ 75 | "(295,)" 76 | ] 77 | }, 78 | "execution_count": 5, 79 | "metadata": {}, 80 | "output_type": "execute_result" 81 | } 82 | ], 83 | "source": [ 84 | "unique_api.shape" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 6, 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | "api2index = {item:(i+1) for i,item in enumerate(unique_api)}\n", 94 | "index2api = {(i+1):item for i,item in enumerate(unique_api)}" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 7, 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "train['api_idx'] = train['api'].map(api2index)" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 8, 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | "test['api_idx'] = test['api'].map(api2index)" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 9, 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [ 121 | "train_period_idx = train.file_id.drop_duplicates(keep='first').index.values\n", 122 | "test_period_idx = test.file_id.drop_duplicates(keep='first').index.values" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": 10, 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [ 131 | "def get_sequence(df,period_idx):\n", 132 | " seq_list = []\n", 133 | " for _id,begin in enumerate(period_idx[:-1]):\n", 134 | " seq_list.append(df.iloc[begin:period_idx[_id+1]]['api_idx'].values)\n", 135 | " seq_list.append(df.iloc[period_idx[-1]:]['api_idx'].values)\n", 136 | " return seq_list" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 11, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "train_df = train[['file_id','label']].drop_duplicates(keep='first')\n", 146 | "test_df = test[['file_id']].drop_duplicates(keep='first')" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": 12, 152 | "metadata": {}, 153 | "outputs": [], 154 | "source": [ 155 | "train_df['seq'] = get_sequence(train,train_period_idx)" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 13, 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "test_df['seq'] = get_sequence(test,test_period_idx)" 165 | ] 166 | }, 167 | { 168 | 
"cell_type": "code", 169 | "execution_count": 14, 170 | "metadata": {}, 171 | "outputs": [ 172 | { 173 | "data": { 174 | "text/plain": [ 175 | "(19350.97816934013, 6466.961402750774, 888204)" 176 | ] 177 | }, 178 | "execution_count": 14, 179 | "metadata": {}, 180 | "output_type": "execute_result" 181 | } 182 | ], 183 | "source": [ 184 | "train_df.seq.map(lambda x: len(x)).std(),train_df.seq.map(lambda x: len(x)).mean(),train_df.seq.map(lambda x: len(x)).max()" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": 15, 190 | "metadata": {}, 191 | "outputs": [ 192 | { 193 | "data": { 194 | "text/plain": [ 195 | "(15911.676663585444, 6120.291393284446, 769590)" 196 | ] 197 | }, 198 | "execution_count": 15, 199 | "metadata": {}, 200 | "output_type": "execute_result" 201 | } 202 | ], 203 | "source": [ 204 | "test_df.seq.map(lambda x: len(x)).std(),test_df.seq.map(lambda x: len(x)).mean(),test_df.seq.map(lambda x: len(x)).max()" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": 16, 210 | "metadata": {}, 211 | "outputs": [ 212 | { 213 | "name": "stderr", 214 | "output_type": "stream", 215 | "text": [ 216 | "/home/enjoy/anaconda3/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88\n", 217 | " return f(*args, **kwds)\n", 218 | "/home/enjoy/anaconda3/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n", 219 | " from ._conv import register_converters as _register_converters\n", 220 | "Using TensorFlow backend.\n" 221 | ] 222 | } 223 | ], 224 | "source": [ 225 | "from keras.preprocessing.text import Tokenizer\n", 226 | "from keras.preprocessing.sequence import pad_sequences\n", 227 | "from keras.layers import Dense, Input, LSTM, Lambda, Embedding, Dropout, Activation,GRU,Bidirectional\n", 228 | "from keras.layers import Conv1D,Conv2D,MaxPooling2D,GlobalAveragePooling1D,GlobalMaxPooling1D, MaxPooling1D, Flatten\n", 229 | "from keras.layers import CuDNNGRU, CuDNNLSTM, SpatialDropout1D\n", 230 | "from keras.layers.merge import concatenate, Concatenate, Average, Dot, Maximum, Multiply, Subtract, average\n", 231 | "from keras.models import Model\n", 232 | "from keras.optimizers import RMSprop,Adam\n", 233 | "from keras.layers.normalization import BatchNormalization\n", 234 | "from keras.callbacks import EarlyStopping, ModelCheckpoint\n", 235 | "from keras.optimizers import SGD\n", 236 | "from keras import backend as K\n", 237 | "from sklearn.decomposition import TruncatedSVD, NMF, LatentDirichletAllocation\n", 238 | "from keras.layers import SpatialDropout1D\n", 239 | "from keras.layers.wrappers import Bidirectional" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": 17, 245 | "metadata": {}, 246 | "outputs": [], 247 | "source": [ 248 | "def TextCNN(max_len,max_cnt,embed_size,\n", 249 | " num_filters,kernel_size,\n", 250 | " conv_action,\n", 251 | " mask_zero):\n", 252 | " _input = Input(shape=(max_len,), dtype='int32')\n", 253 | " _embed = Embedding(max_cnt, embed_size, input_length=max_len, mask_zero=mask_zero)(_input)\n", 254 | " _embed = SpatialDropout1D(0.25)(_embed)\n", 255 | " warppers = []\n", 256 | " for _kernel_size in kernel_size:\n", 257 | " for dilated_rate in [1,2,3,4]:\n", 258 | " conv1d = Conv1D(filters=num_filters, kernel_size=_kernel_size, 
activation=conv_action, dilation_rate=dilated_rate)(_embed)\n", 259 | "            warppers.append(GlobalMaxPooling1D()(conv1d))\n", 260 | "    \n", 261 | "    fc = concatenate(warppers)\n", 262 | "    fc = Dropout(0.5)(fc)\n", 263 | "    #fc = BatchNormalization()(fc)\n", 264 | "    fc = Dense(256, activation='relu')(fc)\n", 265 | "    fc = Dropout(0.25)(fc)\n", 266 | "    #fc = BatchNormalization()(fc) \n", 267 | "    preds = Dense(8, activation = 'softmax')(fc)\n", 268 | "    \n", 269 | "    model = Model(inputs=_input, outputs=preds)\n", 270 | "    \n", 271 | "    model.compile(loss='categorical_crossentropy',\n", 272 | "                  optimizer='adam',\n", 273 | "                  metrics=['accuracy'])\n", 274 | "    return model" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": 18, 280 | "metadata": {}, 281 | "outputs": [], 282 | "source": [ 283 | "train_labels = pd.get_dummies(train_df.label).values\n", 284 | "train_seq = pad_sequences(train_df.seq.values, maxlen = 6000)\n", 285 | "test_seq = pad_sequences(test_df.seq.values, maxlen = 6000)" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": 20, 291 | "metadata": {}, 292 | "outputs": [], 293 | "source": [ 294 | "from sklearn.cross_validation import StratifiedKFold\nskf = StratifiedKFold(train_labels, n_folds=5, shuffle=True, random_state=42)" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": 21, 300 | "metadata": {}, 301 | "outputs": [], 302 | "source": [ 303 | "max_len = 6000\n", 304 | "max_cnt = 295\n", 305 | "embed_size = 256\n", 306 | "num_filters = 64\n", 307 | "kernel_size = [2,3,4,5]\n", 308 | "conv_action = 'relu'\n", 309 | "mask_zero = False\n", 310 | "TRAIN = True" 311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": 22, 316 | "metadata": {}, 317 | "outputs": [ 318 | { 319 | "name": "stdout", 320 | "output_type": "stream", 321 | "text": [ 322 | "FOLD: \n", 323 | "2780 11107\n", 324 | "FOLD: \n", 325 | "2779 11108\n", 326 | "FOLD: \n", 327 | "2777 11110\n", 328 | "FOLD: \n", 329 | "2776 11111\n", 330 | "Train on 11111 samples, validate on 2776 samples\n", 331 | "Epoch 1/100\n", 332 | "11111/11111 [==============================] - 142s 13ms/step - loss: 0.9257 - acc: 0.6915 - val_loss: 0.4994 - val_acc: 0.8505\n", 333 | "Epoch 2/100\n", 334 | "11111/11111 [==============================] - 116s 10ms/step - loss: 0.5334 - acc: 0.8335 - val_loss: 0.4226 - val_acc: 0.8689\n", 335 | "Epoch 3/100\n", 336 | "11111/11111 [==============================] - 113s 10ms/step - loss: 0.4632 - acc: 0.8550 - val_loss: 0.3850 - val_acc: 0.8761\n", 337 | "Epoch 4/100\n", 338 | "11111/11111 [==============================] - 113s 10ms/step - loss: 0.4105 - acc: 0.8701 - val_loss: 0.3808 - val_acc: 0.8754\n", 339 | "Epoch 5/100\n", 340 | "11111/11111 [==============================] - 113s 10ms/step - loss: 0.3784 - acc: 0.8763 - val_loss: 0.3663 - val_acc: 0.8829\n", 341 | "Epoch 6/100\n", 342 | "11111/11111 [==============================] - 113s 10ms/step - loss: 0.3536 - acc: 0.8840 - val_loss: 0.3467 - val_acc: 0.8872\n", 343 | "Epoch 7/100\n", 344 | "11111/11111 [==============================] - 113s 10ms/step - loss: 0.3420 - acc: 0.8903 - val_loss: 0.3426 - val_acc: 0.8909\n", 345 | "Epoch 8/100\n", 346 | "11111/11111 [==============================] - 113s 10ms/step - loss: 0.3284 - acc: 0.8941 - val_loss: 0.3377 - val_acc: 0.8945\n", 347 | "Epoch 9/100\n", 348 | "11111/11111 [==============================] - 113s 10ms/step - loss: 0.3133 - acc: 0.8936 - val_loss: 0.3380 - val_acc: 0.8945\n", 349 | "Epoch 10/100\n", 350 | "11111/11111 
[==============================] - 113s 10ms/step - loss: 0.3034 - acc: 0.8971 - val_loss: 0.3415 - val_acc: 0.8923\n", 351 | "Epoch 11/100\n", 352 | "11111/11111 [==============================] - 113s 10ms/step - loss: 0.2916 - acc: 0.9007 - val_loss: 0.3232 - val_acc: 0.8995\n", 353 | "Epoch 12/100\n", 354 | "11111/11111 [==============================] - 113s 10ms/step - loss: 0.2765 - acc: 0.9058 - val_loss: 0.3402 - val_acc: 0.8934\n", 355 | "Epoch 13/100\n", 356 | "11111/11111 [==============================] - 113s 10ms/step - loss: 0.2657 - acc: 0.9086 - val_loss: 0.3294 - val_acc: 0.8984\n", 357 | "Epoch 14/100\n", 358 | "11111/11111 [==============================] - 113s 10ms/step - loss: 0.2620 - acc: 0.9079 - val_loss: 0.3411 - val_acc: 0.8977\n", 359 | "FOLD: \n", 360 | "2775 11112\n", 361 | "Train on 11112 samples, validate on 2775 samples\n", 362 | "Epoch 1/100\n", 363 | "11112/11112 [==============================] - 116s 10ms/step - loss: 0.9019 - acc: 0.7001 - val_loss: 0.4956 - val_acc: 0.8436\n", 364 | "Epoch 2/100\n", 365 | "11112/11112 [==============================] - 113s 10ms/step - loss: 0.5189 - acc: 0.8322 - val_loss: 0.4210 - val_acc: 0.8695\n", 366 | "Epoch 3/100\n", 367 | "11112/11112 [==============================] - 113s 10ms/step - loss: 0.4525 - acc: 0.8543 - val_loss: 0.3906 - val_acc: 0.8778\n", 368 | "Epoch 4/100\n", 369 | "11112/11112 [==============================] - 113s 10ms/step - loss: 0.4038 - acc: 0.8721 - val_loss: 0.3832 - val_acc: 0.8674\n", 370 | "Epoch 5/100\n", 371 | "11112/11112 [==============================] - 113s 10ms/step - loss: 0.3802 - acc: 0.8790 - val_loss: 0.3687 - val_acc: 0.8836\n", 372 | "Epoch 6/100\n", 373 | "11112/11112 [==============================] - 113s 10ms/step - loss: 0.3563 - acc: 0.8813 - val_loss: 0.3739 - val_acc: 0.8807\n", 374 | "Epoch 7/100\n", 375 | "11112/11112 [==============================] - 113s 10ms/step - loss: 0.3277 - acc: 0.8909 - val_loss: 0.3597 - val_acc: 0.8840\n", 376 | "Epoch 8/100\n", 377 | "11112/11112 [==============================] - 113s 10ms/step - loss: 0.3239 - acc: 0.8935 - val_loss: 0.3534 - val_acc: 0.8901\n", 378 | "Epoch 9/100\n", 379 | "11112/11112 [==============================] - 113s 10ms/step - loss: 0.3061 - acc: 0.8954 - val_loss: 0.3581 - val_acc: 0.8861\n", 380 | "Epoch 10/100\n", 381 | "11112/11112 [==============================] - 113s 10ms/step - loss: 0.2973 - acc: 0.8994 - val_loss: 0.3528 - val_acc: 0.8901\n", 382 | "Epoch 11/100\n", 383 | "11112/11112 [==============================] - 113s 10ms/step - loss: 0.2875 - acc: 0.9035 - val_loss: 0.3537 - val_acc: 0.8847\n", 384 | "Epoch 12/100\n", 385 | "11112/11112 [==============================] - 113s 10ms/step - loss: 0.2736 - acc: 0.9060 - val_loss: 0.3596 - val_acc: 0.8908\n", 386 | "Epoch 13/100\n", 387 | "11112/11112 [==============================] - 113s 10ms/step - loss: 0.2613 - acc: 0.9078 - val_loss: 0.3521 - val_acc: 0.8908\n", 388 | "Epoch 14/100\n", 389 | "11112/11112 [==============================] - 113s 10ms/step - loss: 0.2639 - acc: 0.9055 - val_loss: 0.3457 - val_acc: 0.8926\n", 390 | "Epoch 15/100\n", 391 | "11112/11112 [==============================] - 113s 10ms/step - loss: 0.2514 - acc: 0.9121 - val_loss: 0.3702 - val_acc: 0.8865\n", 392 | "Epoch 16/100\n", 393 | "11112/11112 [==============================] - 113s 10ms/step - loss: 0.2497 - acc: 0.9112 - val_loss: 0.3684 - val_acc: 0.8905\n", 394 | "Epoch 17/100\n", 395 | "11112/11112 [==============================] - 113s 
10ms/step - loss: 0.2366 - acc: 0.9147 - val_loss: 0.3700 - val_acc: 0.8908\n" 396 | ] 397 | } 398 | ], 399 | "source": [ 400 | "import os\n", 401 | "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"3\"\n", 402 | "meta_train = np.zeros(shape = (len(train_seq),8))\n", 403 | "meta_test = np.zeros(shape = (len(test_seq),8))\n", 404 | "FLAG = False\n", 405 | "for i,(tr_ind,te_ind) in enumerate(skf):\n", 406 | "    if i in [3,4]:\n", 407 | "        FLAG = True\n", 408 | "    print('FOLD: {}'.format(i))\n", 409 | "    print(len(te_ind),len(tr_ind))\n", 410 | "    model = TextCNN(max_len,max_cnt,embed_size,num_filters,kernel_size,conv_action,mask_zero)\n", 411 | "    model_name = 'benchmark_dilated_textcnn_fold_'+str(i)\n", 412 | "    X_train,X_train_label = train_seq[tr_ind],train_labels[tr_ind]\n", 413 | "    X_val,X_val_label = train_seq[te_ind],train_labels[te_ind]\n", 414 | "    \n", 415 | "    model = TextCNN(max_len,max_cnt,embed_size,\n", 416 | "                    num_filters,kernel_size,\n", 417 | "                    conv_action,\n", 418 | "                    mask_zero)\n", 419 | "    \n", 420 | "    model_save_path = '../model_weight_final/%s_%s.hdf5'%(model_name,embed_size)\n", 421 | "    early_stopping = EarlyStopping(monitor='val_loss', patience=3)\n", 422 | "    model_checkpoint = ModelCheckpoint(model_save_path, save_best_only=True, save_weights_only=True)\n", 423 | "    if TRAIN and FLAG:\n", 424 | "        model.fit(X_train,X_train_label,\n", 425 | "                  validation_data=(X_val,X_val_label),\n", 426 | "                  epochs=100,batch_size=64,\n", 427 | "                  shuffle=True,\n", 428 | "                  callbacks=[early_stopping,model_checkpoint]\n", 429 | "                 )\n", 430 | "    \n", 431 | "    #model.load_weights(model_save_path)\n", 432 | "    #pred_val = model.predict(X_val,batch_size=128)\n", 433 | "    #pred_test = model.predict(test_seq,batch_size=128)\n", 434 | "    \n", 435 | "    #meta_train[te_ind] = pred_val\n", 436 | "    #meta_test += pred_test\n", 437 | "    FLAG = False\n", 438 | "    #K.clear_session()\n", 439 | "#meta_test /= 5.0\n" 440 | ] 441 | }, 442 | { 443 | "cell_type": "code", 444 | "execution_count": null, 445 | "metadata": {}, 446 | "outputs": [], 447 | "source": [ 448 | "pd.to_pickle(meta_train,'../feature_final/train_meta_dilated_cnn.pkl')\n", 449 | "pd.to_pickle(meta_test,'../feature_final/test_meta_dilated_cnn.pkl')" 450 | ] 451 | }, 452 | { 453 | "cell_type": "code", 454 | "execution_count": null, 455 | "metadata": {}, 456 | "outputs": [], 457 | "source": [ 458 | "print('1322')" 459 | ] 460 | }, 461 | { 462 | "cell_type": "code", 463 | "execution_count": null, 464 | "metadata": {}, 465 | "outputs": [], 466 | "source": [] 467 | } 468 | ], 469 | "metadata": { 470 | "kernelspec": { 471 | "display_name": "Python 3", 472 | "language": "python", 473 | "name": "python3" 474 | }, 475 | "language_info": { 476 | "codemirror_mode": { 477 | "name": "ipython", 478 | "version": 3 479 | }, 480 | "file_extension": ".py", 481 | "mimetype": "text/x-python", 482 | "name": "python", 483 | "nbconvert_exporter": "python", 484 | "pygments_lexer": "ipython3", 485 | "version": "3.6.5" 486 | } 487 | }, 488 | "nbformat": 4, 489 | "nbformat_minor": 2 490 | } 491 | 
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # First-place code for the 3rd Alibaba Cloud Security Algorithm Challenge 2 | ## [Competition link](https://tianchi.aliyun.com/competition/introduction.htm?spm=5176.100066.0.0.6acd33afwZ9hM7&raceId=231668) 3 | 4 | ## Slides: 上地西二旗人民.pptx 5 | 6 | ## Run the notebooks in the following order 7 | * main_train.ipynb: build the features for the train set 8 | * main_test.ipynb: build the features for the test set 9 | * gene_npy.ipynb: convert the feature tables to .npy files 10 | * lgb_meta_features.ipynb: generate the LightGBM meta-features 11 | * CNN_metafeature.ipynb: generate the CNN meta-features 12 | * CNN_metafeature_dilated.ipynb: generate the dilated-CNN meta-features 13 | * pickle_pre.ipynb: work around the .pkl incompatibility between Python 2 and Python 3 14 | * submit.ipynb: stack the meta-features and produce the final submission
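15 | 
16 | For reference, the whole pipeline can be driven end to end with a small script such as the sketch below. This script is not part of the original repository: it assumes `jupyter nbconvert` is available and that each notebook is opened under the Python version it was written for (the repo mixes Python 2 and Python 3 notebooks, which is why pickle_pre.ipynb exists).
17 | 
18 | ```python
19 | # Hypothetical driver script: re-executes the notebooks in the documented order.
20 | import subprocess
21 | 
22 | NOTEBOOKS = [
23 |     "main_train.ipynb",
24 |     "main_test.ipynb",
25 |     "gene_npy.ipynb",
26 |     "lgb_meta_features.ipynb",
27 |     "CNN_metafeature.ipynb",
28 |     "CNN_metafeature_dilated.ipynb",
29 |     "pickle_pre.ipynb",
30 |     "submit.ipynb",
31 | ]
32 | 
33 | for nb in NOTEBOOKS:
34 |     # --inplace saves the executed outputs back into each notebook.
35 |     subprocess.run(
36 |         ["jupyter", "nbconvert", "--to", "notebook", "--execute", "--inplace", nb],
37 |         check=True,
38 |     )
39 | ```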
40 | 
-------------------------------------------------------------------------------- /gene_npy.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "import numpy as np" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 4, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "test_data_2gram_final = pd.read_csv('./test_data_2gram_final.csv')\n", 20 | "train_data_2gram_final = pd.read_csv('./train_data_2gram_final.csv')" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 6, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "cols = [item for item in train_data_2gram_final.columns if item not in ['label']]\n", 30 | "np.save('../X_test.npy',test_data_2gram_final[cols].values)\n", 31 | "np.save('../X_train.npy',train_data_2gram_final[cols].values)\n", 32 | "np.save('../labels.npy',train_data_2gram_final['label'].values)" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 8, 38 | "metadata": {}, 39 | "outputs": [ 40 | { 41 | "data": { 42 | "text/plain": [ 43 | "((13887, 3252), (12955, 3251))" 44 | ] 45 | }, 46 | "execution_count": 8, 47 | "metadata": {}, 48 | "output_type": "execute_result" 49 | } 50 | ], 51 | "source": [ 52 | "train_data_2gram_final.shape,test_data_2gram_final.shape" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [] 61 | } 62 | ], 63 | "metadata": { 64 | "kernelspec": { 65 | "display_name": "Python 3", 66 | "language": "python", 67 | "name": "python3" 68 | }, 69 | "language_info": { 70 | "codemirror_mode": { 71 | "name": "ipython", 72 | "version": 3 73 | }, 74 | "file_extension": ".py", 75 | "mimetype": "text/x-python", 76 | "name": "python", 77 | "nbconvert_exporter": "python", 78 | "pygments_lexer": "ipython3", 79 | "version": "3.6.5" 80 | } 81 | }, 82 | "nbformat": 4, 83 | "nbformat_minor": 2 84 | } 85 | 
-------------------------------------------------------------------------------- /lgb_meta_features.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stderr", 10 | "output_type": "stream", 11 | "text": [ 12 | "/home/user/anaconda2/lib/python2.7/site-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. 
This module will be removed in 0.20.\n", 13 | " \"This module will be removed in 0.20.\", DeprecationWarning)\n" 14 | ] 15 | } 16 | ], 17 | "source": [ 18 | "import pandas as pd\n", 19 | "import numpy as np\n", 20 | "import lightgbm as lgb\n", 21 | "from sklearn.cross_validation import train_test_split\n", 22 | "import gc\n", 23 | "from sklearn.preprocessing import OneHotEncoder\n", 24 | "from sklearn.cross_validation import StratifiedKFold\n", 25 | "import datetime" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 4, 31 | "metadata": {}, 32 | "outputs": [ 33 | { 34 | "name": "stdout", 35 | "output_type": "stream", 36 | "text": [ 37 | "cur time = 2018/09/21 18:54:08\n", 38 | "(13887, 3251) (12955, 3251)\n", 39 | "cur time = 2018/09/21 18:54:08\n" 40 | ] 41 | } 42 | ], 43 | "source": [ 44 | "print('cur time = ' + str(datetime.datetime.now().strftime(\"%Y/%m/%d %H:%M:%S\")))\n", 45 | "train = np.load('../X_train.npy')\n", 46 | "test = np.load('../X_test.npy')\n", 47 | "train_labels = np.load('../labels.npy')\n", 48 | "print train.shape,test.shape\n", 49 | "print('cur time = ' + str(datetime.datetime.now().strftime(\"%Y/%m/%d %H:%M:%S\")))" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "skf = StratifiedKFold(train_labels, n_folds=5, shuffle=True, random_state=42)\n", 59 | "\n", 60 | "meta_train = np.zeros(shape = (len(train),8))\n", 61 | "meta_test = np.zeros(shape = (len(test),8))\n", 62 | "\n", 63 | "for i,(tr_ind,te_ind) in enumerate(skf):\n", 64 | " print 'FOLD: ',i\n", 65 | " print len(te_ind),len(tr_ind)\n", 66 | " X_train,X_train_label = train[tr_ind],train_labels[tr_ind]\n", 67 | " X_val,X_val_label = train[te_ind],train_labels[te_ind]\n", 68 | " dtrain = lgb.Dataset(X_train,X_train_label) \n", 69 | " dval = lgb.Dataset(X_val,X_val_label, reference = dtrain) \n", 70 | " params = {\n", 71 | " 'task':'train', \n", 72 | " 'boosting_type':'gbdt',\n", 73 | " 'num_leaves': 15,\n", 74 | " 'objective': 'multiclass',\n", 75 | " 'num_class':8,\n", 76 | " 'learning_rate': 0.05,\n", 77 | " 'feature_fraction': 0.85,\n", 78 | " 'subsample':0.85,\n", 79 | " 'num_threads': 32,\n", 80 | " 'metric':'multi_logloss',\n", 81 | " 'seed':100\n", 82 | " } \n", 83 | " model = lgb.train(params, dtrain, num_boost_round=100000,valid_sets=[dtrain,dval],verbose_eval=100, early_stopping_rounds=100) \n", 84 | " pred_val = model.predict(X_val)\n", 85 | " pred_test = model.predict(test)\n", 86 | " \n", 87 | " meta_train[te_ind] = pred_val\n", 88 | " meta_test += pred_test\n", 89 | "meta_test /= 5.0" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "pd.to_pickle(meta_train,'../train_meta_lgb_1.pkl')\n", 99 | "pd.to_pickle(meta_test,'../test_meta_lgb_1.pkl')" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 5, 105 | "metadata": { 106 | "scrolled": false 107 | }, 108 | "outputs": [ 109 | { 110 | "name": "stdout", 111 | "output_type": "stream", 112 | "text": [ 113 | "FOLD: 0\n", 114 | "2780 11107\n", 115 | "Training until validation scores don't improve for 100 rounds.\n", 116 | "[100]\ttraining's multi_logloss: 0.105693\tvalid_1's multi_logloss: 0.290438\n", 117 | "[200]\ttraining's multi_logloss: 0.0243107\tvalid_1's multi_logloss: 0.28446\n", 118 | "Early stopping, best iteration is:\n", 119 | "[145]\ttraining's multi_logloss: 0.0517928\tvalid_1's multi_logloss: 0.277273\n", 120 | 
"FOLD: 1\n", 121 | "2779 11108\n", 122 | "Training until validation scores don't improve for 100 rounds.\n", 123 | "[100]\ttraining's multi_logloss: 0.108126\tvalid_1's multi_logloss: 0.284527\n", 124 | "[200]\ttraining's multi_logloss: 0.0254294\tvalid_1's multi_logloss: 0.283195\n", 125 | "Early stopping, best iteration is:\n", 126 | "[139]\ttraining's multi_logloss: 0.0583621\tvalid_1's multi_logloss: 0.273231\n", 127 | "FOLD: 2\n", 128 | "2777 11110\n", 129 | "Training until validation scores don't improve for 100 rounds.\n", 130 | "[100]\ttraining's multi_logloss: 0.107591\tvalid_1's multi_logloss: 0.271276\n", 131 | "[200]\ttraining's multi_logloss: 0.0256978\tvalid_1's multi_logloss: 0.267876\n", 132 | "Early stopping, best iteration is:\n", 133 | "[151]\ttraining's multi_logloss: 0.0490566\tvalid_1's multi_logloss: 0.258754\n", 134 | "FOLD: 3\n", 135 | "2776 11111\n", 136 | "Training until validation scores don't improve for 100 rounds.\n", 137 | "[100]\ttraining's multi_logloss: 0.109872\tvalid_1's multi_logloss: 0.2752\n", 138 | "[200]\ttraining's multi_logloss: 0.0267958\tvalid_1's multi_logloss: 0.266528\n", 139 | "Early stopping, best iteration is:\n", 140 | "[153]\ttraining's multi_logloss: 0.0492415\tvalid_1's multi_logloss: 0.260417\n", 141 | "FOLD: 4\n", 142 | "2775 11112\n", 143 | "Training until validation scores don't improve for 100 rounds.\n", 144 | "[100]\ttraining's multi_logloss: 0.108239\tvalid_1's multi_logloss: 0.286993\n", 145 | "[200]\ttraining's multi_logloss: 0.0260953\tvalid_1's multi_logloss: 0.276078\n", 146 | "Early stopping, best iteration is:\n", 147 | "[155]\ttraining's multi_logloss: 0.0471788\tvalid_1's multi_logloss: 0.270497\n" 148 | ] 149 | } 150 | ], 151 | "source": [ 152 | "skf = StratifiedKFold(train_labels, n_folds=5, shuffle=True, random_state=42)\n", 153 | "\n", 154 | "meta_train = np.zeros(shape = (len(train),8))\n", 155 | "meta_test = np.zeros(shape = (len(test),8))\n", 156 | "\n", 157 | "for i,(tr_ind,te_ind) in enumerate(skf):\n", 158 | " print 'FOLD: ',i\n", 159 | " print len(te_ind),len(tr_ind)\n", 160 | " X_train,X_train_label = train[tr_ind],train_labels[tr_ind]\n", 161 | " X_val,X_val_label = train[te_ind],train_labels[te_ind]\n", 162 | " dtrain = lgb.Dataset(X_train,X_train_label) \n", 163 | " dval = lgb.Dataset(X_val,X_val_label, reference = dtrain) \n", 164 | " params = {\n", 165 | " 'task':'train', \n", 166 | " 'boosting_type':'gbdt',\n", 167 | " 'num_leaves': 31,\n", 168 | " 'objective': 'multiclass',\n", 169 | " 'num_class':8,\n", 170 | " 'learning_rate': 0.05,\n", 171 | " 'feature_fraction': 0.85,\n", 172 | " 'subsample':0.85,\n", 173 | " 'num_threads': 32,\n", 174 | " 'metric':'multi_logloss',\n", 175 | " 'seed':100\n", 176 | " } \n", 177 | " model = lgb.train(params, dtrain, num_boost_round=100000,valid_sets=[dtrain,dval],verbose_eval=100, early_stopping_rounds=100) \n", 178 | " pred_val = model.predict(X_val)\n", 179 | " pred_test = model.predict(test)\n", 180 | " \n", 181 | " meta_train[te_ind] = pred_val\n", 182 | " meta_test += pred_test\n", 183 | "meta_test /= 5.0" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": 6, 189 | "metadata": {}, 190 | "outputs": [], 191 | "source": [ 192 | "pd.to_pickle(meta_train,'../train_meta_lgb_2.pkl')\n", 193 | "pd.to_pickle(meta_test,'../test_meta_lgb_2.pkl')" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": 7, 199 | "metadata": {}, 200 | "outputs": [ 201 | { 202 | "name": "stdout", 203 | "output_type": "stream", 204 | "text": [ 
205 | "FOLD: 0\n", 206 | "2780 11107\n", 207 | "Training until validation scores don't improve for 100 rounds.\n", 208 | "[100]\ttraining's multi_logloss: 0.126813\tvalid_1's multi_logloss: 0.299223\n", 209 | "[200]\ttraining's multi_logloss: 0.0319222\tvalid_1's multi_logloss: 0.278803\n", 210 | "Early stopping, best iteration is:\n", 211 | "[161]\ttraining's multi_logloss: 0.0520005\tvalid_1's multi_logloss: 0.276196\n", 212 | "FOLD: 1\n", 213 | "2779 11108\n", 214 | "Training until validation scores don't improve for 100 rounds.\n", 215 | "[100]\ttraining's multi_logloss: 0.128834\tvalid_1's multi_logloss: 0.292494\n", 216 | "[200]\ttraining's multi_logloss: 0.0332951\tvalid_1's multi_logloss: 0.277843\n", 217 | "Early stopping, best iteration is:\n", 218 | "[153]\ttraining's multi_logloss: 0.0597567\tvalid_1's multi_logloss: 0.272742\n", 219 | "FOLD: 2\n", 220 | "2777 11110\n", 221 | "Training until validation scores don't improve for 100 rounds.\n", 222 | "[100]\ttraining's multi_logloss: 0.128497\tvalid_1's multi_logloss: 0.279648\n", 223 | "[200]\ttraining's multi_logloss: 0.0334364\tvalid_1's multi_logloss: 0.263845\n", 224 | "Early stopping, best iteration is:\n", 225 | "[159]\ttraining's multi_logloss: 0.0551787\tvalid_1's multi_logloss: 0.25859\n", 226 | "FOLD: 3\n", 227 | "2776 11111\n", 228 | "Training until validation scores don't improve for 100 rounds.\n", 229 | "[100]\ttraining's multi_logloss: 0.130386\tvalid_1's multi_logloss: 0.286192\n", 230 | "[200]\ttraining's multi_logloss: 0.0347223\tvalid_1's multi_logloss: 0.263253\n", 231 | "Early stopping, best iteration is:\n", 232 | "[169]\ttraining's multi_logloss: 0.0501232\tvalid_1's multi_logloss: 0.260649\n", 233 | "FOLD: 4\n", 234 | "2775 11112\n", 235 | "Training until validation scores don't improve for 100 rounds.\n", 236 | "[100]\ttraining's multi_logloss: 0.129009\tvalid_1's multi_logloss: 0.296055\n", 237 | "[200]\ttraining's multi_logloss: 0.0340881\tvalid_1's multi_logloss: 0.274158\n", 238 | "Early stopping, best iteration is:\n", 239 | "[173]\ttraining's multi_logloss: 0.0469372\tvalid_1's multi_logloss: 0.272973\n" 240 | ] 241 | } 242 | ], 243 | "source": [ 244 | "skf = StratifiedKFold(train_labels, n_folds=5, shuffle=True, random_state=42)\n", 245 | "\n", 246 | "meta_train = np.zeros(shape = (len(train),8))\n", 247 | "meta_test = np.zeros(shape = (len(test),8))\n", 248 | "\n", 249 | "for i,(tr_ind,te_ind) in enumerate(skf):\n", 250 | " print 'FOLD: ',i\n", 251 | " print len(te_ind),len(tr_ind)\n", 252 | " X_train,X_train_label = train[tr_ind],train_labels[tr_ind]\n", 253 | " X_val,X_val_label = train[te_ind],train_labels[te_ind]\n", 254 | " dtrain = lgb.Dataset(X_train,X_train_label) \n", 255 | " dval = lgb.Dataset(X_val,X_val_label, reference = dtrain) \n", 256 | " params = {\n", 257 | " 'task':'train', \n", 258 | " 'boosting_type':'gbdt',\n", 259 | " 'num_leaves': 31,\n", 260 | " 'objective': 'multiclass',\n", 261 | " 'num_class':8,\n", 262 | " 'learning_rate': 0.045,\n", 263 | " 'feature_fraction': 0.8,\n", 264 | " 'subsample':0.8,\n", 265 | " 'num_threads': 32,\n", 266 | " 'metric':'multi_logloss',\n", 267 | " 'seed':100\n", 268 | " } \n", 269 | " model = lgb.train(params, dtrain, num_boost_round=100000,valid_sets=[dtrain,dval],verbose_eval=100, early_stopping_rounds=100) \n", 270 | " pred_val = model.predict(X_val)\n", 271 | " pred_test = model.predict(test)\n", 272 | " \n", 273 | " meta_train[te_ind] = pred_val\n", 274 | " meta_test += pred_test\n", 275 | "meta_test /= 5.0" 276 | ] 277 | }, 278 | { 279 
| "cell_type": "code", 280 | "execution_count": 8, 281 | "metadata": {}, 282 | "outputs": [], 283 | "source": [ 284 | "pd.to_pickle(meta_train,'../train_meta_lgb_3.pkl')\n", 285 | "pd.to_pickle(meta_test,'../test_meta_lgb_3.pkl')" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": null, 291 | "metadata": {}, 292 | "outputs": [], 293 | "source": [] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": 9, 298 | "metadata": {}, 299 | "outputs": [ 300 | { 301 | "name": "stdout", 302 | "output_type": "stream", 303 | "text": [ 304 | "FOLD: 0\n", 305 | "2780 11107\n", 306 | "Training until validation scores don't improve for 100 rounds.\n", 307 | "[100]\ttraining's multi_logloss: 0.0771172\tvalid_1's multi_logloss: 0.289138\n", 308 | "[200]\ttraining's multi_logloss: 0.00851115\tvalid_1's multi_logloss: 0.298243\n", 309 | "Early stopping, best iteration is:\n", 310 | "[133]\ttraining's multi_logloss: 0.0357694\tvalid_1's multi_logloss: 0.27818\n", 311 | "FOLD: 1\n", 312 | "2779 11108\n", 313 | "Training until validation scores don't improve for 100 rounds.\n", 314 | "[100]\ttraining's multi_logloss: 0.0780999\tvalid_1's multi_logloss: 0.289059\n", 315 | "[200]\ttraining's multi_logloss: 0.00887645\tvalid_1's multi_logloss: 0.298286\n", 316 | "Early stopping, best iteration is:\n", 317 | "[134]\ttraining's multi_logloss: 0.0357742\tvalid_1's multi_logloss: 0.278663\n", 318 | "FOLD: 2\n", 319 | "2777 11110\n", 320 | "Training until validation scores don't improve for 100 rounds.\n", 321 | "[100]\ttraining's multi_logloss: 0.0784245\tvalid_1's multi_logloss: 0.274011\n", 322 | "[200]\ttraining's multi_logloss: 0.00891692\tvalid_1's multi_logloss: 0.282485\n", 323 | "Early stopping, best iteration is:\n", 324 | "[134]\ttraining's multi_logloss: 0.0356565\tvalid_1's multi_logloss: 0.263027\n", 325 | "FOLD: 3\n", 326 | "2776 11111\n", 327 | "Training until validation scores don't improve for 100 rounds.\n", 328 | "[100]\ttraining's multi_logloss: 0.0795669\tvalid_1's multi_logloss: 0.280272\n", 329 | "[200]\ttraining's multi_logloss: 0.00927117\tvalid_1's multi_logloss: 0.284248\n", 330 | "Early stopping, best iteration is:\n", 331 | "[135]\ttraining's multi_logloss: 0.0357068\tvalid_1's multi_logloss: 0.267277\n", 332 | "FOLD: 4\n", 333 | "2775 11112\n", 334 | "Training until validation scores don't improve for 100 rounds.\n", 335 | "[100]\ttraining's multi_logloss: 0.0782005\tvalid_1's multi_logloss: 0.287082\n", 336 | "[200]\ttraining's multi_logloss: 0.00896856\tvalid_1's multi_logloss: 0.294814\n", 337 | "Early stopping, best iteration is:\n", 338 | "[129]\ttraining's multi_logloss: 0.0400827\tvalid_1's multi_logloss: 0.277252\n" 339 | ] 340 | } 341 | ], 342 | "source": [ 343 | "skf = StratifiedKFold(train_labels, n_folds=5, shuffle=True, random_state=42)\n", 344 | "\n", 345 | "meta_train = np.zeros(shape = (len(train),8))\n", 346 | "meta_test = np.zeros(shape = (len(test),8))\n", 347 | "\n", 348 | "for i,(tr_ind,te_ind) in enumerate(skf):\n", 349 | " print 'FOLD: ',i\n", 350 | " print len(te_ind),len(tr_ind)\n", 351 | " X_train,X_train_label = train[tr_ind],train_labels[tr_ind]\n", 352 | " X_val,X_val_label = train[te_ind],train_labels[te_ind]\n", 353 | " dtrain = lgb.Dataset(X_train,X_train_label) \n", 354 | " dval = lgb.Dataset(X_val,X_val_label, reference = dtrain) \n", 355 | " params = {\n", 356 | " 'task':'train', \n", 357 | " 'boosting_type':'gbdt',\n", 358 | " 'num_leaves': 63,\n", 359 | " 'objective': 'multiclass',\n", 360 | " 'num_class':8,\n", 
361 | " 'learning_rate': 0.045,\n", 362 | " 'feature_fraction': 0.5,\n", 363 | " 'subsample':0.7,\n", 364 | " 'num_threads': 54,\n", 365 | " 'metric':'multi_logloss',\n", 366 | " 'seed':100\n", 367 | " } \n", 368 | " model = lgb.train(params, dtrain, num_boost_round=100000,valid_sets=[dtrain,dval],verbose_eval=100, early_stopping_rounds=100) \n", 369 | " pred_val = model.predict(X_val)\n", 370 | " pred_test = model.predict(test)\n", 371 | " \n", 372 | " meta_train[te_ind] = pred_val\n", 373 | " meta_test += pred_test\n", 374 | "meta_test /= 5.0\n", 375 | "\n", 376 | "pd.to_pickle(meta_train,'../train_meta_lgb_4.pkl')\n", 377 | "pd.to_pickle(meta_test,'../test_meta_lgb_4.pkl')" 378 | ] 379 | }, 380 | { 381 | "cell_type": "code", 382 | "execution_count": null, 383 | "metadata": {}, 384 | "outputs": [], 385 | "source": [] 386 | } 387 | ], 388 | "metadata": { 389 | "kernelspec": { 390 | "display_name": "Python 3", 391 | "language": "python", 392 | "name": "python3" 393 | }, 394 | "language_info": { 395 | "codemirror_mode": { 396 | "name": "ipython", 397 | "version": 3 398 | }, 399 | "file_extension": ".py", 400 | "mimetype": "text/x-python", 401 | "name": "python", 402 | "nbconvert_exporter": "python", 403 | "pygments_lexer": "ipython3", 404 | "version": "3.6.5" 405 | } 406 | }, 407 | "nbformat": 4, 408 | "nbformat_minor": 2 409 | } 410 | -------------------------------------------------------------------------------- /main_test.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 工具包导入&数据读取\n", 8 | "## 工具包导入" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 1, 14 | "metadata": {}, 15 | "outputs": [ 16 | { 17 | "name": "stderr", 18 | "output_type": "stream", 19 | "text": [ 20 | "/home/zhongrunxing/anaconda2/lib/python2.7/site-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. 
This module will be removed in 0.20.\n", 21 | " \"This module will be removed in 0.20.\", DeprecationWarning)\n" 22 | ] 23 | } 24 | ], 25 | "source": [ 26 | "import pandas as pd\n", 27 | "import numpy as np\n", 28 | "import seaborn as sns\n", 29 | "import matplotlib.pyplot as plt\n", 30 | "#import lightgbm as lgb\n", 31 | "from sklearn.cross_validation import train_test_split\n", 32 | "import gc\n", 33 | "from sklearn.preprocessing import OneHotEncoder\n", 34 | "%matplotlib inline " 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 2, 40 | "metadata": {}, 41 | "outputs": [ 42 | { 43 | "data": { 44 | "text/plain": [ 45 | "u'/mnt/disk0/home/zhongrunxing/jupyter_code/tianchi_safe'" 46 | ] 47 | }, 48 | "execution_count": 2, 49 | "metadata": {}, 50 | "output_type": "execute_result" 51 | } 52 | ], 53 | "source": [ 54 | "%pwd" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "metadata": {}, 67 | "source": [ 68 | "## 数据读取\n", 69 | "- 为了方便分析,我们读取3000万条数据进行处理" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 3, 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "path = '/home/zhongrunxing/jupyter_code/tianchi_safe/input/'\n", 79 | "#train = pd.read_csv(path + 'final_train.csv',nrows=1000000)\n", 80 | "#train = pd.read_csv(path + 'final_test.csv',nrows=1000000)\n" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 4, 86 | "metadata": {}, 87 | "outputs": [ 88 | { 89 | "name": "stdout", 90 | "output_type": "stream", 91 | "text": [ 92 | "(79288375, 4)\n" 93 | ] 94 | } 95 | ], 96 | "source": [ 97 | "train = pd.read_csv(path + 'final_test.csv')\n", 98 | "print(train.shape)" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "metadata": {}, 111 | "source": [ 112 | "# 特征工程 & 验证结果(1-Gram)" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 5, 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [ 121 | "# train_data = train[['file_id','label']].drop_duplicates()\n", 122 | "# train_data.head()\n", 123 | "# train_data['label'].value_counts()" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 6, 129 | "metadata": {}, 130 | "outputs": [ 131 | { 132 | "data": { 133 | "text/html": [ 134 | "
\n", 135 | "\n", 148 | "\n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | "
file_id
01
972
14583
14744
16675
\n", 178 | "
" 179 | ], 180 | "text/plain": [ 181 | " file_id\n", 182 | "0 1\n", 183 | "97 2\n", 184 | "1458 3\n", 185 | "1474 4\n", 186 | "1667 5" 187 | ] 188 | }, 189 | "execution_count": 6, 190 | "metadata": {}, 191 | "output_type": "execute_result" 192 | } 193 | ], 194 | "source": [ 195 | "train_data = train[['file_id']].drop_duplicates()\n", 196 | "train_data.head()" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": null, 202 | "metadata": {}, 203 | "outputs": [], 204 | "source": [] 205 | }, 206 | { 207 | "cell_type": "markdown", 208 | "metadata": {}, 209 | "source": [ 210 | "## 全局特征:\n", 211 | "- File_id (Api): count,nunique\n", 212 | "- File_id (Tid): count,nunique,max,min,quantile(20,40,50,60,80),std,range\n", 213 | "- File_id (Return Value): count,nunique,max,min,quantile(20,40,50,60,80),std,range\n", 214 | "- File_id (Index): count,nunique,max,min,quantile(20,40,50,60,80),std,range" 215 | ] 216 | }, 217 | { 218 | "cell_type": "markdown", 219 | "metadata": {}, 220 | "source": [ 221 | "### File_id (Api): count,nunique" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": 7, 227 | "metadata": {}, 228 | "outputs": [ 229 | { 230 | "data": { 231 | "text/html": [ 232 | "
\n", 233 | "\n", 246 | "\n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | "
file_idapitidindex
01RegOpenKeyExA23320
11CopyFileA23321
21OpenSCManagerA23322
31CreateServiceA23323
41RegOpenKeyExA24680
\n", 294 | "
" 295 | ], 296 | "text/plain": [ 297 | " file_id api tid index\n", 298 | "0 1 RegOpenKeyExA 2332 0\n", 299 | "1 1 CopyFileA 2332 1\n", 300 | "2 1 OpenSCManagerA 2332 2\n", 301 | "3 1 CreateServiceA 2332 3\n", 302 | "4 1 RegOpenKeyExA 2468 0" 303 | ] 304 | }, 305 | "execution_count": 7, 306 | "metadata": {}, 307 | "output_type": "execute_result" 308 | } 309 | ], 310 | "source": [ 311 | "train.head()" 312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "execution_count": 8, 317 | "metadata": {}, 318 | "outputs": [ 319 | { 320 | "name": "stdout", 321 | "output_type": "stream", 322 | "text": [ 323 | "count\n" 324 | ] 325 | }, 326 | { 327 | "name": "stderr", 328 | "output_type": "stream", 329 | "text": [ 330 | "/mnt/disk0/home/zhongrunxing/anaconda2/lib/python2.7/site-packages/ipykernel_launcher.py:4: FutureWarning: using a dict on a Series for aggregation\n", 331 | "is deprecated and will be removed in a future version\n", 332 | " after removing the cwd from sys.path.\n" 333 | ] 334 | }, 335 | { 336 | "name": "stdout", 337 | "output_type": "stream", 338 | "text": [ 339 | "nunique\n" 340 | ] 341 | } 342 | ], 343 | "source": [ 344 | "api_opt = ['count','nunique'] \n", 345 | "for opt in api_opt:\n", 346 | " print(opt)\n", 347 | " tmp = train.groupby(['file_id'])['api'].agg({'fileid_api_' + opt: opt}).reset_index() \n", 348 | " train_data = pd.merge(train_data,tmp,how='left', on='file_id') " 349 | ] 350 | }, 351 | { 352 | "cell_type": "code", 353 | "execution_count": 9, 354 | "metadata": {}, 355 | "outputs": [ 356 | { 357 | "data": { 358 | "text/html": [ 359 | "
\n", 360 | "\n", 373 | "\n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | "
file_idfileid_api_countfileid_api_nunique
019715
12136140
23169
3419334
4580334
\n", 415 | "
" 416 | ], 417 | "text/plain": [ 418 | " file_id fileid_api_count fileid_api_nunique\n", 419 | "0 1 97 15\n", 420 | "1 2 1361 40\n", 421 | "2 3 16 9\n", 422 | "3 4 193 34\n", 423 | "4 5 803 34" 424 | ] 425 | }, 426 | "execution_count": 9, 427 | "metadata": {}, 428 | "output_type": "execute_result" 429 | } 430 | ], 431 | "source": [ 432 | "train_data.head()" 433 | ] 434 | }, 435 | { 436 | "cell_type": "markdown", 437 | "metadata": {}, 438 | "source": [ 439 | "### File_id (Tid): count,nunique,max,min,quantile(20,40,50,60,80),std,range" 440 | ] 441 | }, 442 | { 443 | "cell_type": "code", 444 | "execution_count": 10, 445 | "metadata": {}, 446 | "outputs": [ 447 | { 448 | "name": "stdout", 449 | "output_type": "stream", 450 | "text": [ 451 | "count\n" 452 | ] 453 | }, 454 | { 455 | "name": "stderr", 456 | "output_type": "stream", 457 | "text": [ 458 | "/mnt/disk0/home/zhongrunxing/anaconda2/lib/python2.7/site-packages/ipykernel_launcher.py:4: FutureWarning: using a dict on a Series for aggregation\n", 459 | "is deprecated and will be removed in a future version\n", 460 | " after removing the cwd from sys.path.\n" 461 | ] 462 | }, 463 | { 464 | "name": "stdout", 465 | "output_type": "stream", 466 | "text": [ 467 | "nunique\n", 468 | "max\n", 469 | "min\n", 470 | "median\n", 471 | "std\n" 472 | ] 473 | } 474 | ], 475 | "source": [ 476 | "tid_opt = ['count','nunique','max','min','median','std'] \n", 477 | "for opt in tid_opt:\n", 478 | " print(opt)\n", 479 | " tmp = train.groupby(['file_id'])['tid'].agg({'fileid_tid_' + opt: opt}).reset_index() \n", 480 | " train_data = pd.merge(train_data,tmp,how='left', on='file_id') " 481 | ] 482 | }, 483 | { 484 | "cell_type": "code", 485 | "execution_count": 11, 486 | "metadata": {}, 487 | "outputs": [], 488 | "source": [ 489 | "secs = [0.2,0.4,0.6,0.8]\n", 490 | "for sec in secs: \n", 491 | " train_data['fileid_tid_quantile_' + str(sec * 100)] = train.groupby(['file_id'])['tid'].quantile(sec).values\n", 492 | " \n", 493 | "train_data['fileid_tid_range'] = train.groupby(['file_id'])['tid'].quantile(0.975).values - train.groupby(['file_id'])['tid'].quantile(0.0125).values" 494 | ] 495 | }, 496 | { 497 | "cell_type": "code", 498 | "execution_count": null, 499 | "metadata": {}, 500 | "outputs": [], 501 | "source": [] 502 | }, 503 | { 504 | "cell_type": "markdown", 505 | "metadata": {}, 506 | "source": [ 507 | "### File_id (Index): count,nunique,max,min,quantile(20,40,50,60,80),std,range" 508 | ] 509 | }, 510 | { 511 | "cell_type": "code", 512 | "execution_count": 12, 513 | "metadata": {}, 514 | "outputs": [ 515 | { 516 | "name": "stdout", 517 | "output_type": "stream", 518 | "text": [ 519 | "count\n" 520 | ] 521 | }, 522 | { 523 | "name": "stderr", 524 | "output_type": "stream", 525 | "text": [ 526 | "/mnt/disk0/home/zhongrunxing/anaconda2/lib/python2.7/site-packages/ipykernel_launcher.py:4: FutureWarning: using a dict on a Series for aggregation\n", 527 | "is deprecated and will be removed in a future version\n", 528 | " after removing the cwd from sys.path.\n" 529 | ] 530 | }, 531 | { 532 | "name": "stdout", 533 | "output_type": "stream", 534 | "text": [ 535 | "nunique\n", 536 | "max\n", 537 | "min\n", 538 | "median\n", 539 | "std\n" 540 | ] 541 | } 542 | ], 543 | "source": [ 544 | "index_opt = ['count','nunique','max','min','median','std'] \n", 545 | "for opt in index_opt:\n", 546 | " print(opt)\n", 547 | " tmp = train.groupby(['file_id'])['index'].agg({'fileid_index_' + opt: opt}).reset_index() \n", 548 | " train_data = pd.merge(train_data,tmp,how='left', 
on='file_id') " 549 | ] 550 | }, 551 | { 552 | "cell_type": "code", 553 | "execution_count": 13, 554 | "metadata": {}, 555 | "outputs": [], 556 | "source": [ 557 | "secs = [0.2,0.4,0.6,0.8]\n", 558 | "for sec in secs: \n", 559 | " train_data['fileid_index_quantile_' + str(sec * 100)] = train.groupby(['file_id'])['index'].quantile(sec).values\n", 560 | " \n", 561 | "train_data['fileid_index_range'] = train.groupby(['file_id'])['index'].quantile(0.975).values - train.groupby(['file_id'])['index'].quantile(0.0125).values" 562 | ] 563 | }, 564 | { 565 | "cell_type": "markdown", 566 | "metadata": {}, 567 | "source": [ 568 | "### 全局特征的线下验证 ( 0.0969482)" 569 | ] 570 | }, 571 | { 572 | "cell_type": "markdown", 573 | "metadata": {}, 574 | "source": [ 575 | "#### 评估指标" 576 | ] 577 | }, 578 | { 579 | "cell_type": "code", 580 | "execution_count": 14, 581 | "metadata": {}, 582 | "outputs": [], 583 | "source": [ 584 | "def lgb_logloss(preds,data):\n", 585 | " labels_ = data.get_label()\n", 586 | " classes_ = np.unique(labels_) \n", 587 | " preds_prob = []\n", 588 | " for i in range(len(classes_)):\n", 589 | " preds_prob.append(preds[i*len(labels_):(i+1) * len(labels_)])\n", 590 | " preds_prob_ = np.vstack(preds_prob) \n", 591 | " \n", 592 | " loss = [] \n", 593 | " for i in range(preds_prob_.shape[1]): # 样本个数\n", 594 | " sum_ = 0 \n", 595 | " for j in range(preds_prob_.shape[0]): #类别个数\n", 596 | " pred = preds_prob_[j,i] # 第i个样本预测为第j类的概率\n", 597 | " if j == labels_[i]:\n", 598 | " sum_ += np.log(pred)\n", 599 | " else:\n", 600 | " sum_ += np.log(1 - pred) \n", 601 | " \n", 602 | " loss.append(sum_) \n", 603 | " \n", 604 | " return 'loss is: ' ,-1 * (np.sum(loss) / preds_prob_.shape[1]),False" 605 | ] 606 | }, 607 | { 608 | "cell_type": "markdown", 609 | "metadata": {}, 610 | "source": [ 611 | "#### 训练特征 & 标签" 612 | ] 613 | }, 614 | { 615 | "cell_type": "code", 616 | "execution_count": 15, 617 | "metadata": {}, 618 | "outputs": [], 619 | "source": [ 620 | "train_features = [col for col in train_data.columns if col!='label' and col!='file_id']\n", 621 | "train_label = 'label'" 622 | ] 623 | }, 624 | { 625 | "cell_type": "code", 626 | "execution_count": 16, 627 | "metadata": {}, 628 | "outputs": [], 629 | "source": [ 630 | "# train_X, test_X, train_Y, test_Y = train_test_split( train_data[train_features],train_data[train_label].values, test_size = 0.33) \n", 631 | "# del _\n", 632 | "# gc.collect()\n", 633 | "\n", 634 | "# train_ind = train_X.index\n", 635 | "# test_ind = test_X.index" 636 | ] 637 | }, 638 | { 639 | "cell_type": "code", 640 | "execution_count": null, 641 | "metadata": {}, 642 | "outputs": [], 643 | "source": [] 644 | }, 645 | { 646 | "cell_type": "code", 647 | "execution_count": null, 648 | "metadata": {}, 649 | "outputs": [], 650 | "source": [] 651 | }, 652 | { 653 | "cell_type": "code", 654 | "execution_count": 17, 655 | "metadata": { 656 | "scrolled": true 657 | }, 658 | "outputs": [], 659 | "source": [ 660 | "# dtrain = lgb.Dataset(train_X,train_Y) \n", 661 | "# dval = lgb.Dataset(test_X,test_Y, reference = dtrain) \n", 662 | "\n", 663 | "# params = {\n", 664 | "# 'task':'train', \n", 665 | "# 'num_leaves': 255,\n", 666 | "# 'objective': 'multiclass',\n", 667 | "# 'num_class':8,\n", 668 | "# #'min_data_in_leaf': 40,\n", 669 | "# 'min_data_in_leaf': 1,\n", 670 | "# 'learning_rate': 0.05,\n", 671 | "# 'feature_fraction': 0.85,\n", 672 | "# 'bagging_fraction': 0.9,\n", 673 | "# 'bagging_freq': 5, \n", 674 | "# 'max_bin':128,\n", 675 | "# 'num_threads': 10,\n", 676 | "# 
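`lgb_logloss` above walks samples and classes in pure Python loops, which is slow when called on every boosting round. It is also worth being precise about what it measures: for each sample it adds `log(p)` for the true class plus `log(1 - p)` for every other class, i.e. a summed one-vs-rest log loss, not the plain `multi_logloss` that LightGBM's built-in metric reports. A vectorized sketch of the same quantity (hypothetical drop-in; it assumes the class-major flattened `preds` layout that the original's slicing relies on):

```python
import numpy as np

def lgb_logloss_vectorized(preds, data):
    """Vectorized version of the notebook's custom feval (same semantics)."""
    labels = data.get_label().astype(int)
    n = len(labels)
    prob = preds.reshape(-1, n).T                   # -> (n_samples, n_classes)
    true_mask = np.zeros_like(prob, dtype=bool)
    true_mask[np.arange(n), labels] = True
    # log(p) on the true class, log(1 - p) on every other class, summed per sample
    per_sample = np.where(true_mask, np.log(prob), np.log1p(-prob)).sum(axis=1)
    return 'loss is: ', -per_sample.mean(), False
```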
'random_state':100\n", 677 | "# } \n", 678 | "# lgb_model_0_order = lgb.train(params, dtrain, num_boost_round=500,valid_sets=[dtrain,dval], early_stopping_rounds=50, feval=lgb_logloss) " 679 | ] 680 | }, 681 | { 682 | "cell_type": "markdown", 683 | "metadata": {}, 684 | "source": [ 685 | "### 全局特征扩充\n", 686 | "- File_id + return_value分段:计数" 687 | ] 688 | }, 689 | { 690 | "cell_type": "markdown", 691 | "metadata": {}, 692 | "source": [ 693 | "## 局部组合特征(展开形式)\n", 694 | "### File_id + Api \n", 695 | "- File_id + Api (tid): count,nunique\n", 696 | "- File_id + Api (return value): nunique, max, min, median, std\n", 697 | "- File_id + Api (index): nunique, max, min, median, std\n", 698 | "\n" 699 | ] 700 | }, 701 | { 702 | "cell_type": "markdown", 703 | "metadata": {}, 704 | "source": [ 705 | "#### File_id + Api (tid): count,nunique" 706 | ] 707 | }, 708 | { 709 | "cell_type": "code", 710 | "execution_count": 18, 711 | "metadata": {}, 712 | "outputs": [], 713 | "source": [ 714 | "def groupby_pivot_features(data_merge, data_orig , groupby_features,col1 = None, col2 = None, opts = None):\n", 715 | " for opt in opts:\n", 716 | " print(opt)\n", 717 | " train_split = data_orig.groupby(['file_id',col1])[col2].agg({'fileid_' + col1 + '_'+col2+'_'+ str(opt):opt}).reset_index() \n", 718 | " \n", 719 | " train_split_ = pd.pivot_table(train_split, values = 'fileid_' + col1 + '_'+col2+'_'+ str(opt), index=['file_id'],columns=[col1])\n", 720 | " new_cols = [ 'fileid_' + col1 + '_'+col2+ '_' + opt + '_' + str(col) for col in train_split_.columns]\n", 721 | " \n", 722 | " groupby_features.append(new_cols)\n", 723 | " train_split_.columns = new_cols \n", 724 | "\n", 725 | " train_split_.reset_index(inplace = True)\n", 726 | " \n", 727 | " data_merge = pd.merge(data_merge,train_split_,how='left', on='file_id') \n", 728 | " return data_merge,groupby_features \n", 729 | " " 730 | ] 731 | }, 732 | { 733 | "cell_type": "code", 734 | "execution_count": 19, 735 | "metadata": {}, 736 | "outputs": [ 737 | { 738 | "name": "stdout", 739 | "output_type": "stream", 740 | "text": [ 741 | "count\n" 742 | ] 743 | }, 744 | { 745 | "name": "stderr", 746 | "output_type": "stream", 747 | "text": [ 748 | "/mnt/disk0/home/zhongrunxing/anaconda2/lib/python2.7/site-packages/ipykernel_launcher.py:4: FutureWarning: using a dict on a Series for aggregation\n", 749 | "is deprecated and will be removed in a future version\n", 750 | " after removing the cwd from sys.path.\n" 751 | ] 752 | }, 753 | { 754 | "name": "stdout", 755 | "output_type": "stream", 756 | "text": [ 757 | "nunique\n" 758 | ] 759 | } 760 | ], 761 | "source": [ 762 | "groupby_features = []\n", 763 | "api_opts = ['count', 'nunique']\n", 764 | "train_data_,groupby_features = groupby_pivot_features(train_data, train, groupby_features, col1 = 'api', col2 = 'tid', opts = api_opts)" 765 | ] 766 | }, 767 | { 768 | "cell_type": "markdown", 769 | "metadata": {}, 770 | "source": [ 771 | "#### File_id + Api (return value): nunique, max, min, median, std" 772 | ] 773 | }, 774 | { 775 | "cell_type": "code", 776 | "execution_count": 20, 777 | "metadata": { 778 | "scrolled": true 779 | }, 780 | "outputs": [], 781 | "source": [ 782 | "# api_opts = ['nunique','max','min','median','std']\n", 783 | "# train_data_,groupby_features = groupby_pivot_features(train_data_, train, groupby_features, col1 = 'api', col2 = 'return_value', opts = api_opts) " 784 | ] 785 | }, 786 | { 787 | "cell_type": "markdown", 788 | "metadata": {}, 789 | "source": [ 790 | "#### File_id + Api(index): nunique, max, 
min, median, std" 791 | ] 792 | }, 793 | { 794 | "cell_type": "code", 795 | "execution_count": 21, 796 | "metadata": {}, 797 | "outputs": [ 798 | { 799 | "name": "stdout", 800 | "output_type": "stream", 801 | "text": [ 802 | "nunique\n" 803 | ] 804 | }, 805 | { 806 | "name": "stderr", 807 | "output_type": "stream", 808 | "text": [ 809 | "/mnt/disk0/home/zhongrunxing/anaconda2/lib/python2.7/site-packages/ipykernel_launcher.py:4: FutureWarning: using a dict on a Series for aggregation\n", 810 | "is deprecated and will be removed in a future version\n", 811 | " after removing the cwd from sys.path.\n" 812 | ] 813 | }, 814 | { 815 | "name": "stdout", 816 | "output_type": "stream", 817 | "text": [ 818 | "max\n", 819 | "min\n", 820 | "median\n", 821 | "std\n" 822 | ] 823 | } 824 | ], 825 | "source": [ 826 | "api_opts = ['nunique','max','min','median','std']\n", 827 | "train_data_,groupby_features = groupby_pivot_features(train_data_, train, groupby_features, col1 = 'api', col2 = 'index', opts = api_opts) " 828 | ] 829 | }, 830 | { 831 | "cell_type": "code", 832 | "execution_count": 22, 833 | "metadata": {}, 834 | "outputs": [ 835 | { 836 | "data": { 837 | "text/html": [ 838 | "
\n", 839 | "\n", 852 | "\n", 853 | " \n", 854 | " \n", 855 | " \n", 856 | " \n", 857 | " \n", 858 | " \n", 859 | " \n", 860 | " \n", 861 | " \n", 862 | " \n", 863 | " \n", 864 | " \n", 865 | " \n", 866 | " \n", 867 | " \n", 868 | " \n", 869 | " \n", 870 | " \n", 871 | " \n", 872 | " \n", 873 | " \n", 874 | " \n", 875 | " \n", 876 | " \n", 877 | " \n", 878 | " \n", 879 | " \n", 880 | " \n", 881 | " \n", 882 | " \n", 883 | " \n", 884 | " \n", 885 | " \n", 886 | " \n", 887 | " \n", 888 | " \n", 889 | " \n", 890 | " \n", 891 | " \n", 892 | " \n", 893 | " \n", 894 | " \n", 895 | " \n", 896 | " \n", 897 | " \n", 898 | " \n", 899 | " \n", 900 | " \n", 901 | " \n", 902 | " \n", 903 | " \n", 904 | " \n", 905 | " \n", 906 | " \n", 907 | " \n", 908 | " \n", 909 | " \n", 910 | " \n", 911 | " \n", 912 | " \n", 913 | " \n", 914 | " \n", 915 | " \n", 916 | " \n", 917 | " \n", 918 | " \n", 919 | " \n", 920 | " \n", 921 | " \n", 922 | " \n", 923 | " \n", 924 | " \n", 925 | " \n", 926 | " \n", 927 | " \n", 928 | " \n", 929 | " \n", 930 | " \n", 931 | " \n", 932 | " \n", 933 | " \n", 934 | " \n", 935 | " \n", 936 | " \n", 937 | " \n", 938 | " \n", 939 | " \n", 940 | " \n", 941 | " \n", 942 | " \n", 943 | " \n", 944 | " \n", 945 | " \n", 946 | " \n", 947 | " \n", 948 | " \n", 949 | " \n", 950 | " \n", 951 | " \n", 952 | " \n", 953 | " \n", 954 | " \n", 955 | " \n", 956 | " \n", 957 | " \n", 958 | " \n", 959 | " \n", 960 | " \n", 961 | " \n", 962 | " \n", 963 | " \n", 964 | " \n", 965 | " \n", 966 | " \n", 967 | " \n", 968 | " \n", 969 | " \n", 970 | " \n", 971 | " \n", 972 | " \n", 973 | " \n", 974 | " \n", 975 | " \n", 976 | " \n", 977 | " \n", 978 | " \n", 979 | " \n", 980 | " \n", 981 | " \n", 982 | " \n", 983 | " \n", 984 | " \n", 985 | " \n", 986 | " \n", 987 | " \n", 988 | " \n", 989 | " \n", 990 | " \n", 991 | " \n", 992 | " \n", 993 | " \n", 994 | " \n", 995 | " \n", 996 | " \n", 997 | " \n", 998 | " \n", 999 | " \n", 1000 | " \n", 1001 | "
file_idfileid_api_countfileid_api_nuniquefileid_tid_countfileid_tid_nuniquefileid_tid_maxfileid_tid_minfileid_tid_medianfileid_tid_stdfileid_tid_quantile_20.0...fileid_api_index_std_recvfileid_api_index_std_recvfromfileid_api_index_std_selectfileid_api_index_std_sendfileid_api_index_std_sendtofileid_api_index_std_setsockoptfileid_api_index_std_shutdownfileid_api_index_std_socketfileid_api_index_std_systemfileid_api_index_std_timeGetTime
01971597425682332254457.2185482468.0...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
1213614013617274824722524104.3991492472.0...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
231691612344234423440.0000002344.0...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
3419334193325842452245250.9515082452.0...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
45803348033278023322376201.8268132332.0...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", 1002 | "

5 rows × 2103 columns

\n", 1003 | "
" 1004 | ], 1005 | "text/plain": [ 1006 | " file_id fileid_api_count fileid_api_nunique fileid_tid_count \\\n", 1007 | "0 1 97 15 97 \n", 1008 | "1 2 1361 40 1361 \n", 1009 | "2 3 16 9 16 \n", 1010 | "3 4 193 34 193 \n", 1011 | "4 5 803 34 803 \n", 1012 | "\n", 1013 | " fileid_tid_nunique fileid_tid_max fileid_tid_min fileid_tid_median \\\n", 1014 | "0 4 2568 2332 2544 \n", 1015 | "1 7 2748 2472 2524 \n", 1016 | "2 1 2344 2344 2344 \n", 1017 | "3 3 2584 2452 2452 \n", 1018 | "4 3 2780 2332 2376 \n", 1019 | "\n", 1020 | " fileid_tid_std fileid_tid_quantile_20.0 ... \\\n", 1021 | "0 57.218548 2468.0 ... \n", 1022 | "1 104.399149 2472.0 ... \n", 1023 | "2 0.000000 2344.0 ... \n", 1024 | "3 50.951508 2452.0 ... \n", 1025 | "4 201.826813 2332.0 ... \n", 1026 | "\n", 1027 | " fileid_api_index_std_recv fileid_api_index_std_recvfrom \\\n", 1028 | "0 NaN NaN \n", 1029 | "1 NaN NaN \n", 1030 | "2 NaN NaN \n", 1031 | "3 NaN NaN \n", 1032 | "4 NaN NaN \n", 1033 | "\n", 1034 | " fileid_api_index_std_select fileid_api_index_std_send \\\n", 1035 | "0 NaN NaN \n", 1036 | "1 NaN NaN \n", 1037 | "2 NaN NaN \n", 1038 | "3 NaN NaN \n", 1039 | "4 NaN NaN \n", 1040 | "\n", 1041 | " fileid_api_index_std_sendto fileid_api_index_std_setsockopt \\\n", 1042 | "0 NaN NaN \n", 1043 | "1 NaN NaN \n", 1044 | "2 NaN NaN \n", 1045 | "3 NaN NaN \n", 1046 | "4 NaN NaN \n", 1047 | "\n", 1048 | " fileid_api_index_std_shutdown fileid_api_index_std_socket \\\n", 1049 | "0 NaN NaN \n", 1050 | "1 NaN NaN \n", 1051 | "2 NaN NaN \n", 1052 | "3 NaN NaN \n", 1053 | "4 NaN NaN \n", 1054 | "\n", 1055 | " fileid_api_index_std_system fileid_api_index_std_timeGetTime \n", 1056 | "0 NaN NaN \n", 1057 | "1 NaN NaN \n", 1058 | "2 NaN NaN \n", 1059 | "3 NaN NaN \n", 1060 | "4 NaN NaN \n", 1061 | "\n", 1062 | "[5 rows x 2103 columns]" 1063 | ] 1064 | }, 1065 | "execution_count": 22, 1066 | "metadata": {}, 1067 | "output_type": "execute_result" 1068 | } 1069 | ], 1070 | "source": [ 1071 | "train_data_.head()" 1072 | ] 1073 | }, 1074 | { 1075 | "cell_type": "markdown", 1076 | "metadata": {}, 1077 | "source": [ 1078 | "### 1阶特征的线下验证(File_id + Api)(0.0347293)" 1079 | ] 1080 | }, 1081 | { 1082 | "cell_type": "markdown", 1083 | "metadata": {}, 1084 | "source": [ 1085 | "### File_id + Index \n", 1086 | "- File_id + Index (api): count,nunique\n", 1087 | "- File_id + Index (return value): nunique, max, min, median, std(暂时先搁置)\n", 1088 | "- File_id + Index (tid): nunique, max, min, median, std(暂时先搁置)\n" 1089 | ] 1090 | }, 1091 | { 1092 | "cell_type": "markdown", 1093 | "metadata": {}, 1094 | "source": [ 1095 | "#### File_id +Tid (api): count,nunique" 1096 | ] 1097 | }, 1098 | { 1099 | "cell_type": "markdown", 1100 | "metadata": {}, 1101 | "source": [ 1102 | "#### File_id + Index特征过拟合,删除\n" 1103 | ] 1104 | }, 1105 | { 1106 | "cell_type": "code", 1107 | "execution_count": 23, 1108 | "metadata": { 1109 | "scrolled": true 1110 | }, 1111 | "outputs": [], 1112 | "source": [ 1113 | "# delcol = []\n", 1114 | "# for i in range(2):\n", 1115 | "# for item in groupby_features2[i]:\n", 1116 | "# delcol.append(item)" 1117 | ] 1118 | }, 1119 | { 1120 | "cell_type": "code", 1121 | "execution_count": 24, 1122 | "metadata": {}, 1123 | "outputs": [], 1124 | "source": [ 1125 | "# train_data_.drop(delcol,axis=1,inplace=True)" 1126 | ] 1127 | }, 1128 | { 1129 | "cell_type": "markdown", 1130 | "metadata": {}, 1131 | "source": [ 1132 | "## 特征补充(加入index的差值特征)\n", 1133 | "- File_id + Api (index_diff): 'nunique','max','min','median','std'" 1134 | ] 1135 | }, 1136 | { 1137 | 
"cell_type": "code", 1138 | "execution_count": 25, 1139 | "metadata": {}, 1140 | "outputs": [], 1141 | "source": [ 1142 | "train_diff = train.groupby(['file_id','tid'])['index'].diff().fillna(-999).values" 1143 | ] 1144 | }, 1145 | { 1146 | "cell_type": "code", 1147 | "execution_count": 26, 1148 | "metadata": {}, 1149 | "outputs": [], 1150 | "source": [ 1151 | "train['index_diff'] = train_diff" 1152 | ] 1153 | }, 1154 | { 1155 | "cell_type": "code", 1156 | "execution_count": 27, 1157 | "metadata": {}, 1158 | "outputs": [], 1159 | "source": [ 1160 | "train_diff = train.loc[train.index_diff!=-999] " 1161 | ] 1162 | }, 1163 | { 1164 | "cell_type": "code", 1165 | "execution_count": 28, 1166 | "metadata": {}, 1167 | "outputs": [ 1168 | { 1169 | "name": "stdout", 1170 | "output_type": "stream", 1171 | "text": [ 1172 | "nunique\n" 1173 | ] 1174 | }, 1175 | { 1176 | "name": "stderr", 1177 | "output_type": "stream", 1178 | "text": [ 1179 | "/mnt/disk0/home/zhongrunxing/anaconda2/lib/python2.7/site-packages/ipykernel_launcher.py:4: FutureWarning: using a dict on a Series for aggregation\n", 1180 | "is deprecated and will be removed in a future version\n", 1181 | " after removing the cwd from sys.path.\n" 1182 | ] 1183 | }, 1184 | { 1185 | "name": "stdout", 1186 | "output_type": "stream", 1187 | "text": [ 1188 | "max\n", 1189 | "min\n", 1190 | "median\n", 1191 | "std\n" 1192 | ] 1193 | } 1194 | ], 1195 | "source": [ 1196 | "api_opts = ['nunique','max','min','median','std']\n", 1197 | "train_data_,groupby_features = groupby_pivot_features(train_data_, train_diff, groupby_features, col1 = 'api', col2 = 'index_diff', opts = api_opts) " 1198 | ] 1199 | }, 1200 | { 1201 | "cell_type": "markdown", 1202 | "metadata": {}, 1203 | "source": [ 1204 | "### 线下验证(0.0346954)" 1205 | ] 1206 | }, 1207 | { 1208 | "cell_type": "code", 1209 | "execution_count": 29, 1210 | "metadata": { 1211 | "scrolled": true 1212 | }, 1213 | "outputs": [], 1214 | "source": [ 1215 | "# train_features = [col for col in train_data_.columns if col!='label' and col!='file_id']\n", 1216 | "# train_label = 'label'\n", 1217 | "# print(len(train_features))\n", 1218 | "# dtrain = lgb.Dataset(train_data_.loc[train_ind,train_features],train_data_.loc[train_ind,train_label].values) \n", 1219 | "# dval = lgb.Dataset(train_data_.loc[test_ind,train_features],train_data_.loc[test_ind,train_label].values, reference = dtrain) \n", 1220 | "\n", 1221 | "# params = {\n", 1222 | "# 'task':'train', \n", 1223 | "# 'num_leaves': 255,\n", 1224 | "# 'objective': 'multiclass',\n", 1225 | "# 'num_class':6,\n", 1226 | "# 'min_data_in_leaf': 40,\n", 1227 | "# 'learning_rate': 0.05,\n", 1228 | "# 'feature_fraction': 0.85,\n", 1229 | "# 'bagging_fraction': 0.9,\n", 1230 | "# 'bagging_freq': 5, \n", 1231 | "# 'max_bin':128,\n", 1232 | "# 'num_threads': 64,\n", 1233 | "# 'random_state':100\n", 1234 | "# } \n", 1235 | "# lgb_model_3_order = lgb.train(params, dtrain, num_boost_round=500,valid_sets=[dtrain,dval], early_stopping_rounds=50, feval=lgb_logloss) " 1236 | ] 1237 | }, 1238 | { 1239 | "cell_type": "markdown", 1240 | "metadata": {}, 1241 | "source": [ 1242 | "### 删除quantile,std统计变量之后的验证(0.0350054) " 1243 | ] 1244 | }, 1245 | { 1246 | "cell_type": "code", 1247 | "execution_count": 30, 1248 | "metadata": { 1249 | "scrolled": true 1250 | }, 1251 | "outputs": [], 1252 | "source": [ 1253 | "# train_features = [col for col in train_data_.columns if col!='label' and col!='file_id' and 'std' not in col and 'quantile' not in col]\n", 1254 | "# train_label = 'label'\n", 1255 
| "# print(len(train_features))\n", 1256 | "# dtrain = lgb.Dataset(train_data_.loc[train_ind,train_features],train_data_.loc[train_ind,train_label].values) \n", 1257 | "# dval = lgb.Dataset(train_data_.loc[test_ind,train_features],train_data_.loc[test_ind,train_label].values, reference = dtrain) \n", 1258 | "\n", 1259 | "# params = {\n", 1260 | "# 'task':'train', \n", 1261 | "# 'num_leaves': 255,\n", 1262 | "# 'objective': 'multiclass',\n", 1263 | "# 'num_class':6,\n", 1264 | "# 'min_data_in_leaf': 40,\n", 1265 | "# 'learning_rate': 0.05,\n", 1266 | "# 'feature_fraction': 0.85,\n", 1267 | "# 'bagging_fraction': 0.9,\n", 1268 | "# 'bagging_freq': 5, \n", 1269 | "# 'max_bin':128,\n", 1270 | "# 'num_threads': 64,\n", 1271 | "# 'random_state':100\n", 1272 | "# } \n", 1273 | "# lgb_model_3_order = lgb.train(params, dtrain, num_boost_round=500,valid_sets=[dtrain,dval], early_stopping_rounds=50, feval=lgb_logloss) " 1274 | ] 1275 | }, 1276 | { 1277 | "cell_type": "code", 1278 | "execution_count": 31, 1279 | "metadata": {}, 1280 | "outputs": [], 1281 | "source": [ 1282 | "# train_data_.to_csv('/data/Data_JieZhang/TC_SAFE/train_val/train_data.csv',index = None) " 1283 | ] 1284 | }, 1285 | { 1286 | "cell_type": "markdown", 1287 | "metadata": {}, 1288 | "source": [ 1289 | "# 特征工程& 验证结果 2-Gram\n", 1290 | "## 全局特征\n", 1291 | "### File_id(Api_2):count,nunique" 1292 | ] 1293 | }, 1294 | { 1295 | "cell_type": "code", 1296 | "execution_count": 32, 1297 | "metadata": {}, 1298 | "outputs": [], 1299 | "source": [ 1300 | "train['api_shift'] = train['api'].shift(-1)\n", 1301 | "train['api_2'] = train['api'] +'_' + train['api_shift']" 1302 | ] 1303 | }, 1304 | { 1305 | "cell_type": "code", 1306 | "execution_count": 33, 1307 | "metadata": {}, 1308 | "outputs": [], 1309 | "source": [ 1310 | "train.drop(['api_shift'],axis=1,inplace=True)" 1311 | ] 1312 | }, 1313 | { 1314 | "cell_type": "code", 1315 | "execution_count": 34, 1316 | "metadata": { 1317 | "scrolled": true 1318 | }, 1319 | "outputs": [], 1320 | "source": [ 1321 | "api_count = train['api_2'].value_counts()" 1322 | ] 1323 | }, 1324 | { 1325 | "cell_type": "code", 1326 | "execution_count": 35, 1327 | "metadata": {}, 1328 | "outputs": [ 1329 | { 1330 | "name": "stdout", 1331 | "output_type": "stream", 1332 | "text": [ 1333 | "count\n" 1334 | ] 1335 | }, 1336 | { 1337 | "name": "stderr", 1338 | "output_type": "stream", 1339 | "text": [ 1340 | "/mnt/disk0/home/zhongrunxing/anaconda2/lib/python2.7/site-packages/ipykernel_launcher.py:4: FutureWarning: using a dict on a Series for aggregation\n", 1341 | "is deprecated and will be removed in a future version\n", 1342 | " after removing the cwd from sys.path.\n" 1343 | ] 1344 | }, 1345 | { 1346 | "name": "stdout", 1347 | "output_type": "stream", 1348 | "text": [ 1349 | "nunique\n" 1350 | ] 1351 | } 1352 | ], 1353 | "source": [ 1354 | "api_opt = ['count','nunique'] \n", 1355 | "for opt in api_opt:\n", 1356 | " print(opt)\n", 1357 | " tmp = train.groupby(['file_id'])['api_2'].agg({'fileid_api_2_' + opt: opt}).reset_index() \n", 1358 | " train_data_ = pd.merge(train_data_,tmp,how='left', on='file_id') " 1359 | ] 1360 | }, 1361 | { 1362 | "cell_type": "markdown", 1363 | "metadata": {}, 1364 | "source": [ 1365 | "## 局部特征\n", 1366 | "### File_id + tid (Api_2): count特征" 1367 | ] 1368 | }, 1369 | { 1370 | "cell_type": "code", 1371 | "execution_count": 36, 1372 | "metadata": { 1373 | "scrolled": true 1374 | }, 1375 | "outputs": [], 1376 | "source": [ 1377 | "api_value_counts = pd.DataFrame(api_count).reset_index()\n", 1378 
| "api_value_counts.columns = ['api_2','api_2_count']" 1379 | ] 1380 | }, 1381 | { 1382 | "cell_type": "code", 1383 | "execution_count": 37, 1384 | "metadata": {}, 1385 | "outputs": [], 1386 | "source": [ 1387 | "train = pd.merge(train, api_value_counts, on ='api_2' , how='left')" 1388 | ] 1389 | }, 1390 | { 1391 | "cell_type": "code", 1392 | "execution_count": 38, 1393 | "metadata": {}, 1394 | "outputs": [ 1395 | { 1396 | "name": "stdout", 1397 | "output_type": "stream", 1398 | "text": [ 1399 | "count\n" 1400 | ] 1401 | }, 1402 | { 1403 | "name": "stderr", 1404 | "output_type": "stream", 1405 | "text": [ 1406 | "/mnt/disk0/home/zhongrunxing/anaconda2/lib/python2.7/site-packages/ipykernel_launcher.py:4: FutureWarning: using a dict on a Series for aggregation\n", 1407 | "is deprecated and will be removed in a future version\n", 1408 | " after removing the cwd from sys.path.\n" 1409 | ] 1410 | } 1411 | ], 1412 | "source": [ 1413 | "api_opts = ['count']\n", 1414 | "groupby_features = []\n", 1415 | "train_data_,groupby_features = groupby_pivot_features(train_data_, train.loc[train.api_2_count>=20], groupby_features, col1 = 'api_2', col2 = 'tid', opts = api_opts)" 1416 | ] 1417 | }, 1418 | { 1419 | "cell_type": "markdown", 1420 | "metadata": {}, 1421 | "source": [ 1422 | "### 线下验证( 0.0330886)" 1423 | ] 1424 | }, 1425 | { 1426 | "cell_type": "markdown", 1427 | "metadata": {}, 1428 | "source": [ 1429 | "### File_id + index (Api_2): max,min特征" 1430 | ] 1431 | }, 1432 | { 1433 | "cell_type": "code", 1434 | "execution_count": null, 1435 | "metadata": {}, 1436 | "outputs": [], 1437 | "source": [] 1438 | }, 1439 | { 1440 | "cell_type": "code", 1441 | "execution_count": null, 1442 | "metadata": {}, 1443 | "outputs": [], 1444 | "source": [] 1445 | }, 1446 | { 1447 | "cell_type": "code", 1448 | "execution_count": null, 1449 | "metadata": {}, 1450 | "outputs": [], 1451 | "source": [] 1452 | }, 1453 | { 1454 | "cell_type": "code", 1455 | "execution_count": 39, 1456 | "metadata": { 1457 | "scrolled": true 1458 | }, 1459 | "outputs": [], 1460 | "source": [ 1461 | "# train_features = [col for col in train_data_.columns if col!='label' and col!='file_id' and 'std' not in col and 'quantile' not in col]\n", 1462 | "# train_label = 'label'\n", 1463 | "\n", 1464 | "# train_ind = train_X.index\n", 1465 | "# test_ind = test_X.index\n", 1466 | "\n", 1467 | "# dtrain = lgb.Dataset(train_data_.loc[train_ind,train_features],train_data_.loc[train_ind,train_label].values) \n", 1468 | "# dval = lgb.Dataset(train_data_.loc[test_ind,train_features],train_data_.loc[test_ind,train_label].values, reference = dtrain) \n", 1469 | "\n", 1470 | "# params = {\n", 1471 | "# 'task':'train', \n", 1472 | "# 'num_leaves': 255,\n", 1473 | "# 'objective': 'multiclass',\n", 1474 | "# 'num_class':8,\n", 1475 | "# 'min_data_in_leaf': 10,\n", 1476 | "# #'min_data_in_leaf': 1,\n", 1477 | "# 'learning_rate': 0.05,\n", 1478 | "# 'feature_fraction': 0.85,\n", 1479 | "# 'bagging_fraction': 0.9,\n", 1480 | "# 'bagging_freq': 5, \n", 1481 | "# 'max_bin':128,\n", 1482 | "# 'num_threads': 64,\n", 1483 | "# 'random_state':100\n", 1484 | "# } \n", 1485 | "# lgb_model_3_order = lgb.train(params, dtrain, num_boost_round=500,valid_sets=[dtrain,dval], early_stopping_rounds=50, feval=lgb_logloss) " 1486 | ] 1487 | }, 1488 | { 1489 | "cell_type": "code", 1490 | "execution_count": 40, 1491 | "metadata": {}, 1492 | "outputs": [], 1493 | "source": [ 1494 | "# fea_imp = pd.DataFrame({'feature':train_features, 
'imp':lgb_model_3_order.feature_importance()}).sort_values('imp')\n", 1495 | "# important_features = fea_imp.loc[fea_imp.imp >=1, 'feature'].values\n", 1496 | "# important_features = list(important_features)\n", 1497 | "\n", 1498 | "# important_features.append('file_id')\n", 1499 | "# important_features.append('label')\n", 1500 | "\n", 1501 | "# train_data_[important_features].to_csv('../feature_final/train_data_2gram.csv',index = None)\n", 1502 | " " 1503 | ] 1504 | }, 1505 | { 1506 | "cell_type": "code", 1507 | "execution_count": 41, 1508 | "metadata": {}, 1509 | "outputs": [], 1510 | "source": [ 1511 | "train_data_.to_csv('input/test_data_2gram.csv',index = None)" 1512 | ] 1513 | }, 1514 | { 1515 | "cell_type": "code", 1516 | "execution_count": null, 1517 | "metadata": {}, 1518 | "outputs": [], 1519 | "source": [ 1520 | "train.shape" 1521 | ] 1522 | }, 1523 | { 1524 | "cell_type": "markdown", 1525 | "metadata": {}, 1526 | "source": [ 1527 | "# 附录\n", 1528 | "tf-idf的1Gram特征可以替换api的次数特征等,加入tf-idf有提升,提升较小" 1529 | ] 1530 | } 1531 | ], 1532 | "metadata": { 1533 | "kernelspec": { 1534 | "display_name": "Python 3", 1535 | "language": "python", 1536 | "name": "python3" 1537 | }, 1538 | "language_info": { 1539 | "codemirror_mode": { 1540 | "name": "ipython", 1541 | "version": 3 1542 | }, 1543 | "file_extension": ".py", 1544 | "mimetype": "text/x-python", 1545 | "name": "python", 1546 | "nbconvert_exporter": "python", 1547 | "pygments_lexer": "ipython3", 1548 | "version": "3.6.5" 1549 | }, 1550 | "toc": { 1551 | "nav_menu": {}, 1552 | "number_sections": true, 1553 | "sideBar": true, 1554 | "skip_h1_title": false, 1555 | "title_cell": "Table of Contents", 1556 | "title_sidebar": "Contents", 1557 | "toc_cell": false, 1558 | "toc_position": { 1559 | "height": "calc(100% - 180px)", 1560 | "left": "10px", 1561 | "top": "150px", 1562 | "width": "384px" 1563 | }, 1564 | "toc_section_display": true, 1565 | "toc_window_display": true 1566 | } 1567 | }, 1568 | "nbformat": 4, 1569 | "nbformat_minor": 2 1570 | } 1571 | -------------------------------------------------------------------------------- /pickle_pre.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 4, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "\n", 11 | "meta_train = pd.read_pickle('../meta/train_meta_dilated_cnn.pkl')\n", 12 | "meta_test = pd.read_pickle('../meta/test_meta_dilated_cnn.pkl')\n", 13 | "\n", 14 | "import pickle\n", 15 | "\n", 16 | "f=open('../meta/train_meta_dilated_cnn_a.pkl','wb') \n", 17 | "pickle.dump(meta_train,f,0) \n", 18 | "f.close()\n", 19 | "\n", 20 | "f=open('../meta/test_meta_dilated_cnn_a.pkl','wb') \n", 21 | "pickle.dump(meta_test,f,0) \n", 22 | "f.close()" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "meta_train = pd.read_pickle('../meta/train_meta_cnn.pkl')\n", 32 | "meta_test = pd.read_pickle('../meta/test_meta_cnn.pkl')\n", 33 | "\n", 34 | "f=open('../meta/train_meta_cnn_a.pkl','wb') \n", 35 | "pickle.dump(meta_train,f,0) \n", 36 | "f.close()\n", 37 | "\n", 38 | "f=open('../meta/test_meta_cnn_a.pkl','wb') \n", 39 | "pickle.dump(meta_test,f,0) \n", 40 | "f.close()" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 2, 46 | "metadata": {}, 47 | "outputs": [ 48 | { 49 | "data": { 50 | "text/plain": [ 51 | "'/Users/didi/天池/安全赛复赛/temp'" 52 | ] 53 | 
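pickle_pre.ipynb, which begins above, is pure glue between Python versions: the CNN meta features were pickled by a Python 3 process with a protocol that Python 2 cannot read, so they are re-serialized with protocol 0, the original ASCII protocol, for the Python 2 notebooks downstream (main_test, submit). A minimal sketch of that round-trip:

```python
import pandas as pd
import pickle

# Re-write a Python 3 pickle with protocol 0 so Python 2 can load it.
meta_train = pd.read_pickle('../meta/train_meta_dilated_cnn.pkl')
with open('../meta/train_meta_dilated_cnn_a.pkl', 'wb') as f:
    pickle.dump(meta_train, f, protocol=0)
```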
}, 54 | "execution_count": 2, 55 | "metadata": {}, 56 | "output_type": "execute_result" 57 | } 58 | ], 59 | "source": [ 60 | "%pwd" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [] 69 | } 70 | ], 71 | "metadata": { 72 | "kernelspec": { 73 | "display_name": "Python 3", 74 | "language": "python", 75 | "name": "python3" 76 | }, 77 | "language_info": { 78 | "codemirror_mode": { 79 | "name": "ipython", 80 | "version": 3 81 | }, 82 | "file_extension": ".py", 83 | "mimetype": "text/x-python", 84 | "name": "python", 85 | "nbconvert_exporter": "python", 86 | "pygments_lexer": "ipython3", 87 | "version": "3.6.5" 88 | } 89 | }, 90 | "nbformat": 4, 91 | "nbformat_minor": 2 92 | } 93 | -------------------------------------------------------------------------------- /submit.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 26, 6 | "metadata": { 7 | "scrolled": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "# coding: utf-8\n", 12 | "\n", 13 | "# In[1]:\n", 14 | "\n", 15 | "\n", 16 | "import pandas as pd\n", 17 | "import numpy as np\n", 18 | "import lightgbm as lgb\n", 19 | "from sklearn.cross_validation import train_test_split\n", 20 | "import gc\n", 21 | "from sklearn.preprocessing import OneHotEncoder\n", 22 | "import datetime\n", 23 | "from sklearn.cross_validation import StratifiedKFold\n", 24 | "\n", 25 | "# In[2]:" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 27, 31 | "metadata": {}, 32 | "outputs": [ 33 | { 34 | "name": "stdout", 35 | "output_type": "stream", 36 | "text": [ 37 | "cur time = 2018/09/21 20:10:16\n", 38 | "(13887, 3251) (12955, 3251)\n", 39 | "cur time = 2018/09/21 20:10:16\n" 40 | ] 41 | } 42 | ], 43 | "source": [ 44 | "print('cur time = ' + str(datetime.datetime.now().strftime(\"%Y/%m/%d %H:%M:%S\")))\n", 45 | "train = np.load('../X_train.npy')\n", 46 | "test = np.load('../X_test.npy')\n", 47 | "train_labels = np.load('../labels.npy')\n", 48 | "\n", 49 | "print train.shape,test.shape\n", 50 | "print('cur time = ' + str(datetime.datetime.now().strftime(\"%Y/%m/%d %H:%M:%S\")))" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 28, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "train_cnn_1 = pd.read_pickle('../train_meta_cnn_a.pkl')\n", 60 | "test_cnn_1 = pd.read_pickle('../test_meta_cnn_a.pkl')" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 29, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "train_cnn_2 = pd.read_pickle('../train_meta_dilated_cnn_a.pkl')\n", 70 | "test_cnn_2 = pd.read_pickle('../test_meta_dilated_cnn_a.pkl')" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 30, 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "train_lgb_1 = pd.read_pickle('../train_meta_lgb_1.pkl')\n", 80 | "test_lgb_1 = pd.read_pickle('../test_meta_lgb_1.pkl')\n", 81 | "\n", 82 | "train_lgb_2 = pd.read_pickle('../train_meta_lgb_2.pkl')\n", 83 | "test_lgb_2 = pd.read_pickle('../test_meta_lgb_2.pkl')\n", 84 | "\n", 85 | "train_lgb_3 = pd.read_pickle('../train_meta_lgb_3.pkl')\n", 86 | "test_lgb_3 = pd.read_pickle('../test_meta_lgb_3.pkl')\n", 87 | "\n", 88 | "train_lgb_4 = pd.read_pickle('../train_meta_lgb_4.pkl')\n", 89 | "test_lgb_4 = pd.read_pickle('../test_meta_lgb_4.pkl')" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 33, 
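submit.ipynb builds the level-2 input by concatenating the raw 3251-column matrices with all six level-1 meta-feature blocks: two CNN variants and four LightGBM runs, each contributing 8 class probabilities, for 3251 + 6 × 8 = 3299 columns. A loop sketch of the paired loading (file names exactly as read above):

```python
import numpy as np
import pandas as pd

names = ['cnn_a', 'dilated_cnn_a', 'lgb_1', 'lgb_2', 'lgb_3', 'lgb_4']
train_metas = [pd.read_pickle('../train_meta_%s.pkl' % n) for n in names]
test_metas = [pd.read_pickle('../test_meta_%s.pkl' % n) for n in names]

train = np.hstack([train] + train_metas)   # (13887, 3299)
test = np.hstack([test] + test_metas)      # (12955, 3299)
```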
95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "train = np.hstack([train,train_cnn_1, train_cnn_2, train_lgb_1, train_lgb_2, train_lgb_3, train_lgb_4])\n", 99 | "test = np.hstack([test,test_cnn_1, test_cnn_2, test_lgb_1, test_lgb_2, test_lgb_3, test_lgb_4])" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 36, 105 | "metadata": {}, 106 | "outputs": [ 107 | { 108 | "name": "stdout", 109 | "output_type": "stream", 110 | "text": [ 111 | "Times: 0\n", 112 | "cur time = 2018/09/21 20:12:20\n", 113 | "FOLD: 0\n", 114 | "2780 11107\n", 115 | "Training until validation scores don't improve for 100 rounds.\n", 116 | "[100]\ttraining's multi_logloss: 0.70958\tvalid_1's multi_logloss: 0.755308\n", 117 | "[200]\ttraining's multi_logloss: 0.354997\tvalid_1's multi_logloss: 0.434251\n", 118 | "[300]\ttraining's multi_logloss: 0.217096\tvalid_1's multi_logloss: 0.326171\n", 119 | "[400]\ttraining's multi_logloss: 0.152399\tvalid_1's multi_logloss: 0.289922\n", 120 | "[500]\ttraining's multi_logloss: 0.117076\tvalid_1's multi_logloss: 0.278519\n", 121 | "[600]\ttraining's multi_logloss: 0.094704\tvalid_1's multi_logloss: 0.277246\n", 122 | "Early stopping, best iteration is:\n", 123 | "[598]\ttraining's multi_logloss: 0.0950629\tvalid_1's multi_logloss: 0.277223\n", 124 | "cur time = 2018/09/21 20:13:46\n", 125 | "FOLD: 1\n", 126 | "2779 11108\n", 127 | "Training until validation scores don't improve for 100 rounds.\n", 128 | "[100]\ttraining's multi_logloss: 0.714406\tvalid_1's multi_logloss: 0.746063\n", 129 | "[200]\ttraining's multi_logloss: 0.360927\tvalid_1's multi_logloss: 0.41752\n", 130 | "[300]\ttraining's multi_logloss: 0.223406\tvalid_1's multi_logloss: 0.305014\n", 131 | "[400]\ttraining's multi_logloss: 0.159355\tvalid_1's multi_logloss: 0.265401\n", 132 | "[500]\ttraining's multi_logloss: 0.123548\tvalid_1's multi_logloss: 0.251562\n", 133 | "[600]\ttraining's multi_logloss: 0.100453\tvalid_1's multi_logloss: 0.247619\n", 134 | "[700]\ttraining's multi_logloss: 0.0840086\tvalid_1's multi_logloss: 0.247471\n", 135 | "Early stopping, best iteration is:\n", 136 | "[645]\ttraining's multi_logloss: 0.0925202\tvalid_1's multi_logloss: 0.246913\n", 137 | "cur time = 2018/09/21 20:15:23\n", 138 | "FOLD: 2\n", 139 | "2777 11110\n", 140 | "Training until validation scores don't improve for 100 rounds.\n", 141 | "[100]\ttraining's multi_logloss: 0.710447\tvalid_1's multi_logloss: 0.758826\n", 142 | "[200]\ttraining's multi_logloss: 0.354958\tvalid_1's multi_logloss: 0.436029\n", 143 | "[300]\ttraining's multi_logloss: 0.216709\tvalid_1's multi_logloss: 0.326181\n", 144 | "[400]\ttraining's multi_logloss: 0.15243\tvalid_1's multi_logloss: 0.287969\n", 145 | "[500]\ttraining's multi_logloss: 0.117201\tvalid_1's multi_logloss: 0.275582\n", 146 | "[600]\ttraining's multi_logloss: 0.0948654\tvalid_1's multi_logloss: 0.273565\n", 147 | "Early stopping, best iteration is:\n", 148 | "[578]\ttraining's multi_logloss: 0.0990779\tvalid_1's multi_logloss: 0.273456\n", 149 | "cur time = 2018/09/21 20:16:47\n", 150 | "FOLD: 3\n", 151 | "2776 11111\n", 152 | "Training until validation scores don't improve for 100 rounds.\n", 153 | "[100]\ttraining's multi_logloss: 0.710814\tvalid_1's multi_logloss: 0.757495\n", 154 | "[200]\ttraining's multi_logloss: 0.356598\tvalid_1's multi_logloss: 0.432203\n", 155 | "[300]\ttraining's multi_logloss: 0.219223\tvalid_1's multi_logloss: 0.319802\n", 156 | "[400]\ttraining's multi_logloss: 0.154809\tvalid_1's multi_logloss: 0.280013\n", 
157 | "[500]\ttraining's multi_logloss: 0.118818\tvalid_1's multi_logloss: 0.2661\n", 158 | "[600]\ttraining's multi_logloss: 0.0962369\tvalid_1's multi_logloss: 0.262496\n", 159 | "[700]\ttraining's multi_logloss: 0.0801419\tvalid_1's multi_logloss: 0.262299\n", 160 | "Early stopping, best iteration is:\n", 161 | "[660]\ttraining's multi_logloss: 0.0860689\tvalid_1's multi_logloss: 0.261957\n", 162 | "cur time = 2018/09/21 20:18:25\n", 163 | "FOLD: 4\n", 164 | "2775 11112\n", 165 | "Training until validation scores don't improve for 100 rounds.\n", 166 | "[100]\ttraining's multi_logloss: 0.711242\tvalid_1's multi_logloss: 0.757122\n", 167 | "[200]\ttraining's multi_logloss: 0.357074\tvalid_1's multi_logloss: 0.432454\n", 168 | "[300]\ttraining's multi_logloss: 0.219319\tvalid_1's multi_logloss: 0.319717\n", 169 | "[400]\ttraining's multi_logloss: 0.155336\tvalid_1's multi_logloss: 0.27957\n", 170 | "[500]\ttraining's multi_logloss: 0.12014\tvalid_1's multi_logloss: 0.264735\n", 171 | "[600]\ttraining's multi_logloss: 0.0976771\tvalid_1's multi_logloss: 0.260499\n", 172 | "[700]\ttraining's multi_logloss: 0.0816694\tvalid_1's multi_logloss: 0.260613\n", 173 | "Early stopping, best iteration is:\n", 174 | "[625]\ttraining's multi_logloss: 0.0932625\tvalid_1's multi_logloss: 0.260267\n", 175 | "cur time = 2018/09/21 20:20:08\n" 176 | ] 177 | } 178 | ], 179 | "source": [ 180 | "\n", 181 | "\n", 182 | "meta_test = np.zeros(shape = (len(test),8))\n", 183 | "\n", 184 | "for seed in range(1):\n", 185 | " print 'Times: ',seed\n", 186 | " print('cur time = ' + str(datetime.datetime.now().strftime(\"%Y/%m/%d %H:%M:%S\")))\n", 187 | " skf = StratifiedKFold(train_labels, n_folds=5, shuffle=True, random_state=seed)\n", 188 | " for i,(tr_ind,te_ind) in enumerate(skf):\n", 189 | " print 'FOLD: ',i\n", 190 | " print len(te_ind),len(tr_ind)\n", 191 | " X_train,X_train_label = train[tr_ind],train_labels[tr_ind]\n", 192 | " X_val,X_val_label = train[te_ind],train_labels[te_ind]\n", 193 | " dtrain = lgb.Dataset(X_train,X_train_label) \n", 194 | " dval = lgb.Dataset(X_val,X_val_label, reference = dtrain) \n", 195 | " params = {\n", 196 | " 'task':'train', \n", 197 | " 'boosting_type':'gbdt',\n", 198 | " 'num_leaves': 15,\n", 199 | " 'objective': 'multiclass',\n", 200 | " 'num_class':8,\n", 201 | " 'learning_rate': 0.01,\n", 202 | " 'feature_fraction': 0.85,\n", 203 | " 'subsample':0.85,\n", 204 | " 'num_threads': 54,\n", 205 | " 'metric':'multi_logloss',\n", 206 | " 'seed':seed\n", 207 | " } \n", 208 | " model = lgb.train(params, dtrain, num_boost_round=100000,valid_sets=[dtrain,dval],verbose_eval=100, early_stopping_rounds=100) \n", 209 | " pred_test = model.predict(test)\n", 210 | "\n", 211 | " #meta_train[te_ind] = pred_val\n", 212 | " meta_test += pred_test\n", 213 | " print('cur time = ' + str(datetime.datetime.now().strftime(\"%Y/%m/%d %H:%M:%S\")))\n", 214 | "\n", 215 | "meta_test/=5.0\n", 216 | "res = pd.DataFrame(meta_test,columns=['prob0','prob1','prob2','prob3','prob4','prob5','prob6','prob7'])\n", 217 | "res.index.name='file_id'\n", 218 | "res.round(7).to_csv('submit.csv', index = True, header=True)\n", 219 | " \n", 220 | " " 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": 74, 226 | "metadata": {}, 227 | "outputs": [], 228 | "source": [ 229 | "res.shape\n", 230 | "res.index = range(1,res.shape[0]+1)\n", 231 | "res.index.name = 'file_id'" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": 77, 237 | "metadata": {}, 238 | "outputs": [], 239 | 
"source": [ 240 | "en =res.copy()" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": 79, 246 | "metadata": {}, 247 | "outputs": [ 248 | { 249 | "data": { 250 | "text/plain": [ 251 | "1.0000000000000004" 252 | ] 253 | }, 254 | "execution_count": 79, 255 | "metadata": {}, 256 | "output_type": "execute_result" 257 | } 258 | ], 259 | "source": [ 260 | "en.sum(axis=1).max()" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": 81, 266 | "metadata": {}, 267 | "outputs": [], 268 | "source": [ 269 | "en.to_csv('../fuucccccccck.csv',index=True,header=True)" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": 83, 275 | "metadata": {}, 276 | "outputs": [ 277 | { 278 | "data": { 279 | "text/html": [ 280 | "
\n", 281 | "\n", 294 | "\n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | "
prob0prob1prob2prob3prob4prob5prob6prob7
file_id
10.0020350.0021270.9497510.0095020.0018050.0024040.0055500.026825
20.9311290.0021370.0032890.0039130.0020600.0092540.0101010.038117
30.9960000.0004530.0005970.0006300.0004290.0005750.0005600.000755
40.0136270.0080150.0186250.0988060.0540510.0922540.1809030.533720
50.9938330.0005780.0010650.0008520.0006080.0007760.0007790.001510
\n", 377 | "
" 378 | ], 379 | "text/plain": [ 380 | " prob0 prob1 prob2 prob3 prob4 prob5 prob6 \\\n", 381 | "file_id \n", 382 | "1 0.002035 0.002127 0.949751 0.009502 0.001805 0.002404 0.005550 \n", 383 | "2 0.931129 0.002137 0.003289 0.003913 0.002060 0.009254 0.010101 \n", 384 | "3 0.996000 0.000453 0.000597 0.000630 0.000429 0.000575 0.000560 \n", 385 | "4 0.013627 0.008015 0.018625 0.098806 0.054051 0.092254 0.180903 \n", 386 | "5 0.993833 0.000578 0.001065 0.000852 0.000608 0.000776 0.000779 \n", 387 | "\n", 388 | " prob7 \n", 389 | "file_id \n", 390 | "1 0.026825 \n", 391 | "2 0.038117 \n", 392 | "3 0.000755 \n", 393 | "4 0.533720 \n", 394 | "5 0.001510 " 395 | ] 396 | }, 397 | "execution_count": 83, 398 | "metadata": {}, 399 | "output_type": "execute_result" 400 | } 401 | ], 402 | "source": [ 403 | "en.head()" 404 | ] 405 | }, 406 | { 407 | "cell_type": "code", 408 | "execution_count": null, 409 | "metadata": {}, 410 | "outputs": [], 411 | "source": [] 412 | } 413 | ], 414 | "metadata": { 415 | "kernelspec": { 416 | "display_name": "Python 3", 417 | "language": "python", 418 | "name": "python3" 419 | }, 420 | "language_info": { 421 | "codemirror_mode": { 422 | "name": "ipython", 423 | "version": 3 424 | }, 425 | "file_extension": ".py", 426 | "mimetype": "text/x-python", 427 | "name": "python", 428 | "nbconvert_exporter": "python", 429 | "pygments_lexer": "ipython3", 430 | "version": "3.6.5" 431 | } 432 | }, 433 | "nbformat": 4, 434 | "nbformat_minor": 2 435 | } 436 | -------------------------------------------------------------------------------- /上地西二旗人民.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/enjoysport2022/Alibaba-3rd-Security-Algorithm-Challenge/18a43c25d62e914edb19bdcae11b209813cb8439/上地西二旗人民.pptx --------------------------------------------------------------------------------