├── DIGIX算法精英大赛.pptx ├── README.md ├── .gitignore └── Travis ├── HUAWEI_FurtureLab_MLP.ipynb ├── HUAWEI_FutureLab_result_ensemble.ipynb └── HUAWEI_FutureLab_model.ipynb /DIGIX算法精英大赛.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Travisgogogo/HUAWEI-DIGIX-Age-prediction/HEAD/DIGIX算法精英大赛.pptx -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # HUAWEI-DIGIX-用户人口属性预测赛道 2 | Rank 6th 3 | Link: https://developer.huawei.com/consumer/cn/activity/devStarAI/algo/competition.html#/preliminary/info/digix-trail-02/introduction 4 | 5 | # 赛题说明 6 | 年龄是用户人口属性的重要维度,本次比赛任务为根据用户的手机使用行为习惯来预估用户所处的年龄段。每个用户(以唯一ID标识)对应唯一的年龄段。年龄段有6种划分,分别代表不同年龄段,分别为:小于等于18岁, 19-23岁, 24-34岁, 35-44岁, 45-54岁,大于等于55岁。参赛同学根据华为提供数据构建预测模型进行年龄段预估,在测试数据集上给出预估结果。 7 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /Travis/HUAWEI_FurtureLab_MLP.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "Initialize libraries\n" 13 | ] 14 | }, 15 | { 16 | "name": "stderr", 17 | "output_type": "stream", 18 | "text": [ 19 | "Using TensorFlow backend.\n" 20 | ] 21 | } 22 | ], 23 | "source": [ 24 | "print(\"Initialize libraries\")\n", 25 | "import numpy as np\n", 26 | "import pandas as pd\n", 27 | "from keras.models import Sequential\n", 28 | "from keras.layers import Dense, Dropout, Activation,normalization\n", 29 | "from keras.wrappers.scikit_learn import 
KerasClassifier\n", 30 | "from keras.utils import np_utils\n", 31 | "from keras.optimizers import SGD\n", 32 | "from keras.layers.advanced_activations import PReLU\n", 33 | "\n", 34 | "from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer\n", 35 | "from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler, MaxAbsScaler, RobustScaler\n", 36 | "from sklearn.model_selection import train_test_split\n", 37 | "from sklearn.metrics import accuracy_score\n", 38 | "from keras.callbacks import ModelCheckpoint\n", 39 | "from keras.models import Model\n", 40 | "from keras.layers import Input,concatenate,Dense\n", 41 | "import keras.backend as K\n", 42 | "from keras import activations\n", 43 | "from keras.engine.topology import Layer, InputSpec\n", 44 | "import gc\n", 45 | "import os\n", 46 | "import warnings\n", 47 | "warnings.filterwarnings(\"ignore\")\n", 48 | "import keras.backend.tensorflow_backend as KTF\n", 49 | "from scipy.sparse import csr_matrix, hstack, coo_matrix\n", 50 | "import tensorflow as tf\n", 51 | "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\"\n", 52 | "config = tf.ConfigProto()\n", 53 | "config.gpu_options.allow_growth=True\n", 54 | "sess = tf.Session(config=config)\n", 55 | "\n", 56 | "KTF.set_session(sess)" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "# Basic Features" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 3, 69 | "metadata": { 70 | "scrolled": true 71 | }, 72 | "outputs": [ 73 | { 74 | "name": "stdout", 75 | "output_type": "stream", 76 | "text": [ 77 | "CPU times: user 502 ms, sys: 143 ms, total: 645 ms\n", 78 | "Wall time: 643 ms\n" 79 | ] 80 | } 81 | ], 82 | "source": [ 83 | "%%time\n", 84 | "age_test = pd.read_csv('../age_test.csv', header = None, names = ['uId'])\n", 85 | "age_train = pd.read_csv('../age_train.csv', header = None, names = ['uId','age_group'])\n", 86 | "data = pd.concat([age_train,age_test], axis = 
# Stack labelled train ids on top of unlabelled test ids. Test rows carry
# NaN in `age_group`, which is how they are told apart later on.
data = pd.concat([age_train, age_test], axis=0, sort=True).reset_index(drop=True)
del age_test, age_train
gc.collect()

# Attach the per-user device profile and behaviour counters (inner join on uId).
user_basic_info = pd.read_csv(
    '../user_basic_info.csv', header=None,
    names=['uId', 'gender', 'city', 'prodName', 'ramCapacity', 'ramLeftRation',
           'romCapacity', 'romLeftRation', 'color', 'fontSize', 'ct', 'carrier', 'os'])
user_behavior_info = pd.read_csv(
    '../user_behavior_info.csv', header=None,
    names=['uId', 'bootTimes', 'AFuncTimes', 'BFuncTimes', 'CFuncTimes',
           'DFuncTimes', 'EFuncTimes', 'FFuncTimes', 'FFuncSum'])
data = data.merge(user_basic_info, on='uId').merge(user_behavior_info, on='uId')
del user_basic_info, user_behavior_info
gc.collect()

# The function-usage counters are made non-negative and rounded to whole numbers.
for i in ['A', 'B', 'C', 'D', 'E', 'F']:
    col = '{}FuncTimes'.format(i)
    data[col] = data[col].abs().round()

# Absolute free capacity is derived from the raw ratios *before* they are
# clipped below (same order as the original cell).
data['ramLeftCapacity'] = data['ramCapacity'] * data['ramLeftRation']
data['romLeftCapacity'] = data['romCapacity'] * data['romLeftRation']

# Ratios above 1 are capped at 1. `.loc` writes into `data` itself; the
# previous chained form `data[col][mask] = 1` may assign to a temporary copy
# (SettingWithCopyWarning, and a silent no-op under pandas copy-on-write).
data.loc[data['romLeftRation'] > 1, 'romLeftRation'] = 1
data.loc[data['ramLeftRation'] > 1, 'ramLeftRation'] = 1
# --- Cell A: activated-app list + top-5000 apps by usage ---------------------
# appIds with the top-5000 usage in the usage table.
# NOTE(review): this cell reads from './feature' and './data' while the rest
# of the notebook reads from '../' — confirm which working directory is meant.
app_usage5000 = pd.read_hdf('./feature/app_usage5000.h5', key='data')
user_app_actived = pd.read_csv('./data/user_app_actived.csv', header=None,
                               names=['uId', 'appId'])
app_usage5000.columns = ['uId', 'usage_appId']
data = (data.merge(user_app_actived, on='uId', how='left')
            .merge(app_usage5000, on='uId', how='left'))
# Binary bag-of-apps: every token matching a<digits> becomes one 0/1 column.
# Raw strings keep the same pattern without the invalid-escape warning.
X_app = CountVectorizer(token_pattern=r'a\d+', binary=True).fit_transform(data['appId'])
X_usage = CountVectorizer(token_pattern=r'a\d+', binary=True).fit_transform(
    data['usage_appId'].fillna('None'))
del user_app_actived
del app_usage5000
gc.collect()

# --- Cell B: top-5000 apps ranked by usage *times* ---------------------------
app_usage5000 = pd.read_hdf('../feature/app_usage5000_top_times.h5', key='data')
app_usage5000.columns = ['uId', 'usage_appId']
# Drop the column left behind by cell A first. Merging a second
# `usage_appId` would otherwise yield `usage_appId_x` / `usage_appId_y` and the
# lookup below would raise KeyError. `errors='ignore'` keeps this cell runnable
# on its own when cell A was skipped.
data = data.drop(columns=['usage_appId'], errors='ignore')
data = data.merge(app_usage5000, how='left', on='uId')
# This overwrites the X_usage built in cell A.
X_usage = CountVectorizer(token_pattern=r'a\d+', binary=True).fit_transform(
    data['usage_appId'].fillna('None'))
del app_usage5000
gc.collect()
# Column groups: categorical profile fields vs. numeric device/behaviour fields.
cate_feat = ['gender', 'city', 'prodName', 'color', 'fontSize', 'ct', 'carrier', 'os']
num_feat = ['ramLeftCapacity', 'romLeftCapacity', 'ramCapacity', 'ramLeftRation',
            'romCapacity', 'romLeftRation', 'bootTimes',
            'AFuncTimes', 'BFuncTimes', 'CFuncTimes', 'DFuncTimes', 'EFuncTimes',
            'FFuncTimes', 'FFuncSum']

# Numeric block: missing values become 0, then RobustScaler is applied.
numeric_frame = data[num_feat].fillna(0)
X_num = RobustScaler().fit_transform(numeric_frame)

# Categorical block: stringify (missing -> 'None'), integer-encode in place,
# then one-hot encode all categorical columns together.
for feat in cate_feat:
    as_text = data[feat].fillna('None').apply(str)
    data[feat] = LabelEncoder().fit_transform(as_text)
X_cate = OneHotEncoder().fit_transform(data[cate_feat].fillna(-1))

# Final sparse design matrix: usage bag-of-apps | one-hot categoricals | numerics.
# (The X_app-based variant of this matrix is disabled in this notebook.)
X2 = hstack((X_usage, X_cate, X_num), format='csr')
del X_cate, X_num
gc.collect()

# Rows without a label are the test set; the rest are training rows.
missing_label = np.isnan(data.age_group)
test_index = list(data[missing_label].index)
train_index = list(data[~missing_label].index)
train_x_usage = X2[train_index]
test_x_usage = X2[test_index]
print('All features: train shape {}, test shape {}'.format(train_x_usage.shape, test_x_usage.shape))

# Encode the age groups as 0..nclasses-1 and one-hot them for the softmax output.
labelled_ages = data[~missing_label].age_group
targetencoder = LabelEncoder().fit(labelled_ages)
y = targetencoder.transform(labelled_ages)
nclasses = len(targetencoder.classes_)
dummy_y = np_utils.to_categorical(y)

# Hold out 2% of the labelled rows for validation (fixed seed).
X_train_usage, X_val_usage, y_train_usage, y_val_usage = train_test_split(
    train_x_usage, dummy_y, test_size=0.02, random_state=42)
def batch_generator(X, y, batch_size, shuffle):
    """Yield dense ``(X_batch, y_batch)`` mini-batches from a sparse matrix, forever.

    Adapted from chenglong's sparse-matrix generator
    (https://www.kaggle.com/c/talkingdata-mobile-user-demographics/forums/t/22567/neural-network-for-sparse-matrices).

    Args:
        X: row-indexable sparse matrix (anything supporting ``X[idx, :].toarray()``).
        y: array of targets aligned with the rows of ``X``.
        batch_size: number of rows per batch; the last batch of a pass may be smaller.
        shuffle: if true, the row order is reshuffled before every pass.

    Yields:
        Tuples ``(X_batch, y_batch)`` where ``X_batch`` is a dense ndarray.
    """
    n_samples = X.shape[0]
    number_of_batches = int(np.ceil(n_samples / batch_size))
    sample_index = np.arange(n_samples)
    if shuffle:
        np.random.shuffle(sample_index)
    counter = 0
    while True:
        batch_index = sample_index[batch_size * counter:batch_size * (counter + 1)]
        # Only the current slice is densified so the full matrix never has to
        # fit in memory as a dense array.
        yield X[batch_index, :].toarray(), y[batch_index]
        counter += 1
        if counter == number_of_batches:
            # One full pass done: start over (with a new order if requested).
            if shuffle:
                np.random.shuffle(sample_index)
            counter = 0


def batch_generatorp(X, batch_size, shuffle):
    """Yield dense feature batches (no targets) in row order, forever.

    Args:
        X: row-indexable sparse matrix (anything supporting ``X[idx, :].toarray()``).
        batch_size: number of rows per batch; the last batch of a pass may be smaller.
        shuffle: accepted for signature compatibility with ``batch_generator``;
            it is not used, rows are always yielded in their original order.

    Yields:
        Dense ndarrays of consecutive rows of ``X``.
    """
    n_samples = X.shape[0]
    # Fix: this used to be `n_samples / ceil(n_samples / batch_size)`, which is
    # not the number of batches. The counter was then reset at the wrong step
    # (or never), and the generator produced empty batches past the last row.
    number_of_batches = int(np.ceil(n_samples / batch_size))
    sample_index = np.arange(n_samples)
    counter = 0
    while True:
        batch_index = sample_index[batch_size * counter:batch_size * (counter + 1)]
        yield X[batch_index, :].toarray()
        counter += 1
        if counter == number_of_batches:
            counter = 0
class FM(Layer):
    """Dense layer plus a factorization-machine style interaction term.

    For an input ``x`` the layer computes ``activation(x.W + b + c)`` where
    ``c`` is one scalar per sample, broadcast over all ``output_dim`` units:
    ``c = 0.5 * mean((x.V)**2 - x.(V**2))`` taken over the ``output_dim`` axis.

    NOTE(review): the textbook FM term uses ``(x**2).(V**2)``; the form here
    matches it only when the inputs are 0/1 — confirm this is intended for
    the scaled numeric columns.

    Args:
        output_dim: number of output units (also the width of ``V``).
        activation: name or callable resolved through ``keras.activations.get``.
    """

    def __init__(self, output_dim=30, activation="relu", **kwargs):
        self.output_dim = output_dim
        self.activate = activations.get(activation)
        super(FM, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create the linear weights ``W``/``b`` and the interaction kernel ``V``."""
        n_features = input_shape[1]
        self.weight = self.add_weight(name='weight',
                                      shape=(n_features, self.output_dim),
                                      initializer='glorot_uniform',
                                      trainable=True)
        self.bias = self.add_weight(name='bias',
                                    shape=(self.output_dim,),
                                    initializer='zeros',
                                    trainable=True)
        self.kernel = self.add_weight(name='kernel',
                                      shape=(n_features, self.output_dim),
                                      initializer='glorot_uniform',
                                      trainable=True)
        super(FM, self).build(input_shape)

    def call(self, x):
        """Return ``activation(linear part + broadcast interaction scalar)``."""
        feature = K.dot(x, self.weight) + self.bias
        a = K.pow(K.dot(x, self.kernel), 2)
        b = K.dot(x, K.pow(self.kernel, 2))
        # Collapse the interaction term to one value per sample ...
        cross = K.mean(a - b, 1, keepdims=True) * 0.5
        # ... and repeat it across the output units so it can be added to `feature`.
        cross = K.repeat_elements(K.reshape(cross, (-1, 1)), self.output_dim, axis=-1)
        return self.activate(feature + cross)

    def compute_output_shape(self, input_shape):
        return (input_shape[0], self.output_dim)

    def get_config(self):
        """Return the constructor arguments so saved models can rebuild the layer.

        Without this, a checkpoint containing ``FM`` loses ``output_dim`` and
        ``activation`` when reloaded (still requires
        ``custom_objects={'FM': FM}`` at load time).
        """
        config = super(FM, self).get_config()
        config.update({
            'output_dim': self.output_dim,
            'activation': activations.serialize(self.activate),
        })
        return config


def baseline_model_app(input_dim=None):
    """Build and compile the 32-16-8 MLP baseline for the activated-app features.

    Args:
        input_dim: number of input features. When omitted, falls back to the
            notebook global ``X_train_app.shape[1]`` (the original behaviour).
            Passing it explicitly lets the builder be used while the
            ``X_train_app`` split is commented out.

    Returns:
        A compiled Keras ``Model`` with a 6-way softmax output.
    """
    if input_dim is None:
        input_dim = X_train_app.shape[1]
    input1 = Input(shape=(input_dim,))

    dense_1 = Dense(32, kernel_initializer='normal', activation='relu')(input1)
    dense_2 = Dense(16, kernel_initializer='normal', activation='relu')(dense_1)
    dense_3 = Dense(8, kernel_initializer='normal', activation='relu')(dense_2)
    out = Dense(6, kernel_initializer='normal', activation='softmax')(dense_3)

    model = Model(inputs=input1, outputs=out)
    # categorical_crossentropy == multi-class log loss
    model.compile(loss='categorical_crossentropy', optimizer='adadelta', metrics=['accuracy'])
    return model
def baseline_model_usage(input_dim=None):
    """Build and compile the 32-16-8 MLP baseline for the app-usage features.

    Args:
        input_dim: number of input features. When omitted, falls back to the
            notebook global ``X_train_usage.shape[1]`` (the original behaviour).

    Returns:
        A compiled Keras ``Model`` with a 6-way softmax output.
    """
    if input_dim is None:
        input_dim = X_train_usage.shape[1]
    input1 = Input(shape=(input_dim,))

    dense_1 = Dense(32, kernel_initializer='normal', activation='relu')(input1)
    dense_2 = Dense(16, kernel_initializer='normal', activation='relu')(dense_1)
    dense_3 = Dense(8, kernel_initializer='normal', activation='relu')(dense_2)
    out = Dense(6, kernel_initializer='normal', activation='softmax')(dense_3)

    model = Model(inputs=input1, outputs=out)
    # categorical_crossentropy == multi-class log loss
    model.compile(loss='categorical_crossentropy', optimizer='adadelta', metrics=['accuracy'])
    return model


def DenseNet(input_dim=None):
    """Build and compile a densely connected MLP (each layer sees all earlier ones).

    Layer widths are 256 -> 128 -> 64 -> 7, and every layer after the second
    takes the concatenation of all previous hidden outputs as its input.

    Args:
        input_dim: number of input features. When omitted, falls back to the
            notebook global ``X_train.shape[1]`` (the original behaviour).
            ``X_train`` is not defined anywhere in the visible notebook, so
            callers should normally pass this explicitly.

    Returns:
        A compiled Keras ``Model`` with a 6-way softmax output.
    """
    if input_dim is None:
        input_dim = X_train.shape[1]
    input1 = Input(shape=(input_dim,))

    dense_1 = Dense(256, kernel_initializer='normal', activation='relu')(input1)
    dense_2 = Dense(128, kernel_initializer='normal', activation='relu')(dense_1)
    dense_2_x = concatenate([dense_1, dense_2])
    dense_3 = Dense(64, kernel_initializer='normal', activation='relu')(dense_2_x)
    dense_3_x = concatenate([dense_1, dense_2, dense_3])
    dense_4 = Dense(7, kernel_initializer='normal', activation='relu')(dense_3_x)
    dense_4_x = concatenate([dense_1, dense_2, dense_3, dense_4])
    out = Dense(6, kernel_initializer='normal', activation='softmax')(dense_4_x)

    model = Model(inputs=input1, outputs=out)
    # categorical_crossentropy == multi-class log loss
    model.compile(loss='categorical_crossentropy', optimizer='adadelta', metrics=['accuracy'])
    return model
def FMNet(input_dim=None):
    """Build and compile the two-branch DNN + FM classifier.

    Branch I is a five-layer tanh MLP (100-150-150-100-64); branch II stacks
    two ``FM`` layers (200 -> 64). Their outputs are concatenated and passed
    through a 32-unit layer to the 6-way softmax output.

    Args:
        input_dim: number of input features. When omitted, falls back to the
            notebook global ``X_train.shape[1]`` (the original behaviour).
            ``X_train`` is not defined anywhere in the visible notebook, so
            callers should normally pass this explicitly.

    Returns:
        A compiled Keras ``Model``.
    """
    if input_dim is None:
        input_dim = X_train.shape[1]
    input1 = Input(shape=(input_dim,))

    # Branch I: plain deep network.
    dense_1 = Dense(100, kernel_initializer='normal', activation='tanh')(input1)
    dense_2 = Dense(150, kernel_initializer='normal', activation='tanh')(dense_1)
    dense_3 = Dense(150, kernel_initializer='normal', activation='tanh')(dense_2)
    dense_4 = Dense(100, kernel_initializer='normal', activation='tanh')(dense_3)
    dense_5 = Dense(64, kernel_initializer='normal', activation='tanh')(dense_4)

    # Branch II: factorization-machine layers on the raw input.
    FM_1 = FM(200)(input1)
    FM_2 = FM(64)(FM_1)

    x = concatenate([dense_5, FM_2])
    # NOTE(review): softmax on a hidden layer squashes the 32 units into a
    # probability vector before the real output layer. This looks unintended
    # (relu/tanh would be usual) but is kept as-is to preserve trained results.
    x_tmp = Dense(32, kernel_initializer='normal', activation='softmax')(x)
    out = Dense(6, kernel_initializer='normal', activation='softmax')(x_tmp)

    model = Model(inputs=input1, outputs=out)
    # categorical_crossentropy == multi-class log loss
    model.compile(loss='categorical_crossentropy', optimizer='adadelta', metrics=['accuracy'])
    return model
"execution_count": 17, 446 | "metadata": {}, 447 | "outputs": [], 448 | "source": [ 449 | "model=baseline_model_usage()" 450 | ] 451 | }, 452 | { 453 | "cell_type": "code", 454 | "execution_count": null, 455 | "metadata": {}, 456 | "outputs": [ 457 | { 458 | "name": "stdout", 459 | "output_type": "stream", 460 | "text": [ 461 | "Epoch 1/20\n", 462 | "1924/1924 [==============================] - 207s 108ms/step - loss: 1.1642 - acc: 0.4993 - val_loss: 1.0818 - val_acc: 0.5401\n", 463 | "\n", 464 | "Epoch 00001: val_acc improved from -inf to 0.54015, saving model to ./weight_bias/best_epoch_model_app_usage_top_times.hdf5\n", 465 | "Epoch 2/20\n", 466 | "1924/1924 [==============================] - 207s 107ms/step - loss: 1.0578 - acc: 0.5495 - val_loss: 1.0543 - val_acc: 0.5529\n", 467 | "\n", 468 | "Epoch 00002: val_acc improved from 0.54015 to 0.55294, saving model to ./weight_bias/best_epoch_model_app_usage_top_times.hdf5\n", 469 | "Epoch 3/20\n", 470 | "1924/1924 [==============================] - 207s 107ms/step - loss: 1.0388 - acc: 0.5583 - val_loss: 1.0440 - val_acc: 0.5570\n", 471 | "\n", 472 | "Epoch 00003: val_acc improved from 0.55294 to 0.55697, saving model to ./weight_bias/best_epoch_model_app_usage_top_times.hdf5\n", 473 | "Epoch 4/20\n", 474 | "1924/1924 [==============================] - 207s 107ms/step - loss: 1.0285 - acc: 0.5628 - val_loss: 1.0403 - val_acc: 0.5601\n", 475 | "\n", 476 | "Epoch 00004: val_acc improved from 0.55697 to 0.56012, saving model to ./weight_bias/best_epoch_model_app_usage_top_times.hdf5\n", 477 | "Epoch 5/20\n", 478 | "1924/1924 [==============================] - 205s 107ms/step - loss: 1.0218 - acc: 0.5658 - val_loss: 1.0366 - val_acc: 0.5602\n", 479 | "\n", 480 | "Epoch 00005: val_acc improved from 0.56012 to 0.56017, saving model to ./weight_bias/best_epoch_model_app_usage_top_times.hdf5\n", 481 | "Epoch 6/20\n", 482 | "1924/1924 [==============================] - 207s 108ms/step - loss: 1.0166 - acc: 0.5679 - 
from keras.models import load_model


def _train_with_checkpoint(model, checkpoint_path, epochs=20, batch_size=1024):
    """Fit *model* on the usage features, saving only the best-val_acc epoch."""
    checkpoint = ModelCheckpoint(filepath=checkpoint_path, monitor='val_acc', mode='max',
                                 save_best_only=True, verbose=1, period=1)
    # Keras 2 names: `steps_per_epoch` counts batches and `epochs` replaces
    # `nb_epoch`. The old Keras 1 keywords (`samples_per_epoch`, `nb_epoch`)
    # only worked through the deprecation shim. `shuffle` is dropped: it has
    # no effect on a plain Python generator, which already reshuffles itself.
    steps = int(np.ceil(X_train_usage.shape[0] / batch_size))
    return model.fit_generator(
        generator=batch_generator(X_train_usage, y_train_usage, batch_size, True),
        steps_per_epoch=steps,
        epochs=epochs,
        validation_data=(X_val_usage.todense(), y_val_usage),
        verbose=1,
        callbacks=[checkpoint],
    )


def _export_dense_weights(model, feature_name, n_feature_rows, suffix=''):
    """Save kernel/bias of the first three Dense layers of *model* as .npy files.

    The first-layer kernel is cut to its first ``n_feature_rows`` rows, i.e.
    the rows that multiply the app/usage bag-of-words columns (those columns
    come first in the stacked feature matrix).
    """
    params = model.get_weights()  # [kernel1, bias1, kernel2, bias2, ...]
    for layer_no in (1, 2, 3):
        kernel = params[2 * (layer_no - 1)]
        bias = params[2 * (layer_no - 1) + 1]
        if layer_no == 1:
            kernel = kernel[:n_feature_rows, :]
        np.save('./weight_bias/{}_weight{}_dense{}.npy'.format(feature_name, suffix, layer_no), kernel)
        np.save('./weight_bias/{}_bias{}_dense{}.npy'.format(feature_name, suffix, layer_no), bias)


# Run 1: usage features ranked by usage times.
fit = _train_with_checkpoint(model, './weight_bias/best_epoch_model_app_usage_top_times.hdf5')

# Run 2: fresh model, second checkpoint file.
model = baseline_model_usage()
fit = _train_with_checkpoint(model, './weight_bias/best_epoch_model_usage.hdf5')

# Save X_app weights.
# NOTE(review): best_epoch_model_app.hdf5 is not produced by any cell visible
# in this notebook — presumably it comes from an earlier run; confirm it exists.
model = load_model('./weight_bias/best_epoch_model_app.hdf5')
_export_dense_weights(model, 'X_app', X_app.shape[1])

# Save X_usage weights (top-times variant).
model = load_model('./weight_bias/best_epoch_model_app_usage_top_times.hdf5')
_export_dense_weights(model, 'X_usage', X_usage.shape[1], suffix='_times')

# Save X_usage weights.
model = load_model('./weight_bias/best_epoch_model_usage.hdf5')
_export_dense_weights(model, 'X_usage', X_usage.shape[1])
# Predict the age group of every test user and write the submission file.
age_test = pd.read_csv('../age_test.csv', header=None, names=['uId'])
print("# Final prediction")
# Fix: `test_x` is not defined anywhere in this notebook. The test feature
# matrix built earlier is `test_x_usage`.
# NOTE(review): `model` is whichever model was loaded last by the export
# cells — confirm that is the one meant for the submission.
scores = model.predict(test_x_usage)
# argmax gives the encoded class index; map it back to the original label.
test_label = targetencoder.inverse_transform(np.argmax(scores, axis=1))
result = pd.DataFrame()
# NOTE(review): assumes the test rows of `data` are still in age_test order
# after the merges — verify before submitting.
result['id'] = age_test.uId
result['label'] = test_label.astype(int)
result.to_csv('./submission.csv', index=False)
print("Done")
| "cell_type": "code", 37 | "execution_count": 3, 38 | "metadata": { 39 | "scrolled": true 40 | }, 41 | "outputs": [ 42 | { 43 | "name": "stdout", 44 | "output_type": "stream", 45 | "text": [ 46 | "CPU times: user 358 ms, sys: 76.1 ms, total: 434 ms\n", 47 | "Wall time: 435 ms\n" 48 | ] 49 | } 50 | ], 51 | "source": [ 52 | "%%time\n", 53 | "test_index = np.isnan(data.age_group)\n", 54 | "train_index = ~test_index\n", 55 | "from sklearn.model_selection import train_test_split\n", 56 | "X_train, X_validation, y_train, y_validation = train_test_split(data[train_index], data[train_index]['age_group'], test_size=0.02, random_state=42)" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 4, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "res_test_64898 = np.load(\"./out/proba_test_64898.npy\")\n", 66 | "res_val_64898 = np.load(\"./out/proba_val_64898.npy\")\n", 67 | "res_test_65017 = np.load(\"./out/proba_test_65017.npy\")\n", 68 | "res_val_65017 = np.load(\"./out/proba_val_65017.npy\")\n", 69 | "res_test_6504 = np.load(\"./out/proba_test_6504.npy\")\n", 70 | "res_val_6504 = np.load(\"./out/proba_val_6504.npy\")\n", 71 | "res_test_6489 = np.load(\"./out/proba_test_6489.npy\")\n", 72 | "res_val_6489 = np.load(\"./out/proba_val_6489.npy\")\n", 73 | "res_test_65077 = np.load(\"./out/proba_test_65077.npy\")\n", 74 | "res_val_65077 = np.load(\"./out/proba_val_65077.npy\")\n", 75 | "res_test_65042 = np.load(\"./out/proba_test_65042.npy\")\n", 76 | "res_val_65042 = np.load(\"./out/proba_val_65042.npy\")\n", 77 | "res_test_65131 = np.load(\"./out/proba_test_65131.npy\")\n", 78 | "res_val_65131 = np.load(\"./out/proba_val_65131.npy\")\n", 79 | "res_test_6362 = np.load(\"./out/proba_test_6362.npy\")\n", 80 | "res_val_6362 = np.load(\"./out/proba_val_6362.npy\")\n", 81 | "res_test_6429 = np.load(\"./out/proba_test_6429.npy\")\n", 82 | "res_val_6429 = np.load(\"./out/proba_val_6429.npy\")\n", 83 | "res_test_6513 = 
np.load(\"./out/proba_test_6513.npy\")\n", 84 | "res_val_6513 = np.load(\"./out/proba_val_6513.npy\")\n", 85 | "res_test_6535 = np.load(\"./out/proba_test_6535.npy\")\n", 86 | "res_val_6535 = np.load(\"./out/proba_val_6535.npy\")\n", 87 | "res_test_6534 = np.load(\"./out/proba_test_6534.npy\")\n", 88 | "res_val_6534 = np.load(\"./out/proba_val_6534.npy\")\n", 89 | "res_test_6550 = np.load(\"./out/proba_test_6550.npy\")\n", 90 | "res_val_6550 = np.load(\"./out/proba_val_6550.npy\")\n", 91 | "res_test_6463 = np.load(\"./out/proba_test_6463.npy\")\n", 92 | "res_val_6463 = np.load(\"./out/proba_val_6463.npy\")\n", 93 | "res_test_6473 = np.load(\"./out/proba_test_6473.npy\")\n", 94 | "res_val_6473 = np.load(\"./out/proba_val_6473.npy\")\n", 95 | "res_test_6490 = np.load(\"./out/proba_test_6490.npy\")\n", 96 | "res_val_6490 = np.load(\"./out/proba_val_6490.npy\")\n", 97 | "res_test_6443 = np.load(\"./out/proba_test_6443.npy\")\n", 98 | "res_val_6443 = np.load(\"./out/proba_val_6443.npy\")\n", 99 | "res_test_6440 = np.load(\"./out/proba_test_6440.npy\")\n", 100 | "res_val_6440 = np.load(\"./out/proba_val_6440.npy\")\n", 101 | "res_test_64983 = np.load(\"./out/proba_test_64983.npy\")\n", 102 | "res_val_64983 = np.load(\"./out/proba_val_64983.npy\")\n", 103 | "res_test_64611 = np.load(\"./out/proba_test_64611.npy\")\n", 104 | "res_val_64611 = np.load(\"./out/proba_val_64611.npy\")\n", 105 | "res_test_64639 = np.load(\"./out/proba_test_64639.npy\")\n", 106 | "res_val_64639 = np.load(\"./out/proba_val_64639.npy\")\n", 107 | "res_test_64527 = np.load(\"./out/proba_test_0.64527.npy\")\n", 108 | "res_val_64527 = np.load(\"./out/proba_val_0.64527.npy\")\n", 109 | "res_test_646390 = np.load(\"./out/proba_test_0.64639.npy\")\n", 110 | "res_val_646390 = np.load(\"./out/proba_val_0.64639.npy\")\n", 111 | "res_test_64711 = np.load(\"./out/proba_test_0.64711.npy\")\n", 112 | "res_val_64711 = np.load(\"./out/proba_val_0.64711.npy\")\n", 113 | "res_test_64687 = 
np.load(\"./out/proba_test_0.64687.npy\")\n", 114 | "res_val_64687 = np.load(\"./out/proba_val_0.64687.npy\")\n", 115 | "res_test_64659 = np.load(\"./out/proba_test_0.64659.npy\")\n", 116 | "res_val_64659 = np.load(\"./out/proba_val_0.64659.npy\")\n", 117 | "res_test_64647 = np.load(\"./out/proba_test_0.64647.npy\")\n", 118 | "res_val_64647 = np.load(\"./out/proba_val_0.64647.npy\")\n", 119 | "res_test_64741 = np.load(\"./out/proba_test_0.64741.npy\")\n", 120 | "res_val_64741 = np.load(\"./out/proba_val_0.64741.npy\")\n", 121 | "res_test_64488 = np.load(\"./out/proba_test_0.64488.npy\")\n", 122 | "res_val_64488 = np.load(\"./out/proba_val_0.64488.npy\")\n", 123 | "res_test_646 = np.load(\"./out/proba_test_0.646.npy\")\n", 124 | "res_val_646 = np.load(\"./out/proba_val_0.646.npy\")\n", 125 | "res_test_64764 = np.load(\"./out/proba_test_0.64764.npy\")\n", 126 | "res_val_64764 = np.load(\"./out/proba_val_0.64764.npy\")\n", 127 | "res_test_64729 = np.load(\"./out/proba_test_0.64729.npy\")\n", 128 | "res_val_64729 = np.load(\"./out/proba_val_0.64729.npy\")\n", 129 | "res_test_64801 = np.load(\"./out/proba_test_0.64801.npy\")\n", 130 | "res_val_64801 = np.load(\"./out/proba_val_0.64801.npy\")\n", 131 | "res_test_64746 = np.load(\"./out/proba_test_0.64746.npy\")\n", 132 | "res_val_64746 = np.load(\"./out/proba_val_0.64746.npy\")\n", 133 | "res_test_64751 = np.load(\"./out/proba_test_0.64751.npy\")\n", 134 | "res_val_64751 = np.load(\"./out/proba_val_0.64751.npy\")\n", 135 | "res_test_644801 = np.load(\"./out/proba_test_0.644801.npy\")\n", 136 | "res_val_644801 = np.load(\"./out/proba_val_0.644801.npy\")\n", 137 | "res_test_64896 = np.load(\"./out/proba_test_0.64896.npy\")\n", 138 | "res_val_64896 = np.load(\"./out/proba_val_0.64896.npy\")\n", 139 | "res_test_646592 = np.load(\"./out/proba_test_0.646592.npy\")\n", 140 | "res_val_646592 = np.load(\"./out/proba_val_0.646592.npy\")\n", 141 | "res_test_64659 = np.load(\"./out/proba_test_0.64659.npy\")\n", 142 | "res_val_64659 
= np.load(\"./out/proba_val_0.64659.npy\")\n", 143 | "res_test_64784 = np.load(\"./out/proba_test_0.64784.npy\")\n", 144 | "res_val_64784 = np.load(\"./out/proba_val_0.64784.npy\")\n", 145 | "res_test_64871 = np.load(\"./out/proba_test_0.64871.npy\")\n", 146 | "res_val_64871 = np.load(\"./out/proba_val_0.64871.npy\")\n", 147 | "res_test_65221 = np.load(\"./out/proba_test_0.65221.npy\")\n", 148 | "res_val_65221 = np.load(\"./out/proba_val_0.65221.npy\")\n", 149 | "res_test_65117 = np.load(\"./out/proba_test_0.65117.npy\")\n", 150 | "res_val_65117 = np.load(\"./out/proba_val_0.65117.npy\")\n", 151 | "res_test_64649 = np.load(\"./out/proba_test_0.64649.npy\")\n", 152 | "res_val_64649 = np.load(\"./out/proba_val_0.64649.npy\")\n", 153 | "res_test_6503 = np.load(\"./out/proba_test_0.6503.npy\")\n", 154 | "res_val_6503 = np.load(\"./out/proba_val_0.6503.npy\")\n", 155 | "res_test_65236 = np.load(\"./out/proba_test_0.65236.npy\")\n", 156 | "res_val_65236 = np.load(\"./out/proba_val_0.65236.npy\")" 157 | ] 158 | }, 159 | { 160 | "cell_type": "markdown", 161 | "metadata": {}, 162 | "source": [ 163 | "# 对各模型结果用随机数作权重进行组合" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": null, 169 | "metadata": {}, 170 | "outputs": [], 171 | "source": [ 172 | "best_score = 0.659\n", 173 | "for i in tnrange(0,3000000,1):\n", 174 | " i1 = random.uniform(25, 35)#0.1636255590669533\n", 175 | " i2 = random.uniform(19, 29) #1.4855965015936774 # \n", 176 | " i3 = random.uniform(-5, 5)#8.329328614291946 #\n", 177 | " i4 = random.uniform(-90, -80)#-2.9972802044593543 #\n", 178 | " i5 = random.uniform(23, 33)#3.852821930031347 #\n", 179 | " i6 = random.uniform(81, 91)#-1.8581970197832307 #\n", 180 | " i7 = random.uniform(89, 99)#10.104086995652871 #\n", 181 | " i8 = random.uniform(46, 56) #0.9999500014820883 #\n", 182 | " i9 = random.uniform(40, 50) #0.9998254782546445 #\n", 183 | " i10 =random.uniform(11, 21)# 0.002169386183770161 #\n", 184 | " i11 =random.uniform(-11, -1)# 
1.0019618676346278 #\n", 185 | " i12 =random.uniform(-87, -77)# 0#\n", 186 | " i13 =random.uniform(-75, -65)# 2#\n", 187 | " i14 =random.uniform(62, 72)# 1#\n", 188 | " i15 =random.uniform(86, 96)# 0.9887095911033461#\n", 189 | " i16 =random.uniform(42, 52)\n", 190 | " i17 =random.uniform(-66, -56)\n", 191 | " i18 =random.uniform(-5, 5)\n", 192 | " i19 =random.uniform(42, 52)\n", 193 | " i20 =random.uniform(67, 77)\n", 194 | " i21 =random.uniform(80, 90)\n", 195 | " i22 =random.uniform(42, 52)\n", 196 | " i23 =random.uniform(-32, -22)\n", 197 | " i24 =random.uniform(-58, -48)\n", 198 | " i25 =random.uniform(67, 77)\n", 199 | " i26 =random.uniform(17, 27)\n", 200 | " i27 =random.uniform(-25, -15)\n", 201 | " i28 =random.uniform(-3, 7)\n", 202 | " i29 =random.uniform(-72, -62)# 0.002169386183770161 #\n", 203 | " i30 =random.uniform(76, 86)# 1.0019618676346278 #\n", 204 | " i31 =random.uniform(-33, -23)# 0#\n", 205 | " i32 =random.uniform(44, 54)# 2#\n", 206 | " i33 =random.uniform(24, 34)# 1#\n", 207 | " i34 =random.uniform(-56, -46)# 0.9887095911033461#\n", 208 | " i35 =random.uniform(40, 50)\n", 209 | " i36 =random.uniform(-81, -71)\n", 210 | " i37 =random.uniform(-68, -58)\n", 211 | " i38 =random.uniform(-11, -1)\n", 212 | " i39 =random.uniform(-68, -58)\n", 213 | " i40 =random.uniform(-60, -50)\n", 214 | " i41 =random.uniform(-49,-39)\n", 215 | " i42 =random.uniform(47, 57)\n", 216 | " i43 =random.uniform(-35, -25)\n", 217 | " i44 =random.uniform(49, 59)\n", 218 | " i45 =random.uniform(-33, -23)\n", 219 | " i46 =random.uniform(73, 83)\n", 220 | " \n", 221 | " res_val = 
np.argmax(i1*res_val_65236+i2*res_val_65221+i3*res_val_65117+i4*res_val_64649+i6*res_val_6503+i7*res_val_6550+i8*res_val_6535+i9*res_val_6534+i10*res_val_65131+i11*res_val_6513+i12*res_val_65077+i13*res_val_65042+i14*res_val_6504+i15*res_val_65017+i16*res_val_64983+i17*res_val_6490+i18*res_val_64898+i19*res_val_64896+i20*res_val_6489+i21*res_val_64871+i22*res_val_64801+i23*res_val_64784+i24*res_val_64764+i25*res_val_64751+i26*res_val_64746+i27*res_val_64741+i28*res_val_6473+i29*res_val_64729+i30*res_val_64711+i31*res_val_64687+i32*res_val_646592+i33*res_val_64659+i34*res_val_64647+i35*res_val_646390+i36*res_val_64639+i37*res_val_6463+i38*res_val_64611+i39*res_val_646+i40*res_val_64527+i41*res_val_64488+i42*res_val_644801+i43*res_val_6443+i44*res_val_6440+i45*res_val_6429+i46*res_val_6362, axis=1) + 1\n", 222 | " acc = accuracy_score(y_true=y_validation, y_pred=res_val)\n", 223 | " if acc>best_score:\n", 224 | " best_score=acc\n", 225 | " res_test = np.argmax(i1*res_test_65236+i2*res_test_65221+i3*res_test_65117+i4*res_test_64649+i6*res_test_6503+i7*res_test_6550+i8*res_test_6535+i9*res_test_6534+i10*res_test_65131+i11*res_test_6513+i12*res_test_65077+i13*res_test_65042+i14*res_test_6504+i15*res_test_65017+i16*res_test_64983+i17*res_test_6490+i18*res_test_64898+i19*res_test_64896+i20*res_test_6489+i21*res_test_64871+i22*res_test_64801+i23*res_test_64784+i24*res_test_64764+i25*res_test_64751+i26*res_test_64746+i27*res_test_64741+i28*res_test_6473+i29*res_test_64729+i30*res_test_64711+i31*res_test_64687+i32*res_test_646592+i33*res_test_64659+i34*res_test_64647+i35*res_test_646390+i36*res_test_64639+i37*res_test_6463+i38*res_test_64611+i39*res_test_646+i40*res_test_64527+i41*res_test_64488+i42*res_test_644801+i43*res_test_6443+i44*res_test_6440+i45*res_test_6429+i46*res_test_6362, axis=1) + 1\n", 226 | " 
print(\"i1=\",i1,\"i2=\",i2,\"i3=\",i3,\"i4=\",i4,\"i5=\",i5,\"i6=\",i6,\"i7=\",i7,\"i8=\",i8,\"i9=\",i9,\"i10=\",i10,\"i11=\",i11,\"i12=\",i12,\"i13=\",i13,\"i14=\",i14,\"i15=\",i15,\"i16=\",i16,\"i17=\",i17,\"i18=\",i18,\"i19=\",i19,\"i20=\",i20,\"i21=\",i21,\"i22=\",i22,\"i23=\",i23,\"i24=\",i24,\"i25=\",i25,\"i26=\",i26,\"i27=\",i27,\"i28=\",i28,\"i29=\",i29,\"i30=\",i30,\"i31=\",i31,\"i32=\",i32,\"i33=\",i33,\"i34=\",i34,\"i35=\",i35,\"i36=\",i36,\"i37=\",i37,\"i38=\",i38,\"i39=\",i39,\"i40=\",i40,\"i41=\",i41,\"i42=\",i42,\"i43=\",i43,\"i44=\",i44,\"i45=\",i45,\"i46=\",i46,\" \",\"accuracy=\",acc)\n", 227 | " print(acc)\n", 228 | " #65957" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": 12, 234 | "metadata": {}, 235 | "outputs": [ 236 | { 237 | "name": "stdout", 238 | "output_type": "stream", 239 | "text": [ 240 | "Save Done.\n" 241 | ] 242 | } 243 | ], 244 | "source": [ 245 | "result = pd.DataFrame()\n", 246 | "result['id'] = data[test_index]['uId']\n", 247 | "result['label'] = res_test.astype(int)\n", 248 | "result.to_csv('./out/submission.csv', index=False)\n", 249 | "print('Save Done.')" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": 3, 255 | "metadata": {}, 256 | "outputs": [], 257 | "source": [ 258 | "pred_val_cb_6120 = np.load(\"./out/proba_cb_nan_val_6120.npy\")\n", 259 | "pred_test_cb_6120 = np.load(\"./out/proba_cb_nan_test_6120.npy\")\n", 260 | "pred_val_lgb_61228 = np.load(\"./out/proba_lgb_nan_val_612284.npy\")\n", 261 | "pred_test_lgb_61228 = np.load(\"./out/proba_lgb_nan_test_612284.npy\")\n", 262 | "pred_val_xgb_61312 = np.load(\"./out/proba_xgb_nan_val_61312.npy\")\n", 263 | "pred_test_xgb_61312 = np.load(\"./out/proba_xgb_nan_test_61312.npy\")\n", 264 | "pred_val_xgb_6118 = np.load(\"./out/proba_xgb_nan_val_6118.npy\")\n", 265 | "pred_test_xgb_6118 = np.load(\"./out/proba_xgb_nan_test_6118.npy\")\n", 266 | "pred_val_xgb_6114 = np.load(\"./out/proba_xgb_nan_val_6114.npy\")\n", 267 | 
"pred_test_xgb_6114 = np.load(\"./out/proba_xgb_nan_test_6114.npy\")" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": 4, 273 | "metadata": {}, 274 | "outputs": [ 275 | { 276 | "data": { 277 | "application/vnd.jupyter.widget-view+json": { 278 | "model_id": "d017d46a59014e9ebb68eb820f5769c3", 279 | "version_major": 2, 280 | "version_minor": 0 281 | }, 282 | "text/plain": [ 283 | "HBox(children=(IntProgress(value=0, max=100000), HTML(value='')))" 284 | ] 285 | }, 286 | "metadata": {}, 287 | "output_type": "display_data" 288 | }, 289 | { 290 | "name": "stdout", 291 | "output_type": "stream", 292 | "text": [ 293 | "\n" 294 | ] 295 | }, 296 | { 297 | "ename": "NameError", 298 | "evalue": "name 'y_validation' is not defined", 299 | "output_type": "error", 300 | "traceback": [ 301 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 302 | "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", 303 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[0mres_val\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0margmax\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mi1\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0mpred_val_cb_6120\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0mi2\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0mpred_val_lgb_61228\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0mi3\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0mpred_val_xgb_61312\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0mi4\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0mpred_val_xgb_6118\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0mi5\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0mpred_val_xgb_6114\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 12\u001b[0;31m 
\u001b[0macc\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0maccuracy_score\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my_true\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0my_validation\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_pred\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mres_val\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 13\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0macc\u001b[0m\u001b[0;34m>\u001b[0m\u001b[0mbest_score\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 14\u001b[0m \u001b[0mbest_score\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0macc\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 304 | "\u001b[0;31mNameError\u001b[0m: name 'y_validation' is not defined" 305 | ] 306 | } 307 | ], 308 | "source": [ 309 | "best_score = 0\n", 310 | "from tqdm import tnrange\n", 311 | "import random\n", 312 | "for i in tnrange(0,100000,1):\n", 313 | " i1 = random.uniform(0, 10)#0.1636255590669533\n", 314 | " i2 = random.uniform(0, 10) #1.4855965015936774 # \n", 315 | " i3 = random.uniform(0, 10)#8.329328614291946 #\n", 316 | " i4 = random.uniform(0, 10)\n", 317 | " i5 = random.uniform(0, 10)\n", 318 | " \n", 319 | " res_val = np.argmax(i1*pred_val_cb_6120+i2*pred_val_lgb_61228+i3*pred_val_xgb_61312+i4*pred_val_xgb_6118+i5*pred_val_xgb_6114, axis=1) + 1\n", 320 | " acc = accuracy_score(y_true=y_validation, y_pred=res_val)\n", 321 | " if acc>best_score:\n", 322 | " best_score=acc\n", 323 | " res_test = np.argmax(i1*pred_test_cb_6120+i2*pred_test_lgb_61228+i3*pred_test_xgb_61312+i4*pred_test_xgb_6118+i5*pred_test_xgb_6114, axis=1) + 1\n", 324 | " print(\"i1=\",i1,\"i2=\",i2,\"i3=\",i3,\"i4=\",i4,\"i5=\",i5,\" \",\"accuracy=\",acc)\n", 325 | " print(acc)\n", 326 | " #6138" 327 | ] 328 | }, 329 | { 330 | "cell_type": "markdown", 331 | "metadata": {}, 332 | "source": [ 333 | "# 用LightGBM作次级分类器" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": 167, 339 | "metadata": {}, 340 | "outputs": [], 341
| "source": [ 342 | "import lightgbm_gpu as lgb\n", 343 | "from sklearn.model_selection import train_test_split\n", 344 | "from sklearn.model_selection import StratifiedKFold,KFold\n", 345 | "import warnings\n", 346 | "warnings.filterwarnings(\"ignore\")" 347 | ] 348 | }, 349 | { 350 | "cell_type": "code", 351 | "execution_count": 164, 352 | "metadata": {}, 353 | "outputs": [], 354 | "source": [ 355 | "train_x = np.hstack([res_val_6490,res_val_6362,res_val_6429,res_val_6489,res_val_64898,res_val_65017,res_val_6504,res_val_6504,res_val_65042,res_val_65077,res_val_6513,res_val_65131,res_val_6535,res_val_6534,res_val_6550])\n", 356 | "test_x = np.hstack([res_test_6490,res_test_6362,res_test_6429,res_test_6489,res_test_64898,res_test_65017,res_test_6504,res_test_6504,res_test_65042,res_test_65077,res_test_6513,res_test_65131,res_test_6535,res_test_6534,res_test_6550])\n", 357 | "train_y = y_validation.values\n", 358 | "train_x = pd.DataFrame(train_x)\n", 359 | "test_x = pd.DataFrame(test_x)\n", 360 | "train_y = pd.DataFrame(train_y)" 361 | ] 362 | }, 363 | { 364 | "cell_type": "code", 365 | "execution_count": 28, 366 | "metadata": { 367 | "scrolled": true 368 | }, 369 | "outputs": [], 370 | "source": [ 371 | "X_train, X_validation, y_train, y_validation = train_test_split(train_x, train_y, test_size=0.2, random_state=42)" 372 | ] 373 | }, 374 | { 375 | "cell_type": "code", 376 | "execution_count": 165, 377 | "metadata": {}, 378 | "outputs": [], 379 | "source": [ 380 | "def acc(labels,preds):\n", 381 | " preds = np.argmax(preds.reshape(6, -1), axis=0)\n", 382 | " score = accuracy_score(y_true=labels, y_pred=preds)\n", 383 | " return 'accuracy', score, True" 384 | ] 385 | }, 386 | { 387 | "cell_type": "code", 388 | "execution_count": null, 389 | "metadata": {}, 390 | "outputs": [], 391 | "source": [ 392 | "from sklearn.model_selection import StratifiedKFold\n", 393 | "n_splits = 10\n", 394 | "skf = StratifiedKFold(n_splits=n_splits,shuffle=True, random_state=42)\n", 395 
| "\n", 396 | "val_scores = [0] * n_splits\n", 397 | "cv_val = np.zeros((train_x.shape[0],6))\n", 398 | "\n", 399 | "for i,(fit_idx,val_idx) in enumerate(skf.split(train_x, train_y)):\n", 400 | " X_fit = train_x.iloc[fit_idx]\n", 401 | " y_fit = train_y.iloc[fit_idx]\n", 402 | " X_val = train_x.iloc[val_idx]\n", 403 | " y_val = train_y.iloc[val_idx]\n", 404 | " \n", 405 | " lgb_model = lgb.LGBMClassifier(boosting_type=\"gbdt\", num_leaves=52, reg_alpha=0.1, reg_lambda=1,\n", 406 | " max_depth=-1, n_estimators=5000, objective='multiclass',sub_feature=0.8,\n", 407 | " subsample=0.8, colsample_bytree=0.8, subsample_freq=1,min_child_samples=50, \n", 408 | " learning_rate=0.01, random_state=2019, metric=\"None\",n_jobs=8,device='cpu')\n", 409 | " \n", 410 | " lgb_model.fit(X_fit,y_fit,eval_set=[(X_val, y_val)],eval_metric=acc,early_stopping_rounds=50,verbose=None)\n", 411 | " \n", 412 | " val_scores[i] = lgb_model.best_score_['valid_0']['accuracy']\n", 413 | " cv_val[val_idx] = lgb_model.predict_proba(X_val)\n", 414 | " print('Fold {} acc: {:.5f}'.format(i+1, val_scores[i]))\n", 415 | " \n", 416 | "\n", 417 | "val_mean = np.mean(val_scores)\n", 418 | "val_std = np.std(val_scores)\n", 419 | " \n", 420 | "print('average acc: {:.5f} (±{:.5f})'.format(val_mean, val_std)) #0.657" 421 | ] 422 | }, 423 | { 424 | "cell_type": "code", 425 | "execution_count": null, 426 | "metadata": {}, 427 | "outputs": [], 428 | "source": [ 429 | "res_test = lgb_model.predict(test_x)\n", 430 | "result = pd.DataFrame()\n", 431 | "result['id'] = data[test_index]['uId']\n", 432 | "result['label'] = res_test.astype(int)\n", 433 | "result.to_csv('./out/submission.csv', index=False)\n", 434 | "print('Save Done.')" 435 | ] 436 | }, 437 | { 438 | "cell_type": "markdown", 439 | "metadata": {}, 440 | "source": [ 441 | "# 尝试利用各种线性模型对各模型结果进行融合" 442 | ] 443 | }, 444 | { 445 | "cell_type": "code", 446 | "execution_count": 15, 447 | "metadata": {}, 448 | "outputs": [], 449 | "source": [ 450 | "from 
sklearn.linear_model import LogisticRegression,RidgeClassifierCV,RidgeClassifier,LogisticRegressionCV,SGDClassifier\n", 451 | "import gc" 452 | ] 453 | }, 454 | { 455 | "cell_type": "code", 456 | "execution_count": 14, 457 | "metadata": {}, 458 | "outputs": [ 459 | { 460 | "data": { 461 | "text/plain": [ 462 | "(40200, 240)" 463 | ] 464 | }, 465 | "execution_count": 14, 466 | "metadata": {}, 467 | "output_type": "execute_result" 468 | } 469 | ], 470 | "source": [ 471 | "#all\n", 472 | "train_x = np.hstack([res_val_65236,res_val_65221,res_val_65117,res_val_64649,res_val_6503,res_val_6550,res_val_6535,res_val_6534,res_val_65131,res_val_6513,res_val_65077,res_val_65042,res_val_6504,res_val_65017,res_val_64983,res_val_6490,res_val_64898,res_val_64896,res_val_6489,res_val_64871,res_val_64801,res_val_64784,res_val_64764,res_val_64751,res_val_64746,res_val_64741,res_val_6473,res_val_64729,res_val_64711,res_val_64687,res_val_646592,res_val_64659,res_val_64647,res_val_646390,res_val_64639,res_val_6463,res_val_64611,res_val_646,res_val_64527,res_val_64488,res_val_644801,res_val_6443,res_val_6440,res_val_6429,res_val_6362])\n", 473 | "test_x = np.hstack([res_test_65236,res_test_65221,res_test_65117,res_test_64649,res_test_6503,res_test_6550,res_test_6535,res_test_6534,res_test_65131,res_test_6513,res_test_65077,res_test_65042,res_test_6504,res_test_65017,res_test_64983,res_test_6490,res_test_64898,res_test_64896,res_test_6489,res_test_64871,res_test_64801,res_test_64784,res_test_64764,res_test_64751,res_test_64746,res_test_64741,res_test_6473,res_test_64729,res_test_64711,res_test_64687,res_test_646592,res_test_64659,res_test_64647,res_test_646390,res_test_64639,res_test_6463,res_test_64611,res_test_646,res_test_64527,res_test_64488,res_test_644801,res_test_6443,res_test_6440,res_test_6429,res_test_6362])\n", 474 | "train_y = y_validation\n", 475 | "train_x.shape" 476 | ] 477 | }, 478 | { 479 | "cell_type": "code", 480 | "execution_count": 116,
481 | "metadata": {}, 482 | "outputs": [ 483 | { 484 | "data": { 485 | "text/plain": [ 486 | "(40200, 234)" 487 | ] 488 | }, 489 | "execution_count": 116, 490 | "metadata": {}, 491 | "output_type": "execute_result" 492 | } 493 | ], 494 | "source": [ 495 | "#few\n", 496 | "train_x = np.hstack([res_val_65236,res_val_65221,res_val_64649,res_val_6503,res_val_6550,res_val_6535,res_val_6534,res_val_65131,res_val_6513,res_val_65077,res_val_65042,res_val_65017,res_val_64983,res_val_6490,res_val_64898,res_val_64896,res_val_64801,res_val_64784,res_val_64764,res_val_64751,res_val_64746,res_val_64741,res_val_64729,res_val_64711,res_val_64687,res_val_646592,res_val_64659,res_val_64647,res_val_646390,res_val_64639,res_val_6463,res_val_64611,res_val_646,res_val_64527,res_val_64488,res_val_644801,res_val_6440,res_val_6429,res_val_6362])\n", 497 | "test_x = np.hstack([res_test_65236,res_test_65221,res_test_64649,res_test_6503,res_test_6550,res_test_6535,res_test_6534,res_test_65131,res_test_6513,res_test_65077,res_test_65042,res_test_65017,res_test_64983,res_test_6490,res_test_64898,res_test_64896,res_test_64801,res_test_64784,res_test_64764,res_test_64751,res_test_64746,res_test_64741,res_test_64729,res_test_64711,res_test_64687,res_test_646592,res_test_64659,res_test_64647,res_test_646390,res_test_64639,res_test_6463,res_test_64611,res_test_646,res_test_64527,res_test_64488,res_test_644801,res_test_6440,res_test_6429,res_test_6362])\n", 498 | "train_y = y_validation\n", 499 | "train_x.shape" 500 | ] 501 | }, 502 | { 503 | "cell_type": "code", 504 | "execution_count": 117, 505 | "metadata": {}, 506 | "outputs": [ 507 | { 508 | "data": { 509 | "text/plain": [ 510 | "0.6621890547263681" 511 | ] 512 | }, 513 | "execution_count": 117, 514 | "metadata": {}, 515 | "output_type": "execute_result" 516 | } 517 | ], 518 | "source": [ 519 | "clf = RidgeClassifierCV()\n", 520 | "clf.fit(train_x,train_y)\n", 521 | "pred = clf.predict(train_x)\n", 522 | "pred = np.round(pred)\n", 523 | 
"print(accuracy_score(y_pred=pred ,y_true=train_y))#Ridge_cv 6621 Ridge 66422\n", 524 | "cv_test = clf.predict(test_x)\n", 525 | "cv_test = np.round(cv_test)\n", 526 | "print(cv_test)" 527 | ] 528 | }, 529 | { 530 | "cell_type": "code", 531 | "execution_count": 120, 532 | "metadata": {}, 533 | "outputs": [ 534 | { 535 | "name": "stdout", 536 | "output_type": "stream", 537 | "text": [ 538 | "Save Done.\n" 539 | ] 540 | } 541 | ], 542 | "source": [ 543 | "result = pd.DataFrame()\n", 544 | "result['id'] = data[test_index]['uId']\n", 545 | "result['label'] = cv_test.astype(int)\n", 546 | "result.to_csv('./out/submission_rc_cv_66219.csv', index=False)\n", 547 | "print('Save Done.')" 548 | ] 549 | }, 550 | { 551 | "cell_type": "markdown", 552 | "metadata": {}, 553 | "source": [ 554 | "# 按线上分数再投票" 555 | ] 556 | }, 557 | { 558 | "cell_type": "code", 559 | "execution_count": 2, 560 | "metadata": {}, 561 | "outputs": [], 562 | "source": [ 563 | "res1 = pd.read_csv('./out/submission_online_66045.csv')\n", 564 | "res2 = pd.read_csv('./out/submission_online_660414.csv')\n", 565 | "res3 = pd.read_csv('./out/submission_online_660448.csv')\n", 566 | "res4 = pd.read_csv('./out/submission_online_660468.csv')\n", 567 | "res5 = pd.read_csv('./out/submission_online_660577.csv')\n", 568 | "res6 = pd.read_csv('./out/submission_online_660551.csv')\n", 569 | "res7 = pd.read_csv('./out/submission_online_660579.csv')\n", 570 | "res8 = pd.read_csv('./out/submission_online_66056.csv')\n", 571 | "res9 = pd.read_csv('./out/submission_mix_660786.csv')" 572 | ] 573 | }, 574 | { 575 | "cell_type": "code", 576 | "execution_count": 3, 577 | "metadata": {}, 578 | "outputs": [], 579 | "source": [ 580 | "res1 = res1.merge(res2, on='id', how='left').merge(res3, on='id', how='left').merge(res4, on='id', how='left')\\\n", 581 | " .merge(res5, on='id', how='left').merge(res6, on='id', how='left').merge(res7, on='id', how='left')\\\n", 582 | " .merge(res8, on='id', how='left').merge(res9, on='id', 
how='left').merge(res7, on='id', how='left')" 583 | ] 584 | }, 585 | { 586 | "cell_type": "code", 587 | "execution_count": 4, 588 | "metadata": {}, 589 | "outputs": [], 590 | "source": [ 591 | "res1.columns=['id','label_1','label_2','label_3','label_4','label_5','label_6','label_7','label_8','label_9','label_10']" 592 | ] 593 | }, 594 | { 595 | "cell_type": "code", 596 | "execution_count": 5, 597 | "metadata": {}, 598 | "outputs": [], 599 | "source": [ 600 | "tmp = res1[['label_1','label_2','label_3','label_4','label_5','label_6','label_7','label_8','label_9','label_10']]" 601 | ] 602 | }, 603 | { 604 | "cell_type": "code", 605 | "execution_count": 6, 606 | "metadata": {}, 607 | "outputs": [ 608 | { 609 | "data": { 610 | "application/vnd.jupyter.widget-view+json": { 611 | "model_id": "9aa8029e6e714460b4781a89294ac8e4", 612 | "version_major": 2, 613 | "version_minor": 0 614 | }, 615 | "text/plain": [ 616 | "HBox(children=(IntProgress(value=0, max=502500), HTML(value='')))" 617 | ] 618 | }, 619 | "metadata": {}, 620 | "output_type": "display_data" 621 | }, 622 | { 623 | "name": "stdout", 624 | "output_type": "stream", 625 | "text": [ 626 | "\n" 627 | ] 628 | } 629 | ], 630 | "source": [ 631 | "res=[]\n", 632 | "for i in tnrange(tmp.shape[0]):\n", 633 | " counts = np.bincount(tmp.values[i]) \n", 634 | " res.append(np.argmax(counts)) " 635 | ] 636 | }, 637 | { 638 | "cell_type": "code", 639 | "execution_count": 7, 640 | "metadata": {}, 641 | "outputs": [], 642 | "source": [ 643 | "res1['label'] = res\n", 644 | "res = res1[['id','label']]\n", 645 | "res.to_csv('./out/submission_mix.csv',index=False)" 646 | ] 647 | }, 648 | { 649 | "cell_type": "code", 650 | "execution_count": null, 651 | "metadata": {}, 652 | "outputs": [], 653 | "source": [] 654 | } 655 | ], 656 | "metadata": { 657 | "kernelspec": { 658 | "display_name": "Python 3", 659 | "language": "python", 660 | "name": "python3" 661 | }, 662 | "language_info": { 663 | "codemirror_mode": { 664 | "name": "ipython", 
665 | "version": 3 666 | }, 667 | "file_extension": ".py", 668 | "mimetype": "text/x-python", 669 | "name": "python", 670 | "nbconvert_exporter": "python", 671 | "pygments_lexer": "ipython3", 672 | "version": "3.6.7" 673 | } 674 | }, 675 | "nbformat": 4, 676 | "nbformat_minor": 2 677 | } 678 | -------------------------------------------------------------------------------- /Travis/HUAWEI_FutureLab_model.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import catboost\n", 10 | "from catboost import CatBoostClassifier, Pool, cv\n", 11 | "import pandas as pd\n", 12 | "import numpy as np\n", 13 | "from sklearn.model_selection import train_test_split\n", 14 | "from sklearn.metrics import accuracy_score\n", 15 | "from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler, RobustScaler\n", 16 | "from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer\n", 17 | "from sklearn.decomposition import TruncatedSVD\n", 18 | "from sklearn.model_selection import StratifiedKFold\n", 19 | "from sklearn.linear_model import SGDClassifier\n", 20 | "from tqdm import tnrange,tqdm_notebook\n", 21 | "import gc\n", 22 | "import warnings\n", 23 | "from scipy import sparse\n", 24 | "warnings.filterwarnings(\"ignore\")" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "# 原始数据" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 2, 37 | "metadata": {}, 38 | "outputs": [ 39 | { 40 | "name": "stdout", 41 | "output_type": "stream", 42 | "text": [ 43 | "CPU times: user 477 ms, sys: 186 ms, total: 663 ms\n", 44 | "Wall time: 662 ms\n" 45 | ] 46 | } 47 | ], 48 | "source": [ 49 | "%%time\n", 50 | "age_test = pd.read_csv('../age_test.csv', header = None, names = ['uId'])\n", 51 | "age_train = pd.read_csv('../age_train.csv', 
header = None, names = ['uId','age_group'])\n", 52 | "data = pd.concat([age_train,age_test], axis = 0,sort=True).reset_index()\n", 53 | "data.drop(['index'],axis=1,inplace=True)\n", 54 | "del age_test, age_train\n", 55 | "gc.collect() " 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 3, 61 | "metadata": {}, 62 | "outputs": [ 63 | { 64 | "name": "stdout", 65 | "output_type": "stream", 66 | "text": [ 67 | "CPU times: user 6.67 s, sys: 1.55 s, total: 8.23 s\n", 68 | "Wall time: 8.22 s\n" 69 | ] 70 | } 71 | ], 72 | "source": [ 73 | "%%time\n", 74 | "user_basic_info = pd.read_csv('../user_basic_info.csv',header= None, names=['uId','gender','city','prodName','ramCapacity','ramLeftRation','romCapacity','romLeftRation','color','fontSize','ct','carrier','os'])\n", 75 | "user_behavior_info = pd.read_csv('../user_behavior_info.csv', header = None, names = ['uId','bootTimes','AFuncTimes','BFuncTimes','CFuncTimes','DFuncTimes','EFuncTimes','FFuncTimes','FFuncSum'])\n", 76 | "data = data.merge(user_basic_info)\n", 77 | "data = data.merge(user_behavior_info)\n", 78 | "del user_basic_info, user_behavior_info\n", 79 | "gc.collect() " 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "metadata": {}, 85 | "source": [ 86 | "# 特征工程" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 4, 92 | "metadata": {}, 93 | "outputs": [ 94 | { 95 | "name": "stdout", 96 | "output_type": "stream", 97 | "text": [ 98 | "CPU times: user 469 ms, sys: 246 ms, total: 715 ms\n", 99 | "Wall time: 689 ms\n" 100 | ] 101 | } 102 | ], 103 | "source": [ 104 | "%%time\n", 105 | "data['ramLeftCapacity'] = data['ramCapacity'] * data['ramLeftRation']\n", 106 | "data['romLeftCapacity'] = data['romCapacity'] * data['romLeftRation']\n", 107 | "\n", 108 | "for i in ['A','B','C','D','E','F']:\n", 109 | " data['{}FuncTimes'.format(i)] = round(abs(data['{}FuncTimes'.format(i)]))\n", 110 | " \n", 111 | "data['romLeftRation'][data.romLeftRation>1] = 1\n", 112 | 
"data['ramLeftRation'][data.ramLeftRation>1] = 1" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 5, 118 | "metadata": {}, 119 | "outputs": [ 120 | { 121 | "name": "stdout", 122 | "output_type": "stream", 123 | "text": [ 124 | "CPU times: user 1.35 s, sys: 989 ms, total: 2.34 s\n", 125 | "Wall time: 2.34 s\n" 126 | ] 127 | } 128 | ], 129 | "source": [ 130 | "%%time\n", 131 | "#冷暖色系和颜色型号的降维\n", 132 | "tmp_color = pd.DataFrame(list(data.color.unique()))\n", 133 | "tmp_color.columns=['color']\n", 134 | "tmp_color['color_short'] = ['银','黑','蓝','银','金','极光','蓝','金','金','紫','黑','银','蓝','黑白','金','金','金','金','紫','灰','黑','蓝','红','蓝','灰','红','银','黑','白','黑','金','备件颜色','金','灰','白','金','黑','银','青','金','金','灰','灰','黑','蓝','蓝','白','蓝','红','紫','红','蓝','棕','粉','金','灰', '红','黑','灰','灰','紫','金','白','粉','青','金','蓝','灰','绿','银','金','银','银','灰','银','白','白','蓝','红','白','白','蓝','蓝','白','黑','白','极光','红','金','白','紫','蓝','蓝','金','金','蓝','白','红','银白','金','蓝','黑','蓝','粉','紫','灰','蓝','灰','红','黑','金', '红','黑','白','蓝','金','红','灰','灰','蓝','银白','灰','紫','灰','黑','蓝','银','粉','蓝','粉', '黄','橘','红','紫','黄','紫']\n", 135 | "tmp_color_warmcold = pd.DataFrame(list(tmp_color.color_short.unique()))\n", 136 | "tmp_color_warmcold['warm_cold'] = ['冷','冷','冷','暖','暖','暖','冷','冷','暖','暖','未知','冷','暖','暖','冷','暖','暖','暖']\n", 137 | "tmp_color_warmcold.columns=['color_short','warm_cold']\n", 138 | "tmp_color = tmp_color.merge(tmp_color_warmcold, on='color_short', how='left')\n", 139 | "data = data.merge(tmp_color, on='color', how='left')\n", 140 | "del tmp_color, tmp_color_warmcold\n", 141 | "gc.collect()" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 6, 147 | "metadata": {}, 148 | "outputs": [ 149 | { 150 | "name": "stdout", 151 | "output_type": "stream", 152 | "text": [ 153 | "CPU times: user 1.94 s, sys: 852 ms, total: 2.79 s\n", 154 | "Wall time: 2.71 s\n" 155 | ] 156 | } 157 | ], 158 | "source": [ 159 | "%%time\n", 160 | "#rom分箱和求ram rom 
的对应型号的最大最小值和比例\n", 161 | "bins = [4, 12, 24, 48, 96, 192, 384]\n", 162 | "group_names = ['rom_8G', 'rom_16G', 'rom_32G', 'rom_64G', 'rom_128G', 'rom_256G']\n", 163 | "cats = pd.cut(data.romCapacity, bins, labels = group_names)\n", 164 | "rom_max = data.groupby(['prodName'])['romCapacity'].max().reset_index()\n", 165 | "rom_max.columns=['prodName','romCapacity_max']\n", 166 | "rom_min = data.groupby(['prodName'])['romCapacity'].min().reset_index()\n", 167 | "rom_min.columns=['prodName','romCapacity_min']\n", 168 | "ram_max = data.groupby(['prodName'])['ramCapacity'].max().reset_index()\n", 169 | "ram_max.columns=['prodName','ramCapacity_max']\n", 170 | "ram_min = data.groupby(['prodName'])['ramCapacity'].min().reset_index()\n", 171 | "ram_min.columns=['prodName','ramCapacity_min']\n", 172 | "tmp = rom_max.merge(rom_min, on='prodName', how='left').merge(ram_max, on='prodName', how='left').merge(ram_min, on='prodName', how='left')\n", 173 | "res1 = data[['prodName','ramCapacity','romCapacity']]\n", 174 | "res1 = res1.merge(tmp, on='prodName', how='left')\n", 175 | "res1['rom_category'] = cats\n", 176 | "res1['rom_category'] = list(res1['rom_category'])\n", 177 | "res1['ram_max_ratio'] = res1['ramCapacity']/res1['ramCapacity_max']\n", 178 | "res1['ram_min_ratio'] = res1['ramCapacity']/res1['ramCapacity_min']\n", 179 | "res1['rom_max_ratio'] = res1['romCapacity']/res1['romCapacity_max']\n", 180 | "res1['rom_min_ratio'] = res1['romCapacity']/res1['romCapacity_min']\n", 181 | "res1.drop(['prodName','ramCapacity','rom_category'], axis=1 , inplace=True)\n", 182 | "data = pd.concat([data,res1], axis=1)\n", 183 | "del rom_max, rom_min, ram_max, ram_min, res1, tmp\n", 184 | "gc.collect()" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": 7, 190 | "metadata": {}, 191 | "outputs": [ 192 | { 193 | "name": "stdout", 194 | "output_type": "stream", 195 | "text": [ 196 | "CPU times: user 5.22 s, sys: 5.53 s, total: 10.8 s\n", 197 | "Wall time: 10.7 s\n" 198 
| ] 199 | } 200 | ], 201 | "source": [ 202 | "%%time\n", 203 | "#对应手机型号的平均开机次数和fontsize\n", 204 | "tmp1 = data.groupby(['prodName'])['bootTimes'].mean().reset_index()\n", 205 | "tmp1.columns=['prodName','bootTimes_mean']\n", 206 | "data = data.merge(tmp1, on='prodName', how='left')\n", 207 | "tmp2 = data.groupby(['prodName'])['fontSize'].mean().reset_index()\n", 208 | "tmp2.columns=['prodName','fontSize_mean']\n", 209 | "data = data.merge(tmp2, on='prodName', how='left')\n", 210 | "del tmp1, tmp2" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": 8, 216 | "metadata": {}, 217 | "outputs": [ 218 | { 219 | "data": { 220 | "application/vnd.jupyter.widget-view+json": { 221 | "model_id": "07c5f0d7eb934ac59ccfef04f70da186", 222 | "version_major": 2, 223 | "version_minor": 0 224 | }, 225 | "text/plain": [ 226 | "HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))" 227 | ] 228 | }, 229 | "metadata": {}, 230 | "output_type": "display_data" 231 | }, 232 | { 233 | "name": "stdout", 234 | "output_type": "stream", 235 | "text": [ 236 | "\n", 237 | "CPU times: user 10min 8s, sys: 24.2 s, total: 10min 33s\n", 238 | "Wall time: 11min 9s\n" 239 | ] 240 | } 241 | ], 242 | "source": [ 243 | "%%time\n", 244 | "#对usage表进行groupby处理\n", 245 | "user_app_usage = pd.read_csv('../user_app_usage.csv',chunksize=500000,names=['uId','appId','duration','times','use_date'])\n", 246 | "dist = []\n", 247 | "dist1 = []\n", 248 | "dist2 = []\n", 249 | "for chunk in tqdm_notebook(user_app_usage):\n", 250 | " tmp = chunk.groupby(['uId','use_date'])['duration','times'].sum().reset_index().values\n", 251 | " tmp1 = chunk.groupby(['uId','appId']).agg({'duration': 'sum','times': 'sum', 'use_date': 'count'}).reset_index().values\n", 252 | " tmp2 = chunk.groupby(['uId','use_date'])['appId'].count().reset_index().values\n", 253 | "\n", 254 | " dist.extend(tmp)\n", 255 | " dist1.extend(tmp1)\n", 256 | " dist2.extend(tmp2)" 257 | ] 258 | }, 259 | { 260 | 
"cell_type": "code", 261 | "execution_count": 9, 262 | "metadata": {}, 263 | "outputs": [ 264 | { 265 | "name": "stdout", 266 | "output_type": "stream", 267 | "text": [ 268 | "CPU times: user 1min 56s, sys: 30.4 s, total: 2min 27s\n", 269 | "Wall time: 2min 26s\n" 270 | ] 271 | } 272 | ], 273 | "source": [ 274 | "%%time\n", 275 | "#统计usage表中每周各天的使用d时长和使用次数\n", 276 | "user_app_usage_dt = pd.DataFrame(dist)\n", 277 | "user_app_usage_dt.columns=['uId','use_date','date_duration','date_times']\n", 278 | "user_app_usage_res = user_app_usage_dt.groupby(['uId','use_date'])['date_duration','date_times'].sum().reset_index()\n", 279 | "user_app_usage_res['use_date'] = pd.to_datetime(user_app_usage_res['use_date'], errors='coerce')\n", 280 | "user_app_usage_res['use_date'] = user_app_usage_res.use_date.dt.weekday\n", 281 | "user_app_usage_res = user_app_usage_res.rename(columns = {'use_date':'use_date_weekday'})\n", 282 | "user_app_usage_last = user_app_usage_res.groupby(['uId','use_date_weekday'])['date_duration','date_times'].sum().reset_index()\n", 283 | "tmp = user_app_usage_last.pivot(index='uId',values=['date_duration','date_times'],columns='use_date_weekday')\n", 284 | "tmp = tmp.reset_index()\n", 285 | "tmp.columns = ['uId','date_duration_Monday','date_duration_Tuesday','date_duration_Wednesday','date_duration_Thursday','date_duration_Friday','date_duration_Saturday','date_duration_Sunday',\n", 286 | " 'date_times_Monday','date_times_Tuesday','date_times_Wednesday','date_times_Thursday','date_times_Friday','date_times_Saturday','date_times_Sunday']\n", 287 | "data = data.merge(tmp, on='uId', how='left')\n", 288 | "del user_app_usage_dt, user_app_usage_res, user_app_usage_last, tmp\n", 289 | "gc.collect()" 290 | ] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": 10, 295 | "metadata": {}, 296 | "outputs": [ 297 | { 298 | "name": "stdout", 299 | "output_type": "stream", 300 | "text": [ 301 | "CPU times: user 2min 25s, sys: 41.7 s, total: 3min 7s\n", 
302 | "Wall time: 3min 7s\n" 303 | ] 304 | } 305 | ], 306 | "source": [ 307 | "%%time\n", 308 | "#usage表中统计一个月三十天每天的使用次数和使用时长\n", 309 | "user_app_usage_dt = pd.DataFrame(dist)\n", 310 | "user_app_usage_dt.columns=['uId','use_date','date_duration','date_times']\n", 311 | "user_app_usage_dt = user_app_usage_dt.groupby(['uId','use_date'])['date_duration','date_times'].sum().reset_index()\n", 312 | "duration_per_day_dt = user_app_usage_dt.pivot(index='uId',columns='use_date',values='date_duration')\n", 313 | "times_per_day_dt = user_app_usage_dt.pivot(index='uId',columns='use_date',values='date_times')\n", 314 | "user_app_usage_dt = pd.concat([duration_per_day_dt,times_per_day_dt],axis=1).reset_index()\n", 315 | "user_app_usage_dt.columns=['total_feat_{}'.format(i) for i in range(61)]\n", 316 | "user_app_usage_dt = user_app_usage_dt.rename(columns={'total_feat_0':'uId'})\n", 317 | "data = data.merge(user_app_usage_dt,on='uId',how='left') \n", 318 | "del dist, user_app_usage_dt, duration_per_day_dt, times_per_day_dt\n", 319 | "gc.collect()" 320 | ] 321 | }, 322 | { 323 | "cell_type": "code", 324 | "execution_count": 11, 325 | "metadata": {}, 326 | "outputs": [ 327 | { 328 | "name": "stdout", 329 | "output_type": "stream", 330 | "text": [ 331 | "CPU times: user 4min 27s, sys: 1min 27s, total: 5min 54s\n", 332 | "Wall time: 5min 54s\n" 333 | ] 334 | } 335 | ], 336 | "source": [ 337 | "%%time\n", 338 | "#对usage表统计各个app种类的总时长、次数和天数\n", 339 | "user_app_usage_dt = pd.DataFrame(dist1)\n", 340 | "user_app_usage_dt.columns=['uId','appId','date_duration','date_times','use_date']\n", 341 | "user_app_usage = user_app_usage_dt.groupby(['uId','appId']).agg({'date_duration': 'sum','date_times': 'sum', 'use_date': 'count'}).reset_index()\n", 342 | "user_app_usage.columns=['uId','appId','total_duration','total_times','used_days']\n", 343 | "app_info = pd.read_csv('../app_info.csv', header = None, names = ['appId','category'])\n", 344 | "app_data = 
user_app_usage.merge(app_info,on='appId',how='left')\n", 345 | "tmp = app_data.groupby(['uId','category'])['total_duration'].sum().reset_index()\n", 346 | "tmp0 = tmp.pivot(index='uId',values='total_duration',columns='category')\n", 347 | "tmp0.columns = ['主题个性_total_duration','主题铃声_total_duration', '休闲娱乐_total_duration', '休闲游戏_total_duration', '休闲益智_total_duration', '体育射击_total_duration', '体育竞速_total_duration', '便捷生活_total_duration', '儿童_total_duration', '出行导航_total_duration',\n", 348 | " '动作冒险_total_duration', '动作射击_total_duration', '医疗健康_total_duration', '合作壁纸*_total_duration', '商务_total_duration', '图书阅读_total_duration', '学习办公_total_duration', '实用工具_total_duration', '影音娱乐_total_duration',\n", 349 | " '拍摄美化_total_duration', '教育_total_duration', '新闻阅读_total_duration', '旅游住宿_total_duration', '棋牌天地_total_duration', '棋牌桌游_total_duration', '模拟游戏_total_duration', '汽车_total_duration', '电子书籍_total_duration',\n", 350 | " '益智棋牌_total_duration', '社交通讯_total_duration', '策略游戏_total_duration', '经营策略_total_duration', '网络游戏_total_duration', '美食_total_duration', '表盘个性_total_duration', '角色扮演_total_duration', '角色游戏_total_duration',\n", 351 | " '购物比价_total_duration', '运动健康_total_duration', '金融理财_total_duration']\n", 352 | "tmp = app_data.groupby(['uId','category'])['total_times'].sum().reset_index()\n", 353 | "tmp1 = tmp.pivot(index='uId',values='total_times',columns='category')\n", 354 | "tmp1.columns = ['主题个性_total_times','主题铃声_total_times', '休闲娱乐_total_times', '休闲游戏_total_times', '休闲益智_total_times', '体育射击_total_times', '体育竞速_total_times', '便捷生活_total_times', '儿童_total_times', '出行导航_total_times',\n", 355 | " '动作冒险_total_times', '动作射击_total_times', '医疗健康_total_times', '合作壁纸*_total_times', '商务_total_times', '图书阅读_total_times', '学习办公_total_times', '实用工具_total_times', '影音娱乐_total_times',\n", 356 | " '拍摄美化_total_times', '教育_total_times', '新闻阅读_total_times', '旅游住宿_total_times', '棋牌天地_total_times', '棋牌桌游_total_times', '模拟游戏_total_times', '汽车_total_times', '电子书籍_total_times',\n", 357 | " 
'益智棋牌_total_times', '社交通讯_total_times', '策略游戏_total_times', '经营策略_total_times', '网络游戏_total_times', '美食_total_times', '表盘个性_total_times', '角色扮演_total_times', '角色游戏_total_times',\n", 358 | " '购物比价_total_times', '运动健康_total_times', '金融理财_total_times']\n", 359 | "tmp = app_data.groupby(['uId','category'])['used_days'].sum().reset_index()\n", 360 | "tmp2 = tmp.pivot(index='uId',values='used_days',columns='category')\n", 361 | "tmp2.columns = ['主题个性_used_days','主题铃声_used_days','休闲娱乐_used_days', '休闲游戏_used_days', '休闲益智_used_days', '体育射击_used_days', '体育竞速_used_days', '便捷生活_used_days', '儿童_used_days', '出行导航_used_days',\n", 362 | " '动作冒险_used_days', '动作射击_used_days', '医疗健康_used_days', '合作壁纸*_used_days', '商务_used_days', '图书阅读_used_days', '学习办公_used_days', '实用工具_used_days', '影音娱乐_used_days',\n", 363 | " '拍摄美化_used_days', '教育_used_days', '新闻阅读_used_days', '旅游住宿_used_days', '棋牌天地_used_days', '棋牌桌游_used_days', '模拟游戏_used_days', '汽车_used_days', '电子书籍_used_days',\n", 364 | " '益智棋牌_used_days', '社交通讯_used_days', '策略游戏_used_days', '经营策略_used_days', '网络游戏_used_days', '美食_used_days', '表盘个性_used_days', '角色扮演_used_days', '角色游戏_used_days',\n", 365 | " '购物比价_used_days', '运动健康_used_days', '金融理财_used_days']\n", 366 | "sum_usage = pd.concat([tmp0,tmp1,tmp2],axis=1).reset_index()\n", 367 | "data = data.merge(sum_usage,on='uId',how='left') \n", 368 | "del user_app_usage_dt, user_app_usage, app_info, app_data, tmp, tmp0, tmp1, tmp2, sum_usage\n", 369 | "gc.collect()" 370 | ] 371 | }, 372 | { 373 | "cell_type": "code", 374 | "execution_count": 12, 375 | "metadata": { 376 | "scrolled": true 377 | }, 378 | "outputs": [ 379 | { 380 | "name": "stdout", 381 | "output_type": "stream", 382 | "text": [ 383 | "CPU times: user 2min 14s, sys: 48 s, total: 3min 2s\n", 384 | "Wall time: 3min 1s\n" 385 | ] 386 | } 387 | ], 388 | "source": [ 389 | "%%time\n", 390 | "#统计usage表中30天每天的使用app总数\n", 391 | "user_app_usage_dt = pd.DataFrame(dist2)\n", 392 | "user_app_usage_dt.columns=['uId','use_date','app_count']\n", 
393 | "user_app_usage_dt = user_app_usage_dt.groupby(['uId','use_date'])['app_count'].sum().reset_index()\n", 394 | "user_app_usage_dt = user_app_usage_dt.pivot(index='uId',columns='use_date',values='app_count').reset_index()\n", 395 | "user_app_usage_dt.columns=['total_app_count_{}'.format(i) for i in range(31)]\n", 396 | "user_app_usage_dt = user_app_usage_dt.rename(columns={'total_app_count_0':'uId'})\n", 397 | "data = data.merge(user_app_usage_dt,on='uId',how='left') \n", 398 | "del dist2, user_app_usage_dt\n", 399 | "gc.collect()" 400 | ] 401 | }, 402 | { 403 | "cell_type": "markdown", 404 | "metadata": {}, 405 | "source": [ 406 | "## MLP降维" 407 | ] 408 | }, 409 | { 410 | "cell_type": "code", 411 | "execution_count": 13, 412 | "metadata": {}, 413 | "outputs": [ 414 | { 415 | "name": "stdout", 416 | "output_type": "stream", 417 | "text": [ 418 | "CPU times: user 5min 34s, sys: 43.1 s, total: 6min 18s\n", 419 | "Wall time: 6min 17s\n" 420 | ] 421 | } 422 | ], 423 | "source": [ 424 | "%%time\n", 425 | "#从usage表中提取top5000duration的appId\n", 426 | "user_app_usage_dt = pd.DataFrame(dist1)\n", 427 | "user_app_usage_dt.columns=['uId','appId','date_duration','date_times','use_date']\n", 428 | "user_app_stat = user_app_usage_dt.groupby(['uId','appId']).agg({'date_duration': 'sum','date_times': 'sum', 'use_date': 'count'}).reset_index()\n", 429 | "user_app_stat.columns=['uId','appId','total_duration','total_times','used_days']\n", 430 | "train_age = pd.read_csv('../age_train.csv', names=['uId', 'age'])\n", 431 | "train_age_stat = train_age.merge(user_app_stat, how='left', on='uId')\n", 432 | "del train_age_stat['uId']\n", 433 | "train_age_stat = train_age_stat.groupby(['age', 'appId']).sum().reset_index()\n", 434 | "train_age_stat.sort_values(by=['age', 'total_duration'], inplace=True, ascending=False)\n", 435 | "train_age_stat_top5000 = train_age_stat.groupby('age').head(5000).reset_index()\n", 436 | "top5000app = train_age_stat_top5000['appId'].unique()\n", 437 | 
"user_app_5000 = user_app_stat[user_app_stat['appId'].isin(top5000app)][['uId', 'appId']]\n", 438 | "def concat(ser):\n", 439 | " return '#'.join([a for a in ser])\n", 440 | "app_usage5000 = user_app_5000.groupby('uId')['appId'].apply(concat).reset_index()\n", 441 | "#app_usage5000.to_hdf('../feature/app_usage5000.h5', key='data') #给MLP降维文件用" 442 | ] 443 | }, 444 | { 445 | "cell_type": "code", 446 | "execution_count": 14, 447 | "metadata": {}, 448 | "outputs": [ 449 | { 450 | "name": "stdout", 451 | "output_type": "stream", 452 | "text": [ 453 | "CPU times: user 39.5 s, sys: 46.6 s, total: 1min 26s\n", 454 | "Wall time: 1min 27s\n" 455 | ] 456 | } 457 | ], 458 | "source": [ 459 | "%%time\n", 460 | "user_app_actived = pd.read_csv('../user_app_actived.csv', header = None, names =['uId','appId'])\n", 461 | "app_usage5000.columns=['uId', 'usage_appId']\n", 462 | "data = data.merge(user_app_actived, on='uId', how='left')\n", 463 | "data = data.merge(app_usage5000, on='uId', how='left')\n", 464 | "del user_app_actived, app_usage5000\n", 465 | "gc.collect()" 466 | ] 467 | }, 468 | { 469 | "cell_type": "code", 470 | "execution_count": 15, 471 | "metadata": {}, 472 | "outputs": [ 473 | { 474 | "name": "stdout", 475 | "output_type": "stream", 476 | "text": [ 477 | "CPU times: user 1min 27s, sys: 20.3 s, total: 1min 47s\n", 478 | "Wall time: 1min 38s\n" 479 | ] 480 | } 481 | ], 482 | "source": [ 483 | "%%time\n", 484 | "#取actived appID对应的MLP中间层特征\n", 485 | "actived_app = CountVectorizer(token_pattern='a\\d+',binary=True).fit_transform(data['appId'])\n", 486 | "weight = np.load('./weight_bias/Xapp_weight1_dense1.npy')\n", 487 | "bias = np.load('./weight_bias/Xapp_bias1_dense1.npy')\n", 488 | "actived_app_I = actived_app.dot(weight)+bias\n", 489 | "weight = np.load('./weight_bias/Xapp_weight1_dense2.npy')\n", 490 | "bias = np.load('./weight_bias/Xapp_bias1_dense2.npy')\n", 491 | "actived_app_II = actived_app_I.dot(weight)+bias\n", 492 | "weight = 
np.load('./weight_bias/Xapp_weight1_dense3.npy')\n", 493 | "bias = np.load('./weight_bias/Xapp_bias1_dense3.npy')\n", 494 | "actived_app_III = actived_app_II.dot(weight)+bias\n", 495 | "active_app_df_I = pd.DataFrame(actived_app_I, columns=['actived_app_{}'.format(i) for i in range(actived_app_I.shape[1])])\n", 496 | "active_app_df_II = pd.DataFrame(actived_app_II, columns=['actived_app_II_{}'.format(i) for i in range(actived_app_II.shape[1])])\n", 497 | "active_app_df_III = pd.DataFrame(actived_app_III, columns=['actived_app_III_{}'.format(i) for i in range(actived_app_III.shape[1])])\n", 498 | "data = pd.concat([data,active_app_df_I,active_app_df_II,active_app_df_III],axis=1)\n", 499 | "del actived_app, actived_app_I, actived_app_II, actived_app_III\n", 500 | "del weight, bias\n", 501 | "del active_app_df_I, active_app_df_II, active_app_df_III\n", 502 | "gc.collect()" 503 | ] 504 | }, 505 | { 506 | "cell_type": "code", 507 | "execution_count": 16, 508 | "metadata": {}, 509 | "outputs": [ 510 | { 511 | "name": "stdout", 512 | "output_type": "stream", 513 | "text": [ 514 | "CPU times: user 59.9 s, sys: 18.5 s, total: 1min 18s\n", 515 | "Wall time: 1min 12s\n" 516 | ] 517 | } 518 | ], 519 | "source": [ 520 | "%%time\n", 521 | "#取usage appID对应的MLP中间层特征\n", 522 | "X_usage = CountVectorizer(token_pattern='a\\d+', binary=True).fit_transform(data['usage_appId'].fillna('-1'))\n", 523 | "X_usage_weight = np.load('./weight_bias/X_usage_weight1_dense1.npy')\n", 524 | "X_usage_bias = np.load('./weight_bias/X_usage_bias1_dense1.npy')\n", 525 | "X_usage_I = X_usage.dot(X_usage_weight) + X_usage_bias\n", 526 | "X_usage_weight = np.load('./weight_bias/X_usage_weight1_dense2.npy')\n", 527 | "X_usage_bias = np.load('./weight_bias/X_usage_bias1_dense2.npy')\n", 528 | "X_usage_II = X_usage_I.dot(X_usage_weight) + X_usage_bias\n", 529 | "X_usage_weight = np.load('./weight_bias/X_usage_weight1_dense3.npy')\n", 530 | "X_usage_bias = np.load('./weight_bias/X_usage_bias1_dense3.npy')\n", 
531 | "X_usage_III = X_usage_II.dot(X_usage_weight) + X_usage_bias\n", 532 | "X_usage_df_I = pd.DataFrame(X_usage_I, columns=['X_usage_I_{}'.format(i) for i in range(X_usage_I.shape[1])])\n", 533 | "X_usage_df_II = pd.DataFrame(X_usage_II, columns=['X_usage_II_{}'.format(i) for i in range(X_usage_II.shape[1])])\n", 534 | "X_usage_df_III = pd.DataFrame(X_usage_III, columns=['X_usage_III_{}'.format(i) for i in range(X_usage_III.shape[1])])\n", 535 | "data = pd.concat([data, X_usage_df_I, X_usage_df_II, X_usage_df_III], axis=1)\n", 536 | "del X_usage, X_usage_I, X_usage_II, X_usage_III\n", 537 | "del X_usage_weight, X_usage_bias\n", 538 | "del X_usage_df_I, X_usage_df_II, X_usage_df_III\n", 539 | "gc.collect()" 540 | ] 541 | }, 542 | { 543 | "cell_type": "code", 544 | "execution_count": 17, 545 | "metadata": {}, 546 | "outputs": [ 547 | { 548 | "name": "stdout", 549 | "output_type": "stream", 550 | "text": [ 551 | "CPU times: user 39min 9s, sys: 6min 7s, total: 45min 17s\n", 552 | "Wall time: 20min 7s\n" 553 | ] 554 | } 555 | ], 556 | "source": [ 557 | "%%time\n", 558 | "#usage表中app的所有duration time days作svd降维\n", 559 | "user_app_stat = pd.DataFrame(dist1)\n", 560 | "user_app_stat.columns=['uId','appId','date_duration','date_times','use_date']\n", 561 | "user_app_stat = user_app_stat.groupby(['uId','appId']).agg({'date_duration': 'sum','date_times': 'sum', 'use_date': 'count'}).reset_index()\n", 562 | "user_app_stat.columns=['uId','appId','total_duration','total_times','used_days']\n", 563 | "\n", 564 | "user_app_actived = pd.read_csv('../user_app_actived.csv', names=['uId', 'appId'])\n", 565 | "cnt_vec = CountVectorizer(token_pattern='a\\d+', binary=True).fit(user_app_actived['appId'])\n", 566 | "actived_app = list(cnt_vec.vocabulary_.keys())\n", 567 | "user_actived_app_stat = user_app_stat[user_app_stat['appId'].isin(actived_app)]\n", 568 | "\n", 569 | "age_train = pd.read_csv('../age_train.csv', names=['uId', 'age_group'])\n", 570 | "age_test = 
pd.read_csv('../age_test.csv', names=['uId'])\n", 571 | "\n", 572 | "all_uId = pd.concat([age_train, age_test], sort=True)\n", 573 | "all_uId = all_uId[['uId']]\n", 574 | "all_uId['idx'] = np.arange(len(all_uId))\n", 575 | "\n", 576 | "all_user_actived_app_stat = all_uId.merge(user_actived_app_stat, 'left', 'uId').fillna(-1)\n", 577 | "\n", 578 | "all_user_actived_app_stat['appId_lbl'] = LabelEncoder().fit_transform(all_user_actived_app_stat['appId'].astype(str))\n", 579 | "\n", 580 | "shape = (len(all_uId), all_user_actived_app_stat['appId_lbl'].nunique())\n", 581 | "\n", 582 | "X_duration = sparse.csr_matrix((all_user_actived_app_stat['total_duration'].astype(int),\n", 583 | " (all_user_actived_app_stat['idx'], all_user_actived_app_stat['appId_lbl'])), shape=shape)\n", 584 | "\n", 585 | "X_times = sparse.csr_matrix((all_user_actived_app_stat['total_times'].astype(int),\n", 586 | " (all_user_actived_app_stat['idx'], all_user_actived_app_stat['appId_lbl'])), shape=shape) \n", 587 | "X_days = sparse.csr_matrix((all_user_actived_app_stat['used_days'].astype(int),\n", 588 | " (all_user_actived_app_stat['idx'], all_user_actived_app_stat['appId_lbl'])), shape=shape)\n", 589 | "\n", 590 | "X_duration_svd = TruncatedSVD(n_components=30, n_iter=20, random_state=47).fit_transform(X_duration)\n", 591 | "X_times_svd = TruncatedSVD(n_components=30, n_iter=20, random_state=47).fit_transform(X_times)\n", 592 | "X_days_svd = TruncatedSVD(n_components=30, n_iter=20, random_state=47).fit_transform(X_days)\n", 593 | "\n", 594 | "X_duration_svd_I = pd.DataFrame(X_duration_svd, columns=['x_duration_{}'.format(i) for i in range(30)])\n", 595 | "X_times_svd_I = pd.DataFrame(X_times_svd, columns=['x_times_{}'.format(i) for i in range(30)])\n", 596 | "X_days_svd_I = pd.DataFrame(X_days_svd, columns=['x_days_{}'.format(i) for i in range(30)])\n", 597 | "\n", 598 | "X_duration_svd_I['uId'] = all_uId['uId'].values\n", 599 | "X_times_svd_I['uId'] = all_uId['uId'].values\n", 600 | 
"X_days_svd_I['uId'] = all_uId['uId'].values\n", 601 | "\n", 602 | "data = data.merge(X_duration_svd_I, how='left', on='uId')\n", 603 | "data = data.merge(X_times_svd_I, how='left', on='uId')\n", 604 | "data = data.merge(X_days_svd_I, how='left', on='uId')\n", 605 | "\n", 606 | "del dist1, X_duration, X_times, X_days, X_duration_svd, X_times_svd, X_days_svd\n", 607 | "del user_actived_app_stat, user_app_actived, user_app_stat, age_train, age_test, all_uId, all_user_actived_app_stat\n", 608 | "gc.collect()" 609 | ] 610 | }, 611 | { 612 | "cell_type": "markdown", 613 | "metadata": {}, 614 | "source": [ 615 | "# test zone" 616 | ] 617 | }, 618 | { 619 | "cell_type": "code", 620 | "execution_count": 18, 621 | "metadata": {}, 622 | "outputs": [], 623 | "source": [ 624 | "data['user_actived_app_count'] = data['appId'].apply(lambda x: len(x.split('#')))" 625 | ] 626 | }, 627 | { 628 | "cell_type": "code", 629 | "execution_count": 2, 630 | "metadata": { 631 | "scrolled": true 632 | }, 633 | "outputs": [], 634 | "source": [ 635 | "data = pd.read_hdf('./data_all.hdf',key='data')" 636 | ] 637 | }, 638 | { 639 | "cell_type": "code", 640 | "execution_count": 19, 641 | "metadata": {}, 642 | "outputs": [], 643 | "source": [ 644 | "week_day_conut_feature = ['date_duration_Monday','date_duration_Tuesday','date_duration_Wednesday','date_duration_Thursday','date_duration_Friday','date_duration_Saturday','date_duration_Sunday']\n", 645 | "data['date_duration_week_max'] = data[week_day_conut_feature].max(axis=1)\n", 646 | "data['date_duration_week_min'] = data[week_day_conut_feature].min(axis=1)\n", 647 | "data['date_duration_week_sum'] = data[week_day_conut_feature].sum(axis=1)\n", 648 | "data['date_duration_week_std'] = data[week_day_conut_feature].std(axis=1)" 649 | ] 650 | }, 651 | { 652 | "cell_type": "code", 653 | "execution_count": 20, 654 | "metadata": {}, 655 | "outputs": [], 656 | "source": [ 657 | "week_day_conut_feature = 
['date_times_Monday','date_times_Tuesday','date_times_Wednesday','date_times_Thursday','date_times_Friday','date_times_Saturday','date_times_Sunday']\n", 658 | "data['date_times_week_max'] = data[week_day_conut_feature].max(axis=1)\n", 659 | "data['date_times_week_min'] = data[week_day_conut_feature].min(axis=1)\n", 660 | "data['date_times_week_sum'] = data[week_day_conut_feature].sum(axis=1)\n", 661 | "data['date_times_week_std'] = data[week_day_conut_feature].std(axis=1)" 662 | ] 663 | }, 664 | { 665 | "cell_type": "code", 666 | "execution_count": 18, 667 | "metadata": {}, 668 | "outputs": [], 669 | "source": [ 670 | "tmp = data['appId'].apply(lambda x : x.split('#'))" 671 | ] 672 | }, 673 | { 674 | "cell_type": "code", 675 | "execution_count": 24, 676 | "metadata": {}, 677 | "outputs": [], 678 | "source": [ 679 | "tmp = tmp.reset_index()" 680 | ] 681 | }, 682 | { 683 | "cell_type": "code", 684 | "execution_count": null, 685 | "metadata": { 686 | "scrolled": true 687 | }, 688 | "outputs": [ 689 | { 690 | "data": { 691 | "application/vnd.jupyter.widget-view+json": { 692 | "model_id": "fe01db272c5a452fb6f37f7f7bb3348d", 693 | "version_major": 2, 694 | "version_minor": 0 695 | }, 696 | "text/plain": [ 697 | "HBox(children=(IntProgress(value=0, max=2512500), HTML(value='')))" 698 | ] 699 | }, 700 | "metadata": {}, 701 | "output_type": "display_data" 702 | } 703 | ], 704 | "source": [ 705 | "dist = []\n", 706 | "for i in tnrange(tmp.shape[0]):\n", 707 | " temp = pd.DataFrame(tmp.appId[i])\n", 708 | " temp['uId'] = tmp.uId[i]\n", 709 | " dist.extend(temp.values)" 710 | ] 711 | }, 712 | { 713 | "cell_type": "code", 714 | "execution_count": 115, 715 | "metadata": { 716 | "scrolled": true 717 | }, 718 | "outputs": [], 719 | "source": [ 720 | "actived_app_df = pd.DataFrame(dist, columns=['appId','uId'])" 721 | ] 722 | }, 723 | { 724 | "cell_type": "code", 725 | "execution_count": 116, 726 | "metadata": {}, 727 | "outputs": [], 728 | "source": [ 729 | "actived_app_df = 
actived_app_df.merge(app_info, on='appId', how='left')" 730 | ] 731 | }, 732 | { 733 | "cell_type": "code", 734 | "execution_count": 117, 735 | "metadata": {}, 736 | "outputs": [], 737 | "source": [ 738 | "actived_app_df = actived_app_df.fillna('未知')" 739 | ] 740 | }, 741 | { 742 | "cell_type": "code", 743 | "execution_count": 11, 744 | "metadata": {}, 745 | "outputs": [], 746 | "source": [ 747 | "actived_app_df_group_uid = actived_app_df.groupby(['uId','category'])['appId'].count().reset_index()" 748 | ] 749 | }, 750 | { 751 | "cell_type": "code", 752 | "execution_count": 23, 753 | "metadata": { 754 | "scrolled": true 755 | }, 756 | "outputs": [], 757 | "source": [ 758 | "actived_app_df_group_uid.appId = 1" 759 | ] 760 | }, 761 | { 762 | "cell_type": "code", 763 | "execution_count": 12, 764 | "metadata": { 765 | "scrolled": true 766 | }, 767 | "outputs": [], 768 | "source": [ 769 | "tmp = actived_app_df_group_uid.pivot(index='uId',values=['appId'],columns='category')" 770 | ] 771 | }, 772 | { 773 | "cell_type": "code", 774 | "execution_count": 13, 775 | "metadata": {}, 776 | "outputs": [], 777 | "source": [ 778 | "tmp = tmp.reset_index()" 779 | ] 780 | }, 781 | { 782 | "cell_type": "code", 783 | "execution_count": 26, 784 | "metadata": {}, 785 | "outputs": [], 786 | "source": [ 787 | "tmp.columns=['uId','主题个性', '休闲游戏', '休闲益智', '体育竞速', '便捷生活', '儿童', '出行导航', '动作冒险', '动作射击', '商务', '图书阅读', '学习办公', '实用工具', '影音娱乐', '拍摄美化', '教育', '新闻阅读', '旅游住宿', '未知', '棋牌天地', '棋牌桌游', '汽车', '益智棋牌', '社交通讯', '经营策略', '网络游戏', '美食', '表盘个性', '角色扮演', '购物比价', '运动健康', '金融理财']" 788 | ] 789 | }, 790 | { 791 | "cell_type": "code", 792 | "execution_count": 14, 793 | "metadata": {}, 794 | "outputs": [], 795 | "source": [ 796 | "tmp.columns=['uId','主题个性s', '休闲游戏s', '休闲益智s', '体育竞速s', '便捷生活s', '儿童s', '出行导航s', '动作冒险s', '动作射击s', '商务s', '图书阅读s', '学习办公s', '实用工具s', '影音娱乐s', '拍摄美化s', '教育s', '新闻阅读s', '旅游住宿s', '未知s', '棋牌天地s', '棋牌桌游s', '汽车s', '益智棋牌s', '社交通讯s', '经营策略s', '网络游戏s', '美食s', '表盘个性s', '角色扮演s', '购物比价s', 
'运动健康s', '金融理财s']" 797 | ] 798 | }, 799 | { 800 | "cell_type": "code", 801 | "execution_count": 16, 802 | "metadata": {}, 803 | "outputs": [], 804 | "source": [ 805 | "tmp = tmp.fillna(0)" 806 | ] 807 | }, 808 | { 809 | "cell_type": "code", 810 | "execution_count": 18, 811 | "metadata": {}, 812 | "outputs": [], 813 | "source": [ 814 | "data = data.merge(tmp, on='uId', how='left')" 815 | ] 816 | }, 817 | { 818 | "cell_type": "code", 819 | "execution_count": 68, 820 | "metadata": {}, 821 | "outputs": [], 822 | "source": [ 823 | "app_info = pd.read_csv('../app_info.csv', names=['appId', 'category'])" 824 | ] 825 | }, 826 | { 827 | "cell_type": "code", 828 | "execution_count": 40, 829 | "metadata": { 830 | "scrolled": true 831 | }, 832 | "outputs": [], 833 | "source": [ 834 | "test_cate = app_info[app_info['category']=='拍摄美化'].uId.values" 835 | ] 836 | }, 837 | { 838 | "cell_type": "code", 839 | "execution_count": 10, 840 | "metadata": { 841 | "scrolled": true 842 | }, 843 | "outputs": [], 844 | "source": [ 845 | "actived_app_df = pd.read_hdf('./feature/actived_app_cate.hdf',key='data')" 846 | ] 847 | }, 848 | { 849 | "cell_type": "markdown", 850 | "metadata": {}, 851 | "source": [ 852 | "# 模型训练" 853 | ] 854 | }, 855 | { 856 | "cell_type": "code", 857 | "execution_count": 29, 858 | "metadata": {}, 859 | "outputs": [ 860 | { 861 | "data": { 862 | "application/vnd.jupyter.widget-view+json": { 863 | "model_id": "32d77663a5f244edbb6d421572e1fe0f", 864 | "version_major": 2, 865 | "version_minor": 0 866 | }, 867 | "text/plain": [ 868 | "HBox(children=(IntProgress(value=0, max=11), HTML(value='')))" 869 | ] 870 | }, 871 | "metadata": {}, 872 | "output_type": "display_data" 873 | }, 874 | { 875 | "name": "stdout", 876 | "output_type": "stream", 877 | "text": [ 878 | "\n" 879 | ] 880 | } 881 | ], 882 | "source": [ 883 | "cate_features = ['city','prodName','color','ct','rom_category','color_short','warm_cold','carrier','gender','fontSize','os']\n", 884 | "for feat in 
tqdm_notebook(cate_features):\n", 885 | " data[feat] = LabelEncoder().fit_transform(data[feat].fillna('-1').apply(str))" 886 | ] 887 | }, 888 | { 889 | "cell_type": "code", 890 | "execution_count": 19, 891 | "metadata": {}, 892 | "outputs": [ 893 | { 894 | "name": "stdout", 895 | "output_type": "stream", 896 | "text": [ 897 | "All features: train shape (2010000, 532), test shape (502500, 532)\n", 898 | "532\n", 899 | "CPU times: user 10.1 s, sys: 21.5 s, total: 31.5 s\n", 900 | "Wall time: 31.5 s\n" 901 | ] 902 | } 903 | ], 904 | "source": [ 905 | "%%time\n", 906 | "\n", 907 | "origin_num_feature = ['ramLeftCapacity','romLeftCapacity','city','prodName','color','ct','color_short','warm_cold',\n", 908 | " 'carrier','gender','ramCapacity','ramLeftRation','romCapacity','romLeftRation','fontSize',\n", 909 | " 'os','bootTimes','AFuncTimes','BFuncTimes','CFuncTimes','DFuncTimes','EFuncTimes','FFuncTimes',\n", 910 | " 'FFuncSum']\n", 911 | "\n", 912 | "rom_ram_feature = ['rom_category', 'romCapacity_max', 'romCapacity_min', 'rom_max_ratio', 'rom_min_ratio',\n", 913 | " 'ramCapacity_max','ramCapacity_min','ram_max_ratio','ram_min_ratio']\n", 914 | "\n", 915 | "X_app_I = ['actived_app_{}'.format(i) for i in range(32)]\n", 916 | "X_app_II = ['actived_app_II_{}'.format(i) for i in range(16)]\n", 917 | "X_app_III = ['actived_app_III_{}'.format(i) for i in range(8)]\n", 918 | "X_app = X_app_I + X_app_II + X_app_III\n", 919 | "\n", 920 | "X_usage_I = ['X_usage_I_{}'.format(i) for i in range(32)]\n", 921 | "X_usage_II = ['X_usage_II_{}'.format(i) for i in range(16)]\n", 922 | "X_usage_III = ['X_usage_III_{}'.format(i) for i in range(8)]\n", 923 | "X_usage = X_usage_I + X_usage_II + X_usage_III\n", 924 | "\n", 925 | "X_duration_svd = ['x_duration_{}'.format(i) for i in range(30)]\n", 926 | "X_times_svd = ['x_times_{}'.format(i) for i in range(30)]\n", 927 | "X_days_svd = ['x_days_{}'.format(i) for i in range(30)]\n", 928 | "\n", 929 | "used_days_feature = 
['主题个性_used_days','主题铃声_used_days', '休闲娱乐_used_days', '休闲游戏_used_days', '休闲益智_used_days', '体育射击_used_days', '体育竞速_used_days', '便捷生活_used_days', '儿童_used_days', '出行导航_used_days', '动作冒险_used_days', '动作射击_used_days', '医疗健康_used_days', '合作壁纸*_used_days', '商务_used_days', '图书阅读_used_days', '学习办公_used_days', '实用工具_used_days', '影音娱乐_used_days','拍摄美化_used_days', '教育_used_days', '新闻阅读_used_days', '旅游住宿_used_days', '棋牌天地_used_days', '棋牌桌游_used_days', '模拟游戏_used_days', '汽车_used_days', '电子书籍_used_days','益智棋牌_used_days', '社交通讯_used_days', '策略游戏_used_days', '经营策略_used_days', '网络游戏_used_days', '美食_used_days', '表盘个性_used_days', '角色扮演_used_days', '角色游戏_used_days','购物比价_used_days', '运动健康_used_days', '金融理财_used_days']\n", 930 | "total_times_feature = ['主题个性_total_times', '主题铃声_total_times','休闲娱乐_total_times', '休闲游戏_total_times', '休闲益智_total_times', '体育射击_total_times', '体育竞速_total_times', '便捷生活_total_times', '儿童_total_times', '出行导航_total_times','动作冒险_total_times', '动作射击_total_times', '医疗健康_total_times', '合作壁纸*_total_times', '商务_total_times', '图书阅读_total_times', '学习办公_total_times', '实用工具_total_times', '影音娱乐_total_times', '拍摄美化_total_times', '教育_total_times', '新闻阅读_total_times', '旅游住宿_total_times', '棋牌天地_total_times', '棋牌桌游_total_times', '模拟游戏_total_times', '汽车_total_times', '电子书籍_total_times', '益智棋牌_total_times', '社交通讯_total_times', '策略游戏_total_times', '经营策略_total_times', '网络游戏_total_times', '美食_total_times', '表盘个性_total_times', '角色扮演_total_times', '角色游戏_total_times','购物比价_total_times', '运动健康_total_times', '金融理财_total_times']\n", 931 | "total_duration_feature = ['主题个性_total_duration','主题铃声_total_duration', '休闲娱乐_total_duration', '休闲游戏_total_duration', '休闲益智_total_duration', '体育射击_total_duration', '体育竞速_total_duration', '便捷生活_total_duration', '儿童_total_duration', '出行导航_total_duration', '动作冒险_total_duration', '动作射击_total_duration', '医疗健康_total_duration', '合作壁纸*_total_duration', '商务_total_duration', '图书阅读_total_duration', '学习办公_total_duration', '实用工具_total_duration', 
'影音娱乐_total_duration','拍摄美化_total_duration', '教育_total_duration', '新闻阅读_total_duration', '旅游住宿_total_duration', '棋牌天地_total_duration', '棋牌桌游_total_duration', '模拟游戏_total_duration', '汽车_total_duration', '电子书籍_total_duration','益智棋牌_total_duration', '社交通讯_total_duration', '策略游戏_total_duration', '经营策略_total_duration', '网络游戏_total_duration', '美食_total_duration', '表盘个性_total_duration', '角色扮演_total_duration', '角色游戏_total_duration','购物比价_total_duration', '运动健康_total_duration', '金融理财_total_duration']\n", 932 | "week_day_conut_feature = ['date_duration_Monday','date_duration_Tuesday','date_duration_Wednesday','date_duration_Thursday','date_duration_Friday','date_duration_Saturday','date_duration_Sunday',\n", 933 | " 'date_times_Monday','date_times_Tuesday','date_times_Wednesday','date_times_Thursday','date_times_Friday','date_times_Saturday','date_times_Sunday']\n", 934 | "\n", 935 | "weekday_times_feat = ['weekday_0_total_times', 'weekday_1_total_times', 'weekday_2_total_times','weekday_3_total_times', 'weekday_4_total_times', 'weekday_5_total_times', 'weekday_6_total_times']\n", 936 | "total_app_count_feat = ['total_app_count_{}'.format(i) for i in range(1,31)]\n", 937 | "total_perday_feat = [ 'total_feat_{}'.format(i) for i in range(1,61)]\n", 938 | "#******* Feature test***********#\n", 939 | "test_feat = ['user_actived_app_count','date_duration_week_max','date_duration_week_min','date_duration_week_sum',\n", 940 | " 'date_duration_week_std','date_times_week_max','date_times_week_min','date_times_week_sum','date_times_week_std']\n", 941 | "actived_cate_haved = ['主题个性', '休闲游戏', '休闲益智', '体育竞速', '便捷生活', '儿童', '出行导航', '动作冒险', '动作射击', '商务', '图书阅读', '学习办公', '实用工具', '影音娱乐', '拍摄美化', '教育', '新闻阅读', '旅游住宿', '未知', '棋牌天地', '棋牌桌游', '汽车', '益智棋牌', '社交通讯', '经营策略', '网络游戏', '美食', '表盘个性', '角色扮演', '购物比价', '运动健康', '金融理财']\n", 942 | "actived_cate_counts=['主题个性s', '休闲游戏s', '休闲益智s', '体育竞速s', '便捷生活s', '儿童s', '出行导航s', '动作冒险s', '动作射击s', '商务s', '图书阅读s', '学习办公s', '实用工具s', '影音娱乐s', '拍摄美化s', '教育s', 
'新闻阅读s', '旅游住宿s', '未知s', '棋牌天地s', '棋牌桌游s', '汽车s', '益智棋牌s', '社交通讯s', '经营策略s', '网络游戏s', '美食s', '表盘个性s', '角色扮演s', '购物比价s', '运动健康s', '金融理财s']\n", 943 | "#******* Feature sum***********#\n", 944 | "feature = origin_num_feature +rom_ram_feature + used_days_feature + week_day_conut_feature + total_times_feature\\\n", 945 | " + total_duration_feature + total_app_count_feat + total_perday_feat +X_app+X_usage\\\n", 946 | " + X_duration_svd + X_times_svd + X_days_svd\\\n", 947 | " + test_feat + actived_cate_haved + actived_cate_counts\n", 948 | "#*********************************# \n", 949 | "\n", 950 | "test_index = np.isnan(data.age_group)\n", 951 | "train_index = ~test_index\n", 952 | "train_x = data[train_index][feature] \n", 953 | "train_y = data[train_index]['age_group']\n", 954 | "test_x = data[test_index][feature]\n", 955 | "\n", 956 | "\n", 957 | "print('All features: train shape {}, test shape {}'.format(train_x.shape, test_x.shape))\n", 958 | "print(len(feature))" 959 | ] 960 | }, 961 | { 962 | "cell_type": "code", 963 | "execution_count": 20, 964 | "metadata": {}, 965 | "outputs": [], 966 | "source": [ 967 | "def label_smoothing(inputs, epsilon=0.1):\n", 968 | " K = 6\n", 969 | " return ((1-epsilon) * inputs) + (epsilon / K)\n", 970 | "\n", 971 | "def label_smoothing_re(inputs, epsilon=0.1):\n", 972 | " K = 6\n", 973 | " return (inputs-epsilon/K)/(1-epsilon)" 974 | ] 975 | }, 976 | { 977 | "cell_type": "code", 978 | "execution_count": 21, 979 | "metadata": {}, 980 | "outputs": [], 981 | "source": [ 982 | "train_y = label_smoothing(train_y)" 983 | ] 984 | }, 985 | { 986 | "cell_type": "code", 987 | "execution_count": null, 988 | "metadata": {}, 989 | "outputs": [ 990 | { 991 | "name": "stdout", 992 | "output_type": "stream", 993 | "text": [ 994 | "CPU times: user 8.38 s, sys: 3.55 s, total: 11.9 s\n", 995 | "Wall time: 11.9 s\n" 996 | ] 997 | } 998 | ], 999 | "source": [ 1000 | "%%time\n", 1001 | "from sklearn.model_selection import train_test_split \n", 1002 | 
"X_train, X_validation, y_train, y_validation = train_test_split(train_x, train_y, test_size=0.02, random_state=42)\n", 1003 | "del train_x, train_y\n", 1004 | "gc.collect()" 1005 | ] 1006 | }, 1007 | { 1008 | "cell_type": "code", 1009 | "execution_count": null, 1010 | "metadata": {}, 1011 | "outputs": [ 1012 | { 1013 | "name": "stdout", 1014 | "output_type": "stream", 1015 | "text": [ 1016 | "CPU times: user 2min 25s, sys: 4.28 s, total: 2min 30s\n", 1017 | "Wall time: 2min 29s\n" 1018 | ] 1019 | } 1020 | ], 1021 | "source": [ 1022 | "%%time\n", 1023 | "cate_features = ['city','prodName','color','ct','rom_category','color_short','carrier','gender','fontSize','os']\n", 1024 | "train_pool = Pool(X_train, y_train, cat_features=cate_features)\n", 1025 | "eval_pool = Pool(X_validation, y_validation,cat_features=cate_features)\n", 1026 | "del X_train, y_train, y_validation\n", 1027 | "gc.collect()" 1028 | ] 1029 | }, 1030 | { 1031 | "cell_type": "code", 1032 | "execution_count": 2, 1033 | "metadata": { 1034 | "scrolled": true 1035 | }, 1036 | "outputs": [], 1037 | "source": [ 1038 | "data = pd.read_hdf('./data_all.hdf',key='data')" 1039 | ] 1040 | }, 1041 | { 1042 | "cell_type": "code", 1043 | "execution_count": 40, 1044 | "metadata": {}, 1045 | "outputs": [], 1046 | "source": [ 1047 | "del data['romCapacity_x']" 1048 | ] 1049 | }, 1050 | { 1051 | "cell_type": "code", 1052 | "execution_count": null, 1053 | "metadata": { 1054 | "scrolled": true 1055 | }, 1056 | "outputs": [ 1057 | { 1058 | "name": "stdout", 1059 | "output_type": "stream", 1060 | "text": [ 1061 | "0:\tlearn: 0.5591918\ttest: 0.5564925\tbest: 0.5564925 (0)\ttotal: 140ms\tremaining: 19h 28m 27s\n", 1062 | "100:\tlearn: 0.6012722\ttest: 0.5979602\tbest: 0.5979602 (100)\ttotal: 11.8s\tremaining: 16h 14m 35s\n", 1063 | "200:\tlearn: 0.6120175\ttest: 0.6087562\tbest: 0.6087562 (200)\ttotal: 23.6s\tremaining: 16h 16m 30s\n", 1064 | "300:\tlearn: 0.6183059\ttest: 0.6136567\tbest: 0.6136567 (300)\ttotal: 
35.1s\tremaining: 16h 11m 32s\n", 1065 | "400:\tlearn: 0.6229729\ttest: 0.6183085\tbest: 0.6183085 (396)\ttotal: 46.7s\tremaining: 16h 10m 35s\n", 1066 | "500:\tlearn: 0.6263534\ttest: 0.6217910\tbest: 0.6217910 (500)\ttotal: 58.5s\tremaining: 16h 11m 25s\n", 1067 | "600:\tlearn: 0.6292685\ttest: 0.6252736\tbest: 0.6252736 (600)\ttotal: 1m 10s\tremaining: 16h 19m 45s\n", 1068 | "700:\tlearn: 0.6316565\ttest: 0.6270896\tbest: 0.6270896 (700)\ttotal: 1m 23s\tremaining: 16h 25m 36s\n", 1069 | "800:\tlearn: 0.6338308\ttest: 0.6291045\tbest: 0.6291045 (800)\ttotal: 1m 35s\tremaining: 16h 29m 53s\n", 1070 | "900:\tlearn: 0.6358123\ttest: 0.6298507\tbest: 0.6300995 (875)\ttotal: 1m 47s\tremaining: 16h 33m 58s\n", 1071 | "1000:\tlearn: 0.6375678\ttest: 0.6310945\tbest: 0.6310945 (994)\ttotal: 2m\tremaining: 16h 37m 31s\n", 1072 | "1100:\tlearn: 0.6391126\ttest: 0.6321642\tbest: 0.6321642 (1100)\ttotal: 2m 12s\tremaining: 16h 40m 33s\n", 1073 | "1200:\tlearn: 0.6405275\ttest: 0.6330597\tbest: 0.6332836 (1193)\ttotal: 2m 24s\tremaining: 16h 41m 13s\n", 1074 | "1300:\tlearn: 0.6419606\ttest: 0.6346269\tbest: 0.6347761 (1296)\ttotal: 2m 37s\tremaining: 16h 44m 7s\n", 1075 | "1400:\tlearn: 0.6431491\ttest: 0.6353483\tbest: 0.6354726 (1398)\ttotal: 2m 49s\tremaining: 16h 45m 28s\n", 1076 | "1500:\tlearn: 0.6442852\ttest: 0.6360448\tbest: 0.6360448 (1500)\ttotal: 3m 1s\tremaining: 16h 47m 7s\n", 1077 | "1600:\tlearn: 0.6453797\ttest: 0.6359701\tbest: 0.6362687 (1566)\ttotal: 3m 14s\tremaining: 16h 48m 26s\n", 1078 | "1700:\tlearn: 0.6463570\ttest: 0.6362687\tbest: 0.6364428 (1643)\ttotal: 3m 26s\tremaining: 16h 49m 1s\n", 1079 | "1800:\tlearn: 0.6472652\ttest: 0.6364677\tbest: 0.6367413 (1780)\ttotal: 3m 39s\tremaining: 16h 50m 31s\n", 1080 | "1900:\tlearn: 0.6481161\ttest: 0.6372388\tbest: 0.6372886 (1898)\ttotal: 3m 51s\tremaining: 16h 51m 4s\n", 1081 | "2000:\tlearn: 0.6490745\ttest: 0.6380846\tbest: 0.6380846 (2000)\ttotal: 4m 4s\tremaining: 16h 52m 42s\n", 1082 | 
"2100:\tlearn: 0.6498934\ttest: 0.6385821\tbest: 0.6387065 (2093)\ttotal: 4m 16s\tremaining: 16h 53m 56s\n", 1083 | "2200:\tlearn: 0.6506640\ttest: 0.6390547\tbest: 0.6390547 (2194)\ttotal: 4m 29s\tremaining: 16h 54m 55s\n", 1084 | "2300:\tlearn: 0.6513545\ttest: 0.6394527\tbest: 0.6395771 (2268)\ttotal: 4m 41s\tremaining: 16h 55m 11s\n", 1085 | "2400:\tlearn: 0.6520830\ttest: 0.6398259\tbest: 0.6399254 (2385)\ttotal: 4m 54s\tremaining: 16h 56m 19s\n", 1086 | "2500:\tlearn: 0.6527434\ttest: 0.6398507\tbest: 0.6400498 (2461)\ttotal: 5m 6s\tremaining: 16h 57m 19s\n", 1087 | "2600:\tlearn: 0.6533912\ttest: 0.6407214\tbest: 0.6407960 (2596)\ttotal: 5m 19s\tremaining: 16h 57m 37s\n", 1088 | "2700:\tlearn: 0.6539720\ttest: 0.6403483\tbest: 0.6407960 (2596)\ttotal: 5m 31s\tremaining: 16h 58m 2s\n", 1089 | "2800:\tlearn: 0.6546700\ttest: 0.6409701\tbest: 0.6409701 (2796)\ttotal: 5m 44s\tremaining: 16h 58m 23s\n", 1090 | "2900:\tlearn: 0.6552782\ttest: 0.6413682\tbest: 0.6413930 (2897)\ttotal: 5m 56s\tremaining: 16h 58m 43s\n", 1091 | "3000:\tlearn: 0.6558666\ttest: 0.6420149\tbest: 0.6422139 (2998)\ttotal: 6m 9s\tremaining: 16h 58m 59s\n", 1092 | "3100:\tlearn: 0.6564534\ttest: 0.6425622\tbest: 0.6425622 (3094)\ttotal: 6m 21s\tremaining: 16h 58m 52s\n", 1093 | "3200:\tlearn: 0.6570134\ttest: 0.6428856\tbest: 0.6429353 (3146)\ttotal: 6m 33s\tremaining: 16h 58m 58s\n", 1094 | "3300:\tlearn: 0.6575739\ttest: 0.6428109\tbest: 0.6430348 (3258)\ttotal: 6m 46s\tremaining: 16h 58m 45s\n", 1095 | "3400:\tlearn: 0.6580724\ttest: 0.6433333\tbest: 0.6433831 (3388)\ttotal: 6m 58s\tremaining: 16h 58m 56s\n", 1096 | "3500:\tlearn: 0.6586080\ttest: 0.6437313\tbest: 0.6437313 (3499)\ttotal: 7m 11s\tremaining: 16h 59m\n", 1097 | "3600:\tlearn: 0.6591294\ttest: 0.6437562\tbest: 0.6439801 (3540)\ttotal: 7m 23s\tremaining: 16h 59m 19s\n", 1098 | "3700:\tlearn: 0.6596690\ttest: 0.6436816\tbest: 0.6439801 (3540)\ttotal: 7m 36s\tremaining: 16h 59m 37s\n", 1099 | "3800:\tlearn: 0.6601015\ttest: 
0.6438557\tbest: 0.6440299 (3783)\ttotal: 7m 48s\tremaining: 16h 59m 45s\n", 1100 | "3900:\tlearn: 0.6606254\ttest: 0.6441294\tbest: 0.6441294 (3900)\ttotal: 8m 1s\tremaining: 16h 59m 36s\n", 1101 | "4000:\tlearn: 0.6610996\ttest: 0.6442289\tbest: 0.6442537 (3912)\ttotal: 8m 13s\tremaining: 16h 59m 36s\n", 1102 | "4100:\tlearn: 0.6615997\ttest: 0.6442786\tbest: 0.6444279 (4069)\ttotal: 8m 26s\tremaining: 16h 59m 54s\n", 1103 | "4200:\tlearn: 0.6620159\ttest: 0.6442786\tbest: 0.6445771 (4189)\ttotal: 8m 38s\tremaining: 17h 11s\n", 1104 | "4300:\tlearn: 0.6624784\ttest: 0.6443284\tbest: 0.6446020 (4289)\ttotal: 8m 51s\tremaining: 17h 4s\n", 1105 | "4400:\tlearn: 0.6629206\ttest: 0.6442786\tbest: 0.6446020 (4289)\ttotal: 9m 3s\tremaining: 16h 59m 39s\n", 1106 | "4500:\tlearn: 0.6633379\ttest: 0.6442786\tbest: 0.6446020 (4289)\ttotal: 9m 15s\tremaining: 16h 59m 15s\n", 1107 | "4600:\tlearn: 0.6638110\ttest: 0.6447264\tbest: 0.6448010 (4598)\ttotal: 9m 27s\tremaining: 16h 59m 12s\n", 1108 | "4700:\tlearn: 0.6642685\ttest: 0.6448507\tbest: 0.6449751 (4663)\ttotal: 9m 40s\tremaining: 16h 59m 2s\n", 1109 | "4800:\tlearn: 0.6647208\ttest: 0.6451244\tbest: 0.6451244 (4800)\ttotal: 9m 52s\tremaining: 16h 58m 26s\n", 1110 | "4900:\tlearn: 0.6651787\ttest: 0.6453980\tbest: 0.6453980 (4900)\ttotal: 10m 4s\tremaining: 16h 58m 6s\n", 1111 | "5000:\tlearn: 0.6656026\ttest: 0.6455224\tbest: 0.6455970 (4986)\ttotal: 10m 17s\tremaining: 16h 57m 50s\n", 1112 | "5100:\tlearn: 0.6659793\ttest: 0.6455970\tbest: 0.6458209 (5083)\ttotal: 10m 29s\tremaining: 16h 57m 32s\n", 1113 | "5200:\tlearn: 0.6664727\ttest: 0.6458955\tbest: 0.6460199 (5146)\ttotal: 10m 41s\tremaining: 16h 57m 1s\n", 1114 | "5300:\tlearn: 0.6668789\ttest: 0.6459950\tbest: 0.6460199 (5146)\ttotal: 10m 53s\tremaining: 16h 56m 43s\n", 1115 | "5400:\tlearn: 0.6672586\ttest: 0.6461692\tbest: 0.6462189 (5342)\ttotal: 11m 5s\tremaining: 16h 56m 25s\n", 1116 | "5500:\tlearn: 0.6676729\ttest: 0.6460697\tbest: 0.6462935 
(5495)\ttotal: 11m 18s\tremaining: 16h 56m 9s\n", 1117 | "5600:\tlearn: 0.6680663\ttest: 0.6462687\tbest: 0.6462935 (5495)\ttotal: 11m 30s\tremaining: 16h 55m 36s\n", 1118 | "5700:\tlearn: 0.6684374\ttest: 0.6464677\tbest: 0.6465174 (5647)\ttotal: 11m 42s\tremaining: 16h 55m 18s\n", 1119 | "5800:\tlearn: 0.6688506\ttest: 0.6462935\tbest: 0.6465174 (5647)\ttotal: 11m 54s\tremaining: 16h 54m 49s\n", 1120 | "5900:\tlearn: 0.6692014\ttest: 0.6463682\tbest: 0.6465423 (5874)\ttotal: 12m 6s\tremaining: 16h 54m 15s\n", 1121 | "6000:\tlearn: 0.6696091\ttest: 0.6464428\tbest: 0.6465423 (5874)\ttotal: 12m 18s\tremaining: 16h 53m 52s\n", 1122 | "6100:\tlearn: 0.6700315\ttest: 0.6464428\tbest: 0.6465672 (6014)\ttotal: 12m 31s\tremaining: 16h 53m 21s\n", 1123 | "6200:\tlearn: 0.6703914\ttest: 0.6462438\tbest: 0.6465672 (6014)\ttotal: 12m 43s\tremaining: 16h 52m 48s\n", 1124 | "6300:\tlearn: 0.6707529\ttest: 0.6464428\tbest: 0.6465672 (6014)\ttotal: 12m 55s\tremaining: 16h 52m 39s\n", 1125 | "6400:\tlearn: 0.6711067\ttest: 0.6468408\tbest: 0.6468408 (6395)\ttotal: 13m 7s\tremaining: 16h 52m 18s\n", 1126 | "6500:\tlearn: 0.6714590\ttest: 0.6466915\tbest: 0.6468408 (6395)\ttotal: 13m 19s\tremaining: 16h 51m 43s\n", 1127 | "6600:\tlearn: 0.6718098\ttest: 0.6464428\tbest: 0.6468408 (6395)\ttotal: 13m 31s\tremaining: 16h 50m 56s\n", 1128 | "6700:\tlearn: 0.6721698\ttest: 0.6462935\tbest: 0.6468408 (6395)\ttotal: 13m 43s\tremaining: 16h 50m 16s\n", 1129 | "6800:\tlearn: 0.6724982\ttest: 0.6462687\tbest: 0.6468408 (6395)\ttotal: 13m 55s\tremaining: 16h 49m 23s\n", 1130 | "6900:\tlearn: 0.6728363\ttest: 0.6461940\tbest: 0.6468408 (6395)\ttotal: 14m 7s\tremaining: 16h 48m 52s\n", 1131 | "7000:\tlearn: 0.6732267\ttest: 0.6462189\tbest: 0.6468408 (6395)\ttotal: 14m 19s\tremaining: 16h 48m 19s\n", 1132 | "7100:\tlearn: 0.6735658\ttest: 0.6465423\tbest: 0.6468408 (6395)\ttotal: 14m 30s\tremaining: 16h 47m 37s\n", 1133 | "7200:\tlearn: 0.6738958\ttest: 0.6464677\tbest: 0.6468408 (6395)\ttotal: 
14m 42s\tremaining: 16h 46m 50s\n", 1134 | "7300:\tlearn: 0.6742009\ttest: 0.6464179\tbest: 0.6468408 (6395)\ttotal: 14m 54s\tremaining: 16h 46m 17s\n", 1135 | "7400:\tlearn: 0.6745294\ttest: 0.6465423\tbest: 0.6468408 (6395)\ttotal: 15m 6s\tremaining: 16h 45m 43s\n", 1136 | "7500:\tlearn: 0.6748528\ttest: 0.6465423\tbest: 0.6468408 (6395)\ttotal: 15m 18s\tremaining: 16h 45m 2s\n", 1137 | "7600:\tlearn: 0.6751960\ttest: 0.6464179\tbest: 0.6468408 (6395)\ttotal: 15m 30s\tremaining: 16h 44m 10s\n", 1138 | "7700:\tlearn: 0.6754924\ttest: 0.6462935\tbest: 0.6468408 (6395)\ttotal: 15m 41s\tremaining: 16h 43m 25s\n", 1139 | "7800:\tlearn: 0.6758265\ttest: 0.6464428\tbest: 0.6468408 (6395)\ttotal: 15m 53s\tremaining: 16h 42m 46s\n", 1140 | "7900:\tlearn: 0.6761575\ttest: 0.6462438\tbest: 0.6468408 (6395)\ttotal: 16m 5s\tremaining: 16h 42m 9s\n", 1141 | "8000:\tlearn: 0.6764748\ttest: 0.6465174\tbest: 0.6468408 (6395)\ttotal: 16m 17s\tremaining: 16h 41m 27s\n", 1142 | "8100:\tlearn: 0.6768048\ttest: 0.6466169\tbest: 0.6468408 (6395)\ttotal: 16m 28s\tremaining: 16h 40m 40s\n", 1143 | "8200:\tlearn: 0.6771373\ttest: 0.6467413\tbest: 0.6468408 (6395)\ttotal: 16m 40s\tremaining: 16h 40m 3s\n", 1144 | "8300:\tlearn: 0.6774363\ttest: 0.6464925\tbest: 0.6468408 (6395)\ttotal: 16m 52s\tremaining: 16h 39m 14s\n" 1145 | ] 1146 | }, 1147 | { 1148 | "name": "stdout", 1149 | "output_type": "stream", 1150 | "text": [ 1151 | "8400:\tlearn: 0.6777571\ttest: 0.6467164\tbest: 0.6468408 (6395)\ttotal: 17m 4s\tremaining: 16h 38m 44s\n", 1152 | "8500:\tlearn: 0.6780912\ttest: 0.6466667\tbest: 0.6468657 (8416)\ttotal: 17m 15s\tremaining: 16h 37m 52s\n", 1153 | "8600:\tlearn: 0.6783800\ttest: 0.6467910\tbest: 0.6468657 (8416)\ttotal: 17m 27s\tremaining: 16h 37m 4s\n", 1154 | "8700:\tlearn: 0.6787065\ttest: 0.6468408\tbest: 0.6469652 (8660)\ttotal: 17m 38s\tremaining: 16h 36m 29s\n", 1155 | "8800:\tlearn: 0.6789659\ttest: 0.6467413\tbest: 0.6469900 (8706)\ttotal: 17m 50s\tremaining: 16h 35m 
47s\n", 1156 | "8900:\tlearn: 0.6792964\ttest: 0.6466418\tbest: 0.6469900 (8706)\ttotal: 18m 1s\tremaining: 16h 34m 47s\n", 1157 | "9000:\tlearn: 0.6795822\ttest: 0.6468408\tbest: 0.6469900 (8706)\ttotal: 18m 13s\tremaining: 16h 33m 48s\n", 1158 | "9100:\tlearn: 0.6799152\ttest: 0.6468905\tbest: 0.6469900 (8706)\ttotal: 18m 24s\tremaining: 16h 33m 10s\n", 1159 | "9200:\tlearn: 0.6801868\ttest: 0.6470149\tbest: 0.6470149 (9200)\ttotal: 18m 36s\tremaining: 16h 32m 20s\n", 1160 | "9300:\tlearn: 0.6804630\ttest: 0.6467662\tbest: 0.6470398 (9203)\ttotal: 18m 47s\tremaining: 16h 31m 44s\n", 1161 | "9400:\tlearn: 0.6807849\ttest: 0.6466418\tbest: 0.6470398 (9203)\ttotal: 18m 59s\tremaining: 16h 30m 54s\n", 1162 | "9500:\tlearn: 0.6810361\ttest: 0.6466915\tbest: 0.6470398 (9203)\ttotal: 19m 10s\tremaining: 16h 29m 45s\n", 1163 | "9600:\tlearn: 0.6813529\ttest: 0.6468159\tbest: 0.6470398 (9203)\ttotal: 19m 21s\tremaining: 16h 28m 51s\n", 1164 | "9700:\tlearn: 0.6815859\ttest: 0.6470149\tbest: 0.6472139 (9663)\ttotal: 19m 33s\tremaining: 16h 28m 5s\n", 1165 | "9800:\tlearn: 0.6819398\ttest: 0.6469154\tbest: 0.6472139 (9663)\ttotal: 19m 44s\tremaining: 16h 27m 22s\n", 1166 | "9900:\tlearn: 0.6822246\ttest: 0.6466418\tbest: 0.6472139 (9663)\ttotal: 19m 56s\tremaining: 16h 26m 49s\n", 1167 | "10000:\tlearn: 0.6824962\ttest: 0.6468159\tbest: 0.6472139 (9663)\ttotal: 20m 7s\tremaining: 16h 26m 8s\n", 1168 | "10100:\tlearn: 0.6828069\ttest: 0.6468159\tbest: 0.6472139 (9663)\ttotal: 20m 19s\tremaining: 16h 25m 27s\n", 1169 | "10200:\tlearn: 0.6830622\ttest: 0.6469403\tbest: 0.6472139 (9663)\ttotal: 20m 30s\tremaining: 16h 24m 43s\n", 1170 | "10300:\tlearn: 0.6833745\ttest: 0.6467413\tbest: 0.6472139 (9663)\ttotal: 20m 42s\tremaining: 16h 24m 19s\n", 1171 | "10400:\tlearn: 0.6836902\ttest: 0.6468905\tbest: 0.6472139 (9663)\ttotal: 20m 53s\tremaining: 16h 23m 42s\n", 1172 | "10500:\tlearn: 0.6839816\ttest: 0.6467413\tbest: 0.6472139 (9663)\ttotal: 21m 5s\tremaining: 16h 23m 8s\n", 
1173 | "10600:\tlearn: 0.6842481\ttest: 0.6467413\tbest: 0.6472139 (9663)\ttotal: 21m 16s\tremaining: 16h 22m 25s\n", 1174 | "10700:\tlearn: 0.6845121\ttest: 0.6464677\tbest: 0.6472139 (9663)\ttotal: 21m 28s\tremaining: 16h 21m 45s\n", 1175 | "10800:\tlearn: 0.6847949\ttest: 0.6466418\tbest: 0.6472139 (9663)\ttotal: 21m 39s\tremaining: 16h 21m 14s\n", 1176 | "10900:\tlearn: 0.6850909\ttest: 0.6466418\tbest: 0.6472139 (9663)\ttotal: 21m 51s\tremaining: 16h 20m 40s\n", 1177 | "11000:\tlearn: 0.6853879\ttest: 0.6466915\tbest: 0.6472139 (9663)\ttotal: 22m 2s\tremaining: 16h 19m 57s\n", 1178 | "11100:\tlearn: 0.6856529\ttest: 0.6469154\tbest: 0.6472139 (9663)\ttotal: 22m 14s\tremaining: 16h 19m 19s\n", 1179 | "11200:\tlearn: 0.6859509\ttest: 0.6470149\tbest: 0.6472139 (9663)\ttotal: 22m 25s\tremaining: 16h 18m 42s\n", 1180 | "11300:\tlearn: 0.6862377\ttest: 0.6471642\tbest: 0.6472388 (11249)\ttotal: 22m 37s\tremaining: 16h 18m 8s\n", 1181 | "11400:\tlearn: 0.6864981\ttest: 0.6470896\tbest: 0.6472388 (11249)\ttotal: 22m 48s\tremaining: 16h 17m 30s\n", 1182 | "11500:\tlearn: 0.6867606\ttest: 0.6472388\tbest: 0.6472886 (11473)\ttotal: 22m 59s\tremaining: 16h 16m 50s\n", 1183 | "11600:\tlearn: 0.6870383\ttest: 0.6471393\tbest: 0.6474378 (11525)\ttotal: 23m 11s\tremaining: 16h 16m 7s\n", 1184 | "11700:\tlearn: 0.6873190\ttest: 0.6472388\tbest: 0.6474378 (11525)\ttotal: 23m 22s\tremaining: 16h 15m 22s\n", 1185 | "11800:\tlearn: 0.6875510\ttest: 0.6472886\tbest: 0.6474378 (11525)\ttotal: 23m 33s\tremaining: 16h 14m 28s\n", 1186 | "11900:\tlearn: 0.6878231\ttest: 0.6470896\tbest: 0.6474378 (11525)\ttotal: 23m 44s\tremaining: 16h 13m 36s\n", 1187 | "12000:\tlearn: 0.6880759\ttest: 0.6470896\tbest: 0.6474378 (11525)\ttotal: 23m 55s\tremaining: 16h 12m 58s\n", 1188 | "12100:\tlearn: 0.6883618\ttest: 0.6471642\tbest: 0.6474378 (11525)\ttotal: 24m 6s\tremaining: 16h 12m 20s\n", 1189 | "12200:\tlearn: 0.6886745\ttest: 0.6470647\tbest: 0.6474378 (11525)\ttotal: 24m 18s\tremaining: 16h 
11m 45s\n", 1190 | "12300:\tlearn: 0.6889679\ttest: 0.6470398\tbest: 0.6474378 (11525)\ttotal: 24m 29s\tremaining: 16h 11m 15s\n", 1191 | "12400:\tlearn: 0.6892410\ttest: 0.6468905\tbest: 0.6474378 (11525)\ttotal: 24m 41s\tremaining: 16h 10m 44s\n", 1192 | "12500:\tlearn: 0.6895010\ttest: 0.6470647\tbest: 0.6474378 (11525)\ttotal: 24m 52s\tremaining: 16h 9m 57s\n", 1193 | "12600:\tlearn: 0.6897639\ttest: 0.6470149\tbest: 0.6474378 (11525)\ttotal: 25m 3s\tremaining: 16h 9m 15s\n", 1194 | "12700:\tlearn: 0.6900487\ttest: 0.6470149\tbest: 0.6474378 (11525)\ttotal: 25m 14s\tremaining: 16h 8m 36s\n", 1195 | "12800:\tlearn: 0.6902736\ttest: 0.6469154\tbest: 0.6474378 (11525)\ttotal: 25m 25s\tremaining: 16h 7m 54s\n", 1196 | "12900:\tlearn: 0.6905508\ttest: 0.6467910\tbest: 0.6474378 (11525)\ttotal: 25m 37s\tremaining: 16h 7m 14s\n", 1197 | "13000:\tlearn: 0.6908224\ttest: 0.6469154\tbest: 0.6474378 (11525)\ttotal: 25m 48s\tremaining: 16h 6m 37s\n", 1198 | "13100:\tlearn: 0.6910966\ttest: 0.6469900\tbest: 0.6474378 (11525)\ttotal: 25m 59s\tremaining: 16h 6m 3s\n", 1199 | "13200:\tlearn: 0.6913611\ttest: 0.6469403\tbest: 0.6474378 (11525)\ttotal: 26m 10s\tremaining: 16h 5m 30s\n", 1200 | "13300:\tlearn: 0.6916255\ttest: 0.6471642\tbest: 0.6474378 (11525)\ttotal: 26m 22s\tremaining: 16h 5m 5s\n", 1201 | "13400:\tlearn: 0.6919164\ttest: 0.6471144\tbest: 0.6474378 (11525)\ttotal: 26m 33s\tremaining: 16h 4m 27s\n", 1202 | "13500:\tlearn: 0.6921693\ttest: 0.6471144\tbest: 0.6474378 (11525)\ttotal: 26m 45s\tremaining: 16h 3m 59s\n", 1203 | "13600:\tlearn: 0.6924403\ttest: 0.6470149\tbest: 0.6474378 (11525)\ttotal: 26m 56s\tremaining: 16h 3m 28s\n", 1204 | "13700:\tlearn: 0.6927150\ttest: 0.6470398\tbest: 0.6474378 (11525)\ttotal: 27m 7s\tremaining: 16h 2m 48s\n", 1205 | "13800:\tlearn: 0.6929328\ttest: 0.6469900\tbest: 0.6474378 (11525)\ttotal: 27m 18s\tremaining: 16h 2m 10s\n", 1206 | "13900:\tlearn: 0.6932658\ttest: 0.6471144\tbest: 0.6474378 (11525)\ttotal: 27m 
29s\tremaining: 16h 1m 29s\n", 1207 | "14000:\tlearn: 0.6935207\ttest: 0.6469900\tbest: 0.6474378 (11525)\ttotal: 27m 40s\tremaining: 16h 48s\n", 1208 | "14100:\tlearn: 0.6937836\ttest: 0.6471144\tbest: 0.6474378 (11525)\ttotal: 27m 52s\tremaining: 16h 15s\n", 1209 | "14200:\tlearn: 0.6940222\ttest: 0.6469900\tbest: 0.6474378 (11525)\ttotal: 28m 3s\tremaining: 15h 59m 38s\n", 1210 | "14300:\tlearn: 0.6942974\ttest: 0.6470398\tbest: 0.6474378 (11525)\ttotal: 28m 14s\tremaining: 15h 59m 5s\n", 1211 | "14400:\tlearn: 0.6945421\ttest: 0.6470398\tbest: 0.6474378 (11525)\ttotal: 28m 25s\tremaining: 15h 58m 38s\n", 1212 | "14500:\tlearn: 0.6948289\ttest: 0.6471891\tbest: 0.6474378 (11525)\ttotal: 28m 37s\tremaining: 15h 58m 9s\n", 1213 | "14600:\tlearn: 0.6951051\ttest: 0.6470398\tbest: 0.6474378 (11525)\ttotal: 28m 48s\tremaining: 15h 57m 42s\n", 1214 | "14700:\tlearn: 0.6953818\ttest: 0.6470647\tbest: 0.6474378 (11525)\ttotal: 28m 59s\tremaining: 15h 57m 10s\n", 1215 | "14800:\tlearn: 0.6956752\ttest: 0.6470647\tbest: 0.6474378 (11525)\ttotal: 29m 10s\tremaining: 15h 56m 39s\n", 1216 | "14900:\tlearn: 0.6959498\ttest: 0.6473134\tbest: 0.6474378 (11525)\ttotal: 29m 22s\tremaining: 15h 56m 5s\n", 1217 | "15000:\tlearn: 0.6961991\ttest: 0.6473632\tbest: 0.6474378 (11525)\ttotal: 29m 33s\tremaining: 15h 55m 32s\n", 1218 | "15100:\tlearn: 0.6964773\ttest: 0.6473881\tbest: 0.6476617 (15084)\ttotal: 29m 44s\tremaining: 15h 55m 2s\n", 1219 | "15200:\tlearn: 0.6967256\ttest: 0.6474129\tbest: 0.6476617 (15084)\ttotal: 29m 55s\tremaining: 15h 54m 29s\n", 1220 | "15300:\tlearn: 0.6969713\ttest: 0.6471891\tbest: 0.6476617 (15084)\ttotal: 30m 6s\tremaining: 15h 53m 54s\n", 1221 | "15400:\tlearn: 0.6972535\ttest: 0.6472139\tbest: 0.6476617 (15084)\ttotal: 30m 18s\tremaining: 15h 53m 29s\n", 1222 | "15500:\tlearn: 0.6975404\ttest: 0.6473383\tbest: 0.6476617 (15084)\ttotal: 30m 29s\tremaining: 15h 53m 5s\n", 1223 | "15600:\tlearn: 0.6978358\ttest: 0.6474129\tbest: 0.6476617 
(15084)\ttotal: 30m 40s\tremaining: 15h 52m 37s\n", 1224 | "15700:\tlearn: 0.6980805\ttest: 0.6475871\tbest: 0.6476617 (15084)\ttotal: 30m 51s\tremaining: 15h 51m 57s\n", 1225 | "15800:\tlearn: 0.6983780\ttest: 0.6475622\tbest: 0.6476866 (15701)\ttotal: 31m 2s\tremaining: 15h 51m 20s\n", 1226 | "15900:\tlearn: 0.6986080\ttest: 0.6476368\tbest: 0.6477612 (15844)\ttotal: 31m 13s\tremaining: 15h 50m 50s\n", 1227 | "16000:\tlearn: 0.6988796\ttest: 0.6477363\tbest: 0.6477861 (15902)\ttotal: 31m 25s\tremaining: 15h 50m 21s\n", 1228 | "16100:\tlearn: 0.6991552\ttest: 0.6477114\tbest: 0.6478109 (16036)\ttotal: 31m 36s\tremaining: 15h 49m 54s\n", 1229 | "16200:\tlearn: 0.6993817\ttest: 0.6477363\tbest: 0.6478358 (16150)\ttotal: 31m 47s\tremaining: 15h 49m 27s\n", 1230 | "16300:\tlearn: 0.6996695\ttest: 0.6476866\tbest: 0.6478358 (16150)\ttotal: 31m 58s\tremaining: 15h 48m 53s\n", 1231 | "16400:\tlearn: 0.6999233\ttest: 0.6474129\tbest: 0.6478358 (16150)\ttotal: 32m 10s\tremaining: 15h 48m 38s\n", 1232 | "16500:\tlearn: 0.7001817\ttest: 0.6474129\tbest: 0.6478358 (16150)\ttotal: 32m 21s\tremaining: 15h 48m 18s\n" 1233 | ] 1234 | }, 1235 | { 1236 | "name": "stdout", 1237 | "output_type": "stream", 1238 | "text": [ 1239 | "16600:\tlearn: 0.7004361\ttest: 0.6475373\tbest: 0.6478358 (16150)\ttotal: 32m 33s\tremaining: 15h 47m 56s\n", 1240 | "16700:\tlearn: 0.7006320\ttest: 0.6476368\tbest: 0.6478358 (16150)\ttotal: 32m 44s\tremaining: 15h 47m 28s\n", 1241 | "16800:\tlearn: 0.7009118\ttest: 0.6472637\tbest: 0.6478358 (16150)\ttotal: 32m 55s\tremaining: 15h 46m 55s\n", 1242 | "16900:\tlearn: 0.7012037\ttest: 0.6473881\tbest: 0.6478358 (16150)\ttotal: 33m 7s\tremaining: 15h 46m 38s\n", 1243 | "17000:\tlearn: 0.7014712\ttest: 0.6474876\tbest: 0.6478358 (16150)\ttotal: 33m 18s\tremaining: 15h 46m 15s\n", 1244 | "17100:\tlearn: 0.7017179\ttest: 0.6475622\tbest: 0.6478358 (16150)\ttotal: 33m 29s\tremaining: 15h 45m 49s\n", 1245 | "17200:\tlearn: 0.7019738\ttest: 0.6475622\tbest: 
0.6478358 (16150)\ttotal: 33m 41s\tremaining: 15h 45m 27s\n", 1246 | "17300:\tlearn: 0.7022089\ttest: 0.6477861\tbest: 0.6478358 (16150)\ttotal: 33m 52s\tremaining: 15h 45m 1s\n", 1247 | "17400:\tlearn: 0.7025018\ttest: 0.6479104\tbest: 0.6480100 (17384)\ttotal: 34m 3s\tremaining: 15h 44m 40s\n", 1248 | "17500:\tlearn: 0.7027353\ttest: 0.6478109\tbest: 0.6480100 (17384)\ttotal: 34m 15s\tremaining: 15h 44m 18s\n", 1249 | "17600:\tlearn: 0.7029998\ttest: 0.6479851\tbest: 0.6480597 (17597)\ttotal: 34m 26s\tremaining: 15h 43m 59s\n", 1250 | "17700:\tlearn: 0.7032668\ttest: 0.6477114\tbest: 0.6480597 (17597)\ttotal: 34m 37s\tremaining: 15h 43m 37s\n", 1251 | "17800:\tlearn: 0.7035308\ttest: 0.6477114\tbest: 0.6480597 (17597)\ttotal: 34m 48s\tremaining: 15h 43m 6s\n", 1252 | "17900:\tlearn: 0.7037709\ttest: 0.6476368\tbest: 0.6480597 (17597)\ttotal: 35m\tremaining: 15h 42m 45s\n", 1253 | "18000:\tlearn: 0.7040268\ttest: 0.6477612\tbest: 0.6480597 (17597)\ttotal: 35m 11s\tremaining: 15h 42m 24s\n", 1254 | "18100:\tlearn: 0.7042984\ttest: 0.6478358\tbest: 0.6480597 (17597)\ttotal: 35m 23s\tremaining: 15h 42m 4s\n", 1255 | "18200:\tlearn: 0.7045284\ttest: 0.6479602\tbest: 0.6480597 (17597)\ttotal: 35m 34s\tremaining: 15h 41m 38s\n", 1256 | "18300:\tlearn: 0.7047954\ttest: 0.6482338\tbest: 0.6483831 (18279)\ttotal: 35m 46s\tremaining: 15h 41m 29s\n", 1257 | "18400:\tlearn: 0.7050437\ttest: 0.6480846\tbest: 0.6483831 (18279)\ttotal: 35m 57s\tremaining: 15h 41m 7s\n", 1258 | "18500:\tlearn: 0.7053021\ttest: 0.6482338\tbest: 0.6483831 (18279)\ttotal: 36m 8s\tremaining: 15h 40m 43s\n", 1259 | "18600:\tlearn: 0.7055843\ttest: 0.6481343\tbest: 0.6484080 (18523)\ttotal: 36m 20s\tremaining: 15h 40m 20s\n", 1260 | "18700:\tlearn: 0.7058696\ttest: 0.6481095\tbest: 0.6484080 (18523)\ttotal: 36m 31s\tremaining: 15h 39m 54s\n", 1261 | "18800:\tlearn: 0.7061042\ttest: 0.6482090\tbest: 0.6484080 (18523)\ttotal: 36m 42s\tremaining: 15h 39m 32s\n", 1262 | "18900:\tlearn: 0.7063438\ttest: 
0.6482836\tbest: 0.6484080 (18523)\ttotal: 36m 53s\tremaining: 15h 39m 4s\n", 1263 | "19000:\tlearn: 0.7065870\ttest: 0.6483831\tbest: 0.6484080 (18523)\ttotal: 37m 4s\tremaining: 15h 38m 42s\n", 1264 | "19100:\tlearn: 0.7068388\ttest: 0.6485075\tbest: 0.6485572 (19030)\ttotal: 37m 16s\tremaining: 15h 38m 20s\n", 1265 | "19200:\tlearn: 0.7071357\ttest: 0.6485821\tbest: 0.6485821 (19200)\ttotal: 37m 27s\tremaining: 15h 38m 4s\n", 1266 | "19300:\tlearn: 0.7073942\ttest: 0.6484577\tbest: 0.6485821 (19200)\ttotal: 37m 39s\tremaining: 15h 37m 44s\n", 1267 | "19400:\tlearn: 0.7076536\ttest: 0.6483831\tbest: 0.6486070 (19329)\ttotal: 37m 50s\tremaining: 15h 37m 29s\n", 1268 | "19500:\tlearn: 0.7079308\ttest: 0.6484577\tbest: 0.6486070 (19329)\ttotal: 38m 2s\tremaining: 15h 37m 10s\n", 1269 | "19600:\tlearn: 0.7081881\ttest: 0.6485572\tbest: 0.6486567 (19579)\ttotal: 38m 13s\tremaining: 15h 36m 58s\n", 1270 | "19700:\tlearn: 0.7084394\ttest: 0.6485572\tbest: 0.6486567 (19579)\ttotal: 38m 25s\tremaining: 15h 36m 38s\n", 1271 | "19800:\tlearn: 0.7086628\ttest: 0.6484080\tbest: 0.6486567 (19579)\ttotal: 38m 36s\tremaining: 15h 36m 17s\n", 1272 | "19900:\tlearn: 0.7089344\ttest: 0.6483333\tbest: 0.6486567 (19579)\ttotal: 38m 47s\tremaining: 15h 35m 57s\n", 1273 | "20000:\tlearn: 0.7092182\ttest: 0.6483582\tbest: 0.6486567 (19579)\ttotal: 38m 59s\tremaining: 15h 35m 42s\n", 1274 | "20100:\tlearn: 0.7094639\ttest: 0.6485075\tbest: 0.6486567 (19579)\ttotal: 39m 10s\tremaining: 15h 35m 17s\n", 1275 | "20200:\tlearn: 0.7097096\ttest: 0.6482836\tbest: 0.6486567 (19579)\ttotal: 39m 21s\tremaining: 15h 34m 56s\n", 1276 | "20300:\tlearn: 0.7099436\ttest: 0.6484826\tbest: 0.6486567 (19579)\ttotal: 39m 32s\tremaining: 15h 34m 28s\n", 1277 | "20400:\tlearn: 0.7102000\ttest: 0.6485572\tbest: 0.6486567 (19579)\ttotal: 39m 44s\tremaining: 15h 34m 8s\n", 1278 | "20500:\tlearn: 0.7104432\ttest: 0.6485572\tbest: 0.6486567 (19579)\ttotal: 39m 55s\tremaining: 15h 33m 41s\n", 1279 | 
"20600:\tlearn: 0.7107214\ttest: 0.6485075\tbest: 0.6486567 (19579)\ttotal: 40m 6s\tremaining: 15h 33m 24s\n", 1280 | "20700:\tlearn: 0.7110092\ttest: 0.6484577\tbest: 0.6486567 (19579)\ttotal: 40m 18s\tremaining: 15h 33m 11s\n", 1281 | "20800:\tlearn: 0.7112514\ttest: 0.6482836\tbest: 0.6486567 (19579)\ttotal: 40m 29s\tremaining: 15h 32m 51s\n", 1282 | "20900:\tlearn: 0.7115128\ttest: 0.6480348\tbest: 0.6486567 (19579)\ttotal: 40m 40s\tremaining: 15h 32m 30s\n", 1283 | "21000:\tlearn: 0.7117768\ttest: 0.6482338\tbest: 0.6486567 (19579)\ttotal: 40m 52s\tremaining: 15h 32m 10s\n", 1284 | "21100:\tlearn: 0.7120159\ttest: 0.6483582\tbest: 0.6486567 (19579)\ttotal: 41m 3s\tremaining: 15h 31m 54s\n", 1285 | "21200:\tlearn: 0.7123048\ttest: 0.6486070\tbest: 0.6486567 (19579)\ttotal: 41m 15s\tremaining: 15h 31m 37s\n", 1286 | "21300:\tlearn: 0.7125246\ttest: 0.6484328\tbest: 0.6488308 (21217)\ttotal: 41m 26s\tremaining: 15h 31m 17s\n", 1287 | "21400:\tlearn: 0.7127795\ttest: 0.6486070\tbest: 0.6488308 (21217)\ttotal: 41m 37s\tremaining: 15h 30m 52s\n", 1288 | "21500:\tlearn: 0.7130267\ttest: 0.6487065\tbest: 0.6488308 (21217)\ttotal: 41m 49s\tremaining: 15h 30m 39s\n", 1289 | "21600:\tlearn: 0.7132658\ttest: 0.6483582\tbest: 0.6488308 (21217)\ttotal: 42m\tremaining: 15h 30m 19s\n", 1290 | "21700:\tlearn: 0.7135283\ttest: 0.6485323\tbest: 0.6488308 (21217)\ttotal: 42m 11s\tremaining: 15h 29m 57s\n", 1291 | "21800:\tlearn: 0.7138308\ttest: 0.6485821\tbest: 0.6488308 (21217)\ttotal: 42m 23s\tremaining: 15h 29m 41s\n", 1292 | "21900:\tlearn: 0.7140481\ttest: 0.6486567\tbest: 0.6488308 (21217)\ttotal: 42m 34s\tremaining: 15h 29m 19s\n", 1293 | "22000:\tlearn: 0.7143131\ttest: 0.6485572\tbest: 0.6488308 (21217)\ttotal: 42m 45s\tremaining: 15h 29m\n", 1294 | "22100:\tlearn: 0.7146010\ttest: 0.6485075\tbest: 0.6488308 (21217)\ttotal: 42m 57s\tremaining: 15h 28m 44s\n", 1295 | "22200:\tlearn: 0.7148132\ttest: 0.6486070\tbest: 0.6488308 (21217)\ttotal: 43m 8s\tremaining: 15h 28m 
22s\n", 1296 | "22300:\tlearn: 0.7150650\ttest: 0.6484080\tbest: 0.6488308 (21217)\ttotal: 43m 19s\tremaining: 15h 28m 4s\n", 1297 | "22400:\tlearn: 0.7153010\ttest: 0.6483333\tbest: 0.6488308 (21217)\ttotal: 43m 30s\tremaining: 15h 27m 46s\n", 1298 | "22500:\tlearn: 0.7155681\ttest: 0.6483333\tbest: 0.6488308 (21217)\ttotal: 43m 42s\tremaining: 15h 27m 29s\n", 1299 | "22600:\tlearn: 0.7158417\ttest: 0.6484080\tbest: 0.6488308 (21217)\ttotal: 43m 53s\tremaining: 15h 27m 5s\n", 1300 | "22700:\tlearn: 0.7160955\ttest: 0.6482338\tbest: 0.6488308 (21217)\ttotal: 44m 4s\tremaining: 15h 26m 47s\n", 1301 | "22800:\tlearn: 0.7163204\ttest: 0.6485572\tbest: 0.6488308 (21217)\ttotal: 44m 15s\tremaining: 15h 26m 26s\n", 1302 | "22900:\tlearn: 0.7165824\ttest: 0.6485075\tbest: 0.6488308 (21217)\ttotal: 44m 27s\tremaining: 15h 26m 9s\n", 1303 | "23000:\tlearn: 0.7168149\ttest: 0.6485075\tbest: 0.6488308 (21217)\ttotal: 44m 38s\tremaining: 15h 25m 50s\n", 1304 | "23100:\tlearn: 0.7170936\ttest: 0.6483333\tbest: 0.6488308 (21217)\ttotal: 44m 50s\tremaining: 15h 25m 38s\n", 1305 | "23200:\tlearn: 0.7173302\ttest: 0.6485572\tbest: 0.6488308 (21217)\ttotal: 45m 1s\tremaining: 15h 25m 19s\n", 1306 | "23300:\tlearn: 0.7175911\ttest: 0.6484328\tbest: 0.6488308 (21217)\ttotal: 45m 12s\tremaining: 15h 24m 56s\n", 1307 | "23400:\tlearn: 0.7178561\ttest: 0.6486070\tbest: 0.6488308 (21217)\ttotal: 45m 23s\tremaining: 15h 24m 37s\n", 1308 | "23500:\tlearn: 0.7180891\ttest: 0.6487811\tbest: 0.6488308 (21217)\ttotal: 45m 35s\tremaining: 15h 24m 19s\n", 1309 | "23600:\tlearn: 0.7183359\ttest: 0.6486318\tbest: 0.6488308 (21217)\ttotal: 45m 46s\tremaining: 15h 24m 6s\n", 1310 | "23700:\tlearn: 0.7185978\ttest: 0.6486070\tbest: 0.6488308 (21217)\ttotal: 45m 58s\tremaining: 15h 23m 51s\n", 1311 | "23800:\tlearn: 0.7188319\ttest: 0.6484826\tbest: 0.6488308 (21217)\ttotal: 46m 9s\tremaining: 15h 23m 30s\n", 1312 | "23900:\tlearn: 0.7190826\ttest: 0.6485821\tbest: 0.6488308 (21217)\ttotal: 46m 
20s\tremaining: 15h 23m 8s\n", 1313 | "24000:\tlearn: 0.7193583\ttest: 0.6486567\tbest: 0.6488308 (21217)\ttotal: 46m 32s\tremaining: 15h 22m 53s\n", 1314 | "24100:\tlearn: 0.7195812\ttest: 0.6483582\tbest: 0.6488308 (21217)\ttotal: 46m 43s\tremaining: 15h 22m 33s\n", 1315 | "24200:\tlearn: 0.7198700\ttest: 0.6485323\tbest: 0.6488308 (21217)\ttotal: 46m 54s\tremaining: 15h 22m 19s\n", 1316 | "24300:\tlearn: 0.7201685\ttest: 0.6485075\tbest: 0.6488308 (21217)\ttotal: 47m 6s\tremaining: 15h 22m 6s\n", 1317 | "24400:\tlearn: 0.7204305\ttest: 0.6485821\tbest: 0.6488308 (21217)\ttotal: 47m 17s\tremaining: 15h 21m 52s\n", 1318 | "24500:\tlearn: 0.7206544\ttest: 0.6486567\tbest: 0.6488308 (21217)\ttotal: 47m 29s\tremaining: 15h 21m 35s\n", 1319 | "24600:\tlearn: 0.7209062\ttest: 0.6486816\tbest: 0.6489055 (24544)\ttotal: 47m 40s\tremaining: 15h 21m 16s\n" 1320 | ] 1321 | }, 1322 | { 1323 | "name": "stdout", 1324 | "output_type": "stream", 1325 | "text": [ 1326 | "24700:\tlearn: 0.7211849\ttest: 0.6488557\tbest: 0.6489055 (24544)\ttotal: 47m 51s\tremaining: 15h 21m 1s\n", 1327 | "24800:\tlearn: 0.7214078\ttest: 0.6489552\tbest: 0.6490547 (24781)\ttotal: 48m 3s\tremaining: 15h 20m 40s\n", 1328 | "24900:\tlearn: 0.7216687\ttest: 0.6490050\tbest: 0.6490547 (24781)\ttotal: 48m 14s\tremaining: 15h 20m 28s\n", 1329 | "25000:\tlearn: 0.7218966\ttest: 0.6490299\tbest: 0.6490796 (24997)\ttotal: 48m 26s\tremaining: 15h 20m 11s\n", 1330 | "25100:\tlearn: 0.7221845\ttest: 0.6489552\tbest: 0.6491791 (25019)\ttotal: 48m 37s\tremaining: 15h 19m 59s\n", 1331 | "25200:\tlearn: 0.7224348\ttest: 0.6490547\tbest: 0.6491791 (25019)\ttotal: 48m 49s\tremaining: 15h 19m 47s\n", 1332 | "25300:\tlearn: 0.7226876\ttest: 0.6487313\tbest: 0.6491791 (25019)\ttotal: 49m\tremaining: 15h 19m 35s\n", 1333 | "25400:\tlearn: 0.7229343\ttest: 0.6489801\tbest: 0.6491791 (25019)\ttotal: 49m 12s\tremaining: 15h 19m 28s\n", 1334 | "25500:\tlearn: 0.7231846\ttest: 0.6490299\tbest: 0.6491791 (25019)\ttotal: 49m 
24s\tremaining: 15h 19m 14s\n", 1335 | "25600:\tlearn: 0.7234293\ttest: 0.6489801\tbest: 0.6491791 (25019)\ttotal: 49m 35s\tremaining: 15h 19m 2s\n", 1336 | "25700:\tlearn: 0.7237034\ttest: 0.6489055\tbest: 0.6491791 (25019)\ttotal: 49m 47s\tremaining: 15h 18m 53s\n", 1337 | "25800:\tlearn: 0.7239486\ttest: 0.6489801\tbest: 0.6491791 (25019)\ttotal: 49m 58s\tremaining: 15h 18m 34s\n", 1338 | "25900:\tlearn: 0.7241750\ttest: 0.6491045\tbest: 0.6492289 (25878)\ttotal: 50m 10s\tremaining: 15h 18m 19s\n", 1339 | "26000:\tlearn: 0.7244492\ttest: 0.6489055\tbest: 0.6492289 (25878)\ttotal: 50m 21s\tremaining: 15h 18m 10s\n", 1340 | "26100:\tlearn: 0.7246847\ttest: 0.6492040\tbest: 0.6492289 (25878)\ttotal: 50m 32s\tremaining: 15h 17m 47s\n", 1341 | "26200:\tlearn: 0.7249365\ttest: 0.6489552\tbest: 0.6492537 (26128)\ttotal: 50m 44s\tremaining: 15h 17m 30s\n", 1342 | "26300:\tlearn: 0.7252061\ttest: 0.6489552\tbest: 0.6492537 (26128)\ttotal: 50m 55s\tremaining: 15h 17m 16s\n", 1343 | "26400:\tlearn: 0.7254407\ttest: 0.6490796\tbest: 0.6492537 (26128)\ttotal: 51m 7s\tremaining: 15h 17m\n", 1344 | "26500:\tlearn: 0.7256899\ttest: 0.6491045\tbest: 0.6492537 (26128)\ttotal: 51m 18s\tremaining: 15h 16m 47s\n", 1345 | "26600:\tlearn: 0.7259397\ttest: 0.6491045\tbest: 0.6493035 (26556)\ttotal: 51m 30s\tremaining: 15h 16m 33s\n", 1346 | "26700:\tlearn: 0.7261778\ttest: 0.6490796\tbest: 0.6493035 (26556)\ttotal: 51m 41s\tremaining: 15h 16m 18s\n", 1347 | "26800:\tlearn: 0.7264362\ttest: 0.6490050\tbest: 0.6493035 (26556)\ttotal: 51m 52s\tremaining: 15h 16m 2s\n", 1348 | "26900:\tlearn: 0.7266723\ttest: 0.6489801\tbest: 0.6493035 (26556)\ttotal: 52m 4s\tremaining: 15h 15m 42s\n", 1349 | "27000:\tlearn: 0.7268850\ttest: 0.6490547\tbest: 0.6493035 (26556)\ttotal: 52m 15s\tremaining: 15h 15m 22s\n", 1350 | "27100:\tlearn: 0.7271596\ttest: 0.6489303\tbest: 0.6493035 (26556)\ttotal: 52m 26s\tremaining: 15h 15m 6s\n", 1351 | "27200:\tlearn: 0.7273936\ttest: 0.6489801\tbest: 0.6493035 
(26556)\ttotal: 52m 38s\tremaining: 15h 14m 53s\n", 1352 | "27300:\tlearn: 0.7276556\ttest: 0.6489801\tbest: 0.6493035 (26556)\ttotal: 52m 49s\tremaining: 15h 14m 36s\n", 1353 | "27400:\tlearn: 0.7279221\ttest: 0.6491045\tbest: 0.6493035 (26556)\ttotal: 53m\tremaining: 15h 14m 22s\n", 1354 | "27500:\tlearn: 0.7281886\ttest: 0.6489055\tbest: 0.6493035 (26556)\ttotal: 53m 12s\tremaining: 15h 14m 4s\n", 1355 | "27600:\tlearn: 0.7283882\ttest: 0.6490796\tbest: 0.6493284 (27561)\ttotal: 53m 23s\tremaining: 15h 13m 46s\n", 1356 | "27700:\tlearn: 0.7286928\ttest: 0.6491045\tbest: 0.6493284 (27561)\ttotal: 53m 34s\tremaining: 15h 13m 33s\n", 1357 | "27800:\tlearn: 0.7289410\ttest: 0.6491294\tbest: 0.6493284 (27561)\ttotal: 53m 46s\tremaining: 15h 13m 15s\n", 1358 | "27900:\tlearn: 0.7291705\ttest: 0.6490050\tbest: 0.6493284 (27561)\ttotal: 53m 57s\tremaining: 15h 12m 56s\n", 1359 | "28000:\tlearn: 0.7294411\ttest: 0.6491542\tbest: 0.6493284 (27561)\ttotal: 54m 8s\tremaining: 15h 12m 42s\n", 1360 | "28100:\tlearn: 0.7296929\ttest: 0.6490796\tbest: 0.6493284 (27561)\ttotal: 54m 20s\tremaining: 15h 12m 29s\n", 1361 | "28200:\tlearn: 0.7299431\ttest: 0.6490796\tbest: 0.6493284 (27561)\ttotal: 54m 31s\tremaining: 15h 12m 14s\n", 1362 | "28300:\tlearn: 0.7301838\ttest: 0.6491294\tbest: 0.6493284 (27561)\ttotal: 54m 43s\tremaining: 15h 11m 59s\n", 1363 | "28400:\tlearn: 0.7304752\ttest: 0.6492289\tbest: 0.6493532 (28374)\ttotal: 54m 54s\tremaining: 15h 11m 45s\n", 1364 | "28500:\tlearn: 0.7307295\ttest: 0.6490796\tbest: 0.6493532 (28374)\ttotal: 55m 6s\tremaining: 15h 11m 33s\n", 1365 | "28600:\tlearn: 0.7309894\ttest: 0.6491542\tbest: 0.6493532 (28374)\ttotal: 55m 17s\tremaining: 15h 11m 16s\n", 1366 | "28700:\tlearn: 0.7312418\ttest: 0.6490796\tbest: 0.6493532 (28374)\ttotal: 55m 28s\tremaining: 15h 10m 59s\n", 1367 | "28800:\tlearn: 0.7315149\ttest: 0.6490050\tbest: 0.6493532 (28374)\ttotal: 55m 40s\tremaining: 15h 10m 47s\n", 1368 | "28900:\tlearn: 0.7317144\ttest: 
0.6490299\tbest: 0.6493532 (28374)\ttotal: 55m 51s\tremaining: 15h 10m 31s\n", 1369 | "29000:\tlearn: 0.7319758\ttest: 0.6492537\tbest: 0.6493532 (28374)\ttotal: 56m 3s\tremaining: 15h 10m 17s\n", 1370 | "29100:\tlearn: 0.7322358\ttest: 0.6490050\tbest: 0.6493532 (28374)\ttotal: 56m 14s\tremaining: 15h 10m 3s\n", 1371 | "29200:\tlearn: 0.7324906\ttest: 0.6490547\tbest: 0.6493532 (28374)\ttotal: 56m 25s\tremaining: 15h 9m 49s\n", 1372 | "29300:\tlearn: 0.7326962\ttest: 0.6491045\tbest: 0.6493532 (28374)\ttotal: 56m 37s\tremaining: 15h 9m 33s\n", 1373 | "29400:\tlearn: 0.7329541\ttest: 0.6490299\tbest: 0.6493532 (28374)\ttotal: 56m 48s\tremaining: 15h 9m 18s\n", 1374 | "29500:\tlearn: 0.7331973\ttest: 0.6488557\tbest: 0.6493532 (28374)\ttotal: 57m\tremaining: 15h 9m 6s\n", 1375 | "29600:\tlearn: 0.7334415\ttest: 0.6491542\tbest: 0.6493532 (28374)\ttotal: 57m 11s\tremaining: 15h 8m 54s\n", 1376 | "29700:\tlearn: 0.7336821\ttest: 0.6490050\tbest: 0.6493532 (28374)\ttotal: 57m 23s\tremaining: 15h 8m 38s\n", 1377 | "29800:\tlearn: 0.7339177\ttest: 0.6490299\tbest: 0.6493532 (28374)\ttotal: 57m 34s\tremaining: 15h 8m 23s\n", 1378 | "29900:\tlearn: 0.7341756\ttest: 0.6491045\tbest: 0.6493532 (28374)\ttotal: 57m 45s\tremaining: 15h 8m 5s\n", 1379 | "30000:\tlearn: 0.7343710\ttest: 0.6490299\tbest: 0.6493532 (28374)\ttotal: 57m 56s\tremaining: 15h 7m 49s\n", 1380 | "30100:\tlearn: 0.7346142\ttest: 0.6491294\tbest: 0.6493532 (28374)\ttotal: 58m 8s\tremaining: 15h 7m 37s\n", 1381 | "30200:\tlearn: 0.7348589\ttest: 0.6491791\tbest: 0.6493532 (28374)\ttotal: 58m 19s\tremaining: 15h 7m 20s\n", 1382 | "30300:\tlearn: 0.7351604\ttest: 0.6493284\tbest: 0.6493532 (28374)\ttotal: 58m 31s\tremaining: 15h 7m 10s\n", 1383 | "30400:\tlearn: 0.7353838\ttest: 0.6491542\tbest: 0.6494279 (30303)\ttotal: 58m 43s\tremaining: 15h 7m\n", 1384 | "30500:\tlearn: 0.7356275\ttest: 0.6491045\tbest: 0.6494279 (30303)\ttotal: 58m 54s\tremaining: 15h 6m 48s\n", 1385 | "30600:\tlearn: 0.7358955\ttest: 
0.6492537\tbest: 0.6494279 (30303)\ttotal: 59m 6s\tremaining: 15h 6m 35s\n", 1386 | "30700:\tlearn: 0.7361092\ttest: 0.6493284\tbest: 0.6494279 (30303)\ttotal: 59m 17s\tremaining: 15h 6m 20s\n", 1387 | "30800:\tlearn: 0.7363626\ttest: 0.6492537\tbest: 0.6495274 (30757)\ttotal: 59m 29s\tremaining: 15h 6m 13s\n", 1388 | "30900:\tlearn: 0.7366321\ttest: 0.6491045\tbest: 0.6495274 (30757)\ttotal: 59m 40s\tremaining: 15h 5m 59s\n", 1389 | "31000:\tlearn: 0.7368616\ttest: 0.6491791\tbest: 0.6495274 (30757)\ttotal: 59m 52s\tremaining: 15h 5m 48s\n", 1390 | "31100:\tlearn: 0.7371246\ttest: 0.6491791\tbest: 0.6495274 (30757)\ttotal: 1h 4s\tremaining: 15h 5m 38s\n", 1391 | "31200:\tlearn: 0.7373779\ttest: 0.6492040\tbest: 0.6495274 (30757)\ttotal: 1h 15s\tremaining: 15h 5m 26s\n", 1392 | "31300:\tlearn: 0.7376363\ttest: 0.6489055\tbest: 0.6495274 (30757)\ttotal: 1h 27s\tremaining: 15h 5m 15s\n", 1393 | "31400:\tlearn: 0.7378749\ttest: 0.6487562\tbest: 0.6495274 (30757)\ttotal: 1h 39s\tremaining: 15h 5m 7s\n", 1394 | "31500:\tlearn: 0.7381252\ttest: 0.6489303\tbest: 0.6495274 (30757)\ttotal: 1h 50s\tremaining: 15h 4m 58s\n", 1395 | "31600:\tlearn: 0.7383943\ttest: 0.6487562\tbest: 0.6495274 (30757)\ttotal: 1h 1m 2s\tremaining: 15h 4m 44s\n", 1396 | "31700:\tlearn: 0.7386278\ttest: 0.6486318\tbest: 0.6495274 (30757)\ttotal: 1h 1m 13s\tremaining: 15h 4m 32s\n", 1397 | "31800:\tlearn: 0.7388618\ttest: 0.6487811\tbest: 0.6495274 (30757)\ttotal: 1h 1m 25s\tremaining: 15h 4m 18s\n", 1398 | "31900:\tlearn: 0.7391527\ttest: 0.6489303\tbest: 0.6495274 (30757)\ttotal: 1h 1m 36s\tremaining: 15h 4m 7s\n", 1399 | "32000:\tlearn: 0.7393751\ttest: 0.6489303\tbest: 0.6495274 (30757)\ttotal: 1h 1m 48s\tremaining: 15h 3m 55s\n", 1400 | "32100:\tlearn: 0.7396142\ttest: 0.6490299\tbest: 0.6495274 (30757)\ttotal: 1h 1m 59s\tremaining: 15h 3m 41s\n", 1401 | "32200:\tlearn: 0.7398761\ttest: 0.6490796\tbest: 0.6495274 (30757)\ttotal: 1h 2m 11s\tremaining: 15h 3m 29s\n", 1402 | "32300:\tlearn: 
0.7401269\ttest: 0.6492040\tbest: 0.6495274 (30757)\ttotal: 1h 2m 22s\tremaining: 15h 3m 15s\n", 1403 | "32400:\tlearn: 0.7403726\ttest: 0.6492040\tbest: 0.6495274 (30757)\ttotal: 1h 2m 34s\tremaining: 15h 3m 5s\n", 1404 | "32500:\tlearn: 0.7406199\ttest: 0.6490796\tbest: 0.6495274 (30757)\ttotal: 1h 2m 45s\tremaining: 15h 2m 50s\n", 1405 | "32600:\tlearn: 0.7408498\ttest: 0.6490547\tbest: 0.6495274 (30757)\ttotal: 1h 2m 57s\tremaining: 15h 2m 41s\n", 1406 | "32700:\tlearn: 0.7411270\ttest: 0.6492537\tbest: 0.6495274 (30757)\ttotal: 1h 3m 9s\tremaining: 15h 2m 30s\n" 1407 | ] 1408 | }, 1409 | { 1410 | "name": "stdout", 1411 | "output_type": "stream", 1412 | "text": [ 1413 | "32800:\tlearn: 0.7413702\ttest: 0.6490547\tbest: 0.6495274 (30757)\ttotal: 1h 3m 22s\tremaining: 15h 2m 36s\n", 1414 | "32900:\tlearn: 0.7416164\ttest: 0.6491294\tbest: 0.6495274 (30757)\ttotal: 1h 3m 33s\tremaining: 15h 2m 24s\n", 1415 | "33000:\tlearn: 0.7418895\ttest: 0.6491542\tbest: 0.6495274 (30757)\ttotal: 1h 3m 45s\tremaining: 15h 2m 14s\n", 1416 | "33100:\tlearn: 0.7421266\ttest: 0.6491542\tbest: 0.6495274 (30757)\ttotal: 1h 3m 57s\tremaining: 15h 2m 7s\n", 1417 | "33200:\tlearn: 0.7423942\ttest: 0.6490547\tbest: 0.6495274 (30757)\ttotal: 1h 4m 8s\tremaining: 15h 1m 55s\n", 1418 | "33300:\tlearn: 0.7426490\ttest: 0.6491294\tbest: 0.6495274 (30757)\ttotal: 1h 4m 20s\tremaining: 15h 1m 44s\n", 1419 | "33400:\tlearn: 0.7428972\ttest: 0.6491542\tbest: 0.6495274 (30757)\ttotal: 1h 4m 32s\tremaining: 15h 1m 35s\n", 1420 | "33500:\tlearn: 0.7431359\ttest: 0.6491045\tbest: 0.6495274 (30757)\ttotal: 1h 4m 44s\tremaining: 15h 1m 26s\n", 1421 | "33600:\tlearn: 0.7433760\ttest: 0.6494279\tbest: 0.6495274 (30757)\ttotal: 1h 4m 55s\tremaining: 15h 1m 13s\n", 1422 | "33700:\tlearn: 0.7436369\ttest: 0.6490547\tbest: 0.6495274 (30757)\ttotal: 1h 5m 7s\tremaining: 15h 1m 2s\n", 1423 | "33800:\tlearn: 0.7439151\ttest: 0.6489055\tbest: 0.6495274 (30757)\ttotal: 1h 5m 19s\tremaining: 15h 54s\n", 1424 | 
"33900:\tlearn: 0.7441354\ttest: 0.6490796\tbest: 0.6495274 (30757)\ttotal: 1h 5m 30s\tremaining: 15h 41s\n", 1425 | "34000:\tlearn: 0.7443837\ttest: 0.6491045\tbest: 0.6495274 (30757)\ttotal: 1h 5m 42s\tremaining: 15h 27s\n", 1426 | "34100:\tlearn: 0.7446487\ttest: 0.6492040\tbest: 0.6495274 (30757)\ttotal: 1h 5m 54s\tremaining: 15h 20s\n", 1427 | "34200:\tlearn: 0.7449147\ttest: 0.6492786\tbest: 0.6495274 (30757)\ttotal: 1h 6m 5s\tremaining: 15h 12s\n", 1428 | "34300:\tlearn: 0.7451513\ttest: 0.6493781\tbest: 0.6495274 (30757)\ttotal: 1h 6m 17s\tremaining: 14h 59m 59s\n", 1429 | "34400:\tlearn: 0.7454016\ttest: 0.6492786\tbest: 0.6495274 (30757)\ttotal: 1h 6m 29s\tremaining: 14h 59m 50s\n", 1430 | "34500:\tlearn: 0.7456590\ttest: 0.6492289\tbest: 0.6495274 (30757)\ttotal: 1h 6m 40s\tremaining: 14h 59m 37s\n", 1431 | "34600:\tlearn: 0.7459478\ttest: 0.6492786\tbest: 0.6495274 (30757)\ttotal: 1h 6m 52s\tremaining: 14h 59m 25s\n", 1432 | "34700:\tlearn: 0.7461676\ttest: 0.6491791\tbest: 0.6495274 (30757)\ttotal: 1h 7m 3s\tremaining: 14h 59m 13s\n", 1433 | "34800:\tlearn: 0.7464265\ttest: 0.6493781\tbest: 0.6495274 (30757)\ttotal: 1h 7m 15s\tremaining: 14h 59m 4s\n", 1434 | "34900:\tlearn: 0.7466773\ttest: 0.6492786\tbest: 0.6495274 (30757)\ttotal: 1h 7m 27s\tremaining: 14h 58m 53s\n", 1435 | "35000:\tlearn: 0.7469012\ttest: 0.6492289\tbest: 0.6495274 (30757)\ttotal: 1h 7m 38s\tremaining: 14h 58m 38s\n", 1436 | "35100:\tlearn: 0.7471596\ttest: 0.6492786\tbest: 0.6495274 (30757)\ttotal: 1h 7m 49s\tremaining: 14h 58m 24s\n", 1437 | "35200:\tlearn: 0.7473881\ttest: 0.6495274\tbest: 0.6495771 (35192)\ttotal: 1h 8m 1s\tremaining: 14h 58m 11s\n", 1438 | "35300:\tlearn: 0.7476739\ttest: 0.6492289\tbest: 0.6496020 (35212)\ttotal: 1h 8m 13s\tremaining: 14h 58m 2s\n", 1439 | "35400:\tlearn: 0.7479089\ttest: 0.6493035\tbest: 0.6496020 (35212)\ttotal: 1h 8m 25s\tremaining: 14h 57m 53s\n", 1440 | "35500:\tlearn: 0.7481917\ttest: 0.6494279\tbest: 0.6496020 (35212)\ttotal: 1h 8m 
36s\tremaining: 14h 57m 45s\n" 1441 | ] 1442 | } 1443 | ], 1444 | "source": [ 1445 | "model_I = CatBoostClassifier(iterations=500000,\n", 1446 | " learning_rate=0.01,\n", 1447 | " eval_metric='Accuracy',\n", 1448 | " use_best_model=True,\n", 1449 | " random_seed=42,\n", 1450 | " logging_level='Verbose',\n", 1451 | " task_type='GPU',\n", 1452 | " devices='0:1:2',\n", 1453 | " early_stopping_rounds=10000,\n", 1454 | " loss_function='MultiClass',\n", 1455 | " depth=9,\n", 1456 | " #gpu_ram_part=0.5,\n", 1457 | " )\n", 1458 | "model_I.fit(train_pool, eval_set=eval_pool, verbose=100) #0.6509 #0.6498" 1459 | ] 1460 | }, 1461 | { 1462 | "cell_type": "code", 1463 | "execution_count": null, 1464 | "metadata": {}, 1465 | "outputs": [], 1466 | "source": [ 1467 | "pred = model_I.predict(X_validation)" 1468 | ] 1469 | }, 1470 | { 1471 | "cell_type": "code", 1472 | "execution_count": null, 1473 | "metadata": {}, 1474 | "outputs": [], 1475 | "source": [] 1476 | }, 1477 | { 1478 | "cell_type": "code", 1479 | "execution_count": 109, 1480 | "metadata": {}, 1481 | "outputs": [], 1482 | "source": [ 1483 | "pred = model_I.predict(test_x)\n", 1484 | "pred = label_smoothing_re(pred)" 1485 | ] 1486 | }, 1487 | { 1488 | "cell_type": "code", 1489 | "execution_count": null, 1490 | "metadata": {}, 1491 | "outputs": [], 1492 | "source": [ 1493 | "#pred = model_I.predict(test_x)\n", 1494 | "result = pd.DataFrame()\n", 1495 | "result['id'] = data[test_index]['uId']\n", 1496 | "result['label'] = pred.astype(int)\n", 1497 | "result.to_csv('./out/submission.csv', index=False)\n", 1498 | "print('Save Done.')" 1499 | ] 1500 | }, 1501 | { 1502 | "cell_type": "code", 1503 | "execution_count": null, 1504 | "metadata": {}, 1505 | "outputs": [], 1506 | "source": [ 1507 | "pred_val = model_I.predict_proba(X_validation)\n", 1508 | "pred_test = model_I.predict_proba(test_x)\n", 1509 | "np.save(\"./out/proba_val_{}.npy\".format(round(model_I.best_score_['validation']['Accuracy'],5)), pred_val)\n", 1510 | 
"np.save(\"./out/proba_test_{}.npy\".format(round(model_I.best_score_['validation']['Accuracy'],5)), pred_test)" 1511 | ] 1512 | }, 1513 | { 1514 | "cell_type": "code", 1515 | "execution_count": null, 1516 | "metadata": {}, 1517 | "outputs": [], 1518 | "source": [ 1519 | "#特征&数据随机sample\n", 1520 | "import random\n", 1521 | "frac_axis1 = [0.7,0.75,0.8,0.85,0.9]\n", 1522 | "frac_axis0 = [0.9,0.85,0.8,0.75,0.7]\n", 1523 | "for index, i in enumerate(frac_axis1):\n", 1524 | " j = frac_axis0[index]\n", 1525 | " test_index = np.isnan(data.age_group)\n", 1526 | " train_index = ~test_index\n", 1527 | " train_x = data[train_index][feature] \n", 1528 | " train_y = data[train_index]['age_group']\n", 1529 | " test_x = data[test_index][feature]\n", 1530 | " rand = random.randint(0,2019)\n", 1531 | " train_x = train_x.sample(frac=i, replace=True, random_state=rand,axis=1)\n", 1532 | " test_x = test_x.sample(frac=i, replace=True, random_state=rand,axis=1)\n", 1533 | " from sklearn.model_selection import train_test_split\n", 1534 | " X_train, X_validation, y_train, y_validation = train_test_split(train_x, train_y, test_size=0.02, random_state=42)\n", 1535 | " del train_x\n", 1536 | " gc.collect()\n", 1537 | " rand = random.randint(0,2019)\n", 1538 | " X_train = X_train.sample(frac=j, replace=True, random_state=rand,axis=0)\n", 1539 | " y_train = y_train.sample(frac=j, replace=True, random_state=rand,axis=0)\n", 1540 | " train_pool = Pool(X_train, y_train)\n", 1541 | " eval_pool = Pool(X_validation, y_validation)\n", 1542 | " del X_train\n", 1543 | " #del X_validation\n", 1544 | " del y_train\n", 1545 | " del y_validation\n", 1546 | " gc.collect()\n", 1547 | " model = CatBoostClassifier(iterations=300000,\n", 1548 | " learning_rate=0.01,\n", 1549 | " eval_metric='Accuracy',\n", 1550 | " use_best_model=True,\n", 1551 | " random_seed=2019,\n", 1552 | " logging_level='Verbose',\n", 1553 | " task_type='GPU',\n", 1554 | " devices='0',\n", 1555 | " early_stopping_rounds=5000,\n", 1556 | " 
loss_function='MultiClass',\n", 1557 | " depth=8,\n", 1558 | " #gpu_ram_part=0.3,\n", 1559 | " )\n", 1560 | " model.fit(train_pool, eval_set=eval_pool, verbose=100) #0.6486\n", 1561 | " pred_val = model.predict_proba(X_validation)\n", 1562 | " pred_test = model.predict_proba(test_x)\n", 1563 | " np.save(\"./out/proba_val_{}.npy\".format(round(model.best_score_['validation']['Accuracy'],5)), pred_val)\n", 1564 | " np.save(\"./out/proba_test_{}.npy\".format(round(model.best_score_['validation']['Accuracy'],5)), pred_test)" 1565 | ] 1566 | }, 1567 | { 1568 | "cell_type": "code", 1569 | "execution_count": null, 1570 | "metadata": {}, 1571 | "outputs": [], 1572 | "source": [ 1573 | "#5-fold 交叉验证\n", 1574 | "from tqdm import tqdm_notebook\n", 1575 | "from sklearn.model_selection import StratifiedKFold\n", 1576 | "\n", 1577 | "skf = StratifiedKFold(n_splits=5,shuffle=True, random_state=42)\n", 1578 | "\n", 1579 | "cv_train = np.zeros((train_x.shape[0],6))\n", 1580 | "cv_test = np.zeros((test_x.shape[0],6))\n", 1581 | "cv_val = np.zeros((X_validation.shape[0],6))\n", 1582 | "\n", 1583 | "cate_features = ['city','prodName','color','ct','rom_category','color_short','carrier','gender','fontSize','os']\n", 1584 | "\n", 1585 | "for index,(train_idx,valid_idx) in tqdm_notebook(enumerate(skf.split(train_x, train_y))):\n", 1586 | " print(\"Fold_{}_started\".format(index))\n", 1587 | " X_KFold_train, y_KFold_train, X_KFold_valid, y_KFold_valid = train_x.iloc[train_idx], train_y.iloc[train_idx], train_x.iloc[valid_idx],train_y.iloc[valid_idx]\n", 1588 | " train_pool = Pool(X_KFold_train, y_KFold_train, cat_features=cate_features)\n", 1589 | " valid_pool = Pool(X_KFold_valid, y_KFold_valid, cat_features=cate_features)\n", 1590 | "\n", 1591 | " del X_KFold_train\n", 1592 | " del y_KFold_train\n", 1593 | " gc.collect()\n", 1594 | " model = CatBoostClassifier(iterations=500000,\n", 1595 | " learning_rate=0.01,\n", 1596 | " eval_metric='Accuracy',\n", 1597 | " use_best_model=True,\n",
1598 | " random_seed=47,\n", 1599 | " logging_level='Verbose',\n", 1600 | " task_type='GPU',\n", 1601 | " devices='0:1',\n", 1602 | " early_stopping_rounds=10000,\n", 1603 | " loss_function='MultiClass',\n", 1604 | " depth=8\n", 1605 | " )\n", 1606 | " model.fit(train_pool, eval_set=valid_pool, verbose=100) #0.648\n", 1607 | " cv_train[valid_idx] = model.predict_proba(X_KFold_valid)\n", 1608 | " cv_val += model.predict_proba(X_validation)/5\n", 1609 | " cv_test += model.predict_proba(test_x)/5\n", 1610 | " print(\"Fold_{}_result_saved\".format(index))\n", 1611 | " np.save(\"./out/proba_val_{}.npy\".format(round(model.best_score_['validation']['Accuracy'],5)), model.predict_proba(X_validation))\n", 1612 | " np.save(\"./out/proba_test_{}.npy\".format(round(model.best_score_['validation']['Accuracy'],5)), model.predict_proba(test_x))" 1613 | ] 1614 | }, 1615 | { 1616 | "cell_type": "code", 1617 | "execution_count": null, 1618 | "metadata": {}, 1619 | "outputs": [], 1620 | "source": [ 1621 | "#5-fold 交叉验证\n", 1622 | "from tqdm import tqdm_notebook\n", 1623 | "from sklearn.model_selection import StratifiedKFold\n", 1624 | "\n", 1625 | "skf = StratifiedKFold(n_splits=5,shuffle=True, random_state=42)\n", 1626 | "\n", 1627 | "\n", 1628 | "\n", 1629 | "cv_result = np.zeros((X_train.shape[0],6))\n", 1630 | "cv_test = np.zeros((test_x.shape[0],6))\n", 1631 | "cv_val = np.zeros((X_validation.shape[0],6))\n", 1632 | "cate_features = ['city','prodName','color','ct','rom_category','color_short','carrier','gender','fontSize','os']\n", 1633 | "eval_pool = Pool(X_validation, y_validation, cat_features=cate_features)\n", 1634 | "\n", 1635 | "for index,(train_idx,valid_idx) in tqdm_notebook(enumerate(skf.split(X_train, y_train))):\n", 1636 | " print(\"Fold_{}_started\".format(index))\n", 1637 | " X_KFold_train, y_KFold_train, X_KFold_valid = X_train.iloc[train_idx], y_train.iloc[train_idx], X_train.iloc[valid_idx]\n", 1638 | " train_pool = Pool(X_KFold_train, y_KFold_train, cat_features=cate_features)\n", 1639 | 
del X_KFold_train\n", 1640 | " del y_KFold_train\n", 1641 | " gc.collect()\n", 1642 | " model = CatBoostClassifier(iterations=500000,\n", 1643 | " learning_rate=0.01,\n", 1644 | " eval_metric='Accuracy',\n", 1645 | " use_best_model=True,\n", 1646 | " random_seed=47,\n", 1647 | " logging_level='Verbose',\n", 1648 | " task_type='GPU',\n", 1649 | " devices='0:1',\n", 1650 | " early_stopping_rounds=10000,\n", 1651 | " loss_function='MultiClass',\n", 1652 | " depth=8\n", 1653 | " )\n", 1654 | " model.fit(train_pool, eval_set=eval_pool, verbose=1000) #0.648\n", 1655 | " cv_result[valid_idx] = model.predict_proba(X_KFold_valid)\n", 1656 | " cv_val += model.predict_proba(X_validation)/5\n", 1657 | " cv_test += model.predict_proba(test_x)/5\n", 1658 | " print(\"Fold_{}_result_saved\".format(index))" 1659 | ] 1660 | } 1661 | ], 1662 | "metadata": { 1663 | "kernelspec": { 1664 | "display_name": "Python 3", 1665 | "language": "python", 1666 | "name": "python3" 1667 | }, 1668 | "language_info": { 1669 | "codemirror_mode": { 1670 | "name": "ipython", 1671 | "version": 3 1672 | }, 1673 | "file_extension": ".py", 1674 | "mimetype": "text/x-python", 1675 | "name": "python", 1676 | "nbconvert_exporter": "python", 1677 | "pygments_lexer": "ipython3", 1678 | "version": "3.6.7" 1679 | } 1680 | }, 1681 | "nbformat": 4, 1682 | "nbformat_minor": 2 1683 | } 1684 | --------------------------------------------------------------------------------