├── README.md
├── .gitignore
└── baseline.ipynb
/README.md:
--------------------------------------------------------------------------------
1 | # 2019-datacastle-enbrands
2 | Baseline for the 2019 DataCastle Data Intelligence Algorithm Competition.
3 | Offline score is roughly 0.39; online score is roughly 94.
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | 
6 | # C extensions
7 | *.so
8 | 
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | MANIFEST
27 | 
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 | 
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 | 
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 | .pytest_cache/
49 | 
50 | # Translations
51 | *.mo
52 | *.pot
53 | 
54 | # Django stuff:
55 | *.log
56 | local_settings.py
57 | db.sqlite3
58 | 
59 | # Flask stuff:
60 | instance/
61 | .webassets-cache
62 | 
63 | # Scrapy stuff:
64 | .scrapy
65 | 
66 | # Sphinx documentation
67 | docs/_build/
68 | 
69 | # PyBuilder
70 | target/
71 | 
72 | # Jupyter Notebook
73 | .ipynb_checkpoints
74 | 
75 | # pyenv
76 | .python-version
77 | 
78 | # celery beat schedule file
79 | celerybeat-schedule
80 | 
81 | # SageMath parsed files
82 | *.sage.py
83 | 
84 | # Environments
85 | .env
86 | .venv
87 | env/
88 | venv/
89 | ENV/
90 | env.bak/
91 | venv.bak/
92 | 
93 | # Spyder project settings
94 | .spyderproject
95 | .spyproject
96 | 
97 | # Rope project settings
98 | .ropeproject
99 | 
100 | # mkdocs documentation
101 | /site
102 | 
103 | # mypy
104 | .mypy_cache/
--------------------------------------------------------------------------------
/baseline.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "code",
5 |    "execution_count": 1,
6 |    "metadata": {},
7 |    "outputs": [],
8 |    "source": [
9 |     "import pandas as pd\n",
10 |     "import numpy as np\n",
11 |     "import lightgbm as lgb\n",
12 |     "from tqdm import tqdm, tqdm_notebook, tnrange\n",
13 |     "from sklearn.preprocessing import LabelEncoder,OneHotEncoder\n",
14 |     "from sklearn.model_selection import StratifiedKFold, train_test_split\n",
15 |     "from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer\n",
16 |     "from sklearn.decomposition import TruncatedSVD\n",
17 |     "from sklearn.preprocessing import PolynomialFeatures\n",
18 |     "from sklearn.metrics import roc_auc_score\n",
19 |     "from sklearn.linear_model import LogisticRegression,HuberRegressor\n",
20 |     "from sklearn.linear_model import SGDClassifier,PassiveAggressiveClassifier,RidgeClassifier\n",
21 |     "from sklearn.naive_bayes import BernoulliNB,MultinomialNB\n",
22 |     "from sklearn.svm import LinearSVC\n",
23 |     "from scipy.sparse import hstack,vstack\n",
24 |     "import datetime\n",
25 |     "import warnings\n",
26 |     "warnings.filterwarnings(\"ignore\")"
27 |    ]
28 |   },
29 |   {
30 |    "cell_type": "markdown",
31 |    "metadata": {},
32 |    "source": [
33 |     "# Data Loading"
34 |    ]
35 |   },
36 |   {
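   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A note on the setup: orders paid on or before 2013-07-03 supply the features, and any purchase from 2013-07-04 onward sets a customer's label to 1, so the task becomes binary repeat-purchase classification. A minimal sketch of that split, mirroring the cells below (`df` here is a hypothetical frame with a parsed `order_pay_time` column):\n",
    "\n",
    "```python\n",
    "cutoff = '2013-07-03'\n",
    "history = df[df['order_pay_time'].dt.date.astype(str) <= cutoff]  # feature window\n",
    "future = df[df['order_pay_time'].dt.date.astype(str) > cutoff]    # label window\n",
    "```"
   ]
  },
  {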
"cell_type": "code", 38 | "execution_count": 2, 39 | "metadata": {}, 40 | "outputs": [ 41 | { 42 | "data": { 43 | "text/plain": [ 44 | "Index(['order_detail_id', 'order_id', 'order_total_num', 'order_amount',\n", 45 | " 'order_total_payment', 'order_total_discount', 'order_pay_time',\n", 46 | " 'order_status', 'order_count', 'is_customer_rate',\n", 47 | " 'order_detail_status', 'order_detail_goods_num', 'order_detail_amount',\n", 48 | " 'order_detail_payment', 'order_detail_discount', 'customer_province',\n", 49 | " 'customer_city', 'member_id', 'customer_id', 'customer_gender',\n", 50 | " 'member_status', 'is_member_actived', 'goods_id', 'goods_price',\n", 51 | " 'goods_status', 'goods_has_discount', 'goods_list_time',\n", 52 | " 'goods_delist_time'],\n", 53 | " dtype='object')" 54 | ] 55 | }, 56 | "execution_count": 2, 57 | "metadata": {}, 58 | "output_type": "execute_result" 59 | } 60 | ], 61 | "source": [ 62 | "train = pd.read_csv('../data/round1_diac2019_train.csv', parse_dates=['order_pay_time','goods_list_time','goods_delist_time'])\n", 63 | "train.columns" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 3, 69 | "metadata": {}, 70 | "outputs": [ 71 | { 72 | "name": "stdout", 73 | "output_type": "stream", 74 | "text": [ 75 | "train_last shape: (539577, 28) train_label shape: (861254, 28) train_all shape: (1400831, 28)\n" 76 | ] 77 | } 78 | ], 79 | "source": [ 80 | "train_last = train[((train['order_pay_time'].dt.date).astype(str)<='2013-07-03')]\n", 81 | "train_label = train[(train['order_pay_time'].dt.date).astype(str)>='2013-07-04']\n", 82 | "\n", 83 | "train_all = train[((train['order_pay_time'].dt.date).astype(str)<='2013-12-31')]\n", 84 | "print('train_last shape:',train_last.shape,'train_label shape:',train_label.shape,'train_all shape:',train_all.shape)\n", 85 | "\n", 86 | "last_data = pd.DataFrame(train_last[['customer_id']]).drop_duplicates(['customer_id']).dropna()\n", 87 | "all_data = pd.DataFrame(train_all[['customer_id']]).drop_duplicates(['customer_id']).dropna()" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 4, 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "train_last['order_pay_month'] = train_last['order_pay_time'].dt.month\n", 97 | "train_last['order_pay_dayofweek'] = train_last['order_pay_time'].dt.dayofweek\n", 98 | "train_last['order_pay_day'] = train_last['order_pay_time'].dt.day\n", 99 | "\n", 100 | "train_all['order_pay_month'] = train_last['order_pay_time'].dt.month\n", 101 | "train_all['order_pay_dayofweek'] = train_last['order_pay_time'].dt.dayofweek\n", 102 | "train_all['order_pay_day'] = train_last['order_pay_time'].dt.day" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": {}, 108 | "source": [ 109 | "# Feature Engineering " 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 5, 115 | "metadata": {}, 116 | "outputs": [ 117 | { 118 | "name": "stdout", 119 | "output_type": "stream", 120 | "text": [ 121 | "CPU times: user 10.5 s, sys: 1.79 s, total: 12.3 s\n", 122 | "Wall time: 12.3 s\n" 123 | ] 124 | } 125 | ], 126 | "source": [ 127 | "%%time\n", 128 | "for idx,data in enumerate([train_last,train_all]):\n", 129 | " customer_all = pd.DataFrame(data[['customer_id']]).drop_duplicates(['customer_id']).dropna()\n", 130 | " data = data.sort_values(by=['customer_id','order_pay_time'])\n", 131 | "\n", 132 | " data['count'] = 1\n", 133 | " tmp = data.groupby(['customer_id'])['count'].agg({'customer_counts':'count'}).reset_index()\n", 134 | " 
customer_all = customer_all.merge(tmp,on=['customer_id'],how='left')\n", 135 | "\n", 136 | " tmp = data.groupby(['customer_id'])['customer_province'].last().reset_index()\n", 137 | " customer_all = customer_all.merge(tmp,on=['customer_id'],how='left')\n", 138 | "\n", 139 | " tmp = data.groupby(['customer_id'])['customer_city'].last().reset_index()\n", 140 | " customer_all = customer_all.merge(tmp,on=['customer_id'],how='left')\n", 141 | "\n", 142 | " tmp = data.groupby(['customer_id'])['member_status'].last().reset_index()\n", 143 | " customer_all = customer_all.merge(tmp,on=['customer_id'],how='left')\n", 144 | "\n", 145 | " tmp = data.groupby(['customer_id'])['is_member_actived'].last().reset_index()\n", 146 | " customer_all = customer_all.merge(tmp,on=['customer_id'],how='left')\n", 147 | " \n", 148 | " data['count'] = 1\n", 149 | " tmp = data[data['is_customer_rate']==0].groupby(['customer_id'])['count'].agg({'is_customer_rate_0':'count'}).reset_index()\n", 150 | " customer_all = customer_all.merge(tmp,on=['customer_id'],how='left')\n", 151 | "\n", 152 | " data['count'] = 1\n", 153 | " tmp = data[data['is_customer_rate']==1].groupby(['customer_id'])['count'].agg({'is_customer_rate_1':'count'}).reset_index()\n", 154 | " customer_all = customer_all.merge(tmp,on=['customer_id'],how='left') \n", 155 | " \n", 156 | " data['count'] = 1\n", 157 | " tmp = data[(data['is_member_actived']==1) & (data['goods_has_discount']==1)].groupby(['customer_id'])['count'].agg({'is_customer_have_discount_count':'count'}).reset_index()\n", 158 | " customer_all = customer_all.merge(tmp,on=['customer_id'],how='left')\n", 159 | " \n", 160 | " if idx == 0:\n", 161 | " last_data = last_data.merge(customer_all, on='customer_id', how='left')\n", 162 | " else:\n", 163 | " all_data = all_data.merge(customer_all, on='customer_id', how='left')" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": 6, 169 | "metadata": {}, 170 | "outputs": [ 171 | { 172 | "name": "stdout", 173 | "output_type": "stream", 174 | "text": [ 175 | "CPU times: user 5.22 s, sys: 768 ms, total: 5.99 s\n", 176 | "Wall time: 5.99 s\n" 177 | ] 178 | } 179 | ], 180 | "source": [ 181 | "%%time\n", 182 | "for idx,data in enumerate([train_last,train_all]):\n", 183 | " customer_all = pd.DataFrame(data[['customer_id']]).drop_duplicates(['customer_id']).dropna()\n", 184 | "\n", 185 | " tmp = data.groupby(['customer_id'],as_index=False)['goods_price'].agg({'goods_price_max':'max','goods_price_min':'min','goods_price_mean':'mean'})\n", 186 | " customer_all = customer_all.merge(tmp,on=['customer_id'],how='left')\n", 187 | "\n", 188 | " data['count'] = 1\n", 189 | " tmp = data[data['goods_has_discount']==1].groupby(['customer_id'])['count'].agg({'goods_has_discount_counts':'count'}).reset_index()\n", 190 | " customer_all = customer_all.merge(tmp,on=['customer_id'],how='left')\n", 191 | "\n", 192 | " data['count'] = 1\n", 193 | " tmp = data[data['goods_has_discount']==0].groupby(['customer_id'])['count'].agg({'goods_has_not_discount_counts':'count'}).reset_index()\n", 194 | " customer_all = customer_all.merge(tmp,on=['customer_id'],how='left')\n", 195 | "\n", 196 | " data['count'] = 1\n", 197 | " tmp = data[data['goods_status']==1].groupby(['customer_id'])['count'].agg({'goods_status_1':'count'}).reset_index()\n", 198 | " customer_all = customer_all.merge(tmp,on=['customer_id'],how='left')\n", 199 | "\n", 200 | " \n", 201 | " \n", 202 | " if idx == 0:\n", 203 | " last_data = last_data.merge(customer_all, on='customer_id', how='left')\n", 
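    "    # idx 0 holds the pre-cutoff window (train_last); idx 1 holds the full year (train_all)\n",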
204 | " else:\n", 205 | " all_data = all_data.merge(customer_all, on='customer_id', how='left')" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": 7, 211 | "metadata": {}, 212 | "outputs": [ 213 | { 214 | "name": "stdout", 215 | "output_type": "stream", 216 | "text": [ 217 | "CPU times: user 7.11 s, sys: 536 ms, total: 7.65 s\n", 218 | "Wall time: 8.87 s\n" 219 | ] 220 | } 221 | ], 222 | "source": [ 223 | "%%time\n", 224 | "for idx,data in enumerate([train_last,train_all]):\n", 225 | " customer_all = pd.DataFrame(data[['customer_id']]).drop_duplicates(['customer_id']).dropna()\n", 226 | "\n", 227 | " tmp = data.groupby(['customer_id'])['order_amount'].agg({'order_amount_sum':'sum'})\n", 228 | " customer_all = customer_all.merge(tmp,on=['customer_id'],how='left') \n", 229 | " \n", 230 | " tmp = data.groupby(['customer_id'])['order_total_payment'].agg({'order_total_payment_sum':'sum','order_total_payment_count':'count'})\n", 231 | " customer_all = customer_all.merge(tmp,on=['customer_id'],how='left')\n", 232 | "\n", 233 | " tmp = data.groupby(['customer_id'])['order_total_discount'].agg({'order_total_discount_sum':'sum'})\n", 234 | " customer_all = customer_all.merge(tmp,on=['customer_id'],how='left') \n", 235 | "\n", 236 | " tmp = data.groupby(['customer_id'])['order_status'].agg({'order_status_max':'max'})\n", 237 | " customer_all = customer_all.merge(tmp,on=['customer_id'],how='left') \n", 238 | " \n", 239 | " data['count'] = 1\n", 240 | " tmp = data[data['goods_status']==2].groupby(['customer_id'])['count'].agg({'goods_status_2':'count'}).reset_index()\n", 241 | " customer_all = customer_all.merge(tmp,on=['customer_id'],how='left') \n", 242 | " \n", 243 | " \n", 244 | "\n", 245 | " if idx == 0:\n", 246 | " last_data = last_data.merge(customer_all, on='customer_id', how='left')\n", 247 | " else:\n", 248 | " all_data = all_data.merge(customer_all, on='customer_id', how='left')" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": 8, 254 | "metadata": {}, 255 | "outputs": [ 256 | { 257 | "name": "stdout", 258 | "output_type": "stream", 259 | "text": [ 260 | "CPU times: user 6 s, sys: 216 ms, total: 6.22 s\n", 261 | "Wall time: 6.21 s\n" 262 | ] 263 | } 264 | ], 265 | "source": [ 266 | "%%time\n", 267 | "for idx,data in enumerate([train_last,train_all]):\n", 268 | " customer_all = pd.DataFrame(data[['customer_id']]).drop_duplicates(['customer_id']).dropna()\n", 269 | "\n", 270 | " tmp = data.groupby(['customer_id'])['order_detail_amount'].agg({'order_detail_amount_sum':'sum'})\n", 271 | " customer_all = customer_all.merge(tmp,on=['customer_id'],how='left') \n", 272 | " \n", 273 | " tmp = data.groupby(['customer_id'])['order_detail_payment'].agg({'order_detail_payment_sum':'sum','order_detail_payment_count':'count'})\n", 274 | " customer_all = customer_all.merge(tmp,on=['customer_id'],how='left')\n", 275 | "\n", 276 | " tmp = data.groupby(['customer_id'])['order_detail_discount'].agg({'order_detail_discount_sum':'sum'})\n", 277 | " customer_all = customer_all.merge(tmp,on=['customer_id'],how='left') \n", 278 | "\n", 279 | " tmp = data.groupby(['customer_id'])['order_detail_status'].agg({'order_detail_status_max':'max'})\n", 280 | " customer_all = customer_all.merge(tmp,on=['customer_id'],how='left') \n", 281 | " \n", 282 | " \n", 283 | " if idx == 0:\n", 284 | " last_data = last_data.merge(customer_all, on='customer_id', how='left')\n", 285 | " else:\n", 286 | " all_data = all_data.merge(customer_all, on='customer_id', how='left')" 287 | ] 
288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": 14, 292 | "metadata": {}, 293 | "outputs": [ 294 | { 295 | "name": "stdout", 296 | "output_type": "stream", 297 | "text": [ 298 | "CPU times: user 17min 53s, sys: 40 s, total: 18min 33s\n", 299 | "Wall time: 5min 22s\n" 300 | ] 301 | } 302 | ], 303 | "source": [ 304 | "# %%time\n", 305 | "# for idx,data in enumerate([train_last,train_all]):\n", 306 | "# customer_all = pd.DataFrame(data[['customer_id']]).drop_duplicates(['customer_id']).dropna()\n", 307 | " \n", 308 | "# tmp = data.groupby(['customer_id'])['goods_id'].apply(lambda x:','.join(x.astype(str))).reset_index()\n", 309 | "# tmp.columns = ['customer_id','customer_goods_ids']\n", 310 | "# customer_all = customer_all.merge(tmp, on='customer_id', how='left')\n", 311 | " \n", 312 | "# X_seller = TfidfVectorizer(token_pattern='[0-9]+',binary=True).fit_transform(customer_all['customer_goods_ids'].fillna('0'))\n", 313 | "# seller_svd = TruncatedSVD(n_components=30,n_iter=30,random_state=2019).fit_transform(X_seller)\n", 314 | "# seller_svd_df = pd.DataFrame(seller_svd, columns=['customer_goods_svd_{}'.format(i) for i in range(1,31)])\n", 315 | "# customer_all = pd.concat([customer_all,seller_svd_df], axis=1)\n", 316 | "\n", 317 | "# if idx == 0:\n", 318 | "# last_data = last_data.merge(customer_all, on='customer_id', how='left')\n", 319 | "# else:\n", 320 | "# all_data = all_data.merge(customer_all, on='customer_id', how='left')" 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": 9, 326 | "metadata": {}, 327 | "outputs": [ 328 | { 329 | "name": "stdout", 330 | "output_type": "stream", 331 | "text": [ 332 | "CPU times: user 2.75 s, sys: 352 ms, total: 3.1 s\n", 333 | "Wall time: 3.1 s\n" 334 | ] 335 | } 336 | ], 337 | "source": [ 338 | "%%time\n", 339 | "for idx,data in enumerate([train_last,train_all]):\n", 340 | " customer_all = pd.DataFrame(data[['customer_id']]).drop_duplicates(['customer_id']).dropna()\n", 341 | " data['order_pay_dayofyear'] = data['order_pay_time'].dt.dayofyear\n", 342 | "\n", 343 | " tmp = data.groupby(['customer_id'])['order_pay_dayofyear'].agg({'order_pay_dayofyear_max':'max','order_pay_dayofyear_min':'min'})\n", 344 | " customer_all = customer_all.merge(tmp,on=['customer_id'],how='left') \n", 345 | "\n", 346 | " if idx == 0:\n", 347 | " last_data = last_data.merge(customer_all, on='customer_id', how='left')\n", 348 | " else:\n", 349 | " all_data = all_data.merge(customer_all, on='customer_id', how='left')" 350 | ] 351 | }, 352 | { 353 | "cell_type": "code", 354 | "execution_count": 16, 355 | "metadata": {}, 356 | "outputs": [ 357 | { 358 | "data": { 359 | "text/plain": [ 360 | "Index(['order_detail_id', 'order_id', 'order_total_num', 'order_amount',\n", 361 | " 'order_total_payment', 'order_total_discount', 'order_pay_time',\n", 362 | " 'order_status', 'order_count', 'is_customer_rate',\n", 363 | " 'order_detail_status', 'order_detail_goods_num', 'order_detail_amount',\n", 364 | " 'order_detail_payment', 'order_detail_discount', 'customer_province',\n", 365 | " 'customer_city', 'member_id', 'customer_id', 'customer_gender',\n", 366 | " 'member_status', 'is_member_actived', 'goods_id', 'goods_price',\n", 367 | " 'goods_status', 'goods_has_discount', 'goods_list_time',\n", 368 | " 'goods_delist_time', 'order_pay_month', 'order_pay_dayofweek',\n", 369 | " 'order_pay_day', 'count'],\n", 370 | " dtype='object')" 371 | ] 372 | }, 373 | "execution_count": 16, 374 | "metadata": {}, 375 | "output_type": "execute_result" 376 | } 
377 |    ],
378 |    "source": [
379 |     "train_last.columns"
380 |    ]
381 |   },
382 |   {
383 |    "cell_type": "code",
384 |    "execution_count": 10,
385 |    "metadata": {
386 |     "scrolled": true
387 |    },
388 |    "outputs": [],
389 |    "source": [
390 |     "for col in ['customer_city', 'customer_province']:  # one shared encoder per column keeps codes consistent across both frames\n",
391 |     "    le = LabelEncoder().fit(pd.concat([last_data[col], all_data[col]]).fillna('None'))\n",
392 |     "    last_data[col], all_data[col] = le.transform(last_data[col].fillna('None')), le.transform(all_data[col].fillna('None'))"
393 |    ]
394 |   },
395 |   {
396 |    "cell_type": "code",
397 |    "execution_count": 11,
398 |    "metadata": {},
399 |    "outputs": [],
400 |    "source": [
401 |     "def generate_label(data, label):\n",
402 |     "    data['label'] = 0\n",
403 |     "    valid_idx_list = list(label['customer_id'].unique())\n",
404 |     "    data.loc[data['customer_id'].isin(valid_idx_list), 'label'] = 1\n",
405 |     "\n",
406 |     "    return data\n",
407 |     "\n",
408 |     "last_data = generate_label(last_data, train_label)"
409 |    ]
410 |   },
411 |   {
412 |    "cell_type": "code",
413 |    "execution_count": 12,
414 |    "metadata": {},
415 |    "outputs": [],
416 |    "source": [
417 |     "last_data['order_pay_dayofyear_gap'] = last_data['order_pay_dayofyear_max'] - last_data['order_pay_dayofyear_min']\n",
418 |     "all_data['order_pay_dayofyear_gap'] = all_data['order_pay_dayofyear_max'] - all_data['order_pay_dayofyear_min']"
419 |    ]
420 |   },
421 |   {
422 |    "cell_type": "markdown",
423 |    "metadata": {},
424 |    "source": [
425 |     "# Model"
426 |    ]
427 |   },
428 |   {
429 |    "cell_type": "code",
430 |    "execution_count": 58,
431 |    "metadata": {},
432 |    "outputs": [],
433 |    "source": [
434 |     "# skipped: the feature list below still uses order_pay_dayofyear_max/min, so dropping them would break feature selection\n",
435 |     "# last_data.drop(['order_pay_dayofyear_max','order_pay_dayofyear_min'], axis=1, inplace=True); all_data.drop(['order_pay_dayofyear_max','order_pay_dayofyear_min'], axis=1, inplace=True)"
436 |    ]
437 |   },
438 |   {
439 |    "cell_type": "code",
440 |    "execution_count": 13,
441 |    "metadata": {},
442 |    "outputs": [
443 |     {
444 |      "name": "stdout",
445 |      "output_type": "stream",
446 |      "text": [
447 |       "Feature Length: 28  Data prepared......\n"
448 |      ]
449 |     }
450 |    ],
451 |    "source": [
452 |     "origin_feat = ['customer_counts','goods_price_max', 'goods_price_min', 'goods_price_mean','member_status','is_member_actived',\n",
453 |     "               'customer_city','customer_province','goods_has_discount_counts','goods_has_not_discount_counts','goods_status_1',\n",
454 |     "               'goods_status_2','is_customer_rate_0','is_customer_rate_1','is_customer_have_discount_count']\n",
455 |     "\n",
456 |     "main_order_feat = ['order_total_payment_sum','order_total_payment_count','order_total_discount_sum',\n",
457 |     "                   'order_amount_sum','order_status_max','order_pay_dayofyear_max','order_pay_dayofyear_min',\n",
458 |     "                   'order_pay_dayofyear_gap']\n",
459 |     "\n",
460 |     "\n",
461 |     "\n",
462 |     "detail_order_feat = ['order_detail_payment_sum','order_detail_payment_count','order_detail_discount_sum',\n",
463 |     "                     'order_detail_amount_sum','order_detail_status_max']\n",
464 |     "\n",
465 |     "##########################################################################################\n",
466 |     "feature = origin_feat + main_order_feat + detail_order_feat\n",
467 |     "\n",
468 |     "X = last_data[feature]\n",
469 |     "y = last_data['label']\n",
470 |     "X_all = all_data[feature]\n",
471 |     "\n",
472 |     "X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.1, random_state=42,stratify=y)\n",
473 |     "print('Feature Length:',len(feature),' Data prepared......')"
474 |    ]
475 |   },
476 |   {
477 |    "cell_type": "code",
478 |    "execution_count": 17,
479 |    "metadata": {
480 |     "scrolled": true
481 |    },
482 |    "outputs": [
483 |     {
484 |      "name": "stdout",
485 |      "output_type": "stream",
486 |      "text": [
487 |       "Training until validation scores don't improve for 100 rounds.\n",
488 |       "[50]\tvalid_0's binary_logloss: 0.395892\n",
489 |       "[100]\tvalid_0's binary_logloss: 0.39609\n",
490 |       "[150]\tvalid_0's binary_logloss: 0.396465\n",
491 |       "Early stopping, best iteration is:\n",
492 |       "[64]\tvalid_0's binary_logloss: 0.395797\n"
493 |      ]
494 |     }
495 |    ],
496 |    "source": [
497 |     "def re_logloss(labels, preds):  # weighted logloss; you can try this eval metric for fun\n",
498 |     "    delta = 3.4  # extra weight on the positive class\n",
499 |     "    y_true = labels\n",
500 |     "    y_pred = preds\n",
501 |     "    p = np.clip(y_pred, 1e-10, 1-1e-10)\n",
502 |     "    loss = -1/len(y_true) * np.sum(y_true * np.log(p) * delta + (1 - y_true) * np.log(1-p))\n",
503 |     "    return 're_logloss', loss, False\n",
504 |     "\n",
505 |     "lgb_model = lgb.LGBMClassifier(boosting_type='gbdt', num_leaves=64, reg_alpha=0.1, reg_lambda=1.0,\n",
506 |     "                               max_depth=-1, n_estimators=10000, objective='binary', metrics='None', \n",
507 |     "                               bagging_fraction=0.8, is_unbalance=False, bagging_freq=5, min_child_samples=80, \n",
508 |     "                               feature_fraction=0.8, learning_rate=0.1, random_state=42, n_jobs=8,\n",
509 |     "                               )\n",
510 |     "\n",
511 |     "eval_set = [(X_valid, y_valid)]\n",
512 |     "lgb_model.fit(X_train, y_train, eval_set=eval_set, eval_metric='logloss', verbose=50, early_stopping_rounds=100)\n",
513 |     "pred = lgb_model.predict_proba(X_all)  # best validation binary_logloss around 0.3958"
514 |    ]
515 |   },
516 |   {
517 |    "cell_type": "code",
518 |    "execution_count": 16,
519 |    "metadata": {},
520 |    "outputs": [],
521 |    "source": [
522 |     "res = all_data[['customer_id']].copy()\n",
523 |     "res['result'] = pred[:,1]\n",
524 |     "\n",
525 |     "data = pd.DataFrame(train[['customer_id']]).drop_duplicates(['customer_id']).dropna()\n",
526 |     "data = (data.merge(res,on=['customer_id'],how='left')).sort_values(['customer_id'])\n",
527 |     "data['customer_id'] = data['customer_id'].astype('int64')\n",
528 |     "data['result'] = data['result'].fillna(0)\n",
529 |     "result = data[['customer_id','result']]\n",
530 |     "result.to_csv('../out/round1_diac2019_test.csv', index=False)"
531 |    ]
532 |   },
533 |   {
534 |    "cell_type": "code",
535 |    "execution_count": null,
536 |    "metadata": {},
537 |    "outputs": [],
538 |    "source": []
539 |   }
540 |  ],
541 |  "metadata": {
542 |   "kernelspec": {
543 |    "display_name": "Python 3",
544 |    "language": "python",
545 |    "name": "python3"
546 |   },
547 |   "language_info": {
548 |    "codemirror_mode": {
549 |     "name": "ipython",
550 |     "version": 3
551 |    },
552 |    "file_extension": ".py",
553 |    "mimetype": "text/x-python",
554 |    "name": "python",
555 |    "nbconvert_exporter": "python",
556 |    "pygments_lexer": "ipython3",
557 |    "version": "3.6.7"
558 |   }
559 |  },
560 |  "nbformat": 4,
561 |  "nbformat_minor": 2
562 | }
563 | 
--------------------------------------------------------------------------------