├── README.md └── code ├── TreeModel.ipynb └── feature_V3.ipynb /README.md: -------------------------------------------------------------------------------- 1 | ### 基于移动网络通讯行为的风险用户识别 2 | 3 | ######## 1. 2018-05-22 baseline 4 | 5 | ###### 因为这学期担任实训课TA,这个比赛是实训课的作业,所以做个baseline给师弟师妹们参考,这个baseline 线下是0.80,线上成绩是0.76~0.77 6 | ###### (所以LB上有一群DM+学号的,不要惊讶,不是小号,是一群可爱的小鲜肉) 7 | 8 | 9 | 10 | 11 | ######## 2. 2018-06-12 解决方案 (初赛 3th, 复赛 15th) 12 | 13 | ## 特征工程: 14 | ### 2.1 单变量数量统计特征: 15 | #### voice统计用户 记录数,用户 不同opp_num 记录数 16 | #### voice统计用户不同 opp_head记录 数 17 | #### voice统计用户 不同 opp_len 的记录数 18 | #### 统计用户不同 call_tyoe 记录数 19 | 20 | #### sms统计用户sms 不同opp_num记录数 21 | #### sms统计用户sms 不同opp_head 记录数 22 | #### sms统计用户 不同opp_len记录数 23 | #### sms不同in_out 记录数 24 | 25 | ### 2.2 多变量数目统计特征: 26 | #### voice统计用户不同 in_out 下 不同 opp_num记录数 27 | #### voice统计不同 opp_len 下 不同 opp_head 记录数 28 | #### voice统计不同 opp_len 下不同opp_head 的记录数 29 | #### voice统计不同call_type 下 不同opp_num 的记录数 30 | 31 | #### sms统计用户不同opp_len 中 不同opp_head的记录数 32 | 33 | #### sms不同in_out 下 不同opp_head 数 34 | 35 | ### 2.3 one-hot 类数目统计特征: 36 | #### voice对opp_num one-hot 统计记录数 37 | #### sms每天不同的opp_head 记录数 38 | #### sms opp_head one-hot 统计记录数 39 | #### sms每天的短信记录数 40 | #### wa top 1000 wa_name 分组统计记录数 41 | 42 | ### 2.4 时间统计量: 43 | #### voice通话时长统计量 44 | #### voice两次通话间隔统计量 45 | #### sms两次短信间隔统计量 46 | ## 模型: 47 | 只使用了lgb单模型,成绩是初赛第三(0.874),复赛14(0.864),应该是wa 特征 A,B榜数据分布不一致。 48 | 49 | 50 | ### 最近一堆事,趁着早上上班前,赶紧把方案开源了。 51 | -------------------------------------------------------------------------------- /code/TreeModel.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 45, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# -*- coding: utf-8 -*-\n", 10 | "import pandas as pd\n", 11 | "import numpy as np\n", 12 | "import lightgbm as lgb\n", 13 | "from sklearn import metrics\n", 14 | "import matplotlib.pyplot as plt\n", 
def getUseColumn(data):
    """Return the names of columns whose variance is non-zero and not NaN.

    Zero-variance columns carry no information and NaN-variance columns
    (e.g. all-NaN columns) cannot be used by the model, so both groups
    are excluded from the returned feature list.

    Note: the original code compared ``k == np.nan`` / ``k != np.nan``,
    which is always False / always True (NaN never compares equal), so
    NaN-variance columns were silently kept; ``isnull()``/``notnull()``
    perform the intended check.
    """
    d = copy.deepcopy(data)
    k = d.var()
    # Report the columns being discarded, for traceability.
    print(k[k.isnull() | (k == 0)].index.values)
    return k[k.notnull() & (k != 0)].index.values
def evalMetric(preds, dtrain):
    """Custom LightGBM eval function: 0.6 * AUC + 0.4 * F1.

    F1 is computed after flagging the top 18% highest-scoring samples as
    positive (the assumed positive rate of the competition data — TODO
    confirm against the label distribution).  Returns the
    ``(name, value, is_higher_better)`` triple LightGBM's ``feval`` expects.
    """
    label = dtrain.get_label()
    scores = np.asarray(preds)

    auc = metrics.roc_auc_score(label, scores)

    # Binarize by rank: top 18% -> 1, rest -> 0.  Assigning into a plain
    # numpy array avoids the pandas chained-assignment
    # (SettingWithCopyWarning) that the previous slice-assignment raised.
    cutoff = int(len(scores) * 0.18)
    order = np.argsort(-scores)
    hard = np.zeros(len(scores), dtype=int)
    hard[order[:cutoff]] = 1

    f1 = metrics.f1_score(label, hard)

    res = 0.6 * auc + 0.4 * f1
    return 'res', res, True
def f1(preds, dtrain):
    """LightGBM eval function returning the F1 score of a rank threshold.

    The top 18% highest-scoring samples are labelled positive and the
    rest negative (same rule as the combined metric), then F1 is computed
    against the true labels.  Returns ``(name, value, is_higher_better)``.
    """
    label = dtrain.get_label()
    scores = np.asarray(preds)

    # Rank-based binarization via numpy instead of slice-assignment on a
    # pandas Series, which produced SettingWithCopyWarning.
    cutoff = int(len(scores) * 0.18)
    order = np.argsort(-scores)
    hard = np.zeros(len(scores), dtype=int)
    hard[order[:cutoff]] = 1

    return 'f1', metrics.f1_score(label, hard), True
0.00715504\tcv_agg's auc: 0.932618 + 0.00675313\n", 225 | "[30]\tcv_agg's res: 0.852002 + 0.00669625\tcv_agg's auc: 0.935756 + 0.00522615\n", 226 | "[40]\tcv_agg's res: 0.854175 + 0.00600323\tcv_agg's auc: 0.937155 + 0.00419292\n", 227 | "[50]\tcv_agg's res: 0.858107 + 0.00557947\tcv_agg's auc: 0.94 + 0.00339179\n", 228 | "[60]\tcv_agg's res: 0.859881 + 0.00551541\tcv_agg's auc: 0.942957 + 0.00329419\n", 229 | "[70]\tcv_agg's res: 0.862413 + 0.00329366\tcv_agg's auc: 0.944953 + 0.00267238\n", 230 | "[80]\tcv_agg's res: 0.864363 + 0.00593388\tcv_agg's auc: 0.945976 + 0.00247281\n", 231 | "[90]\tcv_agg's res: 0.86661 + 0.00557191\tcv_agg's auc: 0.946755 + 0.00269796\n", 232 | "[100]\tcv_agg's res: 0.866009 + 0.00603467\tcv_agg's auc: 0.947237 + 0.00269985\n", 233 | "[110]\tcv_agg's res: 0.866101 + 0.00579519\tcv_agg's auc: 0.94739 + 0.00232926\n", 234 | "[120]\tcv_agg's res: 0.866436 + 0.00629509\tcv_agg's auc: 0.947948 + 0.00258401\n", 235 | "[130]\tcv_agg's res: 0.865773 + 0.00580196\tcv_agg's auc: 0.948326 + 0.00219183\n", 236 | "[140]\tcv_agg's res: 0.865451 + 0.00628025\tcv_agg's auc: 0.94853 + 0.00228953\n", 237 | "[150]\tcv_agg's res: 0.864684 + 0.0057199\tcv_agg's auc: 0.948735 + 0.0023252\n", 238 | "[160]\tcv_agg's res: 0.864202 + 0.00541687\tcv_agg's auc: 0.948674 + 0.00212464\n", 239 | "[170]\tcv_agg's res: 0.864634 + 0.00539378\tcv_agg's auc: 0.948653 + 0.0023119\n", 240 | "[180]\tcv_agg's res: 0.864379 + 0.00593571\tcv_agg's auc: 0.948969 + 0.00229604\n", 241 | "[190]\tcv_agg's res: 0.86443 + 0.00478318\tcv_agg's auc: 0.949055 + 0.00219332\n", 242 | "[200]\tcv_agg's res: 0.865009 + 0.00638168\tcv_agg's auc: 0.949277 + 0.00215146\n", 243 | "[210]\tcv_agg's res: 0.866869 + 0.00537734\tcv_agg's auc: 0.949413 + 0.00201847\n", 244 | "[220]\tcv_agg's res: 0.866487 + 0.00558959\tcv_agg's auc: 0.949518 + 0.00205708\n", 245 | "[230]\tcv_agg's res: 0.866593 + 0.00474557\tcv_agg's auc: 0.949694 + 0.00204757\n", 246 | "[240]\tcv_agg's res: 0.866269 + 
0.00464564\tcv_agg's auc: 0.949895 + 0.00200057\n", 247 | "[250]\tcv_agg's res: 0.86695 + 0.00464194\tcv_agg's auc: 0.950289 + 0.0018377\n", 248 | "[260]\tcv_agg's res: 0.866549 + 0.00505962\tcv_agg's auc: 0.950362 + 0.00177416\n", 249 | "[270]\tcv_agg's res: 0.867027 + 0.00539233\tcv_agg's auc: 0.950415 + 0.00173058\n", 250 | "[280]\tcv_agg's res: 0.866313 + 0.00654881\tcv_agg's auc: 0.950706 + 0.00158749\n", 251 | "[290]\tcv_agg's res: 0.867277 + 0.0062036\tcv_agg's auc: 0.950831 + 0.00156642\n", 252 | "[300]\tcv_agg's res: 0.866903 + 0.00583529\tcv_agg's auc: 0.95095 + 0.00143535\n", 253 | "[310]\tcv_agg's res: 0.866519 + 0.00643757\tcv_agg's auc: 0.951051 + 0.00138806\n", 254 | "[320]\tcv_agg's res: 0.867516 + 0.00631196\tcv_agg's auc: 0.95123 + 0.00156202\n", 255 | "[330]\tcv_agg's res: 0.868001 + 0.00756147\tcv_agg's auc: 0.951295 + 0.00158178\n", 256 | "[340]\tcv_agg's res: 0.868496 + 0.00782755\tcv_agg's auc: 0.951378 + 0.00161712\n", 257 | "[350]\tcv_agg's res: 0.868127 + 0.00853171\tcv_agg's auc: 0.951503 + 0.00174514\n", 258 | "[360]\tcv_agg's res: 0.869036 + 0.00822351\tcv_agg's auc: 0.951536 + 0.00179389\n", 259 | "[370]\tcv_agg's res: 0.869134 + 0.00825024\tcv_agg's auc: 0.951699 + 0.00183597\n", 260 | "[380]\tcv_agg's res: 0.869566 + 0.0076091\tcv_agg's auc: 0.951678 + 0.00181962\n", 261 | "[390]\tcv_agg's res: 0.869586 + 0.00765303\tcv_agg's auc: 0.951711 + 0.00190347\n", 262 | "[400]\tcv_agg's res: 0.869176 + 0.00739375\tcv_agg's auc: 0.951769 + 0.00194574\n", 263 | "[410]\tcv_agg's res: 0.869245 + 0.00829726\tcv_agg's auc: 0.951883 + 0.00193629\n", 264 | "[420]\tcv_agg's res: 0.869253 + 0.00836799\tcv_agg's auc: 0.951897 + 0.00206933\n", 265 | "[430]\tcv_agg's res: 0.869259 + 0.00834354\tcv_agg's auc: 0.951908 + 0.00202975\n", 266 | "[440]\tcv_agg's res: 0.870164 + 0.00899768\tcv_agg's auc: 0.951932 + 0.00207055\n", 267 | "[450]\tcv_agg's res: 0.870188 + 0.00900273\tcv_agg's auc: 0.951971 + 0.00207746\n", 268 | "[460]\tcv_agg's res: 0.869774 + 
0.00872432\tcv_agg's auc: 0.952023 + 0.00208322\n", 269 | "[470]\tcv_agg's res: 0.869788 + 0.00874283\tcv_agg's auc: 0.952048 + 0.00210492\n", 270 | "[480]\tcv_agg's res: 0.869784 + 0.0087987\tcv_agg's auc: 0.95204 + 0.00220303\n", 271 | "[490]\tcv_agg's res: 0.869825 + 0.00880326\tcv_agg's auc: 0.952109 + 0.00221283\n", 272 | "[500]\tcv_agg's res: 0.870319 + 0.00809941\tcv_agg's auc: 0.952192 + 0.00216047\n", 273 | "[510]\tcv_agg's res: 0.869455 + 0.0084159\tcv_agg's auc: 0.952234 + 0.00215936\n", 274 | "[520]\tcv_agg's res: 0.870397 + 0.00907048\tcv_agg's auc: 0.952319 + 0.00220583\n", 275 | "[530]\tcv_agg's res: 0.86999 + 0.00877531\tcv_agg's auc: 0.952384 + 0.00218001\n", 276 | "[540]\tcv_agg's res: 0.870459 + 0.00906187\tcv_agg's auc: 0.952424 + 0.00219866\n", 277 | "[550]\tcv_agg's res: 0.871367 + 0.00873542\tcv_agg's auc: 0.952453 + 0.00217044\n", 278 | "[560]\tcv_agg's res: 0.871388 + 0.00877027\tcv_agg's auc: 0.952489 + 0.00222663\n", 279 | "[570]\tcv_agg's res: 0.870944 + 0.00846395\tcv_agg's auc: 0.952492 + 0.00223305\n", 280 | "[580]\tcv_agg's res: 0.870495 + 0.00820383\tcv_agg's auc: 0.952484 + 0.00226073\n", 281 | "[590]\tcv_agg's res: 0.870496 + 0.00910113\tcv_agg's auc: 0.952484 + 0.00224488\n", 282 | "[600]\tcv_agg's res: 0.870529 + 0.00910566\tcv_agg's auc: 0.95254 + 0.00224935\n", 283 | "[610]\tcv_agg's res: 0.870074 + 0.00883208\tcv_agg's auc: 0.952523 + 0.00225639\n", 284 | "[620]\tcv_agg's res: 0.870541 + 0.00912299\tcv_agg's auc: 0.95256 + 0.00227182\n", 285 | "[630]\tcv_agg's res: 0.870088 + 0.00974716\tcv_agg's auc: 0.952546 + 0.00226166\n", 286 | "[640]\tcv_agg's res: 0.870108 + 0.00975526\tcv_agg's auc: 0.952579 + 0.0022767\n", 287 | "[650]\tcv_agg's res: 0.870106 + 0.00975828\tcv_agg's auc: 0.952576 + 0.00228426\n", 288 | "[660]\tcv_agg's res: 0.869647 + 0.00946729\tcv_agg's auc: 0.952553 + 0.00227029\n", 289 | "boost_round:561,res:0.871393011749146,auc:0.9524972875594031\n" 290 | ] 291 | } 292 | ], 293 | "source": [ 294 | "res_cv = 
lgb.cv(lgb_params,dtrain_all,feval=evalMetric,early_stopping_rounds=100,verbose_eval=10,num_boost_round=10000,nfold=3,metrics=['evalMetric','auc'],seed=1000)\n", 295 | "print 'boost_round:%s,res:%s,auc:%s'%(len(res_cv['res-mean']),res_cv['res-mean'][-1],res_cv['auc-mean'][-1])" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": null, 301 | "metadata": { 302 | "scrolled": false 303 | }, 304 | "outputs": [], 305 | "source": [ 306 | "model =lgb.train(lgb_params,dtrain_all,feval=evalMetric,verbose_eval=5,num_boost_round=int(len(res_cv['res-mean'])*1.5))" 307 | ] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "execution_count": null, 312 | "metadata": {}, 313 | "outputs": [], 314 | "source": [ 315 | "pred=model.predict(test.drop(['uid'],axis=1).values)" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": null, 321 | "metadata": {}, 322 | "outputs": [], 323 | "source": [ 324 | "test.drop(['uid'],axis=1).values" 325 | ] 326 | }, 327 | { 328 | "cell_type": "code", 329 | "execution_count": null, 330 | "metadata": {}, 331 | "outputs": [], 332 | "source": [ 333 | "res =pd.DataFrame({'uid':test.uid,'label':pred})\n" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": null, 339 | "metadata": {}, 340 | "outputs": [], 341 | "source": [ 342 | "res.to_csv('../result/b/lgb/lgb-%0.4f-%0.4f-prob-nowa.csv'%(res_cv['res-mean'][-1],res_cv['auc-mean'][-1]),index=False,header=False,sep=',',columns=['uid','label'])" 343 | ] 344 | }, 345 | { 346 | "cell_type": "code", 347 | "execution_count": null, 348 | "metadata": {}, 349 | "outputs": [], 350 | "source": [ 351 | "res=res.sort_values(by='label',ascending=False)\n", 352 | "index = int(len(res)*0.18)\n", 353 | "\n", 354 | "res.label[0:index] =int(1)\n", 355 | "res.label[index:] =int(0)\n", 356 | "res.label = res.label.map(lambda x: int(x))" 357 | ] 358 | }, 359 | { 360 | "cell_type": "code", 361 | "execution_count": null, 362 | "metadata": {}, 363 | "outputs": [], 
364 | "source": [ 365 | "res.to_csv('../result/b/lgb/lgb-%0.4f-%0.4f-nowa.csv'%(res_cv['res-mean'][-1],res_cv['auc-mean'][-1]),index=False,header=False,sep=',',columns=['uid','label'])" 366 | ] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "execution_count": null, 371 | "metadata": {}, 372 | "outputs": [], 373 | "source": [] 374 | } 375 | ], 376 | "metadata": { 377 | "kernelspec": { 378 | "display_name": "Python 2", 379 | "language": "python", 380 | "name": "python2" 381 | }, 382 | "language_info": { 383 | "codemirror_mode": { 384 | "name": "ipython", 385 | "version": 2 386 | }, 387 | "file_extension": ".py", 388 | "mimetype": "text/x-python", 389 | "name": "python", 390 | "nbconvert_exporter": "python", 391 | "pygments_lexer": "ipython2", 392 | "version": "2.7.6" 393 | } 394 | }, 395 | "nbformat": 4, 396 | "nbformat_minor": 2 397 | } 398 | -------------------------------------------------------------------------------- /code/feature_V3.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stderr", 10 | "output_type": "stream", 11 | "text": [ 12 | "/usr/local/lib/python2.7/dist-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. 
This module will be removed in 0.20.\n", 13 | " \"This module will be removed in 0.20.\", DeprecationWarning)\n" 14 | ] 15 | } 16 | ], 17 | "source": [ 18 | "# -*- coding: utf-8 -*-\n", 19 | "import pandas as pd\n", 20 | "import numpy as np\n", 21 | "\n", 22 | "\n", 23 | "# # SKlearn classification models\n", 24 | "from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB\n", 25 | "from sklearn.linear_model import SGDClassifier\n", 26 | "from sklearn.linear_model import LogisticRegression\n", 27 | "\n", 28 | "from sklearn import svm\n", 29 | "from sklearn.metrics import accuracy_score\n", 30 | "import numpy as np\n", 31 | "from sklearn.linear_model import RidgeClassifier\n", 32 | "\n", 33 | "#cross validation\n", 34 | "from scipy import sparse\n", 35 | "from sklearn.cross_validation import StratifiedKFold\n", 36 | "from sklearn.cross_validation import cross_val_score\n", 37 | "from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB\n" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 2, 43 | "metadata": {}, 44 | "outputs": [ 45 | { 46 | "name": "stderr", 47 | "output_type": "stream", 48 | "text": [ 49 | "/usr/local/lib/python2.7/dist-packages/IPython/core/interactiveshell.py:2718: DtypeWarning: Columns (2) have mixed types. 
Specify dtype option on import or set low_memory=False.\n", 50 | " interactivity=interactivity, compiler=compiler, result=result)\n" 51 | ] 52 | } 53 | ], 54 | "source": [ 55 | "uid_train = pd.read_csv('../data/uid_train.txt',sep='\\t',header=None,names=('uid','label'))\n", 56 | "voice_train = pd.read_csv('../data/voice_train.txt',sep='\\t',header=None,names=('uid','opp_num','opp_head','opp_len','start_time','end_time','call_type','in_out'),dtype={'start_time':str,'end_time':str},encoding='utf-8')\n", 57 | "sms_train = pd.read_csv('../data/sms_train.txt',sep='\\t',header=None,names=('uid','opp_num','opp_head','opp_len','start_time','in_out'),dtype={'start_time':str},encoding='utf-8')\n", 58 | "wa_train = pd.read_csv('../data/wa_train.txt',sep='\\t',header=None,names=('uid','wa_name','visit_cnt','visit_dura','up_flow','down_flow','wa_type','date'),dtype={'date':str},encoding='utf-8')" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 3, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "voice_test = pd.read_csv('../data/voice_test_b.txt',sep='\\t',header=None,names=('uid','opp_num','opp_head','opp_len','start_time','end_time','call_type','in_out'),dtype={'start_time':str,'end_time':str},encoding='utf-8')\n", 68 | "sms_test = pd.read_csv('../data/sms_test_b.txt',sep='\\t',header=None,names=('uid','opp_num','opp_head','opp_len','start_time','in_out'),dtype={'start_time':str},encoding='utf-8')\n", 69 | "wa_test = pd.read_csv('../data/wa_test_b.txt',sep='\\t',header=None,names=('uid','wa_name','visit_cnt','visit_dura','up_flow','down_flow','wa_type','date'),dtype={'date':str},encoding='utf-8')" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 4, 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "voice_test_a = 
def single_groupby(data, groupby_col, count_col, methods, prefix, fill=None):
    """Group ``data`` by one column and aggregate another.

    Parameters
    ----------
    data : pd.DataFrame
        Source records.
    groupby_col : str
        Column to group on; kept as a regular column in the result.
    count_col : str
        Column the aggregations are applied to.
    methods : list / dict / str
        Aggregation spec forwarded to ``Series.agg``.
    prefix : str
        Prefix prepended to every aggregated column name.
    fill : scalar, optional
        When given, NaNs in the result are replaced by this value.

    Returns
    -------
    pd.DataFrame
        ``groupby_col`` plus one prefixed column per aggregation.
    """
    temp = data.groupby([groupby_col])[count_col].agg(methods).add_prefix(prefix).reset_index()
    # Identity test (`is not None`) instead of `!= None`: equality with
    # None is unidiomatic and can be hijacked by a custom __eq__.
    if fill is not None:
        temp.fillna(fill, inplace=True)
    return temp
129 | ] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "metadata": {}, 134 | "source": [ 135 | "# 通话记录" 136 | ] 137 | }, 138 | { 139 | "cell_type": "markdown", 140 | "metadata": {}, 141 | "source": [ 142 | "##### 通话记录不同记录数 通话记录数" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 8, 148 | "metadata": {}, 149 | "outputs": [], 150 | "source": [ 151 | "\n", 152 | "voice_opp_num = single_groupby(voice,'uid','opp_num',{'unique_count': lambda x: len(pd.unique(x)),'count':'count'},'voice_opp_num_',0)\n", 153 | "voice_opp_head = single_groupby(voice,'uid','opp_head',{'unique_count': lambda x: len(pd.unique(x))},'voice_opp_head_',0)\n", 154 | "\n", 155 | "voice_opp_len=voice.groupby(['uid','opp_len'])['uid'].count().unstack().add_prefix('voice_opp_len_').reset_index().fillna(0)\n", 156 | "voice_opp_len_opp_num_unique=voice.groupby(['uid','opp_len'])['opp_num'].agg(lambda x: len(pd.unique(x))).unstack().add_prefix('voice_opp_len_opp_num_unique_').reset_index().fillna(0)\n", 157 | "\n", 158 | "\n", 159 | "\n", 160 | "voice_call_type = voice.groupby(['uid','call_type'])['uid'].count().unstack().add_prefix('voice_call_type_').reset_index().fillna(0)\n", 161 | "voice_call_type_opp_num_unique=voice.groupby(['uid','call_type'])['opp_num'].agg(lambda x: len(pd.unique(x))).unstack().add_prefix('voice_call_type_opp_num_unique_').reset_index().fillna(0)\n", 162 | "\n", 163 | "voice_in_out_opp_num_unique = voice.groupby(['uid','in_out'])['opp_num'].agg(lambda x: len(pd.unique(x))).unstack().add_prefix('voice_in_out_opp_num_unique_').reset_index().fillna(0)\n" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": 9, 169 | "metadata": {}, 170 | "outputs": [], 171 | "source": [ 172 | "voice_each_opp_head_count=voice.groupby(['uid','opp_head'])['uid'].count().unstack().add_prefix('voice_each_opp_head_count_').reset_index().fillna(0)" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": 10, 178 | "metadata": {}, 
def time_gap(start, end):
    """Elapsed seconds between two 'DDHHMMSS' timestamps.

    ``start``/``end`` are day-of-month + wall-clock strings, e.g.
    '01231505' = day 01, 23:15:05.  Returns NaN when either endpoint is
    missing (the first record of each user is NaN after ``shift()``).

    The original only guarded ``start`` against NaN; a missing ``end``
    raised ValueError.  The guard is extended here — backwards
    compatible, previously-valid inputs are unaffected.
    """
    if pd.isnull(start) or pd.isnull(end):
        return np.nan

    def _seconds(t):
        # day*86400 + hour*3600 + minute*60 + second
        t = str(t)
        return (int(t[0:2]) * 86400 + int(t[2:4]) * 3600
                + int(t[4:6]) * 60 + int(t[6:8]))

    return _seconds(end) - _seconds(start)
"voice_gap_time=voice.groupby(['uid'])['gap_time'].agg(['std','max','min','median','mean','sum',np.ptp]).add_prefix('voice_gap_time_').reset_index()\n", 239 | "\n", 240 | "\n", 241 | "\n" 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": 15, 247 | "metadata": {}, 248 | "outputs": [], 249 | "source": [ 250 | "### 通话时间间隔统计\n", 251 | "voice_sort = (voice.sort_values(by=['start_time','end_time'],ascending=True)).reset_index()\n", 252 | "voice_sort['last_end_time']=voice_sort.groupby(['uid'])['end_time'].apply(lambda i:i.shift(1))\n", 253 | "voice_sort['last_gap_time'] = voice_sort[['last_end_time','start_time']].apply(lambda x: time_gap(x[0],x[1]),axis=1)\n", 254 | "\n", 255 | "\n", 256 | "voice_last_gap_time=voice_sort.groupby(['uid'])['last_gap_time'].agg(['std','max','min','median','mean','sum',np.ptp]).add_prefix('voice_last_gap_time_').reset_index()" 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": 16, 262 | "metadata": {}, 263 | "outputs": [], 264 | "source": [ 265 | "voice['start_day']=voice.start_time.map(lambda x: x[0:2])\n", 266 | "voice['end_day']=voice.end_time.map(lambda x: x[0:2])" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": 17, 272 | "metadata": {}, 273 | "outputs": [], 274 | "source": [ 275 | "voice_start_day_count = voice.groupby(['uid','start_day'])['opp_head'].agg(lambda x: len(pd.unique(x))).unstack().fillna(0).add_prefix('voice_start_day_count_').reset_index()" 276 | ] 277 | }, 278 | { 279 | "cell_type": "markdown", 280 | "metadata": {}, 281 | "source": [ 282 | "### sms " 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": 18, 288 | "metadata": {}, 289 | "outputs": [], 290 | "source": [ 291 | "sms_opp_num = sms.groupby(['uid'])['opp_num'].agg({'unique_count': lambda x: len(pd.unique(x)),'count':'count'}).add_prefix('sms_opp_num_').reset_index().fillna(0)\n", 292 | "sms_opp_head=sms.groupby(['uid'])['opp_head'].agg({'unique_count': lambda 
x: len(pd.unique(x))}).add_prefix('sms_opp_head_').reset_index().fillna(0)\n", 293 | "sms_opp_len=sms.groupby(['uid','opp_len'])['uid'].count().unstack().add_prefix('sms_opp_len_').reset_index().fillna(0)\n", 294 | "\n", 295 | "sms_opp_len_opp_head_unique=sms.groupby(['uid','opp_len'])['opp_head'].agg(lambda x: len(pd.unique(x))).unstack().add_prefix('sms_opp_len_opp_head_unique_').reset_index().fillna(0)\n", 296 | "\n", 297 | "\n", 298 | "sms_in_out = sms.groupby(['uid','in_out'])['uid'].count().unstack().add_prefix('sms_in_out_').reset_index().fillna(0)\n", 299 | "\n", 300 | "\n", 301 | "sms_in_out['sms_in_out_0_rate'] = sms_in_out['sms_in_out_0'] / sms_opp_num['sms_opp_num_count']\n", 302 | "\n", 303 | "sms_in_out['sms_in_out_1_rate'] = sms_in_out['sms_in_out_1'] / sms_opp_num['sms_opp_num_count']\n", 304 | "\n", 305 | "\n", 306 | "\n" 307 | ] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "execution_count": 19, 312 | "metadata": {}, 313 | "outputs": [], 314 | "source": [ 315 | "# for col in sms_opp_len.columns:\n", 316 | "# if col !='uid':\n", 317 | "# sms_opp_len[col+'_rate'] = sms_opp_len[col] / sms_opp_num['sms_opp_num_count']" 318 | ] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "execution_count": 20, 323 | "metadata": {}, 324 | "outputs": [], 325 | "source": [ 326 | "sms['hour'] = sms.start_time.map(lambda x: x[2:4])\n", 327 | "sms['day'] = sms.start_time.map(lambda x: x[0:2])" 328 | ] 329 | }, 330 | { 331 | "cell_type": "code", 332 | "execution_count": 21, 333 | "metadata": {}, 334 | "outputs": [], 335 | "source": [ 336 | "opp_len = [5,7,8,9,10,11,12,13,14]\n", 337 | "sms_opp_count = []\n", 338 | "for l in opp_len:\n", 339 | " temp = sms[sms.opp_len==l].groupby(['uid','hour'])['uid'].count().unstack().add_prefix('sms_hour_count_opp_len_'+str(l)+'_').reset_index().fillna(0)\n", 340 | " sms_opp_count.append(temp)\n", 341 | " \n", 342 | "for l in opp_len:\n", 343 | " temp = 
sms[sms.opp_len==l].groupby(['uid','day'])['uid'].count().unstack().add_prefix('sms_day_count_opp_len_'+str(l)+'_').reset_index().fillna(0)\n", 344 | " sms_opp_count.append(temp)" 345 | ] 346 | }, 347 | { 348 | "cell_type": "code", 349 | "execution_count": 22, 350 | "metadata": {}, 351 | "outputs": [], 352 | "source": [ 353 | "sms_each_opp_head_count=sms.groupby(['uid','opp_head'])['uid'].count().unstack().add_prefix('sms_each_opp_head_count_').reset_index().fillna(0)" 354 | ] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": 23, 359 | "metadata": {}, 360 | "outputs": [], 361 | "source": [ 362 | "\n", 363 | "sms_opp_num_list = sms.groupby(['opp_num'])['uid'].count().sort_values(ascending=False).reset_index()['opp_num'][0:1000].values\n", 364 | "sms_each_opp_num_count=sms[sms.opp_num.map(lambda x: x in sms_opp_num_list)].groupby(['uid','opp_num'])['uid'].count().unstack().add_prefix('sms_each_opp_num_count_').reset_index().fillna(0)\n", 365 | "\n", 366 | "\n", 367 | "#sms_each_opp_num_count=sms.groupby(['uid','opp_num'])['uid'].count().unstack().add_prefix('sms_each_opp_num_count_').reset_index().fillna(0)" 368 | ] 369 | }, 370 | { 371 | "cell_type": "code", 372 | "execution_count": 24, 373 | "metadata": {}, 374 | "outputs": [], 375 | "source": [ 376 | "### 短信间隔统计\n", 377 | "sms_sort = sms.sort_values(by=['uid','start_time'],ascending='True').reset_index()\n", 378 | "sms_sort['last_start_time']=sms_sort.groupby(['uid'])['start_time'].apply(lambda i:i.shift(1))\n", 379 | "sms_sort['last_start_gap_time'] = sms_sort[['last_start_time','start_time']].apply(lambda x: time_gap(x[0],x[1]),axis=1)\n", 380 | "sms_last_start_gap_time=sms_sort.groupby(['uid'])['last_start_gap_time'].agg(['std','max','min','median','mean','sum',np.ptp]).add_prefix('sms_last_start_gap_time_').reset_index()\n", 381 | "\n" 382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": 25, 387 | "metadata": {}, 388 | "outputs": [], 389 | "source": [ 390 | 
"sms_each_day_count = sms.groupby(['uid','day'])['opp_num'].count().unstack().fillna(0).add_prefix('sms_each_day_count_').reset_index()" 391 | ] 392 | }, 393 | { 394 | "cell_type": "code", 395 | "execution_count": 26, 396 | "metadata": {}, 397 | "outputs": [], 398 | "source": [ 399 | "sms_each_day_unique_count_opp_head = sms.groupby(['uid','day'])['opp_head'].agg(lambda x: len(pd.unique(x))).unstack().fillna(0).add_prefix('sms_each_day_unique_count_opp_head_').reset_index()" 400 | ] 401 | }, 402 | { 403 | "cell_type": "code", 404 | "execution_count": 27, 405 | "metadata": {}, 406 | "outputs": [], 407 | "source": [ 408 | "sms_each_day_unique_count_opp_num = sms.groupby(['uid','day'])['opp_num'].agg(lambda x: len(pd.unique(x))).unstack().fillna(0).add_prefix('sms_each_day_unique_count_opp_num_').reset_index()" 409 | ] 410 | }, 411 | { 412 | "cell_type": "code", 413 | "execution_count": 28, 414 | "metadata": {}, 415 | "outputs": [], 416 | "source": [ 417 | "sms_each_day_in_out_0_count = sms[sms.in_out==0].groupby(['uid','day'])['uid'].count().unstack().fillna(0).add_prefix('sms_each_day_in_out_0_count_').reset_index()" 418 | ] 419 | }, 420 | { 421 | "cell_type": "code", 422 | "execution_count": 29, 423 | "metadata": {}, 424 | "outputs": [], 425 | "source": [ 426 | "sms_each_day_in_out_1_count = sms[sms.in_out==1].groupby(['uid','day'])['uid'].count().unstack().fillna(0).add_prefix('sms_each_day_in_out_1_count_').reset_index()" 427 | ] 428 | }, 429 | { 430 | "cell_type": "code", 431 | "execution_count": 30, 432 | "metadata": {}, 433 | "outputs": [], 434 | "source": [ 435 | "sms_each_hour_unique_count_opp_head = sms.groupby(['uid','hour'])['opp_head'].agg(lambda x: len(pd.unique(x))).unstack().fillna(0).add_prefix('sms_each_hour_unique_count_opp_head_').reset_index()" 436 | ] 437 | }, 438 | { 439 | "cell_type": "code", 440 | "execution_count": 31, 441 | "metadata": {}, 442 | "outputs": [], 443 | "source": [ 444 | "sms_each_hour_unique_count_opp_num = 
sms.groupby(['uid','hour'])['opp_num'].agg(lambda x: len(pd.unique(x))).unstack().fillna(0).add_prefix('sms_each_hour_unique_count_opp_num_').reset_index()" 445 | ] 446 | }, 447 | { 448 | "cell_type": "code", 449 | "execution_count": 32, 450 | "metadata": {}, 451 | "outputs": [], 452 | "source": [ 453 | "\n", 454 | "\n", 455 | "\n", 456 | "\n", 457 | "sms_in_out_opp_num_unique = sms.groupby(['uid','in_out'])['opp_num'].agg(lambda x: len(pd.unique(x))).unstack().add_prefix('sms_in_out_opp_num_unique_').reset_index().fillna(0)\n", 458 | "sms_in_out_opp_head_unique = sms.groupby(['uid','in_out'])['opp_head'].agg(lambda x: len(pd.unique(x))).unstack().add_prefix('sms_in_out_opp_head_unique_').reset_index().fillna(0)\n" 459 | ] 460 | }, 461 | { 462 | "cell_type": "markdown", 463 | "metadata": {}, 464 | "source": [ 465 | "## 网站/app记录" 466 | ] 467 | }, 468 | { 469 | "cell_type": "code", 470 | "execution_count": null, 471 | "metadata": {}, 472 | "outputs": [], 473 | "source": [] 474 | }, 475 | { 476 | "cell_type": "code", 477 | "execution_count": 35, 478 | "metadata": {}, 479 | "outputs": [], 480 | "source": [ 481 | "wa_a = pd.concat([wa_train,wa_test_a],axis=0)\n", 482 | "\n", 483 | "wa_name_list = wa.groupby(['wa_name'])['uid'].count().sort_values(ascending=False).reset_index()['wa_name'][0:1000].values\n", 484 | "wa_each_name_count=wa[wa.wa_name.map(lambda x: x in wa_name_list)].groupby(['uid','wa_name'])['uid'].count().unstack().add_prefix('wa_each_name_count_').reset_index().fillna(0)\n" 485 | ] 486 | }, 487 | { 488 | "cell_type": "markdown", 489 | "metadata": {}, 490 | "source": [ 491 | "## 特征" 492 | ] 493 | }, 494 | { 495 | "cell_type": "code", 496 | "execution_count": 102, 497 | "metadata": {}, 498 | "outputs": [], 499 | "source": [ 500 | "voice_feature = [voice_opp_num, ## 统计用户 记录数,用户 不同opp_num 记录数\n", 501 | " voice_opp_head, ## 统计用户不同 opp_head记录 数\n", 502 | " voice_opp_len, ## 统计用户 不同 opp_len 的记录数\n", 503 | " voice_call_type, ## 统计用户不同 call_tyoe 记录数\n", 504 | " 
voice_in_out_opp_num_unique, ## 统计用户不同 in_out 下 不同 opp_num记录数\n", 505 | " voice_opp_len_opp_num_unique, ## 统计不同 opp_len 下 不同 opp_head 记录数\n", 506 | " voice_opp_len_opp_head_unique, ## 统计不同 opp_len 下不同opp_head 的记录数\n", 507 | " voice_call_type_opp_num_unique, ## 统计不同call_type 下 不同opp_num 的记录数\n", 508 | " voice_gap_time, ## 通话时长统计量\n", 509 | " voice_last_gap_time, ## 两次通话间隔统计量\n", 510 | " voice_each_opp_num_count, ## 对opp_num one-hot 统计记录数\n", 511 | " \n", 512 | "\n", 513 | " ]\n", 514 | "\n", 515 | "\n", 516 | "sms_feature = [sms_opp_num, ## 统计用户sms 不同opp_num记录数\n", 517 | " sms_opp_head, ## 统计用户sms 不同opp_head 记录数\n", 518 | " sms_opp_len, ##统计用户 不同opp_len记录数\n", 519 | " sms_opp_len_opp_head_unique, ##统计用户不同opp_len 中 不同opp_head的记录数\n", 520 | " sms_in_out, ## 不同in_out 记录数\n", 521 | " sms_last_start_gap_time, ## 两次短信间隔统计量\n", 522 | " sms_each_day_count, ##每天的短信记录数\n", 523 | " sms_each_day_unique_count_opp_head, ## 每天不同的opp_head 记录数\n", 524 | " \n", 525 | " sms_in_out_opp_head_unique, ## 不同in_out 下 不同opp_head 数\n", 526 | "\n", 527 | " sms_each_opp_head_count, ## opp_head one-hot 统计记录数\n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | "\n", 533 | " \n", 534 | " ]\n", 535 | "#sms_feature =sms_feature+sms_opp_count\n", 536 | "\n", 537 | "wa_feature = [\n", 538 | "\n", 539 | " \n", 540 | " wa_each_name_count, ##top 1000 wa_name 分组统计记录数\n", 541 | "\n", 542 | "]\n", 543 | "\n", 544 | "\n" 545 | ] 546 | }, 547 | { 548 | "cell_type": "code", 549 | "execution_count": null, 550 | "metadata": {}, 551 | "outputs": [], 552 | "source": [] 553 | }, 554 | { 555 | "cell_type": "code", 556 | "execution_count": 103, 557 | "metadata": {}, 558 | "outputs": [], 559 | "source": [ 560 | "train_feature = uid_train\n", 561 | "test_feature = uid_test\n", 562 | "\n", 563 | "for feat in voice_feature:\n", 564 | " train_feature=pd.merge(train_feature,feat,how='left',on='uid')\n", 565 | "for feat in voice_feature:\n", 566 | " test_feature=pd.merge(test_feature,feat,how='left',on='uid')\n", 
567 | " \n", 568 | "for feat in sms_feature:\n", 569 | " train_feature=pd.merge(train_feature,feat,how='left',on='uid')\n", 570 | "for feat in sms_feature:\n", 571 | " test_feature=pd.merge(test_feature,feat,how='left',on='uid')\n", 572 | " \n", 573 | "for feat in wa_feature:\n", 574 | " train_feature=pd.merge(train_feature,feat,how='left',on='uid')\n", 575 | "for feat in wa_feature:\n", 576 | " test_feature=pd.merge(test_feature,feat,how='left',on='uid')\n", 577 | " \n", 578 | "for feat in voice_sms_feature:\n", 579 | " train_feature=pd.merge(train_feature,feat,how='left',on='uid')\n", 580 | "for feat in voice_sms_feature:\n", 581 | " test_feature=pd.merge(test_feature,feat,how='left',on='uid')\n", 582 | " \n", 583 | "# train_feature=pd.merge(train_feature,stacking_feat[stacking_col],how='left',on='uid')\n", 584 | "# test_feature=pd.merge(test_feature,stacking_feat[stacking_col],how='left',on='uid')\n" 585 | ] 586 | }, 587 | { 588 | "cell_type": "code", 589 | "execution_count": null, 590 | "metadata": {}, 591 | "outputs": [], 592 | "source": [] 593 | }, 594 | { 595 | "cell_type": "code", 596 | "execution_count": null, 597 | "metadata": {}, 598 | "outputs": [], 599 | "source": [] 600 | }, 601 | { 602 | "cell_type": "code", 603 | "execution_count": 104, 604 | "metadata": {}, 605 | "outputs": [], 606 | "source": [ 607 | "train_feature.to_csv('../data/train_featureV0.csv',index=None,encoding='utf-8')\n", 608 | "test_feature.to_csv('../data/test_featureV0.csv',index=None,encoding='utf-8')" 609 | ] 610 | }, 611 | { 612 | "cell_type": "code", 613 | "execution_count": 105, 614 | "metadata": {}, 615 | "outputs": [ 616 | { 617 | "name": "stdout", 618 | "output_type": "stream", 619 | "text": [ 620 | "(4999, 2315)\n" 621 | ] 622 | } 623 | ], 624 | "source": [ 625 | "print train_feature.shape" 626 | ] 627 | }, 628 | { 629 | "cell_type": "code", 630 | "execution_count": null, 631 | "metadata": {}, 632 | "outputs": [], 633 | "source": [] 634 | } 635 | ], 636 | "metadata": { 637 
| "kernelspec": { 638 | "display_name": "Python 2", 639 | "language": "python", 640 | "name": "python2" 641 | }, 642 | "language_info": { 643 | "codemirror_mode": { 644 | "name": "ipython", 645 | "version": 2 646 | }, 647 | "file_extension": ".py", 648 | "mimetype": "text/x-python", 649 | "name": "python", 650 | "nbconvert_exporter": "python", 651 | "pygments_lexer": "ipython2", 652 | "version": "2.7.6" 653 | } 654 | }, 655 | "nbformat": 4, 656 | "nbformat_minor": 2 657 | } 658 | --------------------------------------------------------------------------------