├── .gitattributes ├── img ├── method1.PNG └── method2.PNG ├── 模型一代码b榜0.75673 ├── result │ └── res │ │ └── ensemble.txt └── code │ ├── .ipynb_checkpoints │ └── ensemble-checkpoint.ipynb │ └── ensemble.ipynb ├── README.md └── 模型二代码b榜0.749880 └── code ├── ensemble1.ipynb ├── ensemble.ipynb ├── .ipynb_checkpoints ├── ensemble1-checkpoint.ipynb ├── ensemble-checkpoint.ipynb ├── gbdt预测-checkpoint.ipynb ├── _ensemble-checkpoint.ipynb ├── semi_ensemble-checkpoint.ipynb ├── lgb预测-checkpoint.ipynb ├── xgb预测-checkpoint.ipynb ├── semi_xgb预测-checkpoint.ipynb ├── semi_lgb预测-checkpoint.ipynb ├── nn预测-checkpoint.ipynb └── semi_gbdt预测-checkpoint.ipynb ├── xgb预测.ipynb ├── gbdt预测.ipynb ├── _ensemble.ipynb ├── semi_ensemble.ipynb ├── lgb预测.ipynb ├── semi_xgb预测.ipynb ├── semi_lgb预测.ipynb ├── nn预测.ipynb ├── semi_gbdt预测.ipynb └── semi_nn预测.ipynb /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /img/method1.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CuiNing6/2018-xinwang/HEAD/img/method1.PNG -------------------------------------------------------------------------------- /img/method2.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CuiNing6/2018-xinwang/HEAD/img/method2.PNG -------------------------------------------------------------------------------- /模型一代码b榜0.75673/result/res/ensemble.txt: -------------------------------------------------------------------------------- 1 | xgb_all_feature Submission 0.74577 2 | lgb_all_feature_cv Submission4 0.74 3 | gbdt_all_feature Submission9 0.73 4 | nn_all_feature Submission11 0.70 5 | xgb_corr_feature Submission12 0.70 6 | 7 | Submission*0.5 + Submission4*0.5 = sub_0_4 = 0.7463 8 | sub_0_4*0.7 + 
Submission9*0.3 = sub_0_4_9 = 0.74651 9 | sub_0_4_9*0.7 + Submission11*0.3 = sub_0_4_9_11 = 0.74757 10 | sub_0_4_9_11*0.7 + Submission12*0.3 = sub_0_4_9_11_12 = 0.75152 11 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 2018-新网银行杯Top1方案 2 | 比赛链接:http://www.dcjingsai.com/common/cmpt/西南财经大学“新网银行杯”数据科学竞赛_竞赛信息.html 3 | # 队伍名称:摸金校尉 4 | # 解决方案: 5 | ## 基于集成学习的信用风险预测模型 6 | 本次比赛通过机器学习和数据挖掘技术定量分析信用风险,给出每个样本的预测结果。首先,研究了违约客户和履约客户这两批客户的特征,其次,将机器学习领域比较流行的集成学习模型应用于信用风险评估领域,并利用主流的模型性能评价指标评价模型。在比赛中,对类别型数据进行哑编码,并搭建自编码网络提取特征,利用特征相关性,特征重要性,information value三个方法筛选特征,最后,选取基于加权平均法的集成学习模型和类别分布不平衡环境下基于加权平均法的半监督集成模型对数据进行建模,并使用AUC作为模型性能的具体评价指标,通过两种参数调节方法优化模型。在测试数据集上应用,竞赛结果验证了所构建的集成系统泛化能力较强,模型复杂度适中。 7 | ## 基于加权平均法的集成学习模型示意图: 8 | ![基于加权平均法的集成学习模型](https://github.com/CuiNing6/2018-xinwang/blob/master/img/method1.PNG) 9 | ## 基于加权平均法的半监督集成学习模型示意图: 10 | ![基于加权平均法的半监督集成学习模型](https://github.com/CuiNing6/2018-xinwang/blob/master/img/method2.PNG) 11 | # 代码说明: 12 | * 模型一对应基于加权平均法的集成学习模型 13 | * 模型二对应基于加权平均法的半监督集成学习模型 14 | * 其他代码里存放了比赛过程中的一些尝试,但是最终没有用到,包括自编码网络提取特征,多模态集成和woe特征构建等等。 15 | -------------------------------------------------------------------------------- /模型一代码b榜0.75673/code/.ipynb_checkpoints/ensemble-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np\n", 12 | "import pandas as pd\n", 13 | "import matplotlib.pyplot as plt\n", 14 | "%matplotlib inline " 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "metadata": { 21 | "collapsed": true 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "sub1 = pd.read_csv('../result/Submission.csv')\n", 26 | "sub2 = pd.read_csv('../result/Submission4.csv')\n", 27 | "sub3 
= pd.read_csv('../result/Submission9.csv')\n", 28 | "sub4 = pd.read_csv('../result/Submission11.csv')\n", 29 | "sub5 = pd.read_csv('../result/Submission12.csv')" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 3, 35 | "metadata": { 36 | "collapsed": true 37 | }, 38 | "outputs": [], 39 | "source": [ 40 | "ensemble = sub1\n", 41 | "ensemble['pred_prob'] = sub1['pred_prob']*0.5 + sub2['pred_prob']*0.5" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 4, 47 | "metadata": { 48 | "collapsed": true 49 | }, 50 | "outputs": [], 51 | "source": [ 52 | "ensemble1 = sub1\n", 53 | "ensemble1['pred_prob'] = ensemble['pred_prob']*0.7 + sub3['pred_prob']*0.3" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 5, 59 | "metadata": { 60 | "collapsed": true 61 | }, 62 | "outputs": [], 63 | "source": [ 64 | "ensemble2 = sub1\n", 65 | "ensemble2['pred_prob'] = ensemble1['pred_prob']*0.7 + sub4['pred_prob']*0.3" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 6, 71 | "metadata": { 72 | "collapsed": true 73 | }, 74 | "outputs": [], 75 | "source": [ 76 | "ensemble3 = sub1\n", 77 | "ensemble3['pred_prob'] = ensemble2['pred_prob']*0.7 + sub5['pred_prob']*0.3" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 7, 83 | "metadata": { 84 | "collapsed": false, 85 | "scrolled": true 86 | }, 87 | "outputs": [], 88 | "source": [ 89 | "ensemble3.to_csv('../result/ensemble.csv')" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "metadata": { 96 | "collapsed": true 97 | }, 98 | "outputs": [], 99 | "source": [] 100 | } 101 | ], 102 | "metadata": { 103 | "kernelspec": { 104 | "display_name": "Python [default]", 105 | "language": "python", 106 | "name": "python3" 107 | }, 108 | "language_info": { 109 | "codemirror_mode": { 110 | "name": "ipython", 111 | "version": 3 112 | }, 113 | "file_extension": ".py", 114 | "mimetype": "text/x-python", 115 | "name": 
"python", 116 | "nbconvert_exporter": "python", 117 | "pygments_lexer": "ipython3", 118 | "version": "3.5.2" 119 | } 120 | }, 121 | "nbformat": 4, 122 | "nbformat_minor": 1 123 | } 124 | -------------------------------------------------------------------------------- /模型一代码b榜0.75673/code/ensemble.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np\n", 12 | "import pandas as pd\n", 13 | "import matplotlib.pyplot as plt\n", 14 | "%matplotlib inline " 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "metadata": { 21 | "collapsed": true 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "sub1 = pd.read_csv('../result/Submission.csv')\n", 26 | "sub2 = pd.read_csv('../result/Submission4.csv')\n", 27 | "sub3 = pd.read_csv('../result/Submission9.csv')\n", 28 | "sub4 = pd.read_csv('../result/Submission11.csv')\n", 29 | "sub5 = pd.read_csv('../result/Submission12.csv')" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 3, 35 | "metadata": { 36 | "collapsed": true 37 | }, 38 | "outputs": [], 39 | "source": [ 40 | "ensemble = sub1\n", 41 | "ensemble['pred_prob'] = sub1['pred_prob']*0.5 + sub2['pred_prob']*0.5" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 4, 47 | "metadata": { 48 | "collapsed": true 49 | }, 50 | "outputs": [], 51 | "source": [ 52 | "ensemble1 = sub1\n", 53 | "ensemble1['pred_prob'] = ensemble['pred_prob']*0.7 + sub3['pred_prob']*0.3" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 5, 59 | "metadata": { 60 | "collapsed": true 61 | }, 62 | "outputs": [], 63 | "source": [ 64 | "ensemble2 = sub1\n", 65 | "ensemble2['pred_prob'] = ensemble1['pred_prob']*0.7 + sub4['pred_prob']*0.3" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | 
"execution_count": 6, 71 | "metadata": { 72 | "collapsed": true 73 | }, 74 | "outputs": [], 75 | "source": [ 76 | "ensemble3 = sub1\n", 77 | "ensemble3['pred_prob'] = ensemble2['pred_prob']*0.7 + sub5['pred_prob']*0.3" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 9, 83 | "metadata": { 84 | "collapsed": false, 85 | "scrolled": true 86 | }, 87 | "outputs": [], 88 | "source": [ 89 | "ensemble3.to_csv('../result/ensemble.csv',index=False)" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "metadata": { 96 | "collapsed": true 97 | }, 98 | "outputs": [], 99 | "source": [] 100 | } 101 | ], 102 | "metadata": { 103 | "anaconda-cloud": {}, 104 | "kernelspec": { 105 | "display_name": "Python [default]", 106 | "language": "python", 107 | "name": "python3" 108 | }, 109 | "language_info": { 110 | "codemirror_mode": { 111 | "name": "ipython", 112 | "version": 3 113 | }, 114 | "file_extension": ".py", 115 | "mimetype": "text/x-python", 116 | "name": "python", 117 | "nbconvert_exporter": "python", 118 | "pygments_lexer": "ipython3", 119 | "version": "3.5.2" 120 | } 121 | }, 122 | "nbformat": 4, 123 | "nbformat_minor": 1 124 | } 125 | -------------------------------------------------------------------------------- /模型二代码b榜0.749880/code/ensemble1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np\n", 12 | "import pandas as pd\n", 13 | "import matplotlib.pyplot as plt\n", 14 | "%matplotlib inline " 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 5, 20 | "metadata": { 21 | "collapsed": true 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "ensemble = pd.read_csv('../result/free2/sub_0_4_9_11_12.csv')" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 6, 31 | 
"metadata": { 32 | "collapsed": true 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "semi_ensemble = pd.read_csv('../result/semi_ensemble.csv')" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 8, 42 | "metadata": { 43 | "collapsed": false 44 | }, 45 | "outputs": [ 46 | { 47 | "data": { 48 | "text/html": [ 49 | "
\n", 50 | "\n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | "
cust_idpred_prob
010.016407
120.051909
230.194874
340.081682
450.154739
\n", 86 | "
" 87 | ], 88 | "text/plain": [ 89 | " cust_id pred_prob\n", 90 | "0 1 0.016407\n", 91 | "1 2 0.051909\n", 92 | "2 3 0.194874\n", 93 | "3 4 0.081682\n", 94 | "4 5 0.154739" 95 | ] 96 | }, 97 | "execution_count": 8, 98 | "metadata": {}, 99 | "output_type": "execute_result" 100 | } 101 | ], 102 | "source": [ 103 | "ensemble.head()" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 9, 109 | "metadata": { 110 | "collapsed": true 111 | }, 112 | "outputs": [], 113 | "source": [ 114 | "result = ensemble.pred_prob*0.7 + semi_ensemble.pred_prob*0.3" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 10, 120 | "metadata": { 121 | "collapsed": true 122 | }, 123 | "outputs": [], 124 | "source": [ 125 | "pred = ensemble\n", 126 | "pred.pred_prob = result" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 11, 132 | "metadata": { 133 | "collapsed": false 134 | }, 135 | "outputs": [ 136 | { 137 | "data": { 138 | "text/html": [ 139 | "
\n", 140 | "\n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | "
cust_idpred_prob
010.018662
120.051679
230.191229
340.092566
450.142426
\n", 176 | "
" 177 | ], 178 | "text/plain": [ 179 | " cust_id pred_prob\n", 180 | "0 1 0.018662\n", 181 | "1 2 0.051679\n", 182 | "2 3 0.191229\n", 183 | "3 4 0.092566\n", 184 | "4 5 0.142426" 185 | ] 186 | }, 187 | "execution_count": 11, 188 | "metadata": {}, 189 | "output_type": "execute_result" 190 | } 191 | ], 192 | "source": [ 193 | "pred.head()" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": 12, 199 | "metadata": { 200 | "collapsed": true 201 | }, 202 | "outputs": [], 203 | "source": [ 204 | "pred.to_csv('../result/ensemble_final1.csv',index = False)" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": null, 210 | "metadata": { 211 | "collapsed": true 212 | }, 213 | "outputs": [], 214 | "source": [] 215 | } 216 | ], 217 | "metadata": { 218 | "kernelspec": { 219 | "display_name": "Python [default]", 220 | "language": "python", 221 | "name": "python3" 222 | }, 223 | "language_info": { 224 | "codemirror_mode": { 225 | "name": "ipython", 226 | "version": 3 227 | }, 228 | "file_extension": ".py", 229 | "mimetype": "text/x-python", 230 | "name": "python", 231 | "nbconvert_exporter": "python", 232 | "pygments_lexer": "ipython3", 233 | "version": "3.5.2" 234 | } 235 | }, 236 | "nbformat": 4, 237 | "nbformat_minor": 1 238 | } 239 | -------------------------------------------------------------------------------- /模型二代码b榜0.749880/code/ensemble.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np\n", 12 | "import pandas as pd\n", 13 | "import matplotlib.pyplot as plt\n", 14 | "%matplotlib inline" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "metadata": { 21 | "collapsed": true 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "semi_ensemble = 
pd.read_csv('../result/semi_ensemble.csv')" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 4, 31 | "metadata": { 32 | "collapsed": false 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "_ensemble = pd.read_csv('../result/_ensemble.csv')" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 5, 42 | "metadata": { 43 | "collapsed": false 44 | }, 45 | "outputs": [ 46 | { 47 | "data": { 48 | "text/html": [ 49 | "
\n", 50 | "\n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | "
cust_idpred_prob
010.014872
120.036687
230.194181
340.064974
450.118509
\n", 86 | "
" 87 | ], 88 | "text/plain": [ 89 | " cust_id pred_prob\n", 90 | "0 1 0.014872\n", 91 | "1 2 0.036687\n", 92 | "2 3 0.194181\n", 93 | "3 4 0.064974\n", 94 | "4 5 0.118509" 95 | ] 96 | }, 97 | "execution_count": 5, 98 | "metadata": {}, 99 | "output_type": "execute_result" 100 | } 101 | ], 102 | "source": [ 103 | "_ensemble.head()" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 6, 109 | "metadata": { 110 | "collapsed": true 111 | }, 112 | "outputs": [], 113 | "source": [ 114 | "result = semi_ensemble.pred_prob*0.4 + _ensemble.pred_prob*0.6" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 7, 120 | "metadata": { 121 | "collapsed": true 122 | }, 123 | "outputs": [], 124 | "source": [ 125 | "pred = _ensemble\n", 126 | "pred.pred_prob = result" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 8, 132 | "metadata": { 133 | "collapsed": false 134 | }, 135 | "outputs": [ 136 | { 137 | "data": { 138 | "text/html": [ 139 | "
\n", 140 | "\n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | "
cust_idpred_prob
010.015424
120.034652
230.146920
340.062570
450.093373
\n", 176 | "
" 177 | ], 178 | "text/plain": [ 179 | " cust_id pred_prob\n", 180 | "0 1 0.015424\n", 181 | "1 2 0.034652\n", 182 | "2 3 0.146920\n", 183 | "3 4 0.062570\n", 184 | "4 5 0.093373" 185 | ] 186 | }, 187 | "execution_count": 8, 188 | "metadata": {}, 189 | "output_type": "execute_result" 190 | } 191 | ], 192 | "source": [ 193 | "pred.head()" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": 9, 199 | "metadata": { 200 | "collapsed": true 201 | }, 202 | "outputs": [], 203 | "source": [ 204 | "pred.to_csv('../result/ensemble_final.csv',index = False)" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": null, 210 | "metadata": { 211 | "collapsed": true 212 | }, 213 | "outputs": [], 214 | "source": [] 215 | } 216 | ], 217 | "metadata": { 218 | "anaconda-cloud": {}, 219 | "kernelspec": { 220 | "display_name": "Python [default]", 221 | "language": "python", 222 | "name": "python3" 223 | }, 224 | "language_info": { 225 | "codemirror_mode": { 226 | "name": "ipython", 227 | "version": 3 228 | }, 229 | "file_extension": ".py", 230 | "mimetype": "text/x-python", 231 | "name": "python", 232 | "nbconvert_exporter": "python", 233 | "pygments_lexer": "ipython3", 234 | "version": "3.5.2" 235 | } 236 | }, 237 | "nbformat": 4, 238 | "nbformat_minor": 1 239 | } 240 | -------------------------------------------------------------------------------- /模型二代码b榜0.749880/code/.ipynb_checkpoints/ensemble1-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np\n", 12 | "import pandas as pd\n", 13 | "import matplotlib.pyplot as plt\n", 14 | "%matplotlib inline " 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 5, 20 | "metadata": { 21 | "collapsed": true 22 | }, 23 | "outputs": [], 24 | "source": [ 
25 | "ensemble = pd.read_csv('../result/free2/sub_0_4_9_11_12.csv')" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 6, 31 | "metadata": { 32 | "collapsed": true 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "semi_ensemble = pd.read_csv('../result/semi_ensemble.csv')" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 8, 42 | "metadata": { 43 | "collapsed": false 44 | }, 45 | "outputs": [ 46 | { 47 | "data": { 48 | "text/html": [ 49 | "
\n", 50 | "\n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | "
cust_idpred_prob
010.016407
120.051909
230.194874
340.081682
450.154739
\n", 86 | "
" 87 | ], 88 | "text/plain": [ 89 | " cust_id pred_prob\n", 90 | "0 1 0.016407\n", 91 | "1 2 0.051909\n", 92 | "2 3 0.194874\n", 93 | "3 4 0.081682\n", 94 | "4 5 0.154739" 95 | ] 96 | }, 97 | "execution_count": 8, 98 | "metadata": {}, 99 | "output_type": "execute_result" 100 | } 101 | ], 102 | "source": [ 103 | "ensemble.head()" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 9, 109 | "metadata": { 110 | "collapsed": true 111 | }, 112 | "outputs": [], 113 | "source": [ 114 | "result = ensemble.pred_prob*0.7 + semi_ensemble.pred_prob*0.3" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 10, 120 | "metadata": { 121 | "collapsed": true 122 | }, 123 | "outputs": [], 124 | "source": [ 125 | "pred = _ensemble\n", 126 | "pred.pred_prob = result" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 11, 132 | "metadata": { 133 | "collapsed": false 134 | }, 135 | "outputs": [ 136 | { 137 | "data": { 138 | "text/html": [ 139 | "
\n", 140 | "\n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | "
cust_idpred_prob
010.018662
120.051679
230.191229
340.092566
450.142426
\n", 176 | "
" 177 | ], 178 | "text/plain": [ 179 | " cust_id pred_prob\n", 180 | "0 1 0.018662\n", 181 | "1 2 0.051679\n", 182 | "2 3 0.191229\n", 183 | "3 4 0.092566\n", 184 | "4 5 0.142426" 185 | ] 186 | }, 187 | "execution_count": 11, 188 | "metadata": {}, 189 | "output_type": "execute_result" 190 | } 191 | ], 192 | "source": [ 193 | "pred.head()" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": 12, 199 | "metadata": { 200 | "collapsed": true 201 | }, 202 | "outputs": [], 203 | "source": [ 204 | "pred.to_csv('../result/ensemble_final1.csv',index = False)" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": null, 210 | "metadata": { 211 | "collapsed": true 212 | }, 213 | "outputs": [], 214 | "source": [] 215 | } 216 | ], 217 | "metadata": { 218 | "kernelspec": { 219 | "display_name": "Python [default]", 220 | "language": "python", 221 | "name": "python3" 222 | }, 223 | "language_info": { 224 | "codemirror_mode": { 225 | "name": "ipython", 226 | "version": 3 227 | }, 228 | "file_extension": ".py", 229 | "mimetype": "text/x-python", 230 | "name": "python", 231 | "nbconvert_exporter": "python", 232 | "pygments_lexer": "ipython3", 233 | "version": "3.5.2" 234 | } 235 | }, 236 | "nbformat": 4, 237 | "nbformat_minor": 1 238 | } 239 | -------------------------------------------------------------------------------- /模型二代码b榜0.749880/code/.ipynb_checkpoints/ensemble-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np\n", 12 | "import pandas as pd\n", 13 | "import matplotlib.pyplot as plt\n", 14 | "%matplotlib inline" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "metadata": { 21 | "collapsed": true 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "semi_ensemble = 
pd.read_csv('../result/semi_ensemble.csv')" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 4, 31 | "metadata": { 32 | "collapsed": false 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "_ensemble = pd.read_csv('../result/_ensemble.csv')" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 5, 42 | "metadata": { 43 | "collapsed": false 44 | }, 45 | "outputs": [ 46 | { 47 | "data": { 48 | "text/html": [ 49 | "
\n", 50 | "\n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | "
cust_idpred_prob
010.014872
120.036687
230.194181
340.064974
450.118509
\n", 86 | "
" 87 | ], 88 | "text/plain": [ 89 | " cust_id pred_prob\n", 90 | "0 1 0.014872\n", 91 | "1 2 0.036687\n", 92 | "2 3 0.194181\n", 93 | "3 4 0.064974\n", 94 | "4 5 0.118509" 95 | ] 96 | }, 97 | "execution_count": 5, 98 | "metadata": {}, 99 | "output_type": "execute_result" 100 | } 101 | ], 102 | "source": [ 103 | "_ensemble.head()" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 6, 109 | "metadata": { 110 | "collapsed": true 111 | }, 112 | "outputs": [], 113 | "source": [ 114 | "result = semi_ensemble.pred_prob*0.4 + _ensemble.pred_prob*0.6" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 7, 120 | "metadata": { 121 | "collapsed": true 122 | }, 123 | "outputs": [], 124 | "source": [ 125 | "pred = _ensemble\n", 126 | "pred.pred_prob = result" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 8, 132 | "metadata": { 133 | "collapsed": false 134 | }, 135 | "outputs": [ 136 | { 137 | "data": { 138 | "text/html": [ 139 | "
\n", 140 | "\n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | "
cust_idpred_prob
010.015424
120.034652
230.146920
340.062570
450.093373
\n", 176 | "
" 177 | ], 178 | "text/plain": [ 179 | " cust_id pred_prob\n", 180 | "0 1 0.015424\n", 181 | "1 2 0.034652\n", 182 | "2 3 0.146920\n", 183 | "3 4 0.062570\n", 184 | "4 5 0.093373" 185 | ] 186 | }, 187 | "execution_count": 8, 188 | "metadata": {}, 189 | "output_type": "execute_result" 190 | } 191 | ], 192 | "source": [ 193 | "pred.head()" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": 9, 199 | "metadata": { 200 | "collapsed": true 201 | }, 202 | "outputs": [], 203 | "source": [ 204 | "pred.to_csv('../result/ensemble_final.csv',index = False)" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": null, 210 | "metadata": { 211 | "collapsed": true 212 | }, 213 | "outputs": [], 214 | "source": [] 215 | } 216 | ], 217 | "metadata": { 218 | "anaconda-cloud": {}, 219 | "kernelspec": { 220 | "display_name": "Python [default]", 221 | "language": "python", 222 | "name": "python3" 223 | }, 224 | "language_info": { 225 | "codemirror_mode": { 226 | "name": "ipython", 227 | "version": 3 228 | }, 229 | "file_extension": ".py", 230 | "mimetype": "text/x-python", 231 | "name": "python", 232 | "nbconvert_exporter": "python", 233 | "pygments_lexer": "ipython3", 234 | "version": "3.5.2" 235 | } 236 | }, 237 | "nbformat": 4, 238 | "nbformat_minor": 1 239 | } 240 | -------------------------------------------------------------------------------- /模型二代码b榜0.749880/code/xgb预测.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np\n", 12 | "import pandas as pd\n", 13 | "import matplotlib.pyplot as plt\n", 14 | "%matplotlib inline " 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 3, 20 | "metadata": { 21 | "collapsed": false 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "train = 
pd.read_csv('../data/train_xy.csv')" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 4, 31 | "metadata": { 32 | "collapsed": true 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "test = pd.read_csv('../data/test_all.csv')" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 6, 42 | "metadata": { 43 | "collapsed": false 44 | }, 45 | "outputs": [ 46 | { 47 | "data": { 48 | "text/plain": [ 49 | "(10000, 157)" 50 | ] 51 | }, 52 | "execution_count": 6, 53 | "metadata": {}, 54 | "output_type": "execute_result" 55 | } 56 | ], 57 | "source": [ 58 | "x_train = train.drop(['cust_group','y','\\ufeffcust_id'],axis =1)\n", 59 | "x_test = test.drop(['cust_group','cust_id'],axis=1)\n", 60 | "x_train.shape\n", 61 | "x_test.shape" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 7, 67 | "metadata": { 68 | "collapsed": false 69 | }, 70 | "outputs": [ 71 | { 72 | "data": { 73 | "text/plain": [ 74 | "(25000, 157)" 75 | ] 76 | }, 77 | "execution_count": 7, 78 | "metadata": {}, 79 | "output_type": "execute_result" 80 | } 81 | ], 82 | "source": [ 83 | "x = pd.concat([x_train,x_test])\n", 84 | "x.shape" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 8, 90 | "metadata": { 91 | "collapsed": true 92 | }, 93 | "outputs": [], 94 | "source": [ 95 | "Y_train = train['y']" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 9, 101 | "metadata": { 102 | "collapsed": false 103 | }, 104 | "outputs": [], 105 | "source": [ 106 | "for i in range(96,158):\n", 107 | " col = 'x'+'_'+str(i)\n", 108 | " dummies_df = pd.get_dummies(x[col]).rename(columns=lambda x: col + str(x))\n", 109 | " x = pd.concat([x, dummies_df], axis=1)" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 10, 115 | "metadata": { 116 | "collapsed": false 117 | }, 118 | "outputs": [ 119 | { 120 | "name": "stdout", 121 | "output_type": "stream", 122 | "text": [ 123 | "(15000, 355)\n", 124 | "(10000, 
355)\n" 125 | ] 126 | } 127 | ], 128 | "source": [ 129 | "train_X = x[0:15000]\n", 130 | "test_X = x[15000:25000]\n", 131 | "print(train_X.shape)\n", 132 | "print(test_X.shape)" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 11, 138 | "metadata": { 139 | "collapsed": true 140 | }, 141 | "outputs": [], 142 | "source": [ 143 | "from sklearn.metrics import accuracy_score\n", 144 | "from sklearn import metrics\n", 145 | "from sklearn.model_selection import train_test_split\n", 146 | "from xgboost import XGBClassifier" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": 13, 152 | "metadata": { 153 | "collapsed": false 154 | }, 155 | "outputs": [], 156 | "source": [ 157 | "X_train,X_val,y_train,y_val= train_test_split(train_X,Y_train,test_size=0.2,random_state=2)" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": 14, 163 | "metadata": { 164 | "collapsed": false 165 | }, 166 | "outputs": [], 167 | "source": [ 168 | "gbm = XGBClassifier( n_estimators= 100, max_depth= 4, min_child_weight= 2, gamma=0.9, subsample=0.8, \n", 169 | " colsample_bytree=0.8, objective= 'binary:logistic', nthread= -1, scale_pos_weight=1).fit(X_train, y_train)" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 15, 175 | "metadata": { 176 | "collapsed": false 177 | }, 178 | "outputs": [ 179 | { 180 | "name": "stdout", 181 | "output_type": "stream", 182 | "text": [ 183 | "0.80642048092\n" 184 | ] 185 | } 186 | ], 187 | "source": [ 188 | "predictions = gbm.predict_proba(X_val)\n", 189 | "pre = predictions[:,1]\n", 190 | "val_auc = metrics.roc_auc_score(y_val,pre)#验证集上的auc值\n", 191 | "print(val_auc)" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": 16, 197 | "metadata": { 198 | "collapsed": false 199 | }, 200 | "outputs": [ 201 | { 202 | "data": { 203 | "text/plain": [ 204 | "(10000,)" 205 | ] 206 | }, 207 | "execution_count": 16, 208 | "metadata": {}, 209 | 
"output_type": "execute_result" 210 | } 211 | ], 212 | "source": [ 213 | "preds = gbm.predict_proba(test_X)\n", 214 | "pred = preds[:,1]\n", 215 | "pred.shape" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": 18, 221 | "metadata": { 222 | "collapsed": true 223 | }, 224 | "outputs": [], 225 | "source": [ 226 | "Submission = pd.DataFrame({'cust_id': test['cust_id'], 'pred_prob': pred})\n", 227 | "Submission.to_csv('../result/xgb.csv',index=False)" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": null, 233 | "metadata": { 234 | "collapsed": true 235 | }, 236 | "outputs": [], 237 | "source": [] 238 | } 239 | ], 240 | "metadata": { 241 | "anaconda-cloud": {}, 242 | "kernelspec": { 243 | "display_name": "Python [default]", 244 | "language": "python", 245 | "name": "python3" 246 | }, 247 | "language_info": { 248 | "codemirror_mode": { 249 | "name": "ipython", 250 | "version": 3 251 | }, 252 | "file_extension": ".py", 253 | "mimetype": "text/x-python", 254 | "name": "python", 255 | "nbconvert_exporter": "python", 256 | "pygments_lexer": "ipython3", 257 | "version": "3.5.2" 258 | } 259 | }, 260 | "nbformat": 4, 261 | "nbformat_minor": 1 262 | } 263 | -------------------------------------------------------------------------------- /模型二代码b榜0.749880/code/gbdt预测.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np\n", 12 | "import pandas as pd\n", 13 | "import matplotlib.pyplot as plt\n", 14 | "%matplotlib inline" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "metadata": { 21 | "collapsed": false 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "train = pd.read_csv('../data/train_xy.csv')" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 3, 31 | 
"metadata": { 32 | "collapsed": true 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "test = pd.read_csv('../data/test_all.csv')" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 5, 42 | "metadata": { 43 | "collapsed": false 44 | }, 45 | "outputs": [ 46 | { 47 | "data": { 48 | "text/plain": [ 49 | "(10000, 157)" 50 | ] 51 | }, 52 | "execution_count": 5, 53 | "metadata": {}, 54 | "output_type": "execute_result" 55 | } 56 | ], 57 | "source": [ 58 | "x_train = train.drop(['cust_group','y','\\ufeffcust_id'],axis =1)\n", 59 | "x_test = test.drop(['cust_group','cust_id'],axis=1)\n", 60 | "x_train.shape\n", 61 | "x_test.shape" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 6, 67 | "metadata": { 68 | "collapsed": false 69 | }, 70 | "outputs": [ 71 | { 72 | "data": { 73 | "text/plain": [ 74 | "(25000, 157)" 75 | ] 76 | }, 77 | "execution_count": 6, 78 | "metadata": {}, 79 | "output_type": "execute_result" 80 | } 81 | ], 82 | "source": [ 83 | "x = pd.concat([x_train,x_test])\n", 84 | "x.shape" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 7, 90 | "metadata": { 91 | "collapsed": true 92 | }, 93 | "outputs": [], 94 | "source": [ 95 | "Y_train = train['y']" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 8, 101 | "metadata": { 102 | "collapsed": false 103 | }, 104 | "outputs": [], 105 | "source": [ 106 | "for i in range(96,158):\n", 107 | " col = 'x'+'_'+str(i)\n", 108 | " dummies_df = pd.get_dummies(x[col]).rename(columns=lambda x: col + str(x))\n", 109 | " x = pd.concat([x, dummies_df], axis=1)" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 9, 115 | "metadata": { 116 | "collapsed": false 117 | }, 118 | "outputs": [ 119 | { 120 | "name": "stdout", 121 | "output_type": "stream", 122 | "text": [ 123 | "(15000, 355)\n", 124 | "(10000, 355)\n" 125 | ] 126 | } 127 | ], 128 | "source": [ 129 | "train_X = x[0:15000]\n", 130 | "test_X = x[15000:25000]\n", 
131 | "print(train_X.shape)\n", 132 | "print(test_X.shape)" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 10, 138 | "metadata": { 139 | "collapsed": false 140 | }, 141 | "outputs": [], 142 | "source": [ 143 | "from sklearn.metrics import accuracy_score\n", 144 | "from sklearn import metrics\n", 145 | "from sklearn.model_selection import train_test_split\n", 146 | "from xgboost import XGBClassifier\n", 147 | "from sklearn.ensemble import GradientBoostingClassifier" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": 11, 153 | "metadata": { 154 | "collapsed": false 155 | }, 156 | "outputs": [], 157 | "source": [ 158 | "X_train,X_val,y_train,y_val= train_test_split(train_X,Y_train,test_size=0.2,random_state=2)" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": 16, 164 | "metadata": { 165 | "collapsed": false 166 | }, 167 | "outputs": [], 168 | "source": [ 169 | "clf = GradientBoostingClassifier(n_estimators=120, learning_rate=0.05,max_depth=3, random_state=0).fit(X_train, y_train)" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 17, 175 | "metadata": { 176 | "collapsed": false 177 | }, 178 | "outputs": [ 179 | { 180 | "name": "stdout", 181 | "output_type": "stream", 182 | "text": [ 183 | "0.798263097577\n" 184 | ] 185 | } 186 | ], 187 | "source": [ 188 | "predictions = clf.predict_proba(X_val)\n", 189 | "pre = predictions[:,1]\n", 190 | "val_auc = metrics.roc_auc_score(y_val,pre)#验证集上的auc值\n", 191 | "print(val_auc)" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": 18, 197 | "metadata": { 198 | "collapsed": false 199 | }, 200 | "outputs": [ 201 | { 202 | "data": { 203 | "text/plain": [ 204 | "(10000,)" 205 | ] 206 | }, 207 | "execution_count": 18, 208 | "metadata": {}, 209 | "output_type": "execute_result" 210 | } 211 | ], 212 | "source": [ 213 | "preds = clf.predict_proba(test_X)\n", 214 | "pred = preds[:,1]\n", 215 | "pred.shape" 
216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": 19, 221 | "metadata": { 222 | "collapsed": true 223 | }, 224 | "outputs": [], 225 | "source": [ 226 | "Submission = pd.DataFrame({'cust_id': test['cust_id'], 'pred_prob': pred})\n", 227 | "Submission.to_csv('../result/gbdt.csv',index=False)" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": null, 233 | "metadata": { 234 | "collapsed": true 235 | }, 236 | "outputs": [], 237 | "source": [] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": null, 242 | "metadata": { 243 | "collapsed": true 244 | }, 245 | "outputs": [], 246 | "source": [] 247 | } 248 | ], 249 | "metadata": { 250 | "anaconda-cloud": {}, 251 | "kernelspec": { 252 | "display_name": "Python [default]", 253 | "language": "python", 254 | "name": "python3" 255 | }, 256 | "language_info": { 257 | "codemirror_mode": { 258 | "name": "ipython", 259 | "version": 3 260 | }, 261 | "file_extension": ".py", 262 | "mimetype": "text/x-python", 263 | "name": "python", 264 | "nbconvert_exporter": "python", 265 | "pygments_lexer": "ipython3", 266 | "version": "3.5.2" 267 | } 268 | }, 269 | "nbformat": 4, 270 | "nbformat_minor": 1 271 | } 272 | -------------------------------------------------------------------------------- /模型二代码b榜0.749880/code/.ipynb_checkpoints/gbdt预测-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np\n", 12 | "import pandas as pd\n", 13 | "import matplotlib.pyplot as plt\n", 14 | "%matplotlib inline" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "metadata": { 21 | "collapsed": false 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "train = pd.read_csv('../data/train_xy.csv')" 26 | ] 27 | }, 28 | { 29 | "cell_type": 
"code", 30 | "execution_count": 3, 31 | "metadata": { 32 | "collapsed": true 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "test = pd.read_csv('../data/test_all.csv')" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 5, 42 | "metadata": { 43 | "collapsed": false 44 | }, 45 | "outputs": [ 46 | { 47 | "data": { 48 | "text/plain": [ 49 | "(10000, 157)" 50 | ] 51 | }, 52 | "execution_count": 5, 53 | "metadata": {}, 54 | "output_type": "execute_result" 55 | } 56 | ], 57 | "source": [ 58 | "x_train = train.drop(['cust_group','y','\\ufeffcust_id'],axis =1)\n", 59 | "x_test = test.drop(['cust_group','cust_id'],axis=1)\n", 60 | "x_train.shape\n", 61 | "x_test.shape" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 6, 67 | "metadata": { 68 | "collapsed": false 69 | }, 70 | "outputs": [ 71 | { 72 | "data": { 73 | "text/plain": [ 74 | "(25000, 157)" 75 | ] 76 | }, 77 | "execution_count": 6, 78 | "metadata": {}, 79 | "output_type": "execute_result" 80 | } 81 | ], 82 | "source": [ 83 | "x = pd.concat([x_train,x_test])\n", 84 | "x.shape" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 7, 90 | "metadata": { 91 | "collapsed": true 92 | }, 93 | "outputs": [], 94 | "source": [ 95 | "Y_train = train['y']" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 8, 101 | "metadata": { 102 | "collapsed": false 103 | }, 104 | "outputs": [], 105 | "source": [ 106 | "for i in range(96,158):\n", 107 | " col = 'x'+'_'+str(i)\n", 108 | " dummies_df = pd.get_dummies(x[col]).rename(columns=lambda x: col + str(x))\n", 109 | " x = pd.concat([x, dummies_df], axis=1)" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 9, 115 | "metadata": { 116 | "collapsed": false 117 | }, 118 | "outputs": [ 119 | { 120 | "name": "stdout", 121 | "output_type": "stream", 122 | "text": [ 123 | "(15000, 355)\n", 124 | "(10000, 355)\n" 125 | ] 126 | } 127 | ], 128 | "source": [ 129 | "train_X = 
x[0:15000]\n", 130 | "test_X = x[15000:25000]\n", 131 | "print(train_X.shape)\n", 132 | "print(test_X.shape)" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 10, 138 | "metadata": { 139 | "collapsed": false 140 | }, 141 | "outputs": [], 142 | "source": [ 143 | "from sklearn.metrics import accuracy_score\n", 144 | "from sklearn import metrics\n", 145 | "from sklearn.model_selection import train_test_split\n", 146 | "from xgboost import XGBClassifier\n", 147 | "from sklearn.ensemble import GradientBoostingClassifier" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": 11, 153 | "metadata": { 154 | "collapsed": false 155 | }, 156 | "outputs": [], 157 | "source": [ 158 | "X_train,X_val,y_train,y_val= train_test_split(train_X,Y_train,test_size=0.2,random_state=2)" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": 16, 164 | "metadata": { 165 | "collapsed": false 166 | }, 167 | "outputs": [], 168 | "source": [ 169 | "clf = GradientBoostingClassifier(n_estimators=120, learning_rate=0.05,max_depth=3, random_state=0).fit(X_train, y_train)" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 17, 175 | "metadata": { 176 | "collapsed": false 177 | }, 178 | "outputs": [ 179 | { 180 | "name": "stdout", 181 | "output_type": "stream", 182 | "text": [ 183 | "0.798263097577\n" 184 | ] 185 | } 186 | ], 187 | "source": [ 188 | "predictions = clf.predict_proba(X_val)\n", 189 | "pre = predictions[:,1]\n", 190 | "val_auc = metrics.roc_auc_score(y_val,pre)#验证集上的auc值\n", 191 | "print(val_auc)" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": 18, 197 | "metadata": { 198 | "collapsed": false 199 | }, 200 | "outputs": [ 201 | { 202 | "data": { 203 | "text/plain": [ 204 | "(10000,)" 205 | ] 206 | }, 207 | "execution_count": 18, 208 | "metadata": {}, 209 | "output_type": "execute_result" 210 | } 211 | ], 212 | "source": [ 213 | "preds = 
clf.predict_proba(test_X)\n", 214 | "pred = preds[:,1]\n", 215 | "pred.shape" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": 19, 221 | "metadata": { 222 | "collapsed": true 223 | }, 224 | "outputs": [], 225 | "source": [ 226 | "Submission = pd.DataFrame({'cust_id': test['cust_id'], 'pred_prob': pred})\n", 227 | "Submission.to_csv('../result/gbdt.csv',index=False)" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": null, 233 | "metadata": { 234 | "collapsed": true 235 | }, 236 | "outputs": [], 237 | "source": [] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": null, 242 | "metadata": { 243 | "collapsed": true 244 | }, 245 | "outputs": [], 246 | "source": [] 247 | } 248 | ], 249 | "metadata": { 250 | "anaconda-cloud": {}, 251 | "kernelspec": { 252 | "display_name": "Python [default]", 253 | "language": "python", 254 | "name": "python3" 255 | }, 256 | "language_info": { 257 | "codemirror_mode": { 258 | "name": "ipython", 259 | "version": 3 260 | }, 261 | "file_extension": ".py", 262 | "mimetype": "text/x-python", 263 | "name": "python", 264 | "nbconvert_exporter": "python", 265 | "pygments_lexer": "ipython3", 266 | "version": "3.5.2" 267 | } 268 | }, 269 | "nbformat": 4, 270 | "nbformat_minor": 1 271 | } 272 | -------------------------------------------------------------------------------- /模型二代码b榜0.749880/code/_ensemble.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np\n", 12 | "import pandas as pd\n", 13 | "import matplotlib.pyplot as plt\n", 14 | "%matplotlib inline " 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "metadata": { 21 | "collapsed": true 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "xgb = 
pd.read_csv('../result/xgb.csv')" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 3, 31 | "metadata": { 32 | "collapsed": true 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "lgb = pd.read_csv('../result/lgb.csv')" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 4, 42 | "metadata": { 43 | "collapsed": true 44 | }, 45 | "outputs": [], 46 | "source": [ 47 | "gbdt = pd.read_csv('../result/gbdt.csv')" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 5, 53 | "metadata": { 54 | "collapsed": true 55 | }, 56 | "outputs": [], 57 | "source": [ 58 | "nn = pd.read_csv('../result/nn.csv')" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 6, 64 | "metadata": { 65 | "collapsed": false 66 | }, 67 | "outputs": [ 68 | { 69 | "data": { 70 | "text/html": [ 71 | "
\n", 72 | "\n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | "
cust_idpred_prob
010.005983
120.025552
230.018066
340.035206
450.067734
\n", 108 | "
" 109 | ], 110 | "text/plain": [ 111 | " cust_id pred_prob\n", 112 | "0 1 0.005983\n", 113 | "1 2 0.025552\n", 114 | "2 3 0.018066\n", 115 | "3 4 0.035206\n", 116 | "4 5 0.067734" 117 | ] 118 | }, 119 | "execution_count": 6, 120 | "metadata": {}, 121 | "output_type": "execute_result" 122 | } 123 | ], 124 | "source": [ 125 | "nn.head()" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 7, 131 | "metadata": { 132 | "collapsed": false 133 | }, 134 | "outputs": [], 135 | "source": [ 136 | "result = xgb.pred_prob*0.7 + lgb.pred_prob*0.1 + gbdt.pred_prob*0.1 + nn.pred_prob*0.1" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 8, 142 | "metadata": { 143 | "collapsed": false 144 | }, 145 | "outputs": [], 146 | "source": [ 147 | "pred = nn\n", 148 | "pred.pred_prob = result" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": 9, 154 | "metadata": { 155 | "collapsed": false 156 | }, 157 | "outputs": [ 158 | { 159 | "data": { 160 | "text/html": [ 161 | "
\n", 162 | "\n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | "
cust_idpred_prob
010.014872
120.036687
230.194181
340.064974
450.118509
\n", 198 | "
" 199 | ], 200 | "text/plain": [ 201 | " cust_id pred_prob\n", 202 | "0 1 0.014872\n", 203 | "1 2 0.036687\n", 204 | "2 3 0.194181\n", 205 | "3 4 0.064974\n", 206 | "4 5 0.118509" 207 | ] 208 | }, 209 | "execution_count": 9, 210 | "metadata": {}, 211 | "output_type": "execute_result" 212 | } 213 | ], 214 | "source": [ 215 | "pred.head()" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": 10, 221 | "metadata": { 222 | "collapsed": true 223 | }, 224 | "outputs": [], 225 | "source": [ 226 | "pred.to_csv('../result/_ensemble.csv',index = False)" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": null, 232 | "metadata": { 233 | "collapsed": true 234 | }, 235 | "outputs": [], 236 | "source": [] 237 | } 238 | ], 239 | "metadata": { 240 | "anaconda-cloud": {}, 241 | "kernelspec": { 242 | "display_name": "Python [default]", 243 | "language": "python", 244 | "name": "python3" 245 | }, 246 | "language_info": { 247 | "codemirror_mode": { 248 | "name": "ipython", 249 | "version": 3 250 | }, 251 | "file_extension": ".py", 252 | "mimetype": "text/x-python", 253 | "name": "python", 254 | "nbconvert_exporter": "python", 255 | "pygments_lexer": "ipython3", 256 | "version": "3.5.2" 257 | } 258 | }, 259 | "nbformat": 4, 260 | "nbformat_minor": 1 261 | } 262 | -------------------------------------------------------------------------------- /模型二代码b榜0.749880/code/semi_ensemble.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np\n", 12 | "import pandas as pd\n", 13 | "import matplotlib.pyplot as plt\n", 14 | "%matplotlib inline" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "metadata": { 21 | "collapsed": true 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "xgb = 
pd.read_csv('../result/semi_xgb.csv')" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 3, 31 | "metadata": { 32 | "collapsed": true 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "lgb = pd.read_csv('../result/semi_lgb.csv')" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 4, 42 | "metadata": { 43 | "collapsed": true 44 | }, 45 | "outputs": [], 46 | "source": [ 47 | "gbdt = pd.read_csv('../result/semi_gbdt.csv')" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 5, 53 | "metadata": { 54 | "collapsed": true 55 | }, 56 | "outputs": [], 57 | "source": [ 58 | "nn = pd.read_csv('../result/semi_nn.csv')" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 6, 64 | "metadata": { 65 | "collapsed": false 66 | }, 67 | "outputs": [ 68 | { 69 | "data": { 70 | "text/html": [ 71 | "
\n", 72 | "\n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | "
cust_idpred_prob
010.003583
120.069257
230.015288
340.084963
450.064052
\n", 108 | "
" 109 | ], 110 | "text/plain": [ 111 | " cust_id pred_prob\n", 112 | "0 1 0.003583\n", 113 | "1 2 0.069257\n", 114 | "2 3 0.015288\n", 115 | "3 4 0.084963\n", 116 | "4 5 0.064052" 117 | ] 118 | }, 119 | "execution_count": 6, 120 | "metadata": {}, 121 | "output_type": "execute_result" 122 | } 123 | ], 124 | "source": [ 125 | "nn.head()" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 7, 131 | "metadata": { 132 | "collapsed": true 133 | }, 134 | "outputs": [], 135 | "source": [ 136 | "result = xgb.pred_prob*0.7 + lgb.pred_prob*0.1 + gbdt.pred_prob*0.1 + nn.pred_prob*0.1" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 8, 142 | "metadata": { 143 | "collapsed": true 144 | }, 145 | "outputs": [], 146 | "source": [ 147 | "pred = nn\n", 148 | "pred.pred_prob = result" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": 9, 154 | "metadata": { 155 | "collapsed": false 156 | }, 157 | "outputs": [ 158 | { 159 | "data": { 160 | "text/html": [ 161 | "
\n", 162 | "\n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | "
cust_idpred_prob
010.016251
120.031600
230.076029
340.058965
450.055669
\n", 198 | "
" 199 | ], 200 | "text/plain": [ 201 | " cust_id pred_prob\n", 202 | "0 1 0.016251\n", 203 | "1 2 0.031600\n", 204 | "2 3 0.076029\n", 205 | "3 4 0.058965\n", 206 | "4 5 0.055669" 207 | ] 208 | }, 209 | "execution_count": 9, 210 | "metadata": {}, 211 | "output_type": "execute_result" 212 | } 213 | ], 214 | "source": [ 215 | "pred.head()" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": 10, 221 | "metadata": { 222 | "collapsed": true 223 | }, 224 | "outputs": [], 225 | "source": [ 226 | "pred.to_csv('../result/semi_ensemble.csv',index = False)" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": null, 232 | "metadata": { 233 | "collapsed": true 234 | }, 235 | "outputs": [], 236 | "source": [] 237 | } 238 | ], 239 | "metadata": { 240 | "anaconda-cloud": {}, 241 | "kernelspec": { 242 | "display_name": "Python [default]", 243 | "language": "python", 244 | "name": "python3" 245 | }, 246 | "language_info": { 247 | "codemirror_mode": { 248 | "name": "ipython", 249 | "version": 3 250 | }, 251 | "file_extension": ".py", 252 | "mimetype": "text/x-python", 253 | "name": "python", 254 | "nbconvert_exporter": "python", 255 | "pygments_lexer": "ipython3", 256 | "version": "3.5.2" 257 | } 258 | }, 259 | "nbformat": 4, 260 | "nbformat_minor": 1 261 | } 262 | -------------------------------------------------------------------------------- /模型二代码b榜0.749880/code/.ipynb_checkpoints/_ensemble-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np\n", 12 | "import pandas as pd\n", 13 | "import matplotlib.pyplot as plt\n", 14 | "%matplotlib inline " 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "metadata": { 21 | "collapsed": true 22 | }, 23 | "outputs": [], 24 | "source": [ 
25 | "xgb = pd.read_csv('../result/xgb.csv')" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 3, 31 | "metadata": { 32 | "collapsed": true 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "lgb = pd.read_csv('../result/lgb.csv')" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 4, 42 | "metadata": { 43 | "collapsed": true 44 | }, 45 | "outputs": [], 46 | "source": [ 47 | "gbdt = pd.read_csv('../result/gbdt.csv')" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 5, 53 | "metadata": { 54 | "collapsed": true 55 | }, 56 | "outputs": [], 57 | "source": [ 58 | "nn = pd.read_csv('../result/nn.csv')" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 6, 64 | "metadata": { 65 | "collapsed": false 66 | }, 67 | "outputs": [ 68 | { 69 | "data": { 70 | "text/html": [ 71 | "
\n", 72 | "\n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | "
cust_idpred_prob
010.005983
120.025552
230.018066
340.035206
450.067734
\n", 108 | "
" 109 | ], 110 | "text/plain": [ 111 | " cust_id pred_prob\n", 112 | "0 1 0.005983\n", 113 | "1 2 0.025552\n", 114 | "2 3 0.018066\n", 115 | "3 4 0.035206\n", 116 | "4 5 0.067734" 117 | ] 118 | }, 119 | "execution_count": 6, 120 | "metadata": {}, 121 | "output_type": "execute_result" 122 | } 123 | ], 124 | "source": [ 125 | "nn.head()" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 7, 131 | "metadata": { 132 | "collapsed": false 133 | }, 134 | "outputs": [], 135 | "source": [ 136 | "result = xgb.pred_prob*0.7 + lgb.pred_prob*0.1 + gbdt.pred_prob*0.1 + nn.pred_prob*0.1" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 8, 142 | "metadata": { 143 | "collapsed": false 144 | }, 145 | "outputs": [], 146 | "source": [ 147 | "pred = nn\n", 148 | "pred.pred_prob = result" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": 9, 154 | "metadata": { 155 | "collapsed": false 156 | }, 157 | "outputs": [ 158 | { 159 | "data": { 160 | "text/html": [ 161 | "
\n", 162 | "\n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | "
cust_idpred_prob
010.014872
120.036687
230.194181
340.064974
450.118509
\n", 198 | "
" 199 | ], 200 | "text/plain": [ 201 | " cust_id pred_prob\n", 202 | "0 1 0.014872\n", 203 | "1 2 0.036687\n", 204 | "2 3 0.194181\n", 205 | "3 4 0.064974\n", 206 | "4 5 0.118509" 207 | ] 208 | }, 209 | "execution_count": 9, 210 | "metadata": {}, 211 | "output_type": "execute_result" 212 | } 213 | ], 214 | "source": [ 215 | "pred.head()" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": 10, 221 | "metadata": { 222 | "collapsed": true 223 | }, 224 | "outputs": [], 225 | "source": [ 226 | "pred.to_csv('../result/_ensemble.csv',index = False)" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": null, 232 | "metadata": { 233 | "collapsed": true 234 | }, 235 | "outputs": [], 236 | "source": [] 237 | } 238 | ], 239 | "metadata": { 240 | "anaconda-cloud": {}, 241 | "kernelspec": { 242 | "display_name": "Python [default]", 243 | "language": "python", 244 | "name": "python3" 245 | }, 246 | "language_info": { 247 | "codemirror_mode": { 248 | "name": "ipython", 249 | "version": 3 250 | }, 251 | "file_extension": ".py", 252 | "mimetype": "text/x-python", 253 | "name": "python", 254 | "nbconvert_exporter": "python", 255 | "pygments_lexer": "ipython3", 256 | "version": "3.5.2" 257 | } 258 | }, 259 | "nbformat": 4, 260 | "nbformat_minor": 1 261 | } 262 | -------------------------------------------------------------------------------- /模型二代码b榜0.749880/code/.ipynb_checkpoints/semi_ensemble-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np\n", 12 | "import pandas as pd\n", 13 | "import matplotlib.pyplot as plt\n", 14 | "%matplotlib inline" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "metadata": { 21 | "collapsed": true 22 | }, 23 | "outputs": [], 24 | "source": [ 25 
| "xgb = pd.read_csv('../result/semi_xgb.csv')" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 3, 31 | "metadata": { 32 | "collapsed": true 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "lgb = pd.read_csv('../result/semi_lgb.csv')" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 4, 42 | "metadata": { 43 | "collapsed": true 44 | }, 45 | "outputs": [], 46 | "source": [ 47 | "gbdt = pd.read_csv('../result/semi_gbdt.csv')" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 5, 53 | "metadata": { 54 | "collapsed": true 55 | }, 56 | "outputs": [], 57 | "source": [ 58 | "nn = pd.read_csv('../result/semi_nn.csv')" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 6, 64 | "metadata": { 65 | "collapsed": false 66 | }, 67 | "outputs": [ 68 | { 69 | "data": { 70 | "text/html": [ 71 | "
\n", 72 | "\n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | "
cust_idpred_prob
010.003583
120.069257
230.015288
340.084963
450.064052
\n", 108 | "
" 109 | ], 110 | "text/plain": [ 111 | " cust_id pred_prob\n", 112 | "0 1 0.003583\n", 113 | "1 2 0.069257\n", 114 | "2 3 0.015288\n", 115 | "3 4 0.084963\n", 116 | "4 5 0.064052" 117 | ] 118 | }, 119 | "execution_count": 6, 120 | "metadata": {}, 121 | "output_type": "execute_result" 122 | } 123 | ], 124 | "source": [ 125 | "nn.head()" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 7, 131 | "metadata": { 132 | "collapsed": true 133 | }, 134 | "outputs": [], 135 | "source": [ 136 | "result = xgb.pred_prob*0.7 + lgb.pred_prob*0.1 + gbdt.pred_prob*0.1 + nn.pred_prob*0.1" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 8, 142 | "metadata": { 143 | "collapsed": true 144 | }, 145 | "outputs": [], 146 | "source": [ 147 | "pred = nn\n", 148 | "pred.pred_prob = result" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": 9, 154 | "metadata": { 155 | "collapsed": false 156 | }, 157 | "outputs": [ 158 | { 159 | "data": { 160 | "text/html": [ 161 | "
\n", 162 | "\n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | "
cust_idpred_prob
010.016251
120.031600
230.076029
340.058965
450.055669
\n", 198 | "
" 199 | ], 200 | "text/plain": [ 201 | " cust_id pred_prob\n", 202 | "0 1 0.016251\n", 203 | "1 2 0.031600\n", 204 | "2 3 0.076029\n", 205 | "3 4 0.058965\n", 206 | "4 5 0.055669" 207 | ] 208 | }, 209 | "execution_count": 9, 210 | "metadata": {}, 211 | "output_type": "execute_result" 212 | } 213 | ], 214 | "source": [ 215 | "pred.head()" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": 10, 221 | "metadata": { 222 | "collapsed": true 223 | }, 224 | "outputs": [], 225 | "source": [ 226 | "pred.to_csv('../result/semi_ensemble.csv',index = False)" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": null, 232 | "metadata": { 233 | "collapsed": true 234 | }, 235 | "outputs": [], 236 | "source": [] 237 | } 238 | ], 239 | "metadata": { 240 | "anaconda-cloud": {}, 241 | "kernelspec": { 242 | "display_name": "Python [default]", 243 | "language": "python", 244 | "name": "python3" 245 | }, 246 | "language_info": { 247 | "codemirror_mode": { 248 | "name": "ipython", 249 | "version": 3 250 | }, 251 | "file_extension": ".py", 252 | "mimetype": "text/x-python", 253 | "name": "python", 254 | "nbconvert_exporter": "python", 255 | "pygments_lexer": "ipython3", 256 | "version": "3.5.2" 257 | } 258 | }, 259 | "nbformat": 4, 260 | "nbformat_minor": 1 261 | } 262 | -------------------------------------------------------------------------------- /模型二代码b榜0.749880/code/lgb预测.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np\n", 12 | "import pandas as pd\n", 13 | "import matplotlib.pyplot as plt\n", 14 | "%matplotlib inline " 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 4, 20 | "metadata": { 21 | "collapsed": false 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "train = 
pd.read_csv('../data/train_xy.csv')" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 5, 31 | "metadata": { 32 | "collapsed": false 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "test = pd.read_csv('../data/test_all.csv')" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 7, 42 | "metadata": { 43 | "collapsed": false 44 | }, 45 | "outputs": [ 46 | { 47 | "data": { 48 | "text/plain": [ 49 | "(10000, 157)" 50 | ] 51 | }, 52 | "execution_count": 7, 53 | "metadata": {}, 54 | "output_type": "execute_result" 55 | } 56 | ], 57 | "source": [ 58 | "x_train = train.drop(['cust_group','y','\\ufeffcust_id'],axis =1)\n", 59 | "x_test = test.drop(['cust_group','cust_id'],axis=1)\n", 60 | "x_train.shape\n", 61 | "x_test.shape" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 8, 67 | "metadata": { 68 | "collapsed": false 69 | }, 70 | "outputs": [ 71 | { 72 | "data": { 73 | "text/plain": [ 74 | "(25000, 157)" 75 | ] 76 | }, 77 | "execution_count": 8, 78 | "metadata": {}, 79 | "output_type": "execute_result" 80 | } 81 | ], 82 | "source": [ 83 | "x = pd.concat([x_train,x_test])\n", 84 | "x.shape" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 9, 90 | "metadata": { 91 | "collapsed": true 92 | }, 93 | "outputs": [], 94 | "source": [ 95 | "Y_train = train['y']" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 10, 101 | "metadata": { 102 | "collapsed": false 103 | }, 104 | "outputs": [], 105 | "source": [ 106 | "for i in range(96,158):\n", 107 | " col = 'x'+'_'+str(i)\n", 108 | " dummies_df = pd.get_dummies(x[col]).rename(columns=lambda x: col + str(x))\n", 109 | " x = pd.concat([x, dummies_df], axis=1)" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 11, 115 | "metadata": { 116 | "collapsed": false 117 | }, 118 | "outputs": [ 119 | { 120 | "name": "stdout", 121 | "output_type": "stream", 122 | "text": [ 123 | "(15000, 355)\n", 124 | 
"(10000, 355)\n" 125 | ] 126 | } 127 | ], 128 | "source": [ 129 | "train_X = x[0:15000]\n", 130 | "test_X = x[15000:25000]\n", 131 | "print(train_X.shape)\n", 132 | "print(test_X.shape)" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 12, 138 | "metadata": { 139 | "collapsed": true 140 | }, 141 | "outputs": [], 142 | "source": [ 143 | "from sklearn.metrics import accuracy_score\n", 144 | "from sklearn import metrics\n", 145 | "from sklearn.model_selection import train_test_split\n", 146 | "from xgboost import XGBClassifier\n", 147 | "import lightgbm as lgb" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": 13, 153 | "metadata": { 154 | "collapsed": false 155 | }, 156 | "outputs": [], 157 | "source": [ 158 | "X_train,X_val,y_train,y_val= train_test_split(train_X,Y_train,test_size=0.2,random_state=2)" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": 14, 164 | "metadata": { 165 | "collapsed": false 166 | }, 167 | "outputs": [], 168 | "source": [ 169 | "lgb_train = lgb.Dataset(X_train, y_train)\n", 170 | "lgb_val = lgb.Dataset(X_val, y_val, reference=lgb_train)" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": 15, 176 | "metadata": { 177 | "collapsed": false, 178 | "scrolled": true 179 | }, 180 | "outputs": [], 181 | "source": [ 182 | "param = {\n", 183 | " 'max_depth':6,\n", 184 | " 'num_leaves':80,\n", 185 | " 'learning_rate':0.03,\n", 186 | " 'scale_pos_weight':1,\n", 187 | " 'num_threads':40,\n", 188 | " 'objective':'binary',\n", 189 | " 'bagging_fraction':0.7,\n", 190 | " 'bagging_freq':1,\n", 191 | " 'min_sum_hessian_in_leaf':100\n", 192 | "}\n", 193 | "\n", 194 | "param['is_unbalance']='true'\n", 195 | "param['metric'] = 'auc'\n", 196 | "\n", 197 | "\n", 198 | "bst=lgb.cv(param,lgb_train, num_boost_round=1000, nfold=3, early_stopping_rounds=30)\n", 199 | "gbm = lgb.train(param,lgb_train,num_boost_round=len(bst['auc-mean']))\n" 200 | ] 201 | }, 202 | { 
203 | "cell_type": "code", 204 | "execution_count": 16, 205 | "metadata": { 206 | "collapsed": false 207 | }, 208 | "outputs": [ 209 | { 210 | "name": "stdout", 211 | "output_type": "stream", 212 | "text": [ 213 | "0.809795678593\n" 214 | ] 215 | } 216 | ], 217 | "source": [ 218 | "ypred = gbm.predict(X_val)\n", 219 | "val_auc = metrics.roc_auc_score(y_val,ypred)#验证集上的auc值\n", 220 | "print(val_auc)" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": 17, 226 | "metadata": { 227 | "collapsed": false 228 | }, 229 | "outputs": [], 230 | "source": [ 231 | "y_pred = gbm.predict(test_X)" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": 18, 237 | "metadata": { 238 | "collapsed": true 239 | }, 240 | "outputs": [], 241 | "source": [ 242 | "Submission = pd.DataFrame({'cust_id': test['cust_id'], 'pred_prob': y_pred})\n", 243 | "Submission.to_csv('../result/lgb.csv',index=False)" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": null, 249 | "metadata": { 250 | "collapsed": true 251 | }, 252 | "outputs": [], 253 | "source": [] 254 | } 255 | ], 256 | "metadata": { 257 | "anaconda-cloud": {}, 258 | "kernelspec": { 259 | "display_name": "Python [default]", 260 | "language": "python", 261 | "name": "python3" 262 | }, 263 | "language_info": { 264 | "codemirror_mode": { 265 | "name": "ipython", 266 | "version": 3 267 | }, 268 | "file_extension": ".py", 269 | "mimetype": "text/x-python", 270 | "name": "python", 271 | "nbconvert_exporter": "python", 272 | "pygments_lexer": "ipython3", 273 | "version": "3.5.2" 274 | } 275 | }, 276 | "nbformat": 4, 277 | "nbformat_minor": 1 278 | } 279 | -------------------------------------------------------------------------------- /模型二代码b榜0.749880/code/.ipynb_checkpoints/lgb预测-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | 
"collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np\n", 12 | "import pandas as pd\n", 13 | "import matplotlib.pyplot as plt\n", 14 | "%matplotlib inline " 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 4, 20 | "metadata": { 21 | "collapsed": false 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "train = pd.read_csv('../data/train_xy.csv')" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 5, 31 | "metadata": { 32 | "collapsed": false 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "test = pd.read_csv('../data/test_all.csv')" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 7, 42 | "metadata": { 43 | "collapsed": false 44 | }, 45 | "outputs": [ 46 | { 47 | "data": { 48 | "text/plain": [ 49 | "(10000, 157)" 50 | ] 51 | }, 52 | "execution_count": 7, 53 | "metadata": {}, 54 | "output_type": "execute_result" 55 | } 56 | ], 57 | "source": [ 58 | "x_train = train.drop(['cust_group','y','\\ufeffcust_id'],axis =1)\n", 59 | "x_test = test.drop(['cust_group','cust_id'],axis=1)\n", 60 | "x_train.shape\n", 61 | "x_test.shape" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 8, 67 | "metadata": { 68 | "collapsed": false 69 | }, 70 | "outputs": [ 71 | { 72 | "data": { 73 | "text/plain": [ 74 | "(25000, 157)" 75 | ] 76 | }, 77 | "execution_count": 8, 78 | "metadata": {}, 79 | "output_type": "execute_result" 80 | } 81 | ], 82 | "source": [ 83 | "x = pd.concat([x_train,x_test])\n", 84 | "x.shape" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 9, 90 | "metadata": { 91 | "collapsed": true 92 | }, 93 | "outputs": [], 94 | "source": [ 95 | "Y_train = train['y']" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 10, 101 | "metadata": { 102 | "collapsed": false 103 | }, 104 | "outputs": [], 105 | "source": [ 106 | "for i in range(96,158):\n", 107 | " col = 'x'+'_'+str(i)\n", 108 | " dummies_df = 
pd.get_dummies(x[col]).rename(columns=lambda x: col + str(x))\n", 109 | " x = pd.concat([x, dummies_df], axis=1)" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 11, 115 | "metadata": { 116 | "collapsed": false 117 | }, 118 | "outputs": [ 119 | { 120 | "name": "stdout", 121 | "output_type": "stream", 122 | "text": [ 123 | "(15000, 355)\n", 124 | "(10000, 355)\n" 125 | ] 126 | } 127 | ], 128 | "source": [ 129 | "train_X = x[0:15000]\n", 130 | "test_X = x[15000:25000]\n", 131 | "print(train_X.shape)\n", 132 | "print(test_X.shape)" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 12, 138 | "metadata": { 139 | "collapsed": true 140 | }, 141 | "outputs": [], 142 | "source": [ 143 | "from sklearn.metrics import accuracy_score\n", 144 | "from sklearn import metrics\n", 145 | "from sklearn.model_selection import train_test_split\n", 146 | "from xgboost import XGBClassifier\n", 147 | "import lightgbm as lgb" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": 13, 153 | "metadata": { 154 | "collapsed": false 155 | }, 156 | "outputs": [], 157 | "source": [ 158 | "X_train,X_val,y_train,y_val= train_test_split(train_X,Y_train,test_size=0.2,random_state=2)" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": 14, 164 | "metadata": { 165 | "collapsed": false 166 | }, 167 | "outputs": [], 168 | "source": [ 169 | "lgb_train = lgb.Dataset(X_train, y_train)\n", 170 | "lgb_val = lgb.Dataset(X_val, y_val, reference=lgb_train)" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": 15, 176 | "metadata": { 177 | "collapsed": false, 178 | "scrolled": true 179 | }, 180 | "outputs": [], 181 | "source": [ 182 | "param = {\n", 183 | " 'max_depth':6,\n", 184 | " 'num_leaves':80,\n", 185 | " 'learning_rate':0.03,\n", 186 | " 'scale_pos_weight':1,\n", 187 | " 'num_threads':40,\n", 188 | " 'objective':'binary',\n", 189 | " 'bagging_fraction':0.7,\n", 190 | " 
'bagging_freq':1,\n", 191 | " 'min_sum_hessian_in_leaf':100\n", 192 | "}\n", 193 | "\n", 194 | "param['is_unbalance']='true'\n", 195 | "param['metric'] = 'auc'\n", 196 | "\n", 197 | "\n", 198 | "bst=lgb.cv(param,lgb_train, num_boost_round=1000, nfold=3, early_stopping_rounds=30)\n", 199 | "gbm = lgb.train(param,lgb_train,num_boost_round=len(bst['auc-mean']))\n" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": 16, 205 | "metadata": { 206 | "collapsed": false 207 | }, 208 | "outputs": [ 209 | { 210 | "name": "stdout", 211 | "output_type": "stream", 212 | "text": [ 213 | "0.809795678593\n" 214 | ] 215 | } 216 | ], 217 | "source": [ 218 | "ypred = gbm.predict(X_val)\n", 219 | "val_auc = metrics.roc_auc_score(y_val,ypred)#验证集上的auc值\n", 220 | "print(val_auc)" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": 17, 226 | "metadata": { 227 | "collapsed": false 228 | }, 229 | "outputs": [], 230 | "source": [ 231 | "y_pred = gbm.predict(test_X)" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": 18, 237 | "metadata": { 238 | "collapsed": true 239 | }, 240 | "outputs": [], 241 | "source": [ 242 | "Submission = pd.DataFrame({'cust_id': test['cust_id'], 'pred_prob': y_pred})\n", 243 | "Submission.to_csv('../result/lgb.csv',index=False)" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": null, 249 | "metadata": { 250 | "collapsed": true 251 | }, 252 | "outputs": [], 253 | "source": [] 254 | } 255 | ], 256 | "metadata": { 257 | "anaconda-cloud": {}, 258 | "kernelspec": { 259 | "display_name": "Python [default]", 260 | "language": "python", 261 | "name": "python3" 262 | }, 263 | "language_info": { 264 | "codemirror_mode": { 265 | "name": "ipython", 266 | "version": 3 267 | }, 268 | "file_extension": ".py", 269 | "mimetype": "text/x-python", 270 | "name": "python", 271 | "nbconvert_exporter": "python", 272 | "pygments_lexer": "ipython3", 273 | "version": "3.5.2" 274 | } 275 | }, 
276 | "nbformat": 4, 277 | "nbformat_minor": 1 278 | } 279 | -------------------------------------------------------------------------------- /模型二代码b榜0.749880/code/.ipynb_checkpoints/xgb预测-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np\n", 12 | "import pandas as pd\n", 13 | "import matplotlib.pyplot as plt\n", 14 | "%matplotlib inline" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 4, 20 | "metadata": { 21 | "collapsed": false 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "train = pd.read_csv('../data/train_xy.csv')" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 5, 31 | "metadata": { 32 | "collapsed": false 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "test = pd.read_csv('../data/test_all.csv')" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 7, 42 | "metadata": { 43 | "collapsed": false 44 | }, 45 | "outputs": [ 46 | { 47 | "data": { 48 | "text/plain": [ 49 | "(10000, 157)" 50 | ] 51 | }, 52 | "execution_count": 7, 53 | "metadata": {}, 54 | "output_type": "execute_result" 55 | } 56 | ], 57 | "source": [ 58 | "x_train = train.drop(['cust_group','y','\\ufeffcust_id'],axis =1)\n", 59 | "x_test = test.drop(['cust_group','cust_id'],axis=1)\n", 60 | "x_train.shape\n", 61 | "x_test.shape" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 8, 67 | "metadata": { 68 | "collapsed": false 69 | }, 70 | "outputs": [ 71 | { 72 | "data": { 73 | "text/plain": [ 74 | "(25000, 157)" 75 | ] 76 | }, 77 | "execution_count": 8, 78 | "metadata": {}, 79 | "output_type": "execute_result" 80 | } 81 | ], 82 | "source": [ 83 | "x = pd.concat([x_train,x_test])\n", 84 | "x.shape" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 9, 90 | 
"metadata": { 91 | "collapsed": true 92 | }, 93 | "outputs": [], 94 | "source": [ 95 | "Y_train = train['y']" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 10, 101 | "metadata": { 102 | "collapsed": false 103 | }, 104 | "outputs": [], 105 | "source": [ 106 | "for i in range(96,158):\n", 107 | " col = 'x'+'_'+str(i)\n", 108 | " dummies_df = pd.get_dummies(x[col]).rename(columns=lambda x: col + str(x))\n", 109 | " x = pd.concat([x, dummies_df], axis=1)" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 11, 115 | "metadata": { 116 | "collapsed": false 117 | }, 118 | "outputs": [ 119 | { 120 | "name": "stdout", 121 | "output_type": "stream", 122 | "text": [ 123 | "(15000, 355)\n", 124 | "(10000, 355)\n" 125 | ] 126 | } 127 | ], 128 | "source": [ 129 | "train_X = x[0:15000]\n", 130 | "test_X = x[15000:25000]\n", 131 | "print(train_X.shape)\n", 132 | "print(test_X.shape)" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 12, 138 | "metadata": { 139 | "collapsed": false 140 | }, 141 | "outputs": [ 142 | { 143 | "name": "stderr", 144 | "output_type": "stream", 145 | "text": [ 146 | "C:\\Program Files\\Anaconda3\\lib\\site-packages\\sklearn\\cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. 
This module will be removed in 0.20.\n", 147 | " \"This module will be removed in 0.20.\", DeprecationWarning)\n", 148 | "Using TensorFlow backend.\n" 149 | ] 150 | } 151 | ], 152 | "source": [ 153 | "from sklearn.tree import DecisionTreeClassifier\n", 154 | "from sklearn.ensemble import RandomForestClassifier\n", 155 | "from sklearn.ensemble import AdaBoostClassifier\n", 156 | "from sklearn.ensemble import ExtraTreesClassifier\n", 157 | "from sklearn.ensemble import GradientBoostingClassifier\n", 158 | "from sklearn.neighbors import KNeighborsClassifier\n", 159 | "from sklearn.svm import SVC\n", 160 | "from sklearn import metrics #accuracy_score,recall_score,f1_score\n", 161 | "from sklearn.metrics import classification_report\n", 162 | "from sklearn.metrics import precision_recall_fscore_support\n", 163 | "from sklearn.utils.multiclass import unique_labels\n", 164 | "from sklearn.metrics import accuracy_score\n", 165 | "from xgboost import XGBClassifier\n", 166 | "from sklearn.ensemble import GradientBoostingClassifier\n", 167 | "from sklearn.cross_validation import cross_val_score\n", 168 | "from lightgbm import LGBMClassifier\n", 169 | "from sklearn.model_selection import train_test_split, GridSearchCV, KFold\n", 170 | "from sklearn.svm import LinearSVC\n", 171 | "from sklearn import linear_model\n", 172 | "import lightgbm as lgb\n", 173 | "import xgboost as xgb\n", 174 | "\n", 175 | "from keras.models import Model\n", 176 | "from keras.layers import Dense, Input" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": 14, 182 | "metadata": { 183 | "collapsed": false 184 | }, 185 | "outputs": [], 186 | "source": [ 187 | "X_train,X_val,y_train,y_val= train_test_split(train_X,Y_train,test_size=0.2,random_state=2)" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": 15, 193 | "metadata": { 194 | "collapsed": true 195 | }, 196 | "outputs": [], 197 | "source": [ 198 | "xgb_train = xgb.DMatrix(X_train, y_train)\n", 199 | 
"xgb_val = xgb.DMatrix(X_val, y_val)\n", 200 | "xgb_test = xgb.DMatrix(test_X)" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": 29, 206 | "metadata": { 207 | "collapsed": false 208 | }, 209 | "outputs": [], 210 | "source": [ 211 | "param = {\n", 212 | " 'booster':'gbtree',\n", 213 | " 'max_depth':4,\n", 214 | " 'num_leaves':50,\n", 215 | " 'learning_rate':0.05,\n", 216 | " 'scale_pos_weight':1,\n", 217 | " 'num_threads':40,\n", 218 | " 'objective':'binary:logistic',\n", 219 | " 'bagging_fraction':0.7,\n", 220 | " 'bagging_freq':1,\n", 221 | " 'min_sum_hessian_in_leaf':100,\n", 222 | "}\n", 223 | "\n", 224 | "param['is_unbalance']='true'\n", 225 | "param['metric'] = 'auc'\n", 226 | "\n", 227 | "bst=xgb.cv(param,xgb_train, num_boost_round=1000, nfold=5, early_stopping_rounds=100)\n", 228 | "gbm = xgb.train(param,xgb_train,num_boost_round=bst.shape[0])" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": 30, 234 | "metadata": { 235 | "collapsed": false 236 | }, 237 | "outputs": [ 238 | { 239 | "name": "stdout", 240 | "output_type": "stream", 241 | "text": [ 242 | "0.806100532112\n" 243 | ] 244 | } 245 | ], 246 | "source": [ 247 | "ypred = gbm.predict(xgb_val)\n", 248 | "val_auc = metrics.roc_auc_score(y_val,ypred)#验证集上的auc值\n", 249 | "print(val_auc)" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": 31, 255 | "metadata": { 256 | "collapsed": false 257 | }, 258 | "outputs": [ 259 | { 260 | "data": { 261 | "text/plain": [ 262 | "(10000,)" 263 | ] 264 | }, 265 | "execution_count": 31, 266 | "metadata": {}, 267 | "output_type": "execute_result" 268 | } 269 | ], 270 | "source": [ 271 | "pred = gbm.predict(xgb_test)\n", 272 | "pred.shape" 273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": 32, 278 | "metadata": { 279 | "collapsed": true 280 | }, 281 | "outputs": [], 282 | "source": [ 283 | "Submission = pd.DataFrame({'cust_id': test['cust_id'], 'pred_prob': pred})\n", 
284 | "Submission.to_csv('../result/xgb.csv',index=False)" 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": null, 290 | "metadata": { 291 | "collapsed": true 292 | }, 293 | "outputs": [], 294 | "source": [] 295 | } 296 | ], 297 | "metadata": { 298 | "anaconda-cloud": {}, 299 | "kernelspec": { 300 | "display_name": "Python [default]", 301 | "language": "python", 302 | "name": "python3" 303 | }, 304 | "language_info": { 305 | "codemirror_mode": { 306 | "name": "ipython", 307 | "version": 3 308 | }, 309 | "file_extension": ".py", 310 | "mimetype": "text/x-python", 311 | "name": "python", 312 | "nbconvert_exporter": "python", 313 | "pygments_lexer": "ipython3", 314 | "version": "3.5.2" 315 | } 316 | }, 317 | "nbformat": 4, 318 | "nbformat_minor": 1 319 | } 320 | -------------------------------------------------------------------------------- /模型二代码b榜0.749880/code/semi_xgb预测.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np\n", 12 | "import pandas as pd\n", 13 | "import matplotlib.pyplot as plt\n", 14 | "%matplotlib inline" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "metadata": { 21 | "collapsed": false 22 | }, 23 | "outputs": [ 24 | { 25 | "data": { 26 | "text/plain": [ 27 | "(10000, 160)" 28 | ] 29 | }, 30 | "execution_count": 2, 31 | "metadata": {}, 32 | "output_type": "execute_result" 33 | } 34 | ], 35 | "source": [ 36 | "train_x1 = pd.read_csv('../data/train_ssvm_xy.csv')\n", 37 | "train_x1.shape" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 3, 43 | "metadata": { 44 | "collapsed": false 45 | }, 46 | "outputs": [ 47 | { 48 | "data": { 49 | "text/plain": [ 50 | "(15000, 160)" 51 | ] 52 | }, 53 | "execution_count": 3, 54 | "metadata": {}, 55 | "output_type": 
"execute_result" 56 | } 57 | ], 58 | "source": [ 59 | "train_x2 = pd.read_csv('../data/train_xy.csv')\n", 60 | "train_x2.shape" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 4, 66 | "metadata": { 67 | "collapsed": false 68 | }, 69 | "outputs": [ 70 | { 71 | "data": { 72 | "text/plain": [ 73 | "(10000, 159)" 74 | ] 75 | }, 76 | "execution_count": 4, 77 | "metadata": {}, 78 | "output_type": "execute_result" 79 | } 80 | ], 81 | "source": [ 82 | "test = pd.read_csv('../data/test_all.csv')\n", 83 | "test.shape" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 5, 89 | "metadata": { 90 | "collapsed": false 91 | }, 92 | "outputs": [ 93 | { 94 | "data": { 95 | "text/plain": [ 96 | "(25000, 157)" 97 | ] 98 | }, 99 | "execution_count": 5, 100 | "metadata": {}, 101 | "output_type": "execute_result" 102 | } 103 | ], 104 | "source": [ 105 | "train_x11 = train_x1.drop(['cust_id','cust_group','y'],axis=1)\n", 106 | "train_x22 = train_x2.drop(['\\ufeffcust_id','cust_group','y'],axis=1)\n", 107 | "train_x = pd.concat([train_x11, train_x22])\n", 108 | "train_x.shape" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 6, 114 | "metadata": { 115 | "collapsed": false 116 | }, 117 | "outputs": [ 118 | { 119 | "data": { 120 | "text/plain": [ 121 | "(10000, 157)" 122 | ] 123 | }, 124 | "execution_count": 6, 125 | "metadata": {}, 126 | "output_type": "execute_result" 127 | } 128 | ], 129 | "source": [ 130 | "test_x = test.drop(['cust_id','cust_group'],axis=1)\n", 131 | "test_x.shape" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 7, 137 | "metadata": { 138 | "collapsed": false 139 | }, 140 | "outputs": [ 141 | { 142 | "data": { 143 | "text/plain": [ 144 | "(35000, 157)" 145 | ] 146 | }, 147 | "execution_count": 7, 148 | "metadata": {}, 149 | "output_type": "execute_result" 150 | } 151 | ], 152 | "source": [ 153 | "x = pd.concat([train_x, test_x])\n", 154 | "x.shape" 155 | ] 156 | }, 157 
| { 158 | "cell_type": "code", 159 | "execution_count": 8, 160 | "metadata": { 161 | "collapsed": false 162 | }, 163 | "outputs": [ 164 | { 165 | "data": { 166 | "text/plain": [ 167 | "(25000,)" 168 | ] 169 | }, 170 | "execution_count": 8, 171 | "metadata": {}, 172 | "output_type": "execute_result" 173 | } 174 | ], 175 | "source": [ 176 | "train_y1 = train_x1['y']\n", 177 | "train_y2 = train_x2['y']\n", 178 | "Y_train = train_y1.append(train_y2)\n", 179 | "Y_train.shape" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": 9, 185 | "metadata": { 186 | "collapsed": false 187 | }, 188 | "outputs": [ 189 | { 190 | "name": "stdout", 191 | "output_type": "stream", 192 | "text": [ 193 | "(35000, 364)\n" 194 | ] 195 | } 196 | ], 197 | "source": [ 198 | "for i in range(96,158):\n", 199 | " col = 'x'+'_'+str(i)\n", 200 | " if col in x.columns.values:\n", 201 | " dummies_df = pd.get_dummies(x[col]).rename(columns=lambda x: col +'_'+ str(x))\n", 202 | " x = pd.concat([x, dummies_df], axis=1)\n", 203 | "print(x.shape)" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": 10, 209 | "metadata": { 210 | "collapsed": false 211 | }, 212 | "outputs": [ 213 | { 214 | "name": "stdout", 215 | "output_type": "stream", 216 | "text": [ 217 | "(25000, 364)\n", 218 | "(10000, 364)\n" 219 | ] 220 | } 221 | ], 222 | "source": [ 223 | "train_X = x[0:25000]\n", 224 | "test_X = x[25000:35000]\n", 225 | "print(train_X.shape)\n", 226 | "print(test_X.shape)" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": 11, 232 | "metadata": { 233 | "collapsed": false 234 | }, 235 | "outputs": [ 236 | { 237 | "name": "stderr", 238 | "output_type": "stream", 239 | "text": [ 240 | "C:\\Program Files\\Anaconda3\\lib\\site-packages\\sklearn\\cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. 
Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n", 241 | " \"This module will be removed in 0.20.\", DeprecationWarning)\n", 242 | "Using TensorFlow backend.\n" 243 | ] 244 | } 245 | ], 246 | "source": [ 247 | "from sklearn.tree import DecisionTreeClassifier\n", 248 | "from sklearn.ensemble import RandomForestClassifier\n", 249 | "from sklearn.ensemble import AdaBoostClassifier\n", 250 | "from sklearn.ensemble import ExtraTreesClassifier\n", 251 | "from sklearn.ensemble import GradientBoostingClassifier\n", 252 | "from sklearn.neighbors import KNeighborsClassifier\n", 253 | "from sklearn.svm import SVC\n", 254 | "from sklearn import metrics #accuracy_score,recall_score,f1_score\n", 255 | "from sklearn.metrics import classification_report\n", 256 | "from sklearn.metrics import precision_recall_fscore_support\n", 257 | "from sklearn.utils.multiclass import unique_labels\n", 258 | "from sklearn.metrics import accuracy_score\n", 259 | "from xgboost import XGBClassifier\n", 260 | "from sklearn.ensemble import GradientBoostingClassifier\n", 261 | "from sklearn.cross_validation import cross_val_score\n", 262 | "from lightgbm import LGBMClassifier\n", 263 | "from sklearn.model_selection import train_test_split, GridSearchCV, KFold\n", 264 | "from sklearn.svm import LinearSVC\n", 265 | "from sklearn import linear_model\n", 266 | "import lightgbm as lgb\n", 267 | "import xgboost as xgb\n", 268 | "\n", 269 | "from keras.models import Model\n", 270 | "from keras.layers import Dense, Input" 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": 12, 276 | "metadata": { 277 | "collapsed": false, 278 | "scrolled": true 279 | }, 280 | "outputs": [], 281 | "source": [ 282 | "# encoding_dim = 600\n", 283 | "# input_dim = Input(shape=(364,))\n", 284 | "\n", 285 | "# encoded = Dense(364, activation='linear')(input_dim)\n", 286 | "# # encoded = Dense(300, 
activation='relu')(encoded)\n", 287 | "# # encoded = Dense(32, activation='relu')(encoded)\n", 288 | "# encoder_output = Dense(encoding_dim)(encoded)\n", 289 | "\n", 290 | "# decoded = Dense(600, activation='relu')(encoder_output)\n", 291 | "# # decoded = Dense(64, activation='relu')(decoded)\n", 292 | "# # decoded = Dense(128, activation='relu')(decoded)\n", 293 | "# decoded = Dense(364, activation='tanh')(decoded)\n", 294 | "\n", 295 | "# autoencoder = Model(inputs=input_dim, outputs=decoded)\n", 296 | "\n", 297 | "# encoder = Model(inputs=input_dim, outputs=encoder_output)\n", 298 | "\n", 299 | "# autoencoder.compile(optimizer='adam', loss='mse')\n", 300 | "# # training\n", 301 | "# autoencoder.fit(train_X.values, train_X.values, epochs=20, batch_size=150, shuffle=True)" 302 | ] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "execution_count": 13, 307 | "metadata": { 308 | "collapsed": false 309 | }, 310 | "outputs": [], 311 | "source": [ 312 | "# new_train_feature = encoder.predict(train_X.values)\n", 313 | "# new_test_feature = encoder.predict(test_X.values)\n", 314 | "# print(new_train_feature.shape)\n", 315 | "# print(new_test_feature.shape)" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": 14, 321 | "metadata": { 322 | "collapsed": false 323 | }, 324 | "outputs": [], 325 | "source": [ 326 | "X_train,X_val,y_train,y_val= train_test_split(train_X,Y_train,test_size=0.2,random_state=2)" 327 | ] 328 | }, 329 | { 330 | "cell_type": "code", 331 | "execution_count": 15, 332 | "metadata": { 333 | "collapsed": false 334 | }, 335 | "outputs": [], 336 | "source": [ 337 | "xgb_train = xgb.DMatrix(X_train, y_train)\n", 338 | "xgb_val = xgb.DMatrix(X_val, y_val)\n", 339 | "xgb_test = xgb.DMatrix(test_X)" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": 16, 345 | "metadata": { 346 | "collapsed": false 347 | }, 348 | "outputs": [], 349 | "source": [ 350 | "param = {\n", 351 | " 'booster':'gbtree',\n", 352 | " 
'max_depth':10,\n", 353 | " 'num_leaves':80,\n", 354 | " 'learning_rate':0.03,\n", 355 | " 'scale_pos_weight':1,\n", 356 | " 'num_threads':40,\n", 357 | " 'objective':'binary:logistic',\n", 358 | " 'bagging_fraction':0.7,\n", 359 | " 'bagging_freq':1,\n", 360 | " 'min_sum_hessian_in_leaf':100,\n", 361 | "}\n", 362 | "\n", 363 | "param['is_unbalance']='true'\n", 364 | "param['metric'] = 'auc'\n", 365 | "\n", 366 | "bst=xgb.cv(param,xgb_train, num_boost_round=1000, nfold=10, early_stopping_rounds=100)\n", 367 | "gbm = xgb.train(param,xgb_train,num_boost_round=bst.shape[0])\n" 368 | ] 369 | }, 370 | { 371 | "cell_type": "code", 372 | "execution_count": 17, 373 | "metadata": { 374 | "collapsed": false 375 | }, 376 | "outputs": [ 377 | { 378 | "name": "stdout", 379 | "output_type": "stream", 380 | "text": [ 381 | "0.817833166049\n" 382 | ] 383 | } 384 | ], 385 | "source": [ 386 | "ypred = gbm.predict(xgb_val)\n", 387 | "val_auc = metrics.roc_auc_score(y_val,ypred)#验证集上的auc值\n", 388 | "print(val_auc)" 389 | ] 390 | }, 391 | { 392 | "cell_type": "code", 393 | "execution_count": 18, 394 | "metadata": { 395 | "collapsed": false 396 | }, 397 | "outputs": [ 398 | { 399 | "data": { 400 | "text/plain": [ 401 | "(10000,)" 402 | ] 403 | }, 404 | "execution_count": 18, 405 | "metadata": {}, 406 | "output_type": "execute_result" 407 | } 408 | ], 409 | "source": [ 410 | "pred = gbm.predict(xgb_test)\n", 411 | "pred.shape" 412 | ] 413 | }, 414 | { 415 | "cell_type": "code", 416 | "execution_count": 19, 417 | "metadata": { 418 | "collapsed": true 419 | }, 420 | "outputs": [], 421 | "source": [ 422 | "Submission = pd.DataFrame({'cust_id': test['cust_id'], 'pred_prob': pred})\n", 423 | "Submission.to_csv('../result/semi_xgb.csv',index=False)" 424 | ] 425 | }, 426 | { 427 | "cell_type": "code", 428 | "execution_count": null, 429 | "metadata": { 430 | "collapsed": true 431 | }, 432 | "outputs": [], 433 | "source": [] 434 | } 435 | ], 436 | "metadata": { 437 | "anaconda-cloud": {}, 438 | 
"kernelspec": { 439 | "display_name": "Python [default]", 440 | "language": "python", 441 | "name": "python3" 442 | }, 443 | "language_info": { 444 | "codemirror_mode": { 445 | "name": "ipython", 446 | "version": 3 447 | }, 448 | "file_extension": ".py", 449 | "mimetype": "text/x-python", 450 | "name": "python", 451 | "nbconvert_exporter": "python", 452 | "pygments_lexer": "ipython3", 453 | "version": "3.5.2" 454 | } 455 | }, 456 | "nbformat": 4, 457 | "nbformat_minor": 1 458 | } 459 | -------------------------------------------------------------------------------- /模型二代码b榜0.749880/code/.ipynb_checkpoints/semi_xgb预测-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np\n", 12 | "import pandas as pd\n", 13 | "import matplotlib.pyplot as plt\n", 14 | "%matplotlib inline" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "metadata": { 21 | "collapsed": false 22 | }, 23 | "outputs": [ 24 | { 25 | "data": { 26 | "text/plain": [ 27 | "(10000, 160)" 28 | ] 29 | }, 30 | "execution_count": 2, 31 | "metadata": {}, 32 | "output_type": "execute_result" 33 | } 34 | ], 35 | "source": [ 36 | "train_x1 = pd.read_csv('../data/train_ssvm_xy.csv')\n", 37 | "train_x1.shape" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 3, 43 | "metadata": { 44 | "collapsed": false 45 | }, 46 | "outputs": [ 47 | { 48 | "data": { 49 | "text/plain": [ 50 | "(15000, 160)" 51 | ] 52 | }, 53 | "execution_count": 3, 54 | "metadata": {}, 55 | "output_type": "execute_result" 56 | } 57 | ], 58 | "source": [ 59 | "train_x2 = pd.read_csv('../data/train_xy.csv')\n", 60 | "train_x2.shape" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 4, 66 | "metadata": { 67 | "collapsed": false 68 | }, 69 | "outputs": [ 70 
| { 71 | "data": { 72 | "text/plain": [ 73 | "(10000, 159)" 74 | ] 75 | }, 76 | "execution_count": 4, 77 | "metadata": {}, 78 | "output_type": "execute_result" 79 | } 80 | ], 81 | "source": [ 82 | "test = pd.read_csv('../data/test_all.csv')\n", 83 | "test.shape" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 5, 89 | "metadata": { 90 | "collapsed": false 91 | }, 92 | "outputs": [ 93 | { 94 | "data": { 95 | "text/plain": [ 96 | "(25000, 157)" 97 | ] 98 | }, 99 | "execution_count": 5, 100 | "metadata": {}, 101 | "output_type": "execute_result" 102 | } 103 | ], 104 | "source": [ 105 | "train_x11 = train_x1.drop(['cust_id','cust_group','y'],axis=1)\n", 106 | "train_x22 = train_x2.drop(['\\ufeffcust_id','cust_group','y'],axis=1)\n", 107 | "train_x = pd.concat([train_x11, train_x22])\n", 108 | "train_x.shape" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 6, 114 | "metadata": { 115 | "collapsed": false 116 | }, 117 | "outputs": [ 118 | { 119 | "data": { 120 | "text/plain": [ 121 | "(10000, 157)" 122 | ] 123 | }, 124 | "execution_count": 6, 125 | "metadata": {}, 126 | "output_type": "execute_result" 127 | } 128 | ], 129 | "source": [ 130 | "test_x = test.drop(['cust_id','cust_group'],axis=1)\n", 131 | "test_x.shape" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 7, 137 | "metadata": { 138 | "collapsed": false 139 | }, 140 | "outputs": [ 141 | { 142 | "data": { 143 | "text/plain": [ 144 | "(35000, 157)" 145 | ] 146 | }, 147 | "execution_count": 7, 148 | "metadata": {}, 149 | "output_type": "execute_result" 150 | } 151 | ], 152 | "source": [ 153 | "x = pd.concat([train_x, test_x])\n", 154 | "x.shape" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": 8, 160 | "metadata": { 161 | "collapsed": false 162 | }, 163 | "outputs": [ 164 | { 165 | "data": { 166 | "text/plain": [ 167 | "(25000,)" 168 | ] 169 | }, 170 | "execution_count": 8, 171 | "metadata": {}, 172 | 
"output_type": "execute_result" 173 | } 174 | ], 175 | "source": [ 176 | "train_y1 = train_x1['y']\n", 177 | "train_y2 = train_x2['y']\n", 178 | "Y_train = train_y1.append(train_y2)\n", 179 | "Y_train.shape" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": 9, 185 | "metadata": { 186 | "collapsed": false 187 | }, 188 | "outputs": [ 189 | { 190 | "name": "stdout", 191 | "output_type": "stream", 192 | "text": [ 193 | "(35000, 364)\n" 194 | ] 195 | } 196 | ], 197 | "source": [ 198 | "for i in range(96,158):\n", 199 | " col = 'x'+'_'+str(i)\n", 200 | " if col in x.columns.values:\n", 201 | " dummies_df = pd.get_dummies(x[col]).rename(columns=lambda x: col +'_'+ str(x))\n", 202 | " x = pd.concat([x, dummies_df], axis=1)\n", 203 | "print(x.shape)" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": 10, 209 | "metadata": { 210 | "collapsed": false 211 | }, 212 | "outputs": [ 213 | { 214 | "name": "stdout", 215 | "output_type": "stream", 216 | "text": [ 217 | "(25000, 364)\n", 218 | "(10000, 364)\n" 219 | ] 220 | } 221 | ], 222 | "source": [ 223 | "train_X = x[0:25000]\n", 224 | "test_X = x[25000:35000]\n", 225 | "print(train_X.shape)\n", 226 | "print(test_X.shape)" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": 11, 232 | "metadata": { 233 | "collapsed": false 234 | }, 235 | "outputs": [ 236 | { 237 | "name": "stderr", 238 | "output_type": "stream", 239 | "text": [ 240 | "C:\\Program Files\\Anaconda3\\lib\\site-packages\\sklearn\\cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. 
This module will be removed in 0.20.\n", 241 | " \"This module will be removed in 0.20.\", DeprecationWarning)\n", 242 | "Using TensorFlow backend.\n" 243 | ] 244 | } 245 | ], 246 | "source": [ 247 | "from sklearn.tree import DecisionTreeClassifier\n", 248 | "from sklearn.ensemble import RandomForestClassifier\n", 249 | "from sklearn.ensemble import AdaBoostClassifier\n", 250 | "from sklearn.ensemble import ExtraTreesClassifier\n", 251 | "from sklearn.ensemble import GradientBoostingClassifier\n", 252 | "from sklearn.neighbors import KNeighborsClassifier\n", 253 | "from sklearn.svm import SVC\n", 254 | "from sklearn import metrics #accuracy_score,recall_score,f1_score\n", 255 | "from sklearn.metrics import classification_report\n", 256 | "from sklearn.metrics import precision_recall_fscore_support\n", 257 | "from sklearn.utils.multiclass import unique_labels\n", 258 | "from sklearn.metrics import accuracy_score\n", 259 | "from xgboost import XGBClassifier\n", 260 | "from sklearn.ensemble import GradientBoostingClassifier\n", 261 | "from sklearn.cross_validation import cross_val_score\n", 262 | "from lightgbm import LGBMClassifier\n", 263 | "from sklearn.model_selection import train_test_split, GridSearchCV, KFold\n", 264 | "from sklearn.svm import LinearSVC\n", 265 | "from sklearn import linear_model\n", 266 | "import lightgbm as lgb\n", 267 | "import xgboost as xgb\n", 268 | "\n", 269 | "from keras.models import Model\n", 270 | "from keras.layers import Dense, Input" 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": 12, 276 | "metadata": { 277 | "collapsed": false, 278 | "scrolled": true 279 | }, 280 | "outputs": [], 281 | "source": [ 282 | "# encoding_dim = 600\n", 283 | "# input_dim = Input(shape=(364,))\n", 284 | "\n", 285 | "# encoded = Dense(364, activation='linear')(input_dim)\n", 286 | "# # encoded = Dense(300, activation='relu')(encoded)\n", 287 | "# # encoded = Dense(32, activation='relu')(encoded)\n", 288 | "# encoder_output = 
Dense(encoding_dim)(encoded)\n", 289 | "\n", 290 | "# decoded = Dense(600, activation='relu')(encoder_output)\n", 291 | "# # decoded = Dense(64, activation='relu')(decoded)\n", 292 | "# # decoded = Dense(128, activation='relu')(decoded)\n", 293 | "# decoded = Dense(364, activation='tanh')(decoded)\n", 294 | "\n", 295 | "# autoencoder = Model(inputs=input_dim, outputs=decoded)\n", 296 | "\n", 297 | "# encoder = Model(inputs=input_dim, outputs=encoder_output)\n", 298 | "\n", 299 | "# autoencoder.compile(optimizer='adam', loss='mse')\n", 300 | "# # training\n", 301 | "# autoencoder.fit(train_X.values, train_X.values, epochs=20, batch_size=150, shuffle=True)" 302 | ] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "execution_count": 13, 307 | "metadata": { 308 | "collapsed": false 309 | }, 310 | "outputs": [], 311 | "source": [ 312 | "# new_train_feature = encoder.predict(train_X.values)\n", 313 | "# new_test_feature = encoder.predict(test_X.values)\n", 314 | "# print(new_train_feature.shape)\n", 315 | "# print(new_test_feature.shape)" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": 14, 321 | "metadata": { 322 | "collapsed": false 323 | }, 324 | "outputs": [], 325 | "source": [ 326 | "X_train,X_val,y_train,y_val= train_test_split(train_X,Y_train,test_size=0.2,random_state=2)" 327 | ] 328 | }, 329 | { 330 | "cell_type": "code", 331 | "execution_count": 15, 332 | "metadata": { 333 | "collapsed": false 334 | }, 335 | "outputs": [], 336 | "source": [ 337 | "xgb_train = xgb.DMatrix(X_train, y_train)\n", 338 | "xgb_val = xgb.DMatrix(X_val, y_val)\n", 339 | "xgb_test = xgb.DMatrix(test_X)" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": 16, 345 | "metadata": { 346 | "collapsed": false 347 | }, 348 | "outputs": [], 349 | "source": [ 350 | "param = {\n", 351 | " 'booster':'gbtree',\n", 352 | " 'max_depth':10,\n", 353 | " 'num_leaves':80,\n", 354 | " 'learning_rate':0.03,\n", 355 | " 'scale_pos_weight':1,\n", 356 | " 
'num_threads':40,\n", 357 | " 'objective':'binary:logistic',\n", 358 | " 'bagging_fraction':0.7,\n", 359 | " 'bagging_freq':1,\n", 360 | " 'min_sum_hessian_in_leaf':100,\n", 361 | "}\n", 362 | "\n", 363 | "param['is_unbalance']='true'\n", 364 | "param['metric'] = 'auc'\n", 365 | "\n", 366 | "bst=xgb.cv(param,xgb_train, num_boost_round=1000, nfold=10, early_stopping_rounds=100)\n", 367 | "gbm = xgb.train(param,xgb_train,num_boost_round=bst.shape[0])\n" 368 | ] 369 | }, 370 | { 371 | "cell_type": "code", 372 | "execution_count": 17, 373 | "metadata": { 374 | "collapsed": false 375 | }, 376 | "outputs": [ 377 | { 378 | "name": "stdout", 379 | "output_type": "stream", 380 | "text": [ 381 | "0.817833166049\n" 382 | ] 383 | } 384 | ], 385 | "source": [ 386 | "ypred = gbm.predict(xgb_val)\n", 387 | "val_auc = metrics.roc_auc_score(y_val,ypred)#验证集上的auc值\n", 388 | "print(val_auc)" 389 | ] 390 | }, 391 | { 392 | "cell_type": "code", 393 | "execution_count": 18, 394 | "metadata": { 395 | "collapsed": false 396 | }, 397 | "outputs": [ 398 | { 399 | "data": { 400 | "text/plain": [ 401 | "(10000,)" 402 | ] 403 | }, 404 | "execution_count": 18, 405 | "metadata": {}, 406 | "output_type": "execute_result" 407 | } 408 | ], 409 | "source": [ 410 | "pred = gbm.predict(xgb_test)\n", 411 | "pred.shape" 412 | ] 413 | }, 414 | { 415 | "cell_type": "code", 416 | "execution_count": 19, 417 | "metadata": { 418 | "collapsed": true 419 | }, 420 | "outputs": [], 421 | "source": [ 422 | "Submission = pd.DataFrame({'cust_id': test['cust_id'], 'pred_prob': pred})\n", 423 | "Submission.to_csv('../result/semi_xgb.csv',index=False)" 424 | ] 425 | }, 426 | { 427 | "cell_type": "code", 428 | "execution_count": null, 429 | "metadata": { 430 | "collapsed": true 431 | }, 432 | "outputs": [], 433 | "source": [] 434 | } 435 | ], 436 | "metadata": { 437 | "anaconda-cloud": {}, 438 | "kernelspec": { 439 | "display_name": "Python [default]", 440 | "language": "python", 441 | "name": "python3" 442 | }, 443 | 
"language_info": { 444 | "codemirror_mode": { 445 | "name": "ipython", 446 | "version": 3 447 | }, 448 | "file_extension": ".py", 449 | "mimetype": "text/x-python", 450 | "name": "python", 451 | "nbconvert_exporter": "python", 452 | "pygments_lexer": "ipython3", 453 | "version": "3.5.2" 454 | } 455 | }, 456 | "nbformat": 4, 457 | "nbformat_minor": 1 458 | } 459 | -------------------------------------------------------------------------------- /模型二代码b榜0.749880/code/semi_lgb预测.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np\n", 12 | "import pandas as pd\n", 13 | "import matplotlib.pyplot as plt\n", 14 | "%matplotlib inline" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "metadata": { 21 | "collapsed": false 22 | }, 23 | "outputs": [ 24 | { 25 | "data": { 26 | "text/plain": [ 27 | "(10000, 160)" 28 | ] 29 | }, 30 | "execution_count": 2, 31 | "metadata": {}, 32 | "output_type": "execute_result" 33 | } 34 | ], 35 | "source": [ 36 | "train_x1 = pd.read_csv('../data/train_ssvm_xy.csv')\n", 37 | "train_x1.shape" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 3, 43 | "metadata": { 44 | "collapsed": false 45 | }, 46 | "outputs": [ 47 | { 48 | "data": { 49 | "text/plain": [ 50 | "(15000, 160)" 51 | ] 52 | }, 53 | "execution_count": 3, 54 | "metadata": {}, 55 | "output_type": "execute_result" 56 | } 57 | ], 58 | "source": [ 59 | "train_x2 = pd.read_csv('../data/train_xy.csv')\n", 60 | "train_x2.shape" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 4, 66 | "metadata": { 67 | "collapsed": false 68 | }, 69 | "outputs": [ 70 | { 71 | "data": { 72 | "text/plain": [ 73 | "(10000, 159)" 74 | ] 75 | }, 76 | "execution_count": 4, 77 | "metadata": {}, 78 | "output_type": 
"execute_result" 79 | } 80 | ], 81 | "source": [ 82 | "test = pd.read_csv('../data/test_all.csv')\n", 83 | "test.shape" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 5, 89 | "metadata": { 90 | "collapsed": false 91 | }, 92 | "outputs": [ 93 | { 94 | "data": { 95 | "text/plain": [ 96 | "(25000, 157)" 97 | ] 98 | }, 99 | "execution_count": 5, 100 | "metadata": {}, 101 | "output_type": "execute_result" 102 | } 103 | ], 104 | "source": [ 105 | "train_x11 = train_x1.drop(['cust_id','cust_group','y'],axis=1)\n", 106 | "train_x22 = train_x2.drop(['\\ufeffcust_id','cust_group','y'],axis=1)\n", 107 | "train_x = pd.concat([train_x11, train_x22])\n", 108 | "train_x.shape" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 6, 114 | "metadata": { 115 | "collapsed": false 116 | }, 117 | "outputs": [ 118 | { 119 | "data": { 120 | "text/plain": [ 121 | "(10000, 157)" 122 | ] 123 | }, 124 | "execution_count": 6, 125 | "metadata": {}, 126 | "output_type": "execute_result" 127 | } 128 | ], 129 | "source": [ 130 | "test_x = test.drop(['cust_id','cust_group'],axis=1)\n", 131 | "test_x.shape" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 7, 137 | "metadata": { 138 | "collapsed": false 139 | }, 140 | "outputs": [ 141 | { 142 | "data": { 143 | "text/plain": [ 144 | "(35000, 157)" 145 | ] 146 | }, 147 | "execution_count": 7, 148 | "metadata": {}, 149 | "output_type": "execute_result" 150 | } 151 | ], 152 | "source": [ 153 | "x = pd.concat([train_x, test_x])\n", 154 | "x.shape" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": 8, 160 | "metadata": { 161 | "collapsed": false 162 | }, 163 | "outputs": [ 164 | { 165 | "data": { 166 | "text/plain": [ 167 | "(25000,)" 168 | ] 169 | }, 170 | "execution_count": 8, 171 | "metadata": {}, 172 | "output_type": "execute_result" 173 | } 174 | ], 175 | "source": [ 176 | "train_y1 = train_x1['y']\n", 177 | "train_y2 = train_x2['y']\n", 178 | 
"Y_train = train_y1.append(train_y2)\n", 179 | "Y_train.shape" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": 9, 185 | "metadata": { 186 | "collapsed": false 187 | }, 188 | "outputs": [ 189 | { 190 | "name": "stdout", 191 | "output_type": "stream", 192 | "text": [ 193 | "(35000, 364)\n" 194 | ] 195 | } 196 | ], 197 | "source": [ 198 | "for i in range(96,158):\n", 199 | " col = 'x'+'_'+str(i)\n", 200 | " if col in x.columns.values:\n", 201 | " dummies_df = pd.get_dummies(x[col]).rename(columns=lambda x: col +'_'+ str(x))\n", 202 | " x = pd.concat([x, dummies_df], axis=1)\n", 203 | "print(x.shape)" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": 10, 209 | "metadata": { 210 | "collapsed": false 211 | }, 212 | "outputs": [ 213 | { 214 | "name": "stdout", 215 | "output_type": "stream", 216 | "text": [ 217 | "(25000, 364)\n", 218 | "(10000, 364)\n" 219 | ] 220 | } 221 | ], 222 | "source": [ 223 | "train_X = x[0:25000]\n", 224 | "test_X = x[25000:35000]\n", 225 | "print(train_X.shape)\n", 226 | "print(test_X.shape)" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": 11, 232 | "metadata": { 233 | "collapsed": false 234 | }, 235 | "outputs": [ 236 | { 237 | "name": "stderr", 238 | "output_type": "stream", 239 | "text": [ 240 | "C:\\Program Files\\Anaconda3\\lib\\site-packages\\sklearn\\cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. 
This module will be removed in 0.20.\n", 241 | " \"This module will be removed in 0.20.\", DeprecationWarning)\n", 242 | "Using TensorFlow backend.\n" 243 | ] 244 | } 245 | ], 246 | "source": [ 247 | "from sklearn.tree import DecisionTreeClassifier\n", 248 | "from sklearn.ensemble import RandomForestClassifier\n", 249 | "from sklearn.ensemble import AdaBoostClassifier\n", 250 | "from sklearn.ensemble import ExtraTreesClassifier\n", 251 | "from sklearn.ensemble import GradientBoostingClassifier\n", 252 | "from sklearn.neighbors import KNeighborsClassifier\n", 253 | "from sklearn.svm import SVC\n", 254 | "from sklearn import metrics #accuracy_score,recall_score,f1_score\n", 255 | "from sklearn.metrics import classification_report\n", 256 | "from sklearn.metrics import precision_recall_fscore_support\n", 257 | "from sklearn.utils.multiclass import unique_labels\n", 258 | "from sklearn.metrics import accuracy_score\n", 259 | "from xgboost import XGBClassifier\n", 260 | "from sklearn.ensemble import GradientBoostingClassifier\n", 261 | "from sklearn.cross_validation import cross_val_score\n", 262 | "from lightgbm import LGBMClassifier\n", 263 | "from sklearn.model_selection import train_test_split, GridSearchCV, KFold\n", 264 | "from sklearn.svm import LinearSVC\n", 265 | "from sklearn import linear_model\n", 266 | "import lightgbm as lgb\n", 267 | "\n", 268 | "from keras.models import Model\n", 269 | "from keras.layers import Dense, Input" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": 12, 275 | "metadata": { 276 | "collapsed": false, 277 | "scrolled": true 278 | }, 279 | "outputs": [], 280 | "source": [ 281 | "# encoding_dim = 600\n", 282 | "# input_dim = Input(shape=(364,))\n", 283 | "\n", 284 | "# encoded = Dense(364, activation='linear')(input_dim)\n", 285 | "# # encoded = Dense(300, activation='relu')(encoded)\n", 286 | "# # encoded = Dense(32, activation='relu')(encoded)\n", 287 | "# encoder_output = Dense(encoding_dim)(encoded)\n", 
288 | "\n", 289 | "# decoded = Dense(600, activation='relu')(encoder_output)\n", 290 | "# # decoded = Dense(64, activation='relu')(decoded)\n", 291 | "# # decoded = Dense(128, activation='relu')(decoded)\n", 292 | "# decoded = Dense(364, activation='tanh')(decoded)\n", 293 | "\n", 294 | "# autoencoder = Model(inputs=input_dim, outputs=decoded)\n", 295 | "\n", 296 | "# encoder = Model(inputs=input_dim, outputs=encoder_output)\n", 297 | "\n", 298 | "# autoencoder.compile(optimizer='adam', loss='mse')\n", 299 | "# # training\n", 300 | "# autoencoder.fit(train_X.values, train_X.values, epochs=20, batch_size=150, shuffle=True)" 301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": 13, 306 | "metadata": { 307 | "collapsed": false 308 | }, 309 | "outputs": [], 310 | "source": [ 311 | "# new_train_feature = encoder.predict(train_X.values)\n", 312 | "# new_test_feature = encoder.predict(test_X.values)\n", 313 | "# print(new_train_feature.shape)\n", 314 | "# print(new_test_feature.shape)" 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "execution_count": 14, 320 | "metadata": { 321 | "collapsed": false 322 | }, 323 | "outputs": [], 324 | "source": [ 325 | "X_train,X_val,y_train,y_val= train_test_split(train_X,Y_train,test_size=0.2,random_state=2)" 326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": 15, 331 | "metadata": { 332 | "collapsed": false 333 | }, 334 | "outputs": [], 335 | "source": [ 336 | "lgb_train = lgb.Dataset(X_train, y_train)\n", 337 | "lgb_val = lgb.Dataset(X_val, y_val, reference=lgb_train)" 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": 16, 343 | "metadata": { 344 | "collapsed": false 345 | }, 346 | "outputs": [], 347 | "source": [ 348 | "param = {\n", 349 | " 'max_depth':6,\n", 350 | " 'num_leaves':80,\n", 351 | " 'learning_rate':0.03,\n", 352 | " 'scale_pos_weight':1,\n", 353 | " 'num_threads':40,\n", 354 | " 'objective':'binary',\n", 355 | " 
'bagging_fraction':0.7,\n", 356 | " 'bagging_freq':1,\n", 357 | " 'min_sum_hessian_in_leaf':100\n", 358 | "}\n", 359 | "\n", 360 | "param['is_unbalance']='true'\n", 361 | "param['metric'] = 'auc'\n", 362 | "\n", 363 | "\n", 364 | "bst=lgb.cv(param,lgb_train, num_boost_round=1000, nfold=5, early_stopping_rounds=30)\n", 365 | "gbm = lgb.train(param,lgb_train,num_boost_round=len(bst['auc-mean']))\n" 366 | ] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "execution_count": 17, 371 | "metadata": { 372 | "collapsed": false 373 | }, 374 | "outputs": [ 375 | { 376 | "name": "stdout", 377 | "output_type": "stream", 378 | "text": [ 379 | "0.810753521759\n" 380 | ] 381 | } 382 | ], 383 | "source": [ 384 | "ypred = gbm.predict(X_val)\n", 385 | "val_auc = metrics.roc_auc_score(y_val,ypred)#验证集上的auc值\n", 386 | "print(val_auc)" 387 | ] 388 | }, 389 | { 390 | "cell_type": "code", 391 | "execution_count": 18, 392 | "metadata": { 393 | "collapsed": false 394 | }, 395 | "outputs": [ 396 | { 397 | "data": { 398 | "text/plain": [ 399 | "(10000,)" 400 | ] 401 | }, 402 | "execution_count": 18, 403 | "metadata": {}, 404 | "output_type": "execute_result" 405 | } 406 | ], 407 | "source": [ 408 | "pred = gbm.predict(test_X)\n", 409 | "pred.shape" 410 | ] 411 | }, 412 | { 413 | "cell_type": "code", 414 | "execution_count": 19, 415 | "metadata": { 416 | "collapsed": true 417 | }, 418 | "outputs": [], 419 | "source": [ 420 | "Submission = pd.DataFrame({'cust_id': test['cust_id'], 'pred_prob': pred})\n", 421 | "Submission.to_csv('../result/semi_lgb.csv',index=False)" 422 | ] 423 | }, 424 | { 425 | "cell_type": "code", 426 | "execution_count": null, 427 | "metadata": { 428 | "collapsed": true 429 | }, 430 | "outputs": [], 431 | "source": [] 432 | }, 433 | { 434 | "cell_type": "code", 435 | "execution_count": null, 436 | "metadata": { 437 | "collapsed": true 438 | }, 439 | "outputs": [], 440 | "source": [] 441 | } 442 | ], 443 | "metadata": { 444 | "anaconda-cloud": {}, 445 | "kernelspec": { 
446 | "display_name": "Python [default]", 447 | "language": "python", 448 | "name": "python3" 449 | }, 450 | "language_info": { 451 | "codemirror_mode": { 452 | "name": "ipython", 453 | "version": 3 454 | }, 455 | "file_extension": ".py", 456 | "mimetype": "text/x-python", 457 | "name": "python", 458 | "nbconvert_exporter": "python", 459 | "pygments_lexer": "ipython3", 460 | "version": "3.5.2" 461 | } 462 | }, 463 | "nbformat": 4, 464 | "nbformat_minor": 1 465 | } 466 | -------------------------------------------------------------------------------- /模型二代码b榜0.749880/code/.ipynb_checkpoints/semi_lgb预测-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np\n", 12 | "import pandas as pd\n", 13 | "import matplotlib.pyplot as plt\n", 14 | "%matplotlib inline" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "metadata": { 21 | "collapsed": false 22 | }, 23 | "outputs": [ 24 | { 25 | "data": { 26 | "text/plain": [ 27 | "(10000, 160)" 28 | ] 29 | }, 30 | "execution_count": 2, 31 | "metadata": {}, 32 | "output_type": "execute_result" 33 | } 34 | ], 35 | "source": [ 36 | "train_x1 = pd.read_csv('../data/train_ssvm_xy.csv')\n", 37 | "train_x1.shape" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 3, 43 | "metadata": { 44 | "collapsed": false 45 | }, 46 | "outputs": [ 47 | { 48 | "data": { 49 | "text/plain": [ 50 | "(15000, 160)" 51 | ] 52 | }, 53 | "execution_count": 3, 54 | "metadata": {}, 55 | "output_type": "execute_result" 56 | } 57 | ], 58 | "source": [ 59 | "train_x2 = pd.read_csv('../data/train_xy.csv')\n", 60 | "train_x2.shape" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 4, 66 | "metadata": { 67 | "collapsed": false 68 | }, 69 | "outputs": [ 70 | { 71 | "data": 
{ 72 | "text/plain": [ 73 | "(10000, 159)" 74 | ] 75 | }, 76 | "execution_count": 4, 77 | "metadata": {}, 78 | "output_type": "execute_result" 79 | } 80 | ], 81 | "source": [ 82 | "test = pd.read_csv('../data/test_all.csv')\n", 83 | "test.shape" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 5, 89 | "metadata": { 90 | "collapsed": false 91 | }, 92 | "outputs": [ 93 | { 94 | "data": { 95 | "text/plain": [ 96 | "(25000, 157)" 97 | ] 98 | }, 99 | "execution_count": 5, 100 | "metadata": {}, 101 | "output_type": "execute_result" 102 | } 103 | ], 104 | "source": [ 105 | "train_x11 = train_x1.drop(['cust_id','cust_group','y'],axis=1)\n", 106 | "train_x22 = train_x2.drop(['\\ufeffcust_id','cust_group','y'],axis=1)\n", 107 | "train_x = pd.concat([train_x11, train_x22])\n", 108 | "train_x.shape" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 6, 114 | "metadata": { 115 | "collapsed": false 116 | }, 117 | "outputs": [ 118 | { 119 | "data": { 120 | "text/plain": [ 121 | "(10000, 157)" 122 | ] 123 | }, 124 | "execution_count": 6, 125 | "metadata": {}, 126 | "output_type": "execute_result" 127 | } 128 | ], 129 | "source": [ 130 | "test_x = test.drop(['cust_id','cust_group'],axis=1)\n", 131 | "test_x.shape" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 7, 137 | "metadata": { 138 | "collapsed": false 139 | }, 140 | "outputs": [ 141 | { 142 | "data": { 143 | "text/plain": [ 144 | "(35000, 157)" 145 | ] 146 | }, 147 | "execution_count": 7, 148 | "metadata": {}, 149 | "output_type": "execute_result" 150 | } 151 | ], 152 | "source": [ 153 | "x = pd.concat([train_x, test_x])\n", 154 | "x.shape" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": 8, 160 | "metadata": { 161 | "collapsed": false 162 | }, 163 | "outputs": [ 164 | { 165 | "data": { 166 | "text/plain": [ 167 | "(25000,)" 168 | ] 169 | }, 170 | "execution_count": 8, 171 | "metadata": {}, 172 | "output_type": 
"execute_result" 173 | } 174 | ], 175 | "source": [ 176 | "train_y1 = train_x1['y']\n", 177 | "train_y2 = train_x2['y']\n", 178 | "Y_train = train_y1.append(train_y2)\n", 179 | "Y_train.shape" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": 9, 185 | "metadata": { 186 | "collapsed": false 187 | }, 188 | "outputs": [ 189 | { 190 | "name": "stdout", 191 | "output_type": "stream", 192 | "text": [ 193 | "(35000, 364)\n" 194 | ] 195 | } 196 | ], 197 | "source": [ 198 | "for i in range(96,158):\n", 199 | " col = 'x'+'_'+str(i)\n", 200 | " if col in x.columns.values:\n", 201 | " dummies_df = pd.get_dummies(x[col]).rename(columns=lambda x: col +'_'+ str(x))\n", 202 | " x = pd.concat([x, dummies_df], axis=1)\n", 203 | "print(x.shape)" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": 10, 209 | "metadata": { 210 | "collapsed": false 211 | }, 212 | "outputs": [ 213 | { 214 | "name": "stdout", 215 | "output_type": "stream", 216 | "text": [ 217 | "(25000, 364)\n", 218 | "(10000, 364)\n" 219 | ] 220 | } 221 | ], 222 | "source": [ 223 | "train_X = x[0:25000]\n", 224 | "test_X = x[25000:35000]\n", 225 | "print(train_X.shape)\n", 226 | "print(test_X.shape)" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": 11, 232 | "metadata": { 233 | "collapsed": false 234 | }, 235 | "outputs": [ 236 | { 237 | "name": "stderr", 238 | "output_type": "stream", 239 | "text": [ 240 | "C:\\Program Files\\Anaconda3\\lib\\site-packages\\sklearn\\cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. 
This module will be removed in 0.20.\n", 241 | " \"This module will be removed in 0.20.\", DeprecationWarning)\n", 242 | "Using TensorFlow backend.\n" 243 | ] 244 | } 245 | ], 246 | "source": [ 247 | "from sklearn.tree import DecisionTreeClassifier\n", 248 | "from sklearn.ensemble import RandomForestClassifier\n", 249 | "from sklearn.ensemble import AdaBoostClassifier\n", 250 | "from sklearn.ensemble import ExtraTreesClassifier\n", 251 | "from sklearn.ensemble import GradientBoostingClassifier\n", 252 | "from sklearn.neighbors import KNeighborsClassifier\n", 253 | "from sklearn.svm import SVC\n", 254 | "from sklearn import metrics #accuracy_score,recall_score,f1_score\n", 255 | "from sklearn.metrics import classification_report\n", 256 | "from sklearn.metrics import precision_recall_fscore_support\n", 257 | "from sklearn.utils.multiclass import unique_labels\n", 258 | "from sklearn.metrics import accuracy_score\n", 259 | "from xgboost import XGBClassifier\n", 260 | "from sklearn.ensemble import GradientBoostingClassifier\n", 261 | "from sklearn.cross_validation import cross_val_score\n", 262 | "from lightgbm import LGBMClassifier\n", 263 | "from sklearn.model_selection import train_test_split, GridSearchCV, KFold\n", 264 | "from sklearn.svm import LinearSVC\n", 265 | "from sklearn import linear_model\n", 266 | "import lightgbm as lgb\n", 267 | "\n", 268 | "from keras.models import Model\n", 269 | "from keras.layers import Dense, Input" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": 12, 275 | "metadata": { 276 | "collapsed": false, 277 | "scrolled": true 278 | }, 279 | "outputs": [], 280 | "source": [ 281 | "# encoding_dim = 600\n", 282 | "# input_dim = Input(shape=(364,))\n", 283 | "\n", 284 | "# encoded = Dense(364, activation='linear')(input_dim)\n", 285 | "# # encoded = Dense(300, activation='relu')(encoded)\n", 286 | "# # encoded = Dense(32, activation='relu')(encoded)\n", 287 | "# encoder_output = Dense(encoding_dim)(encoded)\n", 
288 | "\n", 289 | "# decoded = Dense(600, activation='relu')(encoder_output)\n", 290 | "# # decoded = Dense(64, activation='relu')(decoded)\n", 291 | "# # decoded = Dense(128, activation='relu')(decoded)\n", 292 | "# decoded = Dense(364, activation='tanh')(decoded)\n", 293 | "\n", 294 | "# autoencoder = Model(inputs=input_dim, outputs=decoded)\n", 295 | "\n", 296 | "# encoder = Model(inputs=input_dim, outputs=encoder_output)\n", 297 | "\n", 298 | "# autoencoder.compile(optimizer='adam', loss='mse')\n", 299 | "# # training\n", 300 | "# autoencoder.fit(train_X.values, train_X.values, epochs=20, batch_size=150, shuffle=True)" 301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": 13, 306 | "metadata": { 307 | "collapsed": false 308 | }, 309 | "outputs": [], 310 | "source": [ 311 | "# new_train_feature = encoder.predict(train_X.values)\n", 312 | "# new_test_feature = encoder.predict(test_X.values)\n", 313 | "# print(new_train_feature.shape)\n", 314 | "# print(new_test_feature.shape)" 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "execution_count": 14, 320 | "metadata": { 321 | "collapsed": false 322 | }, 323 | "outputs": [], 324 | "source": [ 325 | "X_train,X_val,y_train,y_val= train_test_split(train_X,Y_train,test_size=0.2,random_state=2)" 326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": 15, 331 | "metadata": { 332 | "collapsed": false 333 | }, 334 | "outputs": [], 335 | "source": [ 336 | "lgb_train = lgb.Dataset(X_train, y_train)\n", 337 | "lgb_val = lgb.Dataset(X_val, y_val, reference=lgb_train)" 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": 16, 343 | "metadata": { 344 | "collapsed": false 345 | }, 346 | "outputs": [], 347 | "source": [ 348 | "param = {\n", 349 | " 'max_depth':6,\n", 350 | " 'num_leaves':80,\n", 351 | " 'learning_rate':0.03,\n", 352 | " 'scale_pos_weight':1,\n", 353 | " 'num_threads':40,\n", 354 | " 'objective':'binary',\n", 355 | " 
'bagging_fraction':0.7,\n", 356 | " 'bagging_freq':1,\n", 357 | " 'min_sum_hessian_in_leaf':100\n", 358 | "}\n", 359 | "\n", 360 | "param['is_unbalance']='true'\n", 361 | "param['metric'] = 'auc'\n", 362 | "\n", 363 | "\n", 364 | "bst=lgb.cv(param,lgb_train, num_boost_round=1000, nfold=5, early_stopping_rounds=30)\n", 365 | "gbm = lgb.train(param,lgb_train,num_boost_round=len(bst['auc-mean']))\n" 366 | ] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "execution_count": 17, 371 | "metadata": { 372 | "collapsed": false 373 | }, 374 | "outputs": [ 375 | { 376 | "name": "stdout", 377 | "output_type": "stream", 378 | "text": [ 379 | "0.810753521759\n" 380 | ] 381 | } 382 | ], 383 | "source": [ 384 | "ypred = gbm.predict(X_val)\n", 385 | "val_auc = metrics.roc_auc_score(y_val,ypred)#验证集上的auc值\n", 386 | "print(val_auc)" 387 | ] 388 | }, 389 | { 390 | "cell_type": "code", 391 | "execution_count": 18, 392 | "metadata": { 393 | "collapsed": false 394 | }, 395 | "outputs": [ 396 | { 397 | "data": { 398 | "text/plain": [ 399 | "(10000,)" 400 | ] 401 | }, 402 | "execution_count": 18, 403 | "metadata": {}, 404 | "output_type": "execute_result" 405 | } 406 | ], 407 | "source": [ 408 | "pred = gbm.predict(test_X)\n", 409 | "pred.shape" 410 | ] 411 | }, 412 | { 413 | "cell_type": "code", 414 | "execution_count": 19, 415 | "metadata": { 416 | "collapsed": true 417 | }, 418 | "outputs": [], 419 | "source": [ 420 | "Submission = pd.DataFrame({'cust_id': test['cust_id'], 'pred_prob': pred})\n", 421 | "Submission.to_csv('../result/semi_lgb.csv',index=False)" 422 | ] 423 | }, 424 | { 425 | "cell_type": "code", 426 | "execution_count": null, 427 | "metadata": { 428 | "collapsed": true 429 | }, 430 | "outputs": [], 431 | "source": [] 432 | }, 433 | { 434 | "cell_type": "code", 435 | "execution_count": null, 436 | "metadata": { 437 | "collapsed": true 438 | }, 439 | "outputs": [], 440 | "source": [] 441 | } 442 | ], 443 | "metadata": { 444 | "anaconda-cloud": {}, 445 | "kernelspec": { 
446 | "display_name": "Python [default]", 447 | "language": "python", 448 | "name": "python3" 449 | }, 450 | "language_info": { 451 | "codemirror_mode": { 452 | "name": "ipython", 453 | "version": 3 454 | }, 455 | "file_extension": ".py", 456 | "mimetype": "text/x-python", 457 | "name": "python", 458 | "nbconvert_exporter": "python", 459 | "pygments_lexer": "ipython3", 460 | "version": "3.5.2" 461 | } 462 | }, 463 | "nbformat": 4, 464 | "nbformat_minor": 1 465 | } 466 | -------------------------------------------------------------------------------- /模型二代码b榜0.749880/code/nn预测.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np\n", 12 | "import pandas as pd\n", 13 | "import matplotlib.pyplot as plt\n", 14 | "%matplotlib inline " 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "metadata": { 21 | "collapsed": false 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "train = pd.read_csv('../data/train_xy.csv')" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 3, 31 | "metadata": { 32 | "collapsed": true 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "test = pd.read_csv('../data/test_all.csv')" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 5, 42 | "metadata": { 43 | "collapsed": false 44 | }, 45 | "outputs": [ 46 | { 47 | "data": { 48 | "text/plain": [ 49 | "(10000, 157)" 50 | ] 51 | }, 52 | "execution_count": 5, 53 | "metadata": {}, 54 | "output_type": "execute_result" 55 | } 56 | ], 57 | "source": [ 58 | "x_train = train.drop(['cust_group','y','\\ufeffcust_id'],axis =1)\n", 59 | "x_test = test.drop(['cust_group','cust_id'],axis=1)\n", 60 | "x_train.shape\n", 61 | "x_test.shape" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 6, 67 | "metadata": 
{ 68 | "collapsed": false 69 | }, 70 | "outputs": [ 71 | { 72 | "data": { 73 | "text/plain": [ 74 | "(25000, 157)" 75 | ] 76 | }, 77 | "execution_count": 6, 78 | "metadata": {}, 79 | "output_type": "execute_result" 80 | } 81 | ], 82 | "source": [ 83 | "x = pd.concat([x_train,x_test])\n", 84 | "x.shape" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 7, 90 | "metadata": { 91 | "collapsed": true 92 | }, 93 | "outputs": [], 94 | "source": [ 95 | "Y_train = train['y']" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 8, 101 | "metadata": { 102 | "collapsed": false 103 | }, 104 | "outputs": [], 105 | "source": [ 106 | "for i in range(96,158):\n", 107 | " col = 'x'+'_'+str(i)\n", 108 | " dummies_df = pd.get_dummies(x[col]).rename(columns=lambda x: col + str(x))\n", 109 | " x = pd.concat([x, dummies_df], axis=1)" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 9, 115 | "metadata": { 116 | "collapsed": false 117 | }, 118 | "outputs": [ 119 | { 120 | "name": "stdout", 121 | "output_type": "stream", 122 | "text": [ 123 | "(15000, 355)\n", 124 | "(10000, 355)\n" 125 | ] 126 | } 127 | ], 128 | "source": [ 129 | "train_X = x[0:15000]\n", 130 | "test_X = x[15000:25000]\n", 131 | "print(train_X.shape)\n", 132 | "print(test_X.shape)" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 10, 138 | "metadata": { 139 | "collapsed": false 140 | }, 141 | "outputs": [ 142 | { 143 | "name": "stderr", 144 | "output_type": "stream", 145 | "text": [ 146 | "Using TensorFlow backend.\n", 147 | "C:\\Program Files\\Anaconda3\\lib\\site-packages\\sklearn\\cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. 
This module will be removed in 0.20.\n", 148 | " \"This module will be removed in 0.20.\", DeprecationWarning)\n" 149 | ] 150 | } 151 | ], 152 | "source": [ 153 | "from sklearn.metrics import accuracy_score\n", 154 | "from sklearn import metrics\n", 155 | "from xgboost import XGBClassifier\n", 156 | "from sklearn.ensemble import GradientBoostingClassifier\n", 157 | "\n", 158 | "from keras.models import Sequential\n", 159 | "from keras.layers import Dense, BatchNormalization, Dropout, Reshape, Flatten, MaxPool2D\n", 160 | "from keras.layers.convolutional import Conv2D, MaxPooling2D, Conv1D, MaxPooling1D\n", 161 | "from sklearn.cross_validation import train_test_split\n", 162 | "from keras.optimizers import RMSprop, Adam\n", 163 | "from keras.callbacks import ReduceLROnPlateau\n", 164 | "from keras.callbacks import ModelCheckpoint\n", 165 | "from keras.utils.np_utils import to_categorical" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": 12, 171 | "metadata": { 172 | "collapsed": false 173 | }, 174 | "outputs": [], 175 | "source": [ 176 | "X_train,X_val,y_train,y_val= train_test_split(train_X,Y_train,test_size=0.2,random_state=2)" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": 13, 182 | "metadata": { 183 | "collapsed": true 184 | }, 185 | "outputs": [], 186 | "source": [ 187 | "X_train = X_train.values\n", 188 | "X_val = X_val.values" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": 14, 194 | "metadata": { 195 | "collapsed": true 196 | }, 197 | "outputs": [], 198 | "source": [ 199 | "y_train = y_train.values\n", 200 | "yy_train = to_categorical(y_train)\n", 201 | "\n", 202 | "y_val = y_val.values\n", 203 | "yy_val = to_categorical(y_val)" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": 15, 209 | "metadata": { 210 | "collapsed": false 211 | }, 212 | "outputs": [], 213 | "source": [ 214 | "# Set the CNN model \n", 215 | "# my CNN architecture is In -> 
[[Conv2D->relu]*2 -> MaxPool2D -> Dropout]*2 -> Flatten -> Dense -> Dropout -> Out\n", 216 | "\n", 217 | "model = Sequential()\n", 218 | "\n", 219 | "model.add(BatchNormalization(input_shape=(355,)))\n", 220 | "model.add(Reshape((355,1,1)))\n", 221 | "\n", 222 | "\n", 223 | "model.add(Conv2D(filters = 16, kernel_size = 5,padding = 'Same', \n", 224 | " activation ='relu'))\n", 225 | "model.add(Conv2D(filters = 16, kernel_size = 5,padding = 'Same', \n", 226 | " activation ='relu'))\n", 227 | "model.add(MaxPooling2D(pool_size=2, padding='same'))\n", 228 | "# model.add(Dropout(0.25))\n", 229 | "\n", 230 | "\n", 231 | "model.add(Conv2D(filters = 32, kernel_size = 3,padding = 'Same', \n", 232 | " activation ='relu'))\n", 233 | "model.add(Conv2D(filters = 32, kernel_size = 3,padding = 'Same', \n", 234 | " activation ='relu'))\n", 235 | "model.add(MaxPooling2D(pool_size=2, strides=2, padding='same'))\n", 236 | "# model.add(Dropout(0.25))\n", 237 | "\n", 238 | "\n", 239 | "model.add(Flatten())\n", 240 | "model.add(Dense(256, activation = 'relu'))\n", 241 | "model.add(Dropout(0.5))\n", 242 | "model.add(Dense(2, activation = 'softmax'))" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": 16, 248 | "metadata": { 249 | "collapsed": true 250 | }, 251 | "outputs": [], 252 | "source": [ 253 | "model.compile(loss='categorical_crossentropy',optimizer=Adam(),metrics=['accuracy'])" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": 17, 259 | "metadata": { 260 | "collapsed": false, 261 | "scrolled": true 262 | }, 263 | "outputs": [ 264 | { 265 | "name": "stdout", 266 | "output_type": "stream", 267 | "text": [ 268 | "Train on 10800 samples, validate on 1200 samples\n", 269 | "Epoch 1/15\n", 270 | "10800/10800 [==============================] - 14s 1ms/step - loss: 0.2493 - acc: 0.9522 - val_loss: 0.4120 - val_acc: 0.9583\n", 271 | "Epoch 2/15\n", 272 | "10800/10800 [==============================] - 14s 1ms/step - loss: 0.1865 - acc: 
0.9530 - val_loss: 0.2823 - val_acc: 0.9583\n", 273 | "Epoch 3/15\n", 274 | "10800/10800 [==============================] - 14s 1ms/step - loss: 0.1729 - acc: 0.9530 - val_loss: 0.2359 - val_acc: 0.9583\n", 275 | "Epoch 4/15\n", 276 | "10800/10800 [==============================] - 13s 1ms/step - loss: 0.1679 - acc: 0.9530 - val_loss: 0.1838 - val_acc: 0.9583\n", 277 | "Epoch 5/15\n", 278 | "10800/10800 [==============================] - 14s 1ms/step - loss: 0.1632 - acc: 0.9530 - val_loss: 0.1968 - val_acc: 0.9583\n", 279 | "Epoch 6/15\n", 280 | "10800/10800 [==============================] - 14s 1ms/step - loss: 0.1657 - acc: 0.9530 - val_loss: 0.1643 - val_acc: 0.9583\n", 281 | "Epoch 7/15\n", 282 | "10800/10800 [==============================] - 14s 1ms/step - loss: 0.1614 - acc: 0.9530 - val_loss: 0.2133 - val_acc: 0.9583\n", 283 | "Epoch 8/15\n", 284 | "10800/10800 [==============================] - 13s 1ms/step - loss: 0.1626 - acc: 0.9530 - val_loss: 0.1540 - val_acc: 0.9583\n", 285 | "Epoch 9/15\n", 286 | "10800/10800 [==============================] - 15s 1ms/step - loss: 0.1612 - acc: 0.9530 - val_loss: 0.1574 - val_acc: 0.9583\n", 287 | "Epoch 10/15\n", 288 | "10800/10800 [==============================] - 14s 1ms/step - loss: 0.1605 - acc: 0.9530 - val_loss: 0.1564 - val_acc: 0.9583\n", 289 | "Epoch 11/15\n", 290 | "10800/10800 [==============================] - 15s 1ms/step - loss: 0.1586 - acc: 0.9530 - val_loss: 0.1549 - val_acc: 0.9583\n", 291 | "Epoch 12/15\n", 292 | "10800/10800 [==============================] - 14s 1ms/step - loss: 0.1585 - acc: 0.9530 - val_loss: 0.1545 - val_acc: 0.9583\n", 293 | "Epoch 13/15\n", 294 | "10800/10800 [==============================] - 14s 1ms/step - loss: 0.1594 - acc: 0.9530 - val_loss: 0.1565 - val_acc: 0.9583\n", 295 | "Epoch 14/15\n", 296 | "10800/10800 [==============================] - 15s 1ms/step - loss: 0.1570 - acc: 0.9530 - val_loss: 0.1589 - val_acc: 0.9583\n", 297 | "Epoch 15/15\n", 298 | 
"10800/10800 [==============================] - 14s 1ms/step - loss: 0.1569 - acc: 0.9529 - val_loss: 0.1572 - val_acc: 0.9583\n" 299 | ] 300 | } 301 | ], 302 | "source": [ 303 | "history=model.fit(X_train,yy_train, batch_size=256, epochs=15, verbose=1, validation_split=0.1)" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": 18, 309 | "metadata": { 310 | "collapsed": false 311 | }, 312 | "outputs": [ 313 | { 314 | "name": "stdout", 315 | "output_type": "stream", 316 | "text": [ 317 | "3000/3000 [==============================] - 1s 452us/step\n", 318 | "0.764895321667\n" 319 | ] 320 | } 321 | ], 322 | "source": [ 323 | "predictions = model.predict_proba(X_val,verbose=1)\n", 324 | "pre = predictions[:,1]\n", 325 | "val_auc = metrics.roc_auc_score(y_val,pre)#验证集上的auc值\n", 326 | "print(val_auc)" 327 | ] 328 | }, 329 | { 330 | "cell_type": "code", 331 | "execution_count": 19, 332 | "metadata": { 333 | "collapsed": false 334 | }, 335 | "outputs": [ 336 | { 337 | "name": "stdout", 338 | "output_type": "stream", 339 | "text": [ 340 | "10000/10000 [==============================] - 5s 453us/step\n" 341 | ] 342 | }, 343 | { 344 | "data": { 345 | "text/plain": [ 346 | "(10000,)" 347 | ] 348 | }, 349 | "execution_count": 19, 350 | "metadata": {}, 351 | "output_type": "execute_result" 352 | } 353 | ], 354 | "source": [ 355 | "preds = model.predict_proba(test_X.values)\n", 356 | "pred = preds[:,1]\n", 357 | "pred.shape" 358 | ] 359 | }, 360 | { 361 | "cell_type": "code", 362 | "execution_count": 20, 363 | "metadata": { 364 | "collapsed": true 365 | }, 366 | "outputs": [], 367 | "source": [ 368 | "Submission = pd.DataFrame({'cust_id': test['cust_id'], 'pred_prob': pred})\n", 369 | "Submission.to_csv('../result/nn.csv',index=False)" 370 | ] 371 | }, 372 | { 373 | "cell_type": "code", 374 | "execution_count": null, 375 | "metadata": { 376 | "collapsed": true 377 | }, 378 | "outputs": [], 379 | "source": [] 380 | }, 381 | { 382 | "cell_type": "code", 383 
| "execution_count": null, 384 | "metadata": { 385 | "collapsed": true 386 | }, 387 | "outputs": [], 388 | "source": [] 389 | } 390 | ], 391 | "metadata": { 392 | "anaconda-cloud": {}, 393 | "kernelspec": { 394 | "display_name": "Python [default]", 395 | "language": "python", 396 | "name": "python3" 397 | }, 398 | "language_info": { 399 | "codemirror_mode": { 400 | "name": "ipython", 401 | "version": 3 402 | }, 403 | "file_extension": ".py", 404 | "mimetype": "text/x-python", 405 | "name": "python", 406 | "nbconvert_exporter": "python", 407 | "pygments_lexer": "ipython3", 408 | "version": "3.5.2" 409 | } 410 | }, 411 | "nbformat": 4, 412 | "nbformat_minor": 1 413 | } 414 | -------------------------------------------------------------------------------- /模型二代码b榜0.749880/code/.ipynb_checkpoints/nn预测-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np\n", 12 | "import pandas as pd\n", 13 | "import matplotlib.pyplot as plt\n", 14 | "%matplotlib inline " 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "metadata": { 21 | "collapsed": false 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "train = pd.read_csv('../data/train_xy.csv')" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 3, 31 | "metadata": { 32 | "collapsed": true 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "test = pd.read_csv('../data/test_all.csv')" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 5, 42 | "metadata": { 43 | "collapsed": false 44 | }, 45 | "outputs": [ 46 | { 47 | "data": { 48 | "text/plain": [ 49 | "(10000, 157)" 50 | ] 51 | }, 52 | "execution_count": 5, 53 | "metadata": {}, 54 | "output_type": "execute_result" 55 | } 56 | ], 57 | "source": [ 58 | "x_train = 
train.drop(['cust_group','y','\\ufeffcust_id'],axis =1)\n", 59 | "x_test = test.drop(['cust_group','cust_id'],axis=1)\n", 60 | "x_train.shape\n", 61 | "x_test.shape" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 6, 67 | "metadata": { 68 | "collapsed": false 69 | }, 70 | "outputs": [ 71 | { 72 | "data": { 73 | "text/plain": [ 74 | "(25000, 157)" 75 | ] 76 | }, 77 | "execution_count": 6, 78 | "metadata": {}, 79 | "output_type": "execute_result" 80 | } 81 | ], 82 | "source": [ 83 | "x = pd.concat([x_train,x_test])\n", 84 | "x.shape" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 7, 90 | "metadata": { 91 | "collapsed": true 92 | }, 93 | "outputs": [], 94 | "source": [ 95 | "Y_train = train['y']" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 8, 101 | "metadata": { 102 | "collapsed": false 103 | }, 104 | "outputs": [], 105 | "source": [ 106 | "for i in range(96,158):\n", 107 | " col = 'x'+'_'+str(i)\n", 108 | " dummies_df = pd.get_dummies(x[col]).rename(columns=lambda x: col + str(x))\n", 109 | " x = pd.concat([x, dummies_df], axis=1)" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 9, 115 | "metadata": { 116 | "collapsed": false 117 | }, 118 | "outputs": [ 119 | { 120 | "name": "stdout", 121 | "output_type": "stream", 122 | "text": [ 123 | "(15000, 355)\n", 124 | "(10000, 355)\n" 125 | ] 126 | } 127 | ], 128 | "source": [ 129 | "train_X = x[0:15000]\n", 130 | "test_X = x[15000:25000]\n", 131 | "print(train_X.shape)\n", 132 | "print(test_X.shape)" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 10, 138 | "metadata": { 139 | "collapsed": false 140 | }, 141 | "outputs": [ 142 | { 143 | "name": "stderr", 144 | "output_type": "stream", 145 | "text": [ 146 | "Using TensorFlow backend.\n", 147 | "C:\\Program Files\\Anaconda3\\lib\\site-packages\\sklearn\\cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in 
favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n", 148 | " \"This module will be removed in 0.20.\", DeprecationWarning)\n" 149 | ] 150 | } 151 | ], 152 | "source": [ 153 | "from sklearn.metrics import accuracy_score\n", 154 | "from sklearn import metrics\n", 155 | "from xgboost import XGBClassifier\n", 156 | "from sklearn.ensemble import GradientBoostingClassifier\n", 157 | "\n", 158 | "from keras.models import Sequential\n", 159 | "from keras.layers import Dense, BatchNormalization, Dropout, Reshape, Flatten, MaxPool2D\n", 160 | "from keras.layers.convolutional import Conv2D, MaxPooling2D, Conv1D, MaxPooling1D\n", 161 | "from sklearn.cross_validation import train_test_split\n", 162 | "from keras.optimizers import RMSprop, Adam\n", 163 | "from keras.callbacks import ReduceLROnPlateau\n", 164 | "from keras.callbacks import ModelCheckpoint\n", 165 | "from keras.utils.np_utils import to_categorical" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": 12, 171 | "metadata": { 172 | "collapsed": false 173 | }, 174 | "outputs": [], 175 | "source": [ 176 | "X_train,X_val,y_train,y_val= train_test_split(train_X,Y_train,test_size=0.2,random_state=2)" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": 13, 182 | "metadata": { 183 | "collapsed": true 184 | }, 185 | "outputs": [], 186 | "source": [ 187 | "X_train = X_train.values\n", 188 | "X_val = X_val.values" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": 14, 194 | "metadata": { 195 | "collapsed": true 196 | }, 197 | "outputs": [], 198 | "source": [ 199 | "y_train = y_train.values\n", 200 | "yy_train = to_categorical(y_train)\n", 201 | "\n", 202 | "y_val = y_val.values\n", 203 | "yy_val = to_categorical(y_val)" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 
| "execution_count": 15, 209 | "metadata": { 210 | "collapsed": false 211 | }, 212 | "outputs": [], 213 | "source": [ 214 | "# Set the CNN model \n", 215 | "# my CNN architecture is In -> [[Conv2D->relu]*2 -> MaxPool2D -> Dropout]*2 -> Flatten -> Dense -> Dropout -> Out\n", 216 | "\n", 217 | "model = Sequential()\n", 218 | "\n", 219 | "model.add(BatchNormalization(input_shape=(355,)))\n", 220 | "model.add(Reshape((355,1,1)))\n", 221 | "\n", 222 | "\n", 223 | "model.add(Conv2D(filters = 16, kernel_size = 5,padding = 'Same', \n", 224 | " activation ='relu'))\n", 225 | "model.add(Conv2D(filters = 16, kernel_size = 5,padding = 'Same', \n", 226 | " activation ='relu'))\n", 227 | "model.add(MaxPooling2D(pool_size=2, padding='same'))\n", 228 | "# model.add(Dropout(0.25))\n", 229 | "\n", 230 | "\n", 231 | "model.add(Conv2D(filters = 32, kernel_size = 3,padding = 'Same', \n", 232 | " activation ='relu'))\n", 233 | "model.add(Conv2D(filters = 32, kernel_size = 3,padding = 'Same', \n", 234 | " activation ='relu'))\n", 235 | "model.add(MaxPooling2D(pool_size=2, strides=2, padding='same'))\n", 236 | "# model.add(Dropout(0.25))\n", 237 | "\n", 238 | "\n", 239 | "model.add(Flatten())\n", 240 | "model.add(Dense(256, activation = 'relu'))\n", 241 | "model.add(Dropout(0.5))\n", 242 | "model.add(Dense(2, activation = 'softmax'))" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": 16, 248 | "metadata": { 249 | "collapsed": true 250 | }, 251 | "outputs": [], 252 | "source": [ 253 | "model.compile(loss='categorical_crossentropy',optimizer=Adam(),metrics=['accuracy'])" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": 17, 259 | "metadata": { 260 | "collapsed": false, 261 | "scrolled": true 262 | }, 263 | "outputs": [ 264 | { 265 | "name": "stdout", 266 | "output_type": "stream", 267 | "text": [ 268 | "Train on 10800 samples, validate on 1200 samples\n", 269 | "Epoch 1/15\n", 270 | "10800/10800 [==============================] - 14s 
1ms/step - loss: 0.2493 - acc: 0.9522 - val_loss: 0.4120 - val_acc: 0.9583\n", 271 | "Epoch 2/15\n", 272 | "10800/10800 [==============================] - 14s 1ms/step - loss: 0.1865 - acc: 0.9530 - val_loss: 0.2823 - val_acc: 0.9583\n", 273 | "Epoch 3/15\n", 274 | "10800/10800 [==============================] - 14s 1ms/step - loss: 0.1729 - acc: 0.9530 - val_loss: 0.2359 - val_acc: 0.9583\n", 275 | "Epoch 4/15\n", 276 | "10800/10800 [==============================] - 13s 1ms/step - loss: 0.1679 - acc: 0.9530 - val_loss: 0.1838 - val_acc: 0.9583\n", 277 | "Epoch 5/15\n", 278 | "10800/10800 [==============================] - 14s 1ms/step - loss: 0.1632 - acc: 0.9530 - val_loss: 0.1968 - val_acc: 0.9583\n", 279 | "Epoch 6/15\n", 280 | "10800/10800 [==============================] - 14s 1ms/step - loss: 0.1657 - acc: 0.9530 - val_loss: 0.1643 - val_acc: 0.9583\n", 281 | "Epoch 7/15\n", 282 | "10800/10800 [==============================] - 14s 1ms/step - loss: 0.1614 - acc: 0.9530 - val_loss: 0.2133 - val_acc: 0.9583\n", 283 | "Epoch 8/15\n", 284 | "10800/10800 [==============================] - 13s 1ms/step - loss: 0.1626 - acc: 0.9530 - val_loss: 0.1540 - val_acc: 0.9583\n", 285 | "Epoch 9/15\n", 286 | "10800/10800 [==============================] - 15s 1ms/step - loss: 0.1612 - acc: 0.9530 - val_loss: 0.1574 - val_acc: 0.9583\n", 287 | "Epoch 10/15\n", 288 | "10800/10800 [==============================] - 14s 1ms/step - loss: 0.1605 - acc: 0.9530 - val_loss: 0.1564 - val_acc: 0.9583\n", 289 | "Epoch 11/15\n", 290 | "10800/10800 [==============================] - 15s 1ms/step - loss: 0.1586 - acc: 0.9530 - val_loss: 0.1549 - val_acc: 0.9583\n", 291 | "Epoch 12/15\n", 292 | "10800/10800 [==============================] - 14s 1ms/step - loss: 0.1585 - acc: 0.9530 - val_loss: 0.1545 - val_acc: 0.9583\n", 293 | "Epoch 13/15\n", 294 | "10800/10800 [==============================] - 14s 1ms/step - loss: 0.1594 - acc: 0.9530 - val_loss: 0.1565 - val_acc: 0.9583\n", 295 | 
"Epoch 14/15\n", 296 | "10800/10800 [==============================] - 15s 1ms/step - loss: 0.1570 - acc: 0.9530 - val_loss: 0.1589 - val_acc: 0.9583\n", 297 | "Epoch 15/15\n", 298 | "10800/10800 [==============================] - 14s 1ms/step - loss: 0.1569 - acc: 0.9529 - val_loss: 0.1572 - val_acc: 0.9583\n" 299 | ] 300 | } 301 | ], 302 | "source": [ 303 | "history=model.fit(X_train,yy_train, batch_size=256, epochs=15, verbose=1, validation_split=0.1)" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": 18, 309 | "metadata": { 310 | "collapsed": false 311 | }, 312 | "outputs": [ 313 | { 314 | "name": "stdout", 315 | "output_type": "stream", 316 | "text": [ 317 | "3000/3000 [==============================] - 1s 452us/step\n", 318 | "0.764895321667\n" 319 | ] 320 | } 321 | ], 322 | "source": [ 323 | "predictions = model.predict_proba(X_val,verbose=1)\n", 324 | "pre = predictions[:,1]\n", 325 | "val_auc = metrics.roc_auc_score(y_val,pre)#验证集上的auc值\n", 326 | "print(val_auc)" 327 | ] 328 | }, 329 | { 330 | "cell_type": "code", 331 | "execution_count": 19, 332 | "metadata": { 333 | "collapsed": false 334 | }, 335 | "outputs": [ 336 | { 337 | "name": "stdout", 338 | "output_type": "stream", 339 | "text": [ 340 | "10000/10000 [==============================] - 5s 453us/step\n" 341 | ] 342 | }, 343 | { 344 | "data": { 345 | "text/plain": [ 346 | "(10000,)" 347 | ] 348 | }, 349 | "execution_count": 19, 350 | "metadata": {}, 351 | "output_type": "execute_result" 352 | } 353 | ], 354 | "source": [ 355 | "preds = model.predict_proba(test_X.values)\n", 356 | "pred = preds[:,1]\n", 357 | "pred.shape" 358 | ] 359 | }, 360 | { 361 | "cell_type": "code", 362 | "execution_count": 20, 363 | "metadata": { 364 | "collapsed": true 365 | }, 366 | "outputs": [], 367 | "source": [ 368 | "Submission = pd.DataFrame({'cust_id': test['cust_id'], 'pred_prob': pred})\n", 369 | "Submission.to_csv('../result/nn.csv',index=False)" 370 | ] 371 | }, 372 | { 373 | 
"cell_type": "code", 374 | "execution_count": null, 375 | "metadata": { 376 | "collapsed": true 377 | }, 378 | "outputs": [], 379 | "source": [] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "execution_count": null, 384 | "metadata": { 385 | "collapsed": true 386 | }, 387 | "outputs": [], 388 | "source": [] 389 | } 390 | ], 391 | "metadata": { 392 | "anaconda-cloud": {}, 393 | "kernelspec": { 394 | "display_name": "Python [default]", 395 | "language": "python", 396 | "name": "python3" 397 | }, 398 | "language_info": { 399 | "codemirror_mode": { 400 | "name": "ipython", 401 | "version": 3 402 | }, 403 | "file_extension": ".py", 404 | "mimetype": "text/x-python", 405 | "name": "python", 406 | "nbconvert_exporter": "python", 407 | "pygments_lexer": "ipython3", 408 | "version": "3.5.2" 409 | } 410 | }, 411 | "nbformat": 4, 412 | "nbformat_minor": 1 413 | } 414 | -------------------------------------------------------------------------------- /模型二代码b榜0.749880/code/semi_gbdt预测.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np\n", 12 | "import pandas as pd\n", 13 | "import matplotlib.pyplot as plt\n", 14 | "%matplotlib inline" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "metadata": { 21 | "collapsed": false 22 | }, 23 | "outputs": [ 24 | { 25 | "data": { 26 | "text/plain": [ 27 | "(10000, 160)" 28 | ] 29 | }, 30 | "execution_count": 2, 31 | "metadata": {}, 32 | "output_type": "execute_result" 33 | } 34 | ], 35 | "source": [ 36 | "train_x1 = pd.read_csv('../data/train_ssvm_xy.csv')\n", 37 | "train_x1.shape" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 3, 43 | "metadata": { 44 | "collapsed": false 45 | }, 46 | "outputs": [ 47 | { 48 | "data": { 49 | "text/plain": [ 50 | "(15000, 160)" 51 
| ] 52 | }, 53 | "execution_count": 3, 54 | "metadata": {}, 55 | "output_type": "execute_result" 56 | } 57 | ], 58 | "source": [ 59 | "train_x2 = pd.read_csv('../data/train_xy.csv')\n", 60 | "train_x2.shape" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 4, 66 | "metadata": { 67 | "collapsed": false 68 | }, 69 | "outputs": [ 70 | { 71 | "data": { 72 | "text/plain": [ 73 | "(10000, 159)" 74 | ] 75 | }, 76 | "execution_count": 4, 77 | "metadata": {}, 78 | "output_type": "execute_result" 79 | } 80 | ], 81 | "source": [ 82 | "test = pd.read_csv('../data/test_all.csv')\n", 83 | "test.shape" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 5, 89 | "metadata": { 90 | "collapsed": false 91 | }, 92 | "outputs": [ 93 | { 94 | "data": { 95 | "text/plain": [ 96 | "(25000, 157)" 97 | ] 98 | }, 99 | "execution_count": 5, 100 | "metadata": {}, 101 | "output_type": "execute_result" 102 | } 103 | ], 104 | "source": [ 105 | "train_x11 = train_x1.drop(['cust_id','cust_group','y'],axis=1)\n", 106 | "train_x22 = train_x2.drop(['\\ufeffcust_id','cust_group','y'],axis=1)\n", 107 | "train_x = pd.concat([train_x11, train_x22])\n", 108 | "train_x.shape" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 6, 114 | "metadata": { 115 | "collapsed": false 116 | }, 117 | "outputs": [ 118 | { 119 | "data": { 120 | "text/plain": [ 121 | "(10000, 157)" 122 | ] 123 | }, 124 | "execution_count": 6, 125 | "metadata": {}, 126 | "output_type": "execute_result" 127 | } 128 | ], 129 | "source": [ 130 | "test_x = test.drop(['cust_id','cust_group'],axis=1)\n", 131 | "test_x.shape" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 7, 137 | "metadata": { 138 | "collapsed": false 139 | }, 140 | "outputs": [ 141 | { 142 | "data": { 143 | "text/plain": [ 144 | "(35000, 157)" 145 | ] 146 | }, 147 | "execution_count": 7, 148 | "metadata": {}, 149 | "output_type": "execute_result" 150 | } 151 | ], 152 | "source": [ 
153 | "x = pd.concat([train_x, test_x])\n", 154 | "x.shape" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": 8, 160 | "metadata": { 161 | "collapsed": false 162 | }, 163 | "outputs": [ 164 | { 165 | "data": { 166 | "text/plain": [ 167 | "(25000,)" 168 | ] 169 | }, 170 | "execution_count": 8, 171 | "metadata": {}, 172 | "output_type": "execute_result" 173 | } 174 | ], 175 | "source": [ 176 | "train_y1 = train_x1['y']\n", 177 | "train_y2 = train_x2['y']\n", 178 | "Y_train = train_y1.append(train_y2)\n", 179 | "Y_train.shape" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": 9, 185 | "metadata": { 186 | "collapsed": false 187 | }, 188 | "outputs": [ 189 | { 190 | "name": "stdout", 191 | "output_type": "stream", 192 | "text": [ 193 | "(35000, 364)\n" 194 | ] 195 | } 196 | ], 197 | "source": [ 198 | "for i in range(96,158):\n", 199 | " col = 'x'+'_'+str(i)\n", 200 | " if col in x.columns.values:\n", 201 | " dummies_df = pd.get_dummies(x[col]).rename(columns=lambda x: col +'_'+ str(x))\n", 202 | " x = pd.concat([x, dummies_df], axis=1)\n", 203 | "print(x.shape)" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": 10, 209 | "metadata": { 210 | "collapsed": false 211 | }, 212 | "outputs": [ 213 | { 214 | "name": "stdout", 215 | "output_type": "stream", 216 | "text": [ 217 | "(25000, 364)\n", 218 | "(10000, 364)\n" 219 | ] 220 | } 221 | ], 222 | "source": [ 223 | "train_X = x[0:25000]\n", 224 | "test_X = x[25000:35000]\n", 225 | "print(train_X.shape)\n", 226 | "print(test_X.shape)" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": 11, 232 | "metadata": { 233 | "collapsed": false 234 | }, 235 | "outputs": [ 236 | { 237 | "name": "stderr", 238 | "output_type": "stream", 239 | "text": [ 240 | "C:\\Program Files\\Anaconda3\\lib\\site-packages\\sklearn\\cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection 
module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n", 241 | " \"This module will be removed in 0.20.\", DeprecationWarning)\n", 242 | "Using TensorFlow backend.\n" 243 | ] 244 | } 245 | ], 246 | "source": [ 247 | "from sklearn.tree import DecisionTreeClassifier\n", 248 | "from sklearn.ensemble import RandomForestClassifier\n", 249 | "from sklearn.ensemble import AdaBoostClassifier\n", 250 | "from sklearn.ensemble import ExtraTreesClassifier\n", 251 | "from sklearn.ensemble import GradientBoostingClassifier\n", 252 | "from sklearn.neighbors import KNeighborsClassifier\n", 253 | "from sklearn.svm import SVC\n", 254 | "from sklearn import metrics #accuracy_score,recall_score,f1_score\n", 255 | "from sklearn.metrics import classification_report\n", 256 | "from sklearn.metrics import precision_recall_fscore_support\n", 257 | "from sklearn.utils.multiclass import unique_labels\n", 258 | "from sklearn.metrics import accuracy_score\n", 259 | "from xgboost import XGBClassifier\n", 260 | "from sklearn.ensemble import GradientBoostingClassifier\n", 261 | "from sklearn.cross_validation import cross_val_score\n", 262 | "from lightgbm import LGBMClassifier\n", 263 | "from sklearn.model_selection import train_test_split, GridSearchCV, KFold\n", 264 | "from sklearn.svm import LinearSVC\n", 265 | "from sklearn import linear_model\n", 266 | "import lightgbm as lgb\n", 267 | "import xgboost as xgb\n", 268 | "from sklearn.model_selection import GridSearchCV\n", 269 | "\n", 270 | "from keras.models import Model\n", 271 | "from keras.layers import Dense, Input" 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": 12, 277 | "metadata": { 278 | "collapsed": false, 279 | "scrolled": true 280 | }, 281 | "outputs": [], 282 | "source": [ 283 | "# encoding_dim = 600\n", 284 | "# input_dim = Input(shape=(364,))\n", 285 | 
"\n", 286 | "# encoded = Dense(364, activation='linear')(input_dim)\n", 287 | "# # encoded = Dense(300, activation='relu')(encoded)\n", 288 | "# # encoded = Dense(32, activation='relu')(encoded)\n", 289 | "# encoder_output = Dense(encoding_dim)(encoded)\n", 290 | "\n", 291 | "# decoded = Dense(600, activation='relu')(encoder_output)\n", 292 | "# # decoded = Dense(64, activation='relu')(decoded)\n", 293 | "# # decoded = Dense(128, activation='relu')(decoded)\n", 294 | "# decoded = Dense(364, activation='tanh')(decoded)\n", 295 | "\n", 296 | "# autoencoder = Model(inputs=input_dim, outputs=decoded)\n", 297 | "\n", 298 | "# encoder = Model(inputs=input_dim, outputs=encoder_output)\n", 299 | "\n", 300 | "# autoencoder.compile(optimizer='adam', loss='mse')\n", 301 | "# # training\n", 302 | "# autoencoder.fit(train_X.values, train_X.values, epochs=20, batch_size=150, shuffle=True)" 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": 13, 308 | "metadata": { 309 | "collapsed": false 310 | }, 311 | "outputs": [], 312 | "source": [ 313 | "# new_train_feature = encoder.predict(train_X.values)\n", 314 | "# new_test_feature = encoder.predict(test_X.values)\n", 315 | "# print(new_train_feature.shape)\n", 316 | "# print(new_test_feature.shape)" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": 14, 322 | "metadata": { 323 | "collapsed": false 324 | }, 325 | "outputs": [], 326 | "source": [ 327 | "X_train,X_val,y_train,y_val= train_test_split(train_X,Y_train,test_size=0.2,random_state=2)" 328 | ] 329 | }, 330 | { 331 | "cell_type": "code", 332 | "execution_count": 15, 333 | "metadata": { 334 | "collapsed": false, 335 | "scrolled": true 336 | }, 337 | "outputs": [ 338 | { 339 | "data": { 340 | "text/plain": [ 341 | "GridSearchCV(cv=5, error_score='raise',\n", 342 | " estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,\n", 343 | " learning_rate=0.1, loss='deviance', max_depth=3,\n", 344 | " max_features=None, 
max_leaf_nodes=None,\n", 345 | " min_impurity_split=1e-07, min_samples_leaf=1,\n", 346 | " min_samples_split=2, min_weight_fraction_leaf=0.0,\n", 347 | " n_estimators=100, presort='auto', random_state=None,\n", 348 | " subsample=1.0, verbose=0, warm_start=False),\n", 349 | " fit_params={}, iid=True, n_jobs=1,\n", 350 | " param_grid=[{'max_depth': range(4, 8, 12), 'n_estimators': range(100, 300, 500), 'learning_rate': [0.01, 0.1]}],\n", 351 | " pre_dispatch='2*n_jobs', refit=True, return_train_score=True,\n", 352 | " scoring='roc_auc', verbose=0)" 353 | ] 354 | }, 355 | "execution_count": 15, 356 | "metadata": {}, 357 | "output_type": "execute_result" 358 | } 359 | ], 360 | "source": [ 361 | "tuned_parameters= [{'n_estimators':range(100,300,500),\n", 362 | " 'max_depth':range(4,8,12),\n", 363 | " 'learning_rate':[0.01, 0.1]\n", 364 | " }]\n", 365 | "clf = GridSearchCV(GradientBoostingClassifier(), tuned_parameters, cv=5, scoring='roc_auc')\n", 366 | "clf.fit(X_train, y_train)" 367 | ] 368 | }, 369 | { 370 | "cell_type": "code", 371 | "execution_count": 18, 372 | "metadata": { 373 | "collapsed": false 374 | }, 375 | "outputs": [ 376 | { 377 | "name": "stdout", 378 | "output_type": "stream", 379 | "text": [ 380 | "0.805375686875\n" 381 | ] 382 | } 383 | ], 384 | "source": [ 385 | "predictions = clf.predict_proba(X_val)\n", 386 | "pre = predictions[:,1]\n", 387 | "val_auc = metrics.roc_auc_score(y_val,pre)#验证集上的auc值\n", 388 | "print(val_auc) " 389 | ] 390 | }, 391 | { 392 | "cell_type": "code", 393 | "execution_count": 19, 394 | "metadata": { 395 | "collapsed": false 396 | }, 397 | "outputs": [ 398 | { 399 | "data": { 400 | "text/plain": [ 401 | "(10000,)" 402 | ] 403 | }, 404 | "execution_count": 19, 405 | "metadata": {}, 406 | "output_type": "execute_result" 407 | } 408 | ], 409 | "source": [ 410 | "preds = clf.predict_proba(test_X)\n", 411 | "pred = preds[:,1]\n", 412 | "pred.shape" 413 | ] 414 | }, 415 | { 416 | "cell_type": "code", 417 | "execution_count": 20, 418 
| "metadata": { 419 | "collapsed": true 420 | }, 421 | "outputs": [], 422 | "source": [ 423 | "Submission = pd.DataFrame({'cust_id': test['cust_id'], 'pred_prob': pred})\n", 424 | "Submission.to_csv('../result/semi_gbdt.csv',index=False)" 425 | ] 426 | }, 427 | { 428 | "cell_type": "code", 429 | "execution_count": null, 430 | "metadata": { 431 | "collapsed": true 432 | }, 433 | "outputs": [], 434 | "source": [] 435 | }, 436 | { 437 | "cell_type": "code", 438 | "execution_count": null, 439 | "metadata": { 440 | "collapsed": true 441 | }, 442 | "outputs": [], 443 | "source": [] 444 | } 445 | ], 446 | "metadata": { 447 | "anaconda-cloud": {}, 448 | "kernelspec": { 449 | "display_name": "Python [default]", 450 | "language": "python", 451 | "name": "python3" 452 | }, 453 | "language_info": { 454 | "codemirror_mode": { 455 | "name": "ipython", 456 | "version": 3 457 | }, 458 | "file_extension": ".py", 459 | "mimetype": "text/x-python", 460 | "name": "python", 461 | "nbconvert_exporter": "python", 462 | "pygments_lexer": "ipython3", 463 | "version": "3.5.2" 464 | } 465 | }, 466 | "nbformat": 4, 467 | "nbformat_minor": 1 468 | } 469 | -------------------------------------------------------------------------------- /模型二代码b榜0.749880/code/semi_nn预测.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np\n", 12 | "import pandas as pd\n", 13 | "import matplotlib.pyplot as plt\n", 14 | "%matplotlib inline" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "metadata": { 21 | "collapsed": false 22 | }, 23 | "outputs": [ 24 | { 25 | "data": { 26 | "text/plain": [ 27 | "(10000, 160)" 28 | ] 29 | }, 30 | "execution_count": 2, 31 | "metadata": {}, 32 | "output_type": "execute_result" 33 | } 34 | ], 35 | "source": [ 36 | "train_x1 = 
pd.read_csv('../data/train_ssvm_xy.csv')\n", 37 | "train_x1.shape" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 3, 43 | "metadata": { 44 | "collapsed": false 45 | }, 46 | "outputs": [ 47 | { 48 | "data": { 49 | "text/plain": [ 50 | "(15000, 160)" 51 | ] 52 | }, 53 | "execution_count": 3, 54 | "metadata": {}, 55 | "output_type": "execute_result" 56 | } 57 | ], 58 | "source": [ 59 | "train_x2 = pd.read_csv('../data/train_xy.csv')\n", 60 | "train_x2.shape" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 4, 66 | "metadata": { 67 | "collapsed": false 68 | }, 69 | "outputs": [ 70 | { 71 | "data": { 72 | "text/plain": [ 73 | "(10000, 159)" 74 | ] 75 | }, 76 | "execution_count": 4, 77 | "metadata": {}, 78 | "output_type": "execute_result" 79 | } 80 | ], 81 | "source": [ 82 | "test = pd.read_csv('../data/test_all.csv')\n", 83 | "test.shape" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 5, 89 | "metadata": { 90 | "collapsed": false 91 | }, 92 | "outputs": [ 93 | { 94 | "data": { 95 | "text/plain": [ 96 | "(25000, 157)" 97 | ] 98 | }, 99 | "execution_count": 5, 100 | "metadata": {}, 101 | "output_type": "execute_result" 102 | } 103 | ], 104 | "source": [ 105 | "train_x11 = train_x1.drop(['cust_id','cust_group','y'],axis=1)\n", 106 | "train_x22 = train_x2.drop(['\\ufeffcust_id','cust_group','y'],axis=1)\n", 107 | "train_x = pd.concat([train_x11, train_x22])\n", 108 | "train_x.shape" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 6, 114 | "metadata": { 115 | "collapsed": false 116 | }, 117 | "outputs": [ 118 | { 119 | "data": { 120 | "text/plain": [ 121 | "(10000, 157)" 122 | ] 123 | }, 124 | "execution_count": 6, 125 | "metadata": {}, 126 | "output_type": "execute_result" 127 | } 128 | ], 129 | "source": [ 130 | "test_x = test.drop(['cust_id','cust_group'],axis=1)\n", 131 | "test_x.shape" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 7, 137 
| "metadata": { 138 | "collapsed": false 139 | }, 140 | "outputs": [ 141 | { 142 | "data": { 143 | "text/plain": [ 144 | "(35000, 157)" 145 | ] 146 | }, 147 | "execution_count": 7, 148 | "metadata": {}, 149 | "output_type": "execute_result" 150 | } 151 | ], 152 | "source": [ 153 | "x = pd.concat([train_x, test_x])\n", 154 | "x.shape" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": 8, 160 | "metadata": { 161 | "collapsed": false 162 | }, 163 | "outputs": [ 164 | { 165 | "data": { 166 | "text/plain": [ 167 | "(25000,)" 168 | ] 169 | }, 170 | "execution_count": 8, 171 | "metadata": {}, 172 | "output_type": "execute_result" 173 | } 174 | ], 175 | "source": [ 176 | "train_y1 = train_x1['y']\n", 177 | "train_y2 = train_x2['y']\n", 178 | "Y_train = train_y1.append(train_y2)\n", 179 | "Y_train.shape" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": 9, 185 | "metadata": { 186 | "collapsed": false 187 | }, 188 | "outputs": [ 189 | { 190 | "name": "stdout", 191 | "output_type": "stream", 192 | "text": [ 193 | "(35000, 364)\n" 194 | ] 195 | } 196 | ], 197 | "source": [ 198 | "for i in range(96,158):\n", 199 | " col = 'x'+'_'+str(i)\n", 200 | " if col in x.columns.values:\n", 201 | " dummies_df = pd.get_dummies(x[col]).rename(columns=lambda x: col +'_'+ str(x))\n", 202 | " x = pd.concat([x, dummies_df], axis=1)\n", 203 | "print(x.shape)" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": 10, 209 | "metadata": { 210 | "collapsed": false 211 | }, 212 | "outputs": [ 213 | { 214 | "name": "stdout", 215 | "output_type": "stream", 216 | "text": [ 217 | "(25000, 364)\n", 218 | "(10000, 364)\n" 219 | ] 220 | } 221 | ], 222 | "source": [ 223 | "train_X = x[0:25000]\n", 224 | "test_X = x[25000:35000]\n", 225 | "print(train_X.shape)\n", 226 | "print(test_X.shape)" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": 11, 232 | "metadata": { 233 | "collapsed": false 234 | }, 235 | 
"outputs": [ 236 | { 237 | "name": "stderr", 238 | "output_type": "stream", 239 | "text": [ 240 | "Using TensorFlow backend.\n", 241 | "C:\\Program Files\\Anaconda3\\lib\\site-packages\\sklearn\\cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n", 242 | " \"This module will be removed in 0.20.\", DeprecationWarning)\n" 243 | ] 244 | } 245 | ], 246 | "source": [ 247 | "from sklearn.metrics import accuracy_score\n", 248 | "from sklearn import metrics\n", 249 | "\n", 250 | "from keras.models import Sequential\n", 251 | "from keras.layers import Dense, BatchNormalization, Dropout, Reshape, Flatten, MaxPool2D\n", 252 | "from keras.layers.convolutional import Conv2D, MaxPooling2D, Conv1D, MaxPooling1D\n", 253 | "from sklearn.cross_validation import train_test_split\n", 254 | "from keras.optimizers import RMSprop, Adam\n", 255 | "from keras.callbacks import ReduceLROnPlateau\n", 256 | "from keras.callbacks import ModelCheckpoint\n", 257 | "from keras.utils.np_utils import to_categorical" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": 12, 263 | "metadata": { 264 | "collapsed": false 265 | }, 266 | "outputs": [], 267 | "source": [ 268 | "X_train,X_val,y_train,y_val= train_test_split(train_X,Y_train,test_size=0.2,random_state=2)" 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": 13, 274 | "metadata": { 275 | "collapsed": false 276 | }, 277 | "outputs": [ 278 | { 279 | "name": "stdout", 280 | "output_type": "stream", 281 | "text": [ 282 | "(20000, 364)\n", 283 | "(5000, 364)\n" 284 | ] 285 | } 286 | ], 287 | "source": [ 288 | "X_train = X_train.values\n", 289 | "X_val = X_val.values\n", 290 | "print(X_train.shape)\n", 291 | "print(X_val.shape)" 292 | ] 
293 | }, 294 | { 295 | "cell_type": "code", 296 | "execution_count": 14, 297 | "metadata": { 298 | "collapsed": false 299 | }, 300 | "outputs": [], 301 | "source": [ 302 | "y_train = y_train.values\n", 303 | "yy_train = to_categorical(y_train)\n", 304 | "\n", 305 | "y_val = y_val.values\n", 306 | "yy_val = to_categorical(y_val)" 307 | ] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "execution_count": 15, 312 | "metadata": { 313 | "collapsed": false, 314 | "scrolled": true 315 | }, 316 | "outputs": [], 317 | "source": [ 318 | "# Set the CNN model \n", 319 | "# my CNN architecture is In -> [[Conv2D->relu]*2 -> MaxPool2D -> Dropout]*2 -> Flatten -> Dense -> Dropout -> Out (the Dropout inside each conv block is currently commented out)\n", 320 | "\n", 321 | "model = Sequential()\n", 322 | "\n", 323 | "model.add(BatchNormalization(input_shape=(364,)))\n", 324 | "model.add(Reshape((364,1,1)))\n", 325 | "\n", 326 | "\n", 327 | "model.add(Conv2D(filters = 16, kernel_size = 5,padding = 'Same', \n", 328 | " activation ='relu'))\n", 329 | "model.add(Conv2D(filters = 16, kernel_size = 5,padding = 'Same', \n", 330 | " activation ='relu'))\n", 331 | "model.add(MaxPooling2D(pool_size=2, padding='same'))\n", 332 | "# model.add(Dropout(0.25))\n", 333 | "\n", 334 | "\n", 335 | "model.add(Conv2D(filters = 32, kernel_size = 3,padding = 'Same', \n", 336 | " activation ='relu'))\n", 337 | "model.add(Conv2D(filters = 32, kernel_size = 3,padding = 'Same', \n", 338 | " activation ='relu'))\n", 339 | "model.add(MaxPooling2D(pool_size=2, strides=2, padding='same'))\n", 340 | "# model.add(Dropout(0.25))\n", 341 | "\n", 342 | "\n", 343 | "model.add(Flatten())\n", 344 | "model.add(Dense(256, activation = 'relu'))\n", 345 | "model.add(Dropout(0.5))\n", 346 | "model.add(Dense(2, activation = 'softmax'))" 347 | ] 348 | }, 349 | { 350 | "cell_type": "code", 351 | "execution_count": 16, 352 | "metadata": { 353 | "collapsed": true 354 | }, 355 | "outputs": [], 356 | "source": [ 357 | 
"model.compile(loss='categorical_crossentropy',optimizer=Adam(),metrics=['accuracy'])" 358 | ] 359 | }, 360 | { 361 | "cell_type": "code", 362 | "execution_count": 17, 363 | "metadata": { 364 | "collapsed": false, 365 | "scrolled": true 366 | }, 367 | "outputs": [ 368 | { 369 | "name": "stdout", 370 | "output_type": "stream", 371 | "text": [ 372 | "Train on 18000 samples, validate on 2000 samples\n", 373 | "Epoch 1/15\n", 374 | "18000/18000 [==============================] - 24s 1ms/step - loss: 0.2315 - acc: 0.9452 - val_loss: 0.3059 - val_acc: 0.9550\n", 375 | "Epoch 2/15\n", 376 | "18000/18000 [==============================] - 24s 1ms/step - loss: 0.1728 - acc: 0.9552 - val_loss: 0.2073 - val_acc: 0.9550\n", 377 | "Epoch 3/15\n", 378 | "18000/18000 [==============================] - 24s 1ms/step - loss: 0.1647 - acc: 0.9552 - val_loss: 0.1616 - val_acc: 0.9550\n", 379 | "Epoch 4/15\n", 380 | "18000/18000 [==============================] - 24s 1ms/step - loss: 0.1610 - acc: 0.9552 - val_loss: 0.1613 - val_acc: 0.9550\n", 381 | "Epoch 5/15\n", 382 | "18000/18000 [==============================] - 24s 1ms/step - loss: 0.1608 - acc: 0.9552 - val_loss: 0.1613 - val_acc: 0.9550\n", 383 | "Epoch 6/15\n", 384 | "18000/18000 [==============================] - 24s 1ms/step - loss: 0.1612 - acc: 0.9552 - val_loss: 0.1608 - val_acc: 0.9550\n", 385 | "Epoch 7/15\n", 386 | "18000/18000 [==============================] - 26s 1ms/step - loss: 0.1597 - acc: 0.9552 - val_loss: 0.1600 - val_acc: 0.9550\n", 387 | "Epoch 8/15\n", 388 | "18000/18000 [==============================] - 26s 1ms/step - loss: 0.1592 - acc: 0.9552 - val_loss: 0.1602 - val_acc: 0.9550\n", 389 | "Epoch 9/15\n", 390 | "18000/18000 [==============================] - 27s 1ms/step - loss: 0.1571 - acc: 0.9552 - val_loss: 0.1635 - val_acc: 0.9550\n", 391 | "Epoch 10/15\n", 392 | "18000/18000 [==============================] - 25s 1ms/step - loss: 0.1578 - acc: 0.9552 - val_loss: 0.1620 - val_acc: 0.9550\n", 393 
| "Epoch 11/15\n", 394 | "18000/18000 [==============================] - 25s 1ms/step - loss: 0.1571 - acc: 0.9552 - val_loss: 0.1628 - val_acc: 0.9550\n", 395 | "Epoch 12/15\n", 396 | "18000/18000 [==============================] - 26s 1ms/step - loss: 0.1584 - acc: 0.9552 - val_loss: 0.1633 - val_acc: 0.9550\n", 397 | "Epoch 13/15\n", 398 | "18000/18000 [==============================] - 25s 1ms/step - loss: 0.1557 - acc: 0.9552 - val_loss: 0.1664 - val_acc: 0.9550\n", 399 | "Epoch 14/15\n", 400 | "18000/18000 [==============================] - 25s 1ms/step - loss: 0.1551 - acc: 0.9552 - val_loss: 0.1637 - val_acc: 0.9550\n", 401 | "Epoch 15/15\n", 402 | "18000/18000 [==============================] - 24s 1ms/step - loss: 0.1562 - acc: 0.9552 - val_loss: 0.1631 - val_acc: 0.9550\n" 403 | ] 404 | } 405 | ], 406 | "source": [ 407 | "history=model.fit(X_train,yy_train, batch_size=256, epochs=15, verbose=1, validation_split=0.1)" 408 | ] 409 | }, 410 | { 411 | "cell_type": "code", 412 | "execution_count": 18, 413 | "metadata": { 414 | "collapsed": false 415 | }, 416 | "outputs": [ 417 | { 418 | "name": "stdout", 419 | "output_type": "stream", 420 | "text": [ 421 | "5000/5000 [==============================] - 2s 461us/step\n", 422 | "0.793837946347\n" 423 | ] 424 | } 425 | ], 426 | "source": [ 427 | "predictions = model.predict_proba(X_val,verbose=1)\n", 428 | "pre = predictions[:,1]\n", 429 | "val_auc = metrics.roc_auc_score(y_val,pre)#验证集上的auc值\n", 430 | "print(val_auc)" 431 | ] 432 | }, 433 | { 434 | "cell_type": "code", 435 | "execution_count": 19, 436 | "metadata": { 437 | "collapsed": false 438 | }, 439 | "outputs": [ 440 | { 441 | "name": "stdout", 442 | "output_type": "stream", 443 | "text": [ 444 | "10000/10000 [==============================] - 5s 459us/step\n" 445 | ] 446 | }, 447 | { 448 | "data": { 449 | "text/plain": [ 450 | "(10000,)" 451 | ] 452 | }, 453 | "execution_count": 19, 454 | "metadata": {}, 455 | "output_type": "execute_result" 456 | } 457 | 
], 458 | "source": [ 459 | "preds = model.predict_proba(test_X.values)\n", 460 | "pred = preds[:,1]\n", 461 | "pred.shape" 462 | ] 463 | }, 464 | { 465 | "cell_type": "code", 466 | "execution_count": 20, 467 | "metadata": { 468 | "collapsed": true 469 | }, 470 | "outputs": [], 471 | "source": [ 472 | "Submission = pd.DataFrame({'cust_id': test['cust_id'], 'pred_prob': pred})\n", 473 | "Submission.to_csv('../result/semi_nn.csv',index=False)" 474 | ] 475 | }, 476 | { 477 | "cell_type": "code", 478 | "execution_count": null, 479 | "metadata": { 480 | "collapsed": true 481 | }, 482 | "outputs": [], 483 | "source": [] 484 | }, 485 | { 486 | "cell_type": "code", 487 | "execution_count": null, 488 | "metadata": { 489 | "collapsed": true 490 | }, 491 | "outputs": [], 492 | "source": [] 493 | } 494 | ], 495 | "metadata": { 496 | "anaconda-cloud": {}, 497 | "kernelspec": { 498 | "display_name": "Python [default]", 499 | "language": "python", 500 | "name": "python3" 501 | }, 502 | "language_info": { 503 | "codemirror_mode": { 504 | "name": "ipython", 505 | "version": 3 506 | }, 507 | "file_extension": ".py", 508 | "mimetype": "text/x-python", 509 | "name": "python", 510 | "nbconvert_exporter": "python", 511 | "pygments_lexer": "ipython3", 512 | "version": "3.5.2" 513 | } 514 | }, 515 | "nbformat": 4, 516 | "nbformat_minor": 1 517 | } 518 | -------------------------------------------------------------------------------- /模型二代码b榜0.749880/code/.ipynb_checkpoints/semi_gbdt预测-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np\n", 12 | "import pandas as pd\n", 13 | "import matplotlib.pyplot as plt\n", 14 | "%matplotlib inline" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "metadata": { 21 | "collapsed": false 22 | }, 
23 | "outputs": [ 24 | { 25 | "data": { 26 | "text/plain": [ 27 | "(10000, 160)" 28 | ] 29 | }, 30 | "execution_count": 2, 31 | "metadata": {}, 32 | "output_type": "execute_result" 33 | } 34 | ], 35 | "source": [ 36 | "train_x1 = pd.read_csv('../data/train_ssvm_xy.csv')\n", 37 | "train_x1.shape" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 3, 43 | "metadata": { 44 | "collapsed": false 45 | }, 46 | "outputs": [ 47 | { 48 | "data": { 49 | "text/plain": [ 50 | "(15000, 160)" 51 | ] 52 | }, 53 | "execution_count": 3, 54 | "metadata": {}, 55 | "output_type": "execute_result" 56 | } 57 | ], 58 | "source": [ 59 | "train_x2 = pd.read_csv('../data/train_xy.csv')\n", 60 | "train_x2.shape" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 4, 66 | "metadata": { 67 | "collapsed": false 68 | }, 69 | "outputs": [ 70 | { 71 | "data": { 72 | "text/plain": [ 73 | "(10000, 159)" 74 | ] 75 | }, 76 | "execution_count": 4, 77 | "metadata": {}, 78 | "output_type": "execute_result" 79 | } 80 | ], 81 | "source": [ 82 | "test = pd.read_csv('../data/test_all.csv')\n", 83 | "test.shape" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 5, 89 | "metadata": { 90 | "collapsed": false 91 | }, 92 | "outputs": [ 93 | { 94 | "data": { 95 | "text/plain": [ 96 | "(25000, 157)" 97 | ] 98 | }, 99 | "execution_count": 5, 100 | "metadata": {}, 101 | "output_type": "execute_result" 102 | } 103 | ], 104 | "source": [ 105 | "train_x11 = train_x1.drop(['cust_id','cust_group','y'],axis=1)\n", 106 | "train_x22 = train_x2.drop(['\\ufeffcust_id','cust_group','y'],axis=1)\n", 107 | "train_x = pd.concat([train_x11, train_x22])\n", 108 | "train_x.shape" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 6, 114 | "metadata": { 115 | "collapsed": false 116 | }, 117 | "outputs": [ 118 | { 119 | "data": { 120 | "text/plain": [ 121 | "(10000, 157)" 122 | ] 123 | }, 124 | "execution_count": 6, 125 | "metadata": {}, 126 | 
"output_type": "execute_result" 127 | } 128 | ], 129 | "source": [ 130 | "test_x = test.drop(['cust_id','cust_group'],axis=1)\n", 131 | "test_x.shape" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 7, 137 | "metadata": { 138 | "collapsed": false 139 | }, 140 | "outputs": [ 141 | { 142 | "data": { 143 | "text/plain": [ 144 | "(35000, 157)" 145 | ] 146 | }, 147 | "execution_count": 7, 148 | "metadata": {}, 149 | "output_type": "execute_result" 150 | } 151 | ], 152 | "source": [ 153 | "x = pd.concat([train_x, test_x])\n", 154 | "x.shape" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": 8, 160 | "metadata": { 161 | "collapsed": false 162 | }, 163 | "outputs": [ 164 | { 165 | "data": { 166 | "text/plain": [ 167 | "(25000,)" 168 | ] 169 | }, 170 | "execution_count": 8, 171 | "metadata": {}, 172 | "output_type": "execute_result" 173 | } 174 | ], 175 | "source": [ 176 | "train_y1 = train_x1['y']\n", 177 | "train_y2 = train_x2['y']\n", 178 | "Y_train = train_y1.append(train_y2)\n", 179 | "Y_train.shape" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": 9, 185 | "metadata": { 186 | "collapsed": false 187 | }, 188 | "outputs": [ 189 | { 190 | "name": "stdout", 191 | "output_type": "stream", 192 | "text": [ 193 | "(35000, 364)\n" 194 | ] 195 | } 196 | ], 197 | "source": [ 198 | "for i in range(96,158):\n", 199 | " col = 'x'+'_'+str(i)\n", 200 | " if col in x.columns.values:\n", 201 | " dummies_df = pd.get_dummies(x[col]).rename(columns=lambda x: col +'_'+ str(x))\n", 202 | " x = pd.concat([x, dummies_df], axis=1)\n", 203 | "print(x.shape)" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": 10, 209 | "metadata": { 210 | "collapsed": false 211 | }, 212 | "outputs": [ 213 | { 214 | "name": "stdout", 215 | "output_type": "stream", 216 | "text": [ 217 | "(25000, 364)\n", 218 | "(10000, 364)\n" 219 | ] 220 | } 221 | ], 222 | "source": [ 223 | "train_X = x[0:25000]\n", 224 | 
"test_X = x[25000:35000]\n", 225 | "print(train_X.shape)\n", 226 | "print(test_X.shape)" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": 15, 232 | "metadata": { 233 | "collapsed": false 234 | }, 235 | "outputs": [], 236 | "source": [ 237 | "from sklearn.tree import DecisionTreeClassifier\n", 238 | "from sklearn.ensemble import RandomForestClassifier\n", 239 | "from sklearn.ensemble import AdaBoostClassifier\n", 240 | "from sklearn.ensemble import ExtraTreesClassifier\n", 241 | "from sklearn.ensemble import GradientBoostingClassifier\n", 242 | "from sklearn.neighbors import KNeighborsClassifier\n", 243 | "from sklearn.svm import SVC\n", 244 | "from sklearn import metrics #accuracy_score,recall_score,f1_score\n", 245 | "from sklearn.metrics import classification_report\n", 246 | "from sklearn.metrics import precision_recall_fscore_support\n", 247 | "from sklearn.utils.multiclass import unique_labels\n", 248 | "from sklearn.metrics import accuracy_score\n", 249 | "from xgboost import XGBClassifier\n", 250 | "from sklearn.ensemble import GradientBoostingClassifier\n", 251 | "from sklearn.cross_validation import cross_val_score\n", 252 | "from lightgbm import LGBMClassifier\n", 253 | "from sklearn.model_selection import train_test_split, GridSearchCV, KFold\n", 254 | "from sklearn.svm import LinearSVC\n", 255 | "from sklearn import linear_model\n", 256 | "import lightgbm as lgb\n", 257 | "import xgboost as xgb\n", 258 | "from sklearn.model_selection import GridSearchCV\n", 259 | "\n", 260 | "from keras.models import Model\n", 261 | "from keras.layers import Dense, Input" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": 12, 267 | "metadata": { 268 | "collapsed": false, 269 | "scrolled": true 270 | }, 271 | "outputs": [], 272 | "source": [ 273 | "# encoding_dim = 600\n", 274 | "# input_dim = Input(shape=(364,))\n", 275 | "\n", 276 | "# encoded = Dense(364, activation='linear')(input_dim)\n", 277 | "# # encoded = 
Dense(300, activation='relu')(encoded)\n", 278 | "# # encoded = Dense(32, activation='relu')(encoded)\n", 279 | "# encoder_output = Dense(encoding_dim)(encoded)\n", 280 | "\n", 281 | "# decoded = Dense(600, activation='relu')(encoder_output)\n", 282 | "# # decoded = Dense(64, activation='relu')(decoded)\n", 283 | "# # decoded = Dense(128, activation='relu')(decoded)\n", 284 | "# decoded = Dense(364, activation='tanh')(decoded)\n", 285 | "\n", 286 | "# autoencoder = Model(inputs=input_dim, outputs=decoded)\n", 287 | "\n", 288 | "# encoder = Model(inputs=input_dim, outputs=encoder_output)\n", 289 | "\n", 290 | "# autoencoder.compile(optimizer='adam', loss='mse')\n", 291 | "# # training\n", 292 | "# autoencoder.fit(train_X.values, train_X.values, epochs=20, batch_size=150, shuffle=True)" 293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": 13, 298 | "metadata": { 299 | "collapsed": false 300 | }, 301 | "outputs": [], 302 | "source": [ 303 | "# new_train_feature = encoder.predict(train_X.values)\n", 304 | "# new_test_feature = encoder.predict(test_X.values)\n", 305 | "# print(new_train_feature.shape)\n", 306 | "# print(new_test_feature.shape)" 307 | ] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "execution_count": 27, 312 | "metadata": { 313 | "collapsed": false 314 | }, 315 | "outputs": [], 316 | "source": [ 317 | "X_train,X_val,y_train,y_val= train_test_split(train_X,Y_train,test_size=0.2,random_state=2)" 318 | ] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "execution_count": 29, 323 | "metadata": { 324 | "collapsed": false, 325 | "scrolled": true 326 | }, 327 | "outputs": [ 328 | { 329 | "data": { 330 | "text/plain": [ 331 | "GridSearchCV(cv=5, error_score='raise',\n", 332 | " estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,\n", 333 | " learning_rate=0.1, loss='deviance', max_depth=3,\n", 334 | " max_features=None, max_leaf_nodes=None,\n", 335 | " min_impurity_split=1e-07, min_samples_leaf=1,\n", 336 | " 
min_samples_split=2, min_weight_fraction_leaf=0.0,\n", 337 | " n_estimators=100, presort='auto', random_state=None,\n", 338 | " subsample=1.0, verbose=0, warm_start=False),\n", 339 | " fit_params={}, iid=True, n_jobs=1,\n", 340 | " param_grid=[{'learning_rate': [0.01, 0.1], 'n_estimators': range(100, 300, 500), 'max_depth': range(4, 8, 12)}],\n", 341 | " pre_dispatch='2*n_jobs', refit=True, return_train_score=True,\n", 342 | " scoring='roc_auc', verbose=0)" 343 | ] 344 | }, 345 | "execution_count": 29, 346 | "metadata": {}, 347 | "output_type": "execute_result" 348 | } 349 | ], 350 | "source": [ 351 | "tuned_parameters= [{'n_estimators':range(100,300,500),\n", 352 | " 'max_depth':range(4,8,12),\n", 353 | " 'learning_rate':[0.01, 0.1]\n", 354 | " }]\n", 355 | "clf = GridSearchCV(GradientBoostingClassifier(), tuned_parameters, cv=5, scoring='roc_auc')\n", 356 | "clf.fit(X_train, y_train)" 357 | ] 358 | }, 359 | { 360 | "cell_type": "code", 361 | "execution_count": 30, 362 | "metadata": { 363 | "collapsed": false 364 | }, 365 | "outputs": [ 366 | { 367 | "name": "stdout", 368 | "output_type": "stream", 369 | "text": [ 370 | "0.804463759173\n" 371 | ] 372 | } 373 | ], 374 | "source": [ 375 | "predictions = clf.predict_proba(X_val)\n", 376 | "pre = predictions[:,1]\n", 377 | "val_auc = metrics.roc_auc_score(y_val,pre)#验证集上的auc值\n", 378 | "print(val_auc)" 379 | ] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "execution_count": 36, 384 | "metadata": { 385 | "collapsed": false 386 | }, 387 | "outputs": [ 388 | { 389 | "data": { 390 | "text/plain": [ 391 | "(10000,)" 392 | ] 393 | }, 394 | "execution_count": 36, 395 | "metadata": {}, 396 | "output_type": "execute_result" 397 | } 398 | ], 399 | "source": [ 400 | "preds = clf.predict_proba(test_X)\n", 401 | "pred = preds[:,1]\n", 402 | "pred.shape" 403 | ] 404 | }, 405 | { 406 | "cell_type": "code", 407 | "execution_count": 37, 408 | "metadata": { 409 | "collapsed": true 410 | }, 411 | "outputs": [], 412 | "source": [ 413 
| "Submission = pd.DataFrame({'cust_id': test['cust_id'], 'pred_prob': pred})\n", 414 | "Submission.to_csv('../result/semi_gbdt.csv',index=False)" 415 | ] 416 | }, 417 | { 418 | "cell_type": "code", 419 | "execution_count": null, 420 | "metadata": { 421 | "collapsed": true 422 | }, 423 | "outputs": [], 424 | "source": [] 425 | }, 426 | { 427 | "cell_type": "code", 428 | "execution_count": null, 429 | "metadata": { 430 | "collapsed": true 431 | }, 432 | "outputs": [], 433 | "source": [] 434 | }, 435 | { 436 | "cell_type": "code", 437 | "execution_count": 57, 438 | "metadata": { 439 | "collapsed": false 440 | }, 441 | "outputs": [], 442 | "source": [ 443 | "xgb = pd.read_csv('../result/semi_xgb4.csv')" 444 | ] 445 | }, 446 | { 447 | "cell_type": "code", 448 | "execution_count": 58, 449 | "metadata": { 450 | "collapsed": true 451 | }, 452 | "outputs": [], 453 | "source": [ 454 | "lgb = pd.read_csv('../result/semi_lgb2.csv')" 455 | ] 456 | }, 457 | { 458 | "cell_type": "code", 459 | "execution_count": 59, 460 | "metadata": { 461 | "collapsed": true 462 | }, 463 | "outputs": [], 464 | "source": [ 465 | "result = xgb.pred_prob*0.3 + lgb.pred_prob*0.7" 466 | ] 467 | }, 468 | { 469 | "cell_type": "code", 470 | "execution_count": 60, 471 | "metadata": { 472 | "collapsed": false 473 | }, 474 | "outputs": [ 475 | { 476 | "data": { 477 | "text/html": [ 478 | "
\n", 479 | "\n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | "
cust_idpred_prob
010.038582
120.087885
230.342310
340.213558
450.193331
\n", 515 | "
" 516 | ], 517 | "text/plain": [ 518 | " cust_id pred_prob\n", 519 | "0 1 0.038582\n", 520 | "1 2 0.087885\n", 521 | "2 3 0.342310\n", 522 | "3 4 0.213558\n", 523 | "4 5 0.193331" 524 | ] 525 | }, 526 | "execution_count": 60, 527 | "metadata": {}, 528 | "output_type": "execute_result" 529 | } 530 | ], 531 | "source": [ 532 | "xgb.pred_prob = result\n", 533 | "xgb.head()" 534 | ] 535 | }, 536 | { 537 | "cell_type": "code", 538 | "execution_count": 61, 539 | "metadata": { 540 | "collapsed": true 541 | }, 542 | "outputs": [], 543 | "source": [ 544 | "xgb.to_csv('../result/semi_xgb_lgb1.csv',index= False)" 545 | ] 546 | }, 547 | { 548 | "cell_type": "code", 549 | "execution_count": null, 550 | "metadata": { 551 | "collapsed": true 552 | }, 553 | "outputs": [], 554 | "source": [] 555 | } 556 | ], 557 | "metadata": { 558 | "anaconda-cloud": {}, 559 | "kernelspec": { 560 | "display_name": "Python [default]", 561 | "language": "python", 562 | "name": "python3" 563 | }, 564 | "language_info": { 565 | "codemirror_mode": { 566 | "name": "ipython", 567 | "version": 3 568 | }, 569 | "file_extension": ".py", 570 | "mimetype": "text/x-python", 571 | "name": "python", 572 | "nbconvert_exporter": "python", 573 | "pygments_lexer": "ipython3", 574 | "version": "3.5.2" 575 | } 576 | }, 577 | "nbformat": 4, 578 | "nbformat_minor": 1 579 | } 580 | --------------------------------------------------------------------------------