├── .gitattributes
├── img
    ├── method1.PNG
    └── method2.PNG
├── 模型一代码b榜0.75673
    ├── result
    │   └── res
    │   │   └── ensemble.txt
    └── code
    │   ├── .ipynb_checkpoints
    │       └── ensemble-checkpoint.ipynb
    │   └── ensemble.ipynb
├── README.md
└── 模型二代码b榜0.749880
    └── code
        ├── ensemble1.ipynb
        ├── ensemble.ipynb
        ├── .ipynb_checkpoints
            ├── ensemble1-checkpoint.ipynb
            ├── ensemble-checkpoint.ipynb
            ├── gbdt预测-checkpoint.ipynb
            ├── _ensemble-checkpoint.ipynb
            ├── semi_ensemble-checkpoint.ipynb
            ├── lgb预测-checkpoint.ipynb
            ├── xgb预测-checkpoint.ipynb
            ├── semi_xgb预测-checkpoint.ipynb
            ├── semi_lgb预测-checkpoint.ipynb
            ├── nn预测-checkpoint.ipynb
            └── semi_gbdt预测-checkpoint.ipynb
        ├── xgb预测.ipynb
        ├── gbdt预测.ipynb
        ├── _ensemble.ipynb
        ├── semi_ensemble.ipynb
        ├── lgb预测.ipynb
        ├── semi_xgb预测.ipynb
        ├── semi_lgb预测.ipynb
        ├── nn预测.ipynb
        ├── semi_gbdt预测.ipynb
        └── semi_nn预测.ipynb


/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 | 


--------------------------------------------------------------------------------
/img/method1.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CuiNing6/2018-xinwang/HEAD/img/method1.PNG


--------------------------------------------------------------------------------
/img/method2.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CuiNing6/2018-xinwang/HEAD/img/method2.PNG


--------------------------------------------------------------------------------
/模型一代码b榜0.75673/result/res/ensemble.txt:
--------------------------------------------------------------------------------
 1 | xgb_all_feature       Submission      0.74577
 2 | lgb_all_feature_cv    Submission4     0.74
 3 | gbdt_all_feature      Submission9     0.73
 4 | nn_all_feature        Submission11    0.70
 5 | xgb_corr_feature      Submission12    0.70
 6 | 
 7 | Submission*0.5 + Submission4*0.5 = sub_0_4 = 0.7463
 8 | sub_0_4*0.7 + Submission9*0.3 = sub_0_4_9 = 0.74651
 9 | sub_0_4_9*0.7 + Submission11*0.3 = sub_0_4_9_11 = 0.74757
10 | sub_0_4_9_11*0.7 + Submission12*0.3 = sub_0_4_9_11_12 = 0.75152
11 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # 2018-新网银行杯Top1方案  
 2 | 比赛链接：http://www.dcjingsai.com/common/cmpt/西南财经大学“新网银行杯”数据科学竞赛_竞赛信息.html  
 3 | # 队伍名称：摸金校尉  
 4 | # 解决方案：
 5 | ## 基于集成学习的信用风险预测模型
 6 |    本次比赛通过机器学习和数据挖掘技术定量分析信用风险，给出每个样本的预测结果。首先，研究了违约客户和履约客户这两批客户的特征，其次，将机器学习领域比较流行的集成学习模型应用于信用风险评估领域，并利用主流的模型性能评价指标评价模型。在比赛中，对类别型数据进行哑编码，并搭建自编码网络提取特征，利用特征相关性，特征重要性，information value三个方法筛选特征，最后，选取基于加权平均法的集成学习模型和类别分布不平衡环境下基于加权平均法的半监督集成模型对数据进行建模，并使用AUC作为模型性能的具体评价指标，通过两种参数调节方法优化模型。在测试数据集上应用，竞赛结果验证了所构建的集成系统泛化能力较强，模型复杂度适中。  
 7 | ## 基于加权平均法的集成学习模型示意图：
 8 | ![基于加权平均法的集成学习模型](https://github.com/CuiNing6/2018-xinwang/blob/master/img/method1.PNG)
 9 | ## 基于加权平均法的半监督集成学习模型示意图：
10 | ![基于加权平均法的半监督集成学习模型](https://github.com/CuiNing6/2018-xinwang/blob/master/img/method2.PNG)
11 | # 代码说明：  
12 | * 模型一对应基于加权平均法的集成学习模型  
13 | * 模型二对应基于加权平均法的半监督集成学习模型  
14 | * 其他代码里存放了比赛过程中的一些尝试，但是最终没有用到，包括自编码网络提取特征，多模态集成和woe特征构建等等。
15 | 


--------------------------------------------------------------------------------
/模型一代码b榜0.75673/code/.ipynb_checkpoints/ensemble-checkpoint.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {
  7 |     "collapsed": true
  8 |    },
  9 |    "outputs": [],
 10 |    "source": [
 11 |     "import numpy as np\n",
 12 |     "import pandas as pd\n",
 13 |     "import matplotlib.pyplot as plt\n",
 14 |     "%matplotlib inline "
 15 |    ]
 16 |   },
 17 |   {
 18 |    "cell_type": "code",
 19 |    "execution_count": 2,
 20 |    "metadata": {
 21 |     "collapsed": true
 22 |    },
 23 |    "outputs": [],
 24 |    "source": [
 25 |     "sub1 = pd.read_csv('../result/Submission.csv')\n",
 26 |     "sub2 = pd.read_csv('../result/Submission4.csv')\n",
 27 |     "sub3 = pd.read_csv('../result/Submission9.csv')\n",
 28 |     "sub4 = pd.read_csv('../result/Submission11.csv')\n",
 29 |     "sub5 = pd.read_csv('../result/Submission12.csv')"
 30 |    ]
 31 |   },
 32 |   {
 33 |    "cell_type": "code",
 34 |    "execution_count": 3,
 35 |    "metadata": {
 36 |     "collapsed": true
 37 |    },
 38 |    "outputs": [],
 39 |    "source": [
 40 |     "ensemble = sub1\n",
 41 |     "ensemble['pred_prob'] = sub1['pred_prob']*0.5 + sub2['pred_prob']*0.5"
 42 |    ]
 43 |   },
 44 |   {
 45 |    "cell_type": "code",
 46 |    "execution_count": 4,
 47 |    "metadata": {
 48 |     "collapsed": true
 49 |    },
 50 |    "outputs": [],
 51 |    "source": [
 52 |     "ensemble1 = sub1\n",
 53 |     "ensemble1['pred_prob'] = ensemble['pred_prob']*0.7 + sub3['pred_prob']*0.3"
 54 |    ]
 55 |   },
 56 |   {
 57 |    "cell_type": "code",
 58 |    "execution_count": 5,
 59 |    "metadata": {
 60 |     "collapsed": true
 61 |    },
 62 |    "outputs": [],
 63 |    "source": [
 64 |     "ensemble2 = sub1\n",
 65 |     "ensemble2['pred_prob'] = ensemble1['pred_prob']*0.7 + sub4['pred_prob']*0.3"
 66 |    ]
 67 |   },
 68 |   {
 69 |    "cell_type": "code",
 70 |    "execution_count": 6,
 71 |    "metadata": {
 72 |     "collapsed": true
 73 |    },
 74 |    "outputs": [],
 75 |    "source": [
 76 |     "ensemble3 = sub1\n",
 77 |     "ensemble3['pred_prob'] = ensemble2['pred_prob']*0.7 + sub5['pred_prob']*0.3"
 78 |    ]
 79 |   },
 80 |   {
 81 |    "cell_type": "code",
 82 |    "execution_count": 7,
 83 |    "metadata": {
 84 |     "collapsed": false,
 85 |     "scrolled": true
 86 |    },
 87 |    "outputs": [],
 88 |    "source": [
 89 |     "ensemble3.to_csv('../result/ensemble.csv')"
 90 |    ]
 91 |   },
 92 |   {
 93 |    "cell_type": "code",
 94 |    "execution_count": null,
 95 |    "metadata": {
 96 |     "collapsed": true
 97 |    },
 98 |    "outputs": [],
 99 |    "source": []
100 |   }
101 |  ],
102 |  "metadata": {
103 |   "kernelspec": {
104 |    "display_name": "Python [default]",
105 |    "language": "python",
106 |    "name": "python3"
107 |   },
108 |   "language_info": {
109 |    "codemirror_mode": {
110 |     "name": "ipython",
111 |     "version": 3
112 |    },
113 |    "file_extension": ".py",
114 |    "mimetype": "text/x-python",
115 |    "name": "python",
116 |    "nbconvert_exporter": "python",
117 |    "pygments_lexer": "ipython3",
118 |    "version": "3.5.2"
119 |   }
120 |  },
121 |  "nbformat": 4,
122 |  "nbformat_minor": 1
123 | }
124 | 


--------------------------------------------------------------------------------
/模型一代码b榜0.75673/code/ensemble.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {
  7 |     "collapsed": true
  8 |    },
  9 |    "outputs": [],
 10 |    "source": [
 11 |     "import numpy as np\n",
 12 |     "import pandas as pd\n",
 13 |     "import matplotlib.pyplot as plt\n",
 14 |     "%matplotlib inline "
 15 |    ]
 16 |   },
 17 |   {
 18 |    "cell_type": "code",
 19 |    "execution_count": 2,
 20 |    "metadata": {
 21 |     "collapsed": true
 22 |    },
 23 |    "outputs": [],
 24 |    "source": [
 25 |     "sub1 = pd.read_csv('../result/Submission.csv')\n",
 26 |     "sub2 = pd.read_csv('../result/Submission4.csv')\n",
 27 |     "sub3 = pd.read_csv('../result/Submission9.csv')\n",
 28 |     "sub4 = pd.read_csv('../result/Submission11.csv')\n",
 29 |     "sub5 = pd.read_csv('../result/Submission12.csv')"
 30 |    ]
 31 |   },
 32 |   {
 33 |    "cell_type": "code",
 34 |    "execution_count": 3,
 35 |    "metadata": {
 36 |     "collapsed": true
 37 |    },
 38 |    "outputs": [],
 39 |    "source": [
 40 |     "ensemble = sub1.copy()  # copy: keep sub1 as the raw xgb submission so this cell can be re-run safely\n",
 41 |     "ensemble['pred_prob'] = sub1['pred_prob']*0.5 + sub2['pred_prob']*0.5"
 42 |    ]
 43 |   },
 44 |   {
 45 |    "cell_type": "code",
 46 |    "execution_count": 4,
 47 |    "metadata": {
 48 |     "collapsed": true
 49 |    },
 50 |    "outputs": [],
 51 |    "source": [
 52 |     "ensemble1 = ensemble.copy()  # new frame per stage; previous stage stays intact\n",
 53 |     "ensemble1['pred_prob'] = ensemble['pred_prob']*0.7 + sub3['pred_prob']*0.3"
 54 |    ]
 55 |   },
 56 |   {
 57 |    "cell_type": "code",
 58 |    "execution_count": 5,
 59 |    "metadata": {
 60 |     "collapsed": true
 61 |    },
 62 |    "outputs": [],
 63 |    "source": [
 64 |     "ensemble2 = ensemble1.copy()  # new frame per stage; previous stage stays intact\n",
 65 |     "ensemble2['pred_prob'] = ensemble1['pred_prob']*0.7 + sub4['pred_prob']*0.3"
 66 |    ]
 67 |   },
 68 |   {
 69 |    "cell_type": "code",
 70 |    "execution_count": 6,
 71 |    "metadata": {
 72 |     "collapsed": true
 73 |    },
 74 |    "outputs": [],
 75 |    "source": [
 76 |     "ensemble3 = ensemble2.copy()  # final blend; written to ../result/ensemble.csv below\n",
 77 |     "ensemble3['pred_prob'] = ensemble2['pred_prob']*0.7 + sub5['pred_prob']*0.3"
 78 |    ]
 79 |   },
 80 |   {
 81 |    "cell_type": "code",
 82 |    "execution_count": 9,
 83 |    "metadata": {
 84 |     "collapsed": false,
 85 |     "scrolled": true
 86 |    },
 87 |    "outputs": [],
 88 |    "source": [
 89 |     "ensemble3.to_csv('../result/ensemble.csv',index=False)"
 90 |    ]
 91 |   },
 92 |   {
 93 |    "cell_type": "code",
 94 |    "execution_count": null,
 95 |    "metadata": {
 96 |     "collapsed": true
 97 |    },
 98 |    "outputs": [],
 99 |    "source": []
100 |   }
101 |  ],
102 |  "metadata": {
103 |   "anaconda-cloud": {},
104 |   "kernelspec": {
105 |    "display_name": "Python [default]",
106 |    "language": "python",
107 |    "name": "python3"
108 |   },
109 |   "language_info": {
110 |    "codemirror_mode": {
111 |     "name": "ipython",
112 |     "version": 3
113 |    },
114 |    "file_extension": ".py",
115 |    "mimetype": "text/x-python",
116 |    "name": "python",
117 |    "nbconvert_exporter": "python",
118 |    "pygments_lexer": "ipython3",
119 |    "version": "3.5.2"
120 |   }
121 |  },
122 |  "nbformat": 4,
123 |  "nbformat_minor": 1
124 | }
125 | 


--------------------------------------------------------------------------------
/模型二代码b榜0.749880/code/ensemble1.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {
  7 |     "collapsed": true
  8 |    },
  9 |    "outputs": [],
 10 |    "source": [
 11 |     "import numpy as np\n",
 12 |     "import pandas as pd\n",
 13 |     "import matplotlib.pyplot as plt\n",
 14 |     "%matplotlib inline "
 15 |    ]
 16 |   },
 17 |   {
 18 |    "cell_type": "code",
 19 |    "execution_count": 5,
 20 |    "metadata": {
 21 |     "collapsed": true
 22 |    },
 23 |    "outputs": [],
 24 |    "source": [
 25 |     "ensemble = pd.read_csv('../result/free2/sub_0_4_9_11_12.csv')"
 26 |    ]
 27 |   },
 28 |   {
 29 |    "cell_type": "code",
 30 |    "execution_count": 6,
 31 |    "metadata": {
 32 |     "collapsed": true
 33 |    },
 34 |    "outputs": [],
 35 |    "source": [
 36 |     "semi_ensemble = pd.read_csv('../result/semi_ensemble.csv')"
 37 |    ]
 38 |   },
 39 |   {
 40 |    "cell_type": "code",
 41 |    "execution_count": 8,
 42 |    "metadata": {
 43 |     "collapsed": false
 44 |    },
 45 |    "outputs": [
 46 |     {
 47 |      "data": {
 48 |       "text/html": [
 49 |        "<div>\n",
 50 |        "<table border=\"1\" class=\"dataframe\">\n",
 51 |        "  <thead>\n",
 52 |        "    <tr style=\"text-align: right;\">\n",
 53 |        "      <th></th>\n",
 54 |        "      <th>cust_id</th>\n",
 55 |        "      <th>pred_prob</th>\n",
 56 |        "    </tr>\n",
 57 |        "  </thead>\n",
 58 |        "  <tbody>\n",
 59 |        "    <tr>\n",
 60 |        "      <th>0</th>\n",
 61 |        "      <td>1</td>\n",
 62 |        "      <td>0.016407</td>\n",
 63 |        "    </tr>\n",
 64 |        "    <tr>\n",
 65 |        "      <th>1</th>\n",
 66 |        "      <td>2</td>\n",
 67 |        "      <td>0.051909</td>\n",
 68 |        "    </tr>\n",
 69 |        "    <tr>\n",
 70 |        "      <th>2</th>\n",
 71 |        "      <td>3</td>\n",
 72 |        "      <td>0.194874</td>\n",
 73 |        "    </tr>\n",
 74 |        "    <tr>\n",
 75 |        "      <th>3</th>\n",
 76 |        "      <td>4</td>\n",
 77 |        "      <td>0.081682</td>\n",
 78 |        "    </tr>\n",
 79 |        "    <tr>\n",
 80 |        "      <th>4</th>\n",
 81 |        "      <td>5</td>\n",
 82 |        "      <td>0.154739</td>\n",
 83 |        "    </tr>\n",
 84 |        "  </tbody>\n",
 85 |        "</table>\n",
 86 |        "</div>"
 87 |       ],
 88 |       "text/plain": [
 89 |        "   cust_id  pred_prob\n",
 90 |        "0        1   0.016407\n",
 91 |        "1        2   0.051909\n",
 92 |        "2        3   0.194874\n",
 93 |        "3        4   0.081682\n",
 94 |        "4        5   0.154739"
 95 |       ]
 96 |      },
 97 |      "execution_count": 8,
 98 |      "metadata": {},
 99 |      "output_type": "execute_result"
100 |     }
101 |    ],
102 |    "source": [
103 |     "ensemble.head()"
104 |    ]
105 |   },
106 |   {
107 |    "cell_type": "code",
108 |    "execution_count": 9,
109 |    "metadata": {
110 |     "collapsed": true
111 |    },
112 |    "outputs": [],
113 |    "source": [
114 |     "result = ensemble.pred_prob*0.7 + semi_ensemble.pred_prob*0.3"
115 |    ]
116 |   },
117 |   {
118 |    "cell_type": "code",
119 |    "execution_count": 10,
120 |    "metadata": {
121 |     "collapsed": true
122 |    },
123 |    "outputs": [],
124 |    "source": [
125 |     "pred = ensemble.copy()  # was `_ensemble`, which is never defined in this notebook (NameError on a fresh run)\n",
126 |     "pred.pred_prob = result"
127 |    ]
128 |   },
129 |   {
130 |    "cell_type": "code",
131 |    "execution_count": 11,
132 |    "metadata": {
133 |     "collapsed": false
134 |    },
135 |    "outputs": [
136 |     {
137 |      "data": {
138 |       "text/html": [
139 |        "<div>\n",
140 |        "<table border=\"1\" class=\"dataframe\">\n",
141 |        "  <thead>\n",
142 |        "    <tr style=\"text-align: right;\">\n",
143 |        "      <th></th>\n",
144 |        "      <th>cust_id</th>\n",
145 |        "      <th>pred_prob</th>\n",
146 |        "    </tr>\n",
147 |        "  </thead>\n",
148 |        "  <tbody>\n",
149 |        "    <tr>\n",
150 |        "      <th>0</th>\n",
151 |        "      <td>1</td>\n",
152 |        "      <td>0.018662</td>\n",
153 |        "    </tr>\n",
154 |        "    <tr>\n",
155 |        "      <th>1</th>\n",
156 |        "      <td>2</td>\n",
157 |        "      <td>0.051679</td>\n",
158 |        "    </tr>\n",
159 |        "    <tr>\n",
160 |        "      <th>2</th>\n",
161 |        "      <td>3</td>\n",
162 |        "      <td>0.191229</td>\n",
163 |        "    </tr>\n",
164 |        "    <tr>\n",
165 |        "      <th>3</th>\n",
166 |        "      <td>4</td>\n",
167 |        "      <td>0.092566</td>\n",
168 |        "    </tr>\n",
169 |        "    <tr>\n",
170 |        "      <th>4</th>\n",
171 |        "      <td>5</td>\n",
172 |        "      <td>0.142426</td>\n",
173 |        "    </tr>\n",
174 |        "  </tbody>\n",
175 |        "</table>\n",
176 |        "</div>"
177 |       ],
178 |       "text/plain": [
179 |        "   cust_id  pred_prob\n",
180 |        "0        1   0.018662\n",
181 |        "1        2   0.051679\n",
182 |        "2        3   0.191229\n",
183 |        "3        4   0.092566\n",
184 |        "4        5   0.142426"
185 |       ]
186 |      },
187 |      "execution_count": 11,
188 |      "metadata": {},
189 |      "output_type": "execute_result"
190 |     }
191 |    ],
192 |    "source": [
193 |     "pred.head()"
194 |    ]
195 |   },
196 |   {
197 |    "cell_type": "code",
198 |    "execution_count": 12,
199 |    "metadata": {
200 |     "collapsed": true
201 |    },
202 |    "outputs": [],
203 |    "source": [
204 |     "pred.to_csv('../result/ensemble_final1.csv',index = False)"
205 |    ]
206 |   },
207 |   {
208 |    "cell_type": "code",
209 |    "execution_count": null,
210 |    "metadata": {
211 |     "collapsed": true
212 |    },
213 |    "outputs": [],
214 |    "source": []
215 |   }
216 |  ],
217 |  "metadata": {
218 |   "kernelspec": {
219 |    "display_name": "Python [default]",
220 |    "language": "python",
221 |    "name": "python3"
222 |   },
223 |   "language_info": {
224 |    "codemirror_mode": {
225 |     "name": "ipython",
226 |     "version": 3
227 |    },
228 |    "file_extension": ".py",
229 |    "mimetype": "text/x-python",
230 |    "name": "python",
231 |    "nbconvert_exporter": "python",
232 |    "pygments_lexer": "ipython3",
233 |    "version": "3.5.2"
234 |   }
235 |  },
236 |  "nbformat": 4,
237 |  "nbformat_minor": 1
238 | }
239 | 


--------------------------------------------------------------------------------
/模型二代码b榜0.749880/code/ensemble.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {
  7 |     "collapsed": true
  8 |    },
  9 |    "outputs": [],
 10 |    "source": [
 11 |     "import numpy as np\n",
 12 |     "import pandas as pd\n",
 13 |     "import matplotlib.pyplot as plt\n",
 14 |     "%matplotlib inline"
 15 |    ]
 16 |   },
 17 |   {
 18 |    "cell_type": "code",
 19 |    "execution_count": 2,
 20 |    "metadata": {
 21 |     "collapsed": true
 22 |    },
 23 |    "outputs": [],
 24 |    "source": [
 25 |     "semi_ensemble = pd.read_csv('../result/semi_ensemble.csv')"
 26 |    ]
 27 |   },
 28 |   {
 29 |    "cell_type": "code",
 30 |    "execution_count": 4,
 31 |    "metadata": {
 32 |     "collapsed": false
 33 |    },
 34 |    "outputs": [],
 35 |    "source": [
 36 |     "_ensemble = pd.read_csv('../result/_ensemble.csv')"
 37 |    ]
 38 |   },
 39 |   {
 40 |    "cell_type": "code",
 41 |    "execution_count": 5,
 42 |    "metadata": {
 43 |     "collapsed": false
 44 |    },
 45 |    "outputs": [
 46 |     {
 47 |      "data": {
 48 |       "text/html": [
 49 |        "<div>\n",
 50 |        "<table border=\"1\" class=\"dataframe\">\n",
 51 |        "  <thead>\n",
 52 |        "    <tr style=\"text-align: right;\">\n",
 53 |        "      <th></th>\n",
 54 |        "      <th>cust_id</th>\n",
 55 |        "      <th>pred_prob</th>\n",
 56 |        "    </tr>\n",
 57 |        "  </thead>\n",
 58 |        "  <tbody>\n",
 59 |        "    <tr>\n",
 60 |        "      <th>0</th>\n",
 61 |        "      <td>1</td>\n",
 62 |        "      <td>0.014872</td>\n",
 63 |        "    </tr>\n",
 64 |        "    <tr>\n",
 65 |        "      <th>1</th>\n",
 66 |        "      <td>2</td>\n",
 67 |        "      <td>0.036687</td>\n",
 68 |        "    </tr>\n",
 69 |        "    <tr>\n",
 70 |        "      <th>2</th>\n",
 71 |        "      <td>3</td>\n",
 72 |        "      <td>0.194181</td>\n",
 73 |        "    </tr>\n",
 74 |        "    <tr>\n",
 75 |        "      <th>3</th>\n",
 76 |        "      <td>4</td>\n",
 77 |        "      <td>0.064974</td>\n",
 78 |        "    </tr>\n",
 79 |        "    <tr>\n",
 80 |        "      <th>4</th>\n",
 81 |        "      <td>5</td>\n",
 82 |        "      <td>0.118509</td>\n",
 83 |        "    </tr>\n",
 84 |        "  </tbody>\n",
 85 |        "</table>\n",
 86 |        "</div>"
 87 |       ],
 88 |       "text/plain": [
 89 |        "   cust_id  pred_prob\n",
 90 |        "0        1   0.014872\n",
 91 |        "1        2   0.036687\n",
 92 |        "2        3   0.194181\n",
 93 |        "3        4   0.064974\n",
 94 |        "4        5   0.118509"
 95 |       ]
 96 |      },
 97 |      "execution_count": 5,
 98 |      "metadata": {},
 99 |      "output_type": "execute_result"
100 |     }
101 |    ],
102 |    "source": [
103 |     "_ensemble.head()"
104 |    ]
105 |   },
106 |   {
107 |    "cell_type": "code",
108 |    "execution_count": 6,
109 |    "metadata": {
110 |     "collapsed": true
111 |    },
112 |    "outputs": [],
113 |    "source": [
114 |     "result = semi_ensemble.pred_prob*0.4 + _ensemble.pred_prob*0.6"
115 |    ]
116 |   },
117 |   {
118 |    "cell_type": "code",
119 |    "execution_count": 7,
120 |    "metadata": {
121 |     "collapsed": true
122 |    },
123 |    "outputs": [],
124 |    "source": [
125 |     "pred = _ensemble\n",
126 |     "pred.pred_prob = result"
127 |    ]
128 |   },
129 |   {
130 |    "cell_type": "code",
131 |    "execution_count": 8,
132 |    "metadata": {
133 |     "collapsed": false
134 |    },
135 |    "outputs": [
136 |     {
137 |      "data": {
138 |       "text/html": [
139 |        "<div>\n",
140 |        "<table border=\"1\" class=\"dataframe\">\n",
141 |        "  <thead>\n",
142 |        "    <tr style=\"text-align: right;\">\n",
143 |        "      <th></th>\n",
144 |        "      <th>cust_id</th>\n",
145 |        "      <th>pred_prob</th>\n",
146 |        "    </tr>\n",
147 |        "  </thead>\n",
148 |        "  <tbody>\n",
149 |        "    <tr>\n",
150 |        "      <th>0</th>\n",
151 |        "      <td>1</td>\n",
152 |        "      <td>0.015424</td>\n",
153 |        "    </tr>\n",
154 |        "    <tr>\n",
155 |        "      <th>1</th>\n",
156 |        "      <td>2</td>\n",
157 |        "      <td>0.034652</td>\n",
158 |        "    </tr>\n",
159 |        "    <tr>\n",
160 |        "      <th>2</th>\n",
161 |        "      <td>3</td>\n",
162 |        "      <td>0.146920</td>\n",
163 |        "    </tr>\n",
164 |        "    <tr>\n",
165 |        "      <th>3</th>\n",
166 |        "      <td>4</td>\n",
167 |        "      <td>0.062570</td>\n",
168 |        "    </tr>\n",
169 |        "    <tr>\n",
170 |        "      <th>4</th>\n",
171 |        "      <td>5</td>\n",
172 |        "      <td>0.093373</td>\n",
173 |        "    </tr>\n",
174 |        "  </tbody>\n",
175 |        "</table>\n",
176 |        "</div>"
177 |       ],
178 |       "text/plain": [
179 |        "   cust_id  pred_prob\n",
180 |        "0        1   0.015424\n",
181 |        "1        2   0.034652\n",
182 |        "2        3   0.146920\n",
183 |        "3        4   0.062570\n",
184 |        "4        5   0.093373"
185 |       ]
186 |      },
187 |      "execution_count": 8,
188 |      "metadata": {},
189 |      "output_type": "execute_result"
190 |     }
191 |    ],
192 |    "source": [
193 |     "pred.head()"
194 |    ]
195 |   },
196 |   {
197 |    "cell_type": "code",
198 |    "execution_count": 9,
199 |    "metadata": {
200 |     "collapsed": true
201 |    },
202 |    "outputs": [],
203 |    "source": [
204 |     "pred.to_csv('../result/ensemble_final.csv',index = False)"
205 |    ]
206 |   },
207 |   {
208 |    "cell_type": "code",
209 |    "execution_count": null,
210 |    "metadata": {
211 |     "collapsed": true
212 |    },
213 |    "outputs": [],
214 |    "source": []
215 |   }
216 |  ],
217 |  "metadata": {
218 |   "anaconda-cloud": {},
219 |   "kernelspec": {
220 |    "display_name": "Python [default]",
221 |    "language": "python",
222 |    "name": "python3"
223 |   },
224 |   "language_info": {
225 |    "codemirror_mode": {
226 |     "name": "ipython",
227 |     "version": 3
228 |    },
229 |    "file_extension": ".py",
230 |    "mimetype": "text/x-python",
231 |    "name": "python",
232 |    "nbconvert_exporter": "python",
233 |    "pygments_lexer": "ipython3",
234 |    "version": "3.5.2"
235 |   }
236 |  },
237 |  "nbformat": 4,
238 |  "nbformat_minor": 1
239 | }
240 | 


--------------------------------------------------------------------------------
/模型二代码b榜0.749880/code/.ipynb_checkpoints/ensemble1-checkpoint.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {
  7 |     "collapsed": true
  8 |    },
  9 |    "outputs": [],
 10 |    "source": [
 11 |     "import numpy as np\n",
 12 |     "import pandas as pd\n",
 13 |     "import matplotlib.pyplot as plt\n",
 14 |     "%matplotlib inline "
 15 |    ]
 16 |   },
 17 |   {
 18 |    "cell_type": "code",
 19 |    "execution_count": 5,
 20 |    "metadata": {
 21 |     "collapsed": true
 22 |    },
 23 |    "outputs": [],
 24 |    "source": [
 25 |     "ensemble = pd.read_csv('../result/free2/sub_0_4_9_11_12.csv')"
 26 |    ]
 27 |   },
 28 |   {
 29 |    "cell_type": "code",
 30 |    "execution_count": 6,
 31 |    "metadata": {
 32 |     "collapsed": true
 33 |    },
 34 |    "outputs": [],
 35 |    "source": [
 36 |     "semi_ensemble = pd.read_csv('../result/semi_ensemble.csv')"
 37 |    ]
 38 |   },
 39 |   {
 40 |    "cell_type": "code",
 41 |    "execution_count": 8,
 42 |    "metadata": {
 43 |     "collapsed": false
 44 |    },
 45 |    "outputs": [
 46 |     {
 47 |      "data": {
 48 |       "text/html": [
 49 |        "<div>\n",
 50 |        "<table border=\"1\" class=\"dataframe\">\n",
 51 |        "  <thead>\n",
 52 |        "    <tr style=\"text-align: right;\">\n",
 53 |        "      <th></th>\n",
 54 |        "      <th>cust_id</th>\n",
 55 |        "      <th>pred_prob</th>\n",
 56 |        "    </tr>\n",
 57 |        "  </thead>\n",
 58 |        "  <tbody>\n",
 59 |        "    <tr>\n",
 60 |        "      <th>0</th>\n",
 61 |        "      <td>1</td>\n",
 62 |        "      <td>0.016407</td>\n",
 63 |        "    </tr>\n",
 64 |        "    <tr>\n",
 65 |        "      <th>1</th>\n",
 66 |        "      <td>2</td>\n",
 67 |        "      <td>0.051909</td>\n",
 68 |        "    </tr>\n",
 69 |        "    <tr>\n",
 70 |        "      <th>2</th>\n",
 71 |        "      <td>3</td>\n",
 72 |        "      <td>0.194874</td>\n",
 73 |        "    </tr>\n",
 74 |        "    <tr>\n",
 75 |        "      <th>3</th>\n",
 76 |        "      <td>4</td>\n",
 77 |        "      <td>0.081682</td>\n",
 78 |        "    </tr>\n",
 79 |        "    <tr>\n",
 80 |        "      <th>4</th>\n",
 81 |        "      <td>5</td>\n",
 82 |        "      <td>0.154739</td>\n",
 83 |        "    </tr>\n",
 84 |        "  </tbody>\n",
 85 |        "</table>\n",
 86 |        "</div>"
 87 |       ],
 88 |       "text/plain": [
 89 |        "   cust_id  pred_prob\n",
 90 |        "0        1   0.016407\n",
 91 |        "1        2   0.051909\n",
 92 |        "2        3   0.194874\n",
 93 |        "3        4   0.081682\n",
 94 |        "4        5   0.154739"
 95 |       ]
 96 |      },
 97 |      "execution_count": 8,
 98 |      "metadata": {},
 99 |      "output_type": "execute_result"
100 |     }
101 |    ],
102 |    "source": [
103 |     "ensemble.head()"
104 |    ]
105 |   },
106 |   {
107 |    "cell_type": "code",
108 |    "execution_count": 9,
109 |    "metadata": {
110 |     "collapsed": true
111 |    },
112 |    "outputs": [],
113 |    "source": [
114 |     "result = ensemble.pred_prob*0.7 + semi_ensemble.pred_prob*0.3"
115 |    ]
116 |   },
117 |   {
118 |    "cell_type": "code",
119 |    "execution_count": 10,
120 |    "metadata": {
121 |     "collapsed": true
122 |    },
123 |    "outputs": [],
124 |    "source": [
125 |     "pred = _ensemble\n",
126 |     "pred.pred_prob = result"
127 |    ]
128 |   },
129 |   {
130 |    "cell_type": "code",
131 |    "execution_count": 11,
132 |    "metadata": {
133 |     "collapsed": false
134 |    },
135 |    "outputs": [
136 |     {
137 |      "data": {
138 |       "text/html": [
139 |        "<div>\n",
140 |        "<table border=\"1\" class=\"dataframe\">\n",
141 |        "  <thead>\n",
142 |        "    <tr style=\"text-align: right;\">\n",
143 |        "      <th></th>\n",
144 |        "      <th>cust_id</th>\n",
145 |        "      <th>pred_prob</th>\n",
146 |        "    </tr>\n",
147 |        "  </thead>\n",
148 |        "  <tbody>\n",
149 |        "    <tr>\n",
150 |        "      <th>0</th>\n",
151 |        "      <td>1</td>\n",
152 |        "      <td>0.018662</td>\n",
153 |        "    </tr>\n",
154 |        "    <tr>\n",
155 |        "      <th>1</th>\n",
156 |        "      <td>2</td>\n",
157 |        "      <td>0.051679</td>\n",
158 |        "    </tr>\n",
159 |        "    <tr>\n",
160 |        "      <th>2</th>\n",
161 |        "      <td>3</td>\n",
162 |        "      <td>0.191229</td>\n",
163 |        "    </tr>\n",
164 |        "    <tr>\n",
165 |        "      <th>3</th>\n",
166 |        "      <td>4</td>\n",
167 |        "      <td>0.092566</td>\n",
168 |        "    </tr>\n",
169 |        "    <tr>\n",
170 |        "      <th>4</th>\n",
171 |        "      <td>5</td>\n",
172 |        "      <td>0.142426</td>\n",
173 |        "    </tr>\n",
174 |        "  </tbody>\n",
175 |        "</table>\n",
176 |        "</div>"
177 |       ],
178 |       "text/plain": [
179 |        "   cust_id  pred_prob\n",
180 |        "0        1   0.018662\n",
181 |        "1        2   0.051679\n",
182 |        "2        3   0.191229\n",
183 |        "3        4   0.092566\n",
184 |        "4        5   0.142426"
185 |       ]
186 |      },
187 |      "execution_count": 11,
188 |      "metadata": {},
189 |      "output_type": "execute_result"
190 |     }
191 |    ],
192 |    "source": [
193 |     "pred.head()"
194 |    ]
195 |   },
196 |   {
197 |    "cell_type": "code",
198 |    "execution_count": 12,
199 |    "metadata": {
200 |     "collapsed": true
201 |    },
202 |    "outputs": [],
203 |    "source": [
204 |     "pred.to_csv('../result/ensemble_final1.csv',index = False)"
205 |    ]
206 |   },
207 |   {
208 |    "cell_type": "code",
209 |    "execution_count": null,
210 |    "metadata": {
211 |     "collapsed": true
212 |    },
213 |    "outputs": [],
214 |    "source": []
215 |   }
216 |  ],
217 |  "metadata": {
218 |   "kernelspec": {
219 |    "display_name": "Python [default]",
220 |    "language": "python",
221 |    "name": "python3"
222 |   },
223 |   "language_info": {
224 |    "codemirror_mode": {
225 |     "name": "ipython",
226 |     "version": 3
227 |    },
228 |    "file_extension": ".py",
229 |    "mimetype": "text/x-python",
230 |    "name": "python",
231 |    "nbconvert_exporter": "python",
232 |    "pygments_lexer": "ipython3",
233 |    "version": "3.5.2"
234 |   }
235 |  },
236 |  "nbformat": 4,
237 |  "nbformat_minor": 1
238 | }
239 | 


--------------------------------------------------------------------------------
/模型二代码b榜0.749880/code/.ipynb_checkpoints/ensemble-checkpoint.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {
  7 |     "collapsed": true
  8 |    },
  9 |    "outputs": [],
 10 |    "source": [
 11 |     "import numpy as np\n",
 12 |     "import pandas as pd\n",
 13 |     "import matplotlib.pyplot as plt\n",
 14 |     "%matplotlib inline"
 15 |    ]
 16 |   },
 17 |   {
 18 |    "cell_type": "code",
 19 |    "execution_count": 2,
 20 |    "metadata": {
 21 |     "collapsed": true
 22 |    },
 23 |    "outputs": [],
 24 |    "source": [
 25 |     "semi_ensemble = pd.read_csv('../result/semi_ensemble.csv')"
 26 |    ]
 27 |   },
 28 |   {
 29 |    "cell_type": "code",
 30 |    "execution_count": 4,
 31 |    "metadata": {
 32 |     "collapsed": false
 33 |    },
 34 |    "outputs": [],
 35 |    "source": [
 36 |     "_ensemble = pd.read_csv('../result/_ensemble.csv')"
 37 |    ]
 38 |   },
 39 |   {
 40 |    "cell_type": "code",
 41 |    "execution_count": 5,
 42 |    "metadata": {
 43 |     "collapsed": false
 44 |    },
 45 |    "outputs": [
 46 |     {
 47 |      "data": {
 48 |       "text/html": [
 49 |        "<div>\n",
 50 |        "<table border=\"1\" class=\"dataframe\">\n",
 51 |        "  <thead>\n",
 52 |        "    <tr style=\"text-align: right;\">\n",
 53 |        "      <th></th>\n",
 54 |        "      <th>cust_id</th>\n",
 55 |        "      <th>pred_prob</th>\n",
 56 |        "    </tr>\n",
 57 |        "  </thead>\n",
 58 |        "  <tbody>\n",
 59 |        "    <tr>\n",
 60 |        "      <th>0</th>\n",
 61 |        "      <td>1</td>\n",
 62 |        "      <td>0.014872</td>\n",
 63 |        "    </tr>\n",
 64 |        "    <tr>\n",
 65 |        "      <th>1</th>\n",
 66 |        "      <td>2</td>\n",
 67 |        "      <td>0.036687</td>\n",
 68 |        "    </tr>\n",
 69 |        "    <tr>\n",
 70 |        "      <th>2</th>\n",
 71 |        "      <td>3</td>\n",
 72 |        "      <td>0.194181</td>\n",
 73 |        "    </tr>\n",
 74 |        "    <tr>\n",
 75 |        "      <th>3</th>\n",
 76 |        "      <td>4</td>\n",
 77 |        "      <td>0.064974</td>\n",
 78 |        "    </tr>\n",
 79 |        "    <tr>\n",
 80 |        "      <th>4</th>\n",
 81 |        "      <td>5</td>\n",
 82 |        "      <td>0.118509</td>\n",
 83 |        "    </tr>\n",
 84 |        "  </tbody>\n",
 85 |        "</table>\n",
 86 |        "</div>"
 87 |       ],
 88 |       "text/plain": [
 89 |        "   cust_id  pred_prob\n",
 90 |        "0        1   0.014872\n",
 91 |        "1        2   0.036687\n",
 92 |        "2        3   0.194181\n",
 93 |        "3        4   0.064974\n",
 94 |        "4        5   0.118509"
 95 |       ]
 96 |      },
 97 |      "execution_count": 5,
 98 |      "metadata": {},
 99 |      "output_type": "execute_result"
100 |     }
101 |    ],
102 |    "source": [
103 |     "_ensemble.head()"
104 |    ]
105 |   },
106 |   {
107 |    "cell_type": "code",
108 |    "execution_count": 6,
109 |    "metadata": {
110 |     "collapsed": true
111 |    },
112 |    "outputs": [],
113 |    "source": [
114 |     "result = semi_ensemble.pred_prob*0.4 + _ensemble.pred_prob*0.6"
115 |    ]
116 |   },
117 |   {
118 |    "cell_type": "code",
119 |    "execution_count": 7,
120 |    "metadata": {
121 |     "collapsed": true
122 |    },
123 |    "outputs": [],
124 |    "source": [
125 |     "pred = _ensemble.copy()  # copy so the loaded _ensemble frame is not overwritten when the blend is stored\n",
126 |     "pred['pred_prob'] = result"
127 |    ]
128 |   },
129 |   {
130 |    "cell_type": "code",
131 |    "execution_count": 8,
132 |    "metadata": {
133 |     "collapsed": false
134 |    },
135 |    "outputs": [
136 |     {
137 |      "data": {
138 |       "text/html": [
139 |        "<div>\n",
140 |        "<table border=\"1\" class=\"dataframe\">\n",
141 |        "  <thead>\n",
142 |        "    <tr style=\"text-align: right;\">\n",
143 |        "      <th></th>\n",
144 |        "      <th>cust_id</th>\n",
145 |        "      <th>pred_prob</th>\n",
146 |        "    </tr>\n",
147 |        "  </thead>\n",
148 |        "  <tbody>\n",
149 |        "    <tr>\n",
150 |        "      <th>0</th>\n",
151 |        "      <td>1</td>\n",
152 |        "      <td>0.015424</td>\n",
153 |        "    </tr>\n",
154 |        "    <tr>\n",
155 |        "      <th>1</th>\n",
156 |        "      <td>2</td>\n",
157 |        "      <td>0.034652</td>\n",
158 |        "    </tr>\n",
159 |        "    <tr>\n",
160 |        "      <th>2</th>\n",
161 |        "      <td>3</td>\n",
162 |        "      <td>0.146920</td>\n",
163 |        "    </tr>\n",
164 |        "    <tr>\n",
165 |        "      <th>3</th>\n",
166 |        "      <td>4</td>\n",
167 |        "      <td>0.062570</td>\n",
168 |        "    </tr>\n",
169 |        "    <tr>\n",
170 |        "      <th>4</th>\n",
171 |        "      <td>5</td>\n",
172 |        "      <td>0.093373</td>\n",
173 |        "    </tr>\n",
174 |        "  </tbody>\n",
175 |        "</table>\n",
176 |        "</div>"
177 |       ],
178 |       "text/plain": [
179 |        "   cust_id  pred_prob\n",
180 |        "0        1   0.015424\n",
181 |        "1        2   0.034652\n",
182 |        "2        3   0.146920\n",
183 |        "3        4   0.062570\n",
184 |        "4        5   0.093373"
185 |       ]
186 |      },
187 |      "execution_count": 8,
188 |      "metadata": {},
189 |      "output_type": "execute_result"
190 |     }
191 |    ],
192 |    "source": [
193 |     "pred.head()"
194 |    ]
195 |   },
196 |   {
197 |    "cell_type": "code",
198 |    "execution_count": 9,
199 |    "metadata": {
200 |     "collapsed": true
201 |    },
202 |    "outputs": [],
203 |    "source": [
204 |     "pred.to_csv('../result/ensemble_final.csv',index = False)"
205 |    ]
206 |   },
207 |   {
208 |    "cell_type": "code",
209 |    "execution_count": null,
210 |    "metadata": {
211 |     "collapsed": true
212 |    },
213 |    "outputs": [],
214 |    "source": []
215 |   }
216 |  ],
217 |  "metadata": {
218 |   "anaconda-cloud": {},
219 |   "kernelspec": {
220 |    "display_name": "Python [default]",
221 |    "language": "python",
222 |    "name": "python3"
223 |   },
224 |   "language_info": {
225 |    "codemirror_mode": {
226 |     "name": "ipython",
227 |     "version": 3
228 |    },
229 |    "file_extension": ".py",
230 |    "mimetype": "text/x-python",
231 |    "name": "python",
232 |    "nbconvert_exporter": "python",
233 |    "pygments_lexer": "ipython3",
234 |    "version": "3.5.2"
235 |   }
236 |  },
237 |  "nbformat": 4,
238 |  "nbformat_minor": 1
239 | }
240 | 


--------------------------------------------------------------------------------
/模型二代码b榜0.749880/code/xgb预测.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {
  7 |     "collapsed": true
  8 |    },
  9 |    "outputs": [],
 10 |    "source": [
 11 |     "import numpy as np\n",
 12 |     "import pandas as pd\n",
 13 |     "import matplotlib.pyplot as plt\n",
 14 |     "%matplotlib inline "
 15 |    ]
 16 |   },
 17 |   {
 18 |    "cell_type": "code",
 19 |    "execution_count": 3,
 20 |    "metadata": {
 21 |     "collapsed": false
 22 |    },
 23 |    "outputs": [],
 24 |    "source": [
 25 |     "train = pd.read_csv('../data/train_xy.csv')"
 26 |    ]
 27 |   },
 28 |   {
 29 |    "cell_type": "code",
 30 |    "execution_count": 4,
 31 |    "metadata": {
 32 |     "collapsed": true
 33 |    },
 34 |    "outputs": [],
 35 |    "source": [
 36 |     "test = pd.read_csv('../data/test_all.csv')"
 37 |    ]
 38 |   },
 39 |   {
 40 |    "cell_type": "code",
 41 |    "execution_count": 6,
 42 |    "metadata": {
 43 |     "collapsed": false
 44 |    },
 45 |    "outputs": [
 46 |     {
 47 |      "data": {
 48 |       "text/plain": [
 49 |        "(10000, 157)"
 50 |       ]
 51 |      },
 52 |      "execution_count": 6,
 53 |      "metadata": {},
 54 |      "output_type": "execute_result"
 55 |     }
 56 |    ],
 57 |    "source": [
 58 |     "# The train header may start with a UTF-8 BOM; strip it so the id column is dropped by its plain name.\n",
 59 |     "x_train = train.rename(columns=lambda c: c.lstrip('\\ufeff')).drop(['cust_group','y','cust_id'],axis =1)\n",
 60 |     "x_test = test.drop(['cust_group','cust_id'],axis=1)\n",
 61 |     "x_test.shape"
 62 |    ]
 63 |   },
 64 |   {
 65 |    "cell_type": "code",
 66 |    "execution_count": 7,
 67 |    "metadata": {
 68 |     "collapsed": false
 69 |    },
 70 |    "outputs": [
 71 |     {
 72 |      "data": {
 73 |       "text/plain": [
 74 |        "(25000, 157)"
 75 |       ]
 76 |      },
 77 |      "execution_count": 7,
 78 |      "metadata": {},
 79 |      "output_type": "execute_result"
 80 |     }
 81 |    ],
 82 |    "source": [
 83 |     "x = pd.concat([x_train,x_test])\n",
 84 |     "x.shape"
 85 |    ]
 86 |   },
 87 |   {
 88 |    "cell_type": "code",
 89 |    "execution_count": 8,
 90 |    "metadata": {
 91 |     "collapsed": true
 92 |    },
 93 |    "outputs": [],
 94 |    "source": [
 95 |     "Y_train = train['y']"
 96 |    ]
 97 |   },
 98 |   {
 99 |    "cell_type": "code",
100 |    "execution_count": 9,
101 |    "metadata": {
102 |     "collapsed": false
103 |    },
104 |    "outputs": [],
105 |    "source": [
106 |     "# One-hot encode x_96..x_157: build every dummy frame first, then concatenate once (original columns are kept).\n",
107 |     "cat_cols = ['x_' + str(i) for i in range(96,158)]\n",
108 |     "dummies = [pd.get_dummies(x[c]).rename(columns=lambda level, c=c: c + str(level)) for c in cat_cols]\n",
109 |     "x = pd.concat([x] + dummies, axis=1)"
110 |    ]
111 |   },
112 |   {
113 |    "cell_type": "code",
114 |    "execution_count": 10,
115 |    "metadata": {
116 |     "collapsed": false
117 |    },
118 |    "outputs": [
119 |     {
120 |      "name": "stdout",
121 |      "output_type": "stream",
122 |      "text": [
123 |       "(15000, 355)\n",
124 |       "(10000, 355)\n"
125 |      ]
126 |     }
127 |    ],
128 |    "source": [
129 |     "train_X = x[:len(x_train)]  # training rows come first in the concatenated frame\n",
130 |     "test_X = x[len(x_train):]  # the remaining rows are the test set\n",
131 |     "print(train_X.shape)\n",
132 |     "print(test_X.shape)"
133 |    ]
134 |   },
135 |   {
136 |    "cell_type": "code",
137 |    "execution_count": 11,
138 |    "metadata": {
139 |     "collapsed": true
140 |    },
141 |    "outputs": [],
142 |    "source": [
143 |     "from sklearn.metrics import accuracy_score\n",
144 |     "from sklearn import metrics\n",
145 |     "from sklearn.model_selection import train_test_split\n",
146 |     "from xgboost import XGBClassifier"
147 |    ]
148 |   },
149 |   {
150 |    "cell_type": "code",
151 |    "execution_count": 13,
152 |    "metadata": {
153 |     "collapsed": false
154 |    },
155 |    "outputs": [],
156 |    "source": [
157 |     "X_train,X_val,y_train,y_val= train_test_split(train_X,Y_train,test_size=0.2,random_state=2)"
158 |    ]
159 |   },
160 |   {
161 |    "cell_type": "code",
162 |    "execution_count": 14,
163 |    "metadata": {
164 |     "collapsed": false
165 |    },
166 |    "outputs": [],
167 |    "source": [
168 |     "gbm = XGBClassifier( n_estimators= 100, max_depth= 4, min_child_weight= 2, gamma=0.9, subsample=0.8, \n",
169 |     "                        colsample_bytree=0.8, objective= 'binary:logistic', nthread= -1, scale_pos_weight=1).fit(X_train, y_train)"
170 |    ]
171 |   },
172 |   {
173 |    "cell_type": "code",
174 |    "execution_count": 15,
175 |    "metadata": {
176 |     "collapsed": false
177 |    },
178 |    "outputs": [
179 |     {
180 |      "name": "stdout",
181 |      "output_type": "stream",
182 |      "text": [
183 |       "0.80642048092\n"
184 |      ]
185 |     }
186 |    ],
187 |    "source": [
188 |     "predictions = gbm.predict_proba(X_val)\n",
189 |     "pre = predictions[:,1]\n",
190 |     "val_auc = metrics.roc_auc_score(y_val,pre)  # AUC on the validation set\n",
191 |     "print(val_auc)"
192 |    ]
193 |   },
194 |   {
195 |    "cell_type": "code",
196 |    "execution_count": 16,
197 |    "metadata": {
198 |     "collapsed": false
199 |    },
200 |    "outputs": [
201 |     {
202 |      "data": {
203 |       "text/plain": [
204 |        "(10000,)"
205 |       ]
206 |      },
207 |      "execution_count": 16,
208 |      "metadata": {},
209 |      "output_type": "execute_result"
210 |     }
211 |    ],
212 |    "source": [
213 |     "preds = gbm.predict_proba(test_X)\n",
214 |     "pred = preds[:,1]\n",
215 |     "pred.shape"
216 |    ]
217 |   },
218 |   {
219 |    "cell_type": "code",
220 |    "execution_count": 18,
221 |    "metadata": {
222 |     "collapsed": true
223 |    },
224 |    "outputs": [],
225 |    "source": [
226 |     "Submission = pd.DataFrame({'cust_id': test['cust_id'], 'pred_prob': pred})\n",
227 |     "Submission.to_csv('../result/xgb.csv',index=False)"
228 |    ]
229 |   },
230 |   {
231 |    "cell_type": "code",
232 |    "execution_count": null,
233 |    "metadata": {
234 |     "collapsed": true
235 |    },
236 |    "outputs": [],
237 |    "source": []
238 |   }
239 |  ],
240 |  "metadata": {
241 |   "anaconda-cloud": {},
242 |   "kernelspec": {
243 |    "display_name": "Python [default]",
244 |    "language": "python",
245 |    "name": "python3"
246 |   },
247 |   "language_info": {
248 |    "codemirror_mode": {
249 |     "name": "ipython",
250 |     "version": 3
251 |    },
252 |    "file_extension": ".py",
253 |    "mimetype": "text/x-python",
254 |    "name": "python",
255 |    "nbconvert_exporter": "python",
256 |    "pygments_lexer": "ipython3",
257 |    "version": "3.5.2"
258 |   }
259 |  },
260 |  "nbformat": 4,
261 |  "nbformat_minor": 1
262 | }
263 | 


--------------------------------------------------------------------------------
/模型二代码b榜0.749880/code/gbdt预测.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {
  7 |     "collapsed": true
  8 |    },
  9 |    "outputs": [],
 10 |    "source": [
 11 |     "import numpy as np\n",
 12 |     "import pandas as pd\n",
 13 |     "import matplotlib.pyplot as plt\n",
 14 |     "%matplotlib inline"
 15 |    ]
 16 |   },
 17 |   {
 18 |    "cell_type": "code",
 19 |    "execution_count": 2,
 20 |    "metadata": {
 21 |     "collapsed": false
 22 |    },
 23 |    "outputs": [],
 24 |    "source": [
 25 |     "train = pd.read_csv('../data/train_xy.csv')"
 26 |    ]
 27 |   },
 28 |   {
 29 |    "cell_type": "code",
 30 |    "execution_count": 3,
 31 |    "metadata": {
 32 |     "collapsed": true
 33 |    },
 34 |    "outputs": [],
 35 |    "source": [
 36 |     "test = pd.read_csv('../data/test_all.csv')"
 37 |    ]
 38 |   },
 39 |   {
 40 |    "cell_type": "code",
 41 |    "execution_count": 5,
 42 |    "metadata": {
 43 |     "collapsed": false
 44 |    },
 45 |    "outputs": [
 46 |     {
 47 |      "data": {
 48 |       "text/plain": [
 49 |        "(10000, 157)"
 50 |       ]
 51 |      },
 52 |      "execution_count": 5,
 53 |      "metadata": {},
 54 |      "output_type": "execute_result"
 55 |     }
 56 |    ],
 57 |    "source": [
 58 |     "# The train header may start with a UTF-8 BOM; strip it so the id column is dropped by its plain name.\n",
 59 |     "x_train = train.rename(columns=lambda c: c.lstrip('\\ufeff')).drop(['cust_group','y','cust_id'],axis =1)\n",
 60 |     "x_test = test.drop(['cust_group','cust_id'],axis=1)\n",
 61 |     "x_test.shape"
 62 |    ]
 63 |   },
 64 |   {
 65 |    "cell_type": "code",
 66 |    "execution_count": 6,
 67 |    "metadata": {
 68 |     "collapsed": false
 69 |    },
 70 |    "outputs": [
 71 |     {
 72 |      "data": {
 73 |       "text/plain": [
 74 |        "(25000, 157)"
 75 |       ]
 76 |      },
 77 |      "execution_count": 6,
 78 |      "metadata": {},
 79 |      "output_type": "execute_result"
 80 |     }
 81 |    ],
 82 |    "source": [
 83 |     "x = pd.concat([x_train,x_test])\n",
 84 |     "x.shape"
 85 |    ]
 86 |   },
 87 |   {
 88 |    "cell_type": "code",
 89 |    "execution_count": 7,
 90 |    "metadata": {
 91 |     "collapsed": true
 92 |    },
 93 |    "outputs": [],
 94 |    "source": [
 95 |     "Y_train = train['y']"
 96 |    ]
 97 |   },
 98 |   {
 99 |    "cell_type": "code",
100 |    "execution_count": 8,
101 |    "metadata": {
102 |     "collapsed": false
103 |    },
104 |    "outputs": [],
105 |    "source": [
106 |     "# One-hot encode x_96..x_157: build every dummy frame first, then concatenate once (original columns are kept).\n",
107 |     "cat_cols = ['x_' + str(i) for i in range(96,158)]\n",
108 |     "dummies = [pd.get_dummies(x[c]).rename(columns=lambda level, c=c: c + str(level)) for c in cat_cols]\n",
109 |     "x = pd.concat([x] + dummies, axis=1)"
110 |    ]
111 |   },
112 |   {
113 |    "cell_type": "code",
114 |    "execution_count": 9,
115 |    "metadata": {
116 |     "collapsed": false
117 |    },
118 |    "outputs": [
119 |     {
120 |      "name": "stdout",
121 |      "output_type": "stream",
122 |      "text": [
123 |       "(15000, 355)\n",
124 |       "(10000, 355)\n"
125 |      ]
126 |     }
127 |    ],
128 |    "source": [
129 |     "train_X = x[:len(x_train)]  # training rows come first in the concatenated frame\n",
130 |     "test_X = x[len(x_train):]  # the remaining rows are the test set\n",
131 |     "print(train_X.shape)\n",
132 |     "print(test_X.shape)"
133 |    ]
134 |   },
135 |   {
136 |    "cell_type": "code",
137 |    "execution_count": 10,
138 |    "metadata": {
139 |     "collapsed": false
140 |    },
141 |    "outputs": [],
142 |    "source": [
143 |     "from sklearn.metrics import accuracy_score\n",
144 |     "from sklearn import metrics\n",
145 |     "from sklearn.model_selection import train_test_split\n",
146 |     "from xgboost import XGBClassifier\n",
147 |     "from sklearn.ensemble import GradientBoostingClassifier"
148 |    ]
149 |   },
150 |   {
151 |    "cell_type": "code",
152 |    "execution_count": 11,
153 |    "metadata": {
154 |     "collapsed": false
155 |    },
156 |    "outputs": [],
157 |    "source": [
158 |     "X_train,X_val,y_train,y_val= train_test_split(train_X,Y_train,test_size=0.2,random_state=2)"
159 |    ]
160 |   },
161 |   {
162 |    "cell_type": "code",
163 |    "execution_count": 16,
164 |    "metadata": {
165 |     "collapsed": false
166 |    },
167 |    "outputs": [],
168 |    "source": [
169 |     "clf = GradientBoostingClassifier(n_estimators=120, learning_rate=0.05,max_depth=3, random_state=0).fit(X_train, y_train)"
170 |    ]
171 |   },
172 |   {
173 |    "cell_type": "code",
174 |    "execution_count": 17,
175 |    "metadata": {
176 |     "collapsed": false
177 |    },
178 |    "outputs": [
179 |     {
180 |      "name": "stdout",
181 |      "output_type": "stream",
182 |      "text": [
183 |       "0.798263097577\n"
184 |      ]
185 |     }
186 |    ],
187 |    "source": [
188 |     "predictions = clf.predict_proba(X_val)\n",
189 |     "pre = predictions[:,1]\n",
190 |     "val_auc = metrics.roc_auc_score(y_val,pre)  # AUC on the validation set\n",
191 |     "print(val_auc)"
192 |    ]
193 |   },
194 |   {
195 |    "cell_type": "code",
196 |    "execution_count": 18,
197 |    "metadata": {
198 |     "collapsed": false
199 |    },
200 |    "outputs": [
201 |     {
202 |      "data": {
203 |       "text/plain": [
204 |        "(10000,)"
205 |       ]
206 |      },
207 |      "execution_count": 18,
208 |      "metadata": {},
209 |      "output_type": "execute_result"
210 |     }
211 |    ],
212 |    "source": [
213 |     "preds = clf.predict_proba(test_X)\n",
214 |     "pred = preds[:,1]\n",
215 |     "pred.shape"
216 |    ]
217 |   },
218 |   {
219 |    "cell_type": "code",
220 |    "execution_count": 19,
221 |    "metadata": {
222 |     "collapsed": true
223 |    },
224 |    "outputs": [],
225 |    "source": [
226 |     "Submission = pd.DataFrame({'cust_id': test['cust_id'], 'pred_prob': pred})\n",
227 |     "Submission.to_csv('../result/gbdt.csv',index=False)"
228 |    ]
229 |   },
230 |   {
231 |    "cell_type": "code",
232 |    "execution_count": null,
233 |    "metadata": {
234 |     "collapsed": true
235 |    },
236 |    "outputs": [],
237 |    "source": []
238 |   },
239 |   {
240 |    "cell_type": "code",
241 |    "execution_count": null,
242 |    "metadata": {
243 |     "collapsed": true
244 |    },
245 |    "outputs": [],
246 |    "source": []
247 |   }
248 |  ],
249 |  "metadata": {
250 |   "anaconda-cloud": {},
251 |   "kernelspec": {
252 |    "display_name": "Python [default]",
253 |    "language": "python",
254 |    "name": "python3"
255 |   },
256 |   "language_info": {
257 |    "codemirror_mode": {
258 |     "name": "ipython",
259 |     "version": 3
260 |    },
261 |    "file_extension": ".py",
262 |    "mimetype": "text/x-python",
263 |    "name": "python",
264 |    "nbconvert_exporter": "python",
265 |    "pygments_lexer": "ipython3",
266 |    "version": "3.5.2"
267 |   }
268 |  },
269 |  "nbformat": 4,
270 |  "nbformat_minor": 1
271 | }
272 | 


--------------------------------------------------------------------------------
/模型二代码b榜0.749880/code/.ipynb_checkpoints/gbdt预测-checkpoint.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {
  7 |     "collapsed": true
  8 |    },
  9 |    "outputs": [],
 10 |    "source": [
 11 |     "import numpy as np\n",
 12 |     "import pandas as pd\n",
 13 |     "import matplotlib.pyplot as plt\n",
 14 |     "%matplotlib inline"
 15 |    ]
 16 |   },
 17 |   {
 18 |    "cell_type": "code",
 19 |    "execution_count": 2,
 20 |    "metadata": {
 21 |     "collapsed": false
 22 |    },
 23 |    "outputs": [],
 24 |    "source": [
 25 |     "train = pd.read_csv('../data/train_xy.csv')"
 26 |    ]
 27 |   },
 28 |   {
 29 |    "cell_type": "code",
 30 |    "execution_count": 3,
 31 |    "metadata": {
 32 |     "collapsed": true
 33 |    },
 34 |    "outputs": [],
 35 |    "source": [
 36 |     "test = pd.read_csv('../data/test_all.csv')"
 37 |    ]
 38 |   },
 39 |   {
 40 |    "cell_type": "code",
 41 |    "execution_count": 5,
 42 |    "metadata": {
 43 |     "collapsed": false
 44 |    },
 45 |    "outputs": [
 46 |     {
 47 |      "data": {
 48 |       "text/plain": [
 49 |        "(10000, 157)"
 50 |       ]
 51 |      },
 52 |      "execution_count": 5,
 53 |      "metadata": {},
 54 |      "output_type": "execute_result"
 55 |     }
 56 |    ],
 57 |    "source": [
 58 |     "# The train header may start with a UTF-8 BOM; strip it so the id column is dropped by its plain name.\n",
 59 |     "x_train = train.rename(columns=lambda c: c.lstrip('\\ufeff')).drop(['cust_group','y','cust_id'],axis =1)\n",
 60 |     "x_test = test.drop(['cust_group','cust_id'],axis=1)\n",
 61 |     "x_test.shape"
 62 |    ]
 63 |   },
 64 |   {
 65 |    "cell_type": "code",
 66 |    "execution_count": 6,
 67 |    "metadata": {
 68 |     "collapsed": false
 69 |    },
 70 |    "outputs": [
 71 |     {
 72 |      "data": {
 73 |       "text/plain": [
 74 |        "(25000, 157)"
 75 |       ]
 76 |      },
 77 |      "execution_count": 6,
 78 |      "metadata": {},
 79 |      "output_type": "execute_result"
 80 |     }
 81 |    ],
 82 |    "source": [
 83 |     "x = pd.concat([x_train,x_test])\n",
 84 |     "x.shape"
 85 |    ]
 86 |   },
 87 |   {
 88 |    "cell_type": "code",
 89 |    "execution_count": 7,
 90 |    "metadata": {
 91 |     "collapsed": true
 92 |    },
 93 |    "outputs": [],
 94 |    "source": [
 95 |     "Y_train = train['y']"
 96 |    ]
 97 |   },
 98 |   {
 99 |    "cell_type": "code",
100 |    "execution_count": 8,
101 |    "metadata": {
102 |     "collapsed": false
103 |    },
104 |    "outputs": [],
105 |    "source": [
106 |     "# One-hot encode x_96..x_157: build every dummy frame first, then concatenate once (original columns are kept).\n",
107 |     "cat_cols = ['x_' + str(i) for i in range(96,158)]\n",
108 |     "dummies = [pd.get_dummies(x[c]).rename(columns=lambda level, c=c: c + str(level)) for c in cat_cols]\n",
109 |     "x = pd.concat([x] + dummies, axis=1)"
110 |    ]
111 |   },
112 |   {
113 |    "cell_type": "code",
114 |    "execution_count": 9,
115 |    "metadata": {
116 |     "collapsed": false
117 |    },
118 |    "outputs": [
119 |     {
120 |      "name": "stdout",
121 |      "output_type": "stream",
122 |      "text": [
123 |       "(15000, 355)\n",
124 |       "(10000, 355)\n"
125 |      ]
126 |     }
127 |    ],
128 |    "source": [
129 |     "train_X = x[:len(x_train)]  # training rows come first in the concatenated frame\n",
130 |     "test_X = x[len(x_train):]  # the remaining rows are the test set\n",
131 |     "print(train_X.shape)\n",
132 |     "print(test_X.shape)"
133 |    ]
134 |   },
135 |   {
136 |    "cell_type": "code",
137 |    "execution_count": 10,
138 |    "metadata": {
139 |     "collapsed": false
140 |    },
141 |    "outputs": [],
142 |    "source": [
143 |     "from sklearn.metrics import accuracy_score\n",
144 |     "from sklearn import metrics\n",
145 |     "from sklearn.model_selection import train_test_split\n",
146 |     "from xgboost import XGBClassifier\n",
147 |     "from sklearn.ensemble import GradientBoostingClassifier"
148 |    ]
149 |   },
150 |   {
151 |    "cell_type": "code",
152 |    "execution_count": 11,
153 |    "metadata": {
154 |     "collapsed": false
155 |    },
156 |    "outputs": [],
157 |    "source": [
158 |     "X_train,X_val,y_train,y_val= train_test_split(train_X,Y_train,test_size=0.2,random_state=2)"
159 |    ]
160 |   },
161 |   {
162 |    "cell_type": "code",
163 |    "execution_count": 16,
164 |    "metadata": {
165 |     "collapsed": false
166 |    },
167 |    "outputs": [],
168 |    "source": [
169 |     "clf = GradientBoostingClassifier(n_estimators=120, learning_rate=0.05,max_depth=3, random_state=0).fit(X_train, y_train)"
170 |    ]
171 |   },
172 |   {
173 |    "cell_type": "code",
174 |    "execution_count": 17,
175 |    "metadata": {
176 |     "collapsed": false
177 |    },
178 |    "outputs": [
179 |     {
180 |      "name": "stdout",
181 |      "output_type": "stream",
182 |      "text": [
183 |       "0.798263097577\n"
184 |      ]
185 |     }
186 |    ],
187 |    "source": [
188 |     "predictions = clf.predict_proba(X_val)\n",
189 |     "pre = predictions[:,1]\n",
190 |     "val_auc = metrics.roc_auc_score(y_val,pre)  # AUC on the validation set\n",
191 |     "print(val_auc)"
192 |    ]
193 |   },
194 |   {
195 |    "cell_type": "code",
196 |    "execution_count": 18,
197 |    "metadata": {
198 |     "collapsed": false
199 |    },
200 |    "outputs": [
201 |     {
202 |      "data": {
203 |       "text/plain": [
204 |        "(10000,)"
205 |       ]
206 |      },
207 |      "execution_count": 18,
208 |      "metadata": {},
209 |      "output_type": "execute_result"
210 |     }
211 |    ],
212 |    "source": [
213 |     "preds = clf.predict_proba(test_X)\n",
214 |     "pred = preds[:,1]\n",
215 |     "pred.shape"
216 |    ]
217 |   },
218 |   {
219 |    "cell_type": "code",
220 |    "execution_count": 19,
221 |    "metadata": {
222 |     "collapsed": true
223 |    },
224 |    "outputs": [],
225 |    "source": [
226 |     "Submission = pd.DataFrame({'cust_id': test['cust_id'], 'pred_prob': pred})\n",
227 |     "Submission.to_csv('../result/gbdt.csv',index=False)"
228 |    ]
229 |   },
230 |   {
231 |    "cell_type": "code",
232 |    "execution_count": null,
233 |    "metadata": {
234 |     "collapsed": true
235 |    },
236 |    "outputs": [],
237 |    "source": []
238 |   },
239 |   {
240 |    "cell_type": "code",
241 |    "execution_count": null,
242 |    "metadata": {
243 |     "collapsed": true
244 |    },
245 |    "outputs": [],
246 |    "source": []
247 |   }
248 |  ],
249 |  "metadata": {
250 |   "anaconda-cloud": {},
251 |   "kernelspec": {
252 |    "display_name": "Python [default]",
253 |    "language": "python",
254 |    "name": "python3"
255 |   },
256 |   "language_info": {
257 |    "codemirror_mode": {
258 |     "name": "ipython",
259 |     "version": 3
260 |    },
261 |    "file_extension": ".py",
262 |    "mimetype": "text/x-python",
263 |    "name": "python",
264 |    "nbconvert_exporter": "python",
265 |    "pygments_lexer": "ipython3",
266 |    "version": "3.5.2"
267 |   }
268 |  },
269 |  "nbformat": 4,
270 |  "nbformat_minor": 1
271 | }
272 | 


--------------------------------------------------------------------------------
/模型二代码b榜0.749880/code/_ensemble.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {
  7 |     "collapsed": true
  8 |    },
  9 |    "outputs": [],
 10 |    "source": [
 11 |     "import numpy as np\n",
 12 |     "import pandas as pd\n",
 13 |     "import matplotlib.pyplot as plt\n",
 14 |     "%matplotlib inline "
 15 |    ]
 16 |   },
 17 |   {
 18 |    "cell_type": "code",
 19 |    "execution_count": 2,
 20 |    "metadata": {
 21 |     "collapsed": true
 22 |    },
 23 |    "outputs": [],
 24 |    "source": [
 25 |     "xgb = pd.read_csv('../result/xgb.csv')"
 26 |    ]
 27 |   },
 28 |   {
 29 |    "cell_type": "code",
 30 |    "execution_count": 3,
 31 |    "metadata": {
 32 |     "collapsed": true
 33 |    },
 34 |    "outputs": [],
 35 |    "source": [
 36 |     "lgb = pd.read_csv('../result/lgb.csv')"
 37 |    ]
 38 |   },
 39 |   {
 40 |    "cell_type": "code",
 41 |    "execution_count": 4,
 42 |    "metadata": {
 43 |     "collapsed": true
 44 |    },
 45 |    "outputs": [],
 46 |    "source": [
 47 |     "gbdt = pd.read_csv('../result/gbdt.csv')"
 48 |    ]
 49 |   },
 50 |   {
 51 |    "cell_type": "code",
 52 |    "execution_count": 5,
 53 |    "metadata": {
 54 |     "collapsed": true
 55 |    },
 56 |    "outputs": [],
 57 |    "source": [
 58 |     "nn = pd.read_csv('../result/nn.csv')"
 59 |    ]
 60 |   },
 61 |   {
 62 |    "cell_type": "code",
 63 |    "execution_count": 6,
 64 |    "metadata": {
 65 |     "collapsed": false
 66 |    },
 67 |    "outputs": [
 68 |     {
 69 |      "data": {
 70 |       "text/html": [
 71 |        "<div>\n",
 72 |        "<table border=\"1\" class=\"dataframe\">\n",
 73 |        "  <thead>\n",
 74 |        "    <tr style=\"text-align: right;\">\n",
 75 |        "      <th></th>\n",
 76 |        "      <th>cust_id</th>\n",
 77 |        "      <th>pred_prob</th>\n",
 78 |        "    </tr>\n",
 79 |        "  </thead>\n",
 80 |        "  <tbody>\n",
 81 |        "    <tr>\n",
 82 |        "      <th>0</th>\n",
 83 |        "      <td>1</td>\n",
 84 |        "      <td>0.005983</td>\n",
 85 |        "    </tr>\n",
 86 |        "    <tr>\n",
 87 |        "      <th>1</th>\n",
 88 |        "      <td>2</td>\n",
 89 |        "      <td>0.025552</td>\n",
 90 |        "    </tr>\n",
 91 |        "    <tr>\n",
 92 |        "      <th>2</th>\n",
 93 |        "      <td>3</td>\n",
 94 |        "      <td>0.018066</td>\n",
 95 |        "    </tr>\n",
 96 |        "    <tr>\n",
 97 |        "      <th>3</th>\n",
 98 |        "      <td>4</td>\n",
 99 |        "      <td>0.035206</td>\n",
100 |        "    </tr>\n",
101 |        "    <tr>\n",
102 |        "      <th>4</th>\n",
103 |        "      <td>5</td>\n",
104 |        "      <td>0.067734</td>\n",
105 |        "    </tr>\n",
106 |        "  </tbody>\n",
107 |        "</table>\n",
108 |        "</div>"
109 |       ],
110 |       "text/plain": [
111 |        "   cust_id  pred_prob\n",
112 |        "0        1   0.005983\n",
113 |        "1        2   0.025552\n",
114 |        "2        3   0.018066\n",
115 |        "3        4   0.035206\n",
116 |        "4        5   0.067734"
117 |       ]
118 |      },
119 |      "execution_count": 6,
120 |      "metadata": {},
121 |      "output_type": "execute_result"
122 |     }
123 |    ],
124 |    "source": [
125 |     "nn.head()"
126 |    ]
127 |   },
128 |   {
129 |    "cell_type": "code",
130 |    "execution_count": 7,
131 |    "metadata": {
132 |     "collapsed": false
133 |    },
134 |    "outputs": [],
135 |    "source": [
136 |     "result = xgb.pred_prob*0.7 + lgb.pred_prob*0.1 + gbdt.pred_prob*0.1 + nn.pred_prob*0.1"
137 |    ]
138 |   },
139 |   {
140 |    "cell_type": "code",
141 |    "execution_count": 8,
142 |    "metadata": {
143 |     "collapsed": false
144 |    },
145 |    "outputs": [],
146 |    "source": [
147 |     "pred = nn.copy()  # copy so the loaded nn predictions are not overwritten when the blend is stored\n",
148 |     "pred['pred_prob'] = result"
149 |    ]
150 |   },
151 |   {
152 |    "cell_type": "code",
153 |    "execution_count": 9,
154 |    "metadata": {
155 |     "collapsed": false
156 |    },
157 |    "outputs": [
158 |     {
159 |      "data": {
160 |       "text/html": [
161 |        "<div>\n",
162 |        "<table border=\"1\" class=\"dataframe\">\n",
163 |        "  <thead>\n",
164 |        "    <tr style=\"text-align: right;\">\n",
165 |        "      <th></th>\n",
166 |        "      <th>cust_id</th>\n",
167 |        "      <th>pred_prob</th>\n",
168 |        "    </tr>\n",
169 |        "  </thead>\n",
170 |        "  <tbody>\n",
171 |        "    <tr>\n",
172 |        "      <th>0</th>\n",
173 |        "      <td>1</td>\n",
174 |        "      <td>0.014872</td>\n",
175 |        "    </tr>\n",
176 |        "    <tr>\n",
177 |        "      <th>1</th>\n",
178 |        "      <td>2</td>\n",
179 |        "      <td>0.036687</td>\n",
180 |        "    </tr>\n",
181 |        "    <tr>\n",
182 |        "      <th>2</th>\n",
183 |        "      <td>3</td>\n",
184 |        "      <td>0.194181</td>\n",
185 |        "    </tr>\n",
186 |        "    <tr>\n",
187 |        "      <th>3</th>\n",
188 |        "      <td>4</td>\n",
189 |        "      <td>0.064974</td>\n",
190 |        "    </tr>\n",
191 |        "    <tr>\n",
192 |        "      <th>4</th>\n",
193 |        "      <td>5</td>\n",
194 |        "      <td>0.118509</td>\n",
195 |        "    </tr>\n",
196 |        "  </tbody>\n",
197 |        "</table>\n",
198 |        "</div>"
199 |       ],
200 |       "text/plain": [
201 |        "   cust_id  pred_prob\n",
202 |        "0        1   0.014872\n",
203 |        "1        2   0.036687\n",
204 |        "2        3   0.194181\n",
205 |        "3        4   0.064974\n",
206 |        "4        5   0.118509"
207 |       ]
208 |      },
209 |      "execution_count": 9,
210 |      "metadata": {},
211 |      "output_type": "execute_result"
212 |     }
213 |    ],
214 |    "source": [
215 |     "pred.head()"
216 |    ]
217 |   },
218 |   {
219 |    "cell_type": "code",
220 |    "execution_count": 10,
221 |    "metadata": {
222 |     "collapsed": true
223 |    },
224 |    "outputs": [],
225 |    "source": [
226 |     "pred.to_csv('../result/_ensemble.csv',index = False)"
227 |    ]
228 |   },
229 |   {
230 |    "cell_type": "code",
231 |    "execution_count": null,
232 |    "metadata": {
233 |     "collapsed": true
234 |    },
235 |    "outputs": [],
236 |    "source": []
237 |   }
238 |  ],
239 |  "metadata": {
240 |   "anaconda-cloud": {},
241 |   "kernelspec": {
242 |    "display_name": "Python [default]",
243 |    "language": "python",
244 |    "name": "python3"
245 |   },
246 |   "language_info": {
247 |    "codemirror_mode": {
248 |     "name": "ipython",
249 |     "version": 3
250 |    },
251 |    "file_extension": ".py",
252 |    "mimetype": "text/x-python",
253 |    "name": "python",
254 |    "nbconvert_exporter": "python",
255 |    "pygments_lexer": "ipython3",
256 |    "version": "3.5.2"
257 |   }
258 |  },
259 |  "nbformat": 4,
260 |  "nbformat_minor": 1
261 | }
262 | 


--------------------------------------------------------------------------------
/模型二代码b榜0.749880/code/semi_ensemble.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {
  7 |     "collapsed": true
  8 |    },
  9 |    "outputs": [],
 10 |    "source": [
 11 |     "import numpy as np\n",
 12 |     "import pandas as pd\n",
 13 |     "import matplotlib.pyplot as plt\n",
 14 |     "%matplotlib inline"
 15 |    ]
 16 |   },
 17 |   {
 18 |    "cell_type": "code",
 19 |    "execution_count": 2,
 20 |    "metadata": {
 21 |     "collapsed": true
 22 |    },
 23 |    "outputs": [],
 24 |    "source": [
 25 |     "xgb = pd.read_csv('../result/semi_xgb.csv')"
 26 |    ]
 27 |   },
 28 |   {
 29 |    "cell_type": "code",
 30 |    "execution_count": 3,
 31 |    "metadata": {
 32 |     "collapsed": true
 33 |    },
 34 |    "outputs": [],
 35 |    "source": [
 36 |     "lgb = pd.read_csv('../result/semi_lgb.csv')"
 37 |    ]
 38 |   },
 39 |   {
 40 |    "cell_type": "code",
 41 |    "execution_count": 4,
 42 |    "metadata": {
 43 |     "collapsed": true
 44 |    },
 45 |    "outputs": [],
 46 |    "source": [
 47 |     "gbdt = pd.read_csv('../result/semi_gbdt.csv')"
 48 |    ]
 49 |   },
 50 |   {
 51 |    "cell_type": "code",
 52 |    "execution_count": 5,
 53 |    "metadata": {
 54 |     "collapsed": true
 55 |    },
 56 |    "outputs": [],
 57 |    "source": [
 58 |     "nn = pd.read_csv('../result/semi_nn.csv')"
 59 |    ]
 60 |   },
 61 |   {
 62 |    "cell_type": "code",
 63 |    "execution_count": 6,
 64 |    "metadata": {
 65 |     "collapsed": false
 66 |    },
 67 |    "outputs": [
 68 |     {
 69 |      "data": {
 70 |       "text/html": [
 71 |        "<div>\n",
 72 |        "<table border=\"1\" class=\"dataframe\">\n",
 73 |        "  <thead>\n",
 74 |        "    <tr style=\"text-align: right;\">\n",
 75 |        "      <th></th>\n",
 76 |        "      <th>cust_id</th>\n",
 77 |        "      <th>pred_prob</th>\n",
 78 |        "    </tr>\n",
 79 |        "  </thead>\n",
 80 |        "  <tbody>\n",
 81 |        "    <tr>\n",
 82 |        "      <th>0</th>\n",
 83 |        "      <td>1</td>\n",
 84 |        "      <td>0.003583</td>\n",
 85 |        "    </tr>\n",
 86 |        "    <tr>\n",
 87 |        "      <th>1</th>\n",
 88 |        "      <td>2</td>\n",
 89 |        "      <td>0.069257</td>\n",
 90 |        "    </tr>\n",
 91 |        "    <tr>\n",
 92 |        "      <th>2</th>\n",
 93 |        "      <td>3</td>\n",
 94 |        "      <td>0.015288</td>\n",
 95 |        "    </tr>\n",
 96 |        "    <tr>\n",
 97 |        "      <th>3</th>\n",
 98 |        "      <td>4</td>\n",
 99 |        "      <td>0.084963</td>\n",
100 |        "    </tr>\n",
101 |        "    <tr>\n",
102 |        "      <th>4</th>\n",
103 |        "      <td>5</td>\n",
104 |        "      <td>0.064052</td>\n",
105 |        "    </tr>\n",
106 |        "  </tbody>\n",
107 |        "</table>\n",
108 |        "</div>"
109 |       ],
110 |       "text/plain": [
111 |        "   cust_id  pred_prob\n",
112 |        "0        1   0.003583\n",
113 |        "1        2   0.069257\n",
114 |        "2        3   0.015288\n",
115 |        "3        4   0.084963\n",
116 |        "4        5   0.064052"
117 |       ]
118 |      },
119 |      "execution_count": 6,
120 |      "metadata": {},
121 |      "output_type": "execute_result"
122 |     }
123 |    ],
124 |    "source": [
125 |     "nn.head()"
126 |    ]
127 |   },
128 |   {
129 |    "cell_type": "code",
130 |    "execution_count": 7,
131 |    "metadata": {
132 |     "collapsed": true
133 |    },
134 |    "outputs": [],
135 |    "source": [
136 |     "result = xgb.pred_prob*0.7 + lgb.pred_prob*0.1 + gbdt.pred_prob*0.1 + nn.pred_prob*0.1"
137 |    ]
138 |   },
139 |   {
140 |    "cell_type": "code",
141 |    "execution_count": 8,
142 |    "metadata": {
143 |     "collapsed": true
144 |    },
145 |    "outputs": [],
146 |    "source": [
147 |     "pred = nn.copy()  # work on a copy: aliasing nn would let a re-run of the blend cell read already-blended values\n",
148 |     "pred.pred_prob = result"
149 |    ]
150 |   },
151 |   {
152 |    "cell_type": "code",
153 |    "execution_count": 9,
154 |    "metadata": {
155 |     "collapsed": false
156 |    },
157 |    "outputs": [
158 |     {
159 |      "data": {
160 |       "text/html": [
161 |        "<div>\n",
162 |        "<table border=\"1\" class=\"dataframe\">\n",
163 |        "  <thead>\n",
164 |        "    <tr style=\"text-align: right;\">\n",
165 |        "      <th></th>\n",
166 |        "      <th>cust_id</th>\n",
167 |        "      <th>pred_prob</th>\n",
168 |        "    </tr>\n",
169 |        "  </thead>\n",
170 |        "  <tbody>\n",
171 |        "    <tr>\n",
172 |        "      <th>0</th>\n",
173 |        "      <td>1</td>\n",
174 |        "      <td>0.016251</td>\n",
175 |        "    </tr>\n",
176 |        "    <tr>\n",
177 |        "      <th>1</th>\n",
178 |        "      <td>2</td>\n",
179 |        "      <td>0.031600</td>\n",
180 |        "    </tr>\n",
181 |        "    <tr>\n",
182 |        "      <th>2</th>\n",
183 |        "      <td>3</td>\n",
184 |        "      <td>0.076029</td>\n",
185 |        "    </tr>\n",
186 |        "    <tr>\n",
187 |        "      <th>3</th>\n",
188 |        "      <td>4</td>\n",
189 |        "      <td>0.058965</td>\n",
190 |        "    </tr>\n",
191 |        "    <tr>\n",
192 |        "      <th>4</th>\n",
193 |        "      <td>5</td>\n",
194 |        "      <td>0.055669</td>\n",
195 |        "    </tr>\n",
196 |        "  </tbody>\n",
197 |        "</table>\n",
198 |        "</div>"
199 |       ],
200 |       "text/plain": [
201 |        "   cust_id  pred_prob\n",
202 |        "0        1   0.016251\n",
203 |        "1        2   0.031600\n",
204 |        "2        3   0.076029\n",
205 |        "3        4   0.058965\n",
206 |        "4        5   0.055669"
207 |       ]
208 |      },
209 |      "execution_count": 9,
210 |      "metadata": {},
211 |      "output_type": "execute_result"
212 |     }
213 |    ],
214 |    "source": [
215 |     "pred.head()"
216 |    ]
217 |   },
218 |   {
219 |    "cell_type": "code",
220 |    "execution_count": 10,
221 |    "metadata": {
222 |     "collapsed": true
223 |    },
224 |    "outputs": [],
225 |    "source": [
226 |     "pred.to_csv('../result/semi_ensemble.csv',index = False)"
227 |    ]
228 |   },
229 |   {
230 |    "cell_type": "code",
231 |    "execution_count": null,
232 |    "metadata": {
233 |     "collapsed": true
234 |    },
235 |    "outputs": [],
236 |    "source": []
237 |   }
238 |  ],
239 |  "metadata": {
240 |   "anaconda-cloud": {},
241 |   "kernelspec": {
242 |    "display_name": "Python [default]",
243 |    "language": "python",
244 |    "name": "python3"
245 |   },
246 |   "language_info": {
247 |    "codemirror_mode": {
248 |     "name": "ipython",
249 |     "version": 3
250 |    },
251 |    "file_extension": ".py",
252 |    "mimetype": "text/x-python",
253 |    "name": "python",
254 |    "nbconvert_exporter": "python",
255 |    "pygments_lexer": "ipython3",
256 |    "version": "3.5.2"
257 |   }
258 |  },
259 |  "nbformat": 4,
260 |  "nbformat_minor": 1
261 | }
262 | 


--------------------------------------------------------------------------------
/模型二代码b榜0.749880/code/.ipynb_checkpoints/_ensemble-checkpoint.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {
  7 |     "collapsed": true
  8 |    },
  9 |    "outputs": [],
 10 |    "source": [
 11 |     "import numpy as np\n",
 12 |     "import pandas as pd\n",
 13 |     "import matplotlib.pyplot as plt\n",
 14 |     "%matplotlib inline "
 15 |    ]
 16 |   },
 17 |   {
 18 |    "cell_type": "code",
 19 |    "execution_count": 2,
 20 |    "metadata": {
 21 |     "collapsed": true
 22 |    },
 23 |    "outputs": [],
 24 |    "source": [
 25 |     "xgb = pd.read_csv('../result/xgb.csv')"
 26 |    ]
 27 |   },
 28 |   {
 29 |    "cell_type": "code",
 30 |    "execution_count": 3,
 31 |    "metadata": {
 32 |     "collapsed": true
 33 |    },
 34 |    "outputs": [],
 35 |    "source": [
 36 |     "lgb = pd.read_csv('../result/lgb.csv')"
 37 |    ]
 38 |   },
 39 |   {
 40 |    "cell_type": "code",
 41 |    "execution_count": 4,
 42 |    "metadata": {
 43 |     "collapsed": true
 44 |    },
 45 |    "outputs": [],
 46 |    "source": [
 47 |     "gbdt = pd.read_csv('../result/gbdt.csv')"
 48 |    ]
 49 |   },
 50 |   {
 51 |    "cell_type": "code",
 52 |    "execution_count": 5,
 53 |    "metadata": {
 54 |     "collapsed": true
 55 |    },
 56 |    "outputs": [],
 57 |    "source": [
 58 |     "nn = pd.read_csv('../result/nn.csv')"
 59 |    ]
 60 |   },
 61 |   {
 62 |    "cell_type": "code",
 63 |    "execution_count": 6,
 64 |    "metadata": {
 65 |     "collapsed": false
 66 |    },
 67 |    "outputs": [
 68 |     {
 69 |      "data": {
 70 |       "text/html": [
 71 |        "<div>\n",
 72 |        "<table border=\"1\" class=\"dataframe\">\n",
 73 |        "  <thead>\n",
 74 |        "    <tr style=\"text-align: right;\">\n",
 75 |        "      <th></th>\n",
 76 |        "      <th>cust_id</th>\n",
 77 |        "      <th>pred_prob</th>\n",
 78 |        "    </tr>\n",
 79 |        "  </thead>\n",
 80 |        "  <tbody>\n",
 81 |        "    <tr>\n",
 82 |        "      <th>0</th>\n",
 83 |        "      <td>1</td>\n",
 84 |        "      <td>0.005983</td>\n",
 85 |        "    </tr>\n",
 86 |        "    <tr>\n",
 87 |        "      <th>1</th>\n",
 88 |        "      <td>2</td>\n",
 89 |        "      <td>0.025552</td>\n",
 90 |        "    </tr>\n",
 91 |        "    <tr>\n",
 92 |        "      <th>2</th>\n",
 93 |        "      <td>3</td>\n",
 94 |        "      <td>0.018066</td>\n",
 95 |        "    </tr>\n",
 96 |        "    <tr>\n",
 97 |        "      <th>3</th>\n",
 98 |        "      <td>4</td>\n",
 99 |        "      <td>0.035206</td>\n",
100 |        "    </tr>\n",
101 |        "    <tr>\n",
102 |        "      <th>4</th>\n",
103 |        "      <td>5</td>\n",
104 |        "      <td>0.067734</td>\n",
105 |        "    </tr>\n",
106 |        "  </tbody>\n",
107 |        "</table>\n",
108 |        "</div>"
109 |       ],
110 |       "text/plain": [
111 |        "   cust_id  pred_prob\n",
112 |        "0        1   0.005983\n",
113 |        "1        2   0.025552\n",
114 |        "2        3   0.018066\n",
115 |        "3        4   0.035206\n",
116 |        "4        5   0.067734"
117 |       ]
118 |      },
119 |      "execution_count": 6,
120 |      "metadata": {},
121 |      "output_type": "execute_result"
122 |     }
123 |    ],
124 |    "source": [
125 |     "nn.head()"
126 |    ]
127 |   },
128 |   {
129 |    "cell_type": "code",
130 |    "execution_count": 7,
131 |    "metadata": {
132 |     "collapsed": false
133 |    },
134 |    "outputs": [],
135 |    "source": [
136 |     "result = xgb.pred_prob*0.7 + lgb.pred_prob*0.1 + gbdt.pred_prob*0.1 + nn.pred_prob*0.1"
137 |    ]
138 |   },
139 |   {
140 |    "cell_type": "code",
141 |    "execution_count": 8,
142 |    "metadata": {
143 |     "collapsed": false
144 |    },
145 |    "outputs": [],
146 |    "source": [
147 |     "pred = nn.copy()  # work on a copy: aliasing nn would let a re-run of the blend cell read already-blended values\n",
148 |     "pred.pred_prob = result"
149 |    ]
150 |   },
151 |   {
152 |    "cell_type": "code",
153 |    "execution_count": 9,
154 |    "metadata": {
155 |     "collapsed": false
156 |    },
157 |    "outputs": [
158 |     {
159 |      "data": {
160 |       "text/html": [
161 |        "<div>\n",
162 |        "<table border=\"1\" class=\"dataframe\">\n",
163 |        "  <thead>\n",
164 |        "    <tr style=\"text-align: right;\">\n",
165 |        "      <th></th>\n",
166 |        "      <th>cust_id</th>\n",
167 |        "      <th>pred_prob</th>\n",
168 |        "    </tr>\n",
169 |        "  </thead>\n",
170 |        "  <tbody>\n",
171 |        "    <tr>\n",
172 |        "      <th>0</th>\n",
173 |        "      <td>1</td>\n",
174 |        "      <td>0.014872</td>\n",
175 |        "    </tr>\n",
176 |        "    <tr>\n",
177 |        "      <th>1</th>\n",
178 |        "      <td>2</td>\n",
179 |        "      <td>0.036687</td>\n",
180 |        "    </tr>\n",
181 |        "    <tr>\n",
182 |        "      <th>2</th>\n",
183 |        "      <td>3</td>\n",
184 |        "      <td>0.194181</td>\n",
185 |        "    </tr>\n",
186 |        "    <tr>\n",
187 |        "      <th>3</th>\n",
188 |        "      <td>4</td>\n",
189 |        "      <td>0.064974</td>\n",
190 |        "    </tr>\n",
191 |        "    <tr>\n",
192 |        "      <th>4</th>\n",
193 |        "      <td>5</td>\n",
194 |        "      <td>0.118509</td>\n",
195 |        "    </tr>\n",
196 |        "  </tbody>\n",
197 |        "</table>\n",
198 |        "</div>"
199 |       ],
200 |       "text/plain": [
201 |        "   cust_id  pred_prob\n",
202 |        "0        1   0.014872\n",
203 |        "1        2   0.036687\n",
204 |        "2        3   0.194181\n",
205 |        "3        4   0.064974\n",
206 |        "4        5   0.118509"
207 |       ]
208 |      },
209 |      "execution_count": 9,
210 |      "metadata": {},
211 |      "output_type": "execute_result"
212 |     }
213 |    ],
214 |    "source": [
215 |     "pred.head()"
216 |    ]
217 |   },
218 |   {
219 |    "cell_type": "code",
220 |    "execution_count": 10,
221 |    "metadata": {
222 |     "collapsed": true
223 |    },
224 |    "outputs": [],
225 |    "source": [
226 |     "pred.to_csv('../result/_ensemble.csv',index = False)"
227 |    ]
228 |   },
229 |   {
230 |    "cell_type": "code",
231 |    "execution_count": null,
232 |    "metadata": {
233 |     "collapsed": true
234 |    },
235 |    "outputs": [],
236 |    "source": []
237 |   }
238 |  ],
239 |  "metadata": {
240 |   "anaconda-cloud": {},
241 |   "kernelspec": {
242 |    "display_name": "Python [default]",
243 |    "language": "python",
244 |    "name": "python3"
245 |   },
246 |   "language_info": {
247 |    "codemirror_mode": {
248 |     "name": "ipython",
249 |     "version": 3
250 |    },
251 |    "file_extension": ".py",
252 |    "mimetype": "text/x-python",
253 |    "name": "python",
254 |    "nbconvert_exporter": "python",
255 |    "pygments_lexer": "ipython3",
256 |    "version": "3.5.2"
257 |   }
258 |  },
259 |  "nbformat": 4,
260 |  "nbformat_minor": 1
261 | }
262 | 


--------------------------------------------------------------------------------
/模型二代码b榜0.749880/code/.ipynb_checkpoints/semi_ensemble-checkpoint.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {
  7 |     "collapsed": true
  8 |    },
  9 |    "outputs": [],
 10 |    "source": [
 11 |     "import numpy as np\n",
 12 |     "import pandas as pd\n",
 13 |     "import matplotlib.pyplot as plt\n",
 14 |     "%matplotlib inline"
 15 |    ]
 16 |   },
 17 |   {
 18 |    "cell_type": "code",
 19 |    "execution_count": 2,
 20 |    "metadata": {
 21 |     "collapsed": true
 22 |    },
 23 |    "outputs": [],
 24 |    "source": [
 25 |     "xgb = pd.read_csv('../result/semi_xgb.csv')"
 26 |    ]
 27 |   },
 28 |   {
 29 |    "cell_type": "code",
 30 |    "execution_count": 3,
 31 |    "metadata": {
 32 |     "collapsed": true
 33 |    },
 34 |    "outputs": [],
 35 |    "source": [
 36 |     "lgb = pd.read_csv('../result/semi_lgb.csv')"
 37 |    ]
 38 |   },
 39 |   {
 40 |    "cell_type": "code",
 41 |    "execution_count": 4,
 42 |    "metadata": {
 43 |     "collapsed": true
 44 |    },
 45 |    "outputs": [],
 46 |    "source": [
 47 |     "gbdt = pd.read_csv('../result/semi_gbdt.csv')"
 48 |    ]
 49 |   },
 50 |   {
 51 |    "cell_type": "code",
 52 |    "execution_count": 5,
 53 |    "metadata": {
 54 |     "collapsed": true
 55 |    },
 56 |    "outputs": [],
 57 |    "source": [
 58 |     "nn = pd.read_csv('../result/semi_nn.csv')"
 59 |    ]
 60 |   },
 61 |   {
 62 |    "cell_type": "code",
 63 |    "execution_count": 6,
 64 |    "metadata": {
 65 |     "collapsed": false
 66 |    },
 67 |    "outputs": [
 68 |     {
 69 |      "data": {
 70 |       "text/html": [
 71 |        "<div>\n",
 72 |        "<table border=\"1\" class=\"dataframe\">\n",
 73 |        "  <thead>\n",
 74 |        "    <tr style=\"text-align: right;\">\n",
 75 |        "      <th></th>\n",
 76 |        "      <th>cust_id</th>\n",
 77 |        "      <th>pred_prob</th>\n",
 78 |        "    </tr>\n",
 79 |        "  </thead>\n",
 80 |        "  <tbody>\n",
 81 |        "    <tr>\n",
 82 |        "      <th>0</th>\n",
 83 |        "      <td>1</td>\n",
 84 |        "      <td>0.003583</td>\n",
 85 |        "    </tr>\n",
 86 |        "    <tr>\n",
 87 |        "      <th>1</th>\n",
 88 |        "      <td>2</td>\n",
 89 |        "      <td>0.069257</td>\n",
 90 |        "    </tr>\n",
 91 |        "    <tr>\n",
 92 |        "      <th>2</th>\n",
 93 |        "      <td>3</td>\n",
 94 |        "      <td>0.015288</td>\n",
 95 |        "    </tr>\n",
 96 |        "    <tr>\n",
 97 |        "      <th>3</th>\n",
 98 |        "      <td>4</td>\n",
 99 |        "      <td>0.084963</td>\n",
100 |        "    </tr>\n",
101 |        "    <tr>\n",
102 |        "      <th>4</th>\n",
103 |        "      <td>5</td>\n",
104 |        "      <td>0.064052</td>\n",
105 |        "    </tr>\n",
106 |        "  </tbody>\n",
107 |        "</table>\n",
108 |        "</div>"
109 |       ],
110 |       "text/plain": [
111 |        "   cust_id  pred_prob\n",
112 |        "0        1   0.003583\n",
113 |        "1        2   0.069257\n",
114 |        "2        3   0.015288\n",
115 |        "3        4   0.084963\n",
116 |        "4        5   0.064052"
117 |       ]
118 |      },
119 |      "execution_count": 6,
120 |      "metadata": {},
121 |      "output_type": "execute_result"
122 |     }
123 |    ],
124 |    "source": [
125 |     "nn.head()"
126 |    ]
127 |   },
128 |   {
129 |    "cell_type": "code",
130 |    "execution_count": 7,
131 |    "metadata": {
132 |     "collapsed": true
133 |    },
134 |    "outputs": [],
135 |    "source": [
136 |     "result = xgb.pred_prob*0.7 + lgb.pred_prob*0.1 + gbdt.pred_prob*0.1 + nn.pred_prob*0.1"
137 |    ]
138 |   },
139 |   {
140 |    "cell_type": "code",
141 |    "execution_count": 8,
142 |    "metadata": {
143 |     "collapsed": true
144 |    },
145 |    "outputs": [],
146 |    "source": [
147 |     "pred = nn.copy()  # work on a copy: aliasing nn would let a re-run of the blend cell read already-blended values\n",
148 |     "pred.pred_prob = result"
149 |    ]
150 |   },
151 |   {
152 |    "cell_type": "code",
153 |    "execution_count": 9,
154 |    "metadata": {
155 |     "collapsed": false
156 |    },
157 |    "outputs": [
158 |     {
159 |      "data": {
160 |       "text/html": [
161 |        "<div>\n",
162 |        "<table border=\"1\" class=\"dataframe\">\n",
163 |        "  <thead>\n",
164 |        "    <tr style=\"text-align: right;\">\n",
165 |        "      <th></th>\n",
166 |        "      <th>cust_id</th>\n",
167 |        "      <th>pred_prob</th>\n",
168 |        "    </tr>\n",
169 |        "  </thead>\n",
170 |        "  <tbody>\n",
171 |        "    <tr>\n",
172 |        "      <th>0</th>\n",
173 |        "      <td>1</td>\n",
174 |        "      <td>0.016251</td>\n",
175 |        "    </tr>\n",
176 |        "    <tr>\n",
177 |        "      <th>1</th>\n",
178 |        "      <td>2</td>\n",
179 |        "      <td>0.031600</td>\n",
180 |        "    </tr>\n",
181 |        "    <tr>\n",
182 |        "      <th>2</th>\n",
183 |        "      <td>3</td>\n",
184 |        "      <td>0.076029</td>\n",
185 |        "    </tr>\n",
186 |        "    <tr>\n",
187 |        "      <th>3</th>\n",
188 |        "      <td>4</td>\n",
189 |        "      <td>0.058965</td>\n",
190 |        "    </tr>\n",
191 |        "    <tr>\n",
192 |        "      <th>4</th>\n",
193 |        "      <td>5</td>\n",
194 |        "      <td>0.055669</td>\n",
195 |        "    </tr>\n",
196 |        "  </tbody>\n",
197 |        "</table>\n",
198 |        "</div>"
199 |       ],
200 |       "text/plain": [
201 |        "   cust_id  pred_prob\n",
202 |        "0        1   0.016251\n",
203 |        "1        2   0.031600\n",
204 |        "2        3   0.076029\n",
205 |        "3        4   0.058965\n",
206 |        "4        5   0.055669"
207 |       ]
208 |      },
209 |      "execution_count": 9,
210 |      "metadata": {},
211 |      "output_type": "execute_result"
212 |     }
213 |    ],
214 |    "source": [
215 |     "pred.head()"
216 |    ]
217 |   },
218 |   {
219 |    "cell_type": "code",
220 |    "execution_count": 10,
221 |    "metadata": {
222 |     "collapsed": true
223 |    },
224 |    "outputs": [],
225 |    "source": [
226 |     "pred.to_csv('../result/semi_ensemble.csv',index = False)"
227 |    ]
228 |   },
229 |   {
230 |    "cell_type": "code",
231 |    "execution_count": null,
232 |    "metadata": {
233 |     "collapsed": true
234 |    },
235 |    "outputs": [],
236 |    "source": []
237 |   }
238 |  ],
239 |  "metadata": {
240 |   "anaconda-cloud": {},
241 |   "kernelspec": {
242 |    "display_name": "Python [default]",
243 |    "language": "python",
244 |    "name": "python3"
245 |   },
246 |   "language_info": {
247 |    "codemirror_mode": {
248 |     "name": "ipython",
249 |     "version": 3
250 |    },
251 |    "file_extension": ".py",
252 |    "mimetype": "text/x-python",
253 |    "name": "python",
254 |    "nbconvert_exporter": "python",
255 |    "pygments_lexer": "ipython3",
256 |    "version": "3.5.2"
257 |   }
258 |  },
259 |  "nbformat": 4,
260 |  "nbformat_minor": 1
261 | }
262 | 


--------------------------------------------------------------------------------
/模型二代码b榜0.749880/code/lgb预测.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {
  7 |     "collapsed": true
  8 |    },
  9 |    "outputs": [],
 10 |    "source": [
 11 |     "import numpy as np\n",
 12 |     "import pandas as pd\n",
 13 |     "import matplotlib.pyplot as plt\n",
 14 |     "%matplotlib inline "
 15 |    ]
 16 |   },
 17 |   {
 18 |    "cell_type": "code",
 19 |    "execution_count": 4,
 20 |    "metadata": {
 21 |     "collapsed": false
 22 |    },
 23 |    "outputs": [],
 24 |    "source": [
 25 |     "train = pd.read_csv('../data/train_xy.csv')"
 26 |    ]
 27 |   },
 28 |   {
 29 |    "cell_type": "code",
 30 |    "execution_count": 5,
 31 |    "metadata": {
 32 |     "collapsed": false
 33 |    },
 34 |    "outputs": [],
 35 |    "source": [
 36 |     "test = pd.read_csv('../data/test_all.csv')"
 37 |    ]
 38 |   },
 39 |   {
 40 |    "cell_type": "code",
 41 |    "execution_count": 7,
 42 |    "metadata": {
 43 |     "collapsed": false
 44 |    },
 45 |    "outputs": [
 46 |     {
 47 |      "data": {
 48 |       "text/plain": [
 49 |        "(10000, 157)"
 50 |       ]
 51 |      },
 52 |      "execution_count": 7,
 53 |      "metadata": {},
 54 |      "output_type": "execute_result"
 55 |     }
 56 |    ],
 57 |    "source": [
 58 |     "id_cols = [c for c in train.columns if c.lstrip('\\ufeff') == 'cust_id']  # id header may or may not carry a UTF-8 BOM\n",
 59 |     "x_train = train.drop(['cust_group','y'] + id_cols,axis =1)  # keep only the feature columns\n",
 60 |     "x_test = test.drop(['cust_group','cust_id'],axis=1)\n",
 61 |     "x_test.shape"
 62 |    ]
 63 |   },
 64 |   {
 65 |    "cell_type": "code",
 66 |    "execution_count": 8,
 67 |    "metadata": {
 68 |     "collapsed": false
 69 |    },
 70 |    "outputs": [
 71 |     {
 72 |      "data": {
 73 |       "text/plain": [
 74 |        "(25000, 157)"
 75 |       ]
 76 |      },
 77 |      "execution_count": 8,
 78 |      "metadata": {},
 79 |      "output_type": "execute_result"
 80 |     }
 81 |    ],
 82 |    "source": [
 83 |     "x = pd.concat([x_train,x_test])\n",
 84 |     "x.shape"
 85 |    ]
 86 |   },
 87 |   {
 88 |    "cell_type": "code",
 89 |    "execution_count": 9,
 90 |    "metadata": {
 91 |     "collapsed": true
 92 |    },
 93 |    "outputs": [],
 94 |    "source": [
 95 |     "Y_train = train['y']"
 96 |    ]
 97 |   },
 98 |   {
 99 |    "cell_type": "code",
100 |    "execution_count": 10,
101 |    "metadata": {
102 |     "collapsed": false
103 |    },
104 |    "outputs": [],
105 |    "source": [
106 |     "onehot_cols = ['x_' + str(i) for i in range(96, 158)]  # columns x_96 .. x_157; the originals stay in x\n",
107 |     "# prefix=col with an empty separator names each dummy col + str(level), e.g. 'x_96' + '1'\n",
108 |     "onehot_frames = [pd.get_dummies(x[col], prefix=col, prefix_sep='') for col in onehot_cols]\n",
109 |     "x = pd.concat([x] + onehot_frames, axis=1)  # one concat instead of re-copying x on every iteration"
110 |    ]
111 |   },
112 |   {
113 |    "cell_type": "code",
114 |    "execution_count": 11,
115 |    "metadata": {
116 |     "collapsed": false
117 |    },
118 |    "outputs": [
119 |     {
120 |      "name": "stdout",
121 |      "output_type": "stream",
122 |      "text": [
123 |       "(15000, 355)\n",
124 |       "(10000, 355)\n"
125 |      ]
126 |     }
127 |    ],
128 |    "source": [
129 |     "n_train = len(x_train)  # split at the actual training row count instead of a hard-coded 15000\n",
130 |     "train_X, test_X = x.iloc[:n_train], x.iloc[n_train:]  # positional split: the concat kept duplicate index labels\n",
131 |     "print(train_X.shape)\n",
132 |     "print(test_X.shape)"
133 |    ]
134 |   },
135 |   {
136 |    "cell_type": "code",
137 |    "execution_count": 12,
138 |    "metadata": {
139 |     "collapsed": true
140 |    },
141 |    "outputs": [],
142 |    "source": [
143 |     "from sklearn.metrics import accuracy_score\n",
144 |     "from sklearn import metrics\n",
145 |     "from sklearn.model_selection import train_test_split\n",
146 |     "from xgboost import XGBClassifier\n",
147 |     "import lightgbm as lgb"
148 |    ]
149 |   },
150 |   {
151 |    "cell_type": "code",
152 |    "execution_count": 13,
153 |    "metadata": {
154 |     "collapsed": false
155 |    },
156 |    "outputs": [],
157 |    "source": [
158 |     "X_train,X_val,y_train,y_val= train_test_split(train_X,Y_train,test_size=0.2,random_state=2)"
159 |    ]
160 |   },
161 |   {
162 |    "cell_type": "code",
163 |    "execution_count": 14,
164 |    "metadata": {
165 |     "collapsed": false
166 |    },
167 |    "outputs": [],
168 |    "source": [
169 |     "lgb_train = lgb.Dataset(X_train, y_train)\n",
170 |     "lgb_val = lgb.Dataset(X_val, y_val, reference=lgb_train)"
171 |    ]
172 |   },
173 |   {
174 |    "cell_type": "code",
175 |    "execution_count": 15,
176 |    "metadata": {
177 |     "collapsed": false,
178 |     "scrolled": true
179 |    },
180 |    "outputs": [],
181 |    "source": [
182 |     "param = {\n",
183 |     "    'max_depth':6,\n",
184 |     "    'num_leaves':80,\n",
185 |     "    'learning_rate':0.03,\n",
186 |     "    'scale_pos_weight':1,\n",
187 |     "    'num_threads':40,\n",
188 |     "    'objective':'binary',\n",
189 |     "    'bagging_fraction':0.7,\n",
190 |     "    'bagging_freq':1,\n",
191 |     "    'min_sum_hessian_in_leaf':100\n",
192 |     "}\n",
193 |     "\n",
194 |     "param['is_unbalance']='true'\n",
195 |     "param['metric'] = 'auc'\n",
196 |     "\n",
197 |     "\n",
198 |     "bst=lgb.cv(param,lgb_train, num_boost_round=1000, nfold=3, early_stopping_rounds=30)\n",
199 |     "gbm = lgb.train(param,lgb_train,num_boost_round=len(bst['auc-mean']))\n"
200 |    ]
201 |   },
202 |   {
203 |    "cell_type": "code",
204 |    "execution_count": 16,
205 |    "metadata": {
206 |     "collapsed": false
207 |    },
208 |    "outputs": [
209 |     {
210 |      "name": "stdout",
211 |      "output_type": "stream",
212 |      "text": [
213 |       "0.809795678593\n"
214 |      ]
215 |     }
216 |    ],
217 |    "source": [
218 |     "ypred = gbm.predict(X_val)\n",
219 |     "val_auc = metrics.roc_auc_score(y_val,ypred)  # AUC on the validation set\n",
220 |     "print(val_auc)"
221 |    ]
222 |   },
223 |   {
224 |    "cell_type": "code",
225 |    "execution_count": 17,
226 |    "metadata": {
227 |     "collapsed": false
228 |    },
229 |    "outputs": [],
230 |    "source": [
231 |     "y_pred = gbm.predict(test_X)"
232 |    ]
233 |   },
234 |   {
235 |    "cell_type": "code",
236 |    "execution_count": 18,
237 |    "metadata": {
238 |     "collapsed": true
239 |    },
240 |    "outputs": [],
241 |    "source": [
242 |     "Submission = pd.DataFrame({'cust_id': test['cust_id'], 'pred_prob': y_pred})\n",
243 |     "Submission.to_csv('../result/lgb.csv',index=False)"
244 |    ]
245 |   },
246 |   {
247 |    "cell_type": "code",
248 |    "execution_count": null,
249 |    "metadata": {
250 |     "collapsed": true
251 |    },
252 |    "outputs": [],
253 |    "source": []
254 |   }
255 |  ],
256 |  "metadata": {
257 |   "anaconda-cloud": {},
258 |   "kernelspec": {
259 |    "display_name": "Python [default]",
260 |    "language": "python",
261 |    "name": "python3"
262 |   },
263 |   "language_info": {
264 |    "codemirror_mode": {
265 |     "name": "ipython",
266 |     "version": 3
267 |    },
268 |    "file_extension": ".py",
269 |    "mimetype": "text/x-python",
270 |    "name": "python",
271 |    "nbconvert_exporter": "python",
272 |    "pygments_lexer": "ipython3",
273 |    "version": "3.5.2"
274 |   }
275 |  },
276 |  "nbformat": 4,
277 |  "nbformat_minor": 1
278 | }
279 | 


--------------------------------------------------------------------------------
/模型二代码b榜0.749880/code/.ipynb_checkpoints/lgb预测-checkpoint.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {
  7 |     "collapsed": true
  8 |    },
  9 |    "outputs": [],
 10 |    "source": [
 11 |     "import numpy as np\n",
 12 |     "import pandas as pd\n",
 13 |     "import matplotlib.pyplot as plt\n",
 14 |     "%matplotlib inline "
 15 |    ]
 16 |   },
 17 |   {
 18 |    "cell_type": "code",
 19 |    "execution_count": 4,
 20 |    "metadata": {
 21 |     "collapsed": false
 22 |    },
 23 |    "outputs": [],
 24 |    "source": [
 25 |     "train = pd.read_csv('../data/train_xy.csv')"
 26 |    ]
 27 |   },
 28 |   {
 29 |    "cell_type": "code",
 30 |    "execution_count": 5,
 31 |    "metadata": {
 32 |     "collapsed": false
 33 |    },
 34 |    "outputs": [],
 35 |    "source": [
 36 |     "test = pd.read_csv('../data/test_all.csv')"
 37 |    ]
 38 |   },
 39 |   {
 40 |    "cell_type": "code",
 41 |    "execution_count": 7,
 42 |    "metadata": {
 43 |     "collapsed": false
 44 |    },
 45 |    "outputs": [
 46 |     {
 47 |      "data": {
 48 |       "text/plain": [
 49 |        "(10000, 157)"
 50 |       ]
 51 |      },
 52 |      "execution_count": 7,
 53 |      "metadata": {},
 54 |      "output_type": "execute_result"
 55 |     }
 56 |    ],
 57 |    "source": [
 58 |     "x_train = train.drop(['cust_group','y','\\ufeffcust_id'],axis =1)\n",
 59 |     "x_test = test.drop(['cust_group','cust_id'],axis=1)\n",
 60 |     "x_train.shape\n",
 61 |     "x_test.shape"
 62 |    ]
 63 |   },
 64 |   {
 65 |    "cell_type": "code",
 66 |    "execution_count": 8,
 67 |    "metadata": {
 68 |     "collapsed": false
 69 |    },
 70 |    "outputs": [
 71 |     {
 72 |      "data": {
 73 |       "text/plain": [
 74 |        "(25000, 157)"
 75 |       ]
 76 |      },
 77 |      "execution_count": 8,
 78 |      "metadata": {},
 79 |      "output_type": "execute_result"
 80 |     }
 81 |    ],
 82 |    "source": [
 83 |     "x = pd.concat([x_train,x_test])\n",
 84 |     "x.shape"
 85 |    ]
 86 |   },
 87 |   {
 88 |    "cell_type": "code",
 89 |    "execution_count": 9,
 90 |    "metadata": {
 91 |     "collapsed": true
 92 |    },
 93 |    "outputs": [],
 94 |    "source": [
 95 |     "Y_train = train['y']"
 96 |    ]
 97 |   },
 98 |   {
 99 |    "cell_type": "code",
100 |    "execution_count": 10,
101 |    "metadata": {
102 |     "collapsed": false
103 |    },
104 |    "outputs": [],
105 |    "source": [
106 |     "for i in range(96,158):\n",
107 |     "    col = 'x'+'_'+str(i)\n",
108 |     "    dummies_df = pd.get_dummies(x[col]).rename(columns=lambda x: col + str(x))\n",
109 |     "    x = pd.concat([x, dummies_df], axis=1)"
110 |    ]
111 |   },
112 |   {
113 |    "cell_type": "code",
114 |    "execution_count": 11,
115 |    "metadata": {
116 |     "collapsed": false
117 |    },
118 |    "outputs": [
119 |     {
120 |      "name": "stdout",
121 |      "output_type": "stream",
122 |      "text": [
123 |       "(15000, 355)\n",
124 |       "(10000, 355)\n"
125 |      ]
126 |     }
127 |    ],
128 |    "source": [
129 |     "train_X = x[0:15000]\n",
130 |     "test_X = x[15000:25000]\n",
131 |     "print(train_X.shape)\n",
132 |     "print(test_X.shape)"
133 |    ]
134 |   },
135 |   {
136 |    "cell_type": "code",
137 |    "execution_count": 12,
138 |    "metadata": {
139 |     "collapsed": true
140 |    },
141 |    "outputs": [],
142 |    "source": [
143 |     "from sklearn.metrics import accuracy_score\n",
144 |     "from sklearn import metrics\n",
145 |     "from sklearn.model_selection import train_test_split\n",
146 |     "from xgboost import XGBClassifier\n",
147 |     "import lightgbm as lgb"
148 |    ]
149 |   },
150 |   {
151 |    "cell_type": "code",
152 |    "execution_count": 13,
153 |    "metadata": {
154 |     "collapsed": false
155 |    },
156 |    "outputs": [],
157 |    "source": [
158 |     "X_train,X_val,y_train,y_val= train_test_split(train_X,Y_train,test_size=0.2,random_state=2)"
159 |    ]
160 |   },
161 |   {
162 |    "cell_type": "code",
163 |    "execution_count": 14,
164 |    "metadata": {
165 |     "collapsed": false
166 |    },
167 |    "outputs": [],
168 |    "source": [
169 |     "lgb_train = lgb.Dataset(X_train, y_train)\n",
170 |     "lgb_val = lgb.Dataset(X_val, y_val, reference=lgb_train)"
171 |    ]
172 |   },
173 |   {
174 |    "cell_type": "code",
175 |    "execution_count": 15,
176 |    "metadata": {
177 |     "collapsed": false,
178 |     "scrolled": true
179 |    },
180 |    "outputs": [],
181 |    "source": [
182 |     "param = {\n",
183 |     "    'max_depth':6,\n",
184 |     "    'num_leaves':80,\n",
185 |     "    'learning_rate':0.03,\n",
186 |     "    'scale_pos_weight':1,\n",
187 |     "    'num_threads':40,\n",
188 |     "    'objective':'binary',\n",
189 |     "    'bagging_fraction':0.7,\n",
190 |     "    'bagging_freq':1,\n",
191 |     "    'min_sum_hessian_in_leaf':100\n",
192 |     "}\n",
193 |     "\n",
194 |     "param['is_unbalance']='true'\n",
195 |     "param['metric'] = 'auc'\n",
196 |     "\n",
197 |     "\n",
198 |     "bst=lgb.cv(param,lgb_train, num_boost_round=1000, nfold=3, early_stopping_rounds=30)\n",
199 |     "gbm = lgb.train(param,lgb_train,num_boost_round=len(bst['auc-mean']))\n"
200 |    ]
201 |   },
202 |   {
203 |    "cell_type": "code",
204 |    "execution_count": 16,
205 |    "metadata": {
206 |     "collapsed": false
207 |    },
208 |    "outputs": [
209 |     {
210 |      "name": "stdout",
211 |      "output_type": "stream",
212 |      "text": [
213 |       "0.809795678593\n"
214 |      ]
215 |     }
216 |    ],
217 |    "source": [
218 |     "ypred = gbm.predict(X_val)\n",
219 |     "val_auc = metrics.roc_auc_score(y_val,ypred)#验证集上的auc值\n",
220 |     "print(val_auc)"
221 |    ]
222 |   },
223 |   {
224 |    "cell_type": "code",
225 |    "execution_count": 17,
226 |    "metadata": {
227 |     "collapsed": false
228 |    },
229 |    "outputs": [],
230 |    "source": [
231 |     "y_pred = gbm.predict(test_X)"
232 |    ]
233 |   },
234 |   {
235 |    "cell_type": "code",
236 |    "execution_count": 18,
237 |    "metadata": {
238 |     "collapsed": true
239 |    },
240 |    "outputs": [],
241 |    "source": [
242 |     "Submission = pd.DataFrame({'cust_id': test['cust_id'], 'pred_prob': y_pred})\n",
243 |     "Submission.to_csv('../result/lgb.csv',index=False)"
244 |    ]
245 |   },
246 |   {
247 |    "cell_type": "code",
248 |    "execution_count": null,
249 |    "metadata": {
250 |     "collapsed": true
251 |    },
252 |    "outputs": [],
253 |    "source": []
254 |   }
255 |  ],
256 |  "metadata": {
257 |   "anaconda-cloud": {},
258 |   "kernelspec": {
259 |    "display_name": "Python [default]",
260 |    "language": "python",
261 |    "name": "python3"
262 |   },
263 |   "language_info": {
264 |    "codemirror_mode": {
265 |     "name": "ipython",
266 |     "version": 3
267 |    },
268 |    "file_extension": ".py",
269 |    "mimetype": "text/x-python",
270 |    "name": "python",
271 |    "nbconvert_exporter": "python",
272 |    "pygments_lexer": "ipython3",
273 |    "version": "3.5.2"
274 |   }
275 |  },
276 |  "nbformat": 4,
277 |  "nbformat_minor": 1
278 | }
279 | 


--------------------------------------------------------------------------------
/模型二代码b榜0.749880/code/.ipynb_checkpoints/xgb预测-checkpoint.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {
  7 |     "collapsed": true
  8 |    },
  9 |    "outputs": [],
 10 |    "source": [
 11 |     "import numpy as np\n",
 12 |     "import pandas as pd\n",
 13 |     "import matplotlib.pyplot as plt\n",
 14 |     "%matplotlib inline"
 15 |    ]
 16 |   },
 17 |   {
 18 |    "cell_type": "code",
 19 |    "execution_count": 4,
 20 |    "metadata": {
 21 |     "collapsed": false
 22 |    },
 23 |    "outputs": [],
 24 |    "source": [
 25 |     "train = pd.read_csv('../data/train_xy.csv')"
 26 |    ]
 27 |   },
 28 |   {
 29 |    "cell_type": "code",
 30 |    "execution_count": 5,
 31 |    "metadata": {
 32 |     "collapsed": false
 33 |    },
 34 |    "outputs": [],
 35 |    "source": [
 36 |     "test = pd.read_csv('../data/test_all.csv')"
 37 |    ]
 38 |   },
 39 |   {
 40 |    "cell_type": "code",
 41 |    "execution_count": 7,
 42 |    "metadata": {
 43 |     "collapsed": false
 44 |    },
 45 |    "outputs": [
 46 |     {
 47 |      "data": {
 48 |       "text/plain": [
 49 |        "(10000, 157)"
 50 |       ]
 51 |      },
 52 |      "execution_count": 7,
 53 |      "metadata": {},
 54 |      "output_type": "execute_result"
 55 |     }
 56 |    ],
 57 |    "source": [
 58 |     "x_train = train.drop(['cust_group','y','\\ufeffcust_id'],axis =1)\n",
 59 |     "x_test = test.drop(['cust_group','cust_id'],axis=1)\n",
 60 |     "x_train.shape\n",
 61 |     "x_test.shape"
 62 |    ]
 63 |   },
 64 |   {
 65 |    "cell_type": "code",
 66 |    "execution_count": 8,
 67 |    "metadata": {
 68 |     "collapsed": false
 69 |    },
 70 |    "outputs": [
 71 |     {
 72 |      "data": {
 73 |       "text/plain": [
 74 |        "(25000, 157)"
 75 |       ]
 76 |      },
 77 |      "execution_count": 8,
 78 |      "metadata": {},
 79 |      "output_type": "execute_result"
 80 |     }
 81 |    ],
 82 |    "source": [
 83 |     "x = pd.concat([x_train,x_test])\n",
 84 |     "x.shape"
 85 |    ]
 86 |   },
 87 |   {
 88 |    "cell_type": "code",
 89 |    "execution_count": 9,
 90 |    "metadata": {
 91 |     "collapsed": true
 92 |    },
 93 |    "outputs": [],
 94 |    "source": [
 95 |     "Y_train = train['y']"
 96 |    ]
 97 |   },
 98 |   {
 99 |    "cell_type": "code",
100 |    "execution_count": 10,
101 |    "metadata": {
102 |     "collapsed": false
103 |    },
104 |    "outputs": [],
105 |    "source": [
106 |     "for i in range(96,158):\n",
107 |     "    col = 'x'+'_'+str(i)\n",
108 |     "    dummies_df = pd.get_dummies(x[col]).rename(columns=lambda x: col + str(x))\n",
109 |     "    x = pd.concat([x, dummies_df], axis=1)"
110 |    ]
111 |   },
112 |   {
113 |    "cell_type": "code",
114 |    "execution_count": 11,
115 |    "metadata": {
116 |     "collapsed": false
117 |    },
118 |    "outputs": [
119 |     {
120 |      "name": "stdout",
121 |      "output_type": "stream",
122 |      "text": [
123 |       "(15000, 355)\n",
124 |       "(10000, 355)\n"
125 |      ]
126 |     }
127 |    ],
128 |    "source": [
129 |     "train_X = x[0:15000]\n",
130 |     "test_X = x[15000:25000]\n",
131 |     "print(train_X.shape)\n",
132 |     "print(test_X.shape)"
133 |    ]
134 |   },
135 |   {
136 |    "cell_type": "code",
137 |    "execution_count": 12,
138 |    "metadata": {
139 |     "collapsed": false
140 |    },
141 |    "outputs": [
142 |     {
143 |      "name": "stderr",
144 |      "output_type": "stream",
145 |      "text": [
146 |       "C:\\Program Files\\Anaconda3\\lib\\site-packages\\sklearn\\cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n",
147 |       "  \"This module will be removed in 0.20.\", DeprecationWarning)\n",
148 |       "Using TensorFlow backend.\n"
149 |      ]
150 |     }
151 |    ],
152 |    "source": [
153 |     "from sklearn.tree import DecisionTreeClassifier\n",
154 |     "from sklearn.ensemble import RandomForestClassifier\n",
155 |     "from sklearn.ensemble import AdaBoostClassifier\n",
156 |     "from sklearn.ensemble import ExtraTreesClassifier\n",
157 |     "from sklearn.ensemble import GradientBoostingClassifier\n",
158 |     "from sklearn.neighbors import KNeighborsClassifier\n",
159 |     "from sklearn.svm import SVC\n",
160 |     "from sklearn import metrics  #accuracy_score,recall_score,f1_score\n",
161 |     "from sklearn.metrics import classification_report\n",
162 |     "from sklearn.metrics import precision_recall_fscore_support\n",
163 |     "from sklearn.utils.multiclass import unique_labels\n",
164 |     "from sklearn.metrics import accuracy_score\n",
165 |     "from xgboost import XGBClassifier\n",
166 |     "from sklearn.ensemble import GradientBoostingClassifier\n",
167 |     "from sklearn.model_selection import cross_val_score  # sklearn.cross_validation was removed in scikit-learn 0.20\n",
168 |     "from lightgbm import LGBMClassifier\n",
169 |     "from sklearn.model_selection import train_test_split, GridSearchCV, KFold\n",
170 |     "from sklearn.svm import LinearSVC\n",
171 |     "from sklearn import linear_model\n",
172 |     "import lightgbm as lgb\n",
173 |     "import xgboost as xgb\n",
174 |     "\n",
175 |     "from keras.models import Model\n",
176 |     "from keras.layers import Dense, Input"
177 |    ]
178 |   },
179 |   {
180 |    "cell_type": "code",
181 |    "execution_count": 14,
182 |    "metadata": {
183 |     "collapsed": false
184 |    },
185 |    "outputs": [],
186 |    "source": [
187 |     "X_train,X_val,y_train,y_val= train_test_split(train_X,Y_train,test_size=0.2,random_state=2)"
188 |    ]
189 |   },
190 |   {
191 |    "cell_type": "code",
192 |    "execution_count": 15,
193 |    "metadata": {
194 |     "collapsed": true
195 |    },
196 |    "outputs": [],
197 |    "source": [
198 |     "xgb_train = xgb.DMatrix(X_train, y_train)\n",
199 |     "xgb_val = xgb.DMatrix(X_val, y_val)\n",
200 |     "xgb_test = xgb.DMatrix(test_X)"
201 |    ]
202 |   },
203 |   {
204 |    "cell_type": "code",
205 |    "execution_count": 29,
206 |    "metadata": {
207 |     "collapsed": false
208 |    },
209 |    "outputs": [],
210 |    "source": [
211 |     "param = {\n",
212 |     "    'booster':'gbtree',\n",
213 |     "    'max_depth':4,\n",
214 |     "    'max_leaves':50,\n",
215 |     "    'learning_rate':0.05,\n",
216 |     "    'scale_pos_weight':1,\n",
217 |     "    'nthread':40,\n",
218 |     "    'objective':'binary:logistic',\n",
219 |     "    'subsample':0.7,\n",
220 |     "    'eval_metric':'auc',\n",
221 |     "    'min_child_weight':100,\n",
222 |     "}\n",
223 |     "\n",
224 |     "# The dict above uses XGBoost's own parameter names; the LightGBM spellings used before are not XGBoost parameters.\n",
225 |     "# XGBoost has no is_unbalance/bagging_freq: class weighting stays at scale_pos_weight, subsample is redrawn each round.\n",
226 |     "\n",
227 |     "bst=xgb.cv(param,xgb_train, num_boost_round=1000, nfold=5, early_stopping_rounds=100)\n",
228 |     "gbm = xgb.train(param,xgb_train,num_boost_round=bst.shape[0])"
229 |    ]
230 |   },
231 |   {
232 |    "cell_type": "code",
233 |    "execution_count": 30,
234 |    "metadata": {
235 |     "collapsed": false
236 |    },
237 |    "outputs": [
238 |     {
239 |      "name": "stdout",
240 |      "output_type": "stream",
241 |      "text": [
242 |       "0.806100532112\n"
243 |      ]
244 |     }
245 |    ],
246 |    "source": [
247 |     "ypred = gbm.predict(xgb_val)\n",
248 |     "val_auc = metrics.roc_auc_score(y_val,ypred)#验证集上的auc值\n",
249 |     "print(val_auc)"
250 |    ]
251 |   },
252 |   {
253 |    "cell_type": "code",
254 |    "execution_count": 31,
255 |    "metadata": {
256 |     "collapsed": false
257 |    },
258 |    "outputs": [
259 |     {
260 |      "data": {
261 |       "text/plain": [
262 |        "(10000,)"
263 |       ]
264 |      },
265 |      "execution_count": 31,
266 |      "metadata": {},
267 |      "output_type": "execute_result"
268 |     }
269 |    ],
270 |    "source": [
271 |     "pred = gbm.predict(xgb_test)\n",
272 |     "pred.shape"
273 |    ]
274 |   },
275 |   {
276 |    "cell_type": "code",
277 |    "execution_count": 32,
278 |    "metadata": {
279 |     "collapsed": true
280 |    },
281 |    "outputs": [],
282 |    "source": [
283 |     "Submission = pd.DataFrame({'cust_id': test['cust_id'], 'pred_prob': pred})\n",
284 |     "Submission.to_csv('../result/xgb.csv',index=False)"
285 |    ]
286 |   },
287 |   {
288 |    "cell_type": "code",
289 |    "execution_count": null,
290 |    "metadata": {
291 |     "collapsed": true
292 |    },
293 |    "outputs": [],
294 |    "source": []
295 |   }
296 |  ],
297 |  "metadata": {
298 |   "anaconda-cloud": {},
299 |   "kernelspec": {
300 |    "display_name": "Python [default]",
301 |    "language": "python",
302 |    "name": "python3"
303 |   },
304 |   "language_info": {
305 |    "codemirror_mode": {
306 |     "name": "ipython",
307 |     "version": 3
308 |    },
309 |    "file_extension": ".py",
310 |    "mimetype": "text/x-python",
311 |    "name": "python",
312 |    "nbconvert_exporter": "python",
313 |    "pygments_lexer": "ipython3",
314 |    "version": "3.5.2"
315 |   }
316 |  },
317 |  "nbformat": 4,
318 |  "nbformat_minor": 1
319 | }
320 | 


--------------------------------------------------------------------------------
/模型二代码b榜0.749880/code/semi_xgb预测.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {
  7 |     "collapsed": true
  8 |    },
  9 |    "outputs": [],
 10 |    "source": [
 11 |     "import numpy as np\n",
 12 |     "import pandas as pd\n",
 13 |     "import matplotlib.pyplot as plt\n",
 14 |     "%matplotlib inline"
 15 |    ]
 16 |   },
 17 |   {
 18 |    "cell_type": "code",
 19 |    "execution_count": 2,
 20 |    "metadata": {
 21 |     "collapsed": false
 22 |    },
 23 |    "outputs": [
 24 |     {
 25 |      "data": {
 26 |       "text/plain": [
 27 |        "(10000, 160)"
 28 |       ]
 29 |      },
 30 |      "execution_count": 2,
 31 |      "metadata": {},
 32 |      "output_type": "execute_result"
 33 |     }
 34 |    ],
 35 |    "source": [
 36 |     "train_x1 = pd.read_csv('../data/train_ssvm_xy.csv')\n",
 37 |     "train_x1.shape"
 38 |    ]
 39 |   },
 40 |   {
 41 |    "cell_type": "code",
 42 |    "execution_count": 3,
 43 |    "metadata": {
 44 |     "collapsed": false
 45 |    },
 46 |    "outputs": [
 47 |     {
 48 |      "data": {
 49 |       "text/plain": [
 50 |        "(15000, 160)"
 51 |       ]
 52 |      },
 53 |      "execution_count": 3,
 54 |      "metadata": {},
 55 |      "output_type": "execute_result"
 56 |     }
 57 |    ],
 58 |    "source": [
 59 |     "train_x2 = pd.read_csv('../data/train_xy.csv')\n",
 60 |     "train_x2.shape"
 61 |    ]
 62 |   },
 63 |   {
 64 |    "cell_type": "code",
 65 |    "execution_count": 4,
 66 |    "metadata": {
 67 |     "collapsed": false
 68 |    },
 69 |    "outputs": [
 70 |     {
 71 |      "data": {
 72 |       "text/plain": [
 73 |        "(10000, 159)"
 74 |       ]
 75 |      },
 76 |      "execution_count": 4,
 77 |      "metadata": {},
 78 |      "output_type": "execute_result"
 79 |     }
 80 |    ],
 81 |    "source": [
 82 |     "test = pd.read_csv('../data/test_all.csv')\n",
 83 |     "test.shape"
 84 |    ]
 85 |   },
 86 |   {
 87 |    "cell_type": "code",
 88 |    "execution_count": 5,
 89 |    "metadata": {
 90 |     "collapsed": false
 91 |    },
 92 |    "outputs": [
 93 |     {
 94 |      "data": {
 95 |       "text/plain": [
 96 |        "(25000, 157)"
 97 |       ]
 98 |      },
 99 |      "execution_count": 5,
100 |      "metadata": {},
101 |      "output_type": "execute_result"
102 |     }
103 |    ],
104 |    "source": [
105 |     "train_x11 = train_x1.drop(['cust_id','cust_group','y'],axis=1)\n",
106 |     "train_x22 = train_x2.drop(['\\ufeffcust_id','cust_group','y'],axis=1)\n",
107 |     "train_x = pd.concat([train_x11, train_x22])\n",
108 |     "train_x.shape"
109 |    ]
110 |   },
111 |   {
112 |    "cell_type": "code",
113 |    "execution_count": 6,
114 |    "metadata": {
115 |     "collapsed": false
116 |    },
117 |    "outputs": [
118 |     {
119 |      "data": {
120 |       "text/plain": [
121 |        "(10000, 157)"
122 |       ]
123 |      },
124 |      "execution_count": 6,
125 |      "metadata": {},
126 |      "output_type": "execute_result"
127 |     }
128 |    ],
129 |    "source": [
130 |     "test_x = test.drop(['cust_id','cust_group'],axis=1)\n",
131 |     "test_x.shape"
132 |    ]
133 |   },
134 |   {
135 |    "cell_type": "code",
136 |    "execution_count": 7,
137 |    "metadata": {
138 |     "collapsed": false
139 |    },
140 |    "outputs": [
141 |     {
142 |      "data": {
143 |       "text/plain": [
144 |        "(35000, 157)"
145 |       ]
146 |      },
147 |      "execution_count": 7,
148 |      "metadata": {},
149 |      "output_type": "execute_result"
150 |     }
151 |    ],
152 |    "source": [
153 |     "x = pd.concat([train_x, test_x])\n",
154 |     "x.shape"
155 |    ]
156 |   },
157 |   {
158 |    "cell_type": "code",
159 |    "execution_count": 8,
160 |    "metadata": {
161 |     "collapsed": false
162 |    },
163 |    "outputs": [
164 |     {
165 |      "data": {
166 |       "text/plain": [
167 |        "(25000,)"
168 |       ]
169 |      },
170 |      "execution_count": 8,
171 |      "metadata": {},
172 |      "output_type": "execute_result"
173 |     }
174 |    ],
175 |    "source": [
176 |     "train_y1 = train_x1['y']  # labels are stacked in the same order as the rows of train_x: train_x1 first\n",
177 |     "train_y2 = train_x2['y']\n",
178 |     "Y_train = pd.concat([train_y1, train_y2])  # Series.append was removed in pandas 2.0\n",
179 |     "Y_train.shape"
180 |    ]
181 |   },
182 |   {
183 |    "cell_type": "code",
184 |    "execution_count": 9,
185 |    "metadata": {
186 |     "collapsed": false
187 |    },
188 |    "outputs": [
189 |     {
190 |      "name": "stdout",
191 |      "output_type": "stream",
192 |      "text": [
193 |       "(35000, 364)\n"
194 |      ]
195 |     }
196 |    ],
197 |    "source": [
198 |     "for i in range(96,158):\n",
199 |     "    col = 'x'+'_'+str(i)\n",
200 |     "    if col in x.columns.values:\n",
201 |     "        dummies_df = pd.get_dummies(x[col]).rename(columns=lambda x: col +'_'+ str(x))\n",
202 |     "        x = pd.concat([x, dummies_df], axis=1)\n",
203 |     "print(x.shape)"
204 |    ]
205 |   },
206 |   {
207 |    "cell_type": "code",
208 |    "execution_count": 10,
209 |    "metadata": {
210 |     "collapsed": false
211 |    },
212 |    "outputs": [
213 |     {
214 |      "name": "stdout",
215 |      "output_type": "stream",
216 |      "text": [
217 |       "(25000, 364)\n",
218 |       "(10000, 364)\n"
219 |      ]
220 |     }
221 |    ],
222 |    "source": [
223 |     "train_X = x[0:25000]\n",
224 |     "test_X = x[25000:35000]\n",
225 |     "print(train_X.shape)\n",
226 |     "print(test_X.shape)"
227 |    ]
228 |   },
229 |   {
230 |    "cell_type": "code",
231 |    "execution_count": 11,
232 |    "metadata": {
233 |     "collapsed": false
234 |    },
235 |    "outputs": [
236 |     {
237 |      "name": "stderr",
238 |      "output_type": "stream",
239 |      "text": [
240 |       "C:\\Program Files\\Anaconda3\\lib\\site-packages\\sklearn\\cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n",
241 |       "  \"This module will be removed in 0.20.\", DeprecationWarning)\n",
242 |       "Using TensorFlow backend.\n"
243 |      ]
244 |     }
245 |    ],
246 |    "source": [
247 |     "from sklearn.tree import DecisionTreeClassifier\n",
248 |     "from sklearn.ensemble import RandomForestClassifier\n",
249 |     "from sklearn.ensemble import AdaBoostClassifier\n",
250 |     "from sklearn.ensemble import ExtraTreesClassifier\n",
251 |     "from sklearn.ensemble import GradientBoostingClassifier\n",
252 |     "from sklearn.neighbors import KNeighborsClassifier\n",
253 |     "from sklearn.svm import SVC\n",
254 |     "from sklearn import metrics  #accuracy_score,recall_score,f1_score\n",
255 |     "from sklearn.metrics import classification_report\n",
256 |     "from sklearn.metrics import precision_recall_fscore_support\n",
257 |     "from sklearn.utils.multiclass import unique_labels\n",
258 |     "from sklearn.metrics import accuracy_score\n",
259 |     "from xgboost import XGBClassifier\n",
260 |     "from sklearn.ensemble import GradientBoostingClassifier\n",
261 |     "from sklearn.model_selection import cross_val_score  # sklearn.cross_validation was removed in scikit-learn 0.20\n",
262 |     "from lightgbm import LGBMClassifier\n",
263 |     "from sklearn.model_selection import train_test_split, GridSearchCV, KFold\n",
264 |     "from sklearn.svm import LinearSVC\n",
265 |     "from sklearn import linear_model\n",
266 |     "import lightgbm as lgb\n",
267 |     "import xgboost as xgb\n",
268 |     "\n",
269 |     "from keras.models import Model\n",
270 |     "from keras.layers import Dense, Input"
271 |    ]
272 |   },
273 |   {
274 |    "cell_type": "code",
275 |    "execution_count": 12,
276 |    "metadata": {
277 |     "collapsed": false,
278 |     "scrolled": true
279 |    },
280 |    "outputs": [],
281 |    "source": [
282 |     "# encoding_dim = 600\n",
283 |     "# input_dim = Input(shape=(364,))\n",
284 |     "\n",
285 |     "# encoded = Dense(364, activation='linear')(input_dim)\n",
286 |     "# # encoded = Dense(300, activation='relu')(encoded)\n",
287 |     "# # encoded = Dense(32, activation='relu')(encoded)\n",
288 |     "# encoder_output = Dense(encoding_dim)(encoded)\n",
289 |     "\n",
290 |     "# decoded = Dense(600, activation='relu')(encoder_output)\n",
291 |     "# # decoded = Dense(64, activation='relu')(decoded)\n",
292 |     "# # decoded = Dense(128, activation='relu')(decoded)\n",
293 |     "# decoded = Dense(364, activation='tanh')(decoded)\n",
294 |     "\n",
295 |     "# autoencoder = Model(inputs=input_dim, outputs=decoded)\n",
296 |     "\n",
297 |     "# encoder = Model(inputs=input_dim, outputs=encoder_output)\n",
298 |     "\n",
299 |     "# autoencoder.compile(optimizer='adam', loss='mse')\n",
300 |     "# # training\n",
301 |     "# autoencoder.fit(train_X.values, train_X.values, epochs=20, batch_size=150, shuffle=True)"
302 |    ]
303 |   },
304 |   {
305 |    "cell_type": "code",
306 |    "execution_count": 13,
307 |    "metadata": {
308 |     "collapsed": false
309 |    },
310 |    "outputs": [],
311 |    "source": [
312 |     "# new_train_feature = encoder.predict(train_X.values)\n",
313 |     "# new_test_feature = encoder.predict(test_X.values)\n",
314 |     "# print(new_train_feature.shape)\n",
315 |     "# print(new_test_feature.shape)"
316 |    ]
317 |   },
318 |   {
319 |    "cell_type": "code",
320 |    "execution_count": 14,
321 |    "metadata": {
322 |     "collapsed": false
323 |    },
324 |    "outputs": [],
325 |    "source": [
326 |     "X_train,X_val,y_train,y_val= train_test_split(train_X,Y_train,test_size=0.2,random_state=2)"
327 |    ]
328 |   },
329 |   {
330 |    "cell_type": "code",
331 |    "execution_count": 15,
332 |    "metadata": {
333 |     "collapsed": false
334 |    },
335 |    "outputs": [],
336 |    "source": [
337 |     "xgb_train = xgb.DMatrix(X_train, y_train)\n",
338 |     "xgb_val = xgb.DMatrix(X_val, y_val)\n",
339 |     "xgb_test = xgb.DMatrix(test_X)"
340 |    ]
341 |   },
342 |   {
343 |    "cell_type": "code",
344 |    "execution_count": 16,
345 |    "metadata": {
346 |     "collapsed": false
347 |    },
348 |    "outputs": [],
349 |    "source": [
350 |     "param = {\n",
351 |     "    'booster':'gbtree',\n",
352 |     "    'max_depth':10,\n",
353 |     "    'max_leaves':80,\n",
354 |     "    'learning_rate':0.03,\n",
355 |     "    'scale_pos_weight':1,\n",
356 |     "    'nthread':40,\n",
357 |     "    'objective':'binary:logistic',\n",
358 |     "    'subsample':0.7,\n",
359 |     "    'eval_metric':'auc',\n",
360 |     "    'min_child_weight':100,\n",
361 |     "}\n",
362 |     "\n",
363 |     "# The dict above uses XGBoost's own parameter names; the LightGBM spellings used before are not XGBoost parameters.\n",
364 |     "# XGBoost has no is_unbalance/bagging_freq: class weighting stays at scale_pos_weight, subsample is redrawn each round.\n",
365 |     "\n",
366 |     "bst=xgb.cv(param,xgb_train, num_boost_round=1000, nfold=10, early_stopping_rounds=100)\n",
367 |     "gbm = xgb.train(param,xgb_train,num_boost_round=bst.shape[0])\n"
368 |    ]
369 |   },
370 |   {
371 |    "cell_type": "code",
372 |    "execution_count": 17,
373 |    "metadata": {
374 |     "collapsed": false
375 |    },
376 |    "outputs": [
377 |     {
378 |      "name": "stdout",
379 |      "output_type": "stream",
380 |      "text": [
381 |       "0.817833166049\n"
382 |      ]
383 |     }
384 |    ],
385 |    "source": [
386 |     "ypred = gbm.predict(xgb_val)\n",
387 |     "val_auc = metrics.roc_auc_score(y_val,ypred)#验证集上的auc值\n",
388 |     "print(val_auc)"
389 |    ]
390 |   },
391 |   {
392 |    "cell_type": "code",
393 |    "execution_count": 18,
394 |    "metadata": {
395 |     "collapsed": false
396 |    },
397 |    "outputs": [
398 |     {
399 |      "data": {
400 |       "text/plain": [
401 |        "(10000,)"
402 |       ]
403 |      },
404 |      "execution_count": 18,
405 |      "metadata": {},
406 |      "output_type": "execute_result"
407 |     }
408 |    ],
409 |    "source": [
410 |     "pred = gbm.predict(xgb_test)\n",
411 |     "pred.shape"
412 |    ]
413 |   },
414 |   {
415 |    "cell_type": "code",
416 |    "execution_count": 19,
417 |    "metadata": {
418 |     "collapsed": true
419 |    },
420 |    "outputs": [],
421 |    "source": [
422 |     "Submission = pd.DataFrame({'cust_id': test['cust_id'], 'pred_prob': pred})\n",
423 |     "Submission.to_csv('../result/semi_xgb.csv',index=False)"
424 |    ]
425 |   },
426 |   {
427 |    "cell_type": "code",
428 |    "execution_count": null,
429 |    "metadata": {
430 |     "collapsed": true
431 |    },
432 |    "outputs": [],
433 |    "source": []
434 |   }
435 |  ],
436 |  "metadata": {
437 |   "anaconda-cloud": {},
438 |   "kernelspec": {
439 |    "display_name": "Python [default]",
440 |    "language": "python",
441 |    "name": "python3"
442 |   },
443 |   "language_info": {
444 |    "codemirror_mode": {
445 |     "name": "ipython",
446 |     "version": 3
447 |    },
448 |    "file_extension": ".py",
449 |    "mimetype": "text/x-python",
450 |    "name": "python",
451 |    "nbconvert_exporter": "python",
452 |    "pygments_lexer": "ipython3",
453 |    "version": "3.5.2"
454 |   }
455 |  },
456 |  "nbformat": 4,
457 |  "nbformat_minor": 1
458 | }
459 | 


--------------------------------------------------------------------------------
/模型二代码b榜0.749880/code/.ipynb_checkpoints/semi_xgb预测-checkpoint.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {
  7 |     "collapsed": true
  8 |    },
  9 |    "outputs": [],
 10 |    "source": [
 11 |     "import numpy as np\n",
 12 |     "import pandas as pd\n",
 13 |     "import matplotlib.pyplot as plt\n",
 14 |     "%matplotlib inline"
 15 |    ]
 16 |   },
 17 |   {
 18 |    "cell_type": "code",
 19 |    "execution_count": 2,
 20 |    "metadata": {
 21 |     "collapsed": false
 22 |    },
 23 |    "outputs": [
 24 |     {
 25 |      "data": {
 26 |       "text/plain": [
 27 |        "(10000, 160)"
 28 |       ]
 29 |      },
 30 |      "execution_count": 2,
 31 |      "metadata": {},
 32 |      "output_type": "execute_result"
 33 |     }
 34 |    ],
 35 |    "source": [
 36 |     "train_x1 = pd.read_csv('../data/train_ssvm_xy.csv')\n",
 37 |     "train_x1.shape"
 38 |    ]
 39 |   },
 40 |   {
 41 |    "cell_type": "code",
 42 |    "execution_count": 3,
 43 |    "metadata": {
 44 |     "collapsed": false
 45 |    },
 46 |    "outputs": [
 47 |     {
 48 |      "data": {
 49 |       "text/plain": [
 50 |        "(15000, 160)"
 51 |       ]
 52 |      },
 53 |      "execution_count": 3,
 54 |      "metadata": {},
 55 |      "output_type": "execute_result"
 56 |     }
 57 |    ],
 58 |    "source": [
 59 |     "train_x2 = pd.read_csv('../data/train_xy.csv')\n",
 60 |     "train_x2.shape"
 61 |    ]
 62 |   },
 63 |   {
 64 |    "cell_type": "code",
 65 |    "execution_count": 4,
 66 |    "metadata": {
 67 |     "collapsed": false
 68 |    },
 69 |    "outputs": [
 70 |     {
 71 |      "data": {
 72 |       "text/plain": [
 73 |        "(10000, 159)"
 74 |       ]
 75 |      },
 76 |      "execution_count": 4,
 77 |      "metadata": {},
 78 |      "output_type": "execute_result"
 79 |     }
 80 |    ],
 81 |    "source": [
 82 |     "test = pd.read_csv('../data/test_all.csv')\n",
 83 |     "test.shape"
 84 |    ]
 85 |   },
 86 |   {
 87 |    "cell_type": "code",
 88 |    "execution_count": 5,
 89 |    "metadata": {
 90 |     "collapsed": false
 91 |    },
 92 |    "outputs": [
 93 |     {
 94 |      "data": {
 95 |       "text/plain": [
 96 |        "(25000, 157)"
 97 |       ]
 98 |      },
 99 |      "execution_count": 5,
100 |      "metadata": {},
101 |      "output_type": "execute_result"
102 |     }
103 |    ],
104 |    "source": [
105 |     "train_x11 = train_x1.drop(['cust_id','cust_group','y'],axis=1)\n",
106 |     "train_x22 = train_x2.drop(['\\ufeffcust_id','cust_group','y'],axis=1)\n",
107 |     "train_x = pd.concat([train_x11, train_x22])\n",
108 |     "train_x.shape"
109 |    ]
110 |   },
111 |   {
112 |    "cell_type": "code",
113 |    "execution_count": 6,
114 |    "metadata": {
115 |     "collapsed": false
116 |    },
117 |    "outputs": [
118 |     {
119 |      "data": {
120 |       "text/plain": [
121 |        "(10000, 157)"
122 |       ]
123 |      },
124 |      "execution_count": 6,
125 |      "metadata": {},
126 |      "output_type": "execute_result"
127 |     }
128 |    ],
129 |    "source": [
130 |     "test_x = test.drop(['cust_id','cust_group'],axis=1)\n",
131 |     "test_x.shape"
132 |    ]
133 |   },
134 |   {
135 |    "cell_type": "code",
136 |    "execution_count": 7,
137 |    "metadata": {
138 |     "collapsed": false
139 |    },
140 |    "outputs": [
141 |     {
142 |      "data": {
143 |       "text/plain": [
144 |        "(35000, 157)"
145 |       ]
146 |      },
147 |      "execution_count": 7,
148 |      "metadata": {},
149 |      "output_type": "execute_result"
150 |     }
151 |    ],
152 |    "source": [
153 |     "x = pd.concat([train_x, test_x])\n",
154 |     "x.shape"
155 |    ]
156 |   },
157 |   {
158 |    "cell_type": "code",
159 |    "execution_count": 8,
160 |    "metadata": {
161 |     "collapsed": false
162 |    },
163 |    "outputs": [
164 |     {
165 |      "data": {
166 |       "text/plain": [
167 |        "(25000,)"
168 |       ]
169 |      },
170 |      "execution_count": 8,
171 |      "metadata": {},
172 |      "output_type": "execute_result"
173 |     }
174 |    ],
175 |    "source": [
176 |     "train_y1 = train_x1['y']\n",
177 |     "train_y2 = train_x2['y']\n",
178 |     "Y_train = pd.concat([train_y1, train_y2])  # Series.append was removed in pandas 2.0; train_x1 labels stay first\n",
179 |     "Y_train.shape"
180 |    ]
181 |   },
182 |   {
183 |    "cell_type": "code",
184 |    "execution_count": 9,
185 |    "metadata": {
186 |     "collapsed": false
187 |    },
188 |    "outputs": [
189 |     {
190 |      "name": "stdout",
191 |      "output_type": "stream",
192 |      "text": [
193 |       "(35000, 364)\n"
194 |      ]
195 |     }
196 |    ],
197 |    "source": [
198 |     "for i in range(96,158):  # one-hot encode columns x_96 .. x_157 when they are present\n",
199 |     "    col = 'x'+'_'+str(i)\n",
200 |     "    if col in x.columns.values:\n",
201 |     "        dummies_df = pd.get_dummies(x[col], prefix=col)  # names are col + '_' + value; no lambda shadowing x\n",
202 |     "        x = pd.concat([x, dummies_df], axis=1)  # the source column is kept next to its dummies\n",
203 |     "print(x.shape)"
204 |    ]
205 |   },
206 |   {
207 |    "cell_type": "code",
208 |    "execution_count": 10,
209 |    "metadata": {
210 |     "collapsed": false
211 |    },
212 |    "outputs": [
213 |     {
214 |      "name": "stdout",
215 |      "output_type": "stream",
216 |      "text": [
217 |       "(25000, 364)\n",
218 |       "(10000, 364)\n"
219 |      ]
220 |     }
221 |    ],
222 |    "source": [
223 |     "train_X = x[0:25000]\n",
224 |     "test_X = x[25000:35000]\n",
225 |     "print(train_X.shape)\n",
226 |     "print(test_X.shape)"
227 |    ]
228 |   },
229 |   {
230 |    "cell_type": "code",
231 |    "execution_count": 11,
232 |    "metadata": {
233 |     "collapsed": false
234 |    },
235 |    "outputs": [
236 |     {
237 |      "name": "stderr",
238 |      "output_type": "stream",
239 |      "text": [
240 |       "C:\\Program Files\\Anaconda3\\lib\\site-packages\\sklearn\\cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n",
241 |       "  \"This module will be removed in 0.20.\", DeprecationWarning)\n",
242 |       "Using TensorFlow backend.\n"
243 |      ]
244 |     }
245 |    ],
246 |    "source": [
247 |     "from sklearn.tree import DecisionTreeClassifier\n",
248 |     "from sklearn.ensemble import RandomForestClassifier\n",
249 |     "from sklearn.ensemble import AdaBoostClassifier\n",
250 |     "from sklearn.ensemble import ExtraTreesClassifier\n",
251 |     "from sklearn.ensemble import GradientBoostingClassifier\n",
252 |     "from sklearn.neighbors import KNeighborsClassifier\n",
253 |     "from sklearn.svm import SVC\n",
254 |     "from sklearn import metrics  #accuracy_score,recall_score,f1_score\n",
255 |     "from sklearn.metrics import classification_report\n",
256 |     "from sklearn.metrics import precision_recall_fscore_support\n",
257 |     "from sklearn.utils.multiclass import unique_labels\n",
258 |     "from sklearn.metrics import accuracy_score\n",
259 |     "from xgboost import XGBClassifier\n",
260 |     "from sklearn.ensemble import GradientBoostingClassifier\n",
261 |     "from sklearn.model_selection import cross_val_score\n",
262 |     "from lightgbm import LGBMClassifier\n",
263 |     "from sklearn.model_selection import train_test_split, GridSearchCV, KFold\n",
264 |     "from sklearn.svm import LinearSVC\n",
265 |     "from sklearn import linear_model\n",
266 |     "import lightgbm as lgb\n",
267 |     "import xgboost as xgb\n",
268 |     "\n",
269 |     "from keras.models import Model\n",
270 |     "from keras.layers import Dense, Input"
271 |    ]
272 |   },
273 |   {
274 |    "cell_type": "code",
275 |    "execution_count": 12,
276 |    "metadata": {
277 |     "collapsed": false,
278 |     "scrolled": true
279 |    },
280 |    "outputs": [],
281 |    "source": [
282 |     "# encoding_dim = 600\n",
283 |     "# input_dim = Input(shape=(364,))\n",
284 |     "\n",
285 |     "# encoded = Dense(364, activation='linear')(input_dim)\n",
286 |     "# # encoded = Dense(300, activation='relu')(encoded)\n",
287 |     "# # encoded = Dense(32, activation='relu')(encoded)\n",
288 |     "# encoder_output = Dense(encoding_dim)(encoded)\n",
289 |     "\n",
290 |     "# decoded = Dense(600, activation='relu')(encoder_output)\n",
291 |     "# # decoded = Dense(64, activation='relu')(decoded)\n",
292 |     "# # decoded = Dense(128, activation='relu')(decoded)\n",
293 |     "# decoded = Dense(364, activation='tanh')(decoded)\n",
294 |     "\n",
295 |     "# autoencoder = Model(inputs=input_dim, outputs=decoded)\n",
296 |     "\n",
297 |     "# encoder = Model(inputs=input_dim, outputs=encoder_output)\n",
298 |     "\n",
299 |     "# autoencoder.compile(optimizer='adam', loss='mse')\n",
300 |     "# # training\n",
301 |     "# autoencoder.fit(train_X.values, train_X.values, epochs=20, batch_size=150, shuffle=True)"
302 |    ]
303 |   },
304 |   {
305 |    "cell_type": "code",
306 |    "execution_count": 13,
307 |    "metadata": {
308 |     "collapsed": false
309 |    },
310 |    "outputs": [],
311 |    "source": [
312 |     "# new_train_feature = encoder.predict(train_X.values)\n",
313 |     "# new_test_feature = encoder.predict(test_X.values)\n",
314 |     "# print(new_train_feature.shape)\n",
315 |     "# print(new_test_feature.shape)"
316 |    ]
317 |   },
318 |   {
319 |    "cell_type": "code",
320 |    "execution_count": 14,
321 |    "metadata": {
322 |     "collapsed": false
323 |    },
324 |    "outputs": [],
325 |    "source": [
326 |     "X_train,X_val,y_train,y_val= train_test_split(train_X,Y_train,test_size=0.2,random_state=2)"
327 |    ]
328 |   },
329 |   {
330 |    "cell_type": "code",
331 |    "execution_count": 15,
332 |    "metadata": {
333 |     "collapsed": false
334 |    },
335 |    "outputs": [],
336 |    "source": [
337 |     "xgb_train = xgb.DMatrix(X_train, y_train)\n",
338 |     "xgb_val = xgb.DMatrix(X_val, y_val)\n",
339 |     "xgb_test = xgb.DMatrix(test_X)"
340 |    ]
341 |   },
342 |   {
343 |    "cell_type": "code",
344 |    "execution_count": 16,
345 |    "metadata": {
346 |     "collapsed": false
347 |    },
348 |    "outputs": [],
349 |    "source": [
350 |     "param = {\n",
351 |     "    'booster':'gbtree',\n",
352 |     "    'max_depth':10,\n",
353 |     "    'learning_rate':0.03,\n",
354 |     "    'scale_pos_weight':1,\n",
355 |     "    'nthread':40,              # xgboost spelling of LightGBM's num_threads\n",
356 |     "    'objective':'binary:logistic',\n",
357 |     "    'subsample':0.7,           # xgboost counterpart of LightGBM's bagging_fraction\n",
358 |     "    'min_child_weight':100,    # xgboost counterpart of LightGBM's min_sum_hessian_in_leaf\n",
359 |     "    'eval_metric':'auc',       # key read by xgboost; 'metric' is the LightGBM name\n",
360 |     "}\n",
361 |     "# These settings were previously passed under LightGBM key names, which xgboost does not use.\n",
362 |     "# num_leaves and bagging_freq are dropped; is_unbalance corresponds to scale_pos_weight, set above.\n",
363 |     "\n",
364 |     "# Choose the number of boosting rounds by 10-fold CV with early stopping on AUC,\n",
365 |     "# then refit on the training split with that many rounds.\n",
366 |     "bst=xgb.cv(param,xgb_train, num_boost_round=1000, nfold=10, early_stopping_rounds=100)\n",
367 |     "gbm = xgb.train(param,xgb_train,num_boost_round=bst.shape[0])\n"
368 |    ]
369 |   },
370 |   {
371 |    "cell_type": "code",
372 |    "execution_count": 17,
373 |    "metadata": {
374 |     "collapsed": false
375 |    },
376 |    "outputs": [
377 |     {
378 |      "name": "stdout",
379 |      "output_type": "stream",
380 |      "text": [
381 |       "0.817833166049\n"
382 |      ]
383 |     }
384 |    ],
385 |    "source": [
386 |     "ypred = gbm.predict(xgb_val)\n",
387 |     "val_auc = metrics.roc_auc_score(y_val,ypred)#验证集上的auc值\n",
388 |     "print(val_auc)"
389 |    ]
390 |   },
391 |   {
392 |    "cell_type": "code",
393 |    "execution_count": 18,
394 |    "metadata": {
395 |     "collapsed": false
396 |    },
397 |    "outputs": [
398 |     {
399 |      "data": {
400 |       "text/plain": [
401 |        "(10000,)"
402 |       ]
403 |      },
404 |      "execution_count": 18,
405 |      "metadata": {},
406 |      "output_type": "execute_result"
407 |     }
408 |    ],
409 |    "source": [
410 |     "pred = gbm.predict(xgb_test)\n",
411 |     "pred.shape"
412 |    ]
413 |   },
414 |   {
415 |    "cell_type": "code",
416 |    "execution_count": 19,
417 |    "metadata": {
418 |     "collapsed": true
419 |    },
420 |    "outputs": [],
421 |    "source": [
422 |     "Submission = pd.DataFrame({'cust_id': test['cust_id'], 'pred_prob': pred})\n",
423 |     "Submission.to_csv('../result/semi_xgb.csv',index=False)"
424 |    ]
425 |   },
426 |   {
427 |    "cell_type": "code",
428 |    "execution_count": null,
429 |    "metadata": {
430 |     "collapsed": true
431 |    },
432 |    "outputs": [],
433 |    "source": []
434 |   }
435 |  ],
436 |  "metadata": {
437 |   "anaconda-cloud": {},
438 |   "kernelspec": {
439 |    "display_name": "Python [default]",
440 |    "language": "python",
441 |    "name": "python3"
442 |   },
443 |   "language_info": {
444 |    "codemirror_mode": {
445 |     "name": "ipython",
446 |     "version": 3
447 |    },
448 |    "file_extension": ".py",
449 |    "mimetype": "text/x-python",
450 |    "name": "python",
451 |    "nbconvert_exporter": "python",
452 |    "pygments_lexer": "ipython3",
453 |    "version": "3.5.2"
454 |   }
455 |  },
456 |  "nbformat": 4,
457 |  "nbformat_minor": 1
458 | }
459 | 


--------------------------------------------------------------------------------
/模型二代码b榜0.749880/code/semi_lgb预测.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {
  7 |     "collapsed": true
  8 |    },
  9 |    "outputs": [],
 10 |    "source": [
 11 |     "import numpy as np\n",
 12 |     "import pandas as pd\n",
 13 |     "import matplotlib.pyplot as plt\n",
 14 |     "%matplotlib inline"
 15 |    ]
 16 |   },
 17 |   {
 18 |    "cell_type": "code",
 19 |    "execution_count": 2,
 20 |    "metadata": {
 21 |     "collapsed": false
 22 |    },
 23 |    "outputs": [
 24 |     {
 25 |      "data": {
 26 |       "text/plain": [
 27 |        "(10000, 160)"
 28 |       ]
 29 |      },
 30 |      "execution_count": 2,
 31 |      "metadata": {},
 32 |      "output_type": "execute_result"
 33 |     }
 34 |    ],
 35 |    "source": [
 36 |     "train_x1 = pd.read_csv('../data/train_ssvm_xy.csv')\n",
 37 |     "train_x1.shape"
 38 |    ]
 39 |   },
 40 |   {
 41 |    "cell_type": "code",
 42 |    "execution_count": 3,
 43 |    "metadata": {
 44 |     "collapsed": false
 45 |    },
 46 |    "outputs": [
 47 |     {
 48 |      "data": {
 49 |       "text/plain": [
 50 |        "(15000, 160)"
 51 |       ]
 52 |      },
 53 |      "execution_count": 3,
 54 |      "metadata": {},
 55 |      "output_type": "execute_result"
 56 |     }
 57 |    ],
 58 |    "source": [
 59 |     "train_x2 = pd.read_csv('../data/train_xy.csv')\n",
 60 |     "train_x2.shape"
 61 |    ]
 62 |   },
 63 |   {
 64 |    "cell_type": "code",
 65 |    "execution_count": 4,
 66 |    "metadata": {
 67 |     "collapsed": false
 68 |    },
 69 |    "outputs": [
 70 |     {
 71 |      "data": {
 72 |       "text/plain": [
 73 |        "(10000, 159)"
 74 |       ]
 75 |      },
 76 |      "execution_count": 4,
 77 |      "metadata": {},
 78 |      "output_type": "execute_result"
 79 |     }
 80 |    ],
 81 |    "source": [
 82 |     "test = pd.read_csv('../data/test_all.csv')\n",
 83 |     "test.shape"
 84 |    ]
 85 |   },
 86 |   {
 87 |    "cell_type": "code",
 88 |    "execution_count": 5,
 89 |    "metadata": {
 90 |     "collapsed": false
 91 |    },
 92 |    "outputs": [
 93 |     {
 94 |      "data": {
 95 |       "text/plain": [
 96 |        "(25000, 157)"
 97 |       ]
 98 |      },
 99 |      "execution_count": 5,
100 |      "metadata": {},
101 |      "output_type": "execute_result"
102 |     }
103 |    ],
104 |    "source": [
105 |     "train_x11 = train_x1.drop(['cust_id','cust_group','y'],axis=1)\n",
106 |     "train_x22 = train_x2.drop(['\\ufeffcust_id','cust_group','y'],axis=1)\n",
107 |     "train_x = pd.concat([train_x11, train_x22])\n",
108 |     "train_x.shape"
109 |    ]
110 |   },
111 |   {
112 |    "cell_type": "code",
113 |    "execution_count": 6,
114 |    "metadata": {
115 |     "collapsed": false
116 |    },
117 |    "outputs": [
118 |     {
119 |      "data": {
120 |       "text/plain": [
121 |        "(10000, 157)"
122 |       ]
123 |      },
124 |      "execution_count": 6,
125 |      "metadata": {},
126 |      "output_type": "execute_result"
127 |     }
128 |    ],
129 |    "source": [
130 |     "test_x = test.drop(['cust_id','cust_group'],axis=1)\n",
131 |     "test_x.shape"
132 |    ]
133 |   },
134 |   {
135 |    "cell_type": "code",
136 |    "execution_count": 7,
137 |    "metadata": {
138 |     "collapsed": false
139 |    },
140 |    "outputs": [
141 |     {
142 |      "data": {
143 |       "text/plain": [
144 |        "(35000, 157)"
145 |       ]
146 |      },
147 |      "execution_count": 7,
148 |      "metadata": {},
149 |      "output_type": "execute_result"
150 |     }
151 |    ],
152 |    "source": [
153 |     "x = pd.concat([train_x, test_x])\n",
154 |     "x.shape"
155 |    ]
156 |   },
157 |   {
158 |    "cell_type": "code",
159 |    "execution_count": 8,
160 |    "metadata": {
161 |     "collapsed": false
162 |    },
163 |    "outputs": [
164 |     {
165 |      "data": {
166 |       "text/plain": [
167 |        "(25000,)"
168 |       ]
169 |      },
170 |      "execution_count": 8,
171 |      "metadata": {},
172 |      "output_type": "execute_result"
173 |     }
174 |    ],
175 |    "source": [
176 |     "train_y1 = train_x1['y']\n",
177 |     "train_y2 = train_x2['y']\n",
178 |     "Y_train = pd.concat([train_y1, train_y2])  # Series.append was removed in pandas 2.0; train_x1 labels stay first\n",
179 |     "Y_train.shape"
180 |    ]
181 |   },
182 |   {
183 |    "cell_type": "code",
184 |    "execution_count": 9,
185 |    "metadata": {
186 |     "collapsed": false
187 |    },
188 |    "outputs": [
189 |     {
190 |      "name": "stdout",
191 |      "output_type": "stream",
192 |      "text": [
193 |       "(35000, 364)\n"
194 |      ]
195 |     }
196 |    ],
197 |    "source": [
198 |     "for i in range(96,158):  # one-hot encode columns x_96 .. x_157 when they are present\n",
199 |     "    col = 'x'+'_'+str(i)\n",
200 |     "    if col in x.columns.values:\n",
201 |     "        dummies_df = pd.get_dummies(x[col], prefix=col)  # names are col + '_' + value; no lambda shadowing x\n",
202 |     "        x = pd.concat([x, dummies_df], axis=1)  # the source column is kept next to its dummies\n",
203 |     "print(x.shape)"
204 |    ]
205 |   },
206 |   {
207 |    "cell_type": "code",
208 |    "execution_count": 10,
209 |    "metadata": {
210 |     "collapsed": false
211 |    },
212 |    "outputs": [
213 |     {
214 |      "name": "stdout",
215 |      "output_type": "stream",
216 |      "text": [
217 |       "(25000, 364)\n",
218 |       "(10000, 364)\n"
219 |      ]
220 |     }
221 |    ],
222 |    "source": [
223 |     "train_X = x[0:25000]\n",
224 |     "test_X = x[25000:35000]\n",
225 |     "print(train_X.shape)\n",
226 |     "print(test_X.shape)"
227 |    ]
228 |   },
229 |   {
230 |    "cell_type": "code",
231 |    "execution_count": 11,
232 |    "metadata": {
233 |     "collapsed": false
234 |    },
235 |    "outputs": [
236 |     {
237 |      "name": "stderr",
238 |      "output_type": "stream",
239 |      "text": [
240 |       "C:\\Program Files\\Anaconda3\\lib\\site-packages\\sklearn\\cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n",
241 |       "  \"This module will be removed in 0.20.\", DeprecationWarning)\n",
242 |       "Using TensorFlow backend.\n"
243 |      ]
244 |     }
245 |    ],
246 |    "source": [
247 |     "from sklearn.tree import DecisionTreeClassifier\n",
248 |     "from sklearn.ensemble import RandomForestClassifier\n",
249 |     "from sklearn.ensemble import AdaBoostClassifier\n",
250 |     "from sklearn.ensemble import ExtraTreesClassifier\n",
251 |     "from sklearn.ensemble import GradientBoostingClassifier\n",
252 |     "from sklearn.neighbors import KNeighborsClassifier\n",
253 |     "from sklearn.svm import SVC\n",
254 |     "from sklearn import metrics  #accuracy_score,recall_score,f1_score\n",
255 |     "from sklearn.metrics import classification_report\n",
256 |     "from sklearn.metrics import precision_recall_fscore_support\n",
257 |     "from sklearn.utils.multiclass import unique_labels\n",
258 |     "from sklearn.metrics import accuracy_score\n",
259 |     "from xgboost import XGBClassifier\n",
260 |     "from sklearn.ensemble import GradientBoostingClassifier\n",
261 |     "from sklearn.model_selection import cross_val_score\n",
262 |     "from lightgbm import LGBMClassifier\n",
263 |     "from sklearn.model_selection import train_test_split, GridSearchCV, KFold\n",
264 |     "from sklearn.svm import LinearSVC\n",
265 |     "from sklearn import linear_model\n",
266 |     "import lightgbm as lgb\n",
267 |     "\n",
268 |     "from keras.models import Model\n",
269 |     "from keras.layers import Dense, Input"
270 |    ]
271 |   },
272 |   {
273 |    "cell_type": "code",
274 |    "execution_count": 12,
275 |    "metadata": {
276 |     "collapsed": false,
277 |     "scrolled": true
278 |    },
279 |    "outputs": [],
280 |    "source": [
281 |     "# encoding_dim = 600\n",
282 |     "# input_dim = Input(shape=(364,))\n",
283 |     "\n",
284 |     "# encoded = Dense(364, activation='linear')(input_dim)\n",
285 |     "# # encoded = Dense(300, activation='relu')(encoded)\n",
286 |     "# # encoded = Dense(32, activation='relu')(encoded)\n",
287 |     "# encoder_output = Dense(encoding_dim)(encoded)\n",
288 |     "\n",
289 |     "# decoded = Dense(600, activation='relu')(encoder_output)\n",
290 |     "# # decoded = Dense(64, activation='relu')(decoded)\n",
291 |     "# # decoded = Dense(128, activation='relu')(decoded)\n",
292 |     "# decoded = Dense(364, activation='tanh')(decoded)\n",
293 |     "\n",
294 |     "# autoencoder = Model(inputs=input_dim, outputs=decoded)\n",
295 |     "\n",
296 |     "# encoder = Model(inputs=input_dim, outputs=encoder_output)\n",
297 |     "\n",
298 |     "# autoencoder.compile(optimizer='adam', loss='mse')\n",
299 |     "# # training\n",
300 |     "# autoencoder.fit(train_X.values, train_X.values, epochs=20, batch_size=150, shuffle=True)"
301 |    ]
302 |   },
303 |   {
304 |    "cell_type": "code",
305 |    "execution_count": 13,
306 |    "metadata": {
307 |     "collapsed": false
308 |    },
309 |    "outputs": [],
310 |    "source": [
311 |     "# new_train_feature = encoder.predict(train_X.values)\n",
312 |     "# new_test_feature = encoder.predict(test_X.values)\n",
313 |     "# print(new_train_feature.shape)\n",
314 |     "# print(new_test_feature.shape)"
315 |    ]
316 |   },
317 |   {
318 |    "cell_type": "code",
319 |    "execution_count": 14,
320 |    "metadata": {
321 |     "collapsed": false
322 |    },
323 |    "outputs": [],
324 |    "source": [
325 |     "X_train,X_val,y_train,y_val= train_test_split(train_X,Y_train,test_size=0.2,random_state=2)"
326 |    ]
327 |   },
328 |   {
329 |    "cell_type": "code",
330 |    "execution_count": 15,
331 |    "metadata": {
332 |     "collapsed": false
333 |    },
334 |    "outputs": [],
335 |    "source": [
336 |     "lgb_train = lgb.Dataset(X_train, y_train)\n",
337 |     "lgb_val = lgb.Dataset(X_val, y_val, reference=lgb_train)"
338 |    ]
339 |   },
340 |   {
341 |    "cell_type": "code",
342 |    "execution_count": 16,
343 |    "metadata": {
344 |     "collapsed": false
345 |    },
346 |    "outputs": [],
347 |    "source": [
348 |     "param = {\n",
349 |     "    'max_depth':6,\n",
350 |     "    'num_leaves':80,\n",
351 |     "    'learning_rate':0.03,\n",
352 |     "    'scale_pos_weight':1,\n",
353 |     "    'num_threads':40,\n",
354 |     "    'objective':'binary',\n",
355 |     "    'bagging_fraction':0.7,\n",
356 |     "    'bagging_freq':1,\n",
357 |     "    'min_sum_hessian_in_leaf':100\n",
358 |     "}\n",
359 |     "\n",
360 |     "param['is_unbalance']='true'\n",
361 |     "param['metric'] = 'auc'\n",
362 |     "\n",
363 |     "\n",
364 |     "bst=lgb.cv(param,lgb_train, num_boost_round=1000, nfold=5, early_stopping_rounds=30)\n",
365 |     "gbm = lgb.train(param,lgb_train,num_boost_round=len(bst['auc-mean']))\n"
366 |    ]
367 |   },
368 |   {
369 |    "cell_type": "code",
370 |    "execution_count": 17,
371 |    "metadata": {
372 |     "collapsed": false
373 |    },
374 |    "outputs": [
375 |     {
376 |      "name": "stdout",
377 |      "output_type": "stream",
378 |      "text": [
379 |       "0.810753521759\n"
380 |      ]
381 |     }
382 |    ],
383 |    "source": [
384 |     "ypred = gbm.predict(X_val)\n",
385 |     "val_auc = metrics.roc_auc_score(y_val,ypred)#验证集上的auc值\n",
386 |     "print(val_auc)"
387 |    ]
388 |   },
389 |   {
390 |    "cell_type": "code",
391 |    "execution_count": 18,
392 |    "metadata": {
393 |     "collapsed": false
394 |    },
395 |    "outputs": [
396 |     {
397 |      "data": {
398 |       "text/plain": [
399 |        "(10000,)"
400 |       ]
401 |      },
402 |      "execution_count": 18,
403 |      "metadata": {},
404 |      "output_type": "execute_result"
405 |     }
406 |    ],
407 |    "source": [
408 |     "pred = gbm.predict(test_X)\n",
409 |     "pred.shape"
410 |    ]
411 |   },
412 |   {
413 |    "cell_type": "code",
414 |    "execution_count": 19,
415 |    "metadata": {
416 |     "collapsed": true
417 |    },
418 |    "outputs": [],
419 |    "source": [
420 |     "Submission = pd.DataFrame({'cust_id': test['cust_id'], 'pred_prob': pred})\n",
421 |     "Submission.to_csv('../result/semi_lgb.csv',index=False)"
422 |    ]
423 |   },
424 |   {
425 |    "cell_type": "code",
426 |    "execution_count": null,
427 |    "metadata": {
428 |     "collapsed": true
429 |    },
430 |    "outputs": [],
431 |    "source": []
432 |   },
433 |   {
434 |    "cell_type": "code",
435 |    "execution_count": null,
436 |    "metadata": {
437 |     "collapsed": true
438 |    },
439 |    "outputs": [],
440 |    "source": []
441 |   }
442 |  ],
443 |  "metadata": {
444 |   "anaconda-cloud": {},
445 |   "kernelspec": {
446 |    "display_name": "Python [default]",
447 |    "language": "python",
448 |    "name": "python3"
449 |   },
450 |   "language_info": {
451 |    "codemirror_mode": {
452 |     "name": "ipython",
453 |     "version": 3
454 |    },
455 |    "file_extension": ".py",
456 |    "mimetype": "text/x-python",
457 |    "name": "python",
458 |    "nbconvert_exporter": "python",
459 |    "pygments_lexer": "ipython3",
460 |    "version": "3.5.2"
461 |   }
462 |  },
463 |  "nbformat": 4,
464 |  "nbformat_minor": 1
465 | }
466 | 


--------------------------------------------------------------------------------
/模型二代码b榜0.749880/code/.ipynb_checkpoints/semi_lgb预测-checkpoint.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {
  7 |     "collapsed": true
  8 |    },
  9 |    "outputs": [],
 10 |    "source": [
 11 |     "import numpy as np\n",
 12 |     "import pandas as pd\n",
 13 |     "import matplotlib.pyplot as plt\n",
 14 |     "%matplotlib inline"
 15 |    ]
 16 |   },
 17 |   {
 18 |    "cell_type": "code",
 19 |    "execution_count": 2,
 20 |    "metadata": {
 21 |     "collapsed": false
 22 |    },
 23 |    "outputs": [
 24 |     {
 25 |      "data": {
 26 |       "text/plain": [
 27 |        "(10000, 160)"
 28 |       ]
 29 |      },
 30 |      "execution_count": 2,
 31 |      "metadata": {},
 32 |      "output_type": "execute_result"
 33 |     }
 34 |    ],
 35 |    "source": [
 36 |     "train_x1 = pd.read_csv('../data/train_ssvm_xy.csv')\n",
 37 |     "train_x1.shape"
 38 |    ]
 39 |   },
 40 |   {
 41 |    "cell_type": "code",
 42 |    "execution_count": 3,
 43 |    "metadata": {
 44 |     "collapsed": false
 45 |    },
 46 |    "outputs": [
 47 |     {
 48 |      "data": {
 49 |       "text/plain": [
 50 |        "(15000, 160)"
 51 |       ]
 52 |      },
 53 |      "execution_count": 3,
 54 |      "metadata": {},
 55 |      "output_type": "execute_result"
 56 |     }
 57 |    ],
 58 |    "source": [
 59 |     "train_x2 = pd.read_csv('../data/train_xy.csv')\n",
 60 |     "train_x2.shape"
 61 |    ]
 62 |   },
 63 |   {
 64 |    "cell_type": "code",
 65 |    "execution_count": 4,
 66 |    "metadata": {
 67 |     "collapsed": false
 68 |    },
 69 |    "outputs": [
 70 |     {
 71 |      "data": {
 72 |       "text/plain": [
 73 |        "(10000, 159)"
 74 |       ]
 75 |      },
 76 |      "execution_count": 4,
 77 |      "metadata": {},
 78 |      "output_type": "execute_result"
 79 |     }
 80 |    ],
 81 |    "source": [
 82 |     "test = pd.read_csv('../data/test_all.csv')\n",
 83 |     "test.shape"
 84 |    ]
 85 |   },
 86 |   {
 87 |    "cell_type": "code",
 88 |    "execution_count": 5,
 89 |    "metadata": {
 90 |     "collapsed": false
 91 |    },
 92 |    "outputs": [
 93 |     {
 94 |      "data": {
 95 |       "text/plain": [
 96 |        "(25000, 157)"
 97 |       ]
 98 |      },
 99 |      "execution_count": 5,
100 |      "metadata": {},
101 |      "output_type": "execute_result"
102 |     }
103 |    ],
104 |    "source": [
105 |     "train_x11 = train_x1.drop(['cust_id','cust_group','y'],axis=1)\n",
106 |     "train_x22 = train_x2.drop(['\\ufeffcust_id','cust_group','y'],axis=1)\n",
107 |     "train_x = pd.concat([train_x11, train_x22])\n",
108 |     "train_x.shape"
109 |    ]
110 |   },
111 |   {
112 |    "cell_type": "code",
113 |    "execution_count": 6,
114 |    "metadata": {
115 |     "collapsed": false
116 |    },
117 |    "outputs": [
118 |     {
119 |      "data": {
120 |       "text/plain": [
121 |        "(10000, 157)"
122 |       ]
123 |      },
124 |      "execution_count": 6,
125 |      "metadata": {},
126 |      "output_type": "execute_result"
127 |     }
128 |    ],
129 |    "source": [
130 |     "test_x = test.drop(['cust_id','cust_group'],axis=1)\n",
131 |     "test_x.shape"
132 |    ]
133 |   },
134 |   {
135 |    "cell_type": "code",
136 |    "execution_count": 7,
137 |    "metadata": {
138 |     "collapsed": false
139 |    },
140 |    "outputs": [
141 |     {
142 |      "data": {
143 |       "text/plain": [
144 |        "(35000, 157)"
145 |       ]
146 |      },
147 |      "execution_count": 7,
148 |      "metadata": {},
149 |      "output_type": "execute_result"
150 |     }
151 |    ],
152 |    "source": [
153 |     "x = pd.concat([train_x, test_x])\n",
154 |     "x.shape"
155 |    ]
156 |   },
157 |   {
158 |    "cell_type": "code",
159 |    "execution_count": 8,
160 |    "metadata": {
161 |     "collapsed": false
162 |    },
163 |    "outputs": [
164 |     {
165 |      "data": {
166 |       "text/plain": [
167 |        "(25000,)"
168 |       ]
169 |      },
170 |      "execution_count": 8,
171 |      "metadata": {},
172 |      "output_type": "execute_result"
173 |     }
174 |    ],
175 |    "source": [
176 |     "train_y1 = train_x1['y']\n",
177 |     "train_y2 = train_x2['y']\n",
178 |     "Y_train = pd.concat([train_y1, train_y2])  # Series.append was removed in pandas 2.0; train_x1 labels stay first\n",
179 |     "Y_train.shape"
180 |    ]
181 |   },
182 |   {
183 |    "cell_type": "code",
184 |    "execution_count": 9,
185 |    "metadata": {
186 |     "collapsed": false
187 |    },
188 |    "outputs": [
189 |     {
190 |      "name": "stdout",
191 |      "output_type": "stream",
192 |      "text": [
193 |       "(35000, 364)\n"
194 |      ]
195 |     }
196 |    ],
197 |    "source": [
198 |     "for i in range(96,158):  # one-hot encode columns x_96 .. x_157 when they are present\n",
199 |     "    col = 'x'+'_'+str(i)\n",
200 |     "    if col in x.columns.values:\n",
201 |     "        dummies_df = pd.get_dummies(x[col], prefix=col)  # names are col + '_' + value; no lambda shadowing x\n",
202 |     "        x = pd.concat([x, dummies_df], axis=1)  # the source column is kept next to its dummies\n",
203 |     "print(x.shape)"
204 |    ]
205 |   },
206 |   {
207 |    "cell_type": "code",
208 |    "execution_count": 10,
209 |    "metadata": {
210 |     "collapsed": false
211 |    },
212 |    "outputs": [
213 |     {
214 |      "name": "stdout",
215 |      "output_type": "stream",
216 |      "text": [
217 |       "(25000, 364)\n",
218 |       "(10000, 364)\n"
219 |      ]
220 |     }
221 |    ],
222 |    "source": [
223 |     "train_X = x[0:25000]\n",
224 |     "test_X = x[25000:35000]\n",
225 |     "print(train_X.shape)\n",
226 |     "print(test_X.shape)"
227 |    ]
228 |   },
229 |   {
230 |    "cell_type": "code",
231 |    "execution_count": 11,
232 |    "metadata": {
233 |     "collapsed": false
234 |    },
235 |    "outputs": [
236 |     {
237 |      "name": "stderr",
238 |      "output_type": "stream",
239 |      "text": [
240 |       "C:\\Program Files\\Anaconda3\\lib\\site-packages\\sklearn\\cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n",
241 |       "  \"This module will be removed in 0.20.\", DeprecationWarning)\n",
242 |       "Using TensorFlow backend.\n"
243 |      ]
244 |     }
245 |    ],
246 |    "source": [
247 |     "from sklearn.tree import DecisionTreeClassifier\n",
248 |     "from sklearn.ensemble import RandomForestClassifier\n",
249 |     "from sklearn.ensemble import AdaBoostClassifier\n",
250 |     "from sklearn.ensemble import ExtraTreesClassifier\n",
251 |     "from sklearn.ensemble import GradientBoostingClassifier\n",
252 |     "from sklearn.neighbors import KNeighborsClassifier\n",
253 |     "from sklearn.svm import SVC\n",
254 |     "from sklearn import metrics  #accuracy_score,recall_score,f1_score\n",
255 |     "from sklearn.metrics import classification_report\n",
256 |     "from sklearn.metrics import precision_recall_fscore_support\n",
257 |     "from sklearn.utils.multiclass import unique_labels\n",
258 |     "from sklearn.metrics import accuracy_score\n",
259 |     "from xgboost import XGBClassifier\n",
260 |     "from sklearn.ensemble import GradientBoostingClassifier\n",
261 |     "from sklearn.model_selection import cross_val_score\n",
262 |     "from lightgbm import LGBMClassifier\n",
263 |     "from sklearn.model_selection import train_test_split, GridSearchCV, KFold\n",
264 |     "from sklearn.svm import LinearSVC\n",
265 |     "from sklearn import linear_model\n",
266 |     "import lightgbm as lgb\n",
267 |     "\n",
268 |     "from keras.models import Model\n",
269 |     "from keras.layers import Dense, Input"
270 |    ]
271 |   },
272 |   {
273 |    "cell_type": "code",
274 |    "execution_count": 12,
275 |    "metadata": {
276 |     "collapsed": false,
277 |     "scrolled": true
278 |    },
279 |    "outputs": [],
280 |    "source": [
281 |     "# encoding_dim = 600\n",
282 |     "# input_dim = Input(shape=(364,))\n",
283 |     "\n",
284 |     "# encoded = Dense(364, activation='linear')(input_dim)\n",
285 |     "# # encoded = Dense(300, activation='relu')(encoded)\n",
286 |     "# # encoded = Dense(32, activation='relu')(encoded)\n",
287 |     "# encoder_output = Dense(encoding_dim)(encoded)\n",
288 |     "\n",
289 |     "# decoded = Dense(600, activation='relu')(encoder_output)\n",
290 |     "# # decoded = Dense(64, activation='relu')(decoded)\n",
291 |     "# # decoded = Dense(128, activation='relu')(decoded)\n",
292 |     "# decoded = Dense(364, activation='tanh')(decoded)\n",
293 |     "\n",
294 |     "# autoencoder = Model(inputs=input_dim, outputs=decoded)\n",
295 |     "\n",
296 |     "# encoder = Model(inputs=input_dim, outputs=encoder_output)\n",
297 |     "\n",
298 |     "# autoencoder.compile(optimizer='adam', loss='mse')\n",
299 |     "# # training\n",
300 |     "# autoencoder.fit(train_X.values, train_X.values, epochs=20, batch_size=150, shuffle=True)"
301 |    ]
302 |   },
303 |   {
304 |    "cell_type": "code",
305 |    "execution_count": 13,
306 |    "metadata": {
307 |     "collapsed": false
308 |    },
309 |    "outputs": [],
310 |    "source": [
311 |     "# new_train_feature = encoder.predict(train_X.values)\n",
312 |     "# new_test_feature = encoder.predict(test_X.values)\n",
313 |     "# print(new_train_feature.shape)\n",
314 |     "# print(new_test_feature.shape)"
315 |    ]
316 |   },
317 |   {
318 |    "cell_type": "code",
319 |    "execution_count": 14,
320 |    "metadata": {
321 |     "collapsed": false
322 |    },
323 |    "outputs": [],
324 |    "source": [
325 |     "X_train,X_val,y_train,y_val= train_test_split(train_X,Y_train,test_size=0.2,random_state=2)"
326 |    ]
327 |   },
328 |   {
329 |    "cell_type": "code",
330 |    "execution_count": 15,
331 |    "metadata": {
332 |     "collapsed": false
333 |    },
334 |    "outputs": [],
335 |    "source": [
336 |     "lgb_train = lgb.Dataset(X_train, y_train)\n",
337 |     "lgb_val = lgb.Dataset(X_val, y_val, reference=lgb_train)"
338 |    ]
339 |   },
340 |   {
341 |    "cell_type": "code",
342 |    "execution_count": 16,
343 |    "metadata": {
344 |     "collapsed": false
345 |    },
346 |    "outputs": [],
347 |    "source": [
348 |     "param = {\n",
349 |     "    'max_depth':6,\n",
350 |     "    'num_leaves':80,\n",
351 |     "    'learning_rate':0.03,\n",
352 |     "    'scale_pos_weight':1,\n",
353 |     "    'num_threads':40,\n",
354 |     "    'objective':'binary',\n",
355 |     "    'bagging_fraction':0.7,\n",
356 |     "    'bagging_freq':1,\n",
357 |     "    'min_sum_hessian_in_leaf':100\n",
358 |     "}\n",
359 |     "\n",
360 |     "param['is_unbalance']='true'\n",
361 |     "param['metric'] = 'auc'\n",
362 |     "\n",
363 |     "\n",
364 |     "bst=lgb.cv(param,lgb_train, num_boost_round=1000, nfold=5, early_stopping_rounds=30)\n",
365 |     "gbm = lgb.train(param,lgb_train,num_boost_round=len(bst['auc-mean']))\n"
366 |    ]
367 |   },
368 |   {
369 |    "cell_type": "code",
370 |    "execution_count": 17,
371 |    "metadata": {
372 |     "collapsed": false
373 |    },
374 |    "outputs": [
375 |     {
376 |      "name": "stdout",
377 |      "output_type": "stream",
378 |      "text": [
379 |       "0.810753521759\n"
380 |      ]
381 |     }
382 |    ],
383 |    "source": [
384 |     "ypred = gbm.predict(X_val)\n",
385 |     "val_auc = metrics.roc_auc_score(y_val,ypred)#验证集上的auc值\n",
386 |     "print(val_auc)"
387 |    ]
388 |   },
389 |   {
390 |    "cell_type": "code",
391 |    "execution_count": 18,
392 |    "metadata": {
393 |     "collapsed": false
394 |    },
395 |    "outputs": [
396 |     {
397 |      "data": {
398 |       "text/plain": [
399 |        "(10000,)"
400 |       ]
401 |      },
402 |      "execution_count": 18,
403 |      "metadata": {},
404 |      "output_type": "execute_result"
405 |     }
406 |    ],
407 |    "source": [
408 |     "pred = gbm.predict(test_X)\n",
409 |     "pred.shape"
410 |    ]
411 |   },
412 |   {
413 |    "cell_type": "code",
414 |    "execution_count": 19,
415 |    "metadata": {
416 |     "collapsed": true
417 |    },
418 |    "outputs": [],
419 |    "source": [
420 |     "Submission = pd.DataFrame({'cust_id': test['cust_id'], 'pred_prob': pred})\n",
421 |     "Submission.to_csv('../result/semi_lgb.csv',index=False)"
422 |    ]
423 |   },
424 |   {
425 |    "cell_type": "code",
426 |    "execution_count": null,
427 |    "metadata": {
428 |     "collapsed": true
429 |    },
430 |    "outputs": [],
431 |    "source": []
432 |   },
433 |   {
434 |    "cell_type": "code",
435 |    "execution_count": null,
436 |    "metadata": {
437 |     "collapsed": true
438 |    },
439 |    "outputs": [],
440 |    "source": []
441 |   }
442 |  ],
443 |  "metadata": {
444 |   "anaconda-cloud": {},
445 |   "kernelspec": {
446 |    "display_name": "Python [default]",
447 |    "language": "python",
448 |    "name": "python3"
449 |   },
450 |   "language_info": {
451 |    "codemirror_mode": {
452 |     "name": "ipython",
453 |     "version": 3
454 |    },
455 |    "file_extension": ".py",
456 |    "mimetype": "text/x-python",
457 |    "name": "python",
458 |    "nbconvert_exporter": "python",
459 |    "pygments_lexer": "ipython3",
460 |    "version": "3.5.2"
461 |   }
462 |  },
463 |  "nbformat": 4,
464 |  "nbformat_minor": 1
465 | }
466 | 


--------------------------------------------------------------------------------
/模型二代码b榜0.749880/code/nn预测.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {
  7 |     "collapsed": true
  8 |    },
  9 |    "outputs": [],
 10 |    "source": [
 11 |     "import numpy as np\n",
 12 |     "import pandas as pd\n",
 13 |     "import matplotlib.pyplot as plt\n",
 14 |     "%matplotlib inline "
 15 |    ]
 16 |   },
 17 |   {
 18 |    "cell_type": "code",
 19 |    "execution_count": 2,
 20 |    "metadata": {
 21 |     "collapsed": false
 22 |    },
 23 |    "outputs": [],
 24 |    "source": [
 25 |     "train = pd.read_csv('../data/train_xy.csv')"
 26 |    ]
 27 |   },
 28 |   {
 29 |    "cell_type": "code",
 30 |    "execution_count": 3,
 31 |    "metadata": {
 32 |     "collapsed": true
 33 |    },
 34 |    "outputs": [],
 35 |    "source": [
 36 |     "test = pd.read_csv('../data/test_all.csv')"
 37 |    ]
 38 |   },
 39 |   {
 40 |    "cell_type": "code",
 41 |    "execution_count": 5,
 42 |    "metadata": {
 43 |     "collapsed": false
 44 |    },
 45 |    "outputs": [
 46 |     {
 47 |      "data": {
 48 |       "text/plain": [
 49 |        "(10000, 157)"
 50 |       ]
 51 |      },
 52 |      "execution_count": 5,
 53 |      "metadata": {},
 54 |      "output_type": "execute_result"
 55 |     }
 56 |    ],
 57 |    "source": [
 58 |     "x_train = train.rename(columns=lambda c: c.lstrip('\\ufeff')).drop(['cust_group','y','cust_id'],axis =1)  # strip a UTF-8 BOM from the header if present\n",
 59 |     "x_test = test.drop(['cust_group','cust_id'],axis=1)\n",
 60 |     "x_train.shape\n",
 61 |     "x_test.shape"
 62 |    ]
 63 |   },
 64 |   {
 65 |    "cell_type": "code",
 66 |    "execution_count": 6,
 67 |    "metadata": {
 68 |     "collapsed": false
 69 |    },
 70 |    "outputs": [
 71 |     {
 72 |      "data": {
 73 |       "text/plain": [
 74 |        "(25000, 157)"
 75 |       ]
 76 |      },
 77 |      "execution_count": 6,
 78 |      "metadata": {},
 79 |      "output_type": "execute_result"
 80 |     }
 81 |    ],
 82 |    "source": [
 83 |     "x = pd.concat([x_train,x_test])\n",
 84 |     "x.shape"
 85 |    ]
 86 |   },
 87 |   {
 88 |    "cell_type": "code",
 89 |    "execution_count": 7,
 90 |    "metadata": {
 91 |     "collapsed": true
 92 |    },
 93 |    "outputs": [],
 94 |    "source": [
 95 |     "Y_train = train['y']"
 96 |    ]
 97 |   },
 98 |   {
 99 |    "cell_type": "code",
100 |    "execution_count": 8,
101 |    "metadata": {
102 |     "collapsed": false
103 |    },
104 |    "outputs": [],
105 |    "source": [
106 |     "for i in range(96,158):\n",
107 |     "    col = 'x'+'_'+str(i)\n",
108 |     "    dummies_df = pd.get_dummies(x[col]).rename(columns=lambda x: col + str(x))\n",
109 |     "    x = pd.concat([x, dummies_df], axis=1)"
110 |    ]
111 |   },
112 |   {
113 |    "cell_type": "code",
114 |    "execution_count": 9,
115 |    "metadata": {
116 |     "collapsed": false
117 |    },
118 |    "outputs": [
119 |     {
120 |      "name": "stdout",
121 |      "output_type": "stream",
122 |      "text": [
123 |       "(15000, 355)\n",
124 |       "(10000, 355)\n"
125 |      ]
126 |     }
127 |    ],
128 |    "source": [
129 |     "train_X = x[0:15000]\n",
130 |     "test_X = x[15000:25000]\n",
131 |     "print(train_X.shape)\n",
132 |     "print(test_X.shape)"
133 |    ]
134 |   },
135 |   {
136 |    "cell_type": "code",
137 |    "execution_count": 10,
138 |    "metadata": {
139 |     "collapsed": false
140 |    },
141 |    "outputs": [
142 |     {
143 |      "name": "stderr",
144 |      "output_type": "stream",
145 |      "text": [
146 |       "Using TensorFlow backend.\n",
147 |       "C:\\Program Files\\Anaconda3\\lib\\site-packages\\sklearn\\cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n",
148 |       "  \"This module will be removed in 0.20.\", DeprecationWarning)\n"
149 |      ]
150 |     }
151 |    ],
152 |    "source": [
153 |     "from sklearn.metrics import accuracy_score\n",
154 |     "from sklearn import metrics\n",
155 |     "from xgboost import XGBClassifier\n",
156 |     "from sklearn.ensemble import GradientBoostingClassifier\n",
157 |     "\n",
158 |     "from keras.models import Sequential\n",
159 |     "from keras.layers import Dense, BatchNormalization, Dropout, Reshape, Flatten, MaxPool2D\n",
160 |     "from keras.layers.convolutional import Conv2D, MaxPooling2D, Conv1D, MaxPooling1D\n",
161 |     "from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in scikit-learn 0.20\n",
162 |     "from keras.optimizers import RMSprop, Adam\n",
163 |     "from keras.callbacks import ReduceLROnPlateau\n",
164 |     "from keras.callbacks import ModelCheckpoint\n",
165 |     "from keras.utils.np_utils import to_categorical"
166 |    ]
167 |   },
168 |   {
169 |    "cell_type": "code",
170 |    "execution_count": 12,
171 |    "metadata": {
172 |     "collapsed": false
173 |    },
174 |    "outputs": [],
175 |    "source": [
176 |     "X_train,X_val,y_train,y_val= train_test_split(train_X,Y_train,test_size=0.2,random_state=2)"
177 |    ]
178 |   },
179 |   {
180 |    "cell_type": "code",
181 |    "execution_count": 13,
182 |    "metadata": {
183 |     "collapsed": true
184 |    },
185 |    "outputs": [],
186 |    "source": [
187 |     "X_train = X_train.values\n",
188 |     "X_val = X_val.values"
189 |    ]
190 |   },
191 |   {
192 |    "cell_type": "code",
193 |    "execution_count": 14,
194 |    "metadata": {
195 |     "collapsed": true
196 |    },
197 |    "outputs": [],
198 |    "source": [
199 |     "y_train =  y_train.values\n",
200 |     "yy_train = to_categorical(y_train)\n",
201 |     "\n",
202 |     "y_val =  y_val.values\n",
203 |     "yy_val = to_categorical(y_val)"
204 |    ]
205 |   },
206 |   {
207 |    "cell_type": "code",
208 |    "execution_count": 15,
209 |    "metadata": {
210 |     "collapsed": false
211 |    },
212 |    "outputs": [],
213 |    "source": [
214 |     "# Set up the CNN model\n",
215 |     "# Architecture: In -> BatchNorm -> Reshape -> [[Conv2D->relu]*2 -> MaxPool2D]*2 -> Flatten -> Dense -> Dropout -> Out (the Dropout after each pooling layer is commented out)\n",
216 |     "\n",
217 |     "model = Sequential()\n",
218 |     "\n",
219 |     "model.add(BatchNormalization(input_shape=(355,)))\n",
220 |     "model.add(Reshape((355,1,1)))\n",
221 |     "\n",
222 |     "\n",
223 |     "model.add(Conv2D(filters = 16, kernel_size = 5,padding = 'Same', \n",
224 |     "                 activation ='relu'))\n",
225 |     "model.add(Conv2D(filters = 16, kernel_size = 5,padding = 'Same', \n",
226 |     "                 activation ='relu'))\n",
227 |     "model.add(MaxPooling2D(pool_size=2, padding='same'))\n",
228 |     "# model.add(Dropout(0.25))\n",
229 |     "\n",
230 |     "\n",
231 |     "model.add(Conv2D(filters = 32, kernel_size = 3,padding = 'Same', \n",
232 |     "                 activation ='relu'))\n",
233 |     "model.add(Conv2D(filters = 32, kernel_size = 3,padding = 'Same', \n",
234 |     "                 activation ='relu'))\n",
235 |     "model.add(MaxPooling2D(pool_size=2, strides=2, padding='same'))\n",
236 |     "# model.add(Dropout(0.25))\n",
237 |     "\n",
238 |     "\n",
239 |     "model.add(Flatten())\n",
240 |     "model.add(Dense(256, activation = 'relu'))\n",
241 |     "model.add(Dropout(0.5))\n",
242 |     "model.add(Dense(2, activation = 'softmax'))"
243 |    ]
244 |   },
245 |   {
246 |    "cell_type": "code",
247 |    "execution_count": 16,
248 |    "metadata": {
249 |     "collapsed": true
250 |    },
251 |    "outputs": [],
252 |    "source": [
253 |     "model.compile(loss='categorical_crossentropy',optimizer=Adam(),metrics=['accuracy'])"
254 |    ]
255 |   },
256 |   {
257 |    "cell_type": "code",
258 |    "execution_count": 17,
259 |    "metadata": {
260 |     "collapsed": false,
261 |     "scrolled": true
262 |    },
263 |    "outputs": [
264 |     {
265 |      "name": "stdout",
266 |      "output_type": "stream",
267 |      "text": [
268 |       "Train on 10800 samples, validate on 1200 samples\n",
269 |       "Epoch 1/15\n",
270 |       "10800/10800 [==============================] - 14s 1ms/step - loss: 0.2493 - acc: 0.9522 - val_loss: 0.4120 - val_acc: 0.9583\n",
271 |       "Epoch 2/15\n",
272 |       "10800/10800 [==============================] - 14s 1ms/step - loss: 0.1865 - acc: 0.9530 - val_loss: 0.2823 - val_acc: 0.9583\n",
273 |       "Epoch 3/15\n",
274 |       "10800/10800 [==============================] - 14s 1ms/step - loss: 0.1729 - acc: 0.9530 - val_loss: 0.2359 - val_acc: 0.9583\n",
275 |       "Epoch 4/15\n",
276 |       "10800/10800 [==============================] - 13s 1ms/step - loss: 0.1679 - acc: 0.9530 - val_loss: 0.1838 - val_acc: 0.9583\n",
277 |       "Epoch 5/15\n",
278 |       "10800/10800 [==============================] - 14s 1ms/step - loss: 0.1632 - acc: 0.9530 - val_loss: 0.1968 - val_acc: 0.9583\n",
279 |       "Epoch 6/15\n",
280 |       "10800/10800 [==============================] - 14s 1ms/step - loss: 0.1657 - acc: 0.9530 - val_loss: 0.1643 - val_acc: 0.9583\n",
281 |       "Epoch 7/15\n",
282 |       "10800/10800 [==============================] - 14s 1ms/step - loss: 0.1614 - acc: 0.9530 - val_loss: 0.2133 - val_acc: 0.9583\n",
283 |       "Epoch 8/15\n",
284 |       "10800/10800 [==============================] - 13s 1ms/step - loss: 0.1626 - acc: 0.9530 - val_loss: 0.1540 - val_acc: 0.9583\n",
285 |       "Epoch 9/15\n",
286 |       "10800/10800 [==============================] - 15s 1ms/step - loss: 0.1612 - acc: 0.9530 - val_loss: 0.1574 - val_acc: 0.9583\n",
287 |       "Epoch 10/15\n",
288 |       "10800/10800 [==============================] - 14s 1ms/step - loss: 0.1605 - acc: 0.9530 - val_loss: 0.1564 - val_acc: 0.9583\n",
289 |       "Epoch 11/15\n",
290 |       "10800/10800 [==============================] - 15s 1ms/step - loss: 0.1586 - acc: 0.9530 - val_loss: 0.1549 - val_acc: 0.9583\n",
291 |       "Epoch 12/15\n",
292 |       "10800/10800 [==============================] - 14s 1ms/step - loss: 0.1585 - acc: 0.9530 - val_loss: 0.1545 - val_acc: 0.9583\n",
293 |       "Epoch 13/15\n",
294 |       "10800/10800 [==============================] - 14s 1ms/step - loss: 0.1594 - acc: 0.9530 - val_loss: 0.1565 - val_acc: 0.9583\n",
295 |       "Epoch 14/15\n",
296 |       "10800/10800 [==============================] - 15s 1ms/step - loss: 0.1570 - acc: 0.9530 - val_loss: 0.1589 - val_acc: 0.9583\n",
297 |       "Epoch 15/15\n",
298 |       "10800/10800 [==============================] - 14s 1ms/step - loss: 0.1569 - acc: 0.9529 - val_loss: 0.1572 - val_acc: 0.9583\n"
299 |      ]
300 |     }
301 |    ],
302 |    "source": [
303 |     "history=model.fit(X_train,yy_train, batch_size=256, epochs=15, verbose=1, validation_split=0.1)"
304 |    ]
305 |   },
306 |   {
307 |    "cell_type": "code",
308 |    "execution_count": 18,
309 |    "metadata": {
310 |     "collapsed": false
311 |    },
312 |    "outputs": [
313 |     {
314 |      "name": "stdout",
315 |      "output_type": "stream",
316 |      "text": [
317 |       "3000/3000 [==============================] - 1s 452us/step\n",
318 |       "0.764895321667\n"
319 |      ]
320 |     }
321 |    ],
322 |    "source": [
323 |     "predictions = model.predict_proba(X_val,verbose=1)\n",
324 |     "pre = predictions[:,1]\n",
325 |     "val_auc = metrics.roc_auc_score(y_val,pre)#验证集上的auc值\n",
326 |     "print(val_auc)"
327 |    ]
328 |   },
329 |   {
330 |    "cell_type": "code",
331 |    "execution_count": 19,
332 |    "metadata": {
333 |     "collapsed": false
334 |    },
335 |    "outputs": [
336 |     {
337 |      "name": "stdout",
338 |      "output_type": "stream",
339 |      "text": [
340 |       "10000/10000 [==============================] - 5s 453us/step\n"
341 |      ]
342 |     },
343 |     {
344 |      "data": {
345 |       "text/plain": [
346 |        "(10000,)"
347 |       ]
348 |      },
349 |      "execution_count": 19,
350 |      "metadata": {},
351 |      "output_type": "execute_result"
352 |     }
353 |    ],
354 |    "source": [
355 |     "preds = model.predict_proba(test_X.values)\n",
356 |     "pred = preds[:,1]\n",
357 |     "pred.shape"
358 |    ]
359 |   },
360 |   {
361 |    "cell_type": "code",
362 |    "execution_count": 20,
363 |    "metadata": {
364 |     "collapsed": true
365 |    },
366 |    "outputs": [],
367 |    "source": [
368 |     "Submission = pd.DataFrame({'cust_id': test['cust_id'], 'pred_prob': pred})\n",
369 |     "Submission.to_csv('../result/nn.csv',index=False)"
370 |    ]
371 |   },
372 |   {
373 |    "cell_type": "code",
374 |    "execution_count": null,
375 |    "metadata": {
376 |     "collapsed": true
377 |    },
378 |    "outputs": [],
379 |    "source": []
380 |   },
381 |   {
382 |    "cell_type": "code",
383 |    "execution_count": null,
384 |    "metadata": {
385 |     "collapsed": true
386 |    },
387 |    "outputs": [],
388 |    "source": []
389 |   }
390 |  ],
391 |  "metadata": {
392 |   "anaconda-cloud": {},
393 |   "kernelspec": {
394 |    "display_name": "Python [default]",
395 |    "language": "python",
396 |    "name": "python3"
397 |   },
398 |   "language_info": {
399 |    "codemirror_mode": {
400 |     "name": "ipython",
401 |     "version": 3
402 |    },
403 |    "file_extension": ".py",
404 |    "mimetype": "text/x-python",
405 |    "name": "python",
406 |    "nbconvert_exporter": "python",
407 |    "pygments_lexer": "ipython3",
408 |    "version": "3.5.2"
409 |   }
410 |  },
411 |  "nbformat": 4,
412 |  "nbformat_minor": 1
413 | }
414 | 


--------------------------------------------------------------------------------
/模型二代码b榜0.749880/code/.ipynb_checkpoints/nn预测-checkpoint.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {
  7 |     "collapsed": true
  8 |    },
  9 |    "outputs": [],
 10 |    "source": [
 11 |     "import numpy as np\n",
 12 |     "import pandas as pd\n",
 13 |     "import matplotlib.pyplot as plt\n",
 14 |     "%matplotlib inline "
 15 |    ]
 16 |   },
 17 |   {
 18 |    "cell_type": "code",
 19 |    "execution_count": 2,
 20 |    "metadata": {
 21 |     "collapsed": false
 22 |    },
 23 |    "outputs": [],
 24 |    "source": [
 25 |     "train = pd.read_csv('../data/train_xy.csv')"
 26 |    ]
 27 |   },
 28 |   {
 29 |    "cell_type": "code",
 30 |    "execution_count": 3,
 31 |    "metadata": {
 32 |     "collapsed": true
 33 |    },
 34 |    "outputs": [],
 35 |    "source": [
 36 |     "test = pd.read_csv('../data/test_all.csv')"
 37 |    ]
 38 |   },
 39 |   {
 40 |    "cell_type": "code",
 41 |    "execution_count": 5,
 42 |    "metadata": {
 43 |     "collapsed": false
 44 |    },
 45 |    "outputs": [
 46 |     {
 47 |      "data": {
 48 |       "text/plain": [
 49 |        "(10000, 157)"
 50 |       ]
 51 |      },
 52 |      "execution_count": 5,
 53 |      "metadata": {},
 54 |      "output_type": "execute_result"
 55 |     }
 56 |    ],
 57 |    "source": [
 58 |     "x_train = train.rename(columns=lambda c: c.lstrip('\\ufeff')).drop(['cust_group','y','cust_id'],axis =1)  # strip a UTF-8 BOM from the header if present\n",
 59 |     "x_test = test.drop(['cust_group','cust_id'],axis=1)\n",
 60 |     "x_train.shape\n",
 61 |     "x_test.shape"
 62 |    ]
 63 |   },
 64 |   {
 65 |    "cell_type": "code",
 66 |    "execution_count": 6,
 67 |    "metadata": {
 68 |     "collapsed": false
 69 |    },
 70 |    "outputs": [
 71 |     {
 72 |      "data": {
 73 |       "text/plain": [
 74 |        "(25000, 157)"
 75 |       ]
 76 |      },
 77 |      "execution_count": 6,
 78 |      "metadata": {},
 79 |      "output_type": "execute_result"
 80 |     }
 81 |    ],
 82 |    "source": [
 83 |     "x = pd.concat([x_train,x_test])\n",
 84 |     "x.shape"
 85 |    ]
 86 |   },
 87 |   {
 88 |    "cell_type": "code",
 89 |    "execution_count": 7,
 90 |    "metadata": {
 91 |     "collapsed": true
 92 |    },
 93 |    "outputs": [],
 94 |    "source": [
 95 |     "Y_train = train['y']"
 96 |    ]
 97 |   },
 98 |   {
 99 |    "cell_type": "code",
100 |    "execution_count": 8,
101 |    "metadata": {
102 |     "collapsed": false
103 |    },
104 |    "outputs": [],
105 |    "source": [
106 |     "for i in range(96,158):\n",
107 |     "    col = 'x'+'_'+str(i)\n",
108 |     "    dummies_df = pd.get_dummies(x[col]).rename(columns=lambda x: col + str(x))\n",
109 |     "    x = pd.concat([x, dummies_df], axis=1)"
110 |    ]
111 |   },
112 |   {
113 |    "cell_type": "code",
114 |    "execution_count": 9,
115 |    "metadata": {
116 |     "collapsed": false
117 |    },
118 |    "outputs": [
119 |     {
120 |      "name": "stdout",
121 |      "output_type": "stream",
122 |      "text": [
123 |       "(15000, 355)\n",
124 |       "(10000, 355)\n"
125 |      ]
126 |     }
127 |    ],
128 |    "source": [
129 |     "train_X = x[0:15000]\n",
130 |     "test_X = x[15000:25000]\n",
131 |     "print(train_X.shape)\n",
132 |     "print(test_X.shape)"
133 |    ]
134 |   },
135 |   {
136 |    "cell_type": "code",
137 |    "execution_count": 10,
138 |    "metadata": {
139 |     "collapsed": false
140 |    },
141 |    "outputs": [
142 |     {
143 |      "name": "stderr",
144 |      "output_type": "stream",
145 |      "text": [
146 |       "Using TensorFlow backend.\n",
147 |       "C:\\Program Files\\Anaconda3\\lib\\site-packages\\sklearn\\cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n",
148 |       "  \"This module will be removed in 0.20.\", DeprecationWarning)\n"
149 |      ]
150 |     }
151 |    ],
152 |    "source": [
153 |     "from sklearn.metrics import accuracy_score\n",
154 |     "from sklearn import metrics\n",
155 |     "from xgboost import XGBClassifier\n",
156 |     "from sklearn.ensemble import GradientBoostingClassifier\n",
157 |     "\n",
158 |     "from keras.models import Sequential\n",
159 |     "from keras.layers import Dense, BatchNormalization, Dropout, Reshape, Flatten, MaxPool2D\n",
160 |     "from keras.layers.convolutional import Conv2D, MaxPooling2D, Conv1D, MaxPooling1D\n",
161 |     "from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in scikit-learn 0.20\n",
162 |     "from keras.optimizers import RMSprop, Adam\n",
163 |     "from keras.callbacks import ReduceLROnPlateau\n",
164 |     "from keras.callbacks import ModelCheckpoint\n",
165 |     "from keras.utils.np_utils import to_categorical"
166 |    ]
167 |   },
168 |   {
169 |    "cell_type": "code",
170 |    "execution_count": 12,
171 |    "metadata": {
172 |     "collapsed": false
173 |    },
174 |    "outputs": [],
175 |    "source": [
176 |     "X_train,X_val,y_train,y_val= train_test_split(train_X,Y_train,test_size=0.2,random_state=2)"
177 |    ]
178 |   },
179 |   {
180 |    "cell_type": "code",
181 |    "execution_count": 13,
182 |    "metadata": {
183 |     "collapsed": true
184 |    },
185 |    "outputs": [],
186 |    "source": [
187 |     "X_train = X_train.values\n",
188 |     "X_val = X_val.values"
189 |    ]
190 |   },
191 |   {
192 |    "cell_type": "code",
193 |    "execution_count": 14,
194 |    "metadata": {
195 |     "collapsed": true
196 |    },
197 |    "outputs": [],
198 |    "source": [
199 |     "y_train =  y_train.values\n",
200 |     "yy_train = to_categorical(y_train)\n",
201 |     "\n",
202 |     "y_val =  y_val.values\n",
203 |     "yy_val = to_categorical(y_val)"
204 |    ]
205 |   },
206 |   {
207 |    "cell_type": "code",
208 |    "execution_count": 15,
209 |    "metadata": {
210 |     "collapsed": false
211 |    },
212 |    "outputs": [],
213 |    "source": [
214 |     "# Set up the CNN model\n",
215 |     "# Architecture: In -> BatchNorm -> Reshape -> [[Conv2D->relu]*2 -> MaxPool2D]*2 -> Flatten -> Dense -> Dropout -> Out (the Dropout after each pooling layer is commented out)\n",
216 |     "\n",
217 |     "model = Sequential()\n",
218 |     "\n",
219 |     "model.add(BatchNormalization(input_shape=(355,)))\n",
220 |     "model.add(Reshape((355,1,1)))\n",
221 |     "\n",
222 |     "\n",
223 |     "model.add(Conv2D(filters = 16, kernel_size = 5,padding = 'Same', \n",
224 |     "                 activation ='relu'))\n",
225 |     "model.add(Conv2D(filters = 16, kernel_size = 5,padding = 'Same', \n",
226 |     "                 activation ='relu'))\n",
227 |     "model.add(MaxPooling2D(pool_size=2, padding='same'))\n",
228 |     "# model.add(Dropout(0.25))\n",
229 |     "\n",
230 |     "\n",
231 |     "model.add(Conv2D(filters = 32, kernel_size = 3,padding = 'Same', \n",
232 |     "                 activation ='relu'))\n",
233 |     "model.add(Conv2D(filters = 32, kernel_size = 3,padding = 'Same', \n",
234 |     "                 activation ='relu'))\n",
235 |     "model.add(MaxPooling2D(pool_size=2, strides=2, padding='same'))\n",
236 |     "# model.add(Dropout(0.25))\n",
237 |     "\n",
238 |     "\n",
239 |     "model.add(Flatten())\n",
240 |     "model.add(Dense(256, activation = 'relu'))\n",
241 |     "model.add(Dropout(0.5))\n",
242 |     "model.add(Dense(2, activation = 'softmax'))"
243 |    ]
244 |   },
245 |   {
246 |    "cell_type": "code",
247 |    "execution_count": 16,
248 |    "metadata": {
249 |     "collapsed": true
250 |    },
251 |    "outputs": [],
252 |    "source": [
253 |     "model.compile(loss='categorical_crossentropy',optimizer=Adam(),metrics=['accuracy'])"
254 |    ]
255 |   },
256 |   {
257 |    "cell_type": "code",
258 |    "execution_count": 17,
259 |    "metadata": {
260 |     "collapsed": false,
261 |     "scrolled": true
262 |    },
263 |    "outputs": [
264 |     {
265 |      "name": "stdout",
266 |      "output_type": "stream",
267 |      "text": [
268 |       "Train on 10800 samples, validate on 1200 samples\n",
269 |       "Epoch 1/15\n",
270 |       "10800/10800 [==============================] - 14s 1ms/step - loss: 0.2493 - acc: 0.9522 - val_loss: 0.4120 - val_acc: 0.9583\n",
271 |       "Epoch 2/15\n",
272 |       "10800/10800 [==============================] - 14s 1ms/step - loss: 0.1865 - acc: 0.9530 - val_loss: 0.2823 - val_acc: 0.9583\n",
273 |       "Epoch 3/15\n",
274 |       "10800/10800 [==============================] - 14s 1ms/step - loss: 0.1729 - acc: 0.9530 - val_loss: 0.2359 - val_acc: 0.9583\n",
275 |       "Epoch 4/15\n",
276 |       "10800/10800 [==============================] - 13s 1ms/step - loss: 0.1679 - acc: 0.9530 - val_loss: 0.1838 - val_acc: 0.9583\n",
277 |       "Epoch 5/15\n",
278 |       "10800/10800 [==============================] - 14s 1ms/step - loss: 0.1632 - acc: 0.9530 - val_loss: 0.1968 - val_acc: 0.9583\n",
279 |       "Epoch 6/15\n",
280 |       "10800/10800 [==============================] - 14s 1ms/step - loss: 0.1657 - acc: 0.9530 - val_loss: 0.1643 - val_acc: 0.9583\n",
281 |       "Epoch 7/15\n",
282 |       "10800/10800 [==============================] - 14s 1ms/step - loss: 0.1614 - acc: 0.9530 - val_loss: 0.2133 - val_acc: 0.9583\n",
283 |       "Epoch 8/15\n",
284 |       "10800/10800 [==============================] - 13s 1ms/step - loss: 0.1626 - acc: 0.9530 - val_loss: 0.1540 - val_acc: 0.9583\n",
285 |       "Epoch 9/15\n",
286 |       "10800/10800 [==============================] - 15s 1ms/step - loss: 0.1612 - acc: 0.9530 - val_loss: 0.1574 - val_acc: 0.9583\n",
287 |       "Epoch 10/15\n",
288 |       "10800/10800 [==============================] - 14s 1ms/step - loss: 0.1605 - acc: 0.9530 - val_loss: 0.1564 - val_acc: 0.9583\n",
289 |       "Epoch 11/15\n",
290 |       "10800/10800 [==============================] - 15s 1ms/step - loss: 0.1586 - acc: 0.9530 - val_loss: 0.1549 - val_acc: 0.9583\n",
291 |       "Epoch 12/15\n",
292 |       "10800/10800 [==============================] - 14s 1ms/step - loss: 0.1585 - acc: 0.9530 - val_loss: 0.1545 - val_acc: 0.9583\n",
293 |       "Epoch 13/15\n",
294 |       "10800/10800 [==============================] - 14s 1ms/step - loss: 0.1594 - acc: 0.9530 - val_loss: 0.1565 - val_acc: 0.9583\n",
295 |       "Epoch 14/15\n",
296 |       "10800/10800 [==============================] - 15s 1ms/step - loss: 0.1570 - acc: 0.9530 - val_loss: 0.1589 - val_acc: 0.9583\n",
297 |       "Epoch 15/15\n",
298 |       "10800/10800 [==============================] - 14s 1ms/step - loss: 0.1569 - acc: 0.9529 - val_loss: 0.1572 - val_acc: 0.9583\n"
299 |      ]
300 |     }
301 |    ],
302 |    "source": [
303 |     "history=model.fit(X_train,yy_train, batch_size=256, epochs=15, verbose=1, validation_split=0.1)"
304 |    ]
305 |   },
306 |   {
307 |    "cell_type": "code",
308 |    "execution_count": 18,
309 |    "metadata": {
310 |     "collapsed": false
311 |    },
312 |    "outputs": [
313 |     {
314 |      "name": "stdout",
315 |      "output_type": "stream",
316 |      "text": [
317 |       "3000/3000 [==============================] - 1s 452us/step\n",
318 |       "0.764895321667\n"
319 |      ]
320 |     }
321 |    ],
322 |    "source": [
323 |     "predictions = model.predict_proba(X_val,verbose=1)\n",
324 |     "pre = predictions[:,1]\n",
325 |     "val_auc = metrics.roc_auc_score(y_val,pre)#验证集上的auc值\n",
326 |     "print(val_auc)"
327 |    ]
328 |   },
329 |   {
330 |    "cell_type": "code",
331 |    "execution_count": 19,
332 |    "metadata": {
333 |     "collapsed": false
334 |    },
335 |    "outputs": [
336 |     {
337 |      "name": "stdout",
338 |      "output_type": "stream",
339 |      "text": [
340 |       "10000/10000 [==============================] - 5s 453us/step\n"
341 |      ]
342 |     },
343 |     {
344 |      "data": {
345 |       "text/plain": [
346 |        "(10000,)"
347 |       ]
348 |      },
349 |      "execution_count": 19,
350 |      "metadata": {},
351 |      "output_type": "execute_result"
352 |     }
353 |    ],
354 |    "source": [
355 |     "preds = model.predict_proba(test_X.values)\n",
356 |     "pred = preds[:,1]\n",
357 |     "pred.shape"
358 |    ]
359 |   },
360 |   {
361 |    "cell_type": "code",
362 |    "execution_count": 20,
363 |    "metadata": {
364 |     "collapsed": true
365 |    },
366 |    "outputs": [],
367 |    "source": [
368 |     "Submission = pd.DataFrame({'cust_id': test['cust_id'], 'pred_prob': pred})\n",
369 |     "Submission.to_csv('../result/nn.csv',index=False)"
370 |    ]
371 |   },
372 |   {
373 |    "cell_type": "code",
374 |    "execution_count": null,
375 |    "metadata": {
376 |     "collapsed": true
377 |    },
378 |    "outputs": [],
379 |    "source": []
380 |   },
381 |   {
382 |    "cell_type": "code",
383 |    "execution_count": null,
384 |    "metadata": {
385 |     "collapsed": true
386 |    },
387 |    "outputs": [],
388 |    "source": []
389 |   }
390 |  ],
391 |  "metadata": {
392 |   "anaconda-cloud": {},
393 |   "kernelspec": {
394 |    "display_name": "Python [default]",
395 |    "language": "python",
396 |    "name": "python3"
397 |   },
398 |   "language_info": {
399 |    "codemirror_mode": {
400 |     "name": "ipython",
401 |     "version": 3
402 |    },
403 |    "file_extension": ".py",
404 |    "mimetype": "text/x-python",
405 |    "name": "python",
406 |    "nbconvert_exporter": "python",
407 |    "pygments_lexer": "ipython3",
408 |    "version": "3.5.2"
409 |   }
410 |  },
411 |  "nbformat": 4,
412 |  "nbformat_minor": 1
413 | }
414 | 


--------------------------------------------------------------------------------
/模型二代码b榜0.749880/code/semi_gbdt预测.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {
  7 |     "collapsed": true
  8 |    },
  9 |    "outputs": [],
 10 |    "source": [
 11 |     "import numpy as np\n",
 12 |     "import pandas as pd\n",
 13 |     "import matplotlib.pyplot as plt\n",
 14 |     "%matplotlib inline"
 15 |    ]
 16 |   },
 17 |   {
 18 |    "cell_type": "code",
 19 |    "execution_count": 2,
 20 |    "metadata": {
 21 |     "collapsed": false
 22 |    },
 23 |    "outputs": [
 24 |     {
 25 |      "data": {
 26 |       "text/plain": [
 27 |        "(10000, 160)"
 28 |       ]
 29 |      },
 30 |      "execution_count": 2,
 31 |      "metadata": {},
 32 |      "output_type": "execute_result"
 33 |     }
 34 |    ],
 35 |    "source": [
 36 |     "train_x1 = pd.read_csv('../data/train_ssvm_xy.csv')\n",
 37 |     "train_x1.shape"
 38 |    ]
 39 |   },
 40 |   {
 41 |    "cell_type": "code",
 42 |    "execution_count": 3,
 43 |    "metadata": {
 44 |     "collapsed": false
 45 |    },
 46 |    "outputs": [
 47 |     {
 48 |      "data": {
 49 |       "text/plain": [
 50 |        "(15000, 160)"
 51 |       ]
 52 |      },
 53 |      "execution_count": 3,
 54 |      "metadata": {},
 55 |      "output_type": "execute_result"
 56 |     }
 57 |    ],
 58 |    "source": [
 59 |     "train_x2 = pd.read_csv('../data/train_xy.csv')\n",
 60 |     "train_x2.shape"
 61 |    ]
 62 |   },
 63 |   {
 64 |    "cell_type": "code",
 65 |    "execution_count": 4,
 66 |    "metadata": {
 67 |     "collapsed": false
 68 |    },
 69 |    "outputs": [
 70 |     {
 71 |      "data": {
 72 |       "text/plain": [
 73 |        "(10000, 159)"
 74 |       ]
 75 |      },
 76 |      "execution_count": 4,
 77 |      "metadata": {},
 78 |      "output_type": "execute_result"
 79 |     }
 80 |    ],
 81 |    "source": [
 82 |     "test = pd.read_csv('../data/test_all.csv')\n",
 83 |     "test.shape"
 84 |    ]
 85 |   },
 86 |   {
 87 |    "cell_type": "code",
 88 |    "execution_count": 5,
 89 |    "metadata": {
 90 |     "collapsed": false
 91 |    },
 92 |    "outputs": [
 93 |     {
 94 |      "data": {
 95 |       "text/plain": [
 96 |        "(25000, 157)"
 97 |       ]
 98 |      },
 99 |      "execution_count": 5,
100 |      "metadata": {},
101 |      "output_type": "execute_result"
102 |     }
103 |    ],
104 |    "source": [
105 |     "train_x11 = train_x1.drop(['cust_id','cust_group','y'],axis=1)\n",
106 |     "train_x22 = train_x2.drop(['\\ufeffcust_id','cust_group','y'],axis=1)\n",
107 |     "train_x = pd.concat([train_x11, train_x22])\n",
108 |     "train_x.shape"
109 |    ]
110 |   },
111 |   {
112 |    "cell_type": "code",
113 |    "execution_count": 6,
114 |    "metadata": {
115 |     "collapsed": false
116 |    },
117 |    "outputs": [
118 |     {
119 |      "data": {
120 |       "text/plain": [
121 |        "(10000, 157)"
122 |       ]
123 |      },
124 |      "execution_count": 6,
125 |      "metadata": {},
126 |      "output_type": "execute_result"
127 |     }
128 |    ],
129 |    "source": [
130 |     "test_x = test.drop(['cust_id','cust_group'],axis=1)\n",
131 |     "test_x.shape"
132 |    ]
133 |   },
134 |   {
135 |    "cell_type": "code",
136 |    "execution_count": 7,
137 |    "metadata": {
138 |     "collapsed": false
139 |    },
140 |    "outputs": [
141 |     {
142 |      "data": {
143 |       "text/plain": [
144 |        "(35000, 157)"
145 |       ]
146 |      },
147 |      "execution_count": 7,
148 |      "metadata": {},
149 |      "output_type": "execute_result"
150 |     }
151 |    ],
152 |    "source": [
153 |     "x = pd.concat([train_x, test_x])\n",
154 |     "x.shape"
155 |    ]
156 |   },
157 |   {
158 |    "cell_type": "code",
159 |    "execution_count": 8,
160 |    "metadata": {
161 |     "collapsed": false
162 |    },
163 |    "outputs": [
164 |     {
165 |      "data": {
166 |       "text/plain": [
167 |        "(25000,)"
168 |       ]
169 |      },
170 |      "execution_count": 8,
171 |      "metadata": {},
172 |      "output_type": "execute_result"
173 |     }
174 |    ],
175 |    "source": [
176 |     "train_y1 = train_x1['y']\n",
177 |     "train_y2 = train_x2['y']\n",
178 |     "Y_train = train_y1.append(train_y2)\n",
179 |     "Y_train.shape"
180 |    ]
181 |   },
182 |   {
183 |    "cell_type": "code",
184 |    "execution_count": 9,
185 |    "metadata": {
186 |     "collapsed": false
187 |    },
188 |    "outputs": [
189 |     {
190 |      "name": "stdout",
191 |      "output_type": "stream",
192 |      "text": [
193 |       "(35000, 364)\n"
194 |      ]
195 |     }
196 |    ],
197 |    "source": [
198 |     "for i in range(96,158):\n",
199 |     "    col = 'x'+'_'+str(i)\n",
200 |     "    if col in x.columns.values:\n",
201 |     "        dummies_df = pd.get_dummies(x[col]).rename(columns=lambda x: col +'_'+ str(x))\n",
202 |     "        x = pd.concat([x, dummies_df], axis=1)\n",
203 |     "print(x.shape)"
204 |    ]
205 |   },
206 |   {
207 |    "cell_type": "code",
208 |    "execution_count": 10,
209 |    "metadata": {
210 |     "collapsed": false
211 |    },
212 |    "outputs": [
213 |     {
214 |      "name": "stdout",
215 |      "output_type": "stream",
216 |      "text": [
217 |       "(25000, 364)\n",
218 |       "(10000, 364)\n"
219 |      ]
220 |     }
221 |    ],
222 |    "source": [
223 |     "train_X = x[0:25000]\n",
224 |     "test_X = x[25000:35000]\n",
225 |     "print(train_X.shape)\n",
226 |     "print(test_X.shape)"
227 |    ]
228 |   },
229 |   {
230 |    "cell_type": "code",
231 |    "execution_count": 11,
232 |    "metadata": {
233 |     "collapsed": false
234 |    },
235 |    "outputs": [
236 |     {
237 |      "name": "stderr",
238 |      "output_type": "stream",
239 |      "text": [
240 |       "C:\\Program Files\\Anaconda3\\lib\\site-packages\\sklearn\\cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n",
241 |       "  \"This module will be removed in 0.20.\", DeprecationWarning)\n",
242 |       "Using TensorFlow backend.\n"
243 |      ]
244 |     }
245 |    ],
246 |    "source": [
247 |     "from sklearn.tree import DecisionTreeClassifier\n",
248 |     "from sklearn.ensemble import RandomForestClassifier\n",
249 |     "from sklearn.ensemble import AdaBoostClassifier\n",
250 |     "from sklearn.ensemble import ExtraTreesClassifier\n",
251 |     "from sklearn.ensemble import GradientBoostingClassifier\n",
252 |     "from sklearn.neighbors import KNeighborsClassifier\n",
253 |     "from sklearn.svm import SVC\n",
254 |     "from sklearn import metrics  #accuracy_score,recall_score,f1_score\n",
255 |     "from sklearn.metrics import classification_report\n",
256 |     "from sklearn.metrics import precision_recall_fscore_support\n",
257 |     "from sklearn.utils.multiclass import unique_labels\n",
258 |     "from sklearn.metrics import accuracy_score\n",
259 |     "from xgboost import XGBClassifier\n",
260 |     "from sklearn.ensemble import GradientBoostingClassifier\n",
261 |     "from sklearn.cross_validation import cross_val_score\n",
262 |     "from lightgbm import LGBMClassifier\n",
263 |     "from sklearn.model_selection import train_test_split, GridSearchCV, KFold\n",
264 |     "from sklearn.svm import LinearSVC\n",
265 |     "from sklearn import linear_model\n",
266 |     "import lightgbm as lgb\n",
267 |     "import xgboost as xgb\n",
268 |     "from sklearn.model_selection import GridSearchCV\n",
269 |     "\n",
270 |     "from keras.models import Model\n",
271 |     "from keras.layers import Dense, Input"
272 |    ]
273 |   },
274 |   {
275 |    "cell_type": "code",
276 |    "execution_count": 12,
277 |    "metadata": {
278 |     "collapsed": false,
279 |     "scrolled": true
280 |    },
281 |    "outputs": [],
282 |    "source": [
283 |     "# encoding_dim = 600\n",
284 |     "# input_dim = Input(shape=(364,))\n",
285 |     "\n",
286 |     "# encoded = Dense(364, activation='linear')(input_dim)\n",
287 |     "# # encoded = Dense(300, activation='relu')(encoded)\n",
288 |     "# # encoded = Dense(32, activation='relu')(encoded)\n",
289 |     "# encoder_output = Dense(encoding_dim)(encoded)\n",
290 |     "\n",
291 |     "# decoded = Dense(600, activation='relu')(encoder_output)\n",
292 |     "# # decoded = Dense(64, activation='relu')(decoded)\n",
293 |     "# # decoded = Dense(128, activation='relu')(decoded)\n",
294 |     "# decoded = Dense(364, activation='tanh')(decoded)\n",
295 |     "\n",
296 |     "# autoencoder = Model(inputs=input_dim, outputs=decoded)\n",
297 |     "\n",
298 |     "# encoder = Model(inputs=input_dim, outputs=encoder_output)\n",
299 |     "\n",
300 |     "# autoencoder.compile(optimizer='adam', loss='mse')\n",
301 |     "# # training\n",
302 |     "# autoencoder.fit(train_X.values, train_X.values, epochs=20, batch_size=150, shuffle=True)"
303 |    ]
304 |   },
305 |   {
306 |    "cell_type": "code",
307 |    "execution_count": 13,
308 |    "metadata": {
309 |     "collapsed": false
310 |    },
311 |    "outputs": [],
312 |    "source": [
313 |     "# new_train_feature = encoder.predict(train_X.values)\n",
314 |     "# new_test_feature = encoder.predict(test_X.values)\n",
315 |     "# print(new_train_feature.shape)\n",
316 |     "# print(new_test_feature.shape)"
317 |    ]
318 |   },
319 |   {
320 |    "cell_type": "code",
321 |    "execution_count": 14,
322 |    "metadata": {
323 |     "collapsed": false
324 |    },
325 |    "outputs": [],
326 |    "source": [
327 |     "X_train,X_val,y_train,y_val= train_test_split(train_X,Y_train,test_size=0.2,random_state=2)"
328 |    ]
329 |   },
330 |   {
331 |    "cell_type": "code",
332 |    "execution_count": 15,
333 |    "metadata": {
334 |     "collapsed": false,
335 |     "scrolled": true
336 |    },
337 |    "outputs": [
338 |     {
339 |      "data": {
340 |       "text/plain": [
341 |        "GridSearchCV(cv=5, error_score='raise',\n",
342 |        "       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,\n",
343 |        "              learning_rate=0.1, loss='deviance', max_depth=3,\n",
344 |        "              max_features=None, max_leaf_nodes=None,\n",
345 |        "              min_impurity_split=1e-07, min_samples_leaf=1,\n",
346 |        "              min_samples_split=2, min_weight_fraction_leaf=0.0,\n",
347 |        "              n_estimators=100, presort='auto', random_state=None,\n",
348 |        "              subsample=1.0, verbose=0, warm_start=False),\n",
349 |        "       fit_params={}, iid=True, n_jobs=1,\n",
350 |        "       param_grid=[{'max_depth': range(4, 8, 12), 'n_estimators': range(100, 300, 500), 'learning_rate': [0.01, 0.1]}],\n",
351 |        "       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,\n",
352 |        "       scoring='roc_auc', verbose=0)"
353 |       ]
354 |      },
355 |      "execution_count": 15,
356 |      "metadata": {},
357 |      "output_type": "execute_result"
358 |     }
359 |    ],
360 |    "source": [
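361 |     "tuned_parameters= [{'n_estimators':range(100,300,500),  # NOTE(review): range(100,300,500) yields only [100] and range(4,8,12) only [4]; use explicit lists if three values each were intended\n",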
362 |     "                  'max_depth':range(4,8,12),\n",
363 |     "                  'learning_rate':[0.01, 0.1]\n",
364 |     "                  }]\n",
365 |     "clf = GridSearchCV(GradientBoostingClassifier(), tuned_parameters, cv=5, scoring='roc_auc')\n",
366 |     "clf.fit(X_train, y_train)"
367 |    ]
368 |   },
369 |   {
370 |    "cell_type": "code",
371 |    "execution_count": 18,
372 |    "metadata": {
373 |     "collapsed": false
374 |    },
375 |    "outputs": [
376 |     {
377 |      "name": "stdout",
378 |      "output_type": "stream",
379 |      "text": [
380 |       "0.805375686875\n"
381 |      ]
382 |     }
383 |    ],
384 |    "source": [
385 |     "predictions = clf.predict_proba(X_val)\n",
386 |     "pre = predictions[:,1]\n",
387 |     "val_auc = metrics.roc_auc_score(y_val,pre)#验证集上的auc值\n",
388 |     "print(val_auc)  "
389 |    ]
390 |   },
391 |   {
392 |    "cell_type": "code",
393 |    "execution_count": 19,
394 |    "metadata": {
395 |     "collapsed": false
396 |    },
397 |    "outputs": [
398 |     {
399 |      "data": {
400 |       "text/plain": [
401 |        "(10000,)"
402 |       ]
403 |      },
404 |      "execution_count": 19,
405 |      "metadata": {},
406 |      "output_type": "execute_result"
407 |     }
408 |    ],
409 |    "source": [
410 |     "preds = clf.predict_proba(test_X)\n",
411 |     "pred = preds[:,1]\n",
412 |     "pred.shape"
413 |    ]
414 |   },
415 |   {
416 |    "cell_type": "code",
417 |    "execution_count": 20,
418 |    "metadata": {
419 |     "collapsed": true
420 |    },
421 |    "outputs": [],
422 |    "source": [
423 |     "Submission = pd.DataFrame({'cust_id': test['cust_id'], 'pred_prob': pred})\n",
424 |     "Submission.to_csv('../result/semi_gbdt.csv',index=False)"
425 |    ]
426 |   },
427 |   {
428 |    "cell_type": "code",
429 |    "execution_count": null,
430 |    "metadata": {
431 |     "collapsed": true
432 |    },
433 |    "outputs": [],
434 |    "source": []
435 |   },
436 |   {
437 |    "cell_type": "code",
438 |    "execution_count": null,
439 |    "metadata": {
440 |     "collapsed": true
441 |    },
442 |    "outputs": [],
443 |    "source": []
444 |   }
445 |  ],
446 |  "metadata": {
447 |   "anaconda-cloud": {},
448 |   "kernelspec": {
449 |    "display_name": "Python [default]",
450 |    "language": "python",
451 |    "name": "python3"
452 |   },
453 |   "language_info": {
454 |    "codemirror_mode": {
455 |     "name": "ipython",
456 |     "version": 3
457 |    },
458 |    "file_extension": ".py",
459 |    "mimetype": "text/x-python",
460 |    "name": "python",
461 |    "nbconvert_exporter": "python",
462 |    "pygments_lexer": "ipython3",
463 |    "version": "3.5.2"
464 |   }
465 |  },
466 |  "nbformat": 4,
467 |  "nbformat_minor": 1
468 | }
469 | 


--------------------------------------------------------------------------------
/模型二代码b榜0.749880/code/semi_nn预测.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {
  7 |     "collapsed": true
  8 |    },
  9 |    "outputs": [],
 10 |    "source": [
 11 |     "import numpy as np\n",
 12 |     "import pandas as pd\n",
 13 |     "import matplotlib.pyplot as plt\n",
 14 |     "%matplotlib inline"
 15 |    ]
 16 |   },
 17 |   {
 18 |    "cell_type": "code",
 19 |    "execution_count": 2,
 20 |    "metadata": {
 21 |     "collapsed": false
 22 |    },
 23 |    "outputs": [
 24 |     {
 25 |      "data": {
 26 |       "text/plain": [
 27 |        "(10000, 160)"
 28 |       ]
 29 |      },
 30 |      "execution_count": 2,
 31 |      "metadata": {},
 32 |      "output_type": "execute_result"
 33 |     }
 34 |    ],
 35 |    "source": [
 36 |     "train_x1 = pd.read_csv('../data/train_ssvm_xy.csv')\n",
 37 |     "train_x1.shape"
 38 |    ]
 39 |   },
 40 |   {
 41 |    "cell_type": "code",
 42 |    "execution_count": 3,
 43 |    "metadata": {
 44 |     "collapsed": false
 45 |    },
 46 |    "outputs": [
 47 |     {
 48 |      "data": {
 49 |       "text/plain": [
 50 |        "(15000, 160)"
 51 |       ]
 52 |      },
 53 |      "execution_count": 3,
 54 |      "metadata": {},
 55 |      "output_type": "execute_result"
 56 |     }
 57 |    ],
 58 |    "source": [
 59 |     "train_x2 = pd.read_csv('../data/train_xy.csv')\n",
 60 |     "train_x2.shape"
 61 |    ]
 62 |   },
 63 |   {
 64 |    "cell_type": "code",
 65 |    "execution_count": 4,
 66 |    "metadata": {
 67 |     "collapsed": false
 68 |    },
 69 |    "outputs": [
 70 |     {
 71 |      "data": {
 72 |       "text/plain": [
 73 |        "(10000, 159)"
 74 |       ]
 75 |      },
 76 |      "execution_count": 4,
 77 |      "metadata": {},
 78 |      "output_type": "execute_result"
 79 |     }
 80 |    ],
 81 |    "source": [
 82 |     "test = pd.read_csv('../data/test_all.csv')\n",
 83 |     "test.shape"
 84 |    ]
 85 |   },
 86 |   {
 87 |    "cell_type": "code",
 88 |    "execution_count": 5,
 89 |    "metadata": {
 90 |     "collapsed": false
 91 |    },
 92 |    "outputs": [
 93 |     {
 94 |      "data": {
 95 |       "text/plain": [
 96 |        "(25000, 157)"
 97 |       ]
 98 |      },
 99 |      "execution_count": 5,
100 |      "metadata": {},
101 |      "output_type": "execute_result"
102 |     }
103 |    ],
104 |    "source": [
105 |     "train_x11 = train_x1.drop(['cust_id','cust_group','y'],axis=1)\n",
106 |     "train_x22 = train_x2.drop(['\\ufeffcust_id','cust_group','y'],axis=1)\n",
107 |     "train_x = pd.concat([train_x11, train_x22])\n",
108 |     "train_x.shape"
109 |    ]
110 |   },
111 |   {
112 |    "cell_type": "code",
113 |    "execution_count": 6,
114 |    "metadata": {
115 |     "collapsed": false
116 |    },
117 |    "outputs": [
118 |     {
119 |      "data": {
120 |       "text/plain": [
121 |        "(10000, 157)"
122 |       ]
123 |      },
124 |      "execution_count": 6,
125 |      "metadata": {},
126 |      "output_type": "execute_result"
127 |     }
128 |    ],
129 |    "source": [
130 |     "test_x = test.drop(['cust_id','cust_group'],axis=1)\n",
131 |     "test_x.shape"
132 |    ]
133 |   },
134 |   {
135 |    "cell_type": "code",
136 |    "execution_count": 7,
137 |    "metadata": {
138 |     "collapsed": false
139 |    },
140 |    "outputs": [
141 |     {
142 |      "data": {
143 |       "text/plain": [
144 |        "(35000, 157)"
145 |       ]
146 |      },
147 |      "execution_count": 7,
148 |      "metadata": {},
149 |      "output_type": "execute_result"
150 |     }
151 |    ],
152 |    "source": [
153 |     "x = pd.concat([train_x, test_x])\n",
154 |     "x.shape"
155 |    ]
156 |   },
157 |   {
158 |    "cell_type": "code",
159 |    "execution_count": 8,
160 |    "metadata": {
161 |     "collapsed": false
162 |    },
163 |    "outputs": [
164 |     {
165 |      "data": {
166 |       "text/plain": [
167 |        "(25000,)"
168 |       ]
169 |      },
170 |      "execution_count": 8,
171 |      "metadata": {},
172 |      "output_type": "execute_result"
173 |     }
174 |    ],
175 |    "source": [
176 |     "train_y1 = train_x1['y']\n",
177 |     "train_y2 = train_x2['y']\n",
178 |     "Y_train = train_y1.append(train_y2)\n",
179 |     "Y_train.shape"
180 |    ]
181 |   },
182 |   {
183 |    "cell_type": "code",
184 |    "execution_count": 9,
185 |    "metadata": {
186 |     "collapsed": false
187 |    },
188 |    "outputs": [
189 |     {
190 |      "name": "stdout",
191 |      "output_type": "stream",
192 |      "text": [
193 |       "(35000, 364)\n"
194 |      ]
195 |     }
196 |    ],
197 |    "source": [
198 |     "for i in range(96,158):\n",
199 |     "    col = 'x'+'_'+str(i)\n",
200 |     "    if col in x.columns.values:\n",
201 |     "        dummies_df = pd.get_dummies(x[col]).rename(columns=lambda x: col +'_'+ str(x))\n",
202 |     "        x = pd.concat([x, dummies_df], axis=1)\n",
203 |     "print(x.shape)"
204 |    ]
205 |   },
206 |   {
207 |    "cell_type": "code",
208 |    "execution_count": 10,
209 |    "metadata": {
210 |     "collapsed": false
211 |    },
212 |    "outputs": [
213 |     {
214 |      "name": "stdout",
215 |      "output_type": "stream",
216 |      "text": [
217 |       "(25000, 364)\n",
218 |       "(10000, 364)\n"
219 |      ]
220 |     }
221 |    ],
222 |    "source": [
223 |     "train_X = x[0:25000]\n",
224 |     "test_X = x[25000:35000]\n",
225 |     "print(train_X.shape)\n",
226 |     "print(test_X.shape)"
227 |    ]
228 |   },
229 |   {
230 |    "cell_type": "code",
231 |    "execution_count": 11,
232 |    "metadata": {
233 |     "collapsed": false
234 |    },
235 |    "outputs": [
236 |     {
237 |      "name": "stderr",
238 |      "output_type": "stream",
239 |      "text": [
240 |       "Using TensorFlow backend.\n",
241 |       "C:\\Program Files\\Anaconda3\\lib\\site-packages\\sklearn\\cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n",
242 |       "  \"This module will be removed in 0.20.\", DeprecationWarning)\n"
243 |      ]
244 |     }
245 |    ],
246 |    "source": [
247 |     "from sklearn.metrics import accuracy_score\n",
248 |     "from sklearn import metrics\n",
249 |     "\n",
250 |     "from keras.models import Sequential\n",
251 |     "from keras.layers import Dense, BatchNormalization, Dropout, Reshape, Flatten, MaxPool2D\n",
252 |     "from keras.layers.convolutional import Conv2D, MaxPooling2D, Conv1D, MaxPooling1D\n",
253 |     "from sklearn.cross_validation import train_test_split\n",
254 |     "from keras.optimizers import RMSprop, Adam\n",
255 |     "from keras.callbacks import ReduceLROnPlateau\n",
256 |     "from keras.callbacks import ModelCheckpoint\n",
257 |     "from keras.utils.np_utils import to_categorical"
258 |    ]
259 |   },
260 |   {
261 |    "cell_type": "code",
262 |    "execution_count": 12,
263 |    "metadata": {
264 |     "collapsed": false
265 |    },
266 |    "outputs": [],
267 |    "source": [
268 |     "X_train,X_val,y_train,y_val= train_test_split(train_X,Y_train,test_size=0.2,random_state=2)"
269 |    ]
270 |   },
271 |   {
272 |    "cell_type": "code",
273 |    "execution_count": 13,
274 |    "metadata": {
275 |     "collapsed": false
276 |    },
277 |    "outputs": [
278 |     {
279 |      "name": "stdout",
280 |      "output_type": "stream",
281 |      "text": [
282 |       "(20000, 364)\n",
283 |       "(5000, 364)\n"
284 |      ]
285 |     }
286 |    ],
287 |    "source": [
288 |     "X_train = X_train.values\n",
289 |     "X_val = X_val.values\n",
290 |     "print(X_train.shape)\n",
291 |     "print(X_val.shape)"
292 |    ]
293 |   },
294 |   {
295 |    "cell_type": "code",
296 |    "execution_count": 14,
297 |    "metadata": {
298 |     "collapsed": false
299 |    },
300 |    "outputs": [],
301 |    "source": [
302 |     "y_train =  y_train.values\n",
303 |     "yy_train = to_categorical(y_train)\n",
304 |     "\n",
305 |     "y_val =  y_val.values\n",
306 |     "yy_val = to_categorical(y_val)"
307 |    ]
308 |   },
309 |   {
310 |    "cell_type": "code",
311 |    "execution_count": 15,
312 |    "metadata": {
313 |     "collapsed": false,
314 |     "scrolled": true
315 |    },
316 |    "outputs": [],
317 |    "source": [
318 |     "# Set the CNN model \n",
319 |     "# CNN architecture: In -> BatchNorm -> [[Conv2D->relu]*2 -> MaxPool2D]*2 -> Flatten -> Dense -> Dropout -> Out (the per-block Dropout layers are commented out)\n",
320 |     "\n",
321 |     "model = Sequential()\n",
322 |     "\n",
323 |     "model.add(BatchNormalization(input_shape=(364,)))\n",
324 |     "model.add(Reshape((364,1,1)))\n",
325 |     "\n",
326 |     "\n",
327 |     "model.add(Conv2D(filters = 16, kernel_size = 5,padding = 'Same', \n",
328 |     "                 activation ='relu'))\n",
329 |     "model.add(Conv2D(filters = 16, kernel_size = 5,padding = 'Same', \n",
330 |     "                 activation ='relu'))\n",
331 |     "model.add(MaxPooling2D(pool_size=2, padding='same'))\n",
332 |     "# model.add(Dropout(0.25))\n",
333 |     "\n",
334 |     "\n",
335 |     "model.add(Conv2D(filters = 32, kernel_size = 3,padding = 'Same', \n",
336 |     "                 activation ='relu'))\n",
337 |     "model.add(Conv2D(filters = 32, kernel_size = 3,padding = 'Same', \n",
338 |     "                 activation ='relu'))\n",
339 |     "model.add(MaxPooling2D(pool_size=2, strides=2, padding='same'))\n",
340 |     "# model.add(Dropout(0.25))\n",
341 |     "\n",
342 |     "\n",
343 |     "model.add(Flatten())\n",
344 |     "model.add(Dense(256, activation = 'relu'))\n",
345 |     "model.add(Dropout(0.5))\n",
346 |     "model.add(Dense(2, activation = 'softmax'))"
347 |    ]
348 |   },
349 |   {
350 |    "cell_type": "code",
351 |    "execution_count": 16,
352 |    "metadata": {
353 |     "collapsed": true
354 |    },
355 |    "outputs": [],
356 |    "source": [
357 |     "model.compile(loss='categorical_crossentropy',optimizer=Adam(),metrics=['accuracy'])"
358 |    ]
359 |   },
360 |   {
361 |    "cell_type": "code",
362 |    "execution_count": 17,
363 |    "metadata": {
364 |     "collapsed": false,
365 |     "scrolled": true
366 |    },
367 |    "outputs": [
368 |     {
369 |      "name": "stdout",
370 |      "output_type": "stream",
371 |      "text": [
372 |       "Train on 18000 samples, validate on 2000 samples\n",
373 |       "Epoch 1/15\n",
374 |       "18000/18000 [==============================] - 24s 1ms/step - loss: 0.2315 - acc: 0.9452 - val_loss: 0.3059 - val_acc: 0.9550\n",
375 |       "Epoch 2/15\n",
376 |       "18000/18000 [==============================] - 24s 1ms/step - loss: 0.1728 - acc: 0.9552 - val_loss: 0.2073 - val_acc: 0.9550\n",
377 |       "Epoch 3/15\n",
378 |       "18000/18000 [==============================] - 24s 1ms/step - loss: 0.1647 - acc: 0.9552 - val_loss: 0.1616 - val_acc: 0.9550\n",
379 |       "Epoch 4/15\n",
380 |       "18000/18000 [==============================] - 24s 1ms/step - loss: 0.1610 - acc: 0.9552 - val_loss: 0.1613 - val_acc: 0.9550\n",
381 |       "Epoch 5/15\n",
382 |       "18000/18000 [==============================] - 24s 1ms/step - loss: 0.1608 - acc: 0.9552 - val_loss: 0.1613 - val_acc: 0.9550\n",
383 |       "Epoch 6/15\n",
384 |       "18000/18000 [==============================] - 24s 1ms/step - loss: 0.1612 - acc: 0.9552 - val_loss: 0.1608 - val_acc: 0.9550\n",
385 |       "Epoch 7/15\n",
386 |       "18000/18000 [==============================] - 26s 1ms/step - loss: 0.1597 - acc: 0.9552 - val_loss: 0.1600 - val_acc: 0.9550\n",
387 |       "Epoch 8/15\n",
388 |       "18000/18000 [==============================] - 26s 1ms/step - loss: 0.1592 - acc: 0.9552 - val_loss: 0.1602 - val_acc: 0.9550\n",
389 |       "Epoch 9/15\n",
390 |       "18000/18000 [==============================] - 27s 1ms/step - loss: 0.1571 - acc: 0.9552 - val_loss: 0.1635 - val_acc: 0.9550\n",
391 |       "Epoch 10/15\n",
392 |       "18000/18000 [==============================] - 25s 1ms/step - loss: 0.1578 - acc: 0.9552 - val_loss: 0.1620 - val_acc: 0.9550\n",
393 |       "Epoch 11/15\n",
394 |       "18000/18000 [==============================] - 25s 1ms/step - loss: 0.1571 - acc: 0.9552 - val_loss: 0.1628 - val_acc: 0.9550\n",
395 |       "Epoch 12/15\n",
396 |       "18000/18000 [==============================] - 26s 1ms/step - loss: 0.1584 - acc: 0.9552 - val_loss: 0.1633 - val_acc: 0.9550\n",
397 |       "Epoch 13/15\n",
398 |       "18000/18000 [==============================] - 25s 1ms/step - loss: 0.1557 - acc: 0.9552 - val_loss: 0.1664 - val_acc: 0.9550\n",
399 |       "Epoch 14/15\n",
400 |       "18000/18000 [==============================] - 25s 1ms/step - loss: 0.1551 - acc: 0.9552 - val_loss: 0.1637 - val_acc: 0.9550\n",
401 |       "Epoch 15/15\n",
402 |       "18000/18000 [==============================] - 24s 1ms/step - loss: 0.1562 - acc: 0.9552 - val_loss: 0.1631 - val_acc: 0.9550\n"
403 |      ]
404 |     }
405 |    ],
406 |    "source": [
407 |     "history=model.fit(X_train,yy_train, batch_size=256, epochs=15, verbose=1, validation_split=0.1)"
408 |    ]
409 |   },
410 |   {
411 |    "cell_type": "code",
412 |    "execution_count": 18,
413 |    "metadata": {
414 |     "collapsed": false
415 |    },
416 |    "outputs": [
417 |     {
418 |      "name": "stdout",
419 |      "output_type": "stream",
420 |      "text": [
421 |       "5000/5000 [==============================] - 2s 461us/step\n",
422 |       "0.793837946347\n"
423 |      ]
424 |     }
425 |    ],
426 |    "source": [
427 |     "predictions = model.predict_proba(X_val,verbose=1)\n",
428 |     "pre = predictions[:,1]\n",
429 |     "val_auc = metrics.roc_auc_score(y_val,pre)#验证集上的auc值\n",
430 |     "print(val_auc)"
431 |    ]
432 |   },
433 |   {
434 |    "cell_type": "code",
435 |    "execution_count": 19,
436 |    "metadata": {
437 |     "collapsed": false
438 |    },
439 |    "outputs": [
440 |     {
441 |      "name": "stdout",
442 |      "output_type": "stream",
443 |      "text": [
444 |       "10000/10000 [==============================] - 5s 459us/step\n"
445 |      ]
446 |     },
447 |     {
448 |      "data": {
449 |       "text/plain": [
450 |        "(10000,)"
451 |       ]
452 |      },
453 |      "execution_count": 19,
454 |      "metadata": {},
455 |      "output_type": "execute_result"
456 |     }
457 |    ],
458 |    "source": [
459 |     "preds = model.predict_proba(test_X.values)\n",
460 |     "pred = preds[:,1]\n",
461 |     "pred.shape"
462 |    ]
463 |   },
464 |   {
465 |    "cell_type": "code",
466 |    "execution_count": 20,
467 |    "metadata": {
468 |     "collapsed": true
469 |    },
470 |    "outputs": [],
471 |    "source": [
472 |     "Submission = pd.DataFrame({'cust_id': test['cust_id'], 'pred_prob': pred})\n",
473 |     "Submission.to_csv('../result/semi_nn.csv',index=False)"
474 |    ]
475 |   },
476 |   {
477 |    "cell_type": "code",
478 |    "execution_count": null,
479 |    "metadata": {
480 |     "collapsed": true
481 |    },
482 |    "outputs": [],
483 |    "source": []
484 |   },
485 |   {
486 |    "cell_type": "code",
487 |    "execution_count": null,
488 |    "metadata": {
489 |     "collapsed": true
490 |    },
491 |    "outputs": [],
492 |    "source": []
493 |   }
494 |  ],
495 |  "metadata": {
496 |   "anaconda-cloud": {},
497 |   "kernelspec": {
498 |    "display_name": "Python [default]",
499 |    "language": "python",
500 |    "name": "python3"
501 |   },
502 |   "language_info": {
503 |    "codemirror_mode": {
504 |     "name": "ipython",
505 |     "version": 3
506 |    },
507 |    "file_extension": ".py",
508 |    "mimetype": "text/x-python",
509 |    "name": "python",
510 |    "nbconvert_exporter": "python",
511 |    "pygments_lexer": "ipython3",
512 |    "version": "3.5.2"
513 |   }
514 |  },
515 |  "nbformat": 4,
516 |  "nbformat_minor": 1
517 | }
518 | 


--------------------------------------------------------------------------------
/模型二代码b榜0.749880/code/.ipynb_checkpoints/semi_gbdt预测-checkpoint.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {
  7 |     "collapsed": true
  8 |    },
  9 |    "outputs": [],
 10 |    "source": [
 11 |     "import numpy as np\n",
 12 |     "import pandas as pd\n",
 13 |     "import matplotlib.pyplot as plt\n",
 14 |     "%matplotlib inline"
 15 |    ]
 16 |   },
 17 |   {
 18 |    "cell_type": "code",
 19 |    "execution_count": 2,
 20 |    "metadata": {
 21 |     "collapsed": false
 22 |    },
 23 |    "outputs": [
 24 |     {
 25 |      "data": {
 26 |       "text/plain": [
 27 |        "(10000, 160)"
 28 |       ]
 29 |      },
 30 |      "execution_count": 2,
 31 |      "metadata": {},
 32 |      "output_type": "execute_result"
 33 |     }
 34 |    ],
 35 |    "source": [
 36 |     "train_x1 = pd.read_csv('../data/train_ssvm_xy.csv')\n",
 37 |     "train_x1.shape"
 38 |    ]
 39 |   },
 40 |   {
 41 |    "cell_type": "code",
 42 |    "execution_count": 3,
 43 |    "metadata": {
 44 |     "collapsed": false
 45 |    },
 46 |    "outputs": [
 47 |     {
 48 |      "data": {
 49 |       "text/plain": [
 50 |        "(15000, 160)"
 51 |       ]
 52 |      },
 53 |      "execution_count": 3,
 54 |      "metadata": {},
 55 |      "output_type": "execute_result"
 56 |     }
 57 |    ],
 58 |    "source": [
 59 |     "train_x2 = pd.read_csv('../data/train_xy.csv')\n",
 60 |     "train_x2.shape"
 61 |    ]
 62 |   },
 63 |   {
 64 |    "cell_type": "code",
 65 |    "execution_count": 4,
 66 |    "metadata": {
 67 |     "collapsed": false
 68 |    },
 69 |    "outputs": [
 70 |     {
 71 |      "data": {
 72 |       "text/plain": [
 73 |        "(10000, 159)"
 74 |       ]
 75 |      },
 76 |      "execution_count": 4,
 77 |      "metadata": {},
 78 |      "output_type": "execute_result"
 79 |     }
 80 |    ],
 81 |    "source": [
 82 |     "test = pd.read_csv('../data/test_all.csv')\n",
 83 |     "test.shape"
 84 |    ]
 85 |   },
 86 |   {
 87 |    "cell_type": "code",
 88 |    "execution_count": 5,
 89 |    "metadata": {
 90 |     "collapsed": false
 91 |    },
 92 |    "outputs": [
 93 |     {
 94 |      "data": {
 95 |       "text/plain": [
 96 |        "(25000, 157)"
 97 |       ]
 98 |      },
 99 |      "execution_count": 5,
100 |      "metadata": {},
101 |      "output_type": "execute_result"
102 |     }
103 |    ],
104 |    "source": [
105 |     "train_x11 = train_x1.drop(['cust_id','cust_group','y'],axis=1)\n",
106 |     "train_x22 = train_x2.drop(['\\ufeffcust_id','cust_group','y'],axis=1)\n",
107 |     "train_x = pd.concat([train_x11, train_x22])\n",
108 |     "train_x.shape"
109 |    ]
110 |   },
111 |   {
112 |    "cell_type": "code",
113 |    "execution_count": 6,
114 |    "metadata": {
115 |     "collapsed": false
116 |    },
117 |    "outputs": [
118 |     {
119 |      "data": {
120 |       "text/plain": [
121 |        "(10000, 157)"
122 |       ]
123 |      },
124 |      "execution_count": 6,
125 |      "metadata": {},
126 |      "output_type": "execute_result"
127 |     }
128 |    ],
129 |    "source": [
130 |     "test_x = test.drop(['cust_id','cust_group'],axis=1)\n",
131 |     "test_x.shape"
132 |    ]
133 |   },
134 |   {
135 |    "cell_type": "code",
136 |    "execution_count": 7,
137 |    "metadata": {
138 |     "collapsed": false
139 |    },
140 |    "outputs": [
141 |     {
142 |      "data": {
143 |       "text/plain": [
144 |        "(35000, 157)"
145 |       ]
146 |      },
147 |      "execution_count": 7,
148 |      "metadata": {},
149 |      "output_type": "execute_result"
150 |     }
151 |    ],
152 |    "source": [
153 |     "x = pd.concat([train_x, test_x])\n",
154 |     "x.shape"
155 |    ]
156 |   },
157 |   {
158 |    "cell_type": "code",
159 |    "execution_count": 8,
160 |    "metadata": {
161 |     "collapsed": false
162 |    },
163 |    "outputs": [
164 |     {
165 |      "data": {
166 |       "text/plain": [
167 |        "(25000,)"
168 |       ]
169 |      },
170 |      "execution_count": 8,
171 |      "metadata": {},
172 |      "output_type": "execute_result"
173 |     }
174 |    ],
175 |    "source": [
176 |     "train_y1 = train_x1['y']\n",
177 |     "train_y2 = train_x2['y']\n",
178 |     "Y_train = train_y1.append(train_y2)\n",
179 |     "Y_train.shape"
180 |    ]
181 |   },
182 |   {
183 |    "cell_type": "code",
184 |    "execution_count": 9,
185 |    "metadata": {
186 |     "collapsed": false
187 |    },
188 |    "outputs": [
189 |     {
190 |      "name": "stdout",
191 |      "output_type": "stream",
192 |      "text": [
193 |       "(35000, 364)\n"
194 |      ]
195 |     }
196 |    ],
197 |    "source": [
198 |     "for i in range(96,158):\n",
199 |     "    col = 'x'+'_'+str(i)\n",
200 |     "    if col in x.columns.values:\n",
201 |     "        dummies_df = pd.get_dummies(x[col]).rename(columns=lambda x: col +'_'+ str(x))\n",
202 |     "        x = pd.concat([x, dummies_df], axis=1)\n",
203 |     "print(x.shape)"
204 |    ]
205 |   },
206 |   {
207 |    "cell_type": "code",
208 |    "execution_count": 10,
209 |    "metadata": {
210 |     "collapsed": false
211 |    },
212 |    "outputs": [
213 |     {
214 |      "name": "stdout",
215 |      "output_type": "stream",
216 |      "text": [
217 |       "(25000, 364)\n",
218 |       "(10000, 364)\n"
219 |      ]
220 |     }
221 |    ],
222 |    "source": [
223 |     "train_X = x[0:25000]\n",
224 |     "test_X = x[25000:35000]\n",
225 |     "print(train_X.shape)\n",
226 |     "print(test_X.shape)"
227 |    ]
228 |   },
229 |   {
230 |    "cell_type": "code",
231 |    "execution_count": 15,
232 |    "metadata": {
233 |     "collapsed": false
234 |    },
235 |    "outputs": [],
236 |    "source": [
237 |     "from sklearn.tree import DecisionTreeClassifier\n",
238 |     "from sklearn.ensemble import RandomForestClassifier\n",
239 |     "from sklearn.ensemble import AdaBoostClassifier\n",
240 |     "from sklearn.ensemble import ExtraTreesClassifier\n",
241 |     "from sklearn.ensemble import GradientBoostingClassifier\n",
242 |     "from sklearn.neighbors import KNeighborsClassifier\n",
243 |     "from sklearn.svm import SVC\n",
244 |     "from sklearn import metrics  #accuracy_score,recall_score,f1_score\n",
245 |     "from sklearn.metrics import classification_report\n",
246 |     "from sklearn.metrics import precision_recall_fscore_support\n",
247 |     "from sklearn.utils.multiclass import unique_labels\n",
248 |     "from sklearn.metrics import accuracy_score\n",
249 |     "from xgboost import XGBClassifier\n",
250 |     "from sklearn.ensemble import GradientBoostingClassifier\n",
251 |     "from sklearn.cross_validation import cross_val_score\n",
252 |     "from lightgbm import LGBMClassifier\n",
253 |     "from sklearn.model_selection import train_test_split, GridSearchCV, KFold\n",
254 |     "from sklearn.svm import LinearSVC\n",
255 |     "from sklearn import linear_model\n",
256 |     "import lightgbm as lgb\n",
257 |     "import xgboost as xgb\n",
258 |     "from sklearn.model_selection import GridSearchCV\n",
259 |     "\n",
260 |     "from keras.models import Model\n",
261 |     "from keras.layers import Dense, Input"
262 |    ]
263 |   },
264 |   {
265 |    "cell_type": "code",
266 |    "execution_count": 12,
267 |    "metadata": {
268 |     "collapsed": false,
269 |     "scrolled": true
270 |    },
271 |    "outputs": [],
272 |    "source": [
273 |     "# encoding_dim = 600\n",
274 |     "# input_dim = Input(shape=(364,))\n",
275 |     "\n",
276 |     "# encoded = Dense(364, activation='linear')(input_dim)\n",
277 |     "# # encoded = Dense(300, activation='relu')(encoded)\n",
278 |     "# # encoded = Dense(32, activation='relu')(encoded)\n",
279 |     "# encoder_output = Dense(encoding_dim)(encoded)\n",
280 |     "\n",
281 |     "# decoded = Dense(600, activation='relu')(encoder_output)\n",
282 |     "# # decoded = Dense(64, activation='relu')(decoded)\n",
283 |     "# # decoded = Dense(128, activation='relu')(decoded)\n",
284 |     "# decoded = Dense(364, activation='tanh')(decoded)\n",
285 |     "\n",
286 |     "# autoencoder = Model(inputs=input_dim, outputs=decoded)\n",
287 |     "\n",
288 |     "# encoder = Model(inputs=input_dim, outputs=encoder_output)\n",
289 |     "\n",
290 |     "# autoencoder.compile(optimizer='adam', loss='mse')\n",
291 |     "# # training\n",
292 |     "# autoencoder.fit(train_X.values, train_X.values, epochs=20, batch_size=150, shuffle=True)"
293 |    ]
294 |   },
295 |   {
296 |    "cell_type": "code",
297 |    "execution_count": 13,
298 |    "metadata": {
299 |     "collapsed": false
300 |    },
301 |    "outputs": [],
302 |    "source": [
303 |     "# new_train_feature = encoder.predict(train_X.values)\n",
304 |     "# new_test_feature = encoder.predict(test_X.values)\n",
305 |     "# print(new_train_feature.shape)\n",
306 |     "# print(new_test_feature.shape)"
307 |    ]
308 |   },
309 |   {
310 |    "cell_type": "code",
311 |    "execution_count": 27,
312 |    "metadata": {
313 |     "collapsed": false
314 |    },
315 |    "outputs": [],
316 |    "source": [
317 |     "X_train,X_val,y_train,y_val= train_test_split(train_X,Y_train,test_size=0.2,random_state=2)"
318 |    ]
319 |   },
320 |   {
321 |    "cell_type": "code",
322 |    "execution_count": 29,
323 |    "metadata": {
324 |     "collapsed": false,
325 |     "scrolled": true
326 |    },
327 |    "outputs": [
328 |     {
329 |      "data": {
330 |       "text/plain": [
331 |        "GridSearchCV(cv=5, error_score='raise',\n",
332 |        "       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,\n",
333 |        "              learning_rate=0.1, loss='deviance', max_depth=3,\n",
334 |        "              max_features=None, max_leaf_nodes=None,\n",
335 |        "              min_impurity_split=1e-07, min_samples_leaf=1,\n",
336 |        "              min_samples_split=2, min_weight_fraction_leaf=0.0,\n",
337 |        "              n_estimators=100, presort='auto', random_state=None,\n",
338 |        "              subsample=1.0, verbose=0, warm_start=False),\n",
339 |        "       fit_params={}, iid=True, n_jobs=1,\n",
340 |        "       param_grid=[{'learning_rate': [0.01, 0.1], 'n_estimators': range(100, 300, 500), 'max_depth': range(4, 8, 12)}],\n",
341 |        "       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,\n",
342 |        "       scoring='roc_auc', verbose=0)"
343 |       ]
344 |      },
345 |      "execution_count": 29,
346 |      "metadata": {},
347 |      "output_type": "execute_result"
348 |     }
349 |    ],
350 |    "source": [
351 |     "tuned_parameters= [{'n_estimators':range(100,300,500),\n",
352 |     "                  'max_depth':range(4,8,12),\n",
353 |     "                  'learning_rate':[0.01, 0.1]\n",
354 |     "                  }]\n",
355 |     "clf = GridSearchCV(GradientBoostingClassifier(), tuned_parameters, cv=5, scoring='roc_auc')\n",
356 |     "clf.fit(X_train, y_train)"
357 |    ]
358 |   },
359 |   {
360 |    "cell_type": "code",
361 |    "execution_count": 30,
362 |    "metadata": {
363 |     "collapsed": false
364 |    },
365 |    "outputs": [
366 |     {
367 |      "name": "stdout",
368 |      "output_type": "stream",
369 |      "text": [
370 |       "0.804463759173\n"
371 |      ]
372 |     }
373 |    ],
374 |    "source": [
375 |     "predictions = clf.predict_proba(X_val)\n",
376 |     "pre = predictions[:,1]\n",
377 |     "val_auc = metrics.roc_auc_score(y_val,pre)  # AUC on the validation set\n",
378 |     "print(val_auc)"
379 |    ]
380 |   },
381 |   {
382 |    "cell_type": "code",
383 |    "execution_count": 36,
384 |    "metadata": {
385 |     "collapsed": false
386 |    },
387 |    "outputs": [
388 |     {
389 |      "data": {
390 |       "text/plain": [
391 |        "(10000,)"
392 |       ]
393 |      },
394 |      "execution_count": 36,
395 |      "metadata": {},
396 |      "output_type": "execute_result"
397 |     }
398 |    ],
399 |    "source": [
400 |     "preds = clf.predict_proba(test_X)\n",
401 |     "pred = preds[:,1]\n",
402 |     "pred.shape"
403 |    ]
404 |   },
405 |   {
406 |    "cell_type": "code",
407 |    "execution_count": 37,
408 |    "metadata": {
409 |     "collapsed": true
410 |    },
411 |    "outputs": [],
412 |    "source": [
413 |     "Submission = pd.DataFrame({'cust_id': test['cust_id'], 'pred_prob': pred})\n",
414 |     "Submission.to_csv('../result/semi_gbdt.csv',index=False)"
415 |    ]
416 |   },
417 |   {
418 |    "cell_type": "code",
419 |    "execution_count": null,
420 |    "metadata": {
421 |     "collapsed": true
422 |    },
423 |    "outputs": [],
424 |    "source": []
425 |   },
426 |   {
427 |    "cell_type": "code",
428 |    "execution_count": null,
429 |    "metadata": {
430 |     "collapsed": true
431 |    },
432 |    "outputs": [],
433 |    "source": []
434 |   },
435 |   {
436 |    "cell_type": "code",
437 |    "execution_count": 57,
438 |    "metadata": {
439 |     "collapsed": false
440 |    },
441 |    "outputs": [],
442 |    "source": [
443 |     "xgb = pd.read_csv('../result/semi_xgb4.csv')"
444 |    ]
445 |   },
446 |   {
447 |    "cell_type": "code",
448 |    "execution_count": 58,
449 |    "metadata": {
450 |     "collapsed": true
451 |    },
452 |    "outputs": [],
453 |    "source": [
454 |     "lgb = pd.read_csv('../result/semi_lgb2.csv')"
455 |    ]
456 |   },
457 |   {
458 |    "cell_type": "code",
459 |    "execution_count": 59,
460 |    "metadata": {
461 |     "collapsed": true
462 |    },
463 |    "outputs": [],
464 |    "source": [
465 |     "result = xgb.pred_prob*0.3 + lgb.pred_prob*0.7"
466 |    ]
467 |   },
468 |   {
469 |    "cell_type": "code",
470 |    "execution_count": 60,
471 |    "metadata": {
472 |     "collapsed": false
473 |    },
474 |    "outputs": [
475 |     {
476 |      "data": {
477 |       "text/html": [
478 |        "<div>\n",
479 |        "<table border=\"1\" class=\"dataframe\">\n",
480 |        "  <thead>\n",
481 |        "    <tr style=\"text-align: right;\">\n",
482 |        "      <th></th>\n",
483 |        "      <th>cust_id</th>\n",
484 |        "      <th>pred_prob</th>\n",
485 |        "    </tr>\n",
486 |        "  </thead>\n",
487 |        "  <tbody>\n",
488 |        "    <tr>\n",
489 |        "      <th>0</th>\n",
490 |        "      <td>1</td>\n",
491 |        "      <td>0.038582</td>\n",
492 |        "    </tr>\n",
493 |        "    <tr>\n",
494 |        "      <th>1</th>\n",
495 |        "      <td>2</td>\n",
496 |        "      <td>0.087885</td>\n",
497 |        "    </tr>\n",
498 |        "    <tr>\n",
499 |        "      <th>2</th>\n",
500 |        "      <td>3</td>\n",
501 |        "      <td>0.342310</td>\n",
502 |        "    </tr>\n",
503 |        "    <tr>\n",
504 |        "      <th>3</th>\n",
505 |        "      <td>4</td>\n",
506 |        "      <td>0.213558</td>\n",
507 |        "    </tr>\n",
508 |        "    <tr>\n",
509 |        "      <th>4</th>\n",
510 |        "      <td>5</td>\n",
511 |        "      <td>0.193331</td>\n",
512 |        "    </tr>\n",
513 |        "  </tbody>\n",
514 |        "</table>\n",
515 |        "</div>"
516 |       ],
517 |       "text/plain": [
518 |        "   cust_id  pred_prob\n",
519 |        "0        1   0.038582\n",
520 |        "1        2   0.087885\n",
521 |        "2        3   0.342310\n",
522 |        "3        4   0.213558\n",
523 |        "4        5   0.193331"
524 |       ]
525 |      },
526 |      "execution_count": 60,
527 |      "metadata": {},
528 |      "output_type": "execute_result"
529 |     }
530 |    ],
531 |    "source": [
532 |     "xgb.pred_prob = result\n",
533 |     "xgb.head()"
534 |    ]
535 |   },
536 |   {
537 |    "cell_type": "code",
538 |    "execution_count": 61,
539 |    "metadata": {
540 |     "collapsed": true
541 |    },
542 |    "outputs": [],
543 |    "source": [
544 |     "xgb.to_csv('../result/semi_xgb_lgb1.csv',index= False)"
545 |    ]
546 |   },
547 |   {
548 |    "cell_type": "code",
549 |    "execution_count": null,
550 |    "metadata": {
551 |     "collapsed": true
552 |    },
553 |    "outputs": [],
554 |    "source": []
555 |   }
556 |  ],
557 |  "metadata": {
558 |   "anaconda-cloud": {},
559 |   "kernelspec": {
560 |    "display_name": "Python [default]",
561 |    "language": "python",
562 |    "name": "python3"
563 |   },
564 |   "language_info": {
565 |    "codemirror_mode": {
566 |     "name": "ipython",
567 |     "version": 3
568 |    },
569 |    "file_extension": ".py",
570 |    "mimetype": "text/x-python",
571 |    "name": "python",
572 |    "nbconvert_exporter": "python",
573 |    "pygments_lexer": "ipython3",
574 |    "version": "3.5.2"
575 |   }
576 |  },
577 |  "nbformat": 4,
578 |  "nbformat_minor": 1
579 | }
580 | 


--------------------------------------------------------------------------------