├── .gitattributes
├── img
│   ├── method1.PNG
│   └── method2.PNG
├── 模型一代码b榜0.75673
│   ├── result
│   │   └── res
│   │       └── ensemble.txt
│   └── code
│       ├── .ipynb_checkpoints
│       │   └── ensemble-checkpoint.ipynb
│       └── ensemble.ipynb
├── README.md
└── 模型二代码b榜0.749880
    └── code
        ├── ensemble1.ipynb
        ├── ensemble.ipynb
        ├── .ipynb_checkpoints
        │   ├── ensemble1-checkpoint.ipynb
        │   ├── ensemble-checkpoint.ipynb
        │   ├── gbdt预测-checkpoint.ipynb
        │   ├── _ensemble-checkpoint.ipynb
        │   ├── semi_ensemble-checkpoint.ipynb
        │   ├── lgb预测-checkpoint.ipynb
        │   ├── xgb预测-checkpoint.ipynb
        │   ├── semi_xgb预测-checkpoint.ipynb
        │   ├── semi_lgb预测-checkpoint.ipynb
        │   ├── nn预测-checkpoint.ipynb
        │   └── semi_gbdt预测-checkpoint.ipynb
        ├── xgb预测.ipynb
        ├── gbdt预测.ipynb
        ├── _ensemble.ipynb
        ├── semi_ensemble.ipynb
        ├── lgb预测.ipynb
        ├── semi_xgb预测.ipynb
        ├── semi_lgb预测.ipynb
        ├── nn预测.ipynb
        ├── semi_gbdt预测.ipynb
        └── semi_nn预测.ipynb
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 |
--------------------------------------------------------------------------------
/img/method1.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CuiNing6/2018-xinwang/HEAD/img/method1.PNG
--------------------------------------------------------------------------------
/img/method2.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CuiNing6/2018-xinwang/HEAD/img/method2.PNG
--------------------------------------------------------------------------------
/模型一代码b榜0.75673/result/res/ensemble.txt:
--------------------------------------------------------------------------------
1 | xgb_all_feature Submission 0.74577
2 | lgb_all_feature_cv Submission4 0.74
3 | gbdt_all_feature Submission9 0.73
4 | nn_all_feature Submission11 0.70
5 | xgb_corr_feature Submission12 0.70
6 |
7 | Submission*0.5 + Submission4*0.5 = sub_0_4 = 0.7463
8 | sub_0_4*0.7 + Submission9*0.3 = sub_0_4_9 = 0.74651
9 | sub_0_4_9*0.7 + Submission11*0.3 = sub_0_4_9_11 = 0.74757
10 | sub_0_4_9_11*0.7 + Submission12*0.3 = sub_0_4_9_11_12 = 0.75152
11 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # 2018-新网银行杯Top1方案
2 | 比赛链接:http://www.dcjingsai.com/common/cmpt/西南财经大学“新网银行杯”数据科学竞赛_竞赛信息.html
3 | # 队伍名称:摸金校尉
4 | # 解决方案:
5 | ## 基于集成学习的信用风险预测模型
6 | 本次比赛通过机器学习和数据挖掘技术定量分析信用风险,给出每个样本的预测结果。首先,研究了违约客户和履约客户这两类客户的特征;其次,将机器学习领域比较流行的集成学习模型应用于信用风险评估,并利用主流的模型性能评价指标对模型进行评价。在比赛中,对类别型数据进行哑编码,并搭建自编码网络提取特征;利用特征相关性、特征重要性、信息价值(Information Value,IV)三种方法筛选特征;最后,选取基于加权平均法的集成学习模型,以及类别分布不平衡环境下基于加权平均法的半监督集成模型对数据进行建模,使用 AUC 作为模型性能的评价指标,并通过两种参数调节方法优化模型。在测试数据集上的竞赛结果验证了所构建的集成系统泛化能力较强,模型复杂度适中。
7 | ## 基于加权平均法的集成学习模型示意图:
8 | ![基于加权平均法的集成学习模型示意图](img/method1.PNG)
9 | ## 基于加权平均法的半监督集成学习模型示意图:
10 | ![基于加权平均法的半监督集成学习模型示意图](img/method2.PNG)
11 | # 代码说明:
12 | * 模型一对应基于加权平均法的集成学习模型
13 | * 模型二对应基于加权平均法的半监督集成学习模型
14 | * 其他代码里存放了比赛过程中的一些尝试,但是最终没有用到,包括自编码网络提取特征,多模态集成和woe特征构建等等。
15 |
--------------------------------------------------------------------------------
/模型一代码b榜0.75673/code/.ipynb_checkpoints/ensemble-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import numpy as np\n",
12 | "import pandas as pd\n",
13 | "import matplotlib.pyplot as plt\n",
14 | "%matplotlib inline "
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 2,
20 | "metadata": {
21 | "collapsed": true
22 | },
23 | "outputs": [],
24 | "source": [
25 | "sub1 = pd.read_csv('../result/Submission.csv')\n",
26 | "sub2 = pd.read_csv('../result/Submission4.csv')\n",
27 | "sub3 = pd.read_csv('../result/Submission9.csv')\n",
28 | "sub4 = pd.read_csv('../result/Submission11.csv')\n",
29 | "sub5 = pd.read_csv('../result/Submission12.csv')"
30 | ]
31 | },
32 | {
33 | "cell_type": "code",
34 | "execution_count": 3,
35 | "metadata": {
36 | "collapsed": true
37 | },
38 | "outputs": [],
39 | "source": [
40 | "ensemble = sub1\n",
41 | "ensemble['pred_prob'] = sub1['pred_prob']*0.5 + sub2['pred_prob']*0.5"
42 | ]
43 | },
44 | {
45 | "cell_type": "code",
46 | "execution_count": 4,
47 | "metadata": {
48 | "collapsed": true
49 | },
50 | "outputs": [],
51 | "source": [
52 | "ensemble1 = sub1\n",
53 | "ensemble1['pred_prob'] = ensemble['pred_prob']*0.7 + sub3['pred_prob']*0.3"
54 | ]
55 | },
56 | {
57 | "cell_type": "code",
58 | "execution_count": 5,
59 | "metadata": {
60 | "collapsed": true
61 | },
62 | "outputs": [],
63 | "source": [
64 | "ensemble2 = sub1\n",
65 | "ensemble2['pred_prob'] = ensemble1['pred_prob']*0.7 + sub4['pred_prob']*0.3"
66 | ]
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": 6,
71 | "metadata": {
72 | "collapsed": true
73 | },
74 | "outputs": [],
75 | "source": [
76 | "ensemble3 = sub1\n",
77 | "ensemble3['pred_prob'] = ensemble2['pred_prob']*0.7 + sub5['pred_prob']*0.3"
78 | ]
79 | },
80 | {
81 | "cell_type": "code",
82 | "execution_count": 7,
83 | "metadata": {
84 | "collapsed": false,
85 | "scrolled": true
86 | },
87 | "outputs": [],
88 | "source": [
89 | "ensemble3.to_csv('../result/ensemble.csv')"
90 | ]
91 | },
92 | {
93 | "cell_type": "code",
94 | "execution_count": null,
95 | "metadata": {
96 | "collapsed": true
97 | },
98 | "outputs": [],
99 | "source": []
100 | }
101 | ],
102 | "metadata": {
103 | "kernelspec": {
104 | "display_name": "Python [default]",
105 | "language": "python",
106 | "name": "python3"
107 | },
108 | "language_info": {
109 | "codemirror_mode": {
110 | "name": "ipython",
111 | "version": 3
112 | },
113 | "file_extension": ".py",
114 | "mimetype": "text/x-python",
115 | "name": "python",
116 | "nbconvert_exporter": "python",
117 | "pygments_lexer": "ipython3",
118 | "version": "3.5.2"
119 | }
120 | },
121 | "nbformat": 4,
122 | "nbformat_minor": 1
123 | }
124 |
--------------------------------------------------------------------------------
/模型一代码b榜0.75673/code/ensemble.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import numpy as np\n",
12 | "import pandas as pd\n",
13 | "import matplotlib.pyplot as plt\n",
14 | "%matplotlib inline "
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 2,
20 | "metadata": {
21 | "collapsed": true
22 | },
23 | "outputs": [],
24 | "source": [
25 | "sub1 = pd.read_csv('../result/Submission.csv')\n",
26 | "sub2 = pd.read_csv('../result/Submission4.csv')\n",
27 | "sub3 = pd.read_csv('../result/Submission9.csv')\n",
28 | "sub4 = pd.read_csv('../result/Submission11.csv')\n",
29 | "sub5 = pd.read_csv('../result/Submission12.csv')"
30 | ]
31 | },
32 | {
33 | "cell_type": "code",
34 | "execution_count": 3,
35 | "metadata": {
36 | "collapsed": true
37 | },
38 | "outputs": [],
39 | "source": [
40 | "ensemble = sub1.copy()  # work on a copy so this cell does not overwrite sub1 in place and stays safe to re-run\n",
41 | "ensemble['pred_prob'] = sub1['pred_prob']*0.5 + sub2['pred_prob']*0.5"
42 | ]
43 | },
44 | {
45 | "cell_type": "code",
46 | "execution_count": 4,
47 | "metadata": {
48 | "collapsed": true
49 | },
50 | "outputs": [],
51 | "source": [
52 | "ensemble1 = sub1.copy()  # work on a copy so this cell does not overwrite sub1 in place and stays safe to re-run\n",
53 | "ensemble1['pred_prob'] = ensemble['pred_prob']*0.7 + sub3['pred_prob']*0.3"
54 | ]
55 | },
56 | {
57 | "cell_type": "code",
58 | "execution_count": 5,
59 | "metadata": {
60 | "collapsed": true
61 | },
62 | "outputs": [],
63 | "source": [
64 | "ensemble2 = sub1.copy()  # work on a copy so this cell does not overwrite sub1 in place and stays safe to re-run\n",
65 | "ensemble2['pred_prob'] = ensemble1['pred_prob']*0.7 + sub4['pred_prob']*0.3"
66 | ]
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": 6,
71 | "metadata": {
72 | "collapsed": true
73 | },
74 | "outputs": [],
75 | "source": [
76 | "ensemble3 = sub1.copy()  # work on a copy so this cell does not overwrite sub1 in place and stays safe to re-run\n",
77 | "ensemble3['pred_prob'] = ensemble2['pred_prob']*0.7 + sub5['pred_prob']*0.3"
78 | ]
79 | },
80 | {
81 | "cell_type": "code",
82 | "execution_count": 9,
83 | "metadata": {
84 | "collapsed": false,
85 | "scrolled": true
86 | },
87 | "outputs": [],
88 | "source": [
89 | "ensemble3.to_csv('../result/ensemble.csv',index=False)"
90 | ]
91 | },
92 | {
93 | "cell_type": "code",
94 | "execution_count": null,
95 | "metadata": {
96 | "collapsed": true
97 | },
98 | "outputs": [],
99 | "source": []
100 | }
101 | ],
102 | "metadata": {
103 | "anaconda-cloud": {},
104 | "kernelspec": {
105 | "display_name": "Python [default]",
106 | "language": "python",
107 | "name": "python3"
108 | },
109 | "language_info": {
110 | "codemirror_mode": {
111 | "name": "ipython",
112 | "version": 3
113 | },
114 | "file_extension": ".py",
115 | "mimetype": "text/x-python",
116 | "name": "python",
117 | "nbconvert_exporter": "python",
118 | "pygments_lexer": "ipython3",
119 | "version": "3.5.2"
120 | }
121 | },
122 | "nbformat": 4,
123 | "nbformat_minor": 1
124 | }
125 |
--------------------------------------------------------------------------------
/模型二代码b榜0.749880/code/ensemble1.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import numpy as np\n",
12 | "import pandas as pd\n",
13 | "import matplotlib.pyplot as plt\n",
14 | "%matplotlib inline "
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 5,
20 | "metadata": {
21 | "collapsed": true
22 | },
23 | "outputs": [],
24 | "source": [
25 | "ensemble = pd.read_csv('../result/free2/sub_0_4_9_11_12.csv')"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 6,
31 | "metadata": {
32 | "collapsed": true
33 | },
34 | "outputs": [],
35 | "source": [
36 | "semi_ensemble = pd.read_csv('../result/semi_ensemble.csv')"
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": 8,
42 | "metadata": {
43 | "collapsed": false
44 | },
45 | "outputs": [
46 | {
47 | "data": {
48 | "text/html": [
49 | "
\n",
50 | "
\n",
51 | " \n",
52 | " \n",
53 | " | \n",
54 | " cust_id | \n",
55 | " pred_prob | \n",
56 | "
\n",
57 | " \n",
58 | " \n",
59 | " \n",
60 | " | 0 | \n",
61 | " 1 | \n",
62 | " 0.016407 | \n",
63 | "
\n",
64 | " \n",
65 | " | 1 | \n",
66 | " 2 | \n",
67 | " 0.051909 | \n",
68 | "
\n",
69 | " \n",
70 | " | 2 | \n",
71 | " 3 | \n",
72 | " 0.194874 | \n",
73 | "
\n",
74 | " \n",
75 | " | 3 | \n",
76 | " 4 | \n",
77 | " 0.081682 | \n",
78 | "
\n",
79 | " \n",
80 | " | 4 | \n",
81 | " 5 | \n",
82 | " 0.154739 | \n",
83 | "
\n",
84 | " \n",
85 | "
\n",
86 | "
"
87 | ],
88 | "text/plain": [
89 | " cust_id pred_prob\n",
90 | "0 1 0.016407\n",
91 | "1 2 0.051909\n",
92 | "2 3 0.194874\n",
93 | "3 4 0.081682\n",
94 | "4 5 0.154739"
95 | ]
96 | },
97 | "execution_count": 8,
98 | "metadata": {},
99 | "output_type": "execute_result"
100 | }
101 | ],
102 | "source": [
103 | "ensemble.head()"
104 | ]
105 | },
106 | {
107 | "cell_type": "code",
108 | "execution_count": 9,
109 | "metadata": {
110 | "collapsed": true
111 | },
112 | "outputs": [],
113 | "source": [
114 | "result = ensemble.pred_prob*0.7 + semi_ensemble.pred_prob*0.3"
115 | ]
116 | },
117 | {
118 | "cell_type": "code",
119 | "execution_count": 10,
120 | "metadata": {
121 | "collapsed": true
122 | },
123 | "outputs": [],
124 | "source": [
125 | "pred = ensemble.copy()  # was `_ensemble`, which this notebook never loads (NameError on a clean kernel)\n",
126 | "pred['pred_prob'] = result"
127 | ]
128 | },
129 | {
130 | "cell_type": "code",
131 | "execution_count": 11,
132 | "metadata": {
133 | "collapsed": false
134 | },
135 | "outputs": [
136 | {
137 | "data": {
138 | "text/html": [
139 | "\n",
140 | "
\n",
141 | " \n",
142 | " \n",
143 | " | \n",
144 | " cust_id | \n",
145 | " pred_prob | \n",
146 | "
\n",
147 | " \n",
148 | " \n",
149 | " \n",
150 | " | 0 | \n",
151 | " 1 | \n",
152 | " 0.018662 | \n",
153 | "
\n",
154 | " \n",
155 | " | 1 | \n",
156 | " 2 | \n",
157 | " 0.051679 | \n",
158 | "
\n",
159 | " \n",
160 | " | 2 | \n",
161 | " 3 | \n",
162 | " 0.191229 | \n",
163 | "
\n",
164 | " \n",
165 | " | 3 | \n",
166 | " 4 | \n",
167 | " 0.092566 | \n",
168 | "
\n",
169 | " \n",
170 | " | 4 | \n",
171 | " 5 | \n",
172 | " 0.142426 | \n",
173 | "
\n",
174 | " \n",
175 | "
\n",
176 | "
"
177 | ],
178 | "text/plain": [
179 | " cust_id pred_prob\n",
180 | "0 1 0.018662\n",
181 | "1 2 0.051679\n",
182 | "2 3 0.191229\n",
183 | "3 4 0.092566\n",
184 | "4 5 0.142426"
185 | ]
186 | },
187 | "execution_count": 11,
188 | "metadata": {},
189 | "output_type": "execute_result"
190 | }
191 | ],
192 | "source": [
193 | "pred.head()"
194 | ]
195 | },
196 | {
197 | "cell_type": "code",
198 | "execution_count": 12,
199 | "metadata": {
200 | "collapsed": true
201 | },
202 | "outputs": [],
203 | "source": [
204 | "pred.to_csv('../result/ensemble_final1.csv',index = False)"
205 | ]
206 | },
207 | {
208 | "cell_type": "code",
209 | "execution_count": null,
210 | "metadata": {
211 | "collapsed": true
212 | },
213 | "outputs": [],
214 | "source": []
215 | }
216 | ],
217 | "metadata": {
218 | "kernelspec": {
219 | "display_name": "Python [default]",
220 | "language": "python",
221 | "name": "python3"
222 | },
223 | "language_info": {
224 | "codemirror_mode": {
225 | "name": "ipython",
226 | "version": 3
227 | },
228 | "file_extension": ".py",
229 | "mimetype": "text/x-python",
230 | "name": "python",
231 | "nbconvert_exporter": "python",
232 | "pygments_lexer": "ipython3",
233 | "version": "3.5.2"
234 | }
235 | },
236 | "nbformat": 4,
237 | "nbformat_minor": 1
238 | }
239 |
--------------------------------------------------------------------------------
/模型二代码b榜0.749880/code/ensemble.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import numpy as np\n",
12 | "import pandas as pd\n",
13 | "import matplotlib.pyplot as plt\n",
14 | "%matplotlib inline"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 2,
20 | "metadata": {
21 | "collapsed": true
22 | },
23 | "outputs": [],
24 | "source": [
25 | "semi_ensemble = pd.read_csv('../result/semi_ensemble.csv')"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 4,
31 | "metadata": {
32 | "collapsed": false
33 | },
34 | "outputs": [],
35 | "source": [
36 | "_ensemble = pd.read_csv('../result/_ensemble.csv')"
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": 5,
42 | "metadata": {
43 | "collapsed": false
44 | },
45 | "outputs": [
46 | {
47 | "data": {
48 | "text/html": [
49 | "\n",
50 | "
\n",
51 | " \n",
52 | " \n",
53 | " | \n",
54 | " cust_id | \n",
55 | " pred_prob | \n",
56 | "
\n",
57 | " \n",
58 | " \n",
59 | " \n",
60 | " | 0 | \n",
61 | " 1 | \n",
62 | " 0.014872 | \n",
63 | "
\n",
64 | " \n",
65 | " | 1 | \n",
66 | " 2 | \n",
67 | " 0.036687 | \n",
68 | "
\n",
69 | " \n",
70 | " | 2 | \n",
71 | " 3 | \n",
72 | " 0.194181 | \n",
73 | "
\n",
74 | " \n",
75 | " | 3 | \n",
76 | " 4 | \n",
77 | " 0.064974 | \n",
78 | "
\n",
79 | " \n",
80 | " | 4 | \n",
81 | " 5 | \n",
82 | " 0.118509 | \n",
83 | "
\n",
84 | " \n",
85 | "
\n",
86 | "
"
87 | ],
88 | "text/plain": [
89 | " cust_id pred_prob\n",
90 | "0 1 0.014872\n",
91 | "1 2 0.036687\n",
92 | "2 3 0.194181\n",
93 | "3 4 0.064974\n",
94 | "4 5 0.118509"
95 | ]
96 | },
97 | "execution_count": 5,
98 | "metadata": {},
99 | "output_type": "execute_result"
100 | }
101 | ],
102 | "source": [
103 | "_ensemble.head()"
104 | ]
105 | },
106 | {
107 | "cell_type": "code",
108 | "execution_count": 6,
109 | "metadata": {
110 | "collapsed": true
111 | },
112 | "outputs": [],
113 | "source": [
114 | "result = semi_ensemble.pred_prob*0.4 + _ensemble.pred_prob*0.6"
115 | ]
116 | },
117 | {
118 | "cell_type": "code",
119 | "execution_count": 7,
120 | "metadata": {
121 | "collapsed": true
122 | },
123 | "outputs": [],
124 | "source": [
125 | "pred = _ensemble.copy()  # copy so _ensemble keeps its loaded predictions and the blend cell can be re-run\n",
126 | "pred['pred_prob'] = result"
127 | ]
128 | },
129 | {
130 | "cell_type": "code",
131 | "execution_count": 8,
132 | "metadata": {
133 | "collapsed": false
134 | },
135 | "outputs": [
136 | {
137 | "data": {
138 | "text/html": [
139 | "\n",
140 | "
\n",
141 | " \n",
142 | " \n",
143 | " | \n",
144 | " cust_id | \n",
145 | " pred_prob | \n",
146 | "
\n",
147 | " \n",
148 | " \n",
149 | " \n",
150 | " | 0 | \n",
151 | " 1 | \n",
152 | " 0.015424 | \n",
153 | "
\n",
154 | " \n",
155 | " | 1 | \n",
156 | " 2 | \n",
157 | " 0.034652 | \n",
158 | "
\n",
159 | " \n",
160 | " | 2 | \n",
161 | " 3 | \n",
162 | " 0.146920 | \n",
163 | "
\n",
164 | " \n",
165 | " | 3 | \n",
166 | " 4 | \n",
167 | " 0.062570 | \n",
168 | "
\n",
169 | " \n",
170 | " | 4 | \n",
171 | " 5 | \n",
172 | " 0.093373 | \n",
173 | "
\n",
174 | " \n",
175 | "
\n",
176 | "
"
177 | ],
178 | "text/plain": [
179 | " cust_id pred_prob\n",
180 | "0 1 0.015424\n",
181 | "1 2 0.034652\n",
182 | "2 3 0.146920\n",
183 | "3 4 0.062570\n",
184 | "4 5 0.093373"
185 | ]
186 | },
187 | "execution_count": 8,
188 | "metadata": {},
189 | "output_type": "execute_result"
190 | }
191 | ],
192 | "source": [
193 | "pred.head()"
194 | ]
195 | },
196 | {
197 | "cell_type": "code",
198 | "execution_count": 9,
199 | "metadata": {
200 | "collapsed": true
201 | },
202 | "outputs": [],
203 | "source": [
204 | "pred.to_csv('../result/ensemble_final.csv',index = False)"
205 | ]
206 | },
207 | {
208 | "cell_type": "code",
209 | "execution_count": null,
210 | "metadata": {
211 | "collapsed": true
212 | },
213 | "outputs": [],
214 | "source": []
215 | }
216 | ],
217 | "metadata": {
218 | "anaconda-cloud": {},
219 | "kernelspec": {
220 | "display_name": "Python [default]",
221 | "language": "python",
222 | "name": "python3"
223 | },
224 | "language_info": {
225 | "codemirror_mode": {
226 | "name": "ipython",
227 | "version": 3
228 | },
229 | "file_extension": ".py",
230 | "mimetype": "text/x-python",
231 | "name": "python",
232 | "nbconvert_exporter": "python",
233 | "pygments_lexer": "ipython3",
234 | "version": "3.5.2"
235 | }
236 | },
237 | "nbformat": 4,
238 | "nbformat_minor": 1
239 | }
240 |
--------------------------------------------------------------------------------
/模型二代码b榜0.749880/code/.ipynb_checkpoints/ensemble1-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import numpy as np\n",
12 | "import pandas as pd\n",
13 | "import matplotlib.pyplot as plt\n",
14 | "%matplotlib inline "
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 5,
20 | "metadata": {
21 | "collapsed": true
22 | },
23 | "outputs": [],
24 | "source": [
25 | "ensemble = pd.read_csv('../result/free2/sub_0_4_9_11_12.csv')"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 6,
31 | "metadata": {
32 | "collapsed": true
33 | },
34 | "outputs": [],
35 | "source": [
36 | "semi_ensemble = pd.read_csv('../result/semi_ensemble.csv')"
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": 8,
42 | "metadata": {
43 | "collapsed": false
44 | },
45 | "outputs": [
46 | {
47 | "data": {
48 | "text/html": [
49 | "\n",
50 | "
\n",
51 | " \n",
52 | " \n",
53 | " | \n",
54 | " cust_id | \n",
55 | " pred_prob | \n",
56 | "
\n",
57 | " \n",
58 | " \n",
59 | " \n",
60 | " | 0 | \n",
61 | " 1 | \n",
62 | " 0.016407 | \n",
63 | "
\n",
64 | " \n",
65 | " | 1 | \n",
66 | " 2 | \n",
67 | " 0.051909 | \n",
68 | "
\n",
69 | " \n",
70 | " | 2 | \n",
71 | " 3 | \n",
72 | " 0.194874 | \n",
73 | "
\n",
74 | " \n",
75 | " | 3 | \n",
76 | " 4 | \n",
77 | " 0.081682 | \n",
78 | "
\n",
79 | " \n",
80 | " | 4 | \n",
81 | " 5 | \n",
82 | " 0.154739 | \n",
83 | "
\n",
84 | " \n",
85 | "
\n",
86 | "
"
87 | ],
88 | "text/plain": [
89 | " cust_id pred_prob\n",
90 | "0 1 0.016407\n",
91 | "1 2 0.051909\n",
92 | "2 3 0.194874\n",
93 | "3 4 0.081682\n",
94 | "4 5 0.154739"
95 | ]
96 | },
97 | "execution_count": 8,
98 | "metadata": {},
99 | "output_type": "execute_result"
100 | }
101 | ],
102 | "source": [
103 | "ensemble.head()"
104 | ]
105 | },
106 | {
107 | "cell_type": "code",
108 | "execution_count": 9,
109 | "metadata": {
110 | "collapsed": true
111 | },
112 | "outputs": [],
113 | "source": [
114 | "result = ensemble.pred_prob*0.7 + semi_ensemble.pred_prob*0.3"
115 | ]
116 | },
117 | {
118 | "cell_type": "code",
119 | "execution_count": 10,
120 | "metadata": {
121 | "collapsed": true
122 | },
123 | "outputs": [],
124 | "source": [
125 | "pred = _ensemble\n",
126 | "pred.pred_prob = result"
127 | ]
128 | },
129 | {
130 | "cell_type": "code",
131 | "execution_count": 11,
132 | "metadata": {
133 | "collapsed": false
134 | },
135 | "outputs": [
136 | {
137 | "data": {
138 | "text/html": [
139 | "\n",
140 | "
\n",
141 | " \n",
142 | " \n",
143 | " | \n",
144 | " cust_id | \n",
145 | " pred_prob | \n",
146 | "
\n",
147 | " \n",
148 | " \n",
149 | " \n",
150 | " | 0 | \n",
151 | " 1 | \n",
152 | " 0.018662 | \n",
153 | "
\n",
154 | " \n",
155 | " | 1 | \n",
156 | " 2 | \n",
157 | " 0.051679 | \n",
158 | "
\n",
159 | " \n",
160 | " | 2 | \n",
161 | " 3 | \n",
162 | " 0.191229 | \n",
163 | "
\n",
164 | " \n",
165 | " | 3 | \n",
166 | " 4 | \n",
167 | " 0.092566 | \n",
168 | "
\n",
169 | " \n",
170 | " | 4 | \n",
171 | " 5 | \n",
172 | " 0.142426 | \n",
173 | "
\n",
174 | " \n",
175 | "
\n",
176 | "
"
177 | ],
178 | "text/plain": [
179 | " cust_id pred_prob\n",
180 | "0 1 0.018662\n",
181 | "1 2 0.051679\n",
182 | "2 3 0.191229\n",
183 | "3 4 0.092566\n",
184 | "4 5 0.142426"
185 | ]
186 | },
187 | "execution_count": 11,
188 | "metadata": {},
189 | "output_type": "execute_result"
190 | }
191 | ],
192 | "source": [
193 | "pred.head()"
194 | ]
195 | },
196 | {
197 | "cell_type": "code",
198 | "execution_count": 12,
199 | "metadata": {
200 | "collapsed": true
201 | },
202 | "outputs": [],
203 | "source": [
204 | "pred.to_csv('../result/ensemble_final1.csv',index = False)"
205 | ]
206 | },
207 | {
208 | "cell_type": "code",
209 | "execution_count": null,
210 | "metadata": {
211 | "collapsed": true
212 | },
213 | "outputs": [],
214 | "source": []
215 | }
216 | ],
217 | "metadata": {
218 | "kernelspec": {
219 | "display_name": "Python [default]",
220 | "language": "python",
221 | "name": "python3"
222 | },
223 | "language_info": {
224 | "codemirror_mode": {
225 | "name": "ipython",
226 | "version": 3
227 | },
228 | "file_extension": ".py",
229 | "mimetype": "text/x-python",
230 | "name": "python",
231 | "nbconvert_exporter": "python",
232 | "pygments_lexer": "ipython3",
233 | "version": "3.5.2"
234 | }
235 | },
236 | "nbformat": 4,
237 | "nbformat_minor": 1
238 | }
239 |
--------------------------------------------------------------------------------
/模型二代码b榜0.749880/code/.ipynb_checkpoints/ensemble-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import numpy as np\n",
12 | "import pandas as pd\n",
13 | "import matplotlib.pyplot as plt\n",
14 | "%matplotlib inline"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 2,
20 | "metadata": {
21 | "collapsed": true
22 | },
23 | "outputs": [],
24 | "source": [
25 | "semi_ensemble = pd.read_csv('../result/semi_ensemble.csv')"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 4,
31 | "metadata": {
32 | "collapsed": false
33 | },
34 | "outputs": [],
35 | "source": [
36 | "_ensemble = pd.read_csv('../result/_ensemble.csv')"
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": 5,
42 | "metadata": {
43 | "collapsed": false
44 | },
45 | "outputs": [
46 | {
47 | "data": {
48 | "text/html": [
49 | "\n",
50 | "
\n",
51 | " \n",
52 | " \n",
53 | " | \n",
54 | " cust_id | \n",
55 | " pred_prob | \n",
56 | "
\n",
57 | " \n",
58 | " \n",
59 | " \n",
60 | " | 0 | \n",
61 | " 1 | \n",
62 | " 0.014872 | \n",
63 | "
\n",
64 | " \n",
65 | " | 1 | \n",
66 | " 2 | \n",
67 | " 0.036687 | \n",
68 | "
\n",
69 | " \n",
70 | " | 2 | \n",
71 | " 3 | \n",
72 | " 0.194181 | \n",
73 | "
\n",
74 | " \n",
75 | " | 3 | \n",
76 | " 4 | \n",
77 | " 0.064974 | \n",
78 | "
\n",
79 | " \n",
80 | " | 4 | \n",
81 | " 5 | \n",
82 | " 0.118509 | \n",
83 | "
\n",
84 | " \n",
85 | "
\n",
86 | "
"
87 | ],
88 | "text/plain": [
89 | " cust_id pred_prob\n",
90 | "0 1 0.014872\n",
91 | "1 2 0.036687\n",
92 | "2 3 0.194181\n",
93 | "3 4 0.064974\n",
94 | "4 5 0.118509"
95 | ]
96 | },
97 | "execution_count": 5,
98 | "metadata": {},
99 | "output_type": "execute_result"
100 | }
101 | ],
102 | "source": [
103 | "_ensemble.head()"
104 | ]
105 | },
106 | {
107 | "cell_type": "code",
108 | "execution_count": 6,
109 | "metadata": {
110 | "collapsed": true
111 | },
112 | "outputs": [],
113 | "source": [
114 | "result = semi_ensemble.pred_prob*0.4 + _ensemble.pred_prob*0.6"
115 | ]
116 | },
117 | {
118 | "cell_type": "code",
119 | "execution_count": 7,
120 | "metadata": {
121 | "collapsed": true
122 | },
123 | "outputs": [],
124 | "source": [
125 | "pred = _ensemble\n",
126 | "pred.pred_prob = result"
127 | ]
128 | },
129 | {
130 | "cell_type": "code",
131 | "execution_count": 8,
132 | "metadata": {
133 | "collapsed": false
134 | },
135 | "outputs": [
136 | {
137 | "data": {
138 | "text/html": [
139 | "\n",
140 | "
\n",
141 | " \n",
142 | " \n",
143 | " | \n",
144 | " cust_id | \n",
145 | " pred_prob | \n",
146 | "
\n",
147 | " \n",
148 | " \n",
149 | " \n",
150 | " | 0 | \n",
151 | " 1 | \n",
152 | " 0.015424 | \n",
153 | "
\n",
154 | " \n",
155 | " | 1 | \n",
156 | " 2 | \n",
157 | " 0.034652 | \n",
158 | "
\n",
159 | " \n",
160 | " | 2 | \n",
161 | " 3 | \n",
162 | " 0.146920 | \n",
163 | "
\n",
164 | " \n",
165 | " | 3 | \n",
166 | " 4 | \n",
167 | " 0.062570 | \n",
168 | "
\n",
169 | " \n",
170 | " | 4 | \n",
171 | " 5 | \n",
172 | " 0.093373 | \n",
173 | "
\n",
174 | " \n",
175 | "
\n",
176 | "
"
177 | ],
178 | "text/plain": [
179 | " cust_id pred_prob\n",
180 | "0 1 0.015424\n",
181 | "1 2 0.034652\n",
182 | "2 3 0.146920\n",
183 | "3 4 0.062570\n",
184 | "4 5 0.093373"
185 | ]
186 | },
187 | "execution_count": 8,
188 | "metadata": {},
189 | "output_type": "execute_result"
190 | }
191 | ],
192 | "source": [
193 | "pred.head()"
194 | ]
195 | },
196 | {
197 | "cell_type": "code",
198 | "execution_count": 9,
199 | "metadata": {
200 | "collapsed": true
201 | },
202 | "outputs": [],
203 | "source": [
204 | "pred.to_csv('../result/ensemble_final.csv',index = False)"
205 | ]
206 | },
207 | {
208 | "cell_type": "code",
209 | "execution_count": null,
210 | "metadata": {
211 | "collapsed": true
212 | },
213 | "outputs": [],
214 | "source": []
215 | }
216 | ],
217 | "metadata": {
218 | "anaconda-cloud": {},
219 | "kernelspec": {
220 | "display_name": "Python [default]",
221 | "language": "python",
222 | "name": "python3"
223 | },
224 | "language_info": {
225 | "codemirror_mode": {
226 | "name": "ipython",
227 | "version": 3
228 | },
229 | "file_extension": ".py",
230 | "mimetype": "text/x-python",
231 | "name": "python",
232 | "nbconvert_exporter": "python",
233 | "pygments_lexer": "ipython3",
234 | "version": "3.5.2"
235 | }
236 | },
237 | "nbformat": 4,
238 | "nbformat_minor": 1
239 | }
240 |
--------------------------------------------------------------------------------
/模型二代码b榜0.749880/code/xgb预测.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import numpy as np\n",
12 | "import pandas as pd\n",
13 | "import matplotlib.pyplot as plt\n",
14 | "%matplotlib inline "
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 3,
20 | "metadata": {
21 | "collapsed": false
22 | },
23 | "outputs": [],
24 | "source": [
25 | "train = pd.read_csv('../data/train_xy.csv')"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 4,
31 | "metadata": {
32 | "collapsed": true
33 | },
34 | "outputs": [],
35 | "source": [
36 | "test = pd.read_csv('../data/test_all.csv')"
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": 6,
42 | "metadata": {
43 | "collapsed": false
44 | },
45 | "outputs": [
46 | {
47 | "data": {
48 | "text/plain": [
49 | "(10000, 157)"
50 | ]
51 | },
52 | "execution_count": 6,
53 | "metadata": {},
54 | "output_type": "execute_result"
55 | }
56 | ],
57 | "source": [
58 | "x_train = train.drop(['cust_group','y','\\ufeffcust_id'],axis =1)\n",
59 | "x_test = test.drop(['cust_group','cust_id'],axis=1)\n",
60 | "x_train.shape\n",
61 | "x_test.shape"
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": 7,
67 | "metadata": {
68 | "collapsed": false
69 | },
70 | "outputs": [
71 | {
72 | "data": {
73 | "text/plain": [
74 | "(25000, 157)"
75 | ]
76 | },
77 | "execution_count": 7,
78 | "metadata": {},
79 | "output_type": "execute_result"
80 | }
81 | ],
82 | "source": [
83 | "x = pd.concat([x_train,x_test])\n",
84 | "x.shape"
85 | ]
86 | },
87 | {
88 | "cell_type": "code",
89 | "execution_count": 8,
90 | "metadata": {
91 | "collapsed": true
92 | },
93 | "outputs": [],
94 | "source": [
95 | "Y_train = train['y']"
96 | ]
97 | },
98 | {
99 | "cell_type": "code",
100 | "execution_count": 9,
101 | "metadata": {
102 | "collapsed": false
103 | },
104 | "outputs": [],
105 | "source": [
106 | "for idx in range(96,158):\n",
107 | "    col = 'x_{}'.format(idx)\n",
108 | "    dummies_df = pd.get_dummies(x[col]).rename(columns=lambda level: col + str(level))\n",
109 | "    x = pd.concat([x, dummies_df], axis=1)"
110 | ]
111 | },
112 | {
113 | "cell_type": "code",
114 | "execution_count": 10,
115 | "metadata": {
116 | "collapsed": false
117 | },
118 | "outputs": [
119 | {
120 | "name": "stdout",
121 | "output_type": "stream",
122 | "text": [
123 | "(15000, 355)\n",
124 | "(10000, 355)\n"
125 | ]
126 | }
127 | ],
128 | "source": [
129 | "train_X = x[0:15000]\n",
130 | "test_X = x[15000:25000]\n",
131 | "print(train_X.shape)\n",
132 | "print(test_X.shape)"
133 | ]
134 | },
135 | {
136 | "cell_type": "code",
137 | "execution_count": 11,
138 | "metadata": {
139 | "collapsed": true
140 | },
141 | "outputs": [],
142 | "source": [
143 | "from sklearn.metrics import accuracy_score\n",
144 | "from sklearn import metrics\n",
145 | "from sklearn.model_selection import train_test_split\n",
146 | "from xgboost import XGBClassifier"
147 | ]
148 | },
149 | {
150 | "cell_type": "code",
151 | "execution_count": 13,
152 | "metadata": {
153 | "collapsed": false
154 | },
155 | "outputs": [],
156 | "source": [
157 | "X_train,X_val,y_train,y_val= train_test_split(train_X,Y_train,test_size=0.2,random_state=2)"
158 | ]
159 | },
160 | {
161 | "cell_type": "code",
162 | "execution_count": 14,
163 | "metadata": {
164 | "collapsed": false
165 | },
166 | "outputs": [],
167 | "source": [
168 | "gbm = XGBClassifier( n_estimators= 100, max_depth= 4, min_child_weight= 2, gamma=0.9, subsample=0.8, \n",
169 | " colsample_bytree=0.8, objective= 'binary:logistic', nthread= -1, scale_pos_weight=1).fit(X_train, y_train)"
170 | ]
171 | },
172 | {
173 | "cell_type": "code",
174 | "execution_count": 15,
175 | "metadata": {
176 | "collapsed": false
177 | },
178 | "outputs": [
179 | {
180 | "name": "stdout",
181 | "output_type": "stream",
182 | "text": [
183 | "0.80642048092\n"
184 | ]
185 | }
186 | ],
187 | "source": [
188 | "predictions = gbm.predict_proba(X_val)\n",
189 | "pre = predictions[:,1]\n",
190 | "val_auc = metrics.roc_auc_score(y_val,pre)#验证集上的auc值\n",
191 | "print(val_auc)"
192 | ]
193 | },
194 | {
195 | "cell_type": "code",
196 | "execution_count": 16,
197 | "metadata": {
198 | "collapsed": false
199 | },
200 | "outputs": [
201 | {
202 | "data": {
203 | "text/plain": [
204 | "(10000,)"
205 | ]
206 | },
207 | "execution_count": 16,
208 | "metadata": {},
209 | "output_type": "execute_result"
210 | }
211 | ],
212 | "source": [
213 | "preds = gbm.predict_proba(test_X)\n",
214 | "pred = preds[:,1]\n",
215 | "pred.shape"
216 | ]
217 | },
218 | {
219 | "cell_type": "code",
220 | "execution_count": 18,
221 | "metadata": {
222 | "collapsed": true
223 | },
224 | "outputs": [],
225 | "source": [
226 | "Submission = pd.DataFrame({'cust_id': test['cust_id'], 'pred_prob': pred})\n",
227 | "Submission.to_csv('../result/xgb.csv',index=False)"
228 | ]
229 | },
230 | {
231 | "cell_type": "code",
232 | "execution_count": null,
233 | "metadata": {
234 | "collapsed": true
235 | },
236 | "outputs": [],
237 | "source": []
238 | }
239 | ],
240 | "metadata": {
241 | "anaconda-cloud": {},
242 | "kernelspec": {
243 | "display_name": "Python [default]",
244 | "language": "python",
245 | "name": "python3"
246 | },
247 | "language_info": {
248 | "codemirror_mode": {
249 | "name": "ipython",
250 | "version": 3
251 | },
252 | "file_extension": ".py",
253 | "mimetype": "text/x-python",
254 | "name": "python",
255 | "nbconvert_exporter": "python",
256 | "pygments_lexer": "ipython3",
257 | "version": "3.5.2"
258 | }
259 | },
260 | "nbformat": 4,
261 | "nbformat_minor": 1
262 | }
263 |
--------------------------------------------------------------------------------
/模型二代码b榜0.749880/code/gbdt预测.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import numpy as np\n",
12 | "import pandas as pd\n",
13 | "import matplotlib.pyplot as plt\n",
14 | "%matplotlib inline"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 2,
20 | "metadata": {
21 | "collapsed": false
22 | },
23 | "outputs": [],
24 | "source": [
25 | "train = pd.read_csv('../data/train_xy.csv')"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 3,
31 | "metadata": {
32 | "collapsed": true
33 | },
34 | "outputs": [],
35 | "source": [
36 | "test = pd.read_csv('../data/test_all.csv')"
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": 5,
42 | "metadata": {
43 | "collapsed": false
44 | },
45 | "outputs": [
46 | {
47 | "data": {
48 | "text/plain": [
49 | "(10000, 157)"
50 | ]
51 | },
52 | "execution_count": 5,
53 | "metadata": {},
54 | "output_type": "execute_result"
55 | }
56 | ],
57 | "source": [
58 | "x_train = train.drop(['cust_group','y','\\ufeffcust_id'],axis =1)\n",
59 | "x_test = test.drop(['cust_group','cust_id'],axis=1)\n",
60 | "x_train.shape\n",
61 | "x_test.shape"
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": 6,
67 | "metadata": {
68 | "collapsed": false
69 | },
70 | "outputs": [
71 | {
72 | "data": {
73 | "text/plain": [
74 | "(25000, 157)"
75 | ]
76 | },
77 | "execution_count": 6,
78 | "metadata": {},
79 | "output_type": "execute_result"
80 | }
81 | ],
82 | "source": [
83 | "x = pd.concat([x_train,x_test])\n",
84 | "x.shape"
85 | ]
86 | },
87 | {
88 | "cell_type": "code",
89 | "execution_count": 7,
90 | "metadata": {
91 | "collapsed": true
92 | },
93 | "outputs": [],
94 | "source": [
95 | "Y_train = train['y']"
96 | ]
97 | },
98 | {
99 | "cell_type": "code",
100 | "execution_count": 8,
101 | "metadata": {
102 | "collapsed": false
103 | },
104 | "outputs": [],
105 | "source": [
106 | "for i in range(96,158):\n",
107 | " col = 'x'+'_'+str(i)\n",
108 | " dummies_df = pd.get_dummies(x[col]).rename(columns=lambda x: col + str(x))\n",
109 | " x = pd.concat([x, dummies_df], axis=1)"
110 | ]
111 | },
112 | {
113 | "cell_type": "code",
114 | "execution_count": 9,
115 | "metadata": {
116 | "collapsed": false
117 | },
118 | "outputs": [
119 | {
120 | "name": "stdout",
121 | "output_type": "stream",
122 | "text": [
123 | "(15000, 355)\n",
124 | "(10000, 355)\n"
125 | ]
126 | }
127 | ],
128 | "source": [
129 | "train_X = x[0:15000]\n",
130 | "test_X = x[15000:25000]\n",
131 | "print(train_X.shape)\n",
132 | "print(test_X.shape)"
133 | ]
134 | },
135 | {
136 | "cell_type": "code",
137 | "execution_count": 10,
138 | "metadata": {
139 | "collapsed": false
140 | },
141 | "outputs": [],
142 | "source": [
143 | "from sklearn.metrics import accuracy_score\n",
144 | "from sklearn import metrics\n",
145 | "from sklearn.model_selection import train_test_split\n",
146 | "from xgboost import XGBClassifier\n",
147 | "from sklearn.ensemble import GradientBoostingClassifier"
148 | ]
149 | },
150 | {
151 | "cell_type": "code",
152 | "execution_count": 11,
153 | "metadata": {
154 | "collapsed": false
155 | },
156 | "outputs": [],
157 | "source": [
158 | "X_train,X_val,y_train,y_val= train_test_split(train_X,Y_train,test_size=0.2,random_state=2)"
159 | ]
160 | },
161 | {
162 | "cell_type": "code",
163 | "execution_count": 16,
164 | "metadata": {
165 | "collapsed": false
166 | },
167 | "outputs": [],
168 | "source": [
169 | "clf = GradientBoostingClassifier(n_estimators=120, learning_rate=0.05,max_depth=3, random_state=0).fit(X_train, y_train)"
170 | ]
171 | },
172 | {
173 | "cell_type": "code",
174 | "execution_count": 17,
175 | "metadata": {
176 | "collapsed": false
177 | },
178 | "outputs": [
179 | {
180 | "name": "stdout",
181 | "output_type": "stream",
182 | "text": [
183 | "0.798263097577\n"
184 | ]
185 | }
186 | ],
187 | "source": [
188 | "predictions = clf.predict_proba(X_val)\n",
189 | "pre = predictions[:,1]\n",
190 | "val_auc = metrics.roc_auc_score(y_val,pre)#验证集上的auc值\n",
191 | "print(val_auc)"
192 | ]
193 | },
194 | {
195 | "cell_type": "code",
196 | "execution_count": 18,
197 | "metadata": {
198 | "collapsed": false
199 | },
200 | "outputs": [
201 | {
202 | "data": {
203 | "text/plain": [
204 | "(10000,)"
205 | ]
206 | },
207 | "execution_count": 18,
208 | "metadata": {},
209 | "output_type": "execute_result"
210 | }
211 | ],
212 | "source": [
213 | "preds = clf.predict_proba(test_X)\n",
214 | "pred = preds[:,1]\n",
215 | "pred.shape"
216 | ]
217 | },
218 | {
219 | "cell_type": "code",
220 | "execution_count": 19,
221 | "metadata": {
222 | "collapsed": true
223 | },
224 | "outputs": [],
225 | "source": [
226 | "Submission = pd.DataFrame({'cust_id': test['cust_id'], 'pred_prob': pred})\n",
227 | "Submission.to_csv('../result/gbdt.csv',index=False)"
228 | ]
229 | },
230 | {
231 | "cell_type": "code",
232 | "execution_count": null,
233 | "metadata": {
234 | "collapsed": true
235 | },
236 | "outputs": [],
237 | "source": []
238 | },
239 | {
240 | "cell_type": "code",
241 | "execution_count": null,
242 | "metadata": {
243 | "collapsed": true
244 | },
245 | "outputs": [],
246 | "source": []
247 | }
248 | ],
249 | "metadata": {
250 | "anaconda-cloud": {},
251 | "kernelspec": {
252 | "display_name": "Python [default]",
253 | "language": "python",
254 | "name": "python3"
255 | },
256 | "language_info": {
257 | "codemirror_mode": {
258 | "name": "ipython",
259 | "version": 3
260 | },
261 | "file_extension": ".py",
262 | "mimetype": "text/x-python",
263 | "name": "python",
264 | "nbconvert_exporter": "python",
265 | "pygments_lexer": "ipython3",
266 | "version": "3.5.2"
267 | }
268 | },
269 | "nbformat": 4,
270 | "nbformat_minor": 1
271 | }
272 |
--------------------------------------------------------------------------------
/模型二代码b榜0.749880/code/.ipynb_checkpoints/gbdt预测-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import numpy as np\n",
12 | "import pandas as pd\n",
13 | "import matplotlib.pyplot as plt\n",
14 | "%matplotlib inline"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 2,
20 | "metadata": {
21 | "collapsed": false
22 | },
23 | "outputs": [],
24 | "source": [
25 | "train = pd.read_csv('../data/train_xy.csv')"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 3,
31 | "metadata": {
32 | "collapsed": true
33 | },
34 | "outputs": [],
35 | "source": [
36 | "test = pd.read_csv('../data/test_all.csv')"
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": 5,
42 | "metadata": {
43 | "collapsed": false
44 | },
45 | "outputs": [
46 | {
47 | "data": {
48 | "text/plain": [
49 | "(10000, 157)"
50 | ]
51 | },
52 | "execution_count": 5,
53 | "metadata": {},
54 | "output_type": "execute_result"
55 | }
56 | ],
57 | "source": [
58 | "x_train = train.drop(['cust_group','y','\\ufeffcust_id'],axis =1)\n",
59 | "x_test = test.drop(['cust_group','cust_id'],axis=1)\n",
60 | "x_train.shape\n",
61 | "x_test.shape"
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": 6,
67 | "metadata": {
68 | "collapsed": false
69 | },
70 | "outputs": [
71 | {
72 | "data": {
73 | "text/plain": [
74 | "(25000, 157)"
75 | ]
76 | },
77 | "execution_count": 6,
78 | "metadata": {},
79 | "output_type": "execute_result"
80 | }
81 | ],
82 | "source": [
83 | "x = pd.concat([x_train,x_test])\n",
84 | "x.shape"
85 | ]
86 | },
87 | {
88 | "cell_type": "code",
89 | "execution_count": 7,
90 | "metadata": {
91 | "collapsed": true
92 | },
93 | "outputs": [],
94 | "source": [
95 | "Y_train = train['y']"
96 | ]
97 | },
98 | {
99 | "cell_type": "code",
100 | "execution_count": 8,
101 | "metadata": {
102 | "collapsed": false
103 | },
104 | "outputs": [],
105 | "source": [
106 | "for i in range(96,158):\n",
107 | " col = 'x'+'_'+str(i)\n",
108 | " dummies_df = pd.get_dummies(x[col]).rename(columns=lambda x: col + str(x))\n",
109 | " x = pd.concat([x, dummies_df], axis=1)"
110 | ]
111 | },
112 | {
113 | "cell_type": "code",
114 | "execution_count": 9,
115 | "metadata": {
116 | "collapsed": false
117 | },
118 | "outputs": [
119 | {
120 | "name": "stdout",
121 | "output_type": "stream",
122 | "text": [
123 | "(15000, 355)\n",
124 | "(10000, 355)\n"
125 | ]
126 | }
127 | ],
128 | "source": [
129 | "train_X = x[0:15000]\n",
130 | "test_X = x[15000:25000]\n",
131 | "print(train_X.shape)\n",
132 | "print(test_X.shape)"
133 | ]
134 | },
135 | {
136 | "cell_type": "code",
137 | "execution_count": 10,
138 | "metadata": {
139 | "collapsed": false
140 | },
141 | "outputs": [],
142 | "source": [
143 | "from sklearn.metrics import accuracy_score\n",
144 | "from sklearn import metrics\n",
145 | "from sklearn.model_selection import train_test_split\n",
146 | "from xgboost import XGBClassifier\n",
147 | "from sklearn.ensemble import GradientBoostingClassifier"
148 | ]
149 | },
150 | {
151 | "cell_type": "code",
152 | "execution_count": 11,
153 | "metadata": {
154 | "collapsed": false
155 | },
156 | "outputs": [],
157 | "source": [
158 | "X_train,X_val,y_train,y_val= train_test_split(train_X,Y_train,test_size=0.2,random_state=2)"
159 | ]
160 | },
161 | {
162 | "cell_type": "code",
163 | "execution_count": 16,
164 | "metadata": {
165 | "collapsed": false
166 | },
167 | "outputs": [],
168 | "source": [
169 | "clf = GradientBoostingClassifier(n_estimators=120, learning_rate=0.05,max_depth=3, random_state=0).fit(X_train, y_train)"
170 | ]
171 | },
172 | {
173 | "cell_type": "code",
174 | "execution_count": 17,
175 | "metadata": {
176 | "collapsed": false
177 | },
178 | "outputs": [
179 | {
180 | "name": "stdout",
181 | "output_type": "stream",
182 | "text": [
183 | "0.798263097577\n"
184 | ]
185 | }
186 | ],
187 | "source": [
188 | "predictions = clf.predict_proba(X_val)\n",
189 | "pre = predictions[:,1]\n",
190 | "val_auc = metrics.roc_auc_score(y_val,pre)#验证集上的auc值\n",
191 | "print(val_auc)"
192 | ]
193 | },
194 | {
195 | "cell_type": "code",
196 | "execution_count": 18,
197 | "metadata": {
198 | "collapsed": false
199 | },
200 | "outputs": [
201 | {
202 | "data": {
203 | "text/plain": [
204 | "(10000,)"
205 | ]
206 | },
207 | "execution_count": 18,
208 | "metadata": {},
209 | "output_type": "execute_result"
210 | }
211 | ],
212 | "source": [
213 | "preds = clf.predict_proba(test_X)\n",
214 | "pred = preds[:,1]\n",
215 | "pred.shape"
216 | ]
217 | },
218 | {
219 | "cell_type": "code",
220 | "execution_count": 19,
221 | "metadata": {
222 | "collapsed": true
223 | },
224 | "outputs": [],
225 | "source": [
226 | "Submission = pd.DataFrame({'cust_id': test['cust_id'], 'pred_prob': pred})\n",
227 | "Submission.to_csv('../result/gbdt.csv',index=False)"
228 | ]
229 | },
230 | {
231 | "cell_type": "code",
232 | "execution_count": null,
233 | "metadata": {
234 | "collapsed": true
235 | },
236 | "outputs": [],
237 | "source": []
238 | },
239 | {
240 | "cell_type": "code",
241 | "execution_count": null,
242 | "metadata": {
243 | "collapsed": true
244 | },
245 | "outputs": [],
246 | "source": []
247 | }
248 | ],
249 | "metadata": {
250 | "anaconda-cloud": {},
251 | "kernelspec": {
252 | "display_name": "Python [default]",
253 | "language": "python",
254 | "name": "python3"
255 | },
256 | "language_info": {
257 | "codemirror_mode": {
258 | "name": "ipython",
259 | "version": 3
260 | },
261 | "file_extension": ".py",
262 | "mimetype": "text/x-python",
263 | "name": "python",
264 | "nbconvert_exporter": "python",
265 | "pygments_lexer": "ipython3",
266 | "version": "3.5.2"
267 | }
268 | },
269 | "nbformat": 4,
270 | "nbformat_minor": 1
271 | }
272 |
--------------------------------------------------------------------------------
/模型二代码b榜0.749880/code/_ensemble.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import numpy as np\n",
12 | "import pandas as pd\n",
13 | "import matplotlib.pyplot as plt\n",
14 | "%matplotlib inline "
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 2,
20 | "metadata": {
21 | "collapsed": true
22 | },
23 | "outputs": [],
24 | "source": [
25 | "xgb = pd.read_csv('../result/xgb.csv')"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 3,
31 | "metadata": {
32 | "collapsed": true
33 | },
34 | "outputs": [],
35 | "source": [
36 | "lgb = pd.read_csv('../result/lgb.csv')"
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": 4,
42 | "metadata": {
43 | "collapsed": true
44 | },
45 | "outputs": [],
46 | "source": [
47 | "gbdt = pd.read_csv('../result/gbdt.csv')"
48 | ]
49 | },
50 | {
51 | "cell_type": "code",
52 | "execution_count": 5,
53 | "metadata": {
54 | "collapsed": true
55 | },
56 | "outputs": [],
57 | "source": [
58 | "nn = pd.read_csv('../result/nn.csv')"
59 | ]
60 | },
61 | {
62 | "cell_type": "code",
63 | "execution_count": 6,
64 | "metadata": {
65 | "collapsed": false
66 | },
67 | "outputs": [
68 | {
69 | "data": {
70 | "text/html": [
71 | "<div>\n",
72 | "<table border=\"1\" class=\"dataframe\">\n",
73 | "  <thead>\n",
74 | "    <tr style=\"text-align: right;\">\n",
75 | "      <th></th>\n",
76 | "      <th>cust_id</th>\n",
77 | "      <th>pred_prob</th>\n",
78 | "    </tr>\n",
79 | "  </thead>\n",
80 | "  <tbody>\n",
81 | "    <tr>\n",
82 | "      <th>0</th>\n",
83 | "      <td>1</td>\n",
84 | "      <td>0.005983</td>\n",
85 | "    </tr>\n",
86 | "    <tr>\n",
87 | "      <th>1</th>\n",
88 | "      <td>2</td>\n",
89 | "      <td>0.025552</td>\n",
90 | "    </tr>\n",
91 | "    <tr>\n",
92 | "      <th>2</th>\n",
93 | "      <td>3</td>\n",
94 | "      <td>0.018066</td>\n",
95 | "    </tr>\n",
96 | "    <tr>\n",
97 | "      <th>3</th>\n",
98 | "      <td>4</td>\n",
99 | "      <td>0.035206</td>\n",
100 | "    </tr>\n",
101 | "    <tr>\n",
102 | "      <th>4</th>\n",
103 | "      <td>5</td>\n",
104 | "      <td>0.067734</td>\n",
105 | "    </tr>\n",
106 | "  </tbody>\n",
107 | "</table>\n",
108 | "</div>"
109 | ],
110 | "text/plain": [
111 | " cust_id pred_prob\n",
112 | "0 1 0.005983\n",
113 | "1 2 0.025552\n",
114 | "2 3 0.018066\n",
115 | "3 4 0.035206\n",
116 | "4 5 0.067734"
117 | ]
118 | },
119 | "execution_count": 6,
120 | "metadata": {},
121 | "output_type": "execute_result"
122 | }
123 | ],
124 | "source": [
125 | "nn.head()"
126 | ]
127 | },
128 | {
129 | "cell_type": "code",
130 | "execution_count": 7,
131 | "metadata": {
132 | "collapsed": false
133 | },
134 | "outputs": [],
135 | "source": [
136 | "result = xgb.pred_prob*0.7 + lgb.pred_prob*0.1 + gbdt.pred_prob*0.1 + nn.pred_prob*0.1"
137 | ]
138 | },
139 | {
140 | "cell_type": "code",
141 | "execution_count": 8,
142 | "metadata": {
143 | "collapsed": false
144 | },
145 | "outputs": [],
146 | "source": [
147 | "pred = nn\n",
148 | "pred.pred_prob = result"
149 | ]
150 | },
151 | {
152 | "cell_type": "code",
153 | "execution_count": 9,
154 | "metadata": {
155 | "collapsed": false
156 | },
157 | "outputs": [
158 | {
159 | "data": {
160 | "text/html": [
161 | "<div>\n",
162 | "<table border=\"1\" class=\"dataframe\">\n",
163 | "  <thead>\n",
164 | "    <tr style=\"text-align: right;\">\n",
165 | "      <th></th>\n",
166 | "      <th>cust_id</th>\n",
167 | "      <th>pred_prob</th>\n",
168 | "    </tr>\n",
169 | "  </thead>\n",
170 | "  <tbody>\n",
171 | "    <tr>\n",
172 | "      <th>0</th>\n",
173 | "      <td>1</td>\n",
174 | "      <td>0.014872</td>\n",
175 | "    </tr>\n",
176 | "    <tr>\n",
177 | "      <th>1</th>\n",
178 | "      <td>2</td>\n",
179 | "      <td>0.036687</td>\n",
180 | "    </tr>\n",
181 | "    <tr>\n",
182 | "      <th>2</th>\n",
183 | "      <td>3</td>\n",
184 | "      <td>0.194181</td>\n",
185 | "    </tr>\n",
186 | "    <tr>\n",
187 | "      <th>3</th>\n",
188 | "      <td>4</td>\n",
189 | "      <td>0.064974</td>\n",
190 | "    </tr>\n",
191 | "    <tr>\n",
192 | "      <th>4</th>\n",
193 | "      <td>5</td>\n",
194 | "      <td>0.118509</td>\n",
195 | "    </tr>\n",
196 | "  </tbody>\n",
197 | "</table>\n",
198 | "</div>"
199 | ],
200 | "text/plain": [
201 | " cust_id pred_prob\n",
202 | "0 1 0.014872\n",
203 | "1 2 0.036687\n",
204 | "2 3 0.194181\n",
205 | "3 4 0.064974\n",
206 | "4 5 0.118509"
207 | ]
208 | },
209 | "execution_count": 9,
210 | "metadata": {},
211 | "output_type": "execute_result"
212 | }
213 | ],
214 | "source": [
215 | "pred.head()"
216 | ]
217 | },
218 | {
219 | "cell_type": "code",
220 | "execution_count": 10,
221 | "metadata": {
222 | "collapsed": true
223 | },
224 | "outputs": [],
225 | "source": [
226 | "pred.to_csv('../result/_ensemble.csv',index = False)"
227 | ]
228 | },
229 | {
230 | "cell_type": "code",
231 | "execution_count": null,
232 | "metadata": {
233 | "collapsed": true
234 | },
235 | "outputs": [],
236 | "source": []
237 | }
238 | ],
239 | "metadata": {
240 | "anaconda-cloud": {},
241 | "kernelspec": {
242 | "display_name": "Python [default]",
243 | "language": "python",
244 | "name": "python3"
245 | },
246 | "language_info": {
247 | "codemirror_mode": {
248 | "name": "ipython",
249 | "version": 3
250 | },
251 | "file_extension": ".py",
252 | "mimetype": "text/x-python",
253 | "name": "python",
254 | "nbconvert_exporter": "python",
255 | "pygments_lexer": "ipython3",
256 | "version": "3.5.2"
257 | }
258 | },
259 | "nbformat": 4,
260 | "nbformat_minor": 1
261 | }
262 |
--------------------------------------------------------------------------------
/模型二代码b榜0.749880/code/semi_ensemble.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import numpy as np\n",
12 | "import pandas as pd\n",
13 | "import matplotlib.pyplot as plt\n",
14 | "%matplotlib inline"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 2,
20 | "metadata": {
21 | "collapsed": true
22 | },
23 | "outputs": [],
24 | "source": [
25 | "xgb = pd.read_csv('../result/semi_xgb.csv')"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 3,
31 | "metadata": {
32 | "collapsed": true
33 | },
34 | "outputs": [],
35 | "source": [
36 | "lgb = pd.read_csv('../result/semi_lgb.csv')"
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": 4,
42 | "metadata": {
43 | "collapsed": true
44 | },
45 | "outputs": [],
46 | "source": [
47 | "gbdt = pd.read_csv('../result/semi_gbdt.csv')"
48 | ]
49 | },
50 | {
51 | "cell_type": "code",
52 | "execution_count": 5,
53 | "metadata": {
54 | "collapsed": true
55 | },
56 | "outputs": [],
57 | "source": [
58 | "nn = pd.read_csv('../result/semi_nn.csv')"
59 | ]
60 | },
61 | {
62 | "cell_type": "code",
63 | "execution_count": 6,
64 | "metadata": {
65 | "collapsed": false
66 | },
67 | "outputs": [
68 | {
69 | "data": {
70 | "text/html": [
71 | "<div>\n",
72 | "<table border=\"1\" class=\"dataframe\">\n",
73 | "  <thead>\n",
74 | "    <tr style=\"text-align: right;\">\n",
75 | "      <th></th>\n",
76 | "      <th>cust_id</th>\n",
77 | "      <th>pred_prob</th>\n",
78 | "    </tr>\n",
79 | "  </thead>\n",
80 | "  <tbody>\n",
81 | "    <tr>\n",
82 | "      <th>0</th>\n",
83 | "      <td>1</td>\n",
84 | "      <td>0.003583</td>\n",
85 | "    </tr>\n",
86 | "    <tr>\n",
87 | "      <th>1</th>\n",
88 | "      <td>2</td>\n",
89 | "      <td>0.069257</td>\n",
90 | "    </tr>\n",
91 | "    <tr>\n",
92 | "      <th>2</th>\n",
93 | "      <td>3</td>\n",
94 | "      <td>0.015288</td>\n",
95 | "    </tr>\n",
96 | "    <tr>\n",
97 | "      <th>3</th>\n",
98 | "      <td>4</td>\n",
99 | "      <td>0.084963</td>\n",
100 | "    </tr>\n",
101 | "    <tr>\n",
102 | "      <th>4</th>\n",
103 | "      <td>5</td>\n",
104 | "      <td>0.064052</td>\n",
105 | "    </tr>\n",
106 | "  </tbody>\n",
107 | "</table>\n",
108 | "</div>"
109 | ],
110 | "text/plain": [
111 | " cust_id pred_prob\n",
112 | "0 1 0.003583\n",
113 | "1 2 0.069257\n",
114 | "2 3 0.015288\n",
115 | "3 4 0.084963\n",
116 | "4 5 0.064052"
117 | ]
118 | },
119 | "execution_count": 6,
120 | "metadata": {},
121 | "output_type": "execute_result"
122 | }
123 | ],
124 | "source": [
125 | "nn.head()"
126 | ]
127 | },
128 | {
129 | "cell_type": "code",
130 | "execution_count": 7,
131 | "metadata": {
132 | "collapsed": true
133 | },
134 | "outputs": [],
135 | "source": [
136 | "result = xgb.pred_prob*0.7 + lgb.pred_prob*0.1 + gbdt.pred_prob*0.1 + nn.pred_prob*0.1"
137 | ]
138 | },
139 | {
140 | "cell_type": "code",
141 | "execution_count": 8,
142 | "metadata": {
143 | "collapsed": true
144 | },
145 | "outputs": [],
146 | "source": [
147 | "pred = nn\n",
148 | "pred.pred_prob = result"
149 | ]
150 | },
151 | {
152 | "cell_type": "code",
153 | "execution_count": 9,
154 | "metadata": {
155 | "collapsed": false
156 | },
157 | "outputs": [
158 | {
159 | "data": {
160 | "text/html": [
161 | "<div>\n",
162 | "<table border=\"1\" class=\"dataframe\">\n",
163 | "  <thead>\n",
164 | "    <tr style=\"text-align: right;\">\n",
165 | "      <th></th>\n",
166 | "      <th>cust_id</th>\n",
167 | "      <th>pred_prob</th>\n",
168 | "    </tr>\n",
169 | "  </thead>\n",
170 | "  <tbody>\n",
171 | "    <tr>\n",
172 | "      <th>0</th>\n",
173 | "      <td>1</td>\n",
174 | "      <td>0.016251</td>\n",
175 | "    </tr>\n",
176 | "    <tr>\n",
177 | "      <th>1</th>\n",
178 | "      <td>2</td>\n",
179 | "      <td>0.031600</td>\n",
180 | "    </tr>\n",
181 | "    <tr>\n",
182 | "      <th>2</th>\n",
183 | "      <td>3</td>\n",
184 | "      <td>0.076029</td>\n",
185 | "    </tr>\n",
186 | "    <tr>\n",
187 | "      <th>3</th>\n",
188 | "      <td>4</td>\n",
189 | "      <td>0.058965</td>\n",
190 | "    </tr>\n",
191 | "    <tr>\n",
192 | "      <th>4</th>\n",
193 | "      <td>5</td>\n",
194 | "      <td>0.055669</td>\n",
195 | "    </tr>\n",
196 | "  </tbody>\n",
197 | "</table>\n",
198 | "</div>"
199 | ],
200 | "text/plain": [
201 | " cust_id pred_prob\n",
202 | "0 1 0.016251\n",
203 | "1 2 0.031600\n",
204 | "2 3 0.076029\n",
205 | "3 4 0.058965\n",
206 | "4 5 0.055669"
207 | ]
208 | },
209 | "execution_count": 9,
210 | "metadata": {},
211 | "output_type": "execute_result"
212 | }
213 | ],
214 | "source": [
215 | "pred.head()"
216 | ]
217 | },
218 | {
219 | "cell_type": "code",
220 | "execution_count": 10,
221 | "metadata": {
222 | "collapsed": true
223 | },
224 | "outputs": [],
225 | "source": [
226 | "pred.to_csv('../result/semi_ensemble.csv',index = False)"
227 | ]
228 | },
229 | {
230 | "cell_type": "code",
231 | "execution_count": null,
232 | "metadata": {
233 | "collapsed": true
234 | },
235 | "outputs": [],
236 | "source": []
237 | }
238 | ],
239 | "metadata": {
240 | "anaconda-cloud": {},
241 | "kernelspec": {
242 | "display_name": "Python [default]",
243 | "language": "python",
244 | "name": "python3"
245 | },
246 | "language_info": {
247 | "codemirror_mode": {
248 | "name": "ipython",
249 | "version": 3
250 | },
251 | "file_extension": ".py",
252 | "mimetype": "text/x-python",
253 | "name": "python",
254 | "nbconvert_exporter": "python",
255 | "pygments_lexer": "ipython3",
256 | "version": "3.5.2"
257 | }
258 | },
259 | "nbformat": 4,
260 | "nbformat_minor": 1
261 | }
262 |
--------------------------------------------------------------------------------
/模型二代码b榜0.749880/code/.ipynb_checkpoints/_ensemble-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import numpy as np\n",
12 | "import pandas as pd\n",
13 | "import matplotlib.pyplot as plt\n",
14 | "%matplotlib inline "
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 2,
20 | "metadata": {
21 | "collapsed": true
22 | },
23 | "outputs": [],
24 | "source": [
25 | "xgb = pd.read_csv('../result/xgb.csv')"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 3,
31 | "metadata": {
32 | "collapsed": true
33 | },
34 | "outputs": [],
35 | "source": [
36 | "lgb = pd.read_csv('../result/lgb.csv')"
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": 4,
42 | "metadata": {
43 | "collapsed": true
44 | },
45 | "outputs": [],
46 | "source": [
47 | "gbdt = pd.read_csv('../result/gbdt.csv')"
48 | ]
49 | },
50 | {
51 | "cell_type": "code",
52 | "execution_count": 5,
53 | "metadata": {
54 | "collapsed": true
55 | },
56 | "outputs": [],
57 | "source": [
58 | "nn = pd.read_csv('../result/nn.csv')"
59 | ]
60 | },
61 | {
62 | "cell_type": "code",
63 | "execution_count": 6,
64 | "metadata": {
65 | "collapsed": false
66 | },
67 | "outputs": [
68 | {
69 | "data": {
70 | "text/html": [
71 | "<div>\n",
72 | "<table border=\"1\" class=\"dataframe\">\n",
73 | "  <thead>\n",
74 | "    <tr style=\"text-align: right;\">\n",
75 | "      <th></th>\n",
76 | "      <th>cust_id</th>\n",
77 | "      <th>pred_prob</th>\n",
78 | "    </tr>\n",
79 | "  </thead>\n",
80 | "  <tbody>\n",
81 | "    <tr>\n",
82 | "      <th>0</th>\n",
83 | "      <td>1</td>\n",
84 | "      <td>0.005983</td>\n",
85 | "    </tr>\n",
86 | "    <tr>\n",
87 | "      <th>1</th>\n",
88 | "      <td>2</td>\n",
89 | "      <td>0.025552</td>\n",
90 | "    </tr>\n",
91 | "    <tr>\n",
92 | "      <th>2</th>\n",
93 | "      <td>3</td>\n",
94 | "      <td>0.018066</td>\n",
95 | "    </tr>\n",
96 | "    <tr>\n",
97 | "      <th>3</th>\n",
98 | "      <td>4</td>\n",
99 | "      <td>0.035206</td>\n",
100 | "    </tr>\n",
101 | "    <tr>\n",
102 | "      <th>4</th>\n",
103 | "      <td>5</td>\n",
104 | "      <td>0.067734</td>\n",
105 | "    </tr>\n",
106 | "  </tbody>\n",
107 | "</table>\n",
108 | "</div>"
109 | ],
110 | "text/plain": [
111 | " cust_id pred_prob\n",
112 | "0 1 0.005983\n",
113 | "1 2 0.025552\n",
114 | "2 3 0.018066\n",
115 | "3 4 0.035206\n",
116 | "4 5 0.067734"
117 | ]
118 | },
119 | "execution_count": 6,
120 | "metadata": {},
121 | "output_type": "execute_result"
122 | }
123 | ],
124 | "source": [
125 | "nn.head()"
126 | ]
127 | },
128 | {
129 | "cell_type": "code",
130 | "execution_count": 7,
131 | "metadata": {
132 | "collapsed": false
133 | },
134 | "outputs": [],
135 | "source": [
136 | "result = xgb.pred_prob*0.7 + lgb.pred_prob*0.1 + gbdt.pred_prob*0.1 + nn.pred_prob*0.1"
137 | ]
138 | },
139 | {
140 | "cell_type": "code",
141 | "execution_count": 8,
142 | "metadata": {
143 | "collapsed": false
144 | },
145 | "outputs": [],
146 | "source": [
147 | "pred = nn\n",
148 | "pred.pred_prob = result"
149 | ]
150 | },
151 | {
152 | "cell_type": "code",
153 | "execution_count": 9,
154 | "metadata": {
155 | "collapsed": false
156 | },
157 | "outputs": [
158 | {
159 | "data": {
160 | "text/html": [
161 | "<div>\n",
162 | "<table border=\"1\" class=\"dataframe\">\n",
163 | "  <thead>\n",
164 | "    <tr style=\"text-align: right;\">\n",
165 | "      <th></th>\n",
166 | "      <th>cust_id</th>\n",
167 | "      <th>pred_prob</th>\n",
168 | "    </tr>\n",
169 | "  </thead>\n",
170 | "  <tbody>\n",
171 | "    <tr>\n",
172 | "      <th>0</th>\n",
173 | "      <td>1</td>\n",
174 | "      <td>0.014872</td>\n",
175 | "    </tr>\n",
176 | "    <tr>\n",
177 | "      <th>1</th>\n",
178 | "      <td>2</td>\n",
179 | "      <td>0.036687</td>\n",
180 | "    </tr>\n",
181 | "    <tr>\n",
182 | "      <th>2</th>\n",
183 | "      <td>3</td>\n",
184 | "      <td>0.194181</td>\n",
185 | "    </tr>\n",
186 | "    <tr>\n",
187 | "      <th>3</th>\n",
188 | "      <td>4</td>\n",
189 | "      <td>0.064974</td>\n",
190 | "    </tr>\n",
191 | "    <tr>\n",
192 | "      <th>4</th>\n",
193 | "      <td>5</td>\n",
194 | "      <td>0.118509</td>\n",
195 | "    </tr>\n",
196 | "  </tbody>\n",
197 | "</table>\n",
198 | "</div>"
199 | ],
200 | "text/plain": [
201 | " cust_id pred_prob\n",
202 | "0 1 0.014872\n",
203 | "1 2 0.036687\n",
204 | "2 3 0.194181\n",
205 | "3 4 0.064974\n",
206 | "4 5 0.118509"
207 | ]
208 | },
209 | "execution_count": 9,
210 | "metadata": {},
211 | "output_type": "execute_result"
212 | }
213 | ],
214 | "source": [
215 | "pred.head()"
216 | ]
217 | },
218 | {
219 | "cell_type": "code",
220 | "execution_count": 10,
221 | "metadata": {
222 | "collapsed": true
223 | },
224 | "outputs": [],
225 | "source": [
226 | "pred.to_csv('../result/_ensemble.csv',index = False)"
227 | ]
228 | },
229 | {
230 | "cell_type": "code",
231 | "execution_count": null,
232 | "metadata": {
233 | "collapsed": true
234 | },
235 | "outputs": [],
236 | "source": []
237 | }
238 | ],
239 | "metadata": {
240 | "anaconda-cloud": {},
241 | "kernelspec": {
242 | "display_name": "Python [default]",
243 | "language": "python",
244 | "name": "python3"
245 | },
246 | "language_info": {
247 | "codemirror_mode": {
248 | "name": "ipython",
249 | "version": 3
250 | },
251 | "file_extension": ".py",
252 | "mimetype": "text/x-python",
253 | "name": "python",
254 | "nbconvert_exporter": "python",
255 | "pygments_lexer": "ipython3",
256 | "version": "3.5.2"
257 | }
258 | },
259 | "nbformat": 4,
260 | "nbformat_minor": 1
261 | }
262 |
--------------------------------------------------------------------------------
/模型二代码b榜0.749880/code/.ipynb_checkpoints/semi_ensemble-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import numpy as np\n",
12 | "import pandas as pd\n",
13 | "import matplotlib.pyplot as plt\n",
14 | "%matplotlib inline"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 2,
20 | "metadata": {
21 | "collapsed": true
22 | },
23 | "outputs": [],
24 | "source": [
25 | "xgb = pd.read_csv('../result/semi_xgb.csv')"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 3,
31 | "metadata": {
32 | "collapsed": true
33 | },
34 | "outputs": [],
35 | "source": [
36 | "lgb = pd.read_csv('../result/semi_lgb.csv')"
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": 4,
42 | "metadata": {
43 | "collapsed": true
44 | },
45 | "outputs": [],
46 | "source": [
47 | "gbdt = pd.read_csv('../result/semi_gbdt.csv')"
48 | ]
49 | },
50 | {
51 | "cell_type": "code",
52 | "execution_count": 5,
53 | "metadata": {
54 | "collapsed": true
55 | },
56 | "outputs": [],
57 | "source": [
58 | "nn = pd.read_csv('../result/semi_nn.csv')"
59 | ]
60 | },
61 | {
62 | "cell_type": "code",
63 | "execution_count": 6,
64 | "metadata": {
65 | "collapsed": false
66 | },
67 | "outputs": [
68 | {
69 | "data": {
70 | "text/html": [
71 | "<div>\n",
72 | "<table border=\"1\" class=\"dataframe\">\n",
73 | "  <thead>\n",
74 | "    <tr style=\"text-align: right;\">\n",
75 | "      <th></th>\n",
76 | "      <th>cust_id</th>\n",
77 | "      <th>pred_prob</th>\n",
78 | "    </tr>\n",
79 | "  </thead>\n",
80 | "  <tbody>\n",
81 | "    <tr>\n",
82 | "      <th>0</th>\n",
83 | "      <td>1</td>\n",
84 | "      <td>0.003583</td>\n",
85 | "    </tr>\n",
86 | "    <tr>\n",
87 | "      <th>1</th>\n",
88 | "      <td>2</td>\n",
89 | "      <td>0.069257</td>\n",
90 | "    </tr>\n",
91 | "    <tr>\n",
92 | "      <th>2</th>\n",
93 | "      <td>3</td>\n",
94 | "      <td>0.015288</td>\n",
95 | "    </tr>\n",
96 | "    <tr>\n",
97 | "      <th>3</th>\n",
98 | "      <td>4</td>\n",
99 | "      <td>0.084963</td>\n",
100 | "    </tr>\n",
101 | "    <tr>\n",
102 | "      <th>4</th>\n",
103 | "      <td>5</td>\n",
104 | "      <td>0.064052</td>\n",
105 | "    </tr>\n",
106 | "  </tbody>\n",
107 | "</table>\n",
108 | "</div>"
109 | ],
110 | "text/plain": [
111 | " cust_id pred_prob\n",
112 | "0 1 0.003583\n",
113 | "1 2 0.069257\n",
114 | "2 3 0.015288\n",
115 | "3 4 0.084963\n",
116 | "4 5 0.064052"
117 | ]
118 | },
119 | "execution_count": 6,
120 | "metadata": {},
121 | "output_type": "execute_result"
122 | }
123 | ],
124 | "source": [
125 | "nn.head()"
126 | ]
127 | },
128 | {
129 | "cell_type": "code",
130 | "execution_count": 7,
131 | "metadata": {
132 | "collapsed": true
133 | },
134 | "outputs": [],
135 | "source": [
136 | "result = xgb.pred_prob*0.7 + lgb.pred_prob*0.1 + gbdt.pred_prob*0.1 + nn.pred_prob*0.1"
137 | ]
138 | },
139 | {
140 | "cell_type": "code",
141 | "execution_count": 8,
142 | "metadata": {
143 | "collapsed": true
144 | },
145 | "outputs": [],
146 | "source": [
147 | "pred = nn\n",
148 | "pred.pred_prob = result"
149 | ]
150 | },
151 | {
152 | "cell_type": "code",
153 | "execution_count": 9,
154 | "metadata": {
155 | "collapsed": false
156 | },
157 | "outputs": [
158 | {
159 | "data": {
160 | "text/html": [
161 | "<div>\n",
162 | "<table border=\"1\" class=\"dataframe\">\n",
163 | "  <thead>\n",
164 | "    <tr style=\"text-align: right;\">\n",
165 | "      <th></th>\n",
166 | "      <th>cust_id</th>\n",
167 | "      <th>pred_prob</th>\n",
168 | "    </tr>\n",
169 | "  </thead>\n",
170 | "  <tbody>\n",
171 | "    <tr>\n",
172 | "      <th>0</th>\n",
173 | "      <td>1</td>\n",
174 | "      <td>0.016251</td>\n",
175 | "    </tr>\n",
176 | "    <tr>\n",
177 | "      <th>1</th>\n",
178 | "      <td>2</td>\n",
179 | "      <td>0.031600</td>\n",
180 | "    </tr>\n",
181 | "    <tr>\n",
182 | "      <th>2</th>\n",
183 | "      <td>3</td>\n",
184 | "      <td>0.076029</td>\n",
185 | "    </tr>\n",
186 | "    <tr>\n",
187 | "      <th>3</th>\n",
188 | "      <td>4</td>\n",
189 | "      <td>0.058965</td>\n",
190 | "    </tr>\n",
191 | "    <tr>\n",
192 | "      <th>4</th>\n",
193 | "      <td>5</td>\n",
194 | "      <td>0.055669</td>\n",
195 | "    </tr>\n",
196 | "  </tbody>\n",
197 | "</table>\n",
198 | "</div>"
199 | ],
200 | "text/plain": [
201 | " cust_id pred_prob\n",
202 | "0 1 0.016251\n",
203 | "1 2 0.031600\n",
204 | "2 3 0.076029\n",
205 | "3 4 0.058965\n",
206 | "4 5 0.055669"
207 | ]
208 | },
209 | "execution_count": 9,
210 | "metadata": {},
211 | "output_type": "execute_result"
212 | }
213 | ],
214 | "source": [
215 | "pred.head()"
216 | ]
217 | },
218 | {
219 | "cell_type": "code",
220 | "execution_count": 10,
221 | "metadata": {
222 | "collapsed": true
223 | },
224 | "outputs": [],
225 | "source": [
226 | "pred.to_csv('../result/semi_ensemble.csv',index = False)"
227 | ]
228 | },
229 | {
230 | "cell_type": "code",
231 | "execution_count": null,
232 | "metadata": {
233 | "collapsed": true
234 | },
235 | "outputs": [],
236 | "source": []
237 | }
238 | ],
239 | "metadata": {
240 | "anaconda-cloud": {},
241 | "kernelspec": {
242 | "display_name": "Python [default]",
243 | "language": "python",
244 | "name": "python3"
245 | },
246 | "language_info": {
247 | "codemirror_mode": {
248 | "name": "ipython",
249 | "version": 3
250 | },
251 | "file_extension": ".py",
252 | "mimetype": "text/x-python",
253 | "name": "python",
254 | "nbconvert_exporter": "python",
255 | "pygments_lexer": "ipython3",
256 | "version": "3.5.2"
257 | }
258 | },
259 | "nbformat": 4,
260 | "nbformat_minor": 1
261 | }
262 |
--------------------------------------------------------------------------------
/模型二代码b榜0.749880/code/lgb预测.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import numpy as np\n",
12 | "import pandas as pd\n",
13 | "import matplotlib.pyplot as plt\n",
14 | "%matplotlib inline "
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 4,
20 | "metadata": {
21 | "collapsed": false
22 | },
23 | "outputs": [],
24 | "source": [
25 | "train = pd.read_csv('../data/train_xy.csv')"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 5,
31 | "metadata": {
32 | "collapsed": false
33 | },
34 | "outputs": [],
35 | "source": [
36 | "test = pd.read_csv('../data/test_all.csv')"
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": 7,
42 | "metadata": {
43 | "collapsed": false
44 | },
45 | "outputs": [
46 | {
47 | "data": {
48 | "text/plain": [
49 | "(10000, 157)"
50 | ]
51 | },
52 | "execution_count": 7,
53 | "metadata": {},
54 | "output_type": "execute_result"
55 | }
56 | ],
57 | "source": [
58 | "x_train = train.drop(['cust_group','y','\\ufeffcust_id'],axis =1)\n",
59 | "x_test = test.drop(['cust_group','cust_id'],axis=1)\n",
60 | "x_train.shape\n",
61 | "x_test.shape"
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": 8,
67 | "metadata": {
68 | "collapsed": false
69 | },
70 | "outputs": [
71 | {
72 | "data": {
73 | "text/plain": [
74 | "(25000, 157)"
75 | ]
76 | },
77 | "execution_count": 8,
78 | "metadata": {},
79 | "output_type": "execute_result"
80 | }
81 | ],
82 | "source": [
83 | "x = pd.concat([x_train,x_test])\n",
84 | "x.shape"
85 | ]
86 | },
87 | {
88 | "cell_type": "code",
89 | "execution_count": 9,
90 | "metadata": {
91 | "collapsed": true
92 | },
93 | "outputs": [],
94 | "source": [
95 | "Y_train = train['y']"
96 | ]
97 | },
98 | {
99 | "cell_type": "code",
100 | "execution_count": 10,
101 | "metadata": {
102 | "collapsed": false
103 | },
104 | "outputs": [],
105 | "source": [
106 | "for i in range(96, 158):  # one-hot encode x_96 .. x_157; the source columns stay in x as well\n",
107 | "    col = 'x_%d' % i\n",
108 | "    dummies_df = pd.get_dummies(x[col]).rename(columns=lambda level: col + str(level))\n",
109 | "    x = pd.concat([x, dummies_df], axis=1)"
110 | ]
111 | },
112 | {
113 | "cell_type": "code",
114 | "execution_count": 11,
115 | "metadata": {
116 | "collapsed": false
117 | },
118 | "outputs": [
119 | {
120 | "name": "stdout",
121 | "output_type": "stream",
122 | "text": [
123 | "(15000, 355)\n",
124 | "(10000, 355)\n"
125 | ]
126 | }
127 | ],
128 | "source": [
129 | "train_X = x[:len(x_train)]  # train rows were concatenated first; avoids hard-coding 15000\n",
130 | "test_X = x[len(x_train):]\n",
131 | "print(train_X.shape)\n",
132 | "print(test_X.shape)"
133 | ]
134 | },
135 | {
136 | "cell_type": "code",
137 | "execution_count": 12,
138 | "metadata": {
139 | "collapsed": true
140 | },
141 | "outputs": [],
142 | "source": [
143 | "from sklearn.metrics import accuracy_score\n",
144 | "from sklearn import metrics\n",
145 | "from sklearn.model_selection import train_test_split\n",
146 | "from xgboost import XGBClassifier\n",
147 | "import lightgbm as lgb"
148 | ]
149 | },
150 | {
151 | "cell_type": "code",
152 | "execution_count": 13,
153 | "metadata": {
154 | "collapsed": false
155 | },
156 | "outputs": [],
157 | "source": [
158 | "X_train,X_val,y_train,y_val= train_test_split(train_X,Y_train,test_size=0.2,random_state=2)"
159 | ]
160 | },
161 | {
162 | "cell_type": "code",
163 | "execution_count": 14,
164 | "metadata": {
165 | "collapsed": false
166 | },
167 | "outputs": [],
168 | "source": [
169 | "lgb_train = lgb.Dataset(X_train, y_train)\n",
170 | "lgb_val = lgb.Dataset(X_val, y_val, reference=lgb_train)"
171 | ]
172 | },
173 | {
174 | "cell_type": "code",
175 | "execution_count": 15,
176 | "metadata": {
177 | "collapsed": false,
178 | "scrolled": true
179 | },
180 | "outputs": [],
181 | "source": [
182 | "param = dict(\n",
183 | "    max_depth=6,\n",
184 | "    num_leaves=80,\n",
185 | "    learning_rate=0.03,\n",
186 | "    scale_pos_weight=1,\n",
187 | "    num_threads=40,\n",
188 | "    objective='binary',\n",
189 | "    bagging_fraction=0.7,\n",
190 | "    bagging_freq=1,\n",
191 | "    min_sum_hessian_in_leaf=100,\n",
192 | "    is_unbalance='true',\n",
193 | "    metric='auc',\n",
194 | ")\n",
195 | "\n",
196 | "# 3-fold CV with early stopping is used only to choose the number of boosting rounds;\n",
197 | "# the final booster is then trained on lgb_train for that many rounds.\n",
198 | "bst = lgb.cv(param, lgb_train, num_boost_round=1000, nfold=3, early_stopping_rounds=30)\n",
199 | "gbm = lgb.train(param, lgb_train, num_boost_round=len(bst['auc-mean']))\n"
200 | ]
201 | },
202 | {
203 | "cell_type": "code",
204 | "execution_count": 16,
205 | "metadata": {
206 | "collapsed": false
207 | },
208 | "outputs": [
209 | {
210 | "name": "stdout",
211 | "output_type": "stream",
212 | "text": [
213 | "0.809795678593\n"
214 | ]
215 | }
216 | ],
217 | "source": [
218 | "ypred = gbm.predict(X_val)\n",
219 | "val_auc = metrics.roc_auc_score(y_true=y_val, y_score=ypred)  # AUC on the validation set\n",
220 | "print(val_auc)"
221 | ]
222 | },
223 | {
224 | "cell_type": "code",
225 | "execution_count": 17,
226 | "metadata": {
227 | "collapsed": false
228 | },
229 | "outputs": [],
230 | "source": [
231 | "y_pred = gbm.predict(test_X)"
232 | ]
233 | },
234 | {
235 | "cell_type": "code",
236 | "execution_count": 18,
237 | "metadata": {
238 | "collapsed": true
239 | },
240 | "outputs": [],
241 | "source": [
242 | "Submission = pd.DataFrame({'cust_id': test['cust_id'], 'pred_prob': y_pred})\n",
243 | "Submission.to_csv('../result/lgb.csv',index=False)"
244 | ]
245 | },
246 | {
247 | "cell_type": "code",
248 | "execution_count": null,
249 | "metadata": {
250 | "collapsed": true
251 | },
252 | "outputs": [],
253 | "source": []
254 | }
255 | ],
256 | "metadata": {
257 | "anaconda-cloud": {},
258 | "kernelspec": {
259 | "display_name": "Python [default]",
260 | "language": "python",
261 | "name": "python3"
262 | },
263 | "language_info": {
264 | "codemirror_mode": {
265 | "name": "ipython",
266 | "version": 3
267 | },
268 | "file_extension": ".py",
269 | "mimetype": "text/x-python",
270 | "name": "python",
271 | "nbconvert_exporter": "python",
272 | "pygments_lexer": "ipython3",
273 | "version": "3.5.2"
274 | }
275 | },
276 | "nbformat": 4,
277 | "nbformat_minor": 1
278 | }
279 |
--------------------------------------------------------------------------------
/模型二代码b榜0.749880/code/.ipynb_checkpoints/lgb预测-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import numpy as np\n",
12 | "import pandas as pd\n",
13 | "import matplotlib.pyplot as plt\n",
14 | "%matplotlib inline "
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 4,
20 | "metadata": {
21 | "collapsed": false
22 | },
23 | "outputs": [],
24 | "source": [
25 | "train = pd.read_csv('../data/train_xy.csv')"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 5,
31 | "metadata": {
32 | "collapsed": false
33 | },
34 | "outputs": [],
35 | "source": [
36 | "test = pd.read_csv('../data/test_all.csv')"
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": 7,
42 | "metadata": {
43 | "collapsed": false
44 | },
45 | "outputs": [
46 | {
47 | "data": {
48 | "text/plain": [
49 | "(10000, 157)"
50 | ]
51 | },
52 | "execution_count": 7,
53 | "metadata": {},
54 | "output_type": "execute_result"
55 | }
56 | ],
57 | "source": [
58 | "x_train = train.drop(['cust_group','y','\\ufeffcust_id'],axis =1)\n",
59 | "x_test = test.drop(['cust_group','cust_id'],axis=1)\n",
60 | "x_train.shape\n",
61 | "x_test.shape"
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": 8,
67 | "metadata": {
68 | "collapsed": false
69 | },
70 | "outputs": [
71 | {
72 | "data": {
73 | "text/plain": [
74 | "(25000, 157)"
75 | ]
76 | },
77 | "execution_count": 8,
78 | "metadata": {},
79 | "output_type": "execute_result"
80 | }
81 | ],
82 | "source": [
83 | "x = pd.concat([x_train,x_test])\n",
84 | "x.shape"
85 | ]
86 | },
87 | {
88 | "cell_type": "code",
89 | "execution_count": 9,
90 | "metadata": {
91 | "collapsed": true
92 | },
93 | "outputs": [],
94 | "source": [
95 | "Y_train = train['y']"
96 | ]
97 | },
98 | {
99 | "cell_type": "code",
100 | "execution_count": 10,
101 | "metadata": {
102 | "collapsed": false
103 | },
104 | "outputs": [],
105 | "source": [
106 | "for i in range(96, 158):  # one-hot encode x_96 .. x_157; the source columns stay in x as well\n",
107 | "    col = 'x_%d' % i\n",
108 | "    dummies_df = pd.get_dummies(x[col]).rename(columns=lambda level: col + str(level))\n",
109 | "    x = pd.concat([x, dummies_df], axis=1)"
110 | ]
111 | },
112 | {
113 | "cell_type": "code",
114 | "execution_count": 11,
115 | "metadata": {
116 | "collapsed": false
117 | },
118 | "outputs": [
119 | {
120 | "name": "stdout",
121 | "output_type": "stream",
122 | "text": [
123 | "(15000, 355)\n",
124 | "(10000, 355)\n"
125 | ]
126 | }
127 | ],
128 | "source": [
129 | "train_X = x[:len(x_train)]  # train rows were concatenated first; avoids hard-coding 15000\n",
130 | "test_X = x[len(x_train):]\n",
131 | "print(train_X.shape)\n",
132 | "print(test_X.shape)"
133 | ]
134 | },
135 | {
136 | "cell_type": "code",
137 | "execution_count": 12,
138 | "metadata": {
139 | "collapsed": true
140 | },
141 | "outputs": [],
142 | "source": [
143 | "from sklearn.metrics import accuracy_score\n",
144 | "from sklearn import metrics\n",
145 | "from sklearn.model_selection import train_test_split\n",
146 | "from xgboost import XGBClassifier\n",
147 | "import lightgbm as lgb"
148 | ]
149 | },
150 | {
151 | "cell_type": "code",
152 | "execution_count": 13,
153 | "metadata": {
154 | "collapsed": false
155 | },
156 | "outputs": [],
157 | "source": [
158 | "X_train,X_val,y_train,y_val= train_test_split(train_X,Y_train,test_size=0.2,random_state=2)"
159 | ]
160 | },
161 | {
162 | "cell_type": "code",
163 | "execution_count": 14,
164 | "metadata": {
165 | "collapsed": false
166 | },
167 | "outputs": [],
168 | "source": [
169 | "lgb_train = lgb.Dataset(X_train, y_train)\n",
170 | "lgb_val = lgb.Dataset(X_val, y_val, reference=lgb_train)"
171 | ]
172 | },
173 | {
174 | "cell_type": "code",
175 | "execution_count": 15,
176 | "metadata": {
177 | "collapsed": false,
178 | "scrolled": true
179 | },
180 | "outputs": [],
181 | "source": [
182 | "param = dict(\n",
183 | "    max_depth=6,\n",
184 | "    num_leaves=80,\n",
185 | "    learning_rate=0.03,\n",
186 | "    scale_pos_weight=1,\n",
187 | "    num_threads=40,\n",
188 | "    objective='binary',\n",
189 | "    bagging_fraction=0.7,\n",
190 | "    bagging_freq=1,\n",
191 | "    min_sum_hessian_in_leaf=100,\n",
192 | "    is_unbalance='true',\n",
193 | "    metric='auc',\n",
194 | ")\n",
195 | "\n",
196 | "# 3-fold CV with early stopping is used only to choose the number of boosting rounds;\n",
197 | "# the final booster is then trained on lgb_train for that many rounds.\n",
198 | "bst = lgb.cv(param, lgb_train, num_boost_round=1000, nfold=3, early_stopping_rounds=30)\n",
199 | "gbm = lgb.train(param, lgb_train, num_boost_round=len(bst['auc-mean']))\n"
200 | ]
201 | },
202 | {
203 | "cell_type": "code",
204 | "execution_count": 16,
205 | "metadata": {
206 | "collapsed": false
207 | },
208 | "outputs": [
209 | {
210 | "name": "stdout",
211 | "output_type": "stream",
212 | "text": [
213 | "0.809795678593\n"
214 | ]
215 | }
216 | ],
217 | "source": [
218 | "ypred = gbm.predict(X_val)\n",
219 | "val_auc = metrics.roc_auc_score(y_true=y_val, y_score=ypred)  # AUC on the validation set\n",
220 | "print(val_auc)"
221 | ]
222 | },
223 | {
224 | "cell_type": "code",
225 | "execution_count": 17,
226 | "metadata": {
227 | "collapsed": false
228 | },
229 | "outputs": [],
230 | "source": [
231 | "y_pred = gbm.predict(test_X)"
232 | ]
233 | },
234 | {
235 | "cell_type": "code",
236 | "execution_count": 18,
237 | "metadata": {
238 | "collapsed": true
239 | },
240 | "outputs": [],
241 | "source": [
242 | "Submission = pd.DataFrame({'cust_id': test['cust_id'], 'pred_prob': y_pred})\n",
243 | "Submission.to_csv('../result/lgb.csv',index=False)"
244 | ]
245 | },
246 | {
247 | "cell_type": "code",
248 | "execution_count": null,
249 | "metadata": {
250 | "collapsed": true
251 | },
252 | "outputs": [],
253 | "source": []
254 | }
255 | ],
256 | "metadata": {
257 | "anaconda-cloud": {},
258 | "kernelspec": {
259 | "display_name": "Python [default]",
260 | "language": "python",
261 | "name": "python3"
262 | },
263 | "language_info": {
264 | "codemirror_mode": {
265 | "name": "ipython",
266 | "version": 3
267 | },
268 | "file_extension": ".py",
269 | "mimetype": "text/x-python",
270 | "name": "python",
271 | "nbconvert_exporter": "python",
272 | "pygments_lexer": "ipython3",
273 | "version": "3.5.2"
274 | }
275 | },
276 | "nbformat": 4,
277 | "nbformat_minor": 1
278 | }
279 |
--------------------------------------------------------------------------------
/模型二代码b榜0.749880/code/.ipynb_checkpoints/xgb预测-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import numpy as np\n",
12 | "import pandas as pd\n",
13 | "import matplotlib.pyplot as plt\n",
14 | "%matplotlib inline"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 4,
20 | "metadata": {
21 | "collapsed": false
22 | },
23 | "outputs": [],
24 | "source": [
25 | "train = pd.read_csv('../data/train_xy.csv')"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 5,
31 | "metadata": {
32 | "collapsed": false
33 | },
34 | "outputs": [],
35 | "source": [
36 | "test = pd.read_csv('../data/test_all.csv')"
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": 7,
42 | "metadata": {
43 | "collapsed": false
44 | },
45 | "outputs": [
46 | {
47 | "data": {
48 | "text/plain": [
49 | "(10000, 157)"
50 | ]
51 | },
52 | "execution_count": 7,
53 | "metadata": {},
54 | "output_type": "execute_result"
55 | }
56 | ],
57 | "source": [
58 | "x_train = train.drop(['cust_group','y','\\ufeffcust_id'],axis =1)\n",
59 | "x_test = test.drop(['cust_group','cust_id'],axis=1)\n",
60 | "x_train.shape\n",
61 | "x_test.shape"
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": 8,
67 | "metadata": {
68 | "collapsed": false
69 | },
70 | "outputs": [
71 | {
72 | "data": {
73 | "text/plain": [
74 | "(25000, 157)"
75 | ]
76 | },
77 | "execution_count": 8,
78 | "metadata": {},
79 | "output_type": "execute_result"
80 | }
81 | ],
82 | "source": [
83 | "x = pd.concat([x_train,x_test])\n",
84 | "x.shape"
85 | ]
86 | },
87 | {
88 | "cell_type": "code",
89 | "execution_count": 9,
90 | "metadata": {
91 | "collapsed": true
92 | },
93 | "outputs": [],
94 | "source": [
95 | "Y_train = train['y']"
96 | ]
97 | },
98 | {
99 | "cell_type": "code",
100 | "execution_count": 10,
101 | "metadata": {
102 | "collapsed": false
103 | },
104 | "outputs": [],
105 | "source": [
106 | "for i in range(96, 158):  # one-hot encode x_96 .. x_157; the source columns stay in x as well\n",
107 | "    col = 'x_%d' % i\n",
108 | "    dummies_df = pd.get_dummies(x[col]).rename(columns=lambda level: col + str(level))\n",
109 | "    x = pd.concat([x, dummies_df], axis=1)"
110 | ]
111 | },
112 | {
113 | "cell_type": "code",
114 | "execution_count": 11,
115 | "metadata": {
116 | "collapsed": false
117 | },
118 | "outputs": [
119 | {
120 | "name": "stdout",
121 | "output_type": "stream",
122 | "text": [
123 | "(15000, 355)\n",
124 | "(10000, 355)\n"
125 | ]
126 | }
127 | ],
128 | "source": [
129 | "train_X = x[:len(x_train)]  # train rows were concatenated first; avoids hard-coding 15000\n",
130 | "test_X = x[len(x_train):]\n",
131 | "print(train_X.shape)\n",
132 | "print(test_X.shape)"
133 | ]
134 | },
135 | {
136 | "cell_type": "code",
137 | "execution_count": 12,
138 | "metadata": {
139 | "collapsed": false
140 | },
141 | "outputs": [
142 | {
143 | "name": "stderr",
144 | "output_type": "stream",
145 | "text": [
146 | "C:\\Program Files\\Anaconda3\\lib\\site-packages\\sklearn\\cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n",
147 | " \"This module will be removed in 0.20.\", DeprecationWarning)\n",
148 | "Using TensorFlow backend.\n"
149 | ]
150 | }
151 | ],
152 | "source": [
153 | "from sklearn.tree import DecisionTreeClassifier\n",
154 | "from sklearn.ensemble import RandomForestClassifier\n",
155 | "from sklearn.ensemble import AdaBoostClassifier\n",
156 | "from sklearn.ensemble import ExtraTreesClassifier\n",
157 | "from sklearn.ensemble import GradientBoostingClassifier\n",
158 | "from sklearn.neighbors import KNeighborsClassifier\n",
159 | "from sklearn.svm import SVC\n",
160 | "from sklearn import metrics #accuracy_score,recall_score,f1_score\n",
161 | "from sklearn.metrics import classification_report\n",
162 | "from sklearn.metrics import precision_recall_fscore_support\n",
163 | "from sklearn.utils.multiclass import unique_labels\n",
164 | "from sklearn.metrics import accuracy_score\n",
165 | "from xgboost import XGBClassifier\n",
166 | "from sklearn.ensemble import GradientBoostingClassifier\n",
167 | "from sklearn.model_selection import cross_val_score  # sklearn.cross_validation is removed in scikit-learn 0.20\n",
168 | "from lightgbm import LGBMClassifier\n",
169 | "from sklearn.model_selection import train_test_split, GridSearchCV, KFold\n",
170 | "from sklearn.svm import LinearSVC\n",
171 | "from sklearn import linear_model\n",
172 | "import lightgbm as lgb\n",
173 | "import xgboost as xgb\n",
174 | "\n",
175 | "from keras.models import Model\n",
176 | "from keras.layers import Dense, Input"
177 | ]
178 | },
179 | {
180 | "cell_type": "code",
181 | "execution_count": 14,
182 | "metadata": {
183 | "collapsed": false
184 | },
185 | "outputs": [],
186 | "source": [
187 | "X_train,X_val,y_train,y_val= train_test_split(train_X,Y_train,test_size=0.2,random_state=2)"
188 | ]
189 | },
190 | {
191 | "cell_type": "code",
192 | "execution_count": 15,
193 | "metadata": {
194 | "collapsed": true
195 | },
196 | "outputs": [],
197 | "source": [
198 | "xgb_train = xgb.DMatrix(X_train, y_train)\n",
199 | "xgb_val = xgb.DMatrix(X_val, y_val)\n",
200 | "xgb_test = xgb.DMatrix(test_X)"
201 | ]
202 | },
203 | {
204 | "cell_type": "code",
205 | "execution_count": 29,
206 | "metadata": {
207 | "collapsed": false
208 | },
209 | "outputs": [],
210 | "source": [
211 | "# Only parameter names that XGBoost recognises are kept. The LightGBM-style keys that\n",
212 | "# used to be listed here are not XGBoost parameters and never affected training:\n",
213 | "#   num_leaves, bagging_freq, is_unbalance      -> dropped (no counterpart under default settings)\n",
214 | "#   bagging_fraction, min_sum_hessian_in_leaf   -> dropped ('subsample' / 'min_child_weight' if wanted)\n",
215 | "param = {\n",
216 | "    'booster': 'gbtree',\n",
217 | "    'max_depth': 4,\n",
218 | "    'learning_rate': 0.05,\n",
219 | "    'scale_pos_weight': 1,\n",
220 | "    'nthread': 40,  # XGBoost spelling of 'num_threads'\n",
221 | "    'objective': 'binary:logistic',\n",
222 | "    'eval_metric': 'auc',  # XGBoost spelling of 'metric'; CV early stopping now tracks AUC\n",
223 | "}\n",
224 | "\n",
225 | "# Choose the number of rounds by 5-fold CV with early stopping, then refit on xgb_train.\n",
226 | "bst = xgb.cv(param, xgb_train, num_boost_round=1000, nfold=5, early_stopping_rounds=100)\n",
227 | "gbm = xgb.train(param, xgb_train, num_boost_round=bst.shape[0])"
229 | ]
230 | },
231 | {
232 | "cell_type": "code",
233 | "execution_count": 30,
234 | "metadata": {
235 | "collapsed": false
236 | },
237 | "outputs": [
238 | {
239 | "name": "stdout",
240 | "output_type": "stream",
241 | "text": [
242 | "0.806100532112\n"
243 | ]
244 | }
245 | ],
246 | "source": [
247 | "ypred = gbm.predict(xgb_val)\n",
248 | "val_auc = metrics.roc_auc_score(y_true=y_val, y_score=ypred)  # AUC on the validation set\n",
249 | "print(val_auc)"
250 | ]
251 | },
252 | {
253 | "cell_type": "code",
254 | "execution_count": 31,
255 | "metadata": {
256 | "collapsed": false
257 | },
258 | "outputs": [
259 | {
260 | "data": {
261 | "text/plain": [
262 | "(10000,)"
263 | ]
264 | },
265 | "execution_count": 31,
266 | "metadata": {},
267 | "output_type": "execute_result"
268 | }
269 | ],
270 | "source": [
271 | "pred = gbm.predict(xgb_test)\n",
272 | "pred.shape"
273 | ]
274 | },
275 | {
276 | "cell_type": "code",
277 | "execution_count": 32,
278 | "metadata": {
279 | "collapsed": true
280 | },
281 | "outputs": [],
282 | "source": [
283 | "Submission = pd.DataFrame({'cust_id': test['cust_id'], 'pred_prob': pred})\n",
284 | "Submission.to_csv('../result/xgb.csv',index=False)"
285 | ]
286 | },
287 | {
288 | "cell_type": "code",
289 | "execution_count": null,
290 | "metadata": {
291 | "collapsed": true
292 | },
293 | "outputs": [],
294 | "source": []
295 | }
296 | ],
297 | "metadata": {
298 | "anaconda-cloud": {},
299 | "kernelspec": {
300 | "display_name": "Python [default]",
301 | "language": "python",
302 | "name": "python3"
303 | },
304 | "language_info": {
305 | "codemirror_mode": {
306 | "name": "ipython",
307 | "version": 3
308 | },
309 | "file_extension": ".py",
310 | "mimetype": "text/x-python",
311 | "name": "python",
312 | "nbconvert_exporter": "python",
313 | "pygments_lexer": "ipython3",
314 | "version": "3.5.2"
315 | }
316 | },
317 | "nbformat": 4,
318 | "nbformat_minor": 1
319 | }
320 |
--------------------------------------------------------------------------------
/模型二代码b榜0.749880/code/semi_xgb预测.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import numpy as np\n",
12 | "import pandas as pd\n",
13 | "import matplotlib.pyplot as plt\n",
14 | "%matplotlib inline"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 2,
20 | "metadata": {
21 | "collapsed": false
22 | },
23 | "outputs": [
24 | {
25 | "data": {
26 | "text/plain": [
27 | "(10000, 160)"
28 | ]
29 | },
30 | "execution_count": 2,
31 | "metadata": {},
32 | "output_type": "execute_result"
33 | }
34 | ],
35 | "source": [
36 | "train_x1 = pd.read_csv('../data/train_ssvm_xy.csv')\n",
37 | "train_x1.shape"
38 | ]
39 | },
40 | {
41 | "cell_type": "code",
42 | "execution_count": 3,
43 | "metadata": {
44 | "collapsed": false
45 | },
46 | "outputs": [
47 | {
48 | "data": {
49 | "text/plain": [
50 | "(15000, 160)"
51 | ]
52 | },
53 | "execution_count": 3,
54 | "metadata": {},
55 | "output_type": "execute_result"
56 | }
57 | ],
58 | "source": [
59 | "train_x2 = pd.read_csv('../data/train_xy.csv')\n",
60 | "train_x2.shape"
61 | ]
62 | },
63 | {
64 | "cell_type": "code",
65 | "execution_count": 4,
66 | "metadata": {
67 | "collapsed": false
68 | },
69 | "outputs": [
70 | {
71 | "data": {
72 | "text/plain": [
73 | "(10000, 159)"
74 | ]
75 | },
76 | "execution_count": 4,
77 | "metadata": {},
78 | "output_type": "execute_result"
79 | }
80 | ],
81 | "source": [
82 | "test = pd.read_csv('../data/test_all.csv')\n",
83 | "test.shape"
84 | ]
85 | },
86 | {
87 | "cell_type": "code",
88 | "execution_count": 5,
89 | "metadata": {
90 | "collapsed": false
91 | },
92 | "outputs": [
93 | {
94 | "data": {
95 | "text/plain": [
96 | "(25000, 157)"
97 | ]
98 | },
99 | "execution_count": 5,
100 | "metadata": {},
101 | "output_type": "execute_result"
102 | }
103 | ],
104 | "source": [
105 | "train_x11 = train_x1.drop(['cust_id','cust_group','y'],axis=1)\n",
106 | "train_x22 = train_x2.drop(['\\ufeffcust_id','cust_group','y'],axis=1)\n",
107 | "train_x = pd.concat([train_x11, train_x22])\n",
108 | "train_x.shape"
109 | ]
110 | },
111 | {
112 | "cell_type": "code",
113 | "execution_count": 6,
114 | "metadata": {
115 | "collapsed": false
116 | },
117 | "outputs": [
118 | {
119 | "data": {
120 | "text/plain": [
121 | "(10000, 157)"
122 | ]
123 | },
124 | "execution_count": 6,
125 | "metadata": {},
126 | "output_type": "execute_result"
127 | }
128 | ],
129 | "source": [
130 | "test_x = test.drop(['cust_id','cust_group'],axis=1)\n",
131 | "test_x.shape"
132 | ]
133 | },
134 | {
135 | "cell_type": "code",
136 | "execution_count": 7,
137 | "metadata": {
138 | "collapsed": false
139 | },
140 | "outputs": [
141 | {
142 | "data": {
143 | "text/plain": [
144 | "(35000, 157)"
145 | ]
146 | },
147 | "execution_count": 7,
148 | "metadata": {},
149 | "output_type": "execute_result"
150 | }
151 | ],
152 | "source": [
153 | "x = pd.concat([train_x, test_x])\n",
154 | "x.shape"
155 | ]
156 | },
157 | {
158 | "cell_type": "code",
159 | "execution_count": 8,
160 | "metadata": {
161 | "collapsed": false
162 | },
163 | "outputs": [
164 | {
165 | "data": {
166 | "text/plain": [
167 | "(25000,)"
168 | ]
169 | },
170 | "execution_count": 8,
171 | "metadata": {},
172 | "output_type": "execute_result"
173 | }
174 | ],
175 | "source": [
176 | "train_y1 = train_x1['y']\n",
177 | "train_y2 = train_x2['y']\n",
178 | "Y_train = pd.concat([train_y1, train_y2])  # same order as train_x; Series.append was removed in pandas 2.0\n",
179 | "Y_train.shape"
180 | ]
181 | },
182 | {
183 | "cell_type": "code",
184 | "execution_count": 9,
185 | "metadata": {
186 | "collapsed": false
187 | },
188 | "outputs": [
189 | {
190 | "name": "stdout",
191 | "output_type": "stream",
192 | "text": [
193 | "(35000, 364)\n"
194 | ]
195 | }
196 | ],
197 | "source": [
198 | "for i in range(96, 158):  # one-hot encode each of x_96 .. x_157 that is present in x\n",
199 | "    col = 'x_%d' % i\n",
200 | "    if col in x.columns.values:\n",
201 | "        dummies_df = pd.get_dummies(x[col]).rename(columns=lambda level: col + '_' + str(level))\n",
202 | "        x = pd.concat([x, dummies_df], axis=1)\n",
203 | "print(x.shape)"
204 | ]
205 | },
206 | {
207 | "cell_type": "code",
208 | "execution_count": 10,
209 | "metadata": {
210 | "collapsed": false
211 | },
212 | "outputs": [
213 | {
214 | "name": "stdout",
215 | "output_type": "stream",
216 | "text": [
217 | "(25000, 364)\n",
218 | "(10000, 364)\n"
219 | ]
220 | }
221 | ],
222 | "source": [
223 | "train_X = x[:len(train_x)]  # train rows were concatenated first; avoids hard-coding 25000\n",
224 | "test_X = x[len(train_x):]\n",
225 | "print(train_X.shape)\n",
226 | "print(test_X.shape)"
227 | ]
228 | },
229 | {
230 | "cell_type": "code",
231 | "execution_count": 11,
232 | "metadata": {
233 | "collapsed": false
234 | },
235 | "outputs": [
236 | {
237 | "name": "stderr",
238 | "output_type": "stream",
239 | "text": [
240 | "C:\\Program Files\\Anaconda3\\lib\\site-packages\\sklearn\\cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n",
241 | " \"This module will be removed in 0.20.\", DeprecationWarning)\n",
242 | "Using TensorFlow backend.\n"
243 | ]
244 | }
245 | ],
246 | "source": [
247 | "from sklearn.tree import DecisionTreeClassifier\n",
248 | "from sklearn.ensemble import RandomForestClassifier\n",
249 | "from sklearn.ensemble import AdaBoostClassifier\n",
250 | "from sklearn.ensemble import ExtraTreesClassifier\n",
251 | "from sklearn.ensemble import GradientBoostingClassifier\n",
252 | "from sklearn.neighbors import KNeighborsClassifier\n",
253 | "from sklearn.svm import SVC\n",
254 | "from sklearn import metrics #accuracy_score,recall_score,f1_score\n",
255 | "from sklearn.metrics import classification_report\n",
256 | "from sklearn.metrics import precision_recall_fscore_support\n",
257 | "from sklearn.utils.multiclass import unique_labels\n",
258 | "from sklearn.metrics import accuracy_score\n",
259 | "from xgboost import XGBClassifier\n",
260 | "from sklearn.ensemble import GradientBoostingClassifier\n",
261 | "from sklearn.model_selection import cross_val_score  # sklearn.cross_validation is removed in scikit-learn 0.20\n",
262 | "from lightgbm import LGBMClassifier\n",
263 | "from sklearn.model_selection import train_test_split, GridSearchCV, KFold\n",
264 | "from sklearn.svm import LinearSVC\n",
265 | "from sklearn import linear_model\n",
266 | "import lightgbm as lgb\n",
267 | "import xgboost as xgb\n",
268 | "\n",
269 | "from keras.models import Model\n",
270 | "from keras.layers import Dense, Input"
271 | ]
272 | },
273 | {
274 | "cell_type": "code",
275 | "execution_count": 12,
276 | "metadata": {
277 | "collapsed": false,
278 | "scrolled": true
279 | },
280 | "outputs": [],
281 | "source": [
282 | "# encoding_dim = 600\n",
283 | "# input_dim = Input(shape=(364,))\n",
284 | "\n",
285 | "# encoded = Dense(364, activation='linear')(input_dim)\n",
286 | "# # encoded = Dense(300, activation='relu')(encoded)\n",
287 | "# # encoded = Dense(32, activation='relu')(encoded)\n",
288 | "# encoder_output = Dense(encoding_dim)(encoded)\n",
289 | "\n",
290 | "# decoded = Dense(600, activation='relu')(encoder_output)\n",
291 | "# # decoded = Dense(64, activation='relu')(decoded)\n",
292 | "# # decoded = Dense(128, activation='relu')(decoded)\n",
293 | "# decoded = Dense(364, activation='tanh')(decoded)\n",
294 | "\n",
295 | "# autoencoder = Model(inputs=input_dim, outputs=decoded)\n",
296 | "\n",
297 | "# encoder = Model(inputs=input_dim, outputs=encoder_output)\n",
298 | "\n",
299 | "# autoencoder.compile(optimizer='adam', loss='mse')\n",
300 | "# # training\n",
301 | "# autoencoder.fit(train_X.values, train_X.values, epochs=20, batch_size=150, shuffle=True)"
302 | ]
303 | },
304 | {
305 | "cell_type": "code",
306 | "execution_count": 13,
307 | "metadata": {
308 | "collapsed": false
309 | },
310 | "outputs": [],
311 | "source": [
312 | "# new_train_feature = encoder.predict(train_X.values)\n",
313 | "# new_test_feature = encoder.predict(test_X.values)\n",
314 | "# print(new_train_feature.shape)\n",
315 | "# print(new_test_feature.shape)"
316 | ]
317 | },
318 | {
319 | "cell_type": "code",
320 | "execution_count": 14,
321 | "metadata": {
322 | "collapsed": false
323 | },
324 | "outputs": [],
325 | "source": [
326 | "X_train,X_val,y_train,y_val= train_test_split(train_X,Y_train,test_size=0.2,random_state=2)"
327 | ]
328 | },
329 | {
330 | "cell_type": "code",
331 | "execution_count": 15,
332 | "metadata": {
333 | "collapsed": false
334 | },
335 | "outputs": [],
336 | "source": [
337 | "xgb_train = xgb.DMatrix(X_train, y_train)\n",
338 | "xgb_val = xgb.DMatrix(X_val, y_val)\n",
339 | "xgb_test = xgb.DMatrix(test_X)"
340 | ]
341 | },
342 | {
343 | "cell_type": "code",
344 | "execution_count": 16,
345 | "metadata": {
346 | "collapsed": false
347 | },
348 | "outputs": [],
349 | "source": [
350 | "# Only parameter names that XGBoost recognises are kept. The LightGBM-style keys that\n",
351 | "# used to be listed here are not XGBoost parameters and never affected training:\n",
352 | "#   num_leaves, bagging_freq, is_unbalance      -> dropped (no counterpart under default settings)\n",
353 | "#   bagging_fraction, min_sum_hessian_in_leaf   -> dropped ('subsample' / 'min_child_weight' if wanted)\n",
354 | "param = {\n",
355 | "    'booster': 'gbtree',\n",
356 | "    'max_depth': 10,\n",
357 | "    'learning_rate': 0.03,\n",
358 | "    'scale_pos_weight': 1,\n",
359 | "    'nthread': 40,  # XGBoost spelling of 'num_threads'\n",
360 | "    'objective': 'binary:logistic',\n",
361 | "    'eval_metric': 'auc',  # XGBoost spelling of 'metric'; CV early stopping now tracks AUC\n",
362 | "}\n",
363 | "\n",
364 | "# Choose the number of rounds by 10-fold CV with early stopping, then refit on xgb_train.\n",
365 | "bst = xgb.cv(param, xgb_train, num_boost_round=1000, nfold=10, early_stopping_rounds=100)\n",
366 | "gbm = xgb.train(param, xgb_train, num_boost_round=bst.shape[0])\n"
368 | ]
369 | },
370 | {
371 | "cell_type": "code",
372 | "execution_count": 17,
373 | "metadata": {
374 | "collapsed": false
375 | },
376 | "outputs": [
377 | {
378 | "name": "stdout",
379 | "output_type": "stream",
380 | "text": [
381 | "0.817833166049\n"
382 | ]
383 | }
384 | ],
385 | "source": [
386 | "ypred = gbm.predict(xgb_val)\n",
387 | "val_auc = metrics.roc_auc_score(y_val,ypred)#验证集上的auc值\n",
388 | "print(val_auc)"
389 | ]
390 | },
391 | {
392 | "cell_type": "code",
393 | "execution_count": 18,
394 | "metadata": {
395 | "collapsed": false
396 | },
397 | "outputs": [
398 | {
399 | "data": {
400 | "text/plain": [
401 | "(10000,)"
402 | ]
403 | },
404 | "execution_count": 18,
405 | "metadata": {},
406 | "output_type": "execute_result"
407 | }
408 | ],
409 | "source": [
410 | "pred = gbm.predict(xgb_test)\n",
411 | "pred.shape"
412 | ]
413 | },
414 | {
415 | "cell_type": "code",
416 | "execution_count": 19,
417 | "metadata": {
418 | "collapsed": true
419 | },
420 | "outputs": [],
421 | "source": [
422 | "Submission = pd.DataFrame({'cust_id': test['cust_id'], 'pred_prob': pred})\n",
423 | "Submission.to_csv('../result/semi_xgb.csv',index=False)"
424 | ]
425 | },
426 | {
427 | "cell_type": "code",
428 | "execution_count": null,
429 | "metadata": {
430 | "collapsed": true
431 | },
432 | "outputs": [],
433 | "source": []
434 | }
435 | ],
436 | "metadata": {
437 | "anaconda-cloud": {},
438 | "kernelspec": {
439 | "display_name": "Python [default]",
440 | "language": "python",
441 | "name": "python3"
442 | },
443 | "language_info": {
444 | "codemirror_mode": {
445 | "name": "ipython",
446 | "version": 3
447 | },
448 | "file_extension": ".py",
449 | "mimetype": "text/x-python",
450 | "name": "python",
451 | "nbconvert_exporter": "python",
452 | "pygments_lexer": "ipython3",
453 | "version": "3.5.2"
454 | }
455 | },
456 | "nbformat": 4,
457 | "nbformat_minor": 1
458 | }
459 |
--------------------------------------------------------------------------------
/模型二代码b榜0.749880/code/.ipynb_checkpoints/semi_xgb预测-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import numpy as np\n",
12 | "import pandas as pd\n",
13 | "import matplotlib.pyplot as plt\n",
14 | "%matplotlib inline"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 2,
20 | "metadata": {
21 | "collapsed": false
22 | },
23 | "outputs": [
24 | {
25 | "data": {
26 | "text/plain": [
27 | "(10000, 160)"
28 | ]
29 | },
30 | "execution_count": 2,
31 | "metadata": {},
32 | "output_type": "execute_result"
33 | }
34 | ],
35 | "source": [
36 | "train_x1 = pd.read_csv('../data/train_ssvm_xy.csv')\n",
37 | "train_x1.shape"
38 | ]
39 | },
40 | {
41 | "cell_type": "code",
42 | "execution_count": 3,
43 | "metadata": {
44 | "collapsed": false
45 | },
46 | "outputs": [
47 | {
48 | "data": {
49 | "text/plain": [
50 | "(15000, 160)"
51 | ]
52 | },
53 | "execution_count": 3,
54 | "metadata": {},
55 | "output_type": "execute_result"
56 | }
57 | ],
58 | "source": [
59 | "train_x2 = pd.read_csv('../data/train_xy.csv')\n",
60 | "train_x2.shape"
61 | ]
62 | },
63 | {
64 | "cell_type": "code",
65 | "execution_count": 4,
66 | "metadata": {
67 | "collapsed": false
68 | },
69 | "outputs": [
70 | {
71 | "data": {
72 | "text/plain": [
73 | "(10000, 159)"
74 | ]
75 | },
76 | "execution_count": 4,
77 | "metadata": {},
78 | "output_type": "execute_result"
79 | }
80 | ],
81 | "source": [
82 | "test = pd.read_csv('../data/test_all.csv')\n",
83 | "test.shape"
84 | ]
85 | },
86 | {
87 | "cell_type": "code",
88 | "execution_count": 5,
89 | "metadata": {
90 | "collapsed": false
91 | },
92 | "outputs": [
93 | {
94 | "data": {
95 | "text/plain": [
96 | "(25000, 157)"
97 | ]
98 | },
99 | "execution_count": 5,
100 | "metadata": {},
101 | "output_type": "execute_result"
102 | }
103 | ],
104 | "source": [
105 | "train_x11 = train_x1.drop(['cust_id','cust_group','y'],axis=1)\n",
106 | "train_x22 = train_x2.drop(['\\ufeffcust_id','cust_group','y'],axis=1)\n",
107 | "train_x = pd.concat([train_x11, train_x22])\n",
108 | "train_x.shape"
109 | ]
110 | },
111 | {
112 | "cell_type": "code",
113 | "execution_count": 6,
114 | "metadata": {
115 | "collapsed": false
116 | },
117 | "outputs": [
118 | {
119 | "data": {
120 | "text/plain": [
121 | "(10000, 157)"
122 | ]
123 | },
124 | "execution_count": 6,
125 | "metadata": {},
126 | "output_type": "execute_result"
127 | }
128 | ],
129 | "source": [
130 | "test_x = test.drop(['cust_id','cust_group'],axis=1)\n",
131 | "test_x.shape"
132 | ]
133 | },
134 | {
135 | "cell_type": "code",
136 | "execution_count": 7,
137 | "metadata": {
138 | "collapsed": false
139 | },
140 | "outputs": [
141 | {
142 | "data": {
143 | "text/plain": [
144 | "(35000, 157)"
145 | ]
146 | },
147 | "execution_count": 7,
148 | "metadata": {},
149 | "output_type": "execute_result"
150 | }
151 | ],
152 | "source": [
153 | "x = pd.concat([train_x, test_x])\n",
154 | "x.shape"
155 | ]
156 | },
157 | {
158 | "cell_type": "code",
159 | "execution_count": 8,
160 | "metadata": {
161 | "collapsed": false
162 | },
163 | "outputs": [
164 | {
165 | "data": {
166 | "text/plain": [
167 | "(25000,)"
168 | ]
169 | },
170 | "execution_count": 8,
171 | "metadata": {},
172 | "output_type": "execute_result"
173 | }
174 | ],
175 | "source": [
176 | "train_y1 = train_x1['y']  # labels of the train_ssvm_xy.csv rows\n",
177 | "train_y2 = train_x2['y']  # labels of the train_xy.csv rows\n",
178 | "Y_train = pd.concat([train_y1, train_y2])  # same order as train_x; Series.append no longer exists in pandas 2.x\n",
179 | "Y_train.shape"
180 | ]
181 | },
182 | {
183 | "cell_type": "code",
184 | "execution_count": 9,
185 | "metadata": {
186 | "collapsed": false
187 | },
188 | "outputs": [
189 | {
190 | "name": "stdout",
191 | "output_type": "stream",
192 | "text": [
193 | "(35000, 364)\n"
194 | ]
195 | }
196 | ],
197 | "source": [
198 | "for i in range(96,158):  # candidate columns x_96 .. x_157\n",
199 | "    col = 'x'+'_'+str(i)\n",
200 | "    if col in x.columns.values:\n",
201 | "        dummies_df = pd.get_dummies(x[col], prefix=col)  # indicator columns named <col>_<value>\n",
202 | "        x = pd.concat([x, dummies_df], axis=1)  # appended; the original column is kept as well\n",
203 | "print(x.shape)"
204 | ]
205 | },
206 | {
207 | "cell_type": "code",
208 | "execution_count": 10,
209 | "metadata": {
210 | "collapsed": false
211 | },
212 | "outputs": [
213 | {
214 | "name": "stdout",
215 | "output_type": "stream",
216 | "text": [
217 | "(25000, 364)\n",
218 | "(10000, 364)\n"
219 | ]
220 | }
221 | ],
222 | "source": [
223 | "train_X = x.iloc[:len(train_x)]  # x was built as [train rows, test rows]; split at the train row count\n",
224 | "test_X = x.iloc[len(train_x):]\n",
225 | "print(train_X.shape)\n",
226 | "print(test_X.shape)"
227 | ]
228 | },
229 | {
230 | "cell_type": "code",
231 | "execution_count": 11,
232 | "metadata": {
233 | "collapsed": false
234 | },
235 | "outputs": [
236 | {
237 | "name": "stderr",
238 | "output_type": "stream",
239 | "text": [
240 | "C:\\Program Files\\Anaconda3\\lib\\site-packages\\sklearn\\cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n",
241 | " \"This module will be removed in 0.20.\", DeprecationWarning)\n",
242 | "Using TensorFlow backend.\n"
243 | ]
244 | }
245 | ],
246 | "source": [
247 | "from sklearn.tree import DecisionTreeClassifier\n",
248 | "from sklearn.ensemble import RandomForestClassifier\n",
249 | "from sklearn.ensemble import AdaBoostClassifier\n",
250 | "from sklearn.ensemble import ExtraTreesClassifier\n",
251 | "from sklearn.ensemble import GradientBoostingClassifier\n",
252 | "from sklearn.neighbors import KNeighborsClassifier\n",
253 | "from sklearn.svm import SVC\n",
254 | "from sklearn import metrics #accuracy_score,recall_score,f1_score\n",
255 | "from sklearn.metrics import classification_report\n",
256 | "from sklearn.metrics import precision_recall_fscore_support\n",
257 | "from sklearn.utils.multiclass import unique_labels\n",
258 | "from sklearn.metrics import accuracy_score\n",
259 | "from xgboost import XGBClassifier\n",
260 | "from sklearn.ensemble import GradientBoostingClassifier\n",
261 | "from sklearn.cross_validation import cross_val_score\n",
262 | "from lightgbm import LGBMClassifier\n",
263 | "from sklearn.model_selection import train_test_split, GridSearchCV, KFold\n",
264 | "from sklearn.svm import LinearSVC\n",
265 | "from sklearn import linear_model\n",
266 | "import lightgbm as lgb\n",
267 | "import xgboost as xgb\n",
268 | "\n",
269 | "from keras.models import Model\n",
270 | "from keras.layers import Dense, Input"
271 | ]
272 | },
273 | {
274 | "cell_type": "code",
275 | "execution_count": 12,
276 | "metadata": {
277 | "collapsed": false,
278 | "scrolled": true
279 | },
280 | "outputs": [],
281 | "source": [
282 | "# encoding_dim = 600\n",
283 | "# input_dim = Input(shape=(364,))\n",
284 | "\n",
285 | "# encoded = Dense(364, activation='linear')(input_dim)\n",
286 | "# # encoded = Dense(300, activation='relu')(encoded)\n",
287 | "# # encoded = Dense(32, activation='relu')(encoded)\n",
288 | "# encoder_output = Dense(encoding_dim)(encoded)\n",
289 | "\n",
290 | "# decoded = Dense(600, activation='relu')(encoder_output)\n",
291 | "# # decoded = Dense(64, activation='relu')(decoded)\n",
292 | "# # decoded = Dense(128, activation='relu')(decoded)\n",
293 | "# decoded = Dense(364, activation='tanh')(decoded)\n",
294 | "\n",
295 | "# autoencoder = Model(inputs=input_dim, outputs=decoded)\n",
296 | "\n",
297 | "# encoder = Model(inputs=input_dim, outputs=encoder_output)\n",
298 | "\n",
299 | "# autoencoder.compile(optimizer='adam', loss='mse')\n",
300 | "# # training\n",
301 | "# autoencoder.fit(train_X.values, train_X.values, epochs=20, batch_size=150, shuffle=True)"
302 | ]
303 | },
304 | {
305 | "cell_type": "code",
306 | "execution_count": 13,
307 | "metadata": {
308 | "collapsed": false
309 | },
310 | "outputs": [],
311 | "source": [
312 | "# new_train_feature = encoder.predict(train_X.values)\n",
313 | "# new_test_feature = encoder.predict(test_X.values)\n",
314 | "# print(new_train_feature.shape)\n",
315 | "# print(new_test_feature.shape)"
316 | ]
317 | },
318 | {
319 | "cell_type": "code",
320 | "execution_count": 14,
321 | "metadata": {
322 | "collapsed": false
323 | },
324 | "outputs": [],
325 | "source": [
326 | "X_train,X_val,y_train,y_val= train_test_split(train_X,Y_train,test_size=0.2,random_state=2)"
327 | ]
328 | },
329 | {
330 | "cell_type": "code",
331 | "execution_count": 15,
332 | "metadata": {
333 | "collapsed": false
334 | },
335 | "outputs": [],
336 | "source": [
337 | "xgb_train = xgb.DMatrix(X_train, y_train)\n",
338 | "xgb_val = xgb.DMatrix(X_val, y_val)\n",
339 | "xgb_test = xgb.DMatrix(test_X)"
340 | ]
341 | },
342 | {
343 | "cell_type": "code",
344 | "execution_count": 16,
345 | "metadata": {
346 | "collapsed": false
347 | },
348 | "outputs": [],
349 | "source": [
350 | "param = {\n",
351 | "    'booster':'gbtree',\n",
352 | "    'max_depth':10,\n",
353 | "    # num_leaves is a LightGBM option and was dropped; tree size is limited by max_depth here\n",
354 | "    'learning_rate':0.03,\n",
355 | "    'scale_pos_weight':1,\n",
356 | "    'nthread':40,\n",
357 | "    'objective':'binary:logistic',\n",
358 | "    'subsample':0.7,\n",
359 | "    # bagging_freq is a LightGBM option and was dropped; XGBoost applies subsample on every boosting round\n",
360 | "    'min_child_weight':100,\n",
361 | "}\n",
362 | "\n",
363 | "# is_unbalance is a LightGBM flag and was dropped; XGBoost handles class imbalance through scale_pos_weight\n",
364 | "param['eval_metric'] = 'auc'\n",
365 | "# Cross-validate with early stopping on AUC, then refit on xgb_train for the number of rounds CV kept\n",
366 | "bst=xgb.cv(param,xgb_train, num_boost_round=1000, nfold=10, early_stopping_rounds=100)\n",
367 | "gbm = xgb.train(param,xgb_train,num_boost_round=bst.shape[0])\n"
368 | ]
369 | },
370 | {
371 | "cell_type": "code",
372 | "execution_count": 17,
373 | "metadata": {
374 | "collapsed": false
375 | },
376 | "outputs": [
377 | {
378 | "name": "stdout",
379 | "output_type": "stream",
380 | "text": [
381 | "0.817833166049\n"
382 | ]
383 | }
384 | ],
385 | "source": [
386 | "ypred = gbm.predict(xgb_val)\n",
387 | "val_auc = metrics.roc_auc_score(y_val,ypred)#验证集上的auc值\n",
388 | "print(val_auc)"
389 | ]
390 | },
391 | {
392 | "cell_type": "code",
393 | "execution_count": 18,
394 | "metadata": {
395 | "collapsed": false
396 | },
397 | "outputs": [
398 | {
399 | "data": {
400 | "text/plain": [
401 | "(10000,)"
402 | ]
403 | },
404 | "execution_count": 18,
405 | "metadata": {},
406 | "output_type": "execute_result"
407 | }
408 | ],
409 | "source": [
410 | "pred = gbm.predict(xgb_test)\n",
411 | "pred.shape"
412 | ]
413 | },
414 | {
415 | "cell_type": "code",
416 | "execution_count": 19,
417 | "metadata": {
418 | "collapsed": true
419 | },
420 | "outputs": [],
421 | "source": [
422 | "Submission = pd.DataFrame({'cust_id': test['cust_id'], 'pred_prob': pred})\n",
423 | "Submission.to_csv('../result/semi_xgb.csv',index=False)"
424 | ]
425 | },
426 | {
427 | "cell_type": "code",
428 | "execution_count": null,
429 | "metadata": {
430 | "collapsed": true
431 | },
432 | "outputs": [],
433 | "source": []
434 | }
435 | ],
436 | "metadata": {
437 | "anaconda-cloud": {},
438 | "kernelspec": {
439 | "display_name": "Python [default]",
440 | "language": "python",
441 | "name": "python3"
442 | },
443 | "language_info": {
444 | "codemirror_mode": {
445 | "name": "ipython",
446 | "version": 3
447 | },
448 | "file_extension": ".py",
449 | "mimetype": "text/x-python",
450 | "name": "python",
451 | "nbconvert_exporter": "python",
452 | "pygments_lexer": "ipython3",
453 | "version": "3.5.2"
454 | }
455 | },
456 | "nbformat": 4,
457 | "nbformat_minor": 1
458 | }
459 |
--------------------------------------------------------------------------------
/模型二代码b榜0.749880/code/semi_lgb预测.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import numpy as np\n",
12 | "import pandas as pd\n",
13 | "import matplotlib.pyplot as plt\n",
14 | "%matplotlib inline"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 2,
20 | "metadata": {
21 | "collapsed": false
22 | },
23 | "outputs": [
24 | {
25 | "data": {
26 | "text/plain": [
27 | "(10000, 160)"
28 | ]
29 | },
30 | "execution_count": 2,
31 | "metadata": {},
32 | "output_type": "execute_result"
33 | }
34 | ],
35 | "source": [
36 | "train_x1 = pd.read_csv('../data/train_ssvm_xy.csv')\n",
37 | "train_x1.shape"
38 | ]
39 | },
40 | {
41 | "cell_type": "code",
42 | "execution_count": 3,
43 | "metadata": {
44 | "collapsed": false
45 | },
46 | "outputs": [
47 | {
48 | "data": {
49 | "text/plain": [
50 | "(15000, 160)"
51 | ]
52 | },
53 | "execution_count": 3,
54 | "metadata": {},
55 | "output_type": "execute_result"
56 | }
57 | ],
58 | "source": [
59 | "train_x2 = pd.read_csv('../data/train_xy.csv')\n",
60 | "train_x2.shape"
61 | ]
62 | },
63 | {
64 | "cell_type": "code",
65 | "execution_count": 4,
66 | "metadata": {
67 | "collapsed": false
68 | },
69 | "outputs": [
70 | {
71 | "data": {
72 | "text/plain": [
73 | "(10000, 159)"
74 | ]
75 | },
76 | "execution_count": 4,
77 | "metadata": {},
78 | "output_type": "execute_result"
79 | }
80 | ],
81 | "source": [
82 | "test = pd.read_csv('../data/test_all.csv')\n",
83 | "test.shape"
84 | ]
85 | },
86 | {
87 | "cell_type": "code",
88 | "execution_count": 5,
89 | "metadata": {
90 | "collapsed": false
91 | },
92 | "outputs": [
93 | {
94 | "data": {
95 | "text/plain": [
96 | "(25000, 157)"
97 | ]
98 | },
99 | "execution_count": 5,
100 | "metadata": {},
101 | "output_type": "execute_result"
102 | }
103 | ],
104 | "source": [
105 | "train_x11 = train_x1.drop(['cust_id','cust_group','y'],axis=1)\n",
106 | "train_x22 = train_x2.drop(['\\ufeffcust_id','cust_group','y'],axis=1)\n",
107 | "train_x = pd.concat([train_x11, train_x22])\n",
108 | "train_x.shape"
109 | ]
110 | },
111 | {
112 | "cell_type": "code",
113 | "execution_count": 6,
114 | "metadata": {
115 | "collapsed": false
116 | },
117 | "outputs": [
118 | {
119 | "data": {
120 | "text/plain": [
121 | "(10000, 157)"
122 | ]
123 | },
124 | "execution_count": 6,
125 | "metadata": {},
126 | "output_type": "execute_result"
127 | }
128 | ],
129 | "source": [
130 | "test_x = test.drop(['cust_id','cust_group'],axis=1)\n",
131 | "test_x.shape"
132 | ]
133 | },
134 | {
135 | "cell_type": "code",
136 | "execution_count": 7,
137 | "metadata": {
138 | "collapsed": false
139 | },
140 | "outputs": [
141 | {
142 | "data": {
143 | "text/plain": [
144 | "(35000, 157)"
145 | ]
146 | },
147 | "execution_count": 7,
148 | "metadata": {},
149 | "output_type": "execute_result"
150 | }
151 | ],
152 | "source": [
153 | "x = pd.concat([train_x, test_x])\n",
154 | "x.shape"
155 | ]
156 | },
157 | {
158 | "cell_type": "code",
159 | "execution_count": 8,
160 | "metadata": {
161 | "collapsed": false
162 | },
163 | "outputs": [
164 | {
165 | "data": {
166 | "text/plain": [
167 | "(25000,)"
168 | ]
169 | },
170 | "execution_count": 8,
171 | "metadata": {},
172 | "output_type": "execute_result"
173 | }
174 | ],
175 | "source": [
176 | "train_y1 = train_x1['y']  # labels of the train_ssvm_xy.csv rows\n",
177 | "train_y2 = train_x2['y']  # labels of the train_xy.csv rows\n",
178 | "Y_train = pd.concat([train_y1, train_y2])  # same order as train_x; Series.append no longer exists in pandas 2.x\n",
179 | "Y_train.shape"
180 | ]
181 | },
182 | {
183 | "cell_type": "code",
184 | "execution_count": 9,
185 | "metadata": {
186 | "collapsed": false
187 | },
188 | "outputs": [
189 | {
190 | "name": "stdout",
191 | "output_type": "stream",
192 | "text": [
193 | "(35000, 364)\n"
194 | ]
195 | }
196 | ],
197 | "source": [
198 | "for i in range(96,158):  # candidate columns x_96 .. x_157\n",
199 | "    col = 'x'+'_'+str(i)\n",
200 | "    if col in x.columns.values:\n",
201 | "        dummies_df = pd.get_dummies(x[col], prefix=col)  # indicator columns named <col>_<value>\n",
202 | "        x = pd.concat([x, dummies_df], axis=1)  # appended; the original column is kept as well\n",
203 | "print(x.shape)"
204 | ]
205 | },
206 | {
207 | "cell_type": "code",
208 | "execution_count": 10,
209 | "metadata": {
210 | "collapsed": false
211 | },
212 | "outputs": [
213 | {
214 | "name": "stdout",
215 | "output_type": "stream",
216 | "text": [
217 | "(25000, 364)\n",
218 | "(10000, 364)\n"
219 | ]
220 | }
221 | ],
222 | "source": [
223 | "train_X = x.iloc[:len(train_x)]  # x was built as [train rows, test rows]; split at the train row count\n",
224 | "test_X = x.iloc[len(train_x):]\n",
225 | "print(train_X.shape)\n",
226 | "print(test_X.shape)"
227 | ]
228 | },
229 | {
230 | "cell_type": "code",
231 | "execution_count": 11,
232 | "metadata": {
233 | "collapsed": false
234 | },
235 | "outputs": [
236 | {
237 | "name": "stderr",
238 | "output_type": "stream",
239 | "text": [
240 | "C:\\Program Files\\Anaconda3\\lib\\site-packages\\sklearn\\cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n",
241 | " \"This module will be removed in 0.20.\", DeprecationWarning)\n",
242 | "Using TensorFlow backend.\n"
243 | ]
244 | }
245 | ],
246 | "source": [
247 | "from sklearn.tree import DecisionTreeClassifier\n",
248 | "from sklearn.ensemble import RandomForestClassifier\n",
249 | "from sklearn.ensemble import AdaBoostClassifier\n",
250 | "from sklearn.ensemble import ExtraTreesClassifier\n",
251 | "from sklearn.ensemble import GradientBoostingClassifier\n",
252 | "from sklearn.neighbors import KNeighborsClassifier\n",
253 | "from sklearn.svm import SVC\n",
254 | "from sklearn import metrics #accuracy_score,recall_score,f1_score\n",
255 | "from sklearn.metrics import classification_report\n",
256 | "from sklearn.metrics import precision_recall_fscore_support\n",
257 | "from sklearn.utils.multiclass import unique_labels\n",
258 | "from sklearn.metrics import accuracy_score\n",
259 | "from xgboost import XGBClassifier\n",
260 | "from sklearn.ensemble import GradientBoostingClassifier\n",
261 | "from sklearn.cross_validation import cross_val_score\n",
262 | "from lightgbm import LGBMClassifier\n",
263 | "from sklearn.model_selection import train_test_split, GridSearchCV, KFold\n",
264 | "from sklearn.svm import LinearSVC\n",
265 | "from sklearn import linear_model\n",
266 | "import lightgbm as lgb\n",
267 | "\n",
268 | "from keras.models import Model\n",
269 | "from keras.layers import Dense, Input"
270 | ]
271 | },
272 | {
273 | "cell_type": "code",
274 | "execution_count": 12,
275 | "metadata": {
276 | "collapsed": false,
277 | "scrolled": true
278 | },
279 | "outputs": [],
280 | "source": [
281 | "# encoding_dim = 600\n",
282 | "# input_dim = Input(shape=(364,))\n",
283 | "\n",
284 | "# encoded = Dense(364, activation='linear')(input_dim)\n",
285 | "# # encoded = Dense(300, activation='relu')(encoded)\n",
286 | "# # encoded = Dense(32, activation='relu')(encoded)\n",
287 | "# encoder_output = Dense(encoding_dim)(encoded)\n",
288 | "\n",
289 | "# decoded = Dense(600, activation='relu')(encoder_output)\n",
290 | "# # decoded = Dense(64, activation='relu')(decoded)\n",
291 | "# # decoded = Dense(128, activation='relu')(decoded)\n",
292 | "# decoded = Dense(364, activation='tanh')(decoded)\n",
293 | "\n",
294 | "# autoencoder = Model(inputs=input_dim, outputs=decoded)\n",
295 | "\n",
296 | "# encoder = Model(inputs=input_dim, outputs=encoder_output)\n",
297 | "\n",
298 | "# autoencoder.compile(optimizer='adam', loss='mse')\n",
299 | "# # training\n",
300 | "# autoencoder.fit(train_X.values, train_X.values, epochs=20, batch_size=150, shuffle=True)"
301 | ]
302 | },
303 | {
304 | "cell_type": "code",
305 | "execution_count": 13,
306 | "metadata": {
307 | "collapsed": false
308 | },
309 | "outputs": [],
310 | "source": [
311 | "# new_train_feature = encoder.predict(train_X.values)\n",
312 | "# new_test_feature = encoder.predict(test_X.values)\n",
313 | "# print(new_train_feature.shape)\n",
314 | "# print(new_test_feature.shape)"
315 | ]
316 | },
317 | {
318 | "cell_type": "code",
319 | "execution_count": 14,
320 | "metadata": {
321 | "collapsed": false
322 | },
323 | "outputs": [],
324 | "source": [
325 | "X_train,X_val,y_train,y_val= train_test_split(train_X,Y_train,test_size=0.2,random_state=2)"
326 | ]
327 | },
328 | {
329 | "cell_type": "code",
330 | "execution_count": 15,
331 | "metadata": {
332 | "collapsed": false
333 | },
334 | "outputs": [],
335 | "source": [
336 | "lgb_train = lgb.Dataset(X_train, y_train)\n",
337 | "lgb_val = lgb.Dataset(X_val, y_val, reference=lgb_train)"
338 | ]
339 | },
340 | {
341 | "cell_type": "code",
342 | "execution_count": 16,
343 | "metadata": {
344 | "collapsed": false
345 | },
346 | "outputs": [],
347 | "source": [
348 | "param = {\n",
349 | " 'max_depth':6,\n",
350 | " 'num_leaves':80,\n",
351 | " 'learning_rate':0.03,\n",
352 | " 'scale_pos_weight':1,\n",
353 | " 'num_threads':40,\n",
354 | " 'objective':'binary',\n",
355 | " 'bagging_fraction':0.7,\n",
356 | " 'bagging_freq':1,\n",
357 | " 'min_sum_hessian_in_leaf':100\n",
358 | "}\n",
359 | "\n",
360 | "param['is_unbalance']='true'\n",
361 | "param['metric'] = 'auc'\n",
362 | "\n",
363 | "\n",
364 | "bst=lgb.cv(param,lgb_train, num_boost_round=1000, nfold=5, early_stopping_rounds=30)\n",
365 | "gbm = lgb.train(param,lgb_train,num_boost_round=len(bst['auc-mean']))\n"
366 | ]
367 | },
368 | {
369 | "cell_type": "code",
370 | "execution_count": 17,
371 | "metadata": {
372 | "collapsed": false
373 | },
374 | "outputs": [
375 | {
376 | "name": "stdout",
377 | "output_type": "stream",
378 | "text": [
379 | "0.810753521759\n"
380 | ]
381 | }
382 | ],
383 | "source": [
384 | "ypred = gbm.predict(X_val)\n",
385 | "val_auc = metrics.roc_auc_score(y_val,ypred)#验证集上的auc值\n",
386 | "print(val_auc)"
387 | ]
388 | },
389 | {
390 | "cell_type": "code",
391 | "execution_count": 18,
392 | "metadata": {
393 | "collapsed": false
394 | },
395 | "outputs": [
396 | {
397 | "data": {
398 | "text/plain": [
399 | "(10000,)"
400 | ]
401 | },
402 | "execution_count": 18,
403 | "metadata": {},
404 | "output_type": "execute_result"
405 | }
406 | ],
407 | "source": [
408 | "pred = gbm.predict(test_X)\n",
409 | "pred.shape"
410 | ]
411 | },
412 | {
413 | "cell_type": "code",
414 | "execution_count": 19,
415 | "metadata": {
416 | "collapsed": true
417 | },
418 | "outputs": [],
419 | "source": [
420 | "Submission = pd.DataFrame({'cust_id': test['cust_id'], 'pred_prob': pred})\n",
421 | "Submission.to_csv('../result/semi_lgb.csv',index=False)"
422 | ]
423 | },
424 | {
425 | "cell_type": "code",
426 | "execution_count": null,
427 | "metadata": {
428 | "collapsed": true
429 | },
430 | "outputs": [],
431 | "source": []
432 | },
433 | {
434 | "cell_type": "code",
435 | "execution_count": null,
436 | "metadata": {
437 | "collapsed": true
438 | },
439 | "outputs": [],
440 | "source": []
441 | }
442 | ],
443 | "metadata": {
444 | "anaconda-cloud": {},
445 | "kernelspec": {
446 | "display_name": "Python [default]",
447 | "language": "python",
448 | "name": "python3"
449 | },
450 | "language_info": {
451 | "codemirror_mode": {
452 | "name": "ipython",
453 | "version": 3
454 | },
455 | "file_extension": ".py",
456 | "mimetype": "text/x-python",
457 | "name": "python",
458 | "nbconvert_exporter": "python",
459 | "pygments_lexer": "ipython3",
460 | "version": "3.5.2"
461 | }
462 | },
463 | "nbformat": 4,
464 | "nbformat_minor": 1
465 | }
466 |
--------------------------------------------------------------------------------
/模型二代码b榜0.749880/code/.ipynb_checkpoints/semi_lgb预测-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import numpy as np\n",
12 | "import pandas as pd\n",
13 | "import matplotlib.pyplot as plt\n",
14 | "%matplotlib inline"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 2,
20 | "metadata": {
21 | "collapsed": false
22 | },
23 | "outputs": [
24 | {
25 | "data": {
26 | "text/plain": [
27 | "(10000, 160)"
28 | ]
29 | },
30 | "execution_count": 2,
31 | "metadata": {},
32 | "output_type": "execute_result"
33 | }
34 | ],
35 | "source": [
36 | "train_x1 = pd.read_csv('../data/train_ssvm_xy.csv')\n",
37 | "train_x1.shape"
38 | ]
39 | },
40 | {
41 | "cell_type": "code",
42 | "execution_count": 3,
43 | "metadata": {
44 | "collapsed": false
45 | },
46 | "outputs": [
47 | {
48 | "data": {
49 | "text/plain": [
50 | "(15000, 160)"
51 | ]
52 | },
53 | "execution_count": 3,
54 | "metadata": {},
55 | "output_type": "execute_result"
56 | }
57 | ],
58 | "source": [
59 | "train_x2 = pd.read_csv('../data/train_xy.csv')\n",
60 | "train_x2.shape"
61 | ]
62 | },
63 | {
64 | "cell_type": "code",
65 | "execution_count": 4,
66 | "metadata": {
67 | "collapsed": false
68 | },
69 | "outputs": [
70 | {
71 | "data": {
72 | "text/plain": [
73 | "(10000, 159)"
74 | ]
75 | },
76 | "execution_count": 4,
77 | "metadata": {},
78 | "output_type": "execute_result"
79 | }
80 | ],
81 | "source": [
82 | "test = pd.read_csv('../data/test_all.csv')\n",
83 | "test.shape"
84 | ]
85 | },
86 | {
87 | "cell_type": "code",
88 | "execution_count": 5,
89 | "metadata": {
90 | "collapsed": false
91 | },
92 | "outputs": [
93 | {
94 | "data": {
95 | "text/plain": [
96 | "(25000, 157)"
97 | ]
98 | },
99 | "execution_count": 5,
100 | "metadata": {},
101 | "output_type": "execute_result"
102 | }
103 | ],
104 | "source": [
105 | "train_x11 = train_x1.drop(['cust_id','cust_group','y'],axis=1)\n",
106 | "train_x22 = train_x2.drop(['\\ufeffcust_id','cust_group','y'],axis=1)\n",
107 | "train_x = pd.concat([train_x11, train_x22])\n",
108 | "train_x.shape"
109 | ]
110 | },
111 | {
112 | "cell_type": "code",
113 | "execution_count": 6,
114 | "metadata": {
115 | "collapsed": false
116 | },
117 | "outputs": [
118 | {
119 | "data": {
120 | "text/plain": [
121 | "(10000, 157)"
122 | ]
123 | },
124 | "execution_count": 6,
125 | "metadata": {},
126 | "output_type": "execute_result"
127 | }
128 | ],
129 | "source": [
130 | "test_x = test.drop(['cust_id','cust_group'],axis=1)\n",
131 | "test_x.shape"
132 | ]
133 | },
134 | {
135 | "cell_type": "code",
136 | "execution_count": 7,
137 | "metadata": {
138 | "collapsed": false
139 | },
140 | "outputs": [
141 | {
142 | "data": {
143 | "text/plain": [
144 | "(35000, 157)"
145 | ]
146 | },
147 | "execution_count": 7,
148 | "metadata": {},
149 | "output_type": "execute_result"
150 | }
151 | ],
152 | "source": [
153 | "x = pd.concat([train_x, test_x])\n",
154 | "x.shape"
155 | ]
156 | },
157 | {
158 | "cell_type": "code",
159 | "execution_count": 8,
160 | "metadata": {
161 | "collapsed": false
162 | },
163 | "outputs": [
164 | {
165 | "data": {
166 | "text/plain": [
167 | "(25000,)"
168 | ]
169 | },
170 | "execution_count": 8,
171 | "metadata": {},
172 | "output_type": "execute_result"
173 | }
174 | ],
175 | "source": [
176 | "train_y1 = train_x1['y']  # labels of the train_ssvm_xy.csv rows\n",
177 | "train_y2 = train_x2['y']  # labels of the train_xy.csv rows\n",
178 | "Y_train = pd.concat([train_y1, train_y2])  # same order as train_x; Series.append no longer exists in pandas 2.x\n",
179 | "Y_train.shape"
180 | ]
181 | },
182 | {
183 | "cell_type": "code",
184 | "execution_count": 9,
185 | "metadata": {
186 | "collapsed": false
187 | },
188 | "outputs": [
189 | {
190 | "name": "stdout",
191 | "output_type": "stream",
192 | "text": [
193 | "(35000, 364)\n"
194 | ]
195 | }
196 | ],
197 | "source": [
198 | "for i in range(96,158):  # candidate columns x_96 .. x_157\n",
199 | "    col = 'x'+'_'+str(i)\n",
200 | "    if col in x.columns.values:\n",
201 | "        dummies_df = pd.get_dummies(x[col], prefix=col)  # indicator columns named <col>_<value>\n",
202 | "        x = pd.concat([x, dummies_df], axis=1)  # appended; the original column is kept as well\n",
203 | "print(x.shape)"
204 | ]
205 | },
206 | {
207 | "cell_type": "code",
208 | "execution_count": 10,
209 | "metadata": {
210 | "collapsed": false
211 | },
212 | "outputs": [
213 | {
214 | "name": "stdout",
215 | "output_type": "stream",
216 | "text": [
217 | "(25000, 364)\n",
218 | "(10000, 364)\n"
219 | ]
220 | }
221 | ],
222 | "source": [
223 | "train_X = x.iloc[:len(train_x)]  # x was built as [train rows, test rows]; split at the train row count\n",
224 | "test_X = x.iloc[len(train_x):]\n",
225 | "print(train_X.shape)\n",
226 | "print(test_X.shape)"
227 | ]
228 | },
229 | {
230 | "cell_type": "code",
231 | "execution_count": 11,
232 | "metadata": {
233 | "collapsed": false
234 | },
235 | "outputs": [
236 | {
237 | "name": "stderr",
238 | "output_type": "stream",
239 | "text": [
240 | "C:\\Program Files\\Anaconda3\\lib\\site-packages\\sklearn\\cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n",
241 | " \"This module will be removed in 0.20.\", DeprecationWarning)\n",
242 | "Using TensorFlow backend.\n"
243 | ]
244 | }
245 | ],
246 | "source": [
247 | "from sklearn.tree import DecisionTreeClassifier\n",
248 | "from sklearn.ensemble import RandomForestClassifier\n",
249 | "from sklearn.ensemble import AdaBoostClassifier\n",
250 | "from sklearn.ensemble import ExtraTreesClassifier\n",
251 | "from sklearn.ensemble import GradientBoostingClassifier\n",
252 | "from sklearn.neighbors import KNeighborsClassifier\n",
253 | "from sklearn.svm import SVC\n",
254 | "from sklearn import metrics #accuracy_score,recall_score,f1_score\n",
255 | "from sklearn.metrics import classification_report\n",
256 | "from sklearn.metrics import precision_recall_fscore_support\n",
257 | "from sklearn.utils.multiclass import unique_labels\n",
258 | "from sklearn.metrics import accuracy_score\n",
259 | "from xgboost import XGBClassifier\n",
260 | "from sklearn.ensemble import GradientBoostingClassifier\n",
261 | "from sklearn.cross_validation import cross_val_score\n",
262 | "from lightgbm import LGBMClassifier\n",
263 | "from sklearn.model_selection import train_test_split, GridSearchCV, KFold\n",
264 | "from sklearn.svm import LinearSVC\n",
265 | "from sklearn import linear_model\n",
266 | "import lightgbm as lgb\n",
267 | "\n",
268 | "from keras.models import Model\n",
269 | "from keras.layers import Dense, Input"
270 | ]
271 | },
272 | {
273 | "cell_type": "code",
274 | "execution_count": 12,
275 | "metadata": {
276 | "collapsed": false,
277 | "scrolled": true
278 | },
279 | "outputs": [],
280 | "source": [
281 | "# encoding_dim = 600\n",
282 | "# input_dim = Input(shape=(364,))\n",
283 | "\n",
284 | "# encoded = Dense(364, activation='linear')(input_dim)\n",
285 | "# # encoded = Dense(300, activation='relu')(encoded)\n",
286 | "# # encoded = Dense(32, activation='relu')(encoded)\n",
287 | "# encoder_output = Dense(encoding_dim)(encoded)\n",
288 | "\n",
289 | "# decoded = Dense(600, activation='relu')(encoder_output)\n",
290 | "# # decoded = Dense(64, activation='relu')(decoded)\n",
291 | "# # decoded = Dense(128, activation='relu')(decoded)\n",
292 | "# decoded = Dense(364, activation='tanh')(decoded)\n",
293 | "\n",
294 | "# autoencoder = Model(inputs=input_dim, outputs=decoded)\n",
295 | "\n",
296 | "# encoder = Model(inputs=input_dim, outputs=encoder_output)\n",
297 | "\n",
298 | "# autoencoder.compile(optimizer='adam', loss='mse')\n",
299 | "# # training\n",
300 | "# autoencoder.fit(train_X.values, train_X.values, epochs=20, batch_size=150, shuffle=True)"
301 | ]
302 | },
303 | {
304 | "cell_type": "code",
305 | "execution_count": 13,
306 | "metadata": {
307 | "collapsed": false
308 | },
309 | "outputs": [],
310 | "source": [
311 | "# new_train_feature = encoder.predict(train_X.values)\n",
312 | "# new_test_feature = encoder.predict(test_X.values)\n",
313 | "# print(new_train_feature.shape)\n",
314 | "# print(new_test_feature.shape)"
315 | ]
316 | },
317 | {
318 | "cell_type": "code",
319 | "execution_count": 14,
320 | "metadata": {
321 | "collapsed": false
322 | },
323 | "outputs": [],
324 | "source": [
325 | "X_train,X_val,y_train,y_val= train_test_split(train_X,Y_train,test_size=0.2,random_state=2)"
326 | ]
327 | },
328 | {
329 | "cell_type": "code",
330 | "execution_count": 15,
331 | "metadata": {
332 | "collapsed": false
333 | },
334 | "outputs": [],
335 | "source": [
336 | "lgb_train = lgb.Dataset(X_train, y_train)\n",
337 | "lgb_val = lgb.Dataset(X_val, y_val, reference=lgb_train)"
338 | ]
339 | },
340 | {
341 | "cell_type": "code",
342 | "execution_count": 16,
343 | "metadata": {
344 | "collapsed": false
345 | },
346 | "outputs": [],
347 | "source": [
348 | "param = {\n",
349 | " 'max_depth':6,\n",
350 | " 'num_leaves':80,\n",
351 | " 'learning_rate':0.03,\n",
352 | " 'scale_pos_weight':1,\n",
353 | " 'num_threads':40,\n",
354 | " 'objective':'binary',\n",
355 | " 'bagging_fraction':0.7,\n",
356 | " 'bagging_freq':1,\n",
357 | " 'min_sum_hessian_in_leaf':100\n",
358 | "}\n",
359 | "\n",
360 | "param['is_unbalance']='true'\n",
361 | "param['metric'] = 'auc'\n",
362 | "\n",
363 | "\n",
364 | "bst=lgb.cv(param,lgb_train, num_boost_round=1000, nfold=5, early_stopping_rounds=30)\n",
365 | "gbm = lgb.train(param,lgb_train,num_boost_round=len(bst['auc-mean']))\n"
366 | ]
367 | },
368 | {
369 | "cell_type": "code",
370 | "execution_count": 17,
371 | "metadata": {
372 | "collapsed": false
373 | },
374 | "outputs": [
375 | {
376 | "name": "stdout",
377 | "output_type": "stream",
378 | "text": [
379 | "0.810753521759\n"
380 | ]
381 | }
382 | ],
383 | "source": [
384 | "ypred = gbm.predict(X_val)\n",
385 | "val_auc = metrics.roc_auc_score(y_val,ypred)  # AUC on the validation set\n",
386 | "print(val_auc)"
387 | ]
388 | },
389 | {
390 | "cell_type": "code",
391 | "execution_count": 18,
392 | "metadata": {
393 | "collapsed": false
394 | },
395 | "outputs": [
396 | {
397 | "data": {
398 | "text/plain": [
399 | "(10000,)"
400 | ]
401 | },
402 | "execution_count": 18,
403 | "metadata": {},
404 | "output_type": "execute_result"
405 | }
406 | ],
407 | "source": [
408 | "pred = gbm.predict(test_X)\n",
409 | "pred.shape"
410 | ]
411 | },
412 | {
413 | "cell_type": "code",
414 | "execution_count": 19,
415 | "metadata": {
416 | "collapsed": true
417 | },
418 | "outputs": [],
419 | "source": [
420 | "Submission = pd.DataFrame({'cust_id': test['cust_id'], 'pred_prob': pred})\n",
421 | "Submission.to_csv('../result/semi_lgb.csv',index=False)"
422 | ]
423 | },
424 | {
425 | "cell_type": "code",
426 | "execution_count": null,
427 | "metadata": {
428 | "collapsed": true
429 | },
430 | "outputs": [],
431 | "source": []
432 | },
433 | {
434 | "cell_type": "code",
435 | "execution_count": null,
436 | "metadata": {
437 | "collapsed": true
438 | },
439 | "outputs": [],
440 | "source": []
441 | }
442 | ],
443 | "metadata": {
444 | "anaconda-cloud": {},
445 | "kernelspec": {
446 | "display_name": "Python [default]",
447 | "language": "python",
448 | "name": "python3"
449 | },
450 | "language_info": {
451 | "codemirror_mode": {
452 | "name": "ipython",
453 | "version": 3
454 | },
455 | "file_extension": ".py",
456 | "mimetype": "text/x-python",
457 | "name": "python",
458 | "nbconvert_exporter": "python",
459 | "pygments_lexer": "ipython3",
460 | "version": "3.5.2"
461 | }
462 | },
463 | "nbformat": 4,
464 | "nbformat_minor": 1
465 | }
466 |
--------------------------------------------------------------------------------
/模型二代码b榜0.749880/code/nn预测.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import numpy as np\n",
12 | "import pandas as pd\n",
13 | "import matplotlib.pyplot as plt\n",
14 | "%matplotlib inline "
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 2,
20 | "metadata": {
21 | "collapsed": false
22 | },
23 | "outputs": [],
24 | "source": [
25 | "train = pd.read_csv('../data/train_xy.csv')"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 3,
31 | "metadata": {
32 | "collapsed": true
33 | },
34 | "outputs": [],
35 | "source": [
36 | "test = pd.read_csv('../data/test_all.csv')"
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": 5,
42 | "metadata": {
43 | "collapsed": false
44 | },
45 | "outputs": [
46 | {
47 | "data": {
48 | "text/plain": [
49 | "(10000, 157)"
50 | ]
51 | },
52 | "execution_count": 5,
53 | "metadata": {},
54 | "output_type": "execute_result"
55 | }
56 | ],
57 | "source": [
58 | "x_train = train.drop(['cust_group','y','\\ufeffcust_id'],axis =1)\n",
59 | "x_test = test.drop(['cust_group','cust_id'],axis=1)\n",
60 | "x_train.shape\n",
61 | "x_test.shape"
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": 6,
67 | "metadata": {
68 | "collapsed": false
69 | },
70 | "outputs": [
71 | {
72 | "data": {
73 | "text/plain": [
74 | "(25000, 157)"
75 | ]
76 | },
77 | "execution_count": 6,
78 | "metadata": {},
79 | "output_type": "execute_result"
80 | }
81 | ],
82 | "source": [
83 | "x = pd.concat([x_train,x_test])\n",
84 | "x.shape"
85 | ]
86 | },
87 | {
88 | "cell_type": "code",
89 | "execution_count": 7,
90 | "metadata": {
91 | "collapsed": true
92 | },
93 | "outputs": [],
94 | "source": [
95 | "Y_train = train['y']"
96 | ]
97 | },
98 | {
99 | "cell_type": "code",
100 | "execution_count": 8,
101 | "metadata": {
102 | "collapsed": false
103 | },
104 | "outputs": [],
105 | "source": [
106 | "for i in range(96,158):\n",
107 | " col = 'x'+'_'+str(i)\n",
108 | " dummies_df = pd.get_dummies(x[col]).rename(columns=lambda x: col + str(x))\n",
109 | " x = pd.concat([x, dummies_df], axis=1)"
110 | ]
111 | },
112 | {
113 | "cell_type": "code",
114 | "execution_count": 9,
115 | "metadata": {
116 | "collapsed": false
117 | },
118 | "outputs": [
119 | {
120 | "name": "stdout",
121 | "output_type": "stream",
122 | "text": [
123 | "(15000, 355)\n",
124 | "(10000, 355)\n"
125 | ]
126 | }
127 | ],
128 | "source": [
129 | "train_X = x[0:15000]\n",
130 | "test_X = x[15000:25000]\n",
131 | "print(train_X.shape)\n",
132 | "print(test_X.shape)"
133 | ]
134 | },
135 | {
136 | "cell_type": "code",
137 | "execution_count": 10,
138 | "metadata": {
139 | "collapsed": false
140 | },
141 | "outputs": [
142 | {
143 | "name": "stderr",
144 | "output_type": "stream",
145 | "text": [
146 | "Using TensorFlow backend.\n",
147 | "C:\\Program Files\\Anaconda3\\lib\\site-packages\\sklearn\\cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n",
148 | " \"This module will be removed in 0.20.\", DeprecationWarning)\n"
149 | ]
150 | }
151 | ],
152 | "source": [
153 | "from sklearn.metrics import accuracy_score\n",
154 | "from sklearn import metrics\n",
155 | "from xgboost import XGBClassifier\n",
156 | "from sklearn.ensemble import GradientBoostingClassifier\n",
157 | "\n",
158 | "from keras.models import Sequential\n",
159 | "from keras.layers import Dense, BatchNormalization, Dropout, Reshape, Flatten, MaxPool2D\n",
160 | "from keras.layers.convolutional import Conv2D, MaxPooling2D, Conv1D, MaxPooling1D\n",
161 | "from sklearn.cross_validation import train_test_split\n",
162 | "from keras.optimizers import RMSprop, Adam\n",
163 | "from keras.callbacks import ReduceLROnPlateau\n",
164 | "from keras.callbacks import ModelCheckpoint\n",
165 | "from keras.utils.np_utils import to_categorical"
166 | ]
167 | },
168 | {
169 | "cell_type": "code",
170 | "execution_count": 12,
171 | "metadata": {
172 | "collapsed": false
173 | },
174 | "outputs": [],
175 | "source": [
176 | "X_train,X_val,y_train,y_val= train_test_split(train_X,Y_train,test_size=0.2,random_state=2)"
177 | ]
178 | },
179 | {
180 | "cell_type": "code",
181 | "execution_count": 13,
182 | "metadata": {
183 | "collapsed": true
184 | },
185 | "outputs": [],
186 | "source": [
187 | "X_train = X_train.values\n",
188 | "X_val = X_val.values"
189 | ]
190 | },
191 | {
192 | "cell_type": "code",
193 | "execution_count": 14,
194 | "metadata": {
195 | "collapsed": true
196 | },
197 | "outputs": [],
198 | "source": [
199 | "y_train = y_train.values\n",
200 | "yy_train = to_categorical(y_train)\n",
201 | "\n",
202 | "y_val = y_val.values\n",
203 | "yy_val = to_categorical(y_val)"
204 | ]
205 | },
206 | {
207 | "cell_type": "code",
208 | "execution_count": 15,
209 | "metadata": {
210 | "collapsed": false
211 | },
212 | "outputs": [],
213 | "source": [
214 | "# Set the CNN model \n",
215 | "# my CNN architecture is In -> BatchNorm -> Reshape -> [[Conv2D->relu]*2 -> MaxPool2D]*2 -> Flatten -> Dense -> Dropout -> Out (the per-stage Dropout layers are commented out below)\n",
216 | "\n",
217 | "model = Sequential()\n",
218 | "\n",
219 | "model.add(BatchNormalization(input_shape=(355,)))\n",
220 | "model.add(Reshape((355,1,1)))\n",
221 | "\n",
222 | "\n",
223 | "model.add(Conv2D(filters = 16, kernel_size = 5,padding = 'Same', \n",
224 | " activation ='relu'))\n",
225 | "model.add(Conv2D(filters = 16, kernel_size = 5,padding = 'Same', \n",
226 | " activation ='relu'))\n",
227 | "model.add(MaxPooling2D(pool_size=2, padding='same'))\n",
228 | "# model.add(Dropout(0.25))\n",
229 | "\n",
230 | "\n",
231 | "model.add(Conv2D(filters = 32, kernel_size = 3,padding = 'Same', \n",
232 | " activation ='relu'))\n",
233 | "model.add(Conv2D(filters = 32, kernel_size = 3,padding = 'Same', \n",
234 | " activation ='relu'))\n",
235 | "model.add(MaxPooling2D(pool_size=2, strides=2, padding='same'))\n",
236 | "# model.add(Dropout(0.25))\n",
237 | "\n",
238 | "\n",
239 | "model.add(Flatten())\n",
240 | "model.add(Dense(256, activation = 'relu'))\n",
241 | "model.add(Dropout(0.5))\n",
242 | "model.add(Dense(2, activation = 'softmax'))"
243 | ]
244 | },
245 | {
246 | "cell_type": "code",
247 | "execution_count": 16,
248 | "metadata": {
249 | "collapsed": true
250 | },
251 | "outputs": [],
252 | "source": [
253 | "model.compile(loss='categorical_crossentropy',optimizer=Adam(),metrics=['accuracy'])"
254 | ]
255 | },
256 | {
257 | "cell_type": "code",
258 | "execution_count": 17,
259 | "metadata": {
260 | "collapsed": false,
261 | "scrolled": true
262 | },
263 | "outputs": [
264 | {
265 | "name": "stdout",
266 | "output_type": "stream",
267 | "text": [
268 | "Train on 10800 samples, validate on 1200 samples\n",
269 | "Epoch 1/15\n",
270 | "10800/10800 [==============================] - 14s 1ms/step - loss: 0.2493 - acc: 0.9522 - val_loss: 0.4120 - val_acc: 0.9583\n",
271 | "Epoch 2/15\n",
272 | "10800/10800 [==============================] - 14s 1ms/step - loss: 0.1865 - acc: 0.9530 - val_loss: 0.2823 - val_acc: 0.9583\n",
273 | "Epoch 3/15\n",
274 | "10800/10800 [==============================] - 14s 1ms/step - loss: 0.1729 - acc: 0.9530 - val_loss: 0.2359 - val_acc: 0.9583\n",
275 | "Epoch 4/15\n",
276 | "10800/10800 [==============================] - 13s 1ms/step - loss: 0.1679 - acc: 0.9530 - val_loss: 0.1838 - val_acc: 0.9583\n",
277 | "Epoch 5/15\n",
278 | "10800/10800 [==============================] - 14s 1ms/step - loss: 0.1632 - acc: 0.9530 - val_loss: 0.1968 - val_acc: 0.9583\n",
279 | "Epoch 6/15\n",
280 | "10800/10800 [==============================] - 14s 1ms/step - loss: 0.1657 - acc: 0.9530 - val_loss: 0.1643 - val_acc: 0.9583\n",
281 | "Epoch 7/15\n",
282 | "10800/10800 [==============================] - 14s 1ms/step - loss: 0.1614 - acc: 0.9530 - val_loss: 0.2133 - val_acc: 0.9583\n",
283 | "Epoch 8/15\n",
284 | "10800/10800 [==============================] - 13s 1ms/step - loss: 0.1626 - acc: 0.9530 - val_loss: 0.1540 - val_acc: 0.9583\n",
285 | "Epoch 9/15\n",
286 | "10800/10800 [==============================] - 15s 1ms/step - loss: 0.1612 - acc: 0.9530 - val_loss: 0.1574 - val_acc: 0.9583\n",
287 | "Epoch 10/15\n",
288 | "10800/10800 [==============================] - 14s 1ms/step - loss: 0.1605 - acc: 0.9530 - val_loss: 0.1564 - val_acc: 0.9583\n",
289 | "Epoch 11/15\n",
290 | "10800/10800 [==============================] - 15s 1ms/step - loss: 0.1586 - acc: 0.9530 - val_loss: 0.1549 - val_acc: 0.9583\n",
291 | "Epoch 12/15\n",
292 | "10800/10800 [==============================] - 14s 1ms/step - loss: 0.1585 - acc: 0.9530 - val_loss: 0.1545 - val_acc: 0.9583\n",
293 | "Epoch 13/15\n",
294 | "10800/10800 [==============================] - 14s 1ms/step - loss: 0.1594 - acc: 0.9530 - val_loss: 0.1565 - val_acc: 0.9583\n",
295 | "Epoch 14/15\n",
296 | "10800/10800 [==============================] - 15s 1ms/step - loss: 0.1570 - acc: 0.9530 - val_loss: 0.1589 - val_acc: 0.9583\n",
297 | "Epoch 15/15\n",
298 | "10800/10800 [==============================] - 14s 1ms/step - loss: 0.1569 - acc: 0.9529 - val_loss: 0.1572 - val_acc: 0.9583\n"
299 | ]
300 | }
301 | ],
302 | "source": [
303 | "history=model.fit(X_train,yy_train, batch_size=256, epochs=15, verbose=1, validation_split=0.1)"
304 | ]
305 | },
306 | {
307 | "cell_type": "code",
308 | "execution_count": 18,
309 | "metadata": {
310 | "collapsed": false
311 | },
312 | "outputs": [
313 | {
314 | "name": "stdout",
315 | "output_type": "stream",
316 | "text": [
317 | "3000/3000 [==============================] - 1s 452us/step\n",
318 | "0.764895321667\n"
319 | ]
320 | }
321 | ],
322 | "source": [
323 | "predictions = model.predict_proba(X_val,verbose=1)\n",
324 | "pre = predictions[:,1]\n",
325 | "val_auc = metrics.roc_auc_score(y_val,pre)  # AUC on the validation set\n",
326 | "print(val_auc)"
327 | ]
328 | },
329 | {
330 | "cell_type": "code",
331 | "execution_count": 19,
332 | "metadata": {
333 | "collapsed": false
334 | },
335 | "outputs": [
336 | {
337 | "name": "stdout",
338 | "output_type": "stream",
339 | "text": [
340 | "10000/10000 [==============================] - 5s 453us/step\n"
341 | ]
342 | },
343 | {
344 | "data": {
345 | "text/plain": [
346 | "(10000,)"
347 | ]
348 | },
349 | "execution_count": 19,
350 | "metadata": {},
351 | "output_type": "execute_result"
352 | }
353 | ],
354 | "source": [
355 | "preds = model.predict_proba(test_X.values)\n",
356 | "pred = preds[:,1]\n",
357 | "pred.shape"
358 | ]
359 | },
360 | {
361 | "cell_type": "code",
362 | "execution_count": 20,
363 | "metadata": {
364 | "collapsed": true
365 | },
366 | "outputs": [],
367 | "source": [
368 | "Submission = pd.DataFrame({'cust_id': test['cust_id'], 'pred_prob': pred})\n",
369 | "Submission.to_csv('../result/nn.csv',index=False)"
370 | ]
371 | },
372 | {
373 | "cell_type": "code",
374 | "execution_count": null,
375 | "metadata": {
376 | "collapsed": true
377 | },
378 | "outputs": [],
379 | "source": []
380 | },
381 | {
382 | "cell_type": "code",
383 | "execution_count": null,
384 | "metadata": {
385 | "collapsed": true
386 | },
387 | "outputs": [],
388 | "source": []
389 | }
390 | ],
391 | "metadata": {
392 | "anaconda-cloud": {},
393 | "kernelspec": {
394 | "display_name": "Python [default]",
395 | "language": "python",
396 | "name": "python3"
397 | },
398 | "language_info": {
399 | "codemirror_mode": {
400 | "name": "ipython",
401 | "version": 3
402 | },
403 | "file_extension": ".py",
404 | "mimetype": "text/x-python",
405 | "name": "python",
406 | "nbconvert_exporter": "python",
407 | "pygments_lexer": "ipython3",
408 | "version": "3.5.2"
409 | }
410 | },
411 | "nbformat": 4,
412 | "nbformat_minor": 1
413 | }
414 |
--------------------------------------------------------------------------------
/模型二代码b榜0.749880/code/.ipynb_checkpoints/nn预测-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import numpy as np\n",
12 | "import pandas as pd\n",
13 | "import matplotlib.pyplot as plt\n",
14 | "%matplotlib inline "
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 2,
20 | "metadata": {
21 | "collapsed": false
22 | },
23 | "outputs": [],
24 | "source": [
25 | "train = pd.read_csv('../data/train_xy.csv')"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 3,
31 | "metadata": {
32 | "collapsed": true
33 | },
34 | "outputs": [],
35 | "source": [
36 | "test = pd.read_csv('../data/test_all.csv')"
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": 5,
42 | "metadata": {
43 | "collapsed": false
44 | },
45 | "outputs": [
46 | {
47 | "data": {
48 | "text/plain": [
49 | "(10000, 157)"
50 | ]
51 | },
52 | "execution_count": 5,
53 | "metadata": {},
54 | "output_type": "execute_result"
55 | }
56 | ],
57 | "source": [
58 | "x_train = train.drop(['cust_group','y','\\ufeffcust_id'],axis =1)\n",
59 | "x_test = test.drop(['cust_group','cust_id'],axis=1)\n",
60 | "x_train.shape\n",
61 | "x_test.shape"
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": 6,
67 | "metadata": {
68 | "collapsed": false
69 | },
70 | "outputs": [
71 | {
72 | "data": {
73 | "text/plain": [
74 | "(25000, 157)"
75 | ]
76 | },
77 | "execution_count": 6,
78 | "metadata": {},
79 | "output_type": "execute_result"
80 | }
81 | ],
82 | "source": [
83 | "x = pd.concat([x_train,x_test])\n",
84 | "x.shape"
85 | ]
86 | },
87 | {
88 | "cell_type": "code",
89 | "execution_count": 7,
90 | "metadata": {
91 | "collapsed": true
92 | },
93 | "outputs": [],
94 | "source": [
95 | "Y_train = train['y']"
96 | ]
97 | },
98 | {
99 | "cell_type": "code",
100 | "execution_count": 8,
101 | "metadata": {
102 | "collapsed": false
103 | },
104 | "outputs": [],
105 | "source": [
106 | "for i in range(96,158):\n",
107 | " col = 'x'+'_'+str(i)\n",
108 | " dummies_df = pd.get_dummies(x[col]).rename(columns=lambda x: col + str(x))\n",
109 | " x = pd.concat([x, dummies_df], axis=1)"
110 | ]
111 | },
112 | {
113 | "cell_type": "code",
114 | "execution_count": 9,
115 | "metadata": {
116 | "collapsed": false
117 | },
118 | "outputs": [
119 | {
120 | "name": "stdout",
121 | "output_type": "stream",
122 | "text": [
123 | "(15000, 355)\n",
124 | "(10000, 355)\n"
125 | ]
126 | }
127 | ],
128 | "source": [
129 | "train_X = x[0:15000]\n",
130 | "test_X = x[15000:25000]\n",
131 | "print(train_X.shape)\n",
132 | "print(test_X.shape)"
133 | ]
134 | },
135 | {
136 | "cell_type": "code",
137 | "execution_count": 10,
138 | "metadata": {
139 | "collapsed": false
140 | },
141 | "outputs": [
142 | {
143 | "name": "stderr",
144 | "output_type": "stream",
145 | "text": [
146 | "Using TensorFlow backend.\n",
147 | "C:\\Program Files\\Anaconda3\\lib\\site-packages\\sklearn\\cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n",
148 | " \"This module will be removed in 0.20.\", DeprecationWarning)\n"
149 | ]
150 | }
151 | ],
152 | "source": [
153 | "from sklearn.metrics import accuracy_score\n",
154 | "from sklearn import metrics\n",
155 | "from xgboost import XGBClassifier\n",
156 | "from sklearn.ensemble import GradientBoostingClassifier\n",
157 | "\n",
158 | "from keras.models import Sequential\n",
159 | "from keras.layers import Dense, BatchNormalization, Dropout, Reshape, Flatten, MaxPool2D\n",
160 | "from keras.layers.convolutional import Conv2D, MaxPooling2D, Conv1D, MaxPooling1D\n",
161 | "from sklearn.cross_validation import train_test_split\n",
162 | "from keras.optimizers import RMSprop, Adam\n",
163 | "from keras.callbacks import ReduceLROnPlateau\n",
164 | "from keras.callbacks import ModelCheckpoint\n",
165 | "from keras.utils.np_utils import to_categorical"
166 | ]
167 | },
168 | {
169 | "cell_type": "code",
170 | "execution_count": 12,
171 | "metadata": {
172 | "collapsed": false
173 | },
174 | "outputs": [],
175 | "source": [
176 | "X_train,X_val,y_train,y_val= train_test_split(train_X,Y_train,test_size=0.2,random_state=2)"
177 | ]
178 | },
179 | {
180 | "cell_type": "code",
181 | "execution_count": 13,
182 | "metadata": {
183 | "collapsed": true
184 | },
185 | "outputs": [],
186 | "source": [
187 | "X_train = X_train.values\n",
188 | "X_val = X_val.values"
189 | ]
190 | },
191 | {
192 | "cell_type": "code",
193 | "execution_count": 14,
194 | "metadata": {
195 | "collapsed": true
196 | },
197 | "outputs": [],
198 | "source": [
199 | "y_train = y_train.values\n",
200 | "yy_train = to_categorical(y_train)\n",
201 | "\n",
202 | "y_val = y_val.values\n",
203 | "yy_val = to_categorical(y_val)"
204 | ]
205 | },
206 | {
207 | "cell_type": "code",
208 | "execution_count": 15,
209 | "metadata": {
210 | "collapsed": false
211 | },
212 | "outputs": [],
213 | "source": [
214 | "# Set the CNN model \n",
215 | "# my CNN architecture is In -> BatchNorm -> Reshape -> [[Conv2D->relu]*2 -> MaxPool2D]*2 -> Flatten -> Dense -> Dropout -> Out (the per-stage Dropout layers are commented out below)\n",
216 | "\n",
217 | "model = Sequential()\n",
218 | "\n",
219 | "model.add(BatchNormalization(input_shape=(355,)))\n",
220 | "model.add(Reshape((355,1,1)))\n",
221 | "\n",
222 | "\n",
223 | "model.add(Conv2D(filters = 16, kernel_size = 5,padding = 'Same', \n",
224 | " activation ='relu'))\n",
225 | "model.add(Conv2D(filters = 16, kernel_size = 5,padding = 'Same', \n",
226 | " activation ='relu'))\n",
227 | "model.add(MaxPooling2D(pool_size=2, padding='same'))\n",
228 | "# model.add(Dropout(0.25))\n",
229 | "\n",
230 | "\n",
231 | "model.add(Conv2D(filters = 32, kernel_size = 3,padding = 'Same', \n",
232 | " activation ='relu'))\n",
233 | "model.add(Conv2D(filters = 32, kernel_size = 3,padding = 'Same', \n",
234 | " activation ='relu'))\n",
235 | "model.add(MaxPooling2D(pool_size=2, strides=2, padding='same'))\n",
236 | "# model.add(Dropout(0.25))\n",
237 | "\n",
238 | "\n",
239 | "model.add(Flatten())\n",
240 | "model.add(Dense(256, activation = 'relu'))\n",
241 | "model.add(Dropout(0.5))\n",
242 | "model.add(Dense(2, activation = 'softmax'))"
243 | ]
244 | },
245 | {
246 | "cell_type": "code",
247 | "execution_count": 16,
248 | "metadata": {
249 | "collapsed": true
250 | },
251 | "outputs": [],
252 | "source": [
253 | "model.compile(loss='categorical_crossentropy',optimizer=Adam(),metrics=['accuracy'])"
254 | ]
255 | },
256 | {
257 | "cell_type": "code",
258 | "execution_count": 17,
259 | "metadata": {
260 | "collapsed": false,
261 | "scrolled": true
262 | },
263 | "outputs": [
264 | {
265 | "name": "stdout",
266 | "output_type": "stream",
267 | "text": [
268 | "Train on 10800 samples, validate on 1200 samples\n",
269 | "Epoch 1/15\n",
270 | "10800/10800 [==============================] - 14s 1ms/step - loss: 0.2493 - acc: 0.9522 - val_loss: 0.4120 - val_acc: 0.9583\n",
271 | "Epoch 2/15\n",
272 | "10800/10800 [==============================] - 14s 1ms/step - loss: 0.1865 - acc: 0.9530 - val_loss: 0.2823 - val_acc: 0.9583\n",
273 | "Epoch 3/15\n",
274 | "10800/10800 [==============================] - 14s 1ms/step - loss: 0.1729 - acc: 0.9530 - val_loss: 0.2359 - val_acc: 0.9583\n",
275 | "Epoch 4/15\n",
276 | "10800/10800 [==============================] - 13s 1ms/step - loss: 0.1679 - acc: 0.9530 - val_loss: 0.1838 - val_acc: 0.9583\n",
277 | "Epoch 5/15\n",
278 | "10800/10800 [==============================] - 14s 1ms/step - loss: 0.1632 - acc: 0.9530 - val_loss: 0.1968 - val_acc: 0.9583\n",
279 | "Epoch 6/15\n",
280 | "10800/10800 [==============================] - 14s 1ms/step - loss: 0.1657 - acc: 0.9530 - val_loss: 0.1643 - val_acc: 0.9583\n",
281 | "Epoch 7/15\n",
282 | "10800/10800 [==============================] - 14s 1ms/step - loss: 0.1614 - acc: 0.9530 - val_loss: 0.2133 - val_acc: 0.9583\n",
283 | "Epoch 8/15\n",
284 | "10800/10800 [==============================] - 13s 1ms/step - loss: 0.1626 - acc: 0.9530 - val_loss: 0.1540 - val_acc: 0.9583\n",
285 | "Epoch 9/15\n",
286 | "10800/10800 [==============================] - 15s 1ms/step - loss: 0.1612 - acc: 0.9530 - val_loss: 0.1574 - val_acc: 0.9583\n",
287 | "Epoch 10/15\n",
288 | "10800/10800 [==============================] - 14s 1ms/step - loss: 0.1605 - acc: 0.9530 - val_loss: 0.1564 - val_acc: 0.9583\n",
289 | "Epoch 11/15\n",
290 | "10800/10800 [==============================] - 15s 1ms/step - loss: 0.1586 - acc: 0.9530 - val_loss: 0.1549 - val_acc: 0.9583\n",
291 | "Epoch 12/15\n",
292 | "10800/10800 [==============================] - 14s 1ms/step - loss: 0.1585 - acc: 0.9530 - val_loss: 0.1545 - val_acc: 0.9583\n",
293 | "Epoch 13/15\n",
294 | "10800/10800 [==============================] - 14s 1ms/step - loss: 0.1594 - acc: 0.9530 - val_loss: 0.1565 - val_acc: 0.9583\n",
295 | "Epoch 14/15\n",
296 | "10800/10800 [==============================] - 15s 1ms/step - loss: 0.1570 - acc: 0.9530 - val_loss: 0.1589 - val_acc: 0.9583\n",
297 | "Epoch 15/15\n",
298 | "10800/10800 [==============================] - 14s 1ms/step - loss: 0.1569 - acc: 0.9529 - val_loss: 0.1572 - val_acc: 0.9583\n"
299 | ]
300 | }
301 | ],
302 | "source": [
303 | "history=model.fit(X_train,yy_train, batch_size=256, epochs=15, verbose=1, validation_split=0.1)"
304 | ]
305 | },
306 | {
307 | "cell_type": "code",
308 | "execution_count": 18,
309 | "metadata": {
310 | "collapsed": false
311 | },
312 | "outputs": [
313 | {
314 | "name": "stdout",
315 | "output_type": "stream",
316 | "text": [
317 | "3000/3000 [==============================] - 1s 452us/step\n",
318 | "0.764895321667\n"
319 | ]
320 | }
321 | ],
322 | "source": [
323 | "predictions = model.predict_proba(X_val,verbose=1)\n",
324 | "pre = predictions[:,1]\n",
325 | "val_auc = metrics.roc_auc_score(y_val,pre)  # AUC on the validation set\n",
326 | "print(val_auc)"
327 | ]
328 | },
329 | {
330 | "cell_type": "code",
331 | "execution_count": 19,
332 | "metadata": {
333 | "collapsed": false
334 | },
335 | "outputs": [
336 | {
337 | "name": "stdout",
338 | "output_type": "stream",
339 | "text": [
340 | "10000/10000 [==============================] - 5s 453us/step\n"
341 | ]
342 | },
343 | {
344 | "data": {
345 | "text/plain": [
346 | "(10000,)"
347 | ]
348 | },
349 | "execution_count": 19,
350 | "metadata": {},
351 | "output_type": "execute_result"
352 | }
353 | ],
354 | "source": [
355 | "preds = model.predict_proba(test_X.values)\n",
356 | "pred = preds[:,1]\n",
357 | "pred.shape"
358 | ]
359 | },
360 | {
361 | "cell_type": "code",
362 | "execution_count": 20,
363 | "metadata": {
364 | "collapsed": true
365 | },
366 | "outputs": [],
367 | "source": [
368 | "Submission = pd.DataFrame({'cust_id': test['cust_id'], 'pred_prob': pred})\n",
369 | "Submission.to_csv('../result/nn.csv',index=False)"
370 | ]
371 | },
372 | {
373 | "cell_type": "code",
374 | "execution_count": null,
375 | "metadata": {
376 | "collapsed": true
377 | },
378 | "outputs": [],
379 | "source": []
380 | },
381 | {
382 | "cell_type": "code",
383 | "execution_count": null,
384 | "metadata": {
385 | "collapsed": true
386 | },
387 | "outputs": [],
388 | "source": []
389 | }
390 | ],
391 | "metadata": {
392 | "anaconda-cloud": {},
393 | "kernelspec": {
394 | "display_name": "Python [default]",
395 | "language": "python",
396 | "name": "python3"
397 | },
398 | "language_info": {
399 | "codemirror_mode": {
400 | "name": "ipython",
401 | "version": 3
402 | },
403 | "file_extension": ".py",
404 | "mimetype": "text/x-python",
405 | "name": "python",
406 | "nbconvert_exporter": "python",
407 | "pygments_lexer": "ipython3",
408 | "version": "3.5.2"
409 | }
410 | },
411 | "nbformat": 4,
412 | "nbformat_minor": 1
413 | }
414 |
--------------------------------------------------------------------------------
/模型二代码b榜0.749880/code/semi_gbdt预测.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import numpy as np\n",
12 | "import pandas as pd\n",
13 | "import matplotlib.pyplot as plt\n",
14 | "%matplotlib inline"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 2,
20 | "metadata": {
21 | "collapsed": false
22 | },
23 | "outputs": [
24 | {
25 | "data": {
26 | "text/plain": [
27 | "(10000, 160)"
28 | ]
29 | },
30 | "execution_count": 2,
31 | "metadata": {},
32 | "output_type": "execute_result"
33 | }
34 | ],
35 | "source": [
36 | "train_x1 = pd.read_csv('../data/train_ssvm_xy.csv')\n",
37 | "train_x1.shape"
38 | ]
39 | },
40 | {
41 | "cell_type": "code",
42 | "execution_count": 3,
43 | "metadata": {
44 | "collapsed": false
45 | },
46 | "outputs": [
47 | {
48 | "data": {
49 | "text/plain": [
50 | "(15000, 160)"
51 | ]
52 | },
53 | "execution_count": 3,
54 | "metadata": {},
55 | "output_type": "execute_result"
56 | }
57 | ],
58 | "source": [
59 | "train_x2 = pd.read_csv('../data/train_xy.csv')\n",
60 | "train_x2.shape"
61 | ]
62 | },
63 | {
64 | "cell_type": "code",
65 | "execution_count": 4,
66 | "metadata": {
67 | "collapsed": false
68 | },
69 | "outputs": [
70 | {
71 | "data": {
72 | "text/plain": [
73 | "(10000, 159)"
74 | ]
75 | },
76 | "execution_count": 4,
77 | "metadata": {},
78 | "output_type": "execute_result"
79 | }
80 | ],
81 | "source": [
82 | "test = pd.read_csv('../data/test_all.csv')\n",
83 | "test.shape"
84 | ]
85 | },
86 | {
87 | "cell_type": "code",
88 | "execution_count": 5,
89 | "metadata": {
90 | "collapsed": false
91 | },
92 | "outputs": [
93 | {
94 | "data": {
95 | "text/plain": [
96 | "(25000, 157)"
97 | ]
98 | },
99 | "execution_count": 5,
100 | "metadata": {},
101 | "output_type": "execute_result"
102 | }
103 | ],
104 | "source": [
105 | "train_x11 = train_x1.drop(['cust_id','cust_group','y'],axis=1)\n",
106 | "train_x22 = train_x2.drop(['\\ufeffcust_id','cust_group','y'],axis=1)\n",
107 | "train_x = pd.concat([train_x11, train_x22])\n",
108 | "train_x.shape"
109 | ]
110 | },
111 | {
112 | "cell_type": "code",
113 | "execution_count": 6,
114 | "metadata": {
115 | "collapsed": false
116 | },
117 | "outputs": [
118 | {
119 | "data": {
120 | "text/plain": [
121 | "(10000, 157)"
122 | ]
123 | },
124 | "execution_count": 6,
125 | "metadata": {},
126 | "output_type": "execute_result"
127 | }
128 | ],
129 | "source": [
130 | "test_x = test.drop(['cust_id','cust_group'],axis=1)\n",
131 | "test_x.shape"
132 | ]
133 | },
134 | {
135 | "cell_type": "code",
136 | "execution_count": 7,
137 | "metadata": {
138 | "collapsed": false
139 | },
140 | "outputs": [
141 | {
142 | "data": {
143 | "text/plain": [
144 | "(35000, 157)"
145 | ]
146 | },
147 | "execution_count": 7,
148 | "metadata": {},
149 | "output_type": "execute_result"
150 | }
151 | ],
152 | "source": [
153 | "x = pd.concat([train_x, test_x])\n",
154 | "x.shape"
155 | ]
156 | },
157 | {
158 | "cell_type": "code",
159 | "execution_count": 8,
160 | "metadata": {
161 | "collapsed": false
162 | },
163 | "outputs": [
164 | {
165 | "data": {
166 | "text/plain": [
167 | "(25000,)"
168 | ]
169 | },
170 | "execution_count": 8,
171 | "metadata": {},
172 | "output_type": "execute_result"
173 | }
174 | ],
175 | "source": [
176 | "train_y1 = train_x1['y']\n",
177 | "train_y2 = train_x2['y']\n",
178 | "Y_train = pd.concat([train_y1, train_y2])  # Series.append was removed in pandas 2.0; concat yields the same stacked Series\n",
179 | "Y_train.shape"
180 | ]
181 | },
182 | {
183 | "cell_type": "code",
184 | "execution_count": 9,
185 | "metadata": {
186 | "collapsed": false
187 | },
188 | "outputs": [
189 | {
190 | "name": "stdout",
191 | "output_type": "stream",
192 | "text": [
193 | "(35000, 364)\n"
194 | ]
195 | }
196 | ],
197 | "source": [
198 | "for i in range(96,158):\n",
199 | " col = 'x'+'_'+str(i)\n",
200 | " if col in x.columns.values:\n",
201 | " dummies_df = pd.get_dummies(x[col]).rename(columns=lambda x: col +'_'+ str(x))\n",
202 | " x = pd.concat([x, dummies_df], axis=1)\n",
203 | "print(x.shape)"
204 | ]
205 | },
206 | {
207 | "cell_type": "code",
208 | "execution_count": 10,
209 | "metadata": {
210 | "collapsed": false
211 | },
212 | "outputs": [
213 | {
214 | "name": "stdout",
215 | "output_type": "stream",
216 | "text": [
217 | "(25000, 364)\n",
218 | "(10000, 364)\n"
219 | ]
220 | }
221 | ],
222 | "source": [
223 | "train_X = x[0:25000]\n",
224 | "test_X = x[25000:35000]\n",
225 | "print(train_X.shape)\n",
226 | "print(test_X.shape)"
227 | ]
228 | },
229 | {
230 | "cell_type": "code",
231 | "execution_count": 11,
232 | "metadata": {
233 | "collapsed": false
234 | },
235 | "outputs": [
236 | {
237 | "name": "stderr",
238 | "output_type": "stream",
239 | "text": [
240 | "C:\\Program Files\\Anaconda3\\lib\\site-packages\\sklearn\\cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n",
241 | " \"This module will be removed in 0.20.\", DeprecationWarning)\n",
242 | "Using TensorFlow backend.\n"
243 | ]
244 | }
245 | ],
246 | "source": [
247 | "from sklearn.tree import DecisionTreeClassifier\n",
248 | "from sklearn.ensemble import RandomForestClassifier\n",
249 | "from sklearn.ensemble import AdaBoostClassifier\n",
250 | "from sklearn.ensemble import ExtraTreesClassifier\n",
251 | "from sklearn.ensemble import GradientBoostingClassifier\n",
252 | "from sklearn.neighbors import KNeighborsClassifier\n",
253 | "from sklearn.svm import SVC\n",
254 | "from sklearn import metrics #accuracy_score,recall_score,f1_score\n",
255 | "from sklearn.metrics import classification_report\n",
256 | "from sklearn.metrics import precision_recall_fscore_support\n",
257 | "from sklearn.utils.multiclass import unique_labels\n",
258 | "from sklearn.metrics import accuracy_score\n",
259 | "from xgboost import XGBClassifier\n",
260 | "from sklearn.ensemble import GradientBoostingClassifier\n",
261 | "from sklearn.model_selection import cross_val_score  # sklearn.cross_validation was removed in scikit-learn 0.20\n",
262 | "from lightgbm import LGBMClassifier\n",
263 | "from sklearn.model_selection import train_test_split, GridSearchCV, KFold\n",
264 | "from sklearn.svm import LinearSVC\n",
265 | "from sklearn import linear_model\n",
266 | "import lightgbm as lgb\n",
267 | "import xgboost as xgb\n",
268 | "from sklearn.model_selection import GridSearchCV\n",
269 | "\n",
270 | "from keras.models import Model\n",
271 | "from keras.layers import Dense, Input"
272 | ]
273 | },
274 | {
275 | "cell_type": "code",
276 | "execution_count": 12,
277 | "metadata": {
278 | "collapsed": false,
279 | "scrolled": true
280 | },
281 | "outputs": [],
282 | "source": [
283 | "# encoding_dim = 600\n",
284 | "# input_dim = Input(shape=(364,))\n",
285 | "\n",
286 | "# encoded = Dense(364, activation='linear')(input_dim)\n",
287 | "# # encoded = Dense(300, activation='relu')(encoded)\n",
288 | "# # encoded = Dense(32, activation='relu')(encoded)\n",
289 | "# encoder_output = Dense(encoding_dim)(encoded)\n",
290 | "\n",
291 | "# decoded = Dense(600, activation='relu')(encoder_output)\n",
292 | "# # decoded = Dense(64, activation='relu')(decoded)\n",
293 | "# # decoded = Dense(128, activation='relu')(decoded)\n",
294 | "# decoded = Dense(364, activation='tanh')(decoded)\n",
295 | "\n",
296 | "# autoencoder = Model(inputs=input_dim, outputs=decoded)\n",
297 | "\n",
298 | "# encoder = Model(inputs=input_dim, outputs=encoder_output)\n",
299 | "\n",
300 | "# autoencoder.compile(optimizer='adam', loss='mse')\n",
301 | "# # training\n",
302 | "# autoencoder.fit(train_X.values, train_X.values, epochs=20, batch_size=150, shuffle=True)"
303 | ]
304 | },
305 | {
306 | "cell_type": "code",
307 | "execution_count": 13,
308 | "metadata": {
309 | "collapsed": false
310 | },
311 | "outputs": [],
312 | "source": [
313 | "# new_train_feature = encoder.predict(train_X.values)\n",
314 | "# new_test_feature = encoder.predict(test_X.values)\n",
315 | "# print(new_train_feature.shape)\n",
316 | "# print(new_test_feature.shape)"
317 | ]
318 | },
319 | {
320 | "cell_type": "code",
321 | "execution_count": 14,
322 | "metadata": {
323 | "collapsed": false
324 | },
325 | "outputs": [],
326 | "source": [
327 | "X_train,X_val,y_train,y_val= train_test_split(train_X,Y_train,test_size=0.2,random_state=2)"
328 | ]
329 | },
330 | {
331 | "cell_type": "code",
332 | "execution_count": 15,
333 | "metadata": {
334 | "collapsed": false,
335 | "scrolled": true
336 | },
337 | "outputs": [
338 | {
339 | "data": {
340 | "text/plain": [
341 | "GridSearchCV(cv=5, error_score='raise',\n",
342 | " estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,\n",
343 | " learning_rate=0.1, loss='deviance', max_depth=3,\n",
344 | " max_features=None, max_leaf_nodes=None,\n",
345 | " min_impurity_split=1e-07, min_samples_leaf=1,\n",
346 | " min_samples_split=2, min_weight_fraction_leaf=0.0,\n",
347 | " n_estimators=100, presort='auto', random_state=None,\n",
348 | " subsample=1.0, verbose=0, warm_start=False),\n",
349 | " fit_params={}, iid=True, n_jobs=1,\n",
350 | " param_grid=[{'max_depth': range(4, 8, 12), 'n_estimators': range(100, 300, 500), 'learning_rate': [0.01, 0.1]}],\n",
351 | " pre_dispatch='2*n_jobs', refit=True, return_train_score=True,\n",
352 | " scoring='roc_auc', verbose=0)"
353 | ]
354 | },
355 | "execution_count": 15,
356 | "metadata": {},
357 | "output_type": "execute_result"
358 | }
359 | ],
360 | "source": [
361 | "tuned_parameters= [{'n_estimators':[100, 300, 500],  # explicit list: range(100,300,500) yields only 100\n",
362 | " 'max_depth':[4, 8, 12],  # explicit list: range(4,8,12) yields only 4\n",
363 | " 'learning_rate':[0.01, 0.1]\n",
364 | " }]\n",
365 | "clf = GridSearchCV(GradientBoostingClassifier(), tuned_parameters, cv=5, scoring='roc_auc')\n",
366 | "clf.fit(X_train, y_train)"
367 | ]
368 | },
369 | {
370 | "cell_type": "code",
371 | "execution_count": 18,
372 | "metadata": {
373 | "collapsed": false
374 | },
375 | "outputs": [
376 | {
377 | "name": "stdout",
378 | "output_type": "stream",
379 | "text": [
380 | "0.805375686875\n"
381 | ]
382 | }
383 | ],
384 | "source": [
385 | "predictions = clf.predict_proba(X_val)\n",
386 | "pre = predictions[:,1]\n",
387 | "val_auc = metrics.roc_auc_score(y_val,pre)#验证集上的auc值\n",
388 | "print(val_auc) "
389 | ]
390 | },
391 | {
392 | "cell_type": "code",
393 | "execution_count": 19,
394 | "metadata": {
395 | "collapsed": false
396 | },
397 | "outputs": [
398 | {
399 | "data": {
400 | "text/plain": [
401 | "(10000,)"
402 | ]
403 | },
404 | "execution_count": 19,
405 | "metadata": {},
406 | "output_type": "execute_result"
407 | }
408 | ],
409 | "source": [
410 | "preds = clf.predict_proba(test_X)\n",
411 | "pred = preds[:,1]\n",
412 | "pred.shape"
413 | ]
414 | },
415 | {
416 | "cell_type": "code",
417 | "execution_count": 20,
418 | "metadata": {
419 | "collapsed": true
420 | },
421 | "outputs": [],
422 | "source": [
423 | "Submission = pd.DataFrame({'cust_id': test['cust_id'], 'pred_prob': pred})\n",
424 | "Submission.to_csv('../result/semi_gbdt.csv',index=False)"
425 | ]
426 | },
427 | {
428 | "cell_type": "code",
429 | "execution_count": null,
430 | "metadata": {
431 | "collapsed": true
432 | },
433 | "outputs": [],
434 | "source": []
435 | },
436 | {
437 | "cell_type": "code",
438 | "execution_count": null,
439 | "metadata": {
440 | "collapsed": true
441 | },
442 | "outputs": [],
443 | "source": []
444 | }
445 | ],
446 | "metadata": {
447 | "anaconda-cloud": {},
448 | "kernelspec": {
449 | "display_name": "Python [default]",
450 | "language": "python",
451 | "name": "python3"
452 | },
453 | "language_info": {
454 | "codemirror_mode": {
455 | "name": "ipython",
456 | "version": 3
457 | },
458 | "file_extension": ".py",
459 | "mimetype": "text/x-python",
460 | "name": "python",
461 | "nbconvert_exporter": "python",
462 | "pygments_lexer": "ipython3",
463 | "version": "3.5.2"
464 | }
465 | },
466 | "nbformat": 4,
467 | "nbformat_minor": 1
468 | }
469 |
--------------------------------------------------------------------------------
/模型二代码b榜0.749880/code/semi_nn预测.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import numpy as np\n",
12 | "import pandas as pd\n",
13 | "import matplotlib.pyplot as plt\n",
14 | "%matplotlib inline"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 2,
20 | "metadata": {
21 | "collapsed": false
22 | },
23 | "outputs": [
24 | {
25 | "data": {
26 | "text/plain": [
27 | "(10000, 160)"
28 | ]
29 | },
30 | "execution_count": 2,
31 | "metadata": {},
32 | "output_type": "execute_result"
33 | }
34 | ],
35 | "source": [
36 | "train_x1 = pd.read_csv('../data/train_ssvm_xy.csv')\n",
37 | "train_x1.shape"
38 | ]
39 | },
40 | {
41 | "cell_type": "code",
42 | "execution_count": 3,
43 | "metadata": {
44 | "collapsed": false
45 | },
46 | "outputs": [
47 | {
48 | "data": {
49 | "text/plain": [
50 | "(15000, 160)"
51 | ]
52 | },
53 | "execution_count": 3,
54 | "metadata": {},
55 | "output_type": "execute_result"
56 | }
57 | ],
58 | "source": [
59 | "train_x2 = pd.read_csv('../data/train_xy.csv')\n",
60 | "train_x2.shape"
61 | ]
62 | },
63 | {
64 | "cell_type": "code",
65 | "execution_count": 4,
66 | "metadata": {
67 | "collapsed": false
68 | },
69 | "outputs": [
70 | {
71 | "data": {
72 | "text/plain": [
73 | "(10000, 159)"
74 | ]
75 | },
76 | "execution_count": 4,
77 | "metadata": {},
78 | "output_type": "execute_result"
79 | }
80 | ],
81 | "source": [
82 | "test = pd.read_csv('../data/test_all.csv')\n",
83 | "test.shape"
84 | ]
85 | },
86 | {
87 | "cell_type": "code",
88 | "execution_count": 5,
89 | "metadata": {
90 | "collapsed": false
91 | },
92 | "outputs": [
93 | {
94 | "data": {
95 | "text/plain": [
96 | "(25000, 157)"
97 | ]
98 | },
99 | "execution_count": 5,
100 | "metadata": {},
101 | "output_type": "execute_result"
102 | }
103 | ],
104 | "source": [
105 | "train_x11 = train_x1.drop(['cust_id','cust_group','y'],axis=1)\n",
106 | "train_x22 = train_x2.drop(['\\ufeffcust_id','cust_group','y'],axis=1)\n",
107 | "train_x = pd.concat([train_x11, train_x22])\n",
108 | "train_x.shape"
109 | ]
110 | },
111 | {
112 | "cell_type": "code",
113 | "execution_count": 6,
114 | "metadata": {
115 | "collapsed": false
116 | },
117 | "outputs": [
118 | {
119 | "data": {
120 | "text/plain": [
121 | "(10000, 157)"
122 | ]
123 | },
124 | "execution_count": 6,
125 | "metadata": {},
126 | "output_type": "execute_result"
127 | }
128 | ],
129 | "source": [
130 | "test_x = test.drop(['cust_id','cust_group'],axis=1)\n",
131 | "test_x.shape"
132 | ]
133 | },
134 | {
135 | "cell_type": "code",
136 | "execution_count": 7,
137 | "metadata": {
138 | "collapsed": false
139 | },
140 | "outputs": [
141 | {
142 | "data": {
143 | "text/plain": [
144 | "(35000, 157)"
145 | ]
146 | },
147 | "execution_count": 7,
148 | "metadata": {},
149 | "output_type": "execute_result"
150 | }
151 | ],
152 | "source": [
153 | "x = pd.concat([train_x, test_x])\n",
154 | "x.shape"
155 | ]
156 | },
157 | {
158 | "cell_type": "code",
159 | "execution_count": 8,
160 | "metadata": {
161 | "collapsed": false
162 | },
163 | "outputs": [
164 | {
165 | "data": {
166 | "text/plain": [
167 | "(25000,)"
168 | ]
169 | },
170 | "execution_count": 8,
171 | "metadata": {},
172 | "output_type": "execute_result"
173 | }
174 | ],
175 | "source": [
176 | "train_y1 = train_x1['y']\n",
177 | "train_y2 = train_x2['y']\n",
178 | "Y_train = pd.concat([train_y1, train_y2])  # Series.append was removed in pandas 2.0; concat yields the same stacked Series\n",
179 | "Y_train.shape"
180 | ]
181 | },
182 | {
183 | "cell_type": "code",
184 | "execution_count": 9,
185 | "metadata": {
186 | "collapsed": false
187 | },
188 | "outputs": [
189 | {
190 | "name": "stdout",
191 | "output_type": "stream",
192 | "text": [
193 | "(35000, 364)\n"
194 | ]
195 | }
196 | ],
197 | "source": [
198 | "for i in range(96,158):\n",
199 | " col = 'x'+'_'+str(i)\n",
200 | " if col in x.columns.values:\n",
201 | " dummies_df = pd.get_dummies(x[col]).rename(columns=lambda x: col +'_'+ str(x))\n",
202 | " x = pd.concat([x, dummies_df], axis=1)\n",
203 | "print(x.shape)"
204 | ]
205 | },
206 | {
207 | "cell_type": "code",
208 | "execution_count": 10,
209 | "metadata": {
210 | "collapsed": false
211 | },
212 | "outputs": [
213 | {
214 | "name": "stdout",
215 | "output_type": "stream",
216 | "text": [
217 | "(25000, 364)\n",
218 | "(10000, 364)\n"
219 | ]
220 | }
221 | ],
222 | "source": [
223 | "train_X = x[0:25000]\n",
224 | "test_X = x[25000:35000]\n",
225 | "print(train_X.shape)\n",
226 | "print(test_X.shape)"
227 | ]
228 | },
229 | {
230 | "cell_type": "code",
231 | "execution_count": 11,
232 | "metadata": {
233 | "collapsed": false
234 | },
235 | "outputs": [
236 | {
237 | "name": "stderr",
238 | "output_type": "stream",
239 | "text": [
240 | "Using TensorFlow backend.\n",
241 | "C:\\Program Files\\Anaconda3\\lib\\site-packages\\sklearn\\cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n",
242 | " \"This module will be removed in 0.20.\", DeprecationWarning)\n"
243 | ]
244 | }
245 | ],
246 | "source": [
247 | "from sklearn.metrics import accuracy_score\n",
248 | "from sklearn import metrics\n",
249 | "\n",
250 | "from keras.models import Sequential\n",
251 | "from keras.layers import Dense, BatchNormalization, Dropout, Reshape, Flatten, MaxPool2D\n",
252 | "from keras.layers.convolutional import Conv2D, MaxPooling2D, Conv1D, MaxPooling1D\n",
253 | "from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in scikit-learn 0.20\n",
254 | "from keras.optimizers import RMSprop, Adam\n",
255 | "from keras.callbacks import ReduceLROnPlateau\n",
256 | "from keras.callbacks import ModelCheckpoint\n",
257 | "from keras.utils.np_utils import to_categorical"
258 | ]
259 | },
260 | {
261 | "cell_type": "code",
262 | "execution_count": 12,
263 | "metadata": {
264 | "collapsed": false
265 | },
266 | "outputs": [],
267 | "source": [
268 | "X_train,X_val,y_train,y_val= train_test_split(train_X,Y_train,test_size=0.2,random_state=2)"
269 | ]
270 | },
271 | {
272 | "cell_type": "code",
273 | "execution_count": 13,
274 | "metadata": {
275 | "collapsed": false
276 | },
277 | "outputs": [
278 | {
279 | "name": "stdout",
280 | "output_type": "stream",
281 | "text": [
282 | "(20000, 364)\n",
283 | "(5000, 364)\n"
284 | ]
285 | }
286 | ],
287 | "source": [
288 | "X_train = X_train.values\n",
289 | "X_val = X_val.values\n",
290 | "print(X_train.shape)\n",
291 | "print(X_val.shape)"
292 | ]
293 | },
294 | {
295 | "cell_type": "code",
296 | "execution_count": 14,
297 | "metadata": {
298 | "collapsed": false
299 | },
300 | "outputs": [],
301 | "source": [
302 | "y_train = y_train.values\n",
303 | "yy_train = to_categorical(y_train)\n",
304 | "\n",
305 | "y_val = y_val.values\n",
306 | "yy_val = to_categorical(y_val)"
307 | ]
308 | },
309 | {
310 | "cell_type": "code",
311 | "execution_count": 15,
312 | "metadata": {
313 | "collapsed": false,
314 | "scrolled": true
315 | },
316 | "outputs": [],
317 | "source": [
318 | "# Build the CNN model\n",
319 | "# CNN architecture: In -> BatchNorm -> Reshape -> [[Conv2D->relu]*2 -> MaxPool2D]*2 -> Flatten -> Dense -> Dropout -> Out (the per-block Dropout layers are commented out below)\n",
320 | "\n",
321 | "model = Sequential()\n",
322 | "\n",
323 | "model.add(BatchNormalization(input_shape=(364,)))\n",
324 | "model.add(Reshape((364,1,1)))\n",
325 | "\n",
326 | "\n",
327 | "model.add(Conv2D(filters = 16, kernel_size = 5,padding = 'Same', \n",
328 | " activation ='relu'))\n",
329 | "model.add(Conv2D(filters = 16, kernel_size = 5,padding = 'Same', \n",
330 | " activation ='relu'))\n",
331 | "model.add(MaxPooling2D(pool_size=2, padding='same'))\n",
332 | "# model.add(Dropout(0.25))\n",
333 | "\n",
334 | "\n",
335 | "model.add(Conv2D(filters = 32, kernel_size = 3,padding = 'Same', \n",
336 | " activation ='relu'))\n",
337 | "model.add(Conv2D(filters = 32, kernel_size = 3,padding = 'Same', \n",
338 | " activation ='relu'))\n",
339 | "model.add(MaxPooling2D(pool_size=2, strides=2, padding='same'))\n",
340 | "# model.add(Dropout(0.25))\n",
341 | "\n",
342 | "\n",
343 | "model.add(Flatten())\n",
344 | "model.add(Dense(256, activation = 'relu'))\n",
345 | "model.add(Dropout(0.5))\n",
346 | "model.add(Dense(2, activation = 'softmax'))"
347 | ]
348 | },
349 | {
350 | "cell_type": "code",
351 | "execution_count": 16,
352 | "metadata": {
353 | "collapsed": true
354 | },
355 | "outputs": [],
356 | "source": [
357 | "model.compile(loss='categorical_crossentropy',optimizer=Adam(),metrics=['accuracy'])"
358 | ]
359 | },
360 | {
361 | "cell_type": "code",
362 | "execution_count": 17,
363 | "metadata": {
364 | "collapsed": false,
365 | "scrolled": true
366 | },
367 | "outputs": [
368 | {
369 | "name": "stdout",
370 | "output_type": "stream",
371 | "text": [
372 | "Train on 18000 samples, validate on 2000 samples\n",
373 | "Epoch 1/15\n",
374 | "18000/18000 [==============================] - 24s 1ms/step - loss: 0.2315 - acc: 0.9452 - val_loss: 0.3059 - val_acc: 0.9550\n",
375 | "Epoch 2/15\n",
376 | "18000/18000 [==============================] - 24s 1ms/step - loss: 0.1728 - acc: 0.9552 - val_loss: 0.2073 - val_acc: 0.9550\n",
377 | "Epoch 3/15\n",
378 | "18000/18000 [==============================] - 24s 1ms/step - loss: 0.1647 - acc: 0.9552 - val_loss: 0.1616 - val_acc: 0.9550\n",
379 | "Epoch 4/15\n",
380 | "18000/18000 [==============================] - 24s 1ms/step - loss: 0.1610 - acc: 0.9552 - val_loss: 0.1613 - val_acc: 0.9550\n",
381 | "Epoch 5/15\n",
382 | "18000/18000 [==============================] - 24s 1ms/step - loss: 0.1608 - acc: 0.9552 - val_loss: 0.1613 - val_acc: 0.9550\n",
383 | "Epoch 6/15\n",
384 | "18000/18000 [==============================] - 24s 1ms/step - loss: 0.1612 - acc: 0.9552 - val_loss: 0.1608 - val_acc: 0.9550\n",
385 | "Epoch 7/15\n",
386 | "18000/18000 [==============================] - 26s 1ms/step - loss: 0.1597 - acc: 0.9552 - val_loss: 0.1600 - val_acc: 0.9550\n",
387 | "Epoch 8/15\n",
388 | "18000/18000 [==============================] - 26s 1ms/step - loss: 0.1592 - acc: 0.9552 - val_loss: 0.1602 - val_acc: 0.9550\n",
389 | "Epoch 9/15\n",
390 | "18000/18000 [==============================] - 27s 1ms/step - loss: 0.1571 - acc: 0.9552 - val_loss: 0.1635 - val_acc: 0.9550\n",
391 | "Epoch 10/15\n",
392 | "18000/18000 [==============================] - 25s 1ms/step - loss: 0.1578 - acc: 0.9552 - val_loss: 0.1620 - val_acc: 0.9550\n",
393 | "Epoch 11/15\n",
394 | "18000/18000 [==============================] - 25s 1ms/step - loss: 0.1571 - acc: 0.9552 - val_loss: 0.1628 - val_acc: 0.9550\n",
395 | "Epoch 12/15\n",
396 | "18000/18000 [==============================] - 26s 1ms/step - loss: 0.1584 - acc: 0.9552 - val_loss: 0.1633 - val_acc: 0.9550\n",
397 | "Epoch 13/15\n",
398 | "18000/18000 [==============================] - 25s 1ms/step - loss: 0.1557 - acc: 0.9552 - val_loss: 0.1664 - val_acc: 0.9550\n",
399 | "Epoch 14/15\n",
400 | "18000/18000 [==============================] - 25s 1ms/step - loss: 0.1551 - acc: 0.9552 - val_loss: 0.1637 - val_acc: 0.9550\n",
401 | "Epoch 15/15\n",
402 | "18000/18000 [==============================] - 24s 1ms/step - loss: 0.1562 - acc: 0.9552 - val_loss: 0.1631 - val_acc: 0.9550\n"
403 | ]
404 | }
405 | ],
406 | "source": [
407 | "history=model.fit(X_train,yy_train, batch_size=256, epochs=15, verbose=1, validation_split=0.1)"
408 | ]
409 | },
410 | {
411 | "cell_type": "code",
412 | "execution_count": 18,
413 | "metadata": {
414 | "collapsed": false
415 | },
416 | "outputs": [
417 | {
418 | "name": "stdout",
419 | "output_type": "stream",
420 | "text": [
421 | "5000/5000 [==============================] - 2s 461us/step\n",
422 | "0.793837946347\n"
423 | ]
424 | }
425 | ],
426 | "source": [
427 | "predictions = model.predict_proba(X_val,verbose=1)\n",
428 | "pre = predictions[:,1]\n",
429 | "val_auc = metrics.roc_auc_score(y_val,pre)#验证集上的auc值\n",
430 | "print(val_auc)"
431 | ]
432 | },
433 | {
434 | "cell_type": "code",
435 | "execution_count": 19,
436 | "metadata": {
437 | "collapsed": false
438 | },
439 | "outputs": [
440 | {
441 | "name": "stdout",
442 | "output_type": "stream",
443 | "text": [
444 | "10000/10000 [==============================] - 5s 459us/step\n"
445 | ]
446 | },
447 | {
448 | "data": {
449 | "text/plain": [
450 | "(10000,)"
451 | ]
452 | },
453 | "execution_count": 19,
454 | "metadata": {},
455 | "output_type": "execute_result"
456 | }
457 | ],
458 | "source": [
459 | "preds = model.predict_proba(test_X.values)\n",
460 | "pred = preds[:,1]\n",
461 | "pred.shape"
462 | ]
463 | },
464 | {
465 | "cell_type": "code",
466 | "execution_count": 20,
467 | "metadata": {
468 | "collapsed": true
469 | },
470 | "outputs": [],
471 | "source": [
472 | "Submission = pd.DataFrame({'cust_id': test['cust_id'], 'pred_prob': pred})\n",
473 | "Submission.to_csv('../result/semi_nn.csv',index=False)"
474 | ]
475 | },
476 | {
477 | "cell_type": "code",
478 | "execution_count": null,
479 | "metadata": {
480 | "collapsed": true
481 | },
482 | "outputs": [],
483 | "source": []
484 | },
485 | {
486 | "cell_type": "code",
487 | "execution_count": null,
488 | "metadata": {
489 | "collapsed": true
490 | },
491 | "outputs": [],
492 | "source": []
493 | }
494 | ],
495 | "metadata": {
496 | "anaconda-cloud": {},
497 | "kernelspec": {
498 | "display_name": "Python [default]",
499 | "language": "python",
500 | "name": "python3"
501 | },
502 | "language_info": {
503 | "codemirror_mode": {
504 | "name": "ipython",
505 | "version": 3
506 | },
507 | "file_extension": ".py",
508 | "mimetype": "text/x-python",
509 | "name": "python",
510 | "nbconvert_exporter": "python",
511 | "pygments_lexer": "ipython3",
512 | "version": "3.5.2"
513 | }
514 | },
515 | "nbformat": 4,
516 | "nbformat_minor": 1
517 | }
518 |
--------------------------------------------------------------------------------
/模型二代码b榜0.749880/code/.ipynb_checkpoints/semi_gbdt预测-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import numpy as np\n",
12 | "import pandas as pd\n",
13 | "import matplotlib.pyplot as plt\n",
14 | "%matplotlib inline"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 2,
20 | "metadata": {
21 | "collapsed": false
22 | },
23 | "outputs": [
24 | {
25 | "data": {
26 | "text/plain": [
27 | "(10000, 160)"
28 | ]
29 | },
30 | "execution_count": 2,
31 | "metadata": {},
32 | "output_type": "execute_result"
33 | }
34 | ],
35 | "source": [
36 | "train_x1 = pd.read_csv('../data/train_ssvm_xy.csv')\n",
37 | "train_x1.shape"
38 | ]
39 | },
40 | {
41 | "cell_type": "code",
42 | "execution_count": 3,
43 | "metadata": {
44 | "collapsed": false
45 | },
46 | "outputs": [
47 | {
48 | "data": {
49 | "text/plain": [
50 | "(15000, 160)"
51 | ]
52 | },
53 | "execution_count": 3,
54 | "metadata": {},
55 | "output_type": "execute_result"
56 | }
57 | ],
58 | "source": [
59 | "train_x2 = pd.read_csv('../data/train_xy.csv')\n",
60 | "train_x2.shape"
61 | ]
62 | },
63 | {
64 | "cell_type": "code",
65 | "execution_count": 4,
66 | "metadata": {
67 | "collapsed": false
68 | },
69 | "outputs": [
70 | {
71 | "data": {
72 | "text/plain": [
73 | "(10000, 159)"
74 | ]
75 | },
76 | "execution_count": 4,
77 | "metadata": {},
78 | "output_type": "execute_result"
79 | }
80 | ],
81 | "source": [
82 | "test = pd.read_csv('../data/test_all.csv')\n",
83 | "test.shape"
84 | ]
85 | },
86 | {
87 | "cell_type": "code",
88 | "execution_count": 5,
89 | "metadata": {
90 | "collapsed": false
91 | },
92 | "outputs": [
93 | {
94 | "data": {
95 | "text/plain": [
96 | "(25000, 157)"
97 | ]
98 | },
99 | "execution_count": 5,
100 | "metadata": {},
101 | "output_type": "execute_result"
102 | }
103 | ],
104 | "source": [
105 | "train_x11 = train_x1.drop(['cust_id','cust_group','y'],axis=1)\n",
106 | "train_x22 = train_x2.drop(['\\ufeffcust_id','cust_group','y'],axis=1)\n",
107 | "train_x = pd.concat([train_x11, train_x22])\n",
108 | "train_x.shape"
109 | ]
110 | },
111 | {
112 | "cell_type": "code",
113 | "execution_count": 6,
114 | "metadata": {
115 | "collapsed": false
116 | },
117 | "outputs": [
118 | {
119 | "data": {
120 | "text/plain": [
121 | "(10000, 157)"
122 | ]
123 | },
124 | "execution_count": 6,
125 | "metadata": {},
126 | "output_type": "execute_result"
127 | }
128 | ],
129 | "source": [
130 | "test_x = test.drop(['cust_id','cust_group'],axis=1)\n",
131 | "test_x.shape"
132 | ]
133 | },
134 | {
135 | "cell_type": "code",
136 | "execution_count": 7,
137 | "metadata": {
138 | "collapsed": false
139 | },
140 | "outputs": [
141 | {
142 | "data": {
143 | "text/plain": [
144 | "(35000, 157)"
145 | ]
146 | },
147 | "execution_count": 7,
148 | "metadata": {},
149 | "output_type": "execute_result"
150 | }
151 | ],
152 | "source": [
153 | "x = pd.concat([train_x, test_x])\n",
154 | "x.shape"
155 | ]
156 | },
157 | {
158 | "cell_type": "code",
159 | "execution_count": 8,
160 | "metadata": {
161 | "collapsed": false
162 | },
163 | "outputs": [
164 | {
165 | "data": {
166 | "text/plain": [
167 | "(25000,)"
168 | ]
169 | },
170 | "execution_count": 8,
171 | "metadata": {},
172 | "output_type": "execute_result"
173 | }
174 | ],
175 | "source": [
176 | "train_y1 = train_x1['y']\n",
177 | "train_y2 = train_x2['y']\n",
178 | "Y_train = pd.concat([train_y1, train_y2])  # Series.append was removed in pandas 2.0; concat yields the same stacked Series\n",
179 | "Y_train.shape"
180 | ]
181 | },
182 | {
183 | "cell_type": "code",
184 | "execution_count": 9,
185 | "metadata": {
186 | "collapsed": false
187 | },
188 | "outputs": [
189 | {
190 | "name": "stdout",
191 | "output_type": "stream",
192 | "text": [
193 | "(35000, 364)\n"
194 | ]
195 | }
196 | ],
197 | "source": [
198 | "for i in range(96,158):\n",
199 | " col = 'x'+'_'+str(i)\n",
200 | " if col in x.columns.values:\n",
201 | " dummies_df = pd.get_dummies(x[col]).rename(columns=lambda x: col +'_'+ str(x))\n",
202 | " x = pd.concat([x, dummies_df], axis=1)\n",
203 | "print(x.shape)"
204 | ]
205 | },
206 | {
207 | "cell_type": "code",
208 | "execution_count": 10,
209 | "metadata": {
210 | "collapsed": false
211 | },
212 | "outputs": [
213 | {
214 | "name": "stdout",
215 | "output_type": "stream",
216 | "text": [
217 | "(25000, 364)\n",
218 | "(10000, 364)\n"
219 | ]
220 | }
221 | ],
222 | "source": [
223 | "train_X = x[0:25000]\n",
224 | "test_X = x[25000:35000]\n",
225 | "print(train_X.shape)\n",
226 | "print(test_X.shape)"
227 | ]
228 | },
229 | {
230 | "cell_type": "code",
231 | "execution_count": 15,
232 | "metadata": {
233 | "collapsed": false
234 | },
235 | "outputs": [],
236 | "source": [
237 | "from sklearn.tree import DecisionTreeClassifier\n",
238 | "from sklearn.ensemble import RandomForestClassifier\n",
239 | "from sklearn.ensemble import AdaBoostClassifier\n",
240 | "from sklearn.ensemble import ExtraTreesClassifier\n",
241 | "from sklearn.ensemble import GradientBoostingClassifier\n",
242 | "from sklearn.neighbors import KNeighborsClassifier\n",
243 | "from sklearn.svm import SVC\n",
244 | "from sklearn import metrics #accuracy_score,recall_score,f1_score\n",
245 | "from sklearn.metrics import classification_report\n",
246 | "from sklearn.metrics import precision_recall_fscore_support\n",
247 | "from sklearn.utils.multiclass import unique_labels\n",
248 | "from sklearn.metrics import accuracy_score\n",
249 | "from xgboost import XGBClassifier\n",
250 | "from sklearn.ensemble import GradientBoostingClassifier\n",
251 | "from sklearn.model_selection import cross_val_score  # sklearn.cross_validation was removed in scikit-learn 0.20\n",
252 | "from lightgbm import LGBMClassifier\n",
253 | "from sklearn.model_selection import train_test_split, GridSearchCV, KFold\n",
254 | "from sklearn.svm import LinearSVC\n",
255 | "from sklearn import linear_model\n",
256 | "import lightgbm as lgb\n",
257 | "import xgboost as xgb\n",
258 | "from sklearn.model_selection import GridSearchCV\n",
259 | "\n",
260 | "from keras.models import Model\n",
261 | "from keras.layers import Dense, Input"
262 | ]
263 | },
264 | {
265 | "cell_type": "code",
266 | "execution_count": 12,
267 | "metadata": {
268 | "collapsed": false,
269 | "scrolled": true
270 | },
271 | "outputs": [],
272 | "source": [
273 | "# encoding_dim = 600\n",
274 | "# input_dim = Input(shape=(364,))\n",
275 | "\n",
276 | "# encoded = Dense(364, activation='linear')(input_dim)\n",
277 | "# # encoded = Dense(300, activation='relu')(encoded)\n",
278 | "# # encoded = Dense(32, activation='relu')(encoded)\n",
279 | "# encoder_output = Dense(encoding_dim)(encoded)\n",
280 | "\n",
281 | "# decoded = Dense(600, activation='relu')(encoder_output)\n",
282 | "# # decoded = Dense(64, activation='relu')(decoded)\n",
283 | "# # decoded = Dense(128, activation='relu')(decoded)\n",
284 | "# decoded = Dense(364, activation='tanh')(decoded)\n",
285 | "\n",
286 | "# autoencoder = Model(inputs=input_dim, outputs=decoded)\n",
287 | "\n",
288 | "# encoder = Model(inputs=input_dim, outputs=encoder_output)\n",
289 | "\n",
290 | "# autoencoder.compile(optimizer='adam', loss='mse')\n",
291 | "# # training\n",
292 | "# autoencoder.fit(train_X.values, train_X.values, epochs=20, batch_size=150, shuffle=True)"
293 | ]
294 | },
295 | {
296 | "cell_type": "code",
297 | "execution_count": 13,
298 | "metadata": {
299 | "collapsed": false
300 | },
301 | "outputs": [],
302 | "source": [
303 | "# new_train_feature = encoder.predict(train_X.values)\n",
304 | "# new_test_feature = encoder.predict(test_X.values)\n",
305 | "# print(new_train_feature.shape)\n",
306 | "# print(new_test_feature.shape)"
307 | ]
308 | },
309 | {
310 | "cell_type": "code",
311 | "execution_count": 27,
312 | "metadata": {
313 | "collapsed": false
314 | },
315 | "outputs": [],
316 | "source": [
317 | "X_train,X_val,y_train,y_val= train_test_split(train_X,Y_train,test_size=0.2,random_state=2)"
318 | ]
319 | },
320 | {
321 | "cell_type": "code",
322 | "execution_count": 29,
323 | "metadata": {
324 | "collapsed": false,
325 | "scrolled": true
326 | },
327 | "outputs": [
328 | {
329 | "data": {
330 | "text/plain": [
331 | "GridSearchCV(cv=5, error_score='raise',\n",
332 | " estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,\n",
333 | " learning_rate=0.1, loss='deviance', max_depth=3,\n",
334 | " max_features=None, max_leaf_nodes=None,\n",
335 | " min_impurity_split=1e-07, min_samples_leaf=1,\n",
336 | " min_samples_split=2, min_weight_fraction_leaf=0.0,\n",
337 | " n_estimators=100, presort='auto', random_state=None,\n",
338 | " subsample=1.0, verbose=0, warm_start=False),\n",
339 | " fit_params={}, iid=True, n_jobs=1,\n",
340 | " param_grid=[{'learning_rate': [0.01, 0.1], 'n_estimators': range(100, 300, 500), 'max_depth': range(4, 8, 12)}],\n",
341 | " pre_dispatch='2*n_jobs', refit=True, return_train_score=True,\n",
342 | " scoring='roc_auc', verbose=0)"
343 | ]
344 | },
345 | "execution_count": 29,
346 | "metadata": {},
347 | "output_type": "execute_result"
348 | }
349 | ],
350 | "source": [
351 | "tuned_parameters= [{'n_estimators':[100,300,500],\n",
352 | " 'max_depth':[4,8,12],\n",
353 | " 'learning_rate':[0.01, 0.1]\n",
354 | " }]\n",
355 | "clf = GridSearchCV(GradientBoostingClassifier(), tuned_parameters, cv=5, scoring='roc_auc')\n",
356 | "clf.fit(X_train, y_train)"
357 | ]
358 | },
359 | {
360 | "cell_type": "code",
361 | "execution_count": 30,
362 | "metadata": {
363 | "collapsed": false
364 | },
365 | "outputs": [
366 | {
367 | "name": "stdout",
368 | "output_type": "stream",
369 | "text": [
370 | "0.804463759173\n"
371 | ]
372 | }
373 | ],
374 | "source": [
375 | "predictions = clf.predict_proba(X_val)\n",
376 | "pre = predictions[:,1]\n",
377 | "val_auc = metrics.roc_auc_score(y_val,pre)  # AUC on the validation set\n",
378 | "print(val_auc)"
379 | ]
380 | },
381 | {
382 | "cell_type": "code",
383 | "execution_count": 36,
384 | "metadata": {
385 | "collapsed": false
386 | },
387 | "outputs": [
388 | {
389 | "data": {
390 | "text/plain": [
391 | "(10000,)"
392 | ]
393 | },
394 | "execution_count": 36,
395 | "metadata": {},
396 | "output_type": "execute_result"
397 | }
398 | ],
399 | "source": [
400 | "preds = clf.predict_proba(test_X)\n",
401 | "pred = preds[:,1]\n",
402 | "pred.shape"
403 | ]
404 | },
405 | {
406 | "cell_type": "code",
407 | "execution_count": 37,
408 | "metadata": {
409 | "collapsed": true
410 | },
411 | "outputs": [],
412 | "source": [
413 | "Submission = pd.DataFrame({'cust_id': test['cust_id'], 'pred_prob': pred})\n",
414 | "Submission.to_csv('../result/semi_gbdt.csv',index=False)"
415 | ]
416 | },
417 | {
418 | "cell_type": "code",
419 | "execution_count": null,
420 | "metadata": {
421 | "collapsed": true
422 | },
423 | "outputs": [],
424 | "source": []
425 | },
426 | {
427 | "cell_type": "code",
428 | "execution_count": null,
429 | "metadata": {
430 | "collapsed": true
431 | },
432 | "outputs": [],
433 | "source": []
434 | },
435 | {
436 | "cell_type": "code",
437 | "execution_count": 57,
438 | "metadata": {
439 | "collapsed": false
440 | },
441 | "outputs": [],
442 | "source": [
443 | "xgb = pd.read_csv('../result/semi_xgb4.csv')"
444 | ]
445 | },
446 | {
447 | "cell_type": "code",
448 | "execution_count": 58,
449 | "metadata": {
450 | "collapsed": true
451 | },
452 | "outputs": [],
453 | "source": [
454 | "lgb = pd.read_csv('../result/semi_lgb2.csv')"
455 | ]
456 | },
457 | {
458 | "cell_type": "code",
459 | "execution_count": 59,
460 | "metadata": {
461 | "collapsed": true
462 | },
463 | "outputs": [],
464 | "source": [
465 | "result = xgb.pred_prob*0.3 + lgb.pred_prob*0.7"
466 | ]
467 | },
468 | {
469 | "cell_type": "code",
470 | "execution_count": 60,
471 | "metadata": {
472 | "collapsed": false
473 | },
474 | "outputs": [
475 | {
476 | "data": {
477 | "text/html": [
478 | "\n",
479 | "
\n",
480 | " \n",
481 | " \n",
482 | " | \n",
483 | " cust_id | \n",
484 | " pred_prob | \n",
485 | "
\n",
486 | " \n",
487 | " \n",
488 | " \n",
489 | " | 0 | \n",
490 | " 1 | \n",
491 | " 0.038582 | \n",
492 | "
\n",
493 | " \n",
494 | " | 1 | \n",
495 | " 2 | \n",
496 | " 0.087885 | \n",
497 | "
\n",
498 | " \n",
499 | " | 2 | \n",
500 | " 3 | \n",
501 | " 0.342310 | \n",
502 | "
\n",
503 | " \n",
504 | " | 3 | \n",
505 | " 4 | \n",
506 | " 0.213558 | \n",
507 | "
\n",
508 | " \n",
509 | " | 4 | \n",
510 | " 5 | \n",
511 | " 0.193331 | \n",
512 | "
\n",
513 | " \n",
514 | "
\n",
515 | "
"
516 | ],
517 | "text/plain": [
518 | " cust_id pred_prob\n",
519 | "0 1 0.038582\n",
520 | "1 2 0.087885\n",
521 | "2 3 0.342310\n",
522 | "3 4 0.213558\n",
523 | "4 5 0.193331"
524 | ]
525 | },
526 | "execution_count": 60,
527 | "metadata": {},
528 | "output_type": "execute_result"
529 | }
530 | ],
531 | "source": [
532 | "xgb.pred_prob = result\n",
533 | "xgb.head()"
534 | ]
535 | },
536 | {
537 | "cell_type": "code",
538 | "execution_count": 61,
539 | "metadata": {
540 | "collapsed": true
541 | },
542 | "outputs": [],
543 | "source": [
544 | "xgb.to_csv('../result/semi_xgb_lgb1.csv',index= False)"
545 | ]
546 | },
547 | {
548 | "cell_type": "code",
549 | "execution_count": null,
550 | "metadata": {
551 | "collapsed": true
552 | },
553 | "outputs": [],
554 | "source": []
555 | }
556 | ],
557 | "metadata": {
558 | "anaconda-cloud": {},
559 | "kernelspec": {
560 | "display_name": "Python [default]",
561 | "language": "python",
562 | "name": "python3"
563 | },
564 | "language_info": {
565 | "codemirror_mode": {
566 | "name": "ipython",
567 | "version": 3
568 | },
569 | "file_extension": ".py",
570 | "mimetype": "text/x-python",
571 | "name": "python",
572 | "nbconvert_exporter": "python",
573 | "pygments_lexer": "ipython3",
574 | "version": "3.5.2"
575 | }
576 | },
577 | "nbformat": 4,
578 | "nbformat_minor": 1
579 | }
580 |
--------------------------------------------------------------------------------