├── README.md
├── f_sample_20180204.csv
├── f_test_a_20180204.csv
├── f_train_20180204.csv
└── top12-baseline.ipynb
/README.md:
--------------------------------------------------------------------------------
1 | # Readme.md
2 | ### Tianchi Precision Medicine Competition: Diabetes Genetic Risk Prediction
3 | ##### Top-12 approach. Since the preliminary and final rounds differed greatly, only some thoughts on the final round are given here, as a starting point for discussion.
4 |
5 | #### Feature engineering
6 | ##### Constructing new features
7 | 1. Construct pairwise interaction features with the four arithmetic operations (add, subtract, multiply, divide), motivated by interpretable gene antagonism and gene synergy.
8 | 2. Construct numeric features from each feature itself: squares, higher powers, square roots, etc.
9 | 3. Construct features with the polynomial-features package (did not help on the leaderboard); a sketch of items 1-3 follows below.
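
A minimal sketch of these constructions, assuming a numeric pandas DataFrame `df`; the accompanying notebook implements the same idea column by column in `get_division_feature` and `get_square_feature`:

```
from itertools import combinations

import pandas as pd
from sklearn.preprocessing import PolynomialFeatures

def add_interactions(df):
    """Item 1: pairwise +, -, *, / interaction features."""
    out = df.copy()
    for a, b in combinations(df.columns, 2):
        out[a + '+' + b] = df[a] + df[b]
        out[a + '-' + b] = df[a] - df[b]
        out[a + '*' + b] = df[a] * df[b]
        out[a + '/' + b] = df[a] / df[b]
    return out

def add_powers(df):
    """Item 2: squares and square roots of every feature."""
    out = df.copy()
    for c in df.columns:
        out[c + '**2'] = df[c] ** 2
        out[c + '**1/2'] = df[c] ** 0.5
    return out

# Item 3: sklearn's PolynomialFeatures generates all degree-2 terms at once
# (this variant did not improve the online score).
poly = PolynomialFeatures(degree=2, include_bias=False)
# X_poly = poly.fit_transform(df.fillna(df.mean()))
```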
10 |
11 | ##### Handling missing values
12 | 1. Inspect the data distribution; for features whose distribution is not long-tailed, fill missing values with the mean or median.
13 | 2. Treat the feature with missing values as a label and fill it semi-supervised with the Label Propagation algorithm (see the sketch after this list).
14 | 3. Model-based imputation with GBDT and the like is avoided because, for features with many missing values (40%-75%), there is no guarantee the imputed values follow the same distribution as the observed ones.
15 | 4. Drop features with more than 75% missing values.
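
A minimal sketch of item 2, assuming a DataFrame `df` whose column `col` is a discrete feature (e.g. an SNP genotype) with gaps, and `feature_cols` are complete columns used to propagate it; the function name and arguments are illustrative, not from the original code:

```
import numpy as np
from sklearn.semi_supervised import LabelPropagation

def fill_with_label_propagation(df, col, feature_cols):
    """Treat the incomplete column as a label and fill it semi-supervised."""
    X = df[feature_cols].fillna(df[feature_cols].mean()).values
    mask = df[col].isnull()
    y = df[col].fillna(-1).astype(int).values    # -1 marks the unlabeled (missing) rows
    lp = LabelPropagation()
    lp.fit(X, y)
    df.loc[mask, col] = lp.transduction_[mask.values]  # propagated labels fill the gaps
    return df
```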
16 |
17 | ##### Model selection
18 | It is easy to see that this problem has very little data, so stacking complex models would likely overfit. We therefore use a greedy method to select the best features. The basic framework is:
19 | ```
20 | if Choose_Best_Feature(now_feature) > check:   # the candidate just appended improves the CV score
21 |     check = Choose_Best_Feature(now_feature)   # keep it and raise the bar
22 | else:
23 |     now_feature.pop()                          # otherwise drop the candidate
24 |
25 |
26 | # rank features by a fitted tree model's feature importances
27 | def get_pic(model, feature_name):
28 |     ans = DF()
29 |     ans['name'] = feature_name
30 |     ans['score'] = model.feature_importances_
31 |     # print(ans[ans['score']>0].shape)
32 |     return ans.sort_values(by=['score'],ascending=False).reset_index(drop=True)
33 |
34 | nums = 45
35 | feature_name1 = train_data[feature_name].columns
36 | get_ans_face = list(set(get_pic(lgb_model,feature_name1).head(nums)['name'])|set(get_pic(xgb_model,feature_name1).head(nums)['name'])|set(get_pic(gbc_model,feature_name1).head(nums)['name']))
37 | # get_ans_face = list(set(get_pic(lgb_model,feature_name1).head(nums)['name'])&set(get_pic(xgb_model,feature_name1).head(nums)['name'])&set(get_pic(gbc_model,feature_name1).head(nums)['name']))
38 | # Train the three models first. The first approach takes the Top-K features by feature_importances_ from each of the three models and uses their union; the second approach uses their intersection.
39 | ```
40 | Empirically, the first approach (union) needs a smaller `nums`, while the second approach (intersection) needs a larger `nums`. The stronger features selected this way are then fed into the greedy selection described above to obtain a good feature set. Inside `Choose_Best_Feature`, the author uses the mean CV score of the three models `Xgboost`, `Lightgbm` and `GBDT` to measure the effect of adding a new feature, which helps keep the offline and online scores moving in the same direction.
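
In the accompanying notebook this measure is implemented as `find_best_feature` (shown here lightly condensed): it averages the F1 cross-validation scores of the three models over the candidate feature set, with `cv` being sklearn's `cross_val_score` and the GBDT copy trained on a `fillna(7)` version of the data:

```
def find_best_feature(feature_name, cv_fold):
    get_ans_face = feature_name

    new_lgb_model = lgb.LGBMClassifier(objective='binary', n_estimators=300, max_depth=3,
                                       min_child_samples=6, learning_rate=0.102, random_state=1)
    m1 = cv(new_lgb_model, train_data[get_ans_face], train_label, cv=cv_fold, scoring='f1').mean()

    new_xgb_model1 = xgb.XGBClassifier(objective='binary:logistic', n_estimators=300, max_depth=4,
                                       learning_rate=0.101, random_state=1)
    m2 = cv(new_xgb_model1, train_data[get_ans_face].values, train_label, cv=cv_fold, scoring='f1').mean()

    new_gbc_model = GBC(n_estimators=310, subsample=1, min_samples_split=2, max_depth=3,
                        learning_rate=0.1900, min_weight_fraction_leaf=0.1)
    filled = train_data[get_ans_face].fillna(7)          # GBDT cannot handle NaN directly
    m3 = cv(new_gbc_model, filled, train_label, cv=cv_fold, scoring='f1').mean()

    return (m1 + m2 + m3) / 3   # Choose_Best_Feature: mean CV score of the three models
```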
41 |
42 | ```
43 | def get_model(nums,cv_fold):
44 | feature_name1 = train_data[feature_name].columns
45 | get_ans_face = list(set(get_pic(gbc_model,feature_name1).head(nums)['name'])&set(get_pic(xgb_model,feature_name1).head(nums)['name'])&set(get_pic(lgb_model,feature_name1).head(nums)['name']))
46 | print('New Feature: ',len(get_ans_face))
47 | new_lgb_model = lgb.LGBMClassifier(objective='binary',n_estimators=300,max_depth=3,min_child_samples=6,learning_rate=0.102,random_state=1)
48 | cv_model = cv(new_lgb_model, train_data[get_ans_face], train_label, cv=cv_fold, scoring='f1')
49 | new_lgb_model.fit(train_data[get_ans_face], train_label)
50 | m1 = cv_model.mean()
51 |
52 | new_xgb_model1 = xgb.XGBClassifier(objective='binary:logistic',n_estimators=300,max_depth=4,learning_rate=0.101,random_state=1)
53 | cv_model = cv(new_xgb_model1, train_data[get_ans_face].values, train_label, cv=cv_fold, scoring='f1')
54 | new_xgb_model1.fit(train_data[get_ans_face].values, train_label)
55 | m2 = cv_model.mean()
56 |
57 | new_gbc_model = GBC(n_estimators=310,subsample=1,min_samples_split=2,max_depth=3,learning_rate=0.1900,min_weight_fraction_leaf=0.1)
58 | kkk = train_data[get_ans_face].fillna(7)
59 | cv_model = cv(new_gbc_model, kkk[get_ans_face], train_label, cv=cv_fold, scoring='f1')
60 | new_gbc_model.fit(kkk.fillna(7),train_label)
61 |
62 | m3 = cv_model.mean()
63 | print((m1+m2+m3)/3)
64 | pro1 = new_lgb_model.predict_proba(test_data[get_ans_face])
65 | pro2 = new_xgb_model1.predict_proba(test_data[get_ans_face].values)
66 | pro3 = new_gbc_model.predict_proba(test_data[get_ans_face].fillna(7).values)
67 | ans = (pro1+pro2+pro3)/3
68 | return ans
69 | ```
70 |
71 | For the final submission there is also a small trick worth noting. Feeding the selected feature set into the three tree models gives predictions Ans1, Ans2 and Ans3, as well as probabilities P1, P2 and P3. Majority-voting Ans1/Ans2/Ans3 gives Ans4; averaging P1/P2/P3 gives Ans5; a linear model that performs well offline, trained on the same feature set, gives Ans6. Voting over Ans4, Ans5 and Ans6 then gives Ans7, which in the author's experience works quite well.
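
A minimal sketch of this two-level blend, assuming `ans1`, `ans2`, `ans3` are the tree models' 0/1 predictions, `p1`, `p2`, `p3` their positive-class probabilities, and `ans6` the predictions of some linear model trained on the same feature set (the exact linear model is not specified here):

```
import numpy as np

def majority_vote(*preds):
    """Element-wise majority vote over several 0/1 prediction arrays."""
    return (np.vstack(preds).mean(axis=0) >= 0.5).astype(int)

def blend(ans1, ans2, ans3, p1, p2, p3, ans6):
    ans4 = majority_vote(ans1, ans2, ans3)           # Ans4: vote over the tree models' labels
    ans5 = ((p1 + p2 + p3) / 3 >= 0.5).astype(int)   # Ans5: threshold the averaged probabilities
    return majority_vote(ans4, ans5, ans6)           # Ans7: vote over Ans4, Ans5, Ans6
```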
72 |
73 |
74 | If you find these tricks worth borrowing, please leave a Star!
75 |
76 |
--------------------------------------------------------------------------------
/f_sample_20180204.csv:
--------------------------------------------------------------------------------
1 | 1
2 | 0
3 | 1
4 | 0
5 |
--------------------------------------------------------------------------------
/f_test_a_20180204.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luoda888/tianchi-diabetes-top12/37787bf145c8824e614d107cef118f50af37b214/f_test_a_20180204.csv
--------------------------------------------------------------------------------
/f_train_20180204.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luoda888/tianchi-diabetes-top12/37787bf145c8824e614d107cef118f50af37b214/f_train_20180204.csv
--------------------------------------------------------------------------------
/top12-baseline.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import numpy as np\n",
10 | "import pandas as pd\n",
11 | "from sklearn.preprocessing import MinMaxScaler\n",
12 | "from pandas import DataFrame as DF\n",
13 | "import xgboost as xgb\n",
14 | "import lightgbm as lgb\n",
15 | "from sklearn.svm import SVC\n",
16 | "from sklearn.ensemble import GradientBoostingClassifier as GBC"
17 | ]
18 | },
19 | {
20 | "cell_type": "code",
21 | "execution_count": 2,
22 | "metadata": {},
23 | "outputs": [],
24 | "source": [
25 | "import warnings\n",
26 | "warnings.filterwarnings(\"ignore\")\n",
27 | "from sklearn.cross_validation import cross_val_score as cv\n",
28 | "train = pd.read_csv('f_train_20180204.csv',encoding='gbk')\n",
29 | "test = pd.read_csv('f_test_a_20180204.csv',encoding='gbk')"
30 | ]
31 | },
32 | {
33 | "cell_type": "code",
34 | "execution_count": 3,
35 | "metadata": {},
36 | "outputs": [],
37 | "source": [
38 | "del train['id']\n",
39 | "del test['id']\n",
40 | "feature_name = [i for i in train.columns if i!='label']"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": 4,
46 | "metadata": {},
47 | "outputs": [],
48 | "source": [
49 | "def get_model(nums,cv_fold):\n",
50 | " feature_name1 = train_data[feature_name].columns\n",
51 | " get_ans_face = list(set(get_pic(gbc_model,feature_name1).head(nums)['name'])&set(get_pic(xgb_model,feature_name1).head(nums)['name'])&set(get_pic(lgb_model,feature_name1).head(nums)['name']))\n",
52 | " print('New Feature: ',len(get_ans_face))\n",
53 | " if 'SNP32*SNP34' not in get_ans_face:\n",
54 | " get_ans_face.append('SNP32*SNP34')\n",
55 | " print('New Feature: ',len(get_ans_face))\n",
56 | " new_lgb_model = lgb.LGBMClassifier(objective='binary',n_estimators=300,max_depth=3,min_child_samples=6,learning_rate=0.102,random_state=1)\n",
57 | " cv_model = cv(new_lgb_model, train_data[get_ans_face], train_label, cv=cv_fold, scoring='f1')\n",
58 | " new_lgb_model.fit(train_data[get_ans_face], train_label)\n",
59 | " m1 = cv_model.mean()\n",
60 | "\n",
61 | " new_xgb_model1 = xgb.XGBClassifier(objective='binary:logistic',n_estimators=300,max_depth=4,learning_rate=0.101,random_state=1)\n",
62 | " cv_model = cv(new_xgb_model1, train_data[get_ans_face].values, train_label, cv=cv_fold, scoring='f1')\n",
63 | " new_xgb_model1.fit(train_data[get_ans_face].values, train_label)\n",
64 | " m2 = cv_model.mean()\n",
65 | "\n",
66 | " new_gbc_model = GBC(n_estimators=310,subsample=1,min_samples_split=2,max_depth=3,learning_rate=0.1900,min_weight_fraction_leaf=0.1)\n",
67 | " kkk = train_data[get_ans_face].fillna(7)\n",
68 | " cv_model = cv(new_gbc_model, kkk[get_ans_face], train_label, cv=cv_fold, scoring='f1')\n",
69 | " new_gbc_model.fit(kkk.fillna(7),train_label)\n",
70 | "\n",
71 | " m3 = cv_model.mean()\n",
72 | " print((m1+m2+m3)/3)\n",
73 | " pro1 = new_lgb_model.predict_proba(test_data[get_ans_face])\n",
74 | " pro2 = new_xgb_model1.predict_proba(test_data[get_ans_face].values)\n",
75 | " pro3 = new_gbc_model.predict_proba(test_data[get_ans_face].fillna(7).values)\n",
76 | " ans = (pro1+pro2+pro3)/3\n",
77 | " return ans\n",
78 | " \n",
79 | "# temp = [140,160,180,200,220,240,260,280,300,320]\n",
80 | "\n",
81 | "# ans = []\n",
82 | "# for i in range(len(temp)):\n",
83 | "# print('Now All Feature:',temp[i])\n",
84 | "# ans = get_model(temp[i],5)\n",
85 | "# if i == 0:\n",
86 | "# ans1 = ans\n",
87 | "# else:\n",
88 | "# ans1 += ans\n",
89 | "# ans1 /= len(temp)\n",
90 | "\n",
91 | "def find_best_feature(feature_name,cv_fold):\n",
92 | " get_ans_face = feature_name\n",
93 | " new_lgb_model = lgb.LGBMClassifier(objective='binary',n_estimators=300,max_depth=3,min_child_samples=6,learning_rate=0.102,random_state=1)\n",
94 | " cv_model = cv(new_lgb_model, train_data[get_ans_face], train_label, cv=cv_fold, scoring='f1')\n",
95 | " new_lgb_model.fit(train_data[get_ans_face], train_label)\n",
96 | " m1 = cv_model.mean()\n",
97 | "\n",
98 | " new_xgb_model1 = xgb.XGBClassifier(objective='binary:logistic',n_estimators=300,max_depth=4,learning_rate=0.101,random_state=1)\n",
99 | " cv_model = cv(new_xgb_model1, train_data[get_ans_face].values, train_label, cv=cv_fold, scoring='f1')\n",
100 | " new_xgb_model1.fit(train_data[get_ans_face].values, train_label)\n",
101 | " m2 = cv_model.mean()\n",
102 | "\n",
103 | " new_gbc_model = GBC(n_estimators=310,subsample=1,min_samples_split=2,max_depth=3,learning_rate=0.1900,min_weight_fraction_leaf=0.1)\n",
104 | " kkk = train_data[get_ans_face].fillna(7)\n",
105 | " cv_model = cv(new_gbc_model, kkk[get_ans_face], train_label, cv=cv_fold, scoring='f1')\n",
106 | " new_gbc_model.fit(kkk.fillna(7),train_label)\n",
107 | " m3 = cv_model.mean()\n",
108 | " return (m1+m2+m3)/3\n",
109 | "\n",
110 | "def train_best_feature(feature_name):\n",
111 | " get_ans_face = feature_name\n",
112 | " new_lgb_model = lgb.LGBMClassifier(objective='binary',n_estimators=300,max_depth=3,min_child_samples=6,learning_rate=0.102,random_state=1)\n",
113 | " new_lgb_model.fit(train_data[get_ans_face], train_label)\n",
114 | "\n",
115 | " new_xgb_model1 = xgb.XGBClassifier(objective='binary:logistic',n_estimators=300,max_depth=4,learning_rate=0.101,random_state=1)\n",
116 | " new_xgb_model1.fit(train_data[get_ans_face].values, train_label)\n",
117 | "\n",
118 | " new_gbc_model = GBC(n_estimators=310,subsample=1,min_samples_split=2,max_depth=3,learning_rate=0.1900,min_weight_fraction_leaf=0.1)\n",
119 | " kkk = train_data[get_ans_face].fillna(7)\n",
120 | " new_gbc_model.fit(kkk.fillna(7),train_label)\n",
121 | " \n",
122 | " pro1 = new_lgb_model.predict_proba(test_data[get_ans_face])\n",
123 | " pro2 = new_xgb_model1.predict_proba(test_data[get_ans_face].values)\n",
124 | " pro3 = new_gbc_model.predict_proba(test_data[get_ans_face].fillna(7).values)\n",
125 | " ans = (pro1+pro2+pro3)/3\n",
126 | "\n",
127 | " return ans"
128 | ]
129 | },
130 | {
131 | "cell_type": "code",
132 | "execution_count": 5,
133 | "metadata": {},
134 | "outputs": [],
135 | "source": [
136 | "train_data = pd.concat([train],axis=0)\n",
137 | "train_label = train_data['label']\n",
138 | "del train_data['label']\n",
139 | "test_data = test[feature_name]"
140 | ]
141 | },
142 | {
143 | "cell_type": "code",
144 | "execution_count": 6,
145 | "metadata": {},
146 | "outputs": [],
147 | "source": [
148 | "feature_SNP = [i for i in feature_name if 'SNP' in i]\n",
149 | "feature_no_SNP = list(set(feature_name)-set(feature_SNP))\n",
150 | "train_no_SNP_mean = train.describe().T[['mean','min','max']].T[feature_no_SNP]\n",
151 | "train_no_SNP = train[feature_no_SNP]\n",
152 | "train_SNP = train[feature_SNP]\n",
153 | "test_no_SNP_mean = test.describe().T[['mean','min','max']].T[feature_no_SNP]\n",
154 | "test_SNP = test[feature_SNP]\n",
155 | "test_no_SNP = test[feature_no_SNP]"
156 | ]
157 | },
158 | {
159 | "cell_type": "code",
160 | "execution_count": 7,
161 | "metadata": {},
162 | "outputs": [],
163 | "source": [
164 | "train_no_SNP.to_csv('train_no_SNP.csv',index=False)\n",
165 | "test_no_SNP.to_csv('test_no_SNP.csv',index=False)\n",
166 | "train_SNP.to_csv('train_SNP.csv',index=False)\n",
167 | "test_SNP.to_csv('test_SNP.csv',index=False)"
168 | ]
169 | },
170 | {
171 | "cell_type": "code",
172 | "execution_count": 8,
173 | "metadata": {},
174 | "outputs": [],
175 | "source": [
176 | "def get_division_feature(data,feature_name):\n",
177 | " new_feature = []\n",
178 | " new_feature_name = []\n",
179 | " for i in range(len(data[feature_name].columns)-1):\n",
180 | " for j in range(i+1,len(data[feature_name].columns)):\n",
181 | " new_feature_name.append(data[feature_name].columns[i] + '/' + data[feature_name].columns[j])\n",
182 | " new_feature_name.append(data[feature_name].columns[i] + '*' + data[feature_name].columns[j])\n",
183 | " new_feature_name.append(data[feature_name].columns[i] + '+' + data[feature_name].columns[j])\n",
184 | " new_feature_name.append(data[feature_name].columns[i] + '-' + data[feature_name].columns[j])\n",
185 | " new_feature.append(data[data[feature_name].columns[i]]/data[data[feature_name].columns[j]])\n",
186 | " new_feature.append(data[data[feature_name].columns[i]]*data[data[feature_name].columns[j]])\n",
187 | " new_feature.append(data[data[feature_name].columns[i]]+data[data[feature_name].columns[j]])\n",
188 | " new_feature.append(data[data[feature_name].columns[i]]-data[data[feature_name].columns[j]])\n",
189 | " \n",
190 | " \n",
191 | " temp_data = DF(pd.concat(new_feature,axis=1))\n",
192 | " temp_data.columns = new_feature_name\n",
193 | " data = pd.concat([data,temp_data],axis=1).reset_index(drop=True)\n",
194 | " \n",
195 | " print(data.shape)\n",
196 | " \n",
197 | " return data.reset_index(drop=True)\n",
198 | "\n",
199 | "def get_square_feature(data,feature_name):\n",
200 | " new_feature = []\n",
201 | " new_feature_name = []\n",
202 | " for i in range(len(data[feature_name].columns)):\n",
203 | " new_feature_name.append(data[feature_name].columns[i] + '**2')\n",
204 | " new_feature_name.append(data[feature_name].columns[i] + '**1/2')\n",
205 | " new_feature.append(data[data[feature_name].columns[i]]**2)\n",
206 | " new_feature.append(data[data[feature_name].columns[i]]**(1/2))\n",
207 | " \n",
208 | " temp_data = DF(pd.concat(new_feature,axis=1))\n",
209 | " temp_data.columns = new_feature_name\n",
210 | " data = pd.concat([data,temp_data],axis=1).reset_index(drop=True)\n",
211 | " \n",
212 | " print(data.shape)\n",
213 | " \n",
214 | " return data.reset_index(drop=True)"
215 | ]
216 | },
217 | {
218 | "cell_type": "code",
219 | "execution_count": 9,
220 | "metadata": {},
221 | "outputs": [
222 | {
223 | "name": "stdout",
224 | "output_type": "stream",
225 | "text": [
226 | "(1000, 56)\n",
227 | "(200, 56)\n",
228 | "(1000, 5995)\n",
229 | "(1000, 1540)\n",
230 | "(200, 5995)\n",
231 | "(200, 1540)\n",
232 | "7591\n",
233 | "(1000, 7591)\n",
234 | "(200, 7591)\n"
235 | ]
236 | }
237 | ],
238 | "source": [
239 | "train_data = get_square_feature(train_no_SNP,feature_no_SNP)\n",
240 | "test_data = get_square_feature(test_no_SNP,feature_no_SNP)\n",
241 | "\n",
242 | "train_data_SNP = get_division_feature(train_SNP,train_SNP.columns)\n",
243 | "train_data_no_SNP = get_division_feature(train_no_SNP,train_no_SNP.columns)\n",
244 | "train_data = pd.concat([train_data_SNP,train_data_no_SNP,train_data],axis=1)\n",
245 | "test_data_SNP = get_division_feature(test_SNP,test_SNP.columns)\n",
246 | "test_data_no_SNP = get_division_feature(test_no_SNP,test_no_SNP.columns)\n",
247 | "test_data = pd.concat([test_data_SNP,test_data_no_SNP,test_data],axis=1)\n",
248 | "\n",
249 | "feature_name = [i for i in train_data.columns if i!='label']\n",
250 | "print(len(train_data.columns))\n",
251 | "print(train_data.shape)\n",
252 | "print(test_data.shape)"
253 | ]
254 | },
255 | {
256 | "cell_type": "code",
257 | "execution_count": null,
258 | "metadata": {
259 | "scrolled": true
260 | },
261 | "outputs": [],
262 | "source": []
263 | },
264 | {
265 | "cell_type": "raw",
266 | "metadata": {},
267 | "source": []
268 | },
269 | {
270 | "cell_type": "code",
271 | "execution_count": 10,
272 | "metadata": {
273 | "scrolled": true
274 | },
275 | "outputs": [
276 | {
277 | "data": {
278 | "text/plain": [
279 | "LGBMClassifier(boosting_type='gbdt', colsample_bytree=1.0, learning_rate=0.1,\n",
280 | " max_bin=255, max_depth=-1, min_child_samples=20,\n",
281 | " min_child_weight=0.001, min_split_gain=0.0, n_estimators=120,\n",
282 | " n_jobs=-1, nthread=4, num_leaves=31, objective='binary',\n",
283 | " random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,\n",
284 | " subsample=0.9, subsample_for_bin=200000, subsample_freq=1)"
285 | ]
286 | },
287 | "execution_count": 10,
288 | "metadata": {},
289 | "output_type": "execute_result"
290 | }
291 | ],
292 | "source": [
293 | "lgb_model = lgb.LGBMClassifier(objective='binary',n_estimators=120,subsample=0.9,nthread=4)\n",
294 | "# cv_model = cv(lgb_model, train_data[feature_name], train_label, cv=10, scoring='f1')\n",
295 | "lgb_model.fit(train_data[feature_name], train_label)\n",
296 | "# print(cv_model)\n",
297 | "# print(cv_model.mean())\n",
298 | "\n",
299 | "# mean 0.650 166 feature\n",
300 | "# mean 0.650 6900 feature\n",
301 | "# median 0.648"
302 | ]
303 | },
304 | {
305 | "cell_type": "code",
306 | "execution_count": 11,
307 | "metadata": {},
308 | "outputs": [],
309 | "source": [
310 | "def get_pic(model,feature_name):\n",
311 | " ans = DF()\n",
312 | " ans['name'] = feature_name\n",
313 | " ans['score'] = model.feature_importances_\n",
314 | "# print(ans[ans['score']>0].shape)\n",
315 | " return ans.sort_values(by=['score'],ascending=False).reset_index(drop=True)"
316 | ]
317 | },
318 | {
319 | "cell_type": "code",
320 | "execution_count": 12,
321 | "metadata": {},
322 | "outputs": [
323 | {
324 | "data": {
325 | "text/plain": [
326 | "XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,\n",
327 | " colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,\n",
328 | " max_depth=3, min_child_weight=1, missing=None, n_estimators=120,\n",
329 | " n_jobs=1, nthread=4, objective='binary:logistic', random_state=0,\n",
330 | " reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,\n",
331 | " silent=True, subsample=0.9)"
332 | ]
333 | },
334 | "execution_count": 12,
335 | "metadata": {},
336 | "output_type": "execute_result"
337 | }
338 | ],
339 | "source": [
340 | "xgb_model = xgb.XGBClassifier(objective='binary:logistic',n_estimators=120,subsample=0.9,nthread=4)\n",
341 | "# cv_model = cv(xgb_model, train_data[feature_name].values, train_label, cv=10, scoring='f1')\n",
342 | "xgb_model.fit(train_data[feature_name].values, train_label)\n",
343 | "# print(cv_model)\n",
344 | "# print(cv_model.mean())\n",
345 | "\n",
346 | "# mean 166 632\n",
347 | "# median 0.657"
348 | ]
349 | },
350 | {
351 | "cell_type": "code",
352 | "execution_count": 13,
353 | "metadata": {},
354 | "outputs": [
355 | {
356 | "data": {
357 | "text/plain": [
358 | "GradientBoostingClassifier(criterion='friedman_mse', init=None,\n",
359 | " learning_rate=0.1, loss='deviance', max_depth=3,\n",
360 | " max_features=None, max_leaf_nodes=None,\n",
361 | " min_impurity_decrease=0.0, min_impurity_split=None,\n",
362 | " min_samples_leaf=1, min_samples_split=2,\n",
363 | " min_weight_fraction_leaf=0.0, n_estimators=200,\n",
364 | " presort='auto', random_state=None, subsample=0.9, verbose=0,\n",
365 | " warm_start=False)"
366 | ]
367 | },
368 | "execution_count": 13,
369 | "metadata": {},
370 | "output_type": "execute_result"
371 | }
372 | ],
373 | "source": [
374 | "gbc_model = GBC(n_estimators=200,subsample=0.9,min_samples_split=2)\n",
375 | "kkk = train_data[feature_name].fillna(7)\n",
376 | "kkk.replace(np.inf,999,inplace=True)\n",
377 | "# cv_model = cv(gbc_model, kkk[feature_name], train_label, 1cv=10, scoring='f1')\n",
378 | "gbc_model.fit(kkk.fillna(7),train_label)\n",
379 | "# print(cv_model)\n",
380 | "# print(cv_model.mean())\n",
381 | "\n",
382 | "# mean 0.653\n",
383 | "# median 0.664"
384 | ]
385 | },
386 | {
387 | "cell_type": "code",
388 | "execution_count": 14,
389 | "metadata": {},
390 | "outputs": [
391 | {
392 | "data": {
393 | "text/plain": [
394 | "7591"
395 | ]
396 | },
397 | "execution_count": 14,
398 | "metadata": {},
399 | "output_type": "execute_result"
400 | }
401 | ],
402 | "source": [
403 | "len(feature_name)"
404 | ]
405 | },
406 | {
407 | "cell_type": "code",
408 | "execution_count": 21,
409 | "metadata": {},
410 | "outputs": [
411 | {
412 | "name": "stdout",
413 | "output_type": "stream",
414 | "text": [
415 | "New Feature: 96\n"
416 | ]
417 | }
418 | ],
419 | "source": [
420 | "nums = 45\n",
421 | "feature_name1 = train_data[feature_name].columns\n",
422 | "get_ans_face = list(set(get_pic(lgb_model,feature_name1).head(nums)['name'])|set(get_pic(xgb_model,feature_name1).head(nums)['name'])|set(get_pic(gbc_model,feature_name1).head(nums)['name']))\n",
423 | "print('New Feature: ',len(get_ans_face))\n",
424 | "\n",
425 | "# 320 0.739"
426 | ]
427 | },
428 | {
429 | "cell_type": "code",
430 | "execution_count": null,
431 | "metadata": {},
432 | "outputs": [],
433 | "source": []
434 | },
435 | {
436 | "cell_type": "raw",
437 | "metadata": {},
438 | "source": [
439 | "在nums = 400的时候 能够达到0.739"
440 | ]
441 | },
442 | {
443 | "cell_type": "code",
444 | "execution_count": 22,
445 | "metadata": {},
446 | "outputs": [
447 | {
448 | "name": "stdout",
449 | "output_type": "stream",
450 | "text": [
451 | "目前特征长度为 1 目前帅气的cv值是 0.386775127677 成功加入第 1 个 增值为 0.386775127677\n",
452 | "目前特征长度为 2 目前帅气的cv值是 0.51616384038 成功加入第 2 个 增值为 0.129388712703\n",
453 | "目前特征长度为 3 目前帅气的cv值是 0.527535265985 成功加入第 3 个 增值为 0.011371425605\n",
454 | "目前特征长度为 4 目前帅气的cv值是 0.563174983085 成功加入第 4 个 增值为 0.0356397171\n",
455 | "目前特征长度为 5 目前帅气的cv值是 0.57436190063 成功加入第 5 个 增值为 0.0111869175454\n",
456 | "目前特征长度为 6 目前帅气的cv值是 0.586587422568 成功加入第 7 个 增值为 0.0122255219373\n",
457 | "目前特征长度为 7 目前帅气的cv值是 0.593785226558 成功加入第 12 个 增值为 0.00719780399015\n",
458 | "目前特征长度为 8 目前帅气的cv值是 0.608606465091 成功加入第 14 个 增值为 0.0148212385332\n",
459 | "目前特征长度为 9 目前帅气的cv值是 0.609209748232 成功加入第 17 个 增值为 0.000603283141013\n",
460 | "目前特征长度为 10 目前帅气的cv值是 0.620925798111 成功加入第 18 个 增值为 0.011716049879\n",
461 | "目前特征长度为 11 目前帅气的cv值是 0.634570115268 成功加入第 19 个 增值为 0.0136443171573\n",
462 | "目前特征长度为 12 目前帅气的cv值是 0.688309863978 成功加入第 20 个 增值为 0.0537397487097\n",
463 | "目前特征长度为 13 目前帅气的cv值是 0.689758693609 成功加入第 21 个 增值为 0.00144882963117\n",
464 | "目前特征长度为 14 目前帅气的cv值是 0.692031700018 成功加入第 22 个 增值为 0.00227300640844\n",
465 | "目前特征长度为 15 目前帅气的cv值是 0.70464125809 成功加入第 24 个 增值为 0.0126095580718\n",
466 | "目前特征长度为 16 目前帅气的cv值是 0.707376667537 成功加入第 25 个 增值为 0.00273540944779\n",
467 | "目前特征长度为 17 目前帅气的cv值是 0.707770917495 成功加入第 27 个 增值为 0.000394249957276\n",
468 | "目前特征长度为 18 目前帅气的cv值是 0.71005231562 成功加入第 28 个 增值为 0.00228139812537\n",
469 | "目前特征长度为 19 目前帅气的cv值是 0.712136621888 成功加入第 31 个 增值为 0.00208430626829\n",
470 | "目前特征长度为 20 目前帅气的cv值是 0.718013110585 成功加入第 32 个 增值为 0.00587648869632\n",
471 | "目前特征长度为 21 目前帅气的cv值是 0.718307792721 成功加入第 37 个 增值为 0.000294682136085\n",
472 | "目前特征长度为 22 目前帅气的cv值是 0.719082461863 成功加入第 38 个 增值为 0.000774669142242\n",
473 | "目前特征长度为 23 目前帅气的cv值是 0.721935152094 成功加入第 41 个 增值为 0.00285269023075\n",
474 | "目前特征长度为 24 目前帅气的cv值是 0.725049329819 成功加入第 44 个 增值为 0.00311417772499\n",
475 | "目前特征长度为 25 目前帅气的cv值是 0.72606671688 成功加入第 51 个 增值为 0.00101738706101\n",
476 | "目前特征长度为 26 目前帅气的cv值是 0.729606229912 成功加入第 53 个 增值为 0.00353951303223\n",
477 | "目前特征长度为 27 目前帅气的cv值是 0.729661495167 成功加入第 61 个 增值为 5.52652553621e-05\n",
478 | "目前特征长度为 28 目前帅气的cv值是 0.730213956901 成功加入第 62 个 增值为 0.000552461733845\n",
479 | "目前特征长度为 29 目前帅气的cv值是 0.734158746716 成功加入第 66 个 增值为 0.00394478981494\n"
480 | ]
481 | }
482 | ],
483 | "source": [
484 | "now_feature = []\n",
485 | "check = 0\n",
486 | "for i in range(len(get_ans_face)):\n",
487 | " now_feature.append(get_ans_face[i])\n",
488 | " jj = find_best_feature(now_feature,6)\n",
489 | " if jj>check:\n",
490 | " print('目前特征长度为',len(now_feature),' 目前帅气的cv值是',jj,' 成功加入第',i+1,'个','增值为',jj-check)\n",
491 | " check = jj\n",
492 | " else:\n",
493 | " now_feature.pop()\n",
494 | "# print('目前特征长度为',len(now_feature),'第',i+1,'个拉闸了')"
495 | ]
496 | },
497 | {
498 | "cell_type": "code",
499 | "execution_count": 17,
500 | "metadata": {},
501 | "outputs": [
502 | {
503 | "data": {
504 | "text/plain": [
505 | "['VAR00007*DM家族史',\n",
506 | " 'AST-hsCRP',\n",
507 | " '分娩时/wbc',\n",
508 | " 'wbc-LDLC',\n",
509 | " 'SNP32*SNP33',\n",
510 | " 'SNP20*SNP34',\n",
511 | " 'ApoB/BUN',\n",
512 | " 'SNP37/SNP53',\n",
513 | " 'SNP22/SNP34',\n",
514 | " 'VAR00007',\n",
515 | " 'hsCRP+年龄',\n",
516 | " '年龄+LDLC',\n",
517 | " 'VAR00007*年龄',\n",
518 | " 'SNP39*SNP47',\n",
519 | " 'hsCRP-LDLC',\n",
520 | " 'TG*年龄',\n",
521 | " '孕次/Lpa',\n",
522 | " 'SNP46/SNP47',\n",
523 | " 'SNP26*SNP48',\n",
524 | " 'wbc-年龄',\n",
525 | " '孕前BMI/Cr',\n",
526 | " 'VAR00007*糖筛孕周',\n",
527 | " 'SNP16/SNP34',\n",
528 | " '舒张压/ApoA1',\n",
529 | " 'BUN/DM家族史',\n",
530 | " '孕前体重-RBP4',\n",
531 | " 'SNP45*SNP46',\n",
532 | " 'SNP36*SNP49',\n",
533 | " 'SNP11*SNP15',\n",
534 | " 'SNP33/SNP46',\n",
535 | " 'TG+wbc',\n",
536 | " 'HDLC/wbc',\n",
537 | " 'TG*ALT']"
538 | ]
539 | },
540 | "execution_count": 17,
541 | "metadata": {},
542 | "output_type": "execute_result"
543 | }
544 | ],
545 | "source": [
546 | "now_feature"
547 | ]
548 | },
549 | {
550 | "cell_type": "markdown",
551 | "metadata": {},
552 | "source": [
553 | "First 1"
554 | ]
555 | },
556 | {
557 | "cell_type": "code",
558 | "execution_count": null,
559 | "metadata": {},
560 | "outputs": [],
561 | "source": []
562 | },
563 | {
564 | "cell_type": "code",
565 | "execution_count": null,
566 | "metadata": {},
567 | "outputs": [],
568 | "source": []
569 | },
570 | {
571 | "cell_type": "code",
572 | "execution_count": 23,
573 | "metadata": {},
574 | "outputs": [
575 | {
576 | "name": "stdout",
577 | "output_type": "stream",
578 | "text": [
579 | "目前特征长度为 1 目前帅气的cv值是 0.529126383031 成功加入第 1 个 增值为 0.529126383031\n",
580 | "目前特征长度为 2 目前帅气的cv值是 0.541969584999 成功加入第 2 个 增值为 0.0128432019677\n",
581 | "目前特征长度为 3 目前帅气的cv值是 0.568554327993 成功加入第 3 个 增值为 0.0265847429934\n",
582 | "目前特征长度为 4 目前帅气的cv值是 0.57652332479 成功加入第 11 个 增值为 0.0079689967979\n",
583 | "目前特征长度为 5 目前帅气的cv值是 0.59432805946 成功加入第 12 个 增值为 0.0178047346692\n",
584 | "目前特征长度为 6 目前帅气的cv值是 0.594995772882 成功加入第 14 个 增值为 0.000667713422274\n",
585 | "目前特征长度为 7 目前帅气的cv值是 0.601384057634 成功加入第 18 个 增值为 0.00638828475201\n",
586 | "目前特征长度为 8 目前帅气的cv值是 0.621135701011 成功加入第 20 个 增值为 0.0197516433766\n",
587 | "目前特征长度为 9 目前帅气的cv值是 0.659833147249 成功加入第 24 个 增值为 0.0386974462387\n",
588 | "目前特征长度为 10 目前帅气的cv值是 0.678642746028 成功加入第 25 个 增值为 0.0188095987783\n",
589 | "目前特征长度为 11 目前帅气的cv值是 0.685003138629 成功加入第 31 个 增值为 0.00636039260157\n",
590 | "目前特征长度为 12 目前帅气的cv值是 0.686918440568 成功加入第 33 个 增值为 0.00191530193904\n",
591 | "目前特征长度为 13 目前帅气的cv值是 0.689605039799 成功加入第 44 个 增值为 0.00268659923099\n",
592 | "目前特征长度为 14 目前帅气的cv值是 0.691387941235 成功加入第 53 个 增值为 0.00178290143601\n",
593 | "目前特征长度为 15 目前帅气的cv值是 0.699233952221 成功加入第 54 个 增值为 0.00784601098582\n",
594 | "目前特征长度为 16 目前帅气的cv值是 0.709950933425 成功加入第 55 个 增值为 0.0107169812039\n",
595 | "目前特征长度为 17 目前帅气的cv值是 0.713050765167 成功加入第 56 个 增值为 0.00309983174182\n",
596 | "目前特征长度为 18 目前帅气的cv值是 0.714504690354 成功加入第 62 个 增值为 0.0014539251878\n",
597 | "目前特征长度为 19 目前帅气的cv值是 0.715334590971 成功加入第 67 个 增值为 0.000829900616473\n",
598 | "目前特征长度为 20 目前帅气的cv值是 0.725797483685 成功加入第 70 个 增值为 0.0104628927137\n",
599 | "目前特征长度为 21 目前帅气的cv值是 0.72841012063 成功加入第 87 个 增值为 0.0026126369455\n",
600 | "目前特征长度为 22 目前帅气的cv值是 0.731058233319 成功加入第 92 个 增值为 0.00264811268908\n"
601 | ]
602 | }
603 | ],
604 | "source": [
605 | "now_feature2 = []\n",
606 | "check = 0\n",
607 | "for i in range(len(get_ans_face)):\n",
608 | " now_feature2.append(get_ans_face[len(get_ans_face)-i-1])\n",
609 | " jj = find_best_feature(now_feature2,6)\n",
610 | " if jj>check:\n",
611 | " print('目前特征长度为',len(now_feature2),' 目前帅气的cv值是',jj,' 成功加入第',i+1,'个','增值为',jj-check)\n",
612 | " check = jj\n",
613 | " else:\n",
614 | " now_feature2.pop()\n",
615 | "# print('目前特征长度为',len(now_feature),'第',i+1,'个拉闸了')"
616 | ]
617 | },
618 | {
619 | "cell_type": "code",
620 | "execution_count": null,
621 | "metadata": {},
622 | "outputs": [],
623 | "source": [
624 | "def get_proba(ans):\n",
625 | " kfc = []\n",
626 | " tot0 = 0\n",
627 | " tot1 = 0\n",
628 | " for i in range(len(ans)):\n",
629 | " if ans[i][0]>0.5:\n",
630 | " kfc.append(0)\n",
631 | " tot0 += 1\n",
632 | " else:\n",
633 | " kfc.append(1)\n",
634 | " tot1 += 1\n",
635 | " print('1 = ',tot1,' ','0 =',tot0)\n",
636 | " return kfc\n",
637 | "# ans1 = get_proba(train_best_feature(now_feature_1))\n",
638 | "ans1 = get_proba(train_best_feature(now_feature))\n",
639 | "# ans3 = get_proba((train_best_feature(now_feature2)+train_best_feature(now_feature))/2)\n",
640 | "ans2 = get_proba(train_best_feature(now_feature2))"
641 | ]
642 | },
643 | {
644 | "cell_type": "code",
645 | "execution_count": null,
646 | "metadata": {},
647 | "outputs": [],
648 | "source": []
649 | },
650 | {
651 | "cell_type": "code",
652 | "execution_count": 409,
653 | "metadata": {},
654 | "outputs": [],
655 | "source": [
656 | "DF(ans).to_csv('真的不想做了.csv',header=False,index=False)"
657 | ]
658 | },
659 | {
660 | "cell_type": "code",
661 | "execution_count": null,
662 | "metadata": {},
663 | "outputs": [],
664 | "source": [
665 | "pro1 = lgb_model.predict_proba(test_data[feature_name])\n",
666 | "pro2 = xgb_model.predict_proba(test_data[feature_name].values)\n",
667 | "pro3 = gbc_model.predict_proba(test_data[feature_name].fillna(7).values)"
668 | ]
669 | },
670 | {
671 | "cell_type": "code",
672 | "execution_count": null,
673 | "metadata": {},
674 | "outputs": [],
675 | "source": [
676 | "lgb_ans = lgb_model.predict(test_data[feature_name])\n",
677 | "xgb_ans = xgb_model.predict(test_data[feature_name].values)\n",
678 | "gbc_ans = gbc_model.predict(test_data[feature_name].fillna(7.01))"
679 | ]
680 | },
681 | {
682 | "cell_type": "code",
683 | "execution_count": null,
684 | "metadata": {},
685 | "outputs": [],
686 | "source": []
687 | },
688 | {
689 | "cell_type": "code",
690 | "execution_count": 115,
691 | "metadata": {},
692 | "outputs": [
693 | {
694 | "name": "stdout",
695 | "output_type": "stream",
696 | "text": [
697 | "200\n"
698 | ]
699 | }
700 | ],
701 | "source": [
702 | "kfc = []\n",
703 | "for i in range(len(lgb_ans)):\n",
704 | " if (lgb_ans[i]==xgb_ans[i]):\n",
705 | " kfc.append(lgb_ans[i])\n",
706 | " elif (lgb_ans[i]==gbc_ans[i]):\n",
707 | " kfc.append(lgb_ans[i])\n",
708 | " elif (gbc_ans[i]==xgb_ans[i]):\n",
709 | " kfc.append(gbc_ans[i])\n",
710 | " else:\n",
711 | " kfc.append(gbc_ans[i])\n",
712 | " \n",
713 | "print(len(kfc))\n",
714 | "DF(kfc).to_csv('ans_fuck2.csv',index=False,header=False)"
715 | ]
716 | },
717 | {
718 | "cell_type": "code",
719 | "execution_count": null,
720 | "metadata": {},
721 | "outputs": [],
722 | "source": []
723 | },
724 | {
725 | "cell_type": "code",
726 | "execution_count": null,
727 | "metadata": {},
728 | "outputs": [],
729 | "source": []
730 | },
731 | {
732 | "cell_type": "code",
733 | "execution_count": null,
734 | "metadata": {},
735 | "outputs": [],
736 | "source": []
737 | },
738 | {
739 | "cell_type": "code",
740 | "execution_count": null,
741 | "metadata": {},
742 | "outputs": [],
743 | "source": []
744 | },
745 | {
746 | "cell_type": "code",
747 | "execution_count": null,
748 | "metadata": {},
749 | "outputs": [],
750 | "source": []
751 | }
752 | ],
753 | "metadata": {
754 | "kernelspec": {
755 | "display_name": "Python 3",
756 | "language": "python",
757 | "name": "python3"
758 | },
759 | "language_info": {
760 | "codemirror_mode": {
761 | "name": "ipython",
762 | "version": 3
763 | },
764 | "file_extension": ".py",
765 | "mimetype": "text/x-python",
766 | "name": "python",
767 | "nbconvert_exporter": "python",
768 | "pygments_lexer": "ipython3",
769 | "version": "3.6.3"
770 | }
771 | },
772 | "nbformat": 4,
773 | "nbformat_minor": 2
774 | }
775 |
--------------------------------------------------------------------------------