├── README.md
├── f_sample_20180204.csv
├── f_test_a_20180204.csv
├── f_train_20180204.csv
└── top12-baseline.ipynb

/README.md:
--------------------------------------------------------------------------------
1 | # Readme.md
2 | ### Tianchi Precision Medicine Competition - Diabetes Genetic Risk Prediction
3 | ##### Top-12 solution notes. Since the preliminary and final rounds differed greatly, only a few thoughts on the final round are given here, offered as a modest starting point.
4 | 
5 | #### Feature engineering
6 | ##### New feature construction
7 | 1. Construct pairwise arithmetic features (+, -, *, /) as feature interactions, motivated by interpretable gene antagonism and gene synergy (see the sketch after this list).
8 | 2. Construct numeric features from each single feature, such as squares, higher powers, and square roots.
9 | 3. Use a polynomial-features package to generate features (this did not perform well online).
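A minimal sketch of items 1-2 (pairwise arithmetic interactions plus per-feature powers and roots), assuming a numeric pandas DataFrame `df`; the function names below are illustrative, and the notebook implements the same idea in `get_division_feature` / `get_square_feature`:

```
import pandas as pd
from itertools import combinations

def add_interaction_features(df):
    """Append +, -, *, / interactions for every pair of columns."""
    new_cols = {}
    for a, b in combinations(df.columns, 2):
        new_cols[f'{a}+{b}'] = df[a] + df[b]
        new_cols[f'{a}-{b}'] = df[a] - df[b]
        new_cols[f'{a}*{b}'] = df[a] * df[b]
        new_cols[f'{a}/{b}'] = df[a] / df[b]   # may yield inf where b == 0
    return pd.concat([df, pd.DataFrame(new_cols, index=df.index)], axis=1)

def add_power_features(df):
    """Append the square and square root of every column."""
    new_cols = {}
    for c in df.columns:
        new_cols[f'{c}**2'] = df[c] ** 2
        new_cols[f'{c}**1/2'] = df[c] ** 0.5   # NaN for negative values
    return pd.concat([df, pd.DataFrame(new_cols, index=df.index)], axis=1)
```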
10 | 
11 | ##### Missing value handling
12 | 1. Inspect the data distribution; for features whose missing values do not sit in a long tail, fill with the mean or median.
13 | 2. Treat the feature with missing values as a label and use the Label Propagation algorithm to fill it in a semi-supervised way (see the sketch after this list).
14 | 3. GBDT-style models are not used for imputation because, for features with a large share of missing values (40%-75%), they cannot guarantee a consistent data distribution.
15 | 4. Features with more than 75% missing values are dropped.
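A minimal sketch of the semi-supervised imputation in item 2, assuming a pandas DataFrame `df`, a discrete (SNP-like) target column `col`, and predictor columns `feature_cols`; the kernel and parameter choices below are illustrative, not taken from the original code:

```
import pandas as pd
from sklearn.semi_supervised import LabelPropagation

def propagate_fill(df, col, feature_cols):
    """Fill missing values of a discrete column via Label Propagation."""
    X = df[feature_cols].fillna(df[feature_cols].median()).values  # predictors must be complete
    y = df[col].copy()
    mask = y.isna()
    y_enc = y.fillna(-1).astype(int).values   # scikit-learn treats -1 as "unlabeled"
    model = LabelPropagation(kernel='knn', n_neighbors=7)
    model.fit(X, y_enc)
    y.loc[mask] = model.transduction_[mask.values]  # propagated labels for the missing rows
    return y
```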
16 | 
17 | ##### Model selection
18 | It is easy to see that this dataset is small, so stacking complex models is likely to overfit. We therefore use a greedy method to select the best features; the basic framework is
19 | ```
20 | if Choose_Best_Feature(now_feature) < Choose_Best_Feature(now_feature + [new_feature]):
21 |     now_feature.append(new_feature)
22 | else:
23 |     pass  # the candidate feature does not improve the CV score, so it is discarded
24 | 
25 | # rank the features of a fitted model by importance
26 | def get_pic(model,feature_name):
27 |     ans = DF()
28 |     ans['name'] = feature_name
29 |     ans['score'] = model.feature_importances_
30 | 
31 |     # print(ans[ans['score']>0].shape)
32 |     return ans.sort_values(by=['score'],ascending=False).reset_index(drop=True)
33 | 
34 | nums = 45
35 | feature_name1 = train_data[feature_name].columns
36 | get_ans_face = list(set(get_pic(lgb_model,feature_name1).head(nums)['name'])|set(get_pic(xgb_model,feature_name1).head(nums)['name'])|set(get_pic(gbc_model,feature_name1).head(nums)['name']))
37 | # get_ans_face = list(set(get_pic(lgb_model,feature_name1).head(nums)['name'])&set(get_pic(xgb_model,feature_name1).head(nums)['name'])&set(get_pic(gbc_model,feature_name1).head(nums)['name']))
38 | # Train the three models first. The first approach takes the top-K feature_importances_ of each model and unions them; the second approach takes the intersection instead.
39 | ```
40 | Empirically, the first approach needs a smaller `nums`, while the second needs a larger one. The stronger features selected this way are then fed into the greedy method described above to pick a good feature subset. Inside `Choose_Best_Feature`, the author uses the mean CV score of the three models `Xgboost`, `Lightgbm`, and `GBDT` to measure the effect of adding `New_Feature`, which keeps the online and offline scores `moving in the same direction`.
41 | 
42 | ```
43 | def get_model(nums,cv_fold):
44 |     feature_name1 = train_data[feature_name].columns
45 |     get_ans_face = list(set(get_pic(gbc_model,feature_name1).head(nums)['name'])&set(get_pic(xgb_model,feature_name1).head(nums)['name'])&set(get_pic(lgb_model,feature_name1).head(nums)['name']))
46 |     print('New Feature: ',len(get_ans_face))
47 |     new_lgb_model = lgb.LGBMClassifier(objective='binary',n_estimators=300,max_depth=3,min_child_samples=6,learning_rate=0.102,random_state=1)
48 |     cv_model = cv(new_lgb_model, train_data[get_ans_face], train_label, cv=cv_fold, scoring='f1')
49 |     new_lgb_model.fit(train_data[get_ans_face], train_label)
50 |     m1 = cv_model.mean()
51 | 
52 |     new_xgb_model1 = xgb.XGBClassifier(objective='binary:logistic',n_estimators=300,max_depth=4,learning_rate=0.101,random_state=1)
53 |     cv_model = cv(new_xgb_model1, train_data[get_ans_face].values, train_label, cv=cv_fold, scoring='f1')
54 |     new_xgb_model1.fit(train_data[get_ans_face].values, train_label)
55 |     m2 = cv_model.mean()
56 | 
57 |     new_gbc_model = GBC(n_estimators=310,subsample=1,min_samples_split=2,max_depth=3,learning_rate=0.1900,min_weight_fraction_leaf=0.1)
58 |     kkk = train_data[get_ans_face].fillna(7)
59 |     cv_model = cv(new_gbc_model, kkk[get_ans_face], train_label, cv=cv_fold, scoring='f1')
60 |     new_gbc_model.fit(kkk.fillna(7),train_label)
61 | 
62 |     m3 = cv_model.mean()
63 |     print((m1+m2+m3)/3)
64 |     pro1 = new_lgb_model.predict_proba(test_data[get_ans_face])
65 |     pro2 = new_xgb_model1.predict_proba(test_data[get_ans_face].values)
66 |     pro3 = new_gbc_model.predict_proba(test_data[get_ans_face].fillna(7).values)
67 |     ans = (pro1+pro2+pro3)/3
68 |     return ans
69 | ```
70 | 
71 | There is also a small trick worth noting for the final submission. Feeding the selected feature subset into the three tree models yields predictions Ans1, Ans2, Ans3 as well as probabilities P1, P2, P3. Voting over Ans1/Ans2/Ans3 gives Ans4, fusing the probabilities P1/P2/P3 gives Ans5, and a linear model that performed well offline, trained on the same feature subset, gives Ans6. A final vote over Ans4, Ans5, and Ans6 produces Ans7, which in the author's experience works quite well.
72 | 
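A minimal sketch of that submission-time fusion, assuming the three fitted tree models from `get_model`, a fitted linear model, and a test matrix `test_X` are already in scope; every variable name here is illustrative rather than taken from the original code:

```
import numpy as np

tree_models = [new_lgb_model, new_xgb_model1, new_gbc_model]   # assumed already fitted

ans_hard = np.array([m.predict(test_X) for m in tree_models])            # Ans1..Ans3, shape (3, n)
probs = np.array([m.predict_proba(test_X)[:, 1] for m in tree_models])   # P1..P3

ans4 = (ans_hard.sum(axis=0) >= 2).astype(int)   # majority vote of Ans1/Ans2/Ans3
ans5 = (probs.mean(axis=0) > 0.5).astype(int)    # averaged probabilities, thresholded
ans6 = linear_model.predict(test_X)              # linear model that did well offline

ans7 = ((ans4 + ans5 + ans6) >= 2).astype(int)   # final vote over Ans4/Ans5/Ans6
```

With only three voters and a binary label, majority voting reduces to checking whether at least two of the three predict class 1.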
73 | 74 | 如果您觉得笔者的骚操作是可以借鉴的,那么请给个可爱的Star吧! 75 | 76 | -------------------------------------------------------------------------------- /f_sample_20180204.csv: -------------------------------------------------------------------------------- 1 | 1 2 | 0 3 | 1 4 | 0 5 | -------------------------------------------------------------------------------- /f_test_a_20180204.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luoda888/tianchi-diabetes-top12/37787bf145c8824e614d107cef118f50af37b214/f_test_a_20180204.csv -------------------------------------------------------------------------------- /f_train_20180204.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luoda888/tianchi-diabetes-top12/37787bf145c8824e614d107cef118f50af37b214/f_train_20180204.csv -------------------------------------------------------------------------------- /top12-baseline.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd\n", 11 | "from sklearn.preprocessing import MinMaxScaler\n", 12 | "from pandas import DataFrame as DF\n", 13 | "import xgboost as xgb\n", 14 | "import lightgbm as lgb\n", 15 | "from sklearn.svm import SVC\n", 16 | "from sklearn.ensemble import GradientBoostingClassifier as GBC" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 2, 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "import warnings\n", 26 | "warnings.filterwarnings(\"ignore\")\n", 27 | "from sklearn.cross_validation import cross_val_score as cv\n", 28 | "train = pd.read_csv('f_train_20180204.csv',encoding='gbk')\n", 29 | "test = pd.read_csv('f_test_a_20180204.csv',encoding='gbk')" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 3, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "del train['id']\n", 39 | "del test['id']\n", 40 | "feature_name = [i for i in train.columns if i!='label']" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 4, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "def get_model(nums,cv_fold):\n", 50 | " feature_name1 = train_data[feature_name].columns\n", 51 | " get_ans_face = list(set(get_pic(gbc_model,feature_name1).head(nums)['name'])&set(get_pic(xgb_model,feature_name1).head(nums)['name'])&set(get_pic(lgb_model,feature_name1).head(nums)['name']))\n", 52 | " print('New Feature: ',len(get_ans_face))\n", 53 | " if 'SNP32*SNP34' not in get_ans_face:\n", 54 | " get_ans_face.append('SNP32*SNP34')\n", 55 | " print('New Feature: ',len(get_ans_face))\n", 56 | " new_lgb_model = lgb.LGBMClassifier(objective='binary',n_estimators=300,max_depth=3,min_child_samples=6,learning_rate=0.102,random_state=1)\n", 57 | " cv_model = cv(new_lgb_model, train_data[get_ans_face], train_label, cv=cv_fold, scoring='f1')\n", 58 | " new_lgb_model.fit(train_data[get_ans_face], train_label)\n", 59 | " m1 = cv_model.mean()\n", 60 | "\n", 61 | " new_xgb_model1 = xgb.XGBClassifier(objective='binary:logistic',n_estimators=300,max_depth=4,learning_rate=0.101,random_state=1)\n", 62 | " cv_model = cv(new_xgb_model1, train_data[get_ans_face].values, train_label, cv=cv_fold, scoring='f1')\n", 63 | " new_xgb_model1.fit(train_data[get_ans_face].values, train_label)\n", 64 | " m2 = 
cv_model.mean()\n", 65 | "\n", 66 | " new_gbc_model = GBC(n_estimators=310,subsample=1,min_samples_split=2,max_depth=3,learning_rate=0.1900,min_weight_fraction_leaf=0.1)\n", 67 | " kkk = train_data[get_ans_face].fillna(7)\n", 68 | " cv_model = cv(new_gbc_model, kkk[get_ans_face], train_label, cv=cv_fold, scoring='f1')\n", 69 | " new_gbc_model.fit(kkk.fillna(7),train_label)\n", 70 | "\n", 71 | " m3 = cv_model.mean()\n", 72 | " print((m1+m2+m3)/3)\n", 73 | " pro1 = new_lgb_model.predict_proba(test_data[get_ans_face])\n", 74 | " pro2 = new_xgb_model1.predict_proba(test_data[get_ans_face].values)\n", 75 | " pro3 = new_gbc_model.predict_proba(test_data[get_ans_face].fillna(7).values)\n", 76 | " ans = (pro1+pro2+pro3)/3\n", 77 | " return ans\n", 78 | " \n", 79 | "# temp = [140,160,180,200,220,240,260,280,300,320]\n", 80 | "\n", 81 | "# ans = []\n", 82 | "# for i in range(len(temp)):\n", 83 | "# print('Now All Feature:',temp[i])\n", 84 | "# ans = get_model(temp[i],5)\n", 85 | "# if i == 0:\n", 86 | "# ans1 = ans\n", 87 | "# else:\n", 88 | "# ans1 += ans\n", 89 | "# ans1 /= len(temp)\n", 90 | "\n", 91 | "def find_best_feature(feature_name,cv_fold):\n", 92 | " get_ans_face = feature_name\n", 93 | " new_lgb_model = lgb.LGBMClassifier(objective='binary',n_estimators=300,max_depth=3,min_child_samples=6,learning_rate=0.102,random_state=1)\n", 94 | " cv_model = cv(new_lgb_model, train_data[get_ans_face], train_label, cv=cv_fold, scoring='f1')\n", 95 | " new_lgb_model.fit(train_data[get_ans_face], train_label)\n", 96 | " m1 = cv_model.mean()\n", 97 | "\n", 98 | " new_xgb_model1 = xgb.XGBClassifier(objective='binary:logistic',n_estimators=300,max_depth=4,learning_rate=0.101,random_state=1)\n", 99 | " cv_model = cv(new_xgb_model1, train_data[get_ans_face].values, train_label, cv=cv_fold, scoring='f1')\n", 100 | " new_xgb_model1.fit(train_data[get_ans_face].values, train_label)\n", 101 | " m2 = cv_model.mean()\n", 102 | "\n", 103 | " new_gbc_model = GBC(n_estimators=310,subsample=1,min_samples_split=2,max_depth=3,learning_rate=0.1900,min_weight_fraction_leaf=0.1)\n", 104 | " kkk = train_data[get_ans_face].fillna(7)\n", 105 | " cv_model = cv(new_gbc_model, kkk[get_ans_face], train_label, cv=cv_fold, scoring='f1')\n", 106 | " new_gbc_model.fit(kkk.fillna(7),train_label)\n", 107 | " m3 = cv_model.mean()\n", 108 | " return (m1+m2+m3)/3\n", 109 | "\n", 110 | "def train_best_feature(feature_name):\n", 111 | " get_ans_face = feature_name\n", 112 | " new_lgb_model = lgb.LGBMClassifier(objective='binary',n_estimators=300,max_depth=3,min_child_samples=6,learning_rate=0.102,random_state=1)\n", 113 | " new_lgb_model.fit(train_data[get_ans_face], train_label)\n", 114 | "\n", 115 | " new_xgb_model1 = xgb.XGBClassifier(objective='binary:logistic',n_estimators=300,max_depth=4,learning_rate=0.101,random_state=1)\n", 116 | " new_xgb_model1.fit(train_data[get_ans_face].values, train_label)\n", 117 | "\n", 118 | " new_gbc_model = GBC(n_estimators=310,subsample=1,min_samples_split=2,max_depth=3,learning_rate=0.1900,min_weight_fraction_leaf=0.1)\n", 119 | " kkk = train_data[get_ans_face].fillna(7)\n", 120 | " new_gbc_model.fit(kkk.fillna(7),train_label)\n", 121 | " \n", 122 | " pro1 = new_lgb_model.predict_proba(test_data[get_ans_face])\n", 123 | " pro2 = new_xgb_model1.predict_proba(test_data[get_ans_face].values)\n", 124 | " pro3 = new_gbc_model.predict_proba(test_data[get_ans_face].fillna(7).values)\n", 125 | " ans = (pro1+pro2+pro3)/3\n", 126 | "\n", 127 | " return ans" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 
| "execution_count": 5, 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "train_data = pd.concat([train],axis=0)\n", 137 | "train_label = train_data['label']\n", 138 | "del train_data['label']\n", 139 | "test_data = test[feature_name]" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 6, 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "feature_SNP = [i for i in feature_name if 'SNP' in i]\n", 149 | "feature_no_SNP = list(set(feature_name)-set(feature_SNP))\n", 150 | "train_no_SNP_mean = train.describe().T[['mean','min','max']].T[feature_no_SNP]\n", 151 | "train_no_SNP = train[feature_no_SNP]\n", 152 | "train_SNP = train[feature_SNP]\n", 153 | "test_no_SNP_mean = test.describe().T[['mean','min','max']].T[feature_no_SNP]\n", 154 | "test_SNP = test[feature_SNP]\n", 155 | "test_no_SNP = test[feature_no_SNP]" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 7, 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "train_no_SNP.to_csv('train_no_SNP.csv',index=False)\n", 165 | "test_no_SNP.to_csv('test_no_SNP.csv',index=False)\n", 166 | "train_SNP.to_csv('train_SNP.csv',index=False)\n", 167 | "test_SNP.to_csv('test_SNP.csv',index=False)" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": 8, 173 | "metadata": {}, 174 | "outputs": [], 175 | "source": [ 176 | "def get_division_feature(data,feature_name):\n", 177 | " new_feature = []\n", 178 | " new_feature_name = []\n", 179 | " for i in range(len(data[feature_name].columns)-1):\n", 180 | " for j in range(i+1,len(data[feature_name].columns)):\n", 181 | " new_feature_name.append(data[feature_name].columns[i] + '/' + data[feature_name].columns[j])\n", 182 | " new_feature_name.append(data[feature_name].columns[i] + '*' + data[feature_name].columns[j])\n", 183 | " new_feature_name.append(data[feature_name].columns[i] + '+' + data[feature_name].columns[j])\n", 184 | " new_feature_name.append(data[feature_name].columns[i] + '-' + data[feature_name].columns[j])\n", 185 | " new_feature.append(data[data[feature_name].columns[i]]/data[data[feature_name].columns[j]])\n", 186 | " new_feature.append(data[data[feature_name].columns[i]]*data[data[feature_name].columns[j]])\n", 187 | " new_feature.append(data[data[feature_name].columns[i]]+data[data[feature_name].columns[j]])\n", 188 | " new_feature.append(data[data[feature_name].columns[i]]-data[data[feature_name].columns[j]])\n", 189 | " \n", 190 | " \n", 191 | " temp_data = DF(pd.concat(new_feature,axis=1))\n", 192 | " temp_data.columns = new_feature_name\n", 193 | " data = pd.concat([data,temp_data],axis=1).reset_index(drop=True)\n", 194 | " \n", 195 | " print(data.shape)\n", 196 | " \n", 197 | " return data.reset_index(drop=True)\n", 198 | "\n", 199 | "def get_square_feature(data,feature_name):\n", 200 | " new_feature = []\n", 201 | " new_feature_name = []\n", 202 | " for i in range(len(data[feature_name].columns)):\n", 203 | " new_feature_name.append(data[feature_name].columns[i] + '**2')\n", 204 | " new_feature_name.append(data[feature_name].columns[i] + '**1/2')\n", 205 | " new_feature.append(data[data[feature_name].columns[i]]**2)\n", 206 | " new_feature.append(data[data[feature_name].columns[i]]**(1/2))\n", 207 | " \n", 208 | " temp_data = DF(pd.concat(new_feature,axis=1))\n", 209 | " temp_data.columns = new_feature_name\n", 210 | " data = pd.concat([data,temp_data],axis=1).reset_index(drop=True)\n", 211 | " \n", 212 | " print(data.shape)\n", 213 | " \n", 214 | " return 
data.reset_index(drop=True)" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": 9, 220 | "metadata": {}, 221 | "outputs": [ 222 | { 223 | "name": "stdout", 224 | "output_type": "stream", 225 | "text": [ 226 | "(1000, 56)\n", 227 | "(200, 56)\n", 228 | "(1000, 5995)\n", 229 | "(1000, 1540)\n", 230 | "(200, 5995)\n", 231 | "(200, 1540)\n", 232 | "7591\n", 233 | "(1000, 7591)\n", 234 | "(200, 7591)\n" 235 | ] 236 | } 237 | ], 238 | "source": [ 239 | "train_data = get_square_feature(train_no_SNP,feature_no_SNP)\n", 240 | "test_data = get_square_feature(test_no_SNP,feature_no_SNP)\n", 241 | "\n", 242 | "train_data_SNP = get_division_feature(train_SNP,train_SNP.columns)\n", 243 | "train_data_no_SNP = get_division_feature(train_no_SNP,train_no_SNP.columns)\n", 244 | "train_data = pd.concat([train_data_SNP,train_data_no_SNP,train_data],axis=1)\n", 245 | "test_data_SNP = get_division_feature(test_SNP,test_SNP.columns)\n", 246 | "test_data_no_SNP = get_division_feature(test_no_SNP,test_no_SNP.columns)\n", 247 | "test_data = pd.concat([test_data_SNP,test_data_no_SNP,test_data],axis=1)\n", 248 | "\n", 249 | "feature_name = [i for i in train_data.columns if i!='label']\n", 250 | "print(len(train_data.columns))\n", 251 | "print(train_data.shape)\n", 252 | "print(test_data.shape)" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": null, 258 | "metadata": { 259 | "scrolled": true 260 | }, 261 | "outputs": [], 262 | "source": [] 263 | }, 264 | { 265 | "cell_type": "raw", 266 | "metadata": {}, 267 | "source": [] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": 10, 272 | "metadata": { 273 | "scrolled": true 274 | }, 275 | "outputs": [ 276 | { 277 | "data": { 278 | "text/plain": [ 279 | "LGBMClassifier(boosting_type='gbdt', colsample_bytree=1.0, learning_rate=0.1,\n", 280 | " max_bin=255, max_depth=-1, min_child_samples=20,\n", 281 | " min_child_weight=0.001, min_split_gain=0.0, n_estimators=120,\n", 282 | " n_jobs=-1, nthread=4, num_leaves=31, objective='binary',\n", 283 | " random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,\n", 284 | " subsample=0.9, subsample_for_bin=200000, subsample_freq=1)" 285 | ] 286 | }, 287 | "execution_count": 10, 288 | "metadata": {}, 289 | "output_type": "execute_result" 290 | } 291 | ], 292 | "source": [ 293 | "lgb_model = lgb.LGBMClassifier(objective='binary',n_estimators=120,subsample=0.9,nthread=4)\n", 294 | "# cv_model = cv(lgb_model, train_data[feature_name], train_label, cv=10, scoring='f1')\n", 295 | "lgb_model.fit(train_data[feature_name], train_label)\n", 296 | "# print(cv_model)\n", 297 | "# print(cv_model.mean())\n", 298 | "\n", 299 | "# mean 0.650 166 feature\n", 300 | "# mean 0.650 6900 feature\n", 301 | "# median 0.648" 302 | ] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "execution_count": 11, 307 | "metadata": {}, 308 | "outputs": [], 309 | "source": [ 310 | "def get_pic(model,feature_name):\n", 311 | " ans = DF()\n", 312 | " ans['name'] = feature_name\n", 313 | " ans['score'] = model.feature_importances_\n", 314 | "# print(ans[ans['score']>0].shape)\n", 315 | " return ans.sort_values(by=['score'],ascending=False).reset_index(drop=True)" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": 12, 321 | "metadata": {}, 322 | "outputs": [ 323 | { 324 | "data": { 325 | "text/plain": [ 326 | "XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,\n", 327 | " colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,\n", 328 | " 
max_depth=3, min_child_weight=1, missing=None, n_estimators=120,\n", 329 | " n_jobs=1, nthread=4, objective='binary:logistic', random_state=0,\n", 330 | " reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,\n", 331 | " silent=True, subsample=0.9)" 332 | ] 333 | }, 334 | "execution_count": 12, 335 | "metadata": {}, 336 | "output_type": "execute_result" 337 | } 338 | ], 339 | "source": [ 340 | "xgb_model = xgb.XGBClassifier(objective='binary:logistic',n_estimators=120,subsample=0.9,nthread=4)\n", 341 | "# cv_model = cv(xgb_model, train_data[feature_name].values, train_label, cv=10, scoring='f1')\n", 342 | "xgb_model.fit(train_data[feature_name].values, train_label)\n", 343 | "# print(cv_model)\n", 344 | "# print(cv_model.mean())\n", 345 | "\n", 346 | "# mean 166 632\n", 347 | "# median 0.657" 348 | ] 349 | }, 350 | { 351 | "cell_type": "code", 352 | "execution_count": 13, 353 | "metadata": {}, 354 | "outputs": [ 355 | { 356 | "data": { 357 | "text/plain": [ 358 | "GradientBoostingClassifier(criterion='friedman_mse', init=None,\n", 359 | " learning_rate=0.1, loss='deviance', max_depth=3,\n", 360 | " max_features=None, max_leaf_nodes=None,\n", 361 | " min_impurity_decrease=0.0, min_impurity_split=None,\n", 362 | " min_samples_leaf=1, min_samples_split=2,\n", 363 | " min_weight_fraction_leaf=0.0, n_estimators=200,\n", 364 | " presort='auto', random_state=None, subsample=0.9, verbose=0,\n", 365 | " warm_start=False)" 366 | ] 367 | }, 368 | "execution_count": 13, 369 | "metadata": {}, 370 | "output_type": "execute_result" 371 | } 372 | ], 373 | "source": [ 374 | "gbc_model = GBC(n_estimators=200,subsample=0.9,min_samples_split=2)\n", 375 | "kkk = train_data[feature_name].fillna(7)\n", 376 | "kkk.replace(np.inf,999,inplace=True)\n", 377 | "# cv_model = cv(gbc_model, kkk[feature_name], train_label, 1cv=10, scoring='f1')\n", 378 | "gbc_model.fit(kkk.fillna(7),train_label)\n", 379 | "# print(cv_model)\n", 380 | "# print(cv_model.mean())\n", 381 | "\n", 382 | "# mean 0.653\n", 383 | "# median 0.664" 384 | ] 385 | }, 386 | { 387 | "cell_type": "code", 388 | "execution_count": 14, 389 | "metadata": {}, 390 | "outputs": [ 391 | { 392 | "data": { 393 | "text/plain": [ 394 | "7591" 395 | ] 396 | }, 397 | "execution_count": 14, 398 | "metadata": {}, 399 | "output_type": "execute_result" 400 | } 401 | ], 402 | "source": [ 403 | "len(feature_name)" 404 | ] 405 | }, 406 | { 407 | "cell_type": "code", 408 | "execution_count": 21, 409 | "metadata": {}, 410 | "outputs": [ 411 | { 412 | "name": "stdout", 413 | "output_type": "stream", 414 | "text": [ 415 | "New Feature: 96\n" 416 | ] 417 | } 418 | ], 419 | "source": [ 420 | "nums = 45\n", 421 | "feature_name1 = train_data[feature_name].columns\n", 422 | "get_ans_face = list(set(get_pic(lgb_model,feature_name1).head(nums)['name'])|set(get_pic(xgb_model,feature_name1).head(nums)['name'])|set(get_pic(gbc_model,feature_name1).head(nums)['name']))\n", 423 | "print('New Feature: ',len(get_ans_face))\n", 424 | "\n", 425 | "# 320 0.739" 426 | ] 427 | }, 428 | { 429 | "cell_type": "code", 430 | "execution_count": null, 431 | "metadata": {}, 432 | "outputs": [], 433 | "source": [] 434 | }, 435 | { 436 | "cell_type": "raw", 437 | "metadata": {}, 438 | "source": [ 439 | "在nums = 400的时候 能够达到0.739" 440 | ] 441 | }, 442 | { 443 | "cell_type": "code", 444 | "execution_count": 22, 445 | "metadata": {}, 446 | "outputs": [ 447 | { 448 | "name": "stdout", 449 | "output_type": "stream", 450 | "text": [ 451 | "目前特征长度为 1 目前帅气的cv值是 0.386775127677 成功加入第 1 个 增值为 0.386775127677\n", 
452 | "目前特征长度为 2 目前帅气的cv值是 0.51616384038 成功加入第 2 个 增值为 0.129388712703\n", 453 | "目前特征长度为 3 目前帅气的cv值是 0.527535265985 成功加入第 3 个 增值为 0.011371425605\n", 454 | "目前特征长度为 4 目前帅气的cv值是 0.563174983085 成功加入第 4 个 增值为 0.0356397171\n", 455 | "目前特征长度为 5 目前帅气的cv值是 0.57436190063 成功加入第 5 个 增值为 0.0111869175454\n", 456 | "目前特征长度为 6 目前帅气的cv值是 0.586587422568 成功加入第 7 个 增值为 0.0122255219373\n", 457 | "目前特征长度为 7 目前帅气的cv值是 0.593785226558 成功加入第 12 个 增值为 0.00719780399015\n", 458 | "目前特征长度为 8 目前帅气的cv值是 0.608606465091 成功加入第 14 个 增值为 0.0148212385332\n", 459 | "目前特征长度为 9 目前帅气的cv值是 0.609209748232 成功加入第 17 个 增值为 0.000603283141013\n", 460 | "目前特征长度为 10 目前帅气的cv值是 0.620925798111 成功加入第 18 个 增值为 0.011716049879\n", 461 | "目前特征长度为 11 目前帅气的cv值是 0.634570115268 成功加入第 19 个 增值为 0.0136443171573\n", 462 | "目前特征长度为 12 目前帅气的cv值是 0.688309863978 成功加入第 20 个 增值为 0.0537397487097\n", 463 | "目前特征长度为 13 目前帅气的cv值是 0.689758693609 成功加入第 21 个 增值为 0.00144882963117\n", 464 | "目前特征长度为 14 目前帅气的cv值是 0.692031700018 成功加入第 22 个 增值为 0.00227300640844\n", 465 | "目前特征长度为 15 目前帅气的cv值是 0.70464125809 成功加入第 24 个 增值为 0.0126095580718\n", 466 | "目前特征长度为 16 目前帅气的cv值是 0.707376667537 成功加入第 25 个 增值为 0.00273540944779\n", 467 | "目前特征长度为 17 目前帅气的cv值是 0.707770917495 成功加入第 27 个 增值为 0.000394249957276\n", 468 | "目前特征长度为 18 目前帅气的cv值是 0.71005231562 成功加入第 28 个 增值为 0.00228139812537\n", 469 | "目前特征长度为 19 目前帅气的cv值是 0.712136621888 成功加入第 31 个 增值为 0.00208430626829\n", 470 | "目前特征长度为 20 目前帅气的cv值是 0.718013110585 成功加入第 32 个 增值为 0.00587648869632\n", 471 | "目前特征长度为 21 目前帅气的cv值是 0.718307792721 成功加入第 37 个 增值为 0.000294682136085\n", 472 | "目前特征长度为 22 目前帅气的cv值是 0.719082461863 成功加入第 38 个 增值为 0.000774669142242\n", 473 | "目前特征长度为 23 目前帅气的cv值是 0.721935152094 成功加入第 41 个 增值为 0.00285269023075\n", 474 | "目前特征长度为 24 目前帅气的cv值是 0.725049329819 成功加入第 44 个 增值为 0.00311417772499\n", 475 | "目前特征长度为 25 目前帅气的cv值是 0.72606671688 成功加入第 51 个 增值为 0.00101738706101\n", 476 | "目前特征长度为 26 目前帅气的cv值是 0.729606229912 成功加入第 53 个 增值为 0.00353951303223\n", 477 | "目前特征长度为 27 目前帅气的cv值是 0.729661495167 成功加入第 61 个 增值为 5.52652553621e-05\n", 478 | "目前特征长度为 28 目前帅气的cv值是 0.730213956901 成功加入第 62 个 增值为 0.000552461733845\n", 479 | "目前特征长度为 29 目前帅气的cv值是 0.734158746716 成功加入第 66 个 增值为 0.00394478981494\n" 480 | ] 481 | } 482 | ], 483 | "source": [ 484 | "now_feature = []\n", 485 | "check = 0\n", 486 | "for i in range(len(get_ans_face)):\n", 487 | " now_feature.append(get_ans_face[i])\n", 488 | " jj = find_best_feature(now_feature,6)\n", 489 | " if jj>check:\n", 490 | " print('目前特征长度为',len(now_feature),' 目前帅气的cv值是',jj,' 成功加入第',i+1,'个','增值为',jj-check)\n", 491 | " check = jj\n", 492 | " else:\n", 493 | " now_feature.pop()\n", 494 | "# print('目前特征长度为',len(now_feature),'第',i+1,'个拉闸了')" 495 | ] 496 | }, 497 | { 498 | "cell_type": "code", 499 | "execution_count": 17, 500 | "metadata": {}, 501 | "outputs": [ 502 | { 503 | "data": { 504 | "text/plain": [ 505 | "['VAR00007*DM家族史',\n", 506 | " 'AST-hsCRP',\n", 507 | " '分娩时/wbc',\n", 508 | " 'wbc-LDLC',\n", 509 | " 'SNP32*SNP33',\n", 510 | " 'SNP20*SNP34',\n", 511 | " 'ApoB/BUN',\n", 512 | " 'SNP37/SNP53',\n", 513 | " 'SNP22/SNP34',\n", 514 | " 'VAR00007',\n", 515 | " 'hsCRP+年龄',\n", 516 | " '年龄+LDLC',\n", 517 | " 'VAR00007*年龄',\n", 518 | " 'SNP39*SNP47',\n", 519 | " 'hsCRP-LDLC',\n", 520 | " 'TG*年龄',\n", 521 | " '孕次/Lpa',\n", 522 | " 'SNP46/SNP47',\n", 523 | " 'SNP26*SNP48',\n", 524 | " 'wbc-年龄',\n", 525 | " '孕前BMI/Cr',\n", 526 | " 'VAR00007*糖筛孕周',\n", 527 | " 'SNP16/SNP34',\n", 528 | " '舒张压/ApoA1',\n", 529 | " 'BUN/DM家族史',\n", 530 | " '孕前体重-RBP4',\n", 531 | " 'SNP45*SNP46',\n", 532 | " 'SNP36*SNP49',\n", 533 | 
" 'SNP11*SNP15',\n", 534 | " 'SNP33/SNP46',\n", 535 | " 'TG+wbc',\n", 536 | " 'HDLC/wbc',\n", 537 | " 'TG*ALT']" 538 | ] 539 | }, 540 | "execution_count": 17, 541 | "metadata": {}, 542 | "output_type": "execute_result" 543 | } 544 | ], 545 | "source": [ 546 | "now_feature" 547 | ] 548 | }, 549 | { 550 | "cell_type": "markdown", 551 | "metadata": {}, 552 | "source": [ 553 | "First 1" 554 | ] 555 | }, 556 | { 557 | "cell_type": "code", 558 | "execution_count": null, 559 | "metadata": {}, 560 | "outputs": [], 561 | "source": [] 562 | }, 563 | { 564 | "cell_type": "code", 565 | "execution_count": null, 566 | "metadata": {}, 567 | "outputs": [], 568 | "source": [] 569 | }, 570 | { 571 | "cell_type": "code", 572 | "execution_count": 23, 573 | "metadata": {}, 574 | "outputs": [ 575 | { 576 | "name": "stdout", 577 | "output_type": "stream", 578 | "text": [ 579 | "目前特征长度为 1 目前帅气的cv值是 0.529126383031 成功加入第 1 个 增值为 0.529126383031\n", 580 | "目前特征长度为 2 目前帅气的cv值是 0.541969584999 成功加入第 2 个 增值为 0.0128432019677\n", 581 | "目前特征长度为 3 目前帅气的cv值是 0.568554327993 成功加入第 3 个 增值为 0.0265847429934\n", 582 | "目前特征长度为 4 目前帅气的cv值是 0.57652332479 成功加入第 11 个 增值为 0.0079689967979\n", 583 | "目前特征长度为 5 目前帅气的cv值是 0.59432805946 成功加入第 12 个 增值为 0.0178047346692\n", 584 | "目前特征长度为 6 目前帅气的cv值是 0.594995772882 成功加入第 14 个 增值为 0.000667713422274\n", 585 | "目前特征长度为 7 目前帅气的cv值是 0.601384057634 成功加入第 18 个 增值为 0.00638828475201\n", 586 | "目前特征长度为 8 目前帅气的cv值是 0.621135701011 成功加入第 20 个 增值为 0.0197516433766\n", 587 | "目前特征长度为 9 目前帅气的cv值是 0.659833147249 成功加入第 24 个 增值为 0.0386974462387\n", 588 | "目前特征长度为 10 目前帅气的cv值是 0.678642746028 成功加入第 25 个 增值为 0.0188095987783\n", 589 | "目前特征长度为 11 目前帅气的cv值是 0.685003138629 成功加入第 31 个 增值为 0.00636039260157\n", 590 | "目前特征长度为 12 目前帅气的cv值是 0.686918440568 成功加入第 33 个 增值为 0.00191530193904\n", 591 | "目前特征长度为 13 目前帅气的cv值是 0.689605039799 成功加入第 44 个 增值为 0.00268659923099\n", 592 | "目前特征长度为 14 目前帅气的cv值是 0.691387941235 成功加入第 53 个 增值为 0.00178290143601\n", 593 | "目前特征长度为 15 目前帅气的cv值是 0.699233952221 成功加入第 54 个 增值为 0.00784601098582\n", 594 | "目前特征长度为 16 目前帅气的cv值是 0.709950933425 成功加入第 55 个 增值为 0.0107169812039\n", 595 | "目前特征长度为 17 目前帅气的cv值是 0.713050765167 成功加入第 56 个 增值为 0.00309983174182\n", 596 | "目前特征长度为 18 目前帅气的cv值是 0.714504690354 成功加入第 62 个 增值为 0.0014539251878\n", 597 | "目前特征长度为 19 目前帅气的cv值是 0.715334590971 成功加入第 67 个 增值为 0.000829900616473\n", 598 | "目前特征长度为 20 目前帅气的cv值是 0.725797483685 成功加入第 70 个 增值为 0.0104628927137\n", 599 | "目前特征长度为 21 目前帅气的cv值是 0.72841012063 成功加入第 87 个 增值为 0.0026126369455\n", 600 | "目前特征长度为 22 目前帅气的cv值是 0.731058233319 成功加入第 92 个 增值为 0.00264811268908\n" 601 | ] 602 | } 603 | ], 604 | "source": [ 605 | "now_feature2 = []\n", 606 | "check = 0\n", 607 | "for i in range(len(get_ans_face)):\n", 608 | " now_feature2.append(get_ans_face[len(get_ans_face)-i-1])\n", 609 | " jj = find_best_feature(now_feature2,6)\n", 610 | " if jj>check:\n", 611 | " print('目前特征长度为',len(now_feature2),' 目前帅气的cv值是',jj,' 成功加入第',i+1,'个','增值为',jj-check)\n", 612 | " check = jj\n", 613 | " else:\n", 614 | " now_feature2.pop()\n", 615 | "# print('目前特征长度为',len(now_feature),'第',i+1,'个拉闸了')" 616 | ] 617 | }, 618 | { 619 | "cell_type": "code", 620 | "execution_count": null, 621 | "metadata": {}, 622 | "outputs": [], 623 | "source": [ 624 | "def get_proba(ans):\n", 625 | " kfc = []\n", 626 | " tot0 = 0\n", 627 | " tot1 = 0\n", 628 | " for i in range(len(ans)):\n", 629 | " if ans[i][0]>0.5:\n", 630 | " kfc.append(0)\n", 631 | " tot0 += 1\n", 632 | " else:\n", 633 | " kfc.append(1)\n", 634 | " tot1 += 1\n", 635 | " print('1 = ',tot1,' ','0 =',tot0)\n", 636 | 
" return kfc\n", 637 | "# ans1 = get_proba(train_best_feature(now_feature_1))\n", 638 | "ans1 = get_proba(train_best_feature(now_feature))\n", 639 | "# ans3 = get_proba((train_best_feature(now_feature2)+train_best_feature(now_feature))/2)\n", 640 | "ans2 = get_proba(train_best_feature(now_feature2))" 641 | ] 642 | }, 643 | { 644 | "cell_type": "code", 645 | "execution_count": null, 646 | "metadata": {}, 647 | "outputs": [], 648 | "source": [] 649 | }, 650 | { 651 | "cell_type": "code", 652 | "execution_count": 409, 653 | "metadata": {}, 654 | "outputs": [], 655 | "source": [ 656 | "DF(ans).to_csv('真的不想做了.csv',header=False,index=False)" 657 | ] 658 | }, 659 | { 660 | "cell_type": "code", 661 | "execution_count": null, 662 | "metadata": {}, 663 | "outputs": [], 664 | "source": [ 665 | "pro1 = lgb_model.predict_proba(test_data[feature_name])\n", 666 | "pro2 = xgb_model.predict_proba(test_data[feature_name].values)\n", 667 | "pro3 = gbc_model.predict_proba(test_data[feature_name].fillna(7).values)" 668 | ] 669 | }, 670 | { 671 | "cell_type": "code", 672 | "execution_count": null, 673 | "metadata": {}, 674 | "outputs": [], 675 | "source": [ 676 | "lgb_ans = lgb_model.predict(test_data[feature_name])\n", 677 | "xgb_ans = xgb_model.predict(test_data[feature_name].values)\n", 678 | "gbc_ans = gbc_model.predict(test_data[feature_name].fillna(7.01))" 679 | ] 680 | }, 681 | { 682 | "cell_type": "code", 683 | "execution_count": null, 684 | "metadata": {}, 685 | "outputs": [], 686 | "source": [] 687 | }, 688 | { 689 | "cell_type": "code", 690 | "execution_count": 115, 691 | "metadata": {}, 692 | "outputs": [ 693 | { 694 | "name": "stdout", 695 | "output_type": "stream", 696 | "text": [ 697 | "200\n" 698 | ] 699 | } 700 | ], 701 | "source": [ 702 | "kfc = []\n", 703 | "for i in range(len(lgb_ans)):\n", 704 | " if (lgb_ans[i]==xgb_ans[i]):\n", 705 | " kfc.append(lgb_ans[i])\n", 706 | " elif (lgb_ans[i]==gbc_ans[i]):\n", 707 | " kfc.append(lgb_ans[i])\n", 708 | " elif (gbc_ans[i]==xgb_ans[i]):\n", 709 | " kfc.append(gbc_ans[i])\n", 710 | " else:\n", 711 | " kfc.append(gbc_ans[i])\n", 712 | " \n", 713 | "print(len(kfc))\n", 714 | "DF(kfc).to_csv('ans_fuck2.csv',index=False,header=False)" 715 | ] 716 | }, 717 | { 718 | "cell_type": "code", 719 | "execution_count": null, 720 | "metadata": {}, 721 | "outputs": [], 722 | "source": [] 723 | }, 724 | { 725 | "cell_type": "code", 726 | "execution_count": null, 727 | "metadata": {}, 728 | "outputs": [], 729 | "source": [] 730 | }, 731 | { 732 | "cell_type": "code", 733 | "execution_count": null, 734 | "metadata": {}, 735 | "outputs": [], 736 | "source": [] 737 | }, 738 | { 739 | "cell_type": "code", 740 | "execution_count": null, 741 | "metadata": {}, 742 | "outputs": [], 743 | "source": [] 744 | }, 745 | { 746 | "cell_type": "code", 747 | "execution_count": null, 748 | "metadata": {}, 749 | "outputs": [], 750 | "source": [] 751 | } 752 | ], 753 | "metadata": { 754 | "kernelspec": { 755 | "display_name": "Python 3", 756 | "language": "python", 757 | "name": "python3" 758 | }, 759 | "language_info": { 760 | "codemirror_mode": { 761 | "name": "ipython", 762 | "version": 3 763 | }, 764 | "file_extension": ".py", 765 | "mimetype": "text/x-python", 766 | "name": "python", 767 | "nbconvert_exporter": "python", 768 | "pygments_lexer": "ipython3", 769 | "version": "3.6.3" 770 | } 771 | }, 772 | "nbformat": 4, 773 | "nbformat_minor": 2 774 | } 775 | --------------------------------------------------------------------------------