├── README.md
├── round1_rank2
│   ├── features
│   │   ├── word_label.txt
│   │   └── num_label.txt
│   ├── version.txt
│   ├── README.md
│   ├── team
│   │   ├── data_process_by_Mongo.py
│   │   └── team_feature_work.py
│   └── code
│       └── main.py
└── round2_rank10
    ├── xgb_model
    │   ├── split_5_fold_data_xgb.py
    │   ├── add_prefix_for_xgb_model.py
    │   ├── calc_xgb_test_loss_and_save.py
    │   └── xgb_in_odps.sql
    ├── submit_result.py
    ├── README.md
    ├── data_pre_process
    │   ├── origin_part1_part2_row2col.py
    │   ├── data_merge_split.py
    │   ├── get_num_features.py
    │   └── get_word_features.py
    ├── feature_selection
    │   ├── select_features_by_model.py
    │   ├── snp_drop_one_hot.py
    │   ├── classification_tl.py
    │   ├── predict_value_tl_gt_4.py
    │   ├── gbdt_log_model.py
    │   └── get_best_rounds.py
    └── every_predict_model
        ├── sys_gbdt_best_rounds.py
        ├── dia_gbdt_best_rounds.py
        ├── ldl_gbdt_best_rounds.py
        ├── hdl_gbdt_best_rounds.py
        └── tl_gbdt_best_rounds.py

/README.md:
--------------------------------------------------------------------------------
### Team: Unreal

### Preliminary round: Rank 2; Final round: Rank 10

### Teammates' GitHub
zhuifeng414: https://github.com/Zhuifeng414

wzm: https://github.com/w-zm
--------------------------------------------------------------------------------
/round1_rank2/features/word_label.txt:
--------------------------------------------------------------------------------
vid, 121, 2302, 709, 3486, 3485, 30007, 3194, 101, 102, 113, 114, 116, 117, 118, 1001, 409, 413, 434, 439, 985, A201, A202, 4001, A301, 705, 3192, 3196, 3190, 3197, 3195, 3430, 100010, 3191, 3193, 426, 420, 421, 423, 430, 431, 976, 3399, 405, 429, 3400, 403, 3301, 422
--------------------------------------------------------------------------------
/round1_rank2/version.txt:
--------------------------------------------------------------------------------
OS and main package versions:
1. main1.py:
OS: Win10
Python: 3.6.1
pandas: 0.20.3
lightgbm: 2.1.0
numpy: 1.13.3
sklearn: 0.19.1

2. main2.ipynb:
OS: Win10
numpy: 1.14.2
pandas: 0.22.0
xgboost: 0.7
scikit-learn: 0.19.1

3. main3.ipynb:
OS: Win10
numpy: 1.14.2
pandas: 0.22.0
xgboost: 0.7
scikit-learn: 0.19.1
--------------------------------------------------------------------------------
/round2_rank10/xgb_model/split_5_fold_data_xgb.py:
--------------------------------------------------------------------------------
from odps import ODPS
import pandas as pd
from odps.df import DataFrame
from sklearn.model_selection import KFold

# `odps` is the entry object injected by the ODPS/DataWorks PyODPS runtime
train_data = odps.get_table('juz_train_6_7_xgb').to_df().to_pandas()
kf = KFold(n_splits=5, shuffle=True, random_state=1024)
for t, (train_index, test_index) in enumerate(kf.split(train_data), start=1):
    print('Split {}...'.format(t))
    x_train, x_test = train_data.iloc[train_index], train_data.iloc[test_index]
    print(x_train.shape, x_test.shape)
    train_odps = DataFrame(x_train)
    test_odps = DataFrame(x_test)
    train_odps.persist('tl_xgb_train_{}'.format(t))
    test_odps.persist('tl_xgb_test_{}'.format(t))
--------------------------------------------------------------------------------
/round1_rank2/features/num_label.txt:
--------------------------------------------------------------------------------
vid, 183, 190, 191, 192, 193, 314, 1115, 1117, 2403, 2404, 2405, 1814, 1815, 1840, 1845, 1850, 10002, 10003, 10004, 100005, 100006, 100007, 2174, 31, 32, 34, 37, 38, 39, 317, 315, 312, 1321, 320, 319, 2372, 33, 316, 313, 2406, 269024, 269005, 269021, 269012, 269019, 269009, 269013, 155, 269023, 269004, 269008, 269003, 269010, 1345, 269022, 300012, 2333, 1127, 269006, 300021, 809009, 809008, 979002, 979019, 1106, 979012, 979005, 269015, 269007, 300019, 669002, 979018, 2376, 269016, 269020, 269017, 269018, 269025, 2420, 269014, 669006, 669009, 979021, 979006, 979014, 979008, 979003, 979009, 979011, 300018, 300017, 2409, 300092, 669021, 979004, 809021, 979007, 979013, 979022, 979016, 669005, 979001, 669004, 1474, 300008, 809023, 139, 300009, 100012, 809010, 809025, 2386, 979015, 809001, 979017, 300011, 1112, 100014, 300013, 669001, 143, 1107, 809004, 10009, 300001, 809026, 979020, 300014, 809017, 100013, 979023, 424, 2177
--------------------------------------------------------------------------------
/round1_rank2/README.md:
--------------------------------------------------------------------------------
### Team: Unreal

### Rank: 2

### Folder layout:
- data: the data folder
- features: hand-curated numeric and text feature lists, num_label.txt and word_label.txt; both files are required during data cleaning
- code: the main entry point; my data is fused with zhuifeng414's, and main.py can be run directly
- team: my teammates' feature-engineering code plus my MongoDB processing code
- submit: the submission output folder



### PS:
My original preprocessing did not use the open-source code shared by 豆腐大佬. Instead, I loaded the raw data into MongoDB and then exported it to CSV. In the raw data, a few records share a vid whose table_id maps to multiple results; I concatenate those results, and the exact logic is in data_process_by_Mongo.py in the team folder of the archive. Since the organizers had a heavy workload and little time was left for tidying the code, my teammates and I ended up using the open-source preprocessing code after all; the two pipelines may produce slightly different results, which might affect the submitted score, hence this note.

**The team currently drops features whose missing rate is 96%.** The best A/B-board scores for diastolic pressure (dia), systolic pressure (sys), and serum HDL are based on the 96% cutoff; serum LDL and serum triglycerides (tl) use 98%. The relevant code is at line 886 of team_feature_work.py.

This code fuses my features with zhuifeng414's; it reaches 0.02817 on the A board and 0.02792 on the B board. The best B-board score, 0.02764, came from blending with xgb; that xgb code was provided by wzm.

### Teammates' GitHub
zhuifeng414: https://github.com/Zhuifeng414

wzm: https://github.com/w-zm
--------------------------------------------------------------------------------
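As a reference for the concatenation logic described in the PS above, here is a minimal pandas sketch with invented toy data (the real implementations are data_process_by_Mongo.py below and, for round 2, origin_part1_part2_row2col.py):

```python
import pandas as pd

# toy records: one vid has two results for the same table_id (invented data)
df = pd.DataFrame({'vid': ['v1', 'v1', 'v2'],
                   'table_id': ['0101', '0101', '0102'],
                   'field_results': ['120', '118', '95']})

# duplicated (vid, table_id) results are joined with '$'; singletons pass through
merged = (df.groupby(['vid', 'table_id'])['field_results']
            .apply(lambda s: '$'.join(s.astype(str)))
            .reset_index())
print(merged)  # v1/0101 -> '120$118', v2/0102 -> '95'
```

The 96%/98% missing-rate cutoffs mentioned above would amount to keeping columns where `df.isnull().mean()` stays below the cutoff; the exact implementation lives in team_feature_work.py, which is not included in this dump.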
/round2_rank10/xgb_model/add_prefix_for_xgb_model.py:
--------------------------------------------------------------------------------
from odps import ODPS
import pandas as pd
from odps.df import DataFrame
from sklearn.model_selection import KFold
import time
import numpy as np

import sys
reload(sys)
sys.setdefaultencoding('utf8')

label = 'tl'
train = odps.get_table('{}_juz_train_6_6_snp_onehot_22'.format(label)).to_df().to_pandas()
test = odps.get_table('{}_juz_test_6_6_snp_onehot_22'.format(label)).to_df().to_pandas()
print(train.shape, test.shape)

train['log_{}'.format(label)] = np.log(train[label])
test['log_{}'.format(label)] = np.log(test[label])
predict_features = ['sys', 'dia', 'tl', 'hdl', 'ldl']

# copy every base feature (non-vid, non-snp, non-log, non-target column) to a
# 'jz_'-prefixed column and collect the original for dropping
for i in train.columns:
    if i != 'vid' and not 'snp' in i and not 'log' in i and i not in predict_features:
        train['jz_{}'.format(i)] = train[i]
        test['jz_{}'.format(i)] = test[i]
        predict_features.append(i)

# drop the targets plus all un-prefixed originals collected above
train.drop(predict_features, axis=1, inplace=True)
test.drop(predict_features, axis=1, inplace=True)
print(train.shape, test.shape)

juz_train = DataFrame(train)
juz_test = DataFrame(test)
juz_train.persist('juz_train_6_7_xgb')
juz_test.persist('juz_test_6_7_xgb')
--------------------------------------------------------------------------------
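A hypothetical mini-example of the renaming scheme above (the column names are invented for illustration): only plain base features pick up the jz_ prefix, while vid, snp columns, the log target, and the five labels keep their names, presumably so this feature set can be joined with teammates' tables without column collisions:

```python
import pandas as pd

# invented columns, mirroring the filter used in add_prefix_for_xgb_model.py
df = pd.DataFrame({'vid': ['a'], 'snp1': [2], 'log_tl': [0.1], 'tl': [1.1], 'feat01': [3.5]})
targets = ['sys', 'dia', 'tl', 'hdl', 'ldl']
keep = lambda c: c == 'vid' or 'snp' in c or 'log' in c or c in targets
df = df.rename(columns={c: 'jz_' + c for c in df.columns if not keep(c)})
print(df.columns.tolist())  # ['vid', 'snp1', 'log_tl', 'tl', 'jz_feat01']
```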
/round2_rank10/submit_result.py:
--------------------------------------------------------------------------------
import pandas as pd
import numpy as np
from odps.df import DataFrame  # needed for the persist() call at the bottom

sys = odps.get_table('sys_jz_5_fold_6_6_submit_22').to_df().to_pandas().loc[:, ['vid', 'sys']]
dia = odps.get_table('dia_jz_5_fold_6_6_submit_22').to_df().to_pandas().loc[:, ['vid', 'dia']]
tl = odps.get_table('tl_jz_5_fold_6_6_submit_22').to_df().to_pandas().loc[:, ['vid', 'tl']]
hdl = odps.get_table('hdl_jz_5_fold_6_6_submit_22').to_df().to_pandas().loc[:, ['vid', 'hdl']]
ldl = odps.get_table('ldl_jz_5_fold_6_6_submit_22').to_df().to_pandas().loc[:, ['vid', 'ldl']]

print(tl.sort_values(by=['tl'], ascending=False).head(15))

# blend the gbdt and xgb tl predictions (weights 0.7 / 0.35, as stated in the README)
tl_xgb = odps.get_table('tl_xgb_result').to_df().to_pandas().loc[:, ['vid', 'tl']]
tl['tl'] = tl['tl']*0.7 + tl_xgb['tl']*0.35

sys_dia = pd.merge(sys, dia, on=['vid'], how='inner')
sys_dia_tl = pd.merge(sys_dia, tl, on=['vid'], how='inner')
sys_dia_tl_hdl = pd.merge(sys_dia_tl, hdl, on=['vid'], how='inner')
submit = pd.merge(sys_dia_tl_hdl, ldl, on=['vid'], how='inner')

# manual override of tl for a single vid
submit.loc[submit['vid'] == '7b437e2632c91be2a0789adabce4b953', 'tl'] = 6
print(submit.describe())
print(submit.head(5))
print(submit.sort_values(by=['tl'], ascending=False).head(15))

juz_submit = DataFrame(submit)
juz_submit.persist('meinian_round2_submit_b')
--------------------------------------------------------------------------------
/round2_rank10/README.md:
--------------------------------------------------------------------------------
### Team: Unreal

### Rank: 10


### Code walkthrough


#### data_pre_process

1. origin_part1_part2_row2col: converts the raw data, including row-to-column pivoting and de-duplication;

2. get_num_features: builds the numeric-feature table;

3. get_word_features: builds the text-feature table;

4. data_merge_split: merges the numeric, text, and snp data.


#### feature_selection

1. For each of sys, dia, tl, hdl, and ldl, run snp_drop_one_hot to produce five label-specific datasets; this step drops the snp features that a pre-trained GBDT marks as unimportant and one-hot encodes the remainder;

2. For each of sys, dia, tl, hdl, and ldl, run get_best_rounds to obtain the optimal five-fold iteration counts for the datasets from step 1.

#### every_predict_model

Run all files to produce the test-set predictions for sys, dia, tl, hdl, and ldl.

#### xgb_model

1. add_prefix_for_xgb_model: produces the feature dataset with prefixed column names;

2. split_5_fold_data_xgb: splits the data for five-fold training;

3. baseline_xgboost_jz: trains the xgb model;

4. calc_xgb_test_loss_and_save: averages the five tl predictions from step 3.


#### submit_result
Produces the final submission: sys, dia, hdl, and ldl come from single GBDT models, while tl is a weighted blend of GBDT and xgb with weights 0.7 and 0.35.

The best single-model GBDT score is 0.0318 on the A board and 0.0321 on the B board; with the weighted tl blend, the best score is 0.0319.

### Teammates' GitHub
zhuifeng414: https://github.com/Zhuifeng414

wzm: https://github.com/w-zm
--------------------------------------------------------------------------------
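The scores quoted above are under the squared-log-error metric that the scripts below implement as `eval_metric` (presumably also the leaderboard metric). In formula form, for predictions $\hat{y}_i$ and labels $y_i$:

$$\text{loss} = \frac{1}{n}\sum_{i=1}^{n}\bigl(\log(\hat{y}_i + 1) - \log(y_i + 1)\bigr)^2$$

The models actually fit $\log y$ and exponentiate on the way out, as the gbdt_log_model and *_best_rounds scripts show.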
/round2_rank10/data_pre_process/origin_part1_part2_row2col.py:
--------------------------------------------------------------------------------
import time
import pandas as pd
from odps import ODPS
from odps.df import DataFrame

import sys
reload(sys)
sys.setdefaultencoding('utf8')

# load the raw data
part_1 = odps.get_table('meinian_round2_data_part1').to_df().to_pandas()
part_2 = odps.get_table('meinian_round2_data_part2').to_df().to_pandas()
part_1_2 = pd.concat([part_1, part_2])
part_1_2 = pd.DataFrame(part_1_2).sort_values('vid').reset_index(drop=True)
begin_time = time.time()

# concatenate the results of duplicated (vid, test_id) rows
def merge_table(df):
    df['results'] = df['results'].astype(str)
    if df.shape[0] > 1:
        merge_df = "$".join(list(df['results']))
    else:
        merge_df = df['results'].values[0]
    return merge_df

# basic processing
print(part_1_2.shape)
is_happen = part_1_2.groupby(['vid', 'test_id']).size().reset_index()
# build a composite index used for de-duplication
is_happen['new_index'] = is_happen['vid'] + '_' + is_happen['test_id']
is_happen_new = is_happen[is_happen[0] > 1]['new_index']

part_1_2['new_index'] = part_1_2['vid'] + '_' + part_1_2['test_id']

unique_part = part_1_2[part_1_2['new_index'].isin(list(is_happen_new))]
unique_part = unique_part.sort_values(['vid', 'test_id'])
no_unique_part = part_1_2[~part_1_2['new_index'].isin(list(is_happen_new))]
print('begin')
part_1_2_not_unique = unique_part.groupby(['vid', 'test_id']).apply(merge_table).reset_index()
part_1_2_not_unique.rename(columns={0: 'results'}, inplace=True)
tmp = pd.concat([part_1_2_not_unique, no_unique_part[['vid', 'test_id', 'results']]])
# pivot rows to columns
print('finish')
tmp = tmp.pivot(index='vid', values='results', columns='test_id')
print(tmp.shape)
combine_data = DataFrame(tmp, unknown_as_string=True)
combine_data.persist('origin_data_combine_part1_part2')
print('total time', time.time() - begin_time)
--------------------------------------------------------------------------------
/round2_rank10/xgb_model/calc_xgb_test_loss_and_save.py:
--------------------------------------------------------------------------------
from odps import ODPS
import pandas as pd
from odps.df import DataFrame
import numpy as np


def eval_metric(pred, labels):
    return np.mean(np.power(np.log(pred + 1) - np.log(labels + 1), 2))


'''
('fold 1: ', 0.088066106616791956)
('fold 2: ', 0.087444759182314105)
('fold 3: ', 0.097941499769017726)
('fold 4: ', 0.078793753494365307)
('fold 5: ', 0.086734232908105002)
('total loss: ', 0.087796043720344913)
'''

# each val_* table below holds predictions for the B-board test set; their mean is saved as the final tl result
# to compute the xgb loss on the validation folds instead, modify the xgb model code and use the commented block at the bottom
val_1 = odps.get_table('jz_xgb_pred_val_1').to_df().to_pandas().loc[:, ['vid', 'log_tl', 'result']]
val_2 = odps.get_table('jz_xgb_pred_val_2').to_df().to_pandas().loc[:, ['vid', 'log_tl', 'result']]
val_3 = odps.get_table('jz_xgb_pred_val_3').to_df().to_pandas().loc[:, ['vid', 'log_tl', 'result']]
val_4 = odps.get_table('jz_xgb_pred_val_4').to_df().to_pandas().loc[:, ['vid', 'log_tl', 'result']]
val_5 = odps.get_table('jz_xgb_pred_val_5').to_df().to_pandas().loc[:, ['vid', 'log_tl', 'result']]

xgb_result = val_1.loc[:, ['vid']]

xgb_result['tl'] = np.exp((val_1['result'] + val_2['result'] + val_3['result'] + val_4['result'] + val_5['result'])/5)
test_odps = DataFrame(xgb_result)
test_odps.persist('tl_xgb_result')

'''
val = pd.concat([val_1, val_2, val_3, val_4, val_5])
print('fold 1: ', eval_metric(np.exp(val_1['result']), np.exp(val_1['log_tl'])))
print('fold 2: ', eval_metric(np.exp(val_2['result']), np.exp(val_2['log_tl'])))
print('fold 3: ', eval_metric(np.exp(val_3['result']), np.exp(val_3['log_tl'])))
print('fold 4: ', eval_metric(np.exp(val_4['result']), np.exp(val_4['log_tl'])))
print('fold 5: ', eval_metric(np.exp(val_5['result']), np.exp(val_5['log_tl'])))
print('total loss: ', eval_metric(np.exp(val['result']), np.exp(val['log_tl'])))
'''
--------------------------------------------------------------------------------
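One detail worth noting: averaging the five folds' log-space results and then exponentiating, as done above, takes the geometric rather than arithmetic mean of the per-fold tl predictions. A quick check with toy numbers:

```python
import numpy as np

preds = np.array([0.8, 1.0, 1.3])          # toy per-fold predictions in the original scale
log_mean = np.exp(np.mean(np.log(preds)))  # what the script computes
geo_mean = np.prod(preds) ** (1.0 / 3)     # geometric mean
print(np.isclose(log_mean, geo_mean))      # True
```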
/round2_rank10/data_pre_process/data_merge_split.py:
--------------------------------------------------------------------------------
import time
import re
import pandas as pd
from odps import ODPS
from odps.df import DataFrame
import numpy as np
from collections import Iterable
from sklearn import preprocessing
from itertools import combinations

import sys
reload(sys)
sys.setdefaultencoding('utf8')

# my full feature set
train = odps.get_table('meinian_round2_train').to_df().to_pandas()
test = odps.get_table('meinian_round2_submit_b').to_df().to_pandas()
num_data = odps.get_table('juz_num_data_5_31').to_df().to_pandas()
word_data = odps.get_table('juz_word_data_5_30').to_df().to_pandas()

# add wl's text features
wl_word = odps.get_table('pre_txt_features_b').to_df().to_pandas()

gene_data = odps.get_table('meinian_round2_snp').to_df().to_pandas()

word_data = pd.merge(word_data, wl_word, on='vid', how='inner')
# fix feature 314: values <= 1 appear to be mis-scaled by a factor of 100
num_data.loc[num_data['314'] <= 1, '314'] = num_data.loc[num_data['314'] <= 1, '314'] * 100

# label-encode every snp column independently
lbl = preprocessing.LabelEncoder()

for c in gene_data.columns:
    if c not in ['vid']:
        gene_data[c] = lbl.fit_transform(gene_data[c])

print('final word data shape: ', word_data.shape)
print('final num data shape: ', num_data.shape)
print('final gene data shape: ', gene_data.shape)

merge_tmp = pd.merge(num_data, word_data, on='vid', how='inner')
merge_tmp = pd.merge(merge_tmp, gene_data, on='vid', how='left')

print('final data shape: ', merge_tmp.shape)

train_merge = pd.merge(train, merge_tmp, on='vid', how='left')
test_merge = pd.merge(test, merge_tmp, on='vid', how='left')

# fix some hdl and dia values for two specific vids
train_merge.loc[train_merge['vid'] == '605ebf5c6173cd3aab071060c9618b79', 'hdl'] = 1.28
train_merge.loc[train_merge['vid'] == 'c6aec5461b1c5cca1c4ead3d4c2b83d9', 'dia'] = 90
train_merge.fillna(-999, inplace=True)
test_merge.fillna(-999, inplace=True)
print('final train shape:{}, test shape:{} '.format(train_merge.shape, test_merge.shape))
juz_train = DataFrame(train_merge)
juz_test = DataFrame(test_merge)
juz_train.persist('juz_train_6_6_final')
juz_test.persist('juz_test_6_6_final')
--------------------------------------------------------------------------------
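For intuition, the per-column LabelEncoder pass above maps each snp column's raw values to small integers, with an encoding fitted independently per column. Genotype-like strings are assumed here purely for illustration; the actual snp values are not shown in this dump:

```python
from sklearn import preprocessing

lbl = preprocessing.LabelEncoder()
# each distinct value gets an integer code in sorted order
print(lbl.fit_transform(['AA', 'AG', 'GG', 'AG']))  # [0 1 2 1]
```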
/round2_rank10/feature_selection/select_features_by_model.py:
--------------------------------------------------------------------------------
import numpy as np
import pandas as pd
from odps.df import DataFrame
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import RidgeCV

train = odps.get_table('jz_combine_tl_train_6_2').to_df().to_pandas()
test = odps.get_table('jz_combine_tl_test_6_2').to_df().to_pandas()

predict_features = ['sys', 'dia', 'tl', 'hdl', 'ldl']
use_features = [t for t in train.columns if t != 'vid' and t not in predict_features]
x_train = train.loc[:, use_features]
label = train['tl']

gbdt = GradientBoostingRegressor(random_state=1)
rf = RandomForestRegressor(random_state=1)
l2 = RidgeCV()

# keep features whose gbdt importance exceeds the threshold
sfm_gbdt = SelectFromModel(gbdt, threshold=0.001)
sfm_gbdt.fit_transform(x_train, label)
gbdt_features = set(x_train.columns[sfm_gbdt.get_support()])
print('*************************************')
print(gbdt_features)


sfm_rf = SelectFromModel(rf, threshold=0.001)
sfm_rf.fit_transform(x_train, label)
rf_features = set(x_train.columns[sfm_rf.get_support()])
print('*************************************')
print(rf_features)

print(gbdt_features & rf_features)
# for the ridge model the threshold applies to absolute coefficients
sfm_l2 = SelectFromModel(l2, threshold=0.5)
sfm_l2.fit_transform(x_train, label)
l2_features = set(x_train.columns[sfm_l2.get_support()])
print('*************************************')
print(l2_features)

# take the union of the three selectors' feature sets
final_features = list(gbdt_features | rf_features | l2_features)
# alternative: intersect the two tree models, then add the ridge picks
# final_features = list((gbdt_features & rf_features) | l2_features)
print('gbdt model has {} features'.format(len(gbdt_features)))
print('rf model has {} features'.format(len(rf_features)))
print('l2 model has {} features'.format(len(l2_features)))
print('final has {} features'.format(len(final_features)))
print('*************************************')
print(final_features)
print('*************************************')

final_features.extend(['vid', 'tl'])
train_final = DataFrame(train.loc[:, final_features])
train_final.persist('combine_tl_train_6_2')
test_final = DataFrame(test.loc[:, final_features])
test_final.persist('combine_tl_test_6_2')
--------------------------------------------------------------------------------
/round2_rank10/feature_selection/snp_drop_one_hot.py:
--------------------------------------------------------------------------------
import time
import re
import pandas as pd
from odps import ODPS
from odps.df import DataFrame
import numpy as np
from collections import Iterable
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import preprocessing
from itertools import combinations

import sys
reload(sys)
sys.setdefaultencoding('utf8')

# select the top `threshold` snp features (ranked by GBDT importance) for one-hot encoding
def get_one_hot_list(data_frame, pred_feac, threshold=10):
    print('Now we extract snp features for {} ...'.format(pred_feac))
    predict_features = ['sys', 'dia', 'tl', 'hdl', 'ldl', 'vid']
    use_features = [i for i in data_frame.columns if i not in predict_features]
    x_train = data_frame.loc[:, use_features]
    label = data_frame[pred_feac]

    gbdt = GradientBoostingRegressor(random_state=1, n_estimators=100)
    gbdt.fit(x_train, label)
    feature_imp = gbdt.feature_importances_
    df = pd.DataFrame()
    df['feature'] = x_train.columns
    df['imp'] = feature_imp
    df.sort_values(by='imp', ascending=False, inplace=True)
    snp_list = [s for s in df['feature'] if s.startswith('snp')][:threshold]
    return snp_list


# label | one-hot threshold | input table                         | output table
# sys   | 14                | juz_train_6_6_add_wzm_for145_final2 | sys_juz_train_6_6_snp_onehot_22
# dia   | 10                | juz_train_6_6_add_wzm_for145_final2 | dia_juz_train_6_6_snp_onehot_22
# tl    | 14                | juz_train_6_6_add_wzm_onlytl_final  | tl_juz_train_6_6_snp_onehot_22
# hdl   | 10                | juz_train_6_6_add_wzm_for145_final  | ldl_juz_train_6_6_snp_onehot_22
# ldl   | 1                 | juz_train_6_6_add_wzm_for145_final  | hdl_juz_train_6_6_snp_onehot_22


if __name__ == "__main__":
    use_label = 'hdl'
    train = odps.get_table('juz_train_6_6_add_wzm_for145_final').to_df().to_pandas()
    test = odps.get_table('juz_test_6_6_add_wzm_for145_final').to_df().to_pandas()
    print(train.shape, test.shape)
    gene_list = get_one_hot_list(train, use_label, 14)
    train.replace(-999, np.nan, inplace=True)
    test.replace(-999, np.nan, inplace=True)

    # drop all label-encoded snp columns; the selected ones come back one-hot encoded below
    drop_snp = [s for s in train.columns if 'snp' in s]
    train.drop(drop_snp, axis=1, inplace=True)
    test.drop(drop_snp, axis=1, inplace=True)

    gene_data = odps.get_table('meinian_round2_snp').to_df().to_pandas()
    snp_data = pd.get_dummies(gene_data.loc[:, gene_list])
    snp_data['vid'] = gene_data['vid'].values
    for s in snp_data.columns:
        if s != 'vid':
            snp_data[s] = snp_data[s].astype(int)

    train_merge = pd.merge(train, snp_data, on='vid', how='left')
    test_merge = pd.merge(test, snp_data, on='vid', how='left')

    train_merge.fillna(-999, inplace=True)
    test_merge.fillna(-999, inplace=True)
    print('final train shape:{}, test shape:{} '.format(train_merge.shape, test_merge.shape))

    juz_train = DataFrame(train_merge)
    juz_test = DataFrame(test_merge)
    juz_train.persist('{}_juz_train_6_6_snp_onehot_22'.format(use_label))
    juz_test.persist('{}_juz_test_6_6_snp_onehot_22'.format(use_label))
--------------------------------------------------------------------------------
/round1_rank2/team/data_process_by_Mongo.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @Time    : 2018/4/10 9:10 AM
# @Author  : Juzphy

import pandas as pd
import time
from pymongo import MongoClient
from collections import defaultdict


'''
save data to MongoDB and export the mongo data to csv file.
'''


def feature_data():
    df = pd.read_csv('../data/meinian_round1_data_part1_20180408.txt', delimiter='$')
    df2 = pd.read_csv('./data/meinian_round1_data_part2_20180408.txt', delimiter='$')
    df = df.append(df2, ignore_index=True)
    df = df.fillna('')
    return df


def data_load(path):
    data = pd.read_csv(path, encoding='gbk')
    data = data.fillna('')
    return data


def match(feature, data_frame, hostname, db_name, set_name, port=27017):
    count = 0
    time_start = time.time()
    df_group = (d for d in feature.groupby(by='vid'))
    vid_value = set(data_frame['vid'].values)
    mongo_conn = MongoClient(hostname, port)
    db_set = mongo_conn[db_name][set_name]
    for j in df_group:
        if j[0] in vid_value:
            count += 1
            temp_dict = data_frame[data_frame['vid'] == j[0]].to_dict()
            temp_dict = {'_id' if k == 'vid' else k: n for k, v in temp_dict.items() for n in v.values()}
            # a vid with repeated table_ids gets its duplicated results '$'-joined
            if len(j[1]['table_id']) > len(j[1]['table_id'].unique()):
                j[1].index = range(j[1].shape[0])
                table_dict = defaultdict(int)
                for t in j[1]['table_id']:
                    table_dict[t] += 1
                beyond_one = [k for k, v in table_dict.items() if v > 1]
                other = [k for k, v in table_dict.items() if v == 1]
                other_index = j[1][j[1]['table_id'].isin(other)].index
                temp = dict(zip(j[1]['table_id'].iloc[other_index], j[1]['field_results'].iloc[other_index]))
                beyond_dict = {k: '$'.join(j[1]['field_results'].iloc[j[1][j[1]['table_id'] == k].index].fillna('')) for
                               k in beyond_one}
                temp.update(beyond_dict)
            else:
                temp = dict(zip(j[1]['table_id'], j[1]['field_results']))
            temp_dict.update(temp)
            db_set.save(temp_dict)
            print('vid of {} has been written.'.format(j[0]))
    print("total written {0} records, spent {1} s.".format(count, round(time.time() - time_start, 2)))


def feature_count(hostname, db_name, set_name, port=27017, count_threshold=0.4, name='train_set'):
    mongo_conn = MongoClient(hostname, port)
    mongo_set = mongo_conn[db_name][set_name]
    cursor = mongo_set.find()
    feature_dict = defaultdict(list)
    size = 0
    for c in cursor:
        size += 1
        for k in c.keys():
            if k != '_id':
                feature_dict[k].append(c['_id'])

    feature_lt_threshold = [code for code, b_list in feature_dict.items() if len(b_list)/size < count_threshold]
    feature_gt_threshold = set(feature_dict.keys()) - set(feature_lt_threshold)
    barcode_gt_threshold = list({f for fd in feature_gt_threshold for f in feature_dict[fd]})
    temp = {t: 0 for t in feature_lt_threshold}
    data = pd.DataFrame(list(mongo_set.find({"_id": {"$in": barcode_gt_threshold}}, temp)))
    data.to_csv('../data/{}.csv'.format(name), index=None, encoding='gbk')
    print('{}.csv has been successfully saved.'.format(name))


def mongo2csv(hostname, db_name, set_name, port=27017, name='train_set'):
    mongo_conn = MongoClient(hostname, port)
    mongo_set = mongo_conn[db_name][set_name]
    data = pd.DataFrame(list(mongo_set.find()))
    data.to_csv('./data/{}.csv'.format(name), index=None, encoding='gbk')
    print('{}.csv has been successfully saved.'.format(name))


if __name__ == '__main__':
    all_features = feature_data()
    # train = data_load('./data/origin_train.csv')
    # a_test = data_load('./data/origin_test_a.csv')
    b_test = data_load('./data/origin_test_b.csv')
    # print(train.shape, a_test.shape)
    match(all_features, b_test, 'localhost', 'meinian', 'test_b')
    # match(feature_data, a_test, 'localhost', 'meinian', 'test_data')
    # feature_count('10.10.0.7', 'meinian', 'train_data', name='new_meinian_train')
    # feature_count('10.10.0.7', 'meinian', 'test_data', name='new_meinian_test')
    mongo2csv('localhost', 'meinian', 'test_b', name='meinian_test_b')
--------------------------------------------------------------------------------
/round2_rank10/feature_selection/classification_tl.py:
--------------------------------------------------------------------------------
from odps import ODPS
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
from odps.df import DataFrame
from sklearn.model_selection import KFold
import time
import numpy as np


def gbdt_model(df, label, use_feature, true_test, submission_data, gbdt_model):
    print(submission_data.head())
    print("GBDT: training label {}...".format(label))
    value4preds = df['pos_4']
    train_data = df.loc[:, use_feature]
    print(train_data.shape, true_test.shape)
    pred_labels = np.zeros(df.shape[0])
    submission_label = np.zeros((true_test.shape[0], 5))
    kf = KFold(n_splits=5, shuffle=True, random_state=1024)
    # the fold block below was originally written out long-hand five times
    for fold, (train_index, test_index) in enumerate(kf.split(train_data, value4preds)):
        print('Fold {} training...'.format(fold + 1))
        x_train, x_test = train_data.iloc[train_index], train_data.iloc[test_index]
        y_train, y_test = value4preds.iloc[train_index], value4preds.iloc[test_index]
        gbdt_model.fit(x_train, y_train)
        pred_labels[x_test.index] = np.where(gbdt_model.predict(x_test) > 0.5, 1, 0)
        submission_label[:, fold] = np.where(gbdt_model.predict(true_test) > 0.5, 1, 0)
        print('Fold {} done'.format(fold + 1))
        print('*******************************************************************')
    # flag a vid as tl > 4 when any of the five fold models predicts positive
    submission_data['pos_4'] = np.where(np.sum(submission_label, axis=1) >= 1, 1, 0)
    # classification_report expects (y_true, y_pred)
    print(classification_report(value4preds, pred_labels))
    print(submission_data[submission_data['pos_4'] == 1])
    sub_class = DataFrame(submission_data[submission_data['pos_4'] == 1], unknown_as_string=True)
    sub_class.persist('tl_gt_4_vid_6_6')

# the A board used the high/low tl classification; the B board did not
if __name__ == "__main__":
    train = odps.get_table('juz_train_6_6_final').to_df().to_pandas()
    train['pos_4'] = train['tl'].apply(lambda x: 1 if x > 4 else 0)
    test = odps.get_table('juz_test_6_6_final').to_df().to_pandas()
    class_result = test.loc[:, ['vid', 'tl']]
    predict_features = ['sys', 'dia', 'tl', 'hdl', 'ldl']
    # exclude the helper pos_4 label from the features (it does not exist in the test table)
    use_features = [t for t in train.columns if t != 'vid' and t not in predict_features and t != 'pos_4']
    test_data = test.loc[:, use_features]
    start = time.time()
    model = GradientBoostingClassifier(learning_rate=0.01, n_estimators=1500, max_depth=5, subsample=0.7,
                                       random_state=1, verbose=0, min_samples_leaf=50)
    for i, j in enumerate(predict_features):
        if j in ['tl']:
            gbdt_model(train, j, use_features, test_data, class_result, model)
--------------------------------------------------------------------------------
/round2_rank10/feature_selection/predict_value_tl_gt_4.py:
--------------------------------------------------------------------------------
from odps import ODPS
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import BayesianRidge
from odps.df import DataFrame
from sklearn.model_selection import KFold
import time
import numpy as np


def eval_metric(pred, labels):
    return np.mean(np.power(np.log(pred + 1) - np.log(labels + 1), 2))

def gbdt_model(df, label, use_feature, true_test, submission_data, gbdt_model):
    print("GBDT: training label {}...".format(label))
    value4preds = np.log(df[label])
    train_data = df.loc[:, use_feature]
    print(train_data.shape)
    scores = np.zeros(len(value4preds))
    submission_scores = np.zeros((len(submission_data), 5))
    kf = KFold(n_splits=5, shuffle=True, random_state=1024)
    # the fold block below was originally written out long-hand five times
    for fold, (train_index, test_index) in enumerate(kf.split(train_data, value4preds)):
        print('Fold {} training...'.format(fold + 1))
        x_train, x_test = train_data.iloc[train_index], train_data.iloc[test_index]
        y_train, y_test = value4preds.iloc[train_index], value4preds.iloc[test_index]
        gbdt_model.fit(x_train, y_train)
        scores[test_index] = np.exp(gbdt_model.predict(x_test))
        submission_scores[:, fold] = gbdt_model.predict(true_test)
        print('Fold {} done'.format(fold + 1))
        print('*******************************************************************')
    submission_data[label] = np.exp(np.mean(submission_scores, axis=1)).round(3)


# the A board used the high/low tl classification; the B board did not
if __name__ == "__main__":
    train = odps.get_table('juz_train_6_6').to_df().to_pandas()
    test = odps.get_table('juz_test_6_6').to_df().to_pandas()
    submission = odps.get_table('tl_jz_5_fold_6_6_submit_22').to_df().to_pandas()
    vid_gt_4 = odps.get_table('tl_gt_4_vid_6_6').to_df().to_pandas()['vid']
    predict_features = ['sys', 'dia', 'tl', 'hdl', 'ldl']
    use_features = [t for t in train.columns if t != 'vid' and t not in predict_features and t != 'pos_4' and not 'log' in t]
    pos_eq_1 = test[test['vid'].isin(vid_gt_4)]
    test_eq_1 = pos_eq_1.loc[:, use_features]
    submission_gt_4 = pos_eq_1.loc[:, ['vid', 'tl']]
    train_gt_4 = train[train['tl'] >= 4]
    train_gt_4.index = list(range(train_gt_4.shape[0]))
    model = GradientBoostingRegressor(learning_rate=0.01, n_estimators=800, max_depth=5, subsample=0.8,
                                      random_state=1, verbose=1, min_samples_leaf=20)
    gbdt_model(train_gt_4, 'tl', use_features, test_eq_1, submission_gt_4, model)
    # for vids flagged as tl > 4, keep the larger of the regular and high-value predictions
    gt_4_index = submission[submission['vid'].isin(submission_gt_4['vid'])].index
    submission_temp = submission.loc[gt_4_index, ['vid', 'tl']]
    merge_fat = pd.merge(submission_temp, submission_gt_4, on='vid')
    temp_columns = [tc for tc in merge_fat.columns if tc != 'vid']
    replace_num = np.max(merge_fat.loc[:, temp_columns], axis=1)
    submission.loc[gt_4_index, 'tl'] = replace_num.values
    print(submission.sort_values(by=['tl'], ascending=False))
    sub_final = DataFrame(submission)
    sub_final.persist('tl_jz_5_fold_6_6_22_submit_modified_high_value')
--------------------------------------------------------------------------------
/round2_rank10/feature_selection/gbdt_log_model.py:
--------------------------------------------------------------------------------
from odps import ODPS
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from odps.df import DataFrame
from sklearn.model_selection import KFold
import time
import numpy as np


def eval_metric(pred, labels):
    return np.mean(np.power(np.log(pred + 1) - np.log(labels + 1), 2))

def gbdt_model(df, label, use_feature, true_test, submission_data, gbdt_model):
    print("GBDT: training label {}...".format(label))
    value4preds = np.log(df[label])
    train_data = df.loc[:, use_feature]
    print(train_data.shape)
    scores = np.zeros(len(value4preds))
    submission_scores = np.zeros((len(submission_data), 5))
    kf = KFold(n_splits=5, shuffle=True, random_state=1024)
    # the fold block below was originally written out long-hand five times
    for fold, (train_index, test_index) in enumerate(kf.split(train_data, value4preds)):
        print('Fold {} training...'.format(fold + 1))
        x_train, x_test = train_data.iloc[train_index], train_data.iloc[test_index]
        y_train, y_test = value4preds.iloc[train_index], value4preds.iloc[test_index]
        gbdt_model.fit(x_train, y_train)
        scores[test_index] = np.exp(gbdt_model.predict(x_test))
        submission_scores[:, fold] = gbdt_model.predict(true_test)
        print('the score is: ', eval_metric(scores[test_index], np.exp(y_test)))
        print('Fold {} done'.format(fold + 1))
        print('*******************************************************************')
    submission_data[label] = np.exp(np.mean(submission_scores, axis=1)).round(3)
    return eval_metric(scores, np.exp(value4preds))


# label | snp one-hot | CV score             | input table                         | output table
# sys   | 14          | 0.013770852754101864 | juz_train_6_6_add_wzm_for145_final2 | sys_juz_train_6_6_snp_onehot_22
# dia   | 10          | 0.01811632794174798  | juz_train_6_6_add_wzm_for145_final2 | dia_juz_train_6_6_snp_onehot_22
# tl    | 14          | 0.088753086260020458 | juz_train_6_6_add_wzm_onlytl_final  | tl_juz_train_6_6_snp_onehot_22
# hdl   | 10          | 0.011026393729362835 | juz_train_6_6_add_wzm_for145_final  | ldl_juz_train_6_6_snp_onehot_22
# ldl   | 1           | 0.033357708093281487 | juz_train_6_6_add_wzm_for145_final  | hdl_juz_train_6_6_snp_onehot_22
if __name__ == "__main__":
    use_label = 'dia'
    train = odps.get_table('{}_juz_train_6_6_snp_onehot_22'.format(use_label)).to_df().to_pandas()
    test = odps.get_table('{}_juz_test_6_6_snp_onehot_22'.format(use_label)).to_df().to_pandas()

    print(train.shape)
    print(test.shape)
    predict_features = ['sys', 'dia', 'tl', 'hdl', 'ldl']
    use_features = [t for t in train.columns if t != 'vid' and t not in predict_features]
    test_data = test.loc[:, use_features]

    submission = test.loc[:, ['vid', use_label]]
    base_line_score = np.zeros(5)
    start = time.time()
    model = GradientBoostingRegressor(learning_rate=0.01, n_estimators=800, max_depth=5, subsample=0.7,
                                      random_state=1, verbose=0, min_samples_leaf=50)
    for i, j in enumerate(predict_features):
        if j in [use_label]:
            base_line_score[i] = gbdt_model(train, j, use_features, test_data, submission, model)
    print(dict(zip(predict_features, base_line_score)))
    print('CV training took {} seconds'.format(time.time() - start))
    print('scores:', np.mean(base_line_score))
--------------------------------------------------------------------------------
/round2_rank10/feature_selection/get_best_rounds.py:
--------------------------------------------------------------------------------
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import KFold
import time
import numpy as np
import pandas as pd

def eval_metric(pred, labels):
    return np.mean(np.power(np.log(pred + 1) - np.log(labels + 1), 2))

def gbdt_model(df, label, use_feature, true_test, submission_data, gbdt_model):
    print("GBDT: training label {}...".format(label))
    value4preds = np.log(df[label])
    train_data = df.loc[:, use_feature]
    print(train_data.shape)
    scores = np.zeros(len(value4preds))
    submission_scores = np.zeros((len(submission_data), 5))
    kf = KFold(n_splits=5, shuffle=True, random_state=1024)
    # the fold block below was originally written out long-hand five times
    for fold, (train_index, test_index) in enumerate(kf.split(train_data, value4preds)):
        print('Fold {} training...'.format(fold + 1))
        x_train, x_test = train_data.iloc[train_index], train_data.iloc[test_index]
        y_train, y_test = value4preds.iloc[train_index], value4preds.iloc[test_index]
        gbdt_model.fit(x_train, y_train)
        # walk the staged predictions to find the iteration count with the lowest error
        errors = [eval_metric(np.exp(y_test), np.exp(y_pred))
                  for y_pred in gbdt_model.staged_predict(x_test)]
        best_n_estimators = np.argmin(errors) + 1
        print("best number of estimators_{} is : ".format(fold + 1), best_n_estimators)
        scores[test_index] = np.exp(gbdt_model.predict(x_test))
        print('the score is: ', eval_metric(scores[test_index], np.exp(y_test)))
        print('Fold {} done'.format(fold + 1))
        print('*******************************************************************')
    return eval_metric(scores, np.exp(value4preds))


# find the best iteration count for each predicted label within 2000 rounds
if __name__ == "__main__":
    use_label = 'sys'
    train = odps.get_table('{}_juz_train_6_6_snp_onehot_22'.format(use_label)).to_df().to_pandas()
    test = odps.get_table('{}_juz_test_6_6_snp_onehot_22'.format(use_label)).to_df().to_pandas()
    print(train.shape)
    print(test.shape)
    predict_features = ['sys', 'dia', 'tl', 'hdl', 'ldl']
    use_features = [t for t in train.columns if t != 'vid' and t not in predict_features]
    test_data = test.loc[:, use_features]
    submission = test.loc[:, ['vid', use_label]]
    base_line_score = np.zeros(5)
    start = time.time()
    model = GradientBoostingRegressor(learning_rate=0.01, n_estimators=2000, max_depth=5, subsample=0.7,
                                      random_state=1, verbose=0, min_samples_leaf=50)
    for i, j in enumerate(predict_features):
        if j in [use_label]:
            base_line_score[i] = gbdt_model(train, j, use_features, test_data, submission, model)
    print(dict(zip(predict_features, base_line_score)))
    print('CV training took {} seconds'.format(time.time() - start))
    print('scores:', np.mean(base_line_score))
--------------------------------------------------------------------------------
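The per-fold optima printed by this script are what the every_predict_model files below hard-code into their five GradientBoostingRegressor instances. A hypothetical glue snippet, not part of the repo itself; the sys and dia numbers are taken from the comments in those files:

```python
from sklearn.ensemble import GradientBoostingRegressor

# per-label best rounds copied from the get_best_rounds logs (see the comments below)
BEST_ROUNDS = {
    'sys': [624, 902, 911, 535, 533],
    'dia': [739, 601, 887, 709, 1221],
}

def make_fold_models(label):
    # one regressor per fold, identical except for its tuned n_estimators
    return [GradientBoostingRegressor(learning_rate=0.01, n_estimators=n, max_depth=5,
                                      subsample=0.7, random_state=1, verbose=0,
                                      min_samples_leaf=50)
            for n in BEST_ROUNDS[label]]
```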
/round2_rank10/every_predict_model/sys_gbdt_best_rounds.py:
--------------------------------------------------------------------------------
from odps import ODPS
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from odps.df import DataFrame
from sklearn.model_selection import KFold
import time
import numpy as np


def eval_metric(pred, labels):
    return np.mean(np.power(np.log(pred + 1) - np.log(labels + 1), 2))

def gbdt_model(df, label, use_feature, true_test, submission_data):
    print("GBDT: training label {}...".format(label))
    value4preds = np.log(df[label])
    train_data = df.loc[:, use_feature]
    print(train_data.shape)
    scores = np.zeros(len(value4preds))
    submission_scores = np.zeros((len(submission_data), 5))
    kf = KFold(n_splits=5, shuffle=True, random_state=1024)
    # per-fold best iteration counts found by feature_selection/get_best_rounds.py;
    # the fold block below was originally written out long-hand five times
    best_rounds = [624, 902, 911, 535, 533]
    for fold, (train_index, test_index) in enumerate(kf.split(train_data, value4preds)):
        print('Fold {} training...'.format(fold + 1))
        x_train, x_test = train_data.iloc[train_index], train_data.iloc[test_index]
        y_train, y_test = value4preds.iloc[train_index], value4preds.iloc[test_index]
        fold_model = GradientBoostingRegressor(learning_rate=0.01, n_estimators=best_rounds[fold], max_depth=5,
                                               subsample=0.7, random_state=1, verbose=0, min_samples_leaf=50)
        fold_model.fit(x_train, y_train)
        scores[test_index] = np.exp(fold_model.predict(x_test))
        submission_scores[:, fold] = fold_model.predict(true_test)
        print('the score is: ', eval_metric(scores[test_index], np.exp(y_test)))
        print('Fold {} done'.format(fold + 1))
        print('*******************************************************************')
    submission_data[label] = np.exp(np.mean(submission_scores, axis=1)).round(3)
    return eval_metric(scores, np.exp(value4preds))


# B board
# per-fold best rounds: 624 902 911 535 533
# 'sys': 0.013720952232896292
if __name__ == "__main__":
    train = odps.get_table('sys_juz_train_6_6_snp_onehot_22').to_df().to_pandas()
    test = odps.get_table('sys_juz_test_6_6_snp_onehot_22').to_df().to_pandas()
    print(train.shape)
    print(test.shape)
    predict_features = ['sys', 'dia', 'tl', 'hdl', 'ldl']
    use_features = [t for t in train.columns if t != 'vid' and t not in predict_features]
    test_data = test.loc[:, use_features]

    submission = test.loc[:, ['vid', 'sys']]
    base_line_score = np.zeros(5)
    start = time.time()
    for i, j in enumerate(predict_features):
        if j in ['sys']:
            base_line_score[i] = gbdt_model(train, j, use_features, test_data, submission)
    print(dict(zip(predict_features, base_line_score)))
    print('CV training took {} seconds'.format(time.time() - start))
    print('scores:', np.mean(base_line_score))
    sub_final = DataFrame(submission)
    sub_final.persist('sys_jz_5_fold_6_6_submit_22')
--------------------------------------------------------------------------------
GradientBoostingRegressor(learning_rate=0.01, n_estimators=887, max_depth=5, subsample=0.7, 52 | random_state=1, verbose=0, min_samples_leaf=50) 53 | gbdt_model_3.fit(x_train_3, y_train_3) 54 | scores[test_index_3] = np.exp(gbdt_model_3.predict(x_test_3)) 55 | submission_scores[:, 2] = gbdt_model_3.predict(true_test) 56 | print('the score is: ', eval_metric(scores[test_index_3], np.exp(y_test_3))) 57 | print('第3次训练结束') 58 | print('*******************************************************************') 59 | train_index_4, test_index_4 = five_fold_index[3] 60 | print('第4次训练...') 61 | x_train_4, x_test_4 = train_data.iloc[train_index_4], train_data.iloc[test_index_4] 62 | y_train_4, y_test_4 = value4preds.iloc[train_index_4], value4preds.iloc[test_index_4] 63 | gbdt_model_4 = GradientBoostingRegressor(learning_rate=0.01, n_estimators=709, max_depth=5, subsample=0.7, 64 | random_state=1, verbose=0, min_samples_leaf=50) 65 | gbdt_model_4.fit(x_train_4, y_train_4) 66 | scores[test_index_4] = np.exp(gbdt_model_4.predict(x_test_4)) 67 | submission_scores[:, 3] = gbdt_model_4.predict(true_test) 68 | print('the score is: ', eval_metric(scores[test_index_4], np.exp(y_test_4))) 69 | print('第4次训练结束') 70 | print('*******************************************************************') 71 | train_index_5, test_index_5 = five_fold_index[4] 72 | print('第5次训练...') 73 | x_train_5, x_test_5 = train_data.iloc[train_index_5], train_data.iloc[test_index_5] 74 | y_train_5, y_test_5 = value4preds.iloc[train_index_5], value4preds.iloc[test_index_5] 75 | gbdt_model_5 = GradientBoostingRegressor(learning_rate=0.01, n_estimators=1221, max_depth=5, subsample=0.7, 76 | random_state=1, verbose=0, min_samples_leaf=50) 77 | gbdt_model_5.fit(x_train_5, y_train_5) 78 | scores[test_index_5] = np.exp(gbdt_model_5.predict(x_test_5)) 79 | submission_scores[:, 4] = gbdt_model_5.predict(true_test) 80 | print('the score is: ', eval_metric(scores[test_index_5], np.exp(y_test_5))) 81 | print('第5次训练结束') 82 | print('*******************************************************************') 83 | submission_data[label] = np.exp(np.mean(submission_scores, axis=1)).round(3) 84 | return eval_metric(scores, np.exp(value4preds)) 85 | 86 | 87 | # b-board 88 | # 739 601 887 709 1221 89 | # 'dia': 0.018069628693683809 90 | if __name__ == "__main__": 91 | train = odps.get_table('dia_juz_train_6_6_snp_onehot_22').to_df().to_pandas() 92 | test = odps.get_table('dia_juz_test_6_6_snp_onehot_22').to_df().to_pandas() 93 | print(train.shape) 94 | print(test.shape) 95 | predict_features = ['sys', 'dia', 'tl', 'hdl', 'ldl'] 96 | use_features = [t for t in train.columns if t != 'vid' and t not in predict_features] 97 | test_data = test.loc[:, use_features] 98 | 99 | submission = test.loc[:, ['vid', 'dia']] 100 | base_line_score = np.zeros(5) 101 | start = time.time() 102 | for i, j in enumerate(predict_features): 103 | if j in ['dia']: 104 | base_line_score[i] = gbdt_model(train, j, use_features, test_data, submission) 105 | print(dict(zip(predict_features, base_line_score))) 106 | print('CV训练用时{}秒'.format(time.time() - start)) 107 | print('scores:', np.mean(base_line_score)) 108 | sub_final = DataFrame(submission) 109 | sub_final.persist('dia_jz_5_fold_6_6_submit_22') -------------------------------------------------------------------------------- /round2_rank10/every_predict_model/ldl_gbdt_best_rounds.py: -------------------------------------------------------------------------------- 1 | from odps import ODPS 2 | import pandas as pd 3 | from sklearn.ensemble 
import GradientBoostingRegressor 4 | from odps.df import DataFrame 5 | from sklearn.model_selection import KFold 6 | import time 7 | import numpy as np 8 | 9 | 10 | def eval_metric(pred, labels): 11 | return np.mean(np.power(np.log(pred + 1) - np.log(labels + 1), 2)) 12 | 13 | def gbdt_model(df, label, use_feature, true_test, submission_data): 14 | print("基于GBDT: 开始训练 label 为{}...".format(label)) 15 | value4preds = np.log(df[label]) 16 | train_data = df.loc[:, use_feature] 17 | print(train_data.shape) 18 | scores = np.zeros(len(value4preds)) 19 | submission_scores = np.zeros((len(submission_data), 5)) 20 | kf = KFold(n_splits=5, shuffle=True, random_state=1024) 21 | five_fold_index = list(kf.split(train_data, value4preds)) 22 | 23 | train_index_1, test_index_1 = five_fold_index[0] 24 | print('第1次训练...') 25 | x_train_1, x_test_1 = train_data.iloc[train_index_1], train_data.iloc[test_index_1] 26 | y_train_1, y_test_1 = value4preds.iloc[train_index_1], value4preds.iloc[test_index_1] 27 | gbdt_model_1 = GradientBoostingRegressor(learning_rate=0.01, n_estimators=718, max_depth=5, subsample=0.7, 28 | random_state=1, verbose=0, min_samples_leaf=50) 29 | gbdt_model_1.fit(x_train_1, y_train_1) 30 | scores[test_index_1] = np.exp(gbdt_model_1.predict(x_test_1)) 31 | submission_scores[:, 0] = gbdt_model_1.predict(true_test) 32 | print('the score is: ', eval_metric(scores[test_index_1], np.exp(y_test_1))) 33 | print('第1次训练结束') 34 | print('*******************************************************************') 35 | train_index_2, test_index_2 = five_fold_index[1] 36 | print('第2次训练...') 37 | x_train_2, x_test_2 = train_data.iloc[train_index_2], train_data.iloc[test_index_2] 38 | y_train_2, y_test_2 = value4preds.iloc[train_index_2], value4preds.iloc[test_index_2] 39 | gbdt_model_2 = GradientBoostingRegressor(learning_rate=0.01, n_estimators=968, max_depth=5, subsample=0.7, 40 | random_state=1, verbose=0, min_samples_leaf=50) 41 | gbdt_model_2.fit(x_train_2, y_train_2) 42 | scores[test_index_2] = np.exp(gbdt_model_2.predict(x_test_2)) 43 | submission_scores[:, 1] = gbdt_model_2.predict(true_test) 44 | print('the score is: ', eval_metric(scores[test_index_2], np.exp(y_test_2))) 45 | print('第2次训练结束') 46 | print('*******************************************************************') 47 | train_index_3, test_index_3 = five_fold_index[2] 48 | print('第3次训练...') 49 | x_train_3, x_test_3 = train_data.iloc[train_index_3], train_data.iloc[test_index_3] 50 | y_train_3, y_test_3 = value4preds.iloc[train_index_3], value4preds.iloc[test_index_3] 51 | gbdt_model_3 = GradientBoostingRegressor(learning_rate=0.01, n_estimators=993, max_depth=5, subsample=0.7, 52 | random_state=1, verbose=0, min_samples_leaf=50) 53 | gbdt_model_3.fit(x_train_3, y_train_3) 54 | scores[test_index_3] = np.exp(gbdt_model_3.predict(x_test_3)) 55 | submission_scores[:, 2] = gbdt_model_3.predict(true_test) 56 | print('the score is: ', eval_metric(scores[test_index_3], np.exp(y_test_3))) 57 | print('第3次训练结束') 58 | print('*******************************************************************') 59 | train_index_4, test_index_4 = five_fold_index[3] 60 | print('第4次训练...') 61 | x_train_4, x_test_4 = train_data.iloc[train_index_4], train_data.iloc[test_index_4] 62 | y_train_4, y_test_4 = value4preds.iloc[train_index_4], value4preds.iloc[test_index_4] 63 | gbdt_model_4 = GradientBoostingRegressor(learning_rate=0.01, n_estimators=1499, max_depth=5, subsample=0.7, 64 | random_state=1, verbose=0, min_samples_leaf=50) 65 | gbdt_model_4.fit(x_train_4, y_train_4) 66 
| scores[test_index_4] = np.exp(gbdt_model_4.predict(x_test_4)) 67 | submission_scores[:, 3] = gbdt_model_4.predict(true_test) 68 | print('the score is: ', eval_metric(scores[test_index_4], np.exp(y_test_4))) 69 | print('第4次训练结束') 70 | print('*******************************************************************') 71 | train_index_5, test_index_5 = five_fold_index[4] 72 | print('第5次训练...') 73 | x_train_5, x_test_5 = train_data.iloc[train_index_5], train_data.iloc[test_index_5] 74 | y_train_5, y_test_5 = value4preds.iloc[train_index_5], value4preds.iloc[test_index_5] 75 | gbdt_model_5 = GradientBoostingRegressor(learning_rate=0.01, n_estimators=923, max_depth=5, subsample=0.7, 76 | random_state=1, verbose=0, min_samples_leaf=50) 77 | gbdt_model_5.fit(x_train_5, y_train_5) 78 | scores[test_index_5] = np.exp(gbdt_model_5.predict(x_test_5)) 79 | submission_scores[:, 4] = gbdt_model_5.predict(true_test) 80 | print('the score is: ', eval_metric(scores[test_index_5], np.exp(y_test_5))) 81 | print('第5次训练结束') 82 | print('*******************************************************************') 83 | submission_data[label] = np.exp(np.mean(submission_scores, axis=1)).round(3) 84 | return eval_metric(scores, np.exp(value4preds)) 85 | 86 | 87 | # b-board 88 | # 718 968 993 1499 923 89 | # 'ldl': 0.033119396519559752 90 | if __name__ == "__main__": 91 | train = odps.get_table('ldl_juz_train_6_6_snp_onehot_22').to_df().to_pandas() 92 | test = odps.get_table('ldl_juz_test_6_6_snp_onehot_22').to_df().to_pandas() 93 | print(train.shape) 94 | print(test.shape) 95 | predict_features = ['sys', 'dia', 'tl', 'hdl', 'ldl'] 96 | use_features = [t for t in train.columns if t != 'vid' and t not in predict_features] 97 | test_data = test.loc[:, use_features] 98 | 99 | submission = test.loc[:, ['vid', 'ldl']] 100 | base_line_score = np.zeros(5) 101 | start = time.time() 102 | for i, j in enumerate(predict_features): 103 | if j in ['ldl']: 104 | base_line_score[i] = gbdt_model(train, j, use_features, test_data, submission) 105 | print(dict(zip(predict_features, base_line_score))) 106 | print('CV训练用时{}秒'.format(time.time() - start)) 107 | print('scores:', np.mean(base_line_score)) 108 | sub_final = DataFrame(submission) 109 | sub_final.persist('ldl_jz_5_fold_6_6_submit_22') -------------------------------------------------------------------------------- /round2_rank10/every_predict_model/hdl_gbdt_best_rounds.py: -------------------------------------------------------------------------------- 1 | from odps import ODPS 2 | import pandas as pd 3 | from sklearn.ensemble import GradientBoostingRegressor 4 | from odps.df import DataFrame 5 | from sklearn.model_selection import KFold 6 | import time 7 | import numpy as np 8 | 9 | 10 | def eval_metric(pred, labels): 11 | return np.mean(np.power(np.log(pred + 1) - np.log(labels + 1), 2)) 12 | 13 | def gbdt_model(df, label, use_feature, true_test, submission_data): 14 | print("基于GBDT: 开始训练 label 为{}...".format(label)) 15 | value4preds = np.log(df[label]) 16 | train_data = df.loc[:, use_feature] 17 | print(train_data.shape) 18 | scores = np.zeros(len(value4preds)) 19 | submission_scores = np.zeros((len(submission_data), 5)) 20 | kf = KFold(n_splits=5, shuffle=True, random_state=1024) 21 | five_fold_index = list(kf.split(train_data, value4preds)) 22 | 23 | train_index_1, test_index_1 = five_fold_index[0] 24 | print('第1次训练...') 25 | x_train_1, x_test_1 = train_data.iloc[train_index_1], train_data.iloc[test_index_1] 26 | y_train_1, y_test_1 = value4preds.iloc[train_index_1], 
value4preds.iloc[test_index_1] 27 | gbdt_model_1 = GradientBoostingRegressor(learning_rate=0.01, n_estimators=1972, max_depth=5, subsample=0.7, 28 | random_state=1, verbose=0, min_samples_leaf=50) 29 | gbdt_model_1.fit(x_train_1, y_train_1) 30 | scores[test_index_1] = np.exp(gbdt_model_1.predict(x_test_1)) 31 | submission_scores[:, 0] = gbdt_model_1.predict(true_test) 32 | print('the score is: ', eval_metric(scores[test_index_1], np.exp(y_test_1))) 33 | print('第1次训练结束') 34 | print('*******************************************************************') 35 | train_index_2, test_index_2 = five_fold_index[1] 36 | print('第2次训练...') 37 | x_train_2, x_test_2 = train_data.iloc[train_index_2], train_data.iloc[test_index_2] 38 | y_train_2, y_test_2 = value4preds.iloc[train_index_2], value4preds.iloc[test_index_2] 39 | gbdt_model_2 = GradientBoostingRegressor(learning_rate=0.01, n_estimators=2000, max_depth=5, subsample=0.7, 40 | random_state=1, verbose=0, min_samples_leaf=50) 41 | gbdt_model_2.fit(x_train_2, y_train_2) 42 | scores[test_index_2] = np.exp(gbdt_model_2.predict(x_test_2)) 43 | submission_scores[:, 1] = gbdt_model_2.predict(true_test) 44 | print('the score is: ', eval_metric(scores[test_index_2], np.exp(y_test_2))) 45 | print('第2次训练结束') 46 | print('*******************************************************************') 47 | train_index_3, test_index_3 = five_fold_index[2] 48 | print('第3次训练...') 49 | x_train_3, x_test_3 = train_data.iloc[train_index_3], train_data.iloc[test_index_3] 50 | y_train_3, y_test_3 = value4preds.iloc[train_index_3], value4preds.iloc[test_index_3] 51 | gbdt_model_3 = GradientBoostingRegressor(learning_rate=0.01, n_estimators=2000, max_depth=5, subsample=0.7, 52 | random_state=1, verbose=0, min_samples_leaf=50) 53 | gbdt_model_3.fit(x_train_3, y_train_3) 54 | scores[test_index_3] = np.exp(gbdt_model_3.predict(x_test_3)) 55 | submission_scores[:, 2] = gbdt_model_3.predict(true_test) 56 | print('the score is: ', eval_metric(scores[test_index_3], np.exp(y_test_3))) 57 | print('第3次训练结束') 58 | print('*******************************************************************') 59 | train_index_4, test_index_4 = five_fold_index[3] 60 | print('第4次训练...') 61 | x_train_4, x_test_4 = train_data.iloc[train_index_4], train_data.iloc[test_index_4] 62 | y_train_4, y_test_4 = value4preds.iloc[train_index_4], value4preds.iloc[test_index_4] 63 | gbdt_model_4 = GradientBoostingRegressor(learning_rate=0.01, n_estimators=1492, max_depth=5, subsample=0.7, 64 | random_state=1, verbose=0, min_samples_leaf=50) 65 | gbdt_model_4.fit(x_train_4, y_train_4) 66 | scores[test_index_4] = np.exp(gbdt_model_4.predict(x_test_4)) 67 | submission_scores[:, 3] = gbdt_model_4.predict(true_test) 68 | print('the score is: ', eval_metric(scores[test_index_4], np.exp(y_test_4))) 69 | print('第4次训练结束') 70 | print('*******************************************************************') 71 | train_index_5, test_index_5 = five_fold_index[4] 72 | print('第5次训练...') 73 | x_train_5, x_test_5 = train_data.iloc[train_index_5], train_data.iloc[test_index_5] 74 | y_train_5, y_test_5 = value4preds.iloc[train_index_5], value4preds.iloc[test_index_5] 75 | gbdt_model_5 = GradientBoostingRegressor(learning_rate=0.01, n_estimators=1954, max_depth=5, subsample=0.7, 76 | random_state=1, verbose=0, min_samples_leaf=50) 77 | gbdt_model_5.fit(x_train_5, y_train_5) 78 | scores[test_index_5] = np.exp(gbdt_model_5.predict(x_test_5)) 79 | submission_scores[:, 4] = gbdt_model_5.predict(true_test) 80 | print('the score is: ', 
eval_metric(scores[test_index_5], np.exp(y_test_5))) 81 | print('第5次训练结束') 82 | print('*******************************************************************') 83 | submission_data[label] = np.exp(np.mean(submission_scores, axis=1)).round(3) 84 | return eval_metric(scores, np.exp(value4preds)) 85 | 86 | 87 | # b-board 88 | # 1972 2000 2000 1492 1954 89 | # 'hdl': 0.010681349752220548 90 | if __name__ == "__main__": 91 | train = odps.get_table('hdl_juz_train_6_6_snp_onehot_22').to_df().to_pandas() 92 | test = odps.get_table('hdl_juz_test_6_6_snp_onehot_22').to_df().to_pandas() 93 | print(train.shape) 94 | print(test.shape) 95 | predict_features = ['sys', 'dia', 'tl', 'hdl', 'ldl'] 96 | use_features = [t for t in train.columns if t != 'vid' and t not in predict_features] 97 | test_data = test.loc[:, use_features] 98 | 99 | submission = test.loc[:, ['vid', 'hdl']] 100 | base_line_score = np.zeros(5) 101 | start = time.time() 102 | for i, j in enumerate(predict_features): 103 | if j in ['hdl']: 104 | base_line_score[i] = gbdt_model(train, j, use_features, test_data, submission) 105 | print(dict(zip(predict_features, base_line_score))) 106 | print('CV训练用时{}秒'.format(time.time() - start)) 107 | print('scores:', np.mean(base_line_score)) 108 | sub_final = DataFrame(submission) 109 | sub_final.persist('hdl_jz_5_fold_6_6_submit_22') -------------------------------------------------------------------------------- /round2_rank10/every_predict_model/tl_gbdt_best_rounds.py: -------------------------------------------------------------------------------- 1 | from odps import ODPS 2 | import pandas as pd 3 | from sklearn.ensemble import GradientBoostingRegressor 4 | from odps.df import DataFrame 5 | from sklearn.model_selection import KFold 6 | import time 7 | import numpy as np 8 | 9 | 10 | def eval_metric(pred, labels): 11 | return np.mean(np.power(np.log(pred + 1) - np.log(labels + 1), 2)) 12 | 13 | def gbdt_model(df, label, use_feature, true_test, submission_data): 14 | print("基于GBDT: 开始训练 label 为{}...".format(label)) 15 | value4preds = np.log(df[label]) 16 | train_data = df.loc[:, use_feature] 17 | print(train_data.shape) 18 | scores = np.zeros(len(value4preds)) 19 | submission_scores = np.zeros((len(submission_data), 5)) 20 | kf = KFold(n_splits=5, shuffle=True, random_state=1024) 21 | five_fold_index = list(kf.split(train_data, value4preds)) 22 | 23 | train_index_1, test_index_1 = five_fold_index[0] 24 | print('第1次训练...') 25 | x_train_1, x_test_1 = train_data.iloc[train_index_1], train_data.iloc[test_index_1] 26 | y_train_1, y_test_1 = value4preds.iloc[train_index_1], value4preds.iloc[test_index_1] 27 | gbdt_model_1 = GradientBoostingRegressor(learning_rate=0.01, n_estimators=1777, max_depth=5, subsample=0.7, 28 | random_state=1, verbose=0, min_samples_leaf=50) 29 | gbdt_model_1.fit(x_train_1, y_train_1) 30 | scores[test_index_1] = np.exp(gbdt_model_1.predict(x_test_1)) 31 | submission_scores[:, 0] = gbdt_model_1.predict(true_test) 32 | print('the score is: ', eval_metric(scores[test_index_1], np.exp(y_test_1))) 33 | print('第1次训练结束') 34 | del train_index_1 35 | del test_index_1 36 | del gbdt_model_1 37 | print('*******************************************************************') 38 | train_index_2, test_index_2 = five_fold_index[1] 39 | print('第2次训练...') 40 | x_train_2, x_test_2 = train_data.iloc[train_index_2], train_data.iloc[test_index_2] 41 | y_train_2, y_test_2 = value4preds.iloc[train_index_2], value4preds.iloc[test_index_2] 42 | gbdt_model_2 = GradientBoostingRegressor(learning_rate=0.01, 
n_estimators=1795, max_depth=5, subsample=0.7, 43 | random_state=1, verbose=0, min_samples_leaf=50) 44 | gbdt_model_2.fit(x_train_2, y_train_2) 45 | scores[test_index_2] = np.exp(gbdt_model_2.predict(x_test_2)) 46 | submission_scores[:, 1] = gbdt_model_2.predict(true_test) 47 | print('the score is: ', eval_metric(scores[test_index_2], np.exp(y_test_2))) 48 | print('第2次训练结束') 49 | del train_index_2 50 | del test_index_2 51 | del gbdt_model_2 52 | print('*******************************************************************') 53 | train_index_3, test_index_3 = five_fold_index[2] 54 | print('第3次训练...') 55 | x_train_3, x_test_3 = train_data.iloc[train_index_3], train_data.iloc[test_index_3] 56 | y_train_3, y_test_3 = value4preds.iloc[train_index_3], value4preds.iloc[test_index_3] 57 | gbdt_model_3 = GradientBoostingRegressor(learning_rate=0.01, n_estimators=1800, max_depth=5, subsample=0.7, 58 | random_state=1, verbose=0, min_samples_leaf=50) 59 | gbdt_model_3.fit(x_train_3, y_train_3) 60 | scores[test_index_3] = np.exp(gbdt_model_3.predict(x_test_3)) 61 | submission_scores[:, 2] = gbdt_model_3.predict(true_test) 62 | print('the score is: ', eval_metric(scores[test_index_3], np.exp(y_test_3))) 63 | print('第3次训练结束') 64 | del train_index_3 65 | del test_index_3 66 | del gbdt_model_3 67 | print('*******************************************************************') 68 | train_index_4, test_index_4 = five_fold_index[3] 69 | print('第4次训练...') 70 | x_train_4, x_test_4 = train_data.iloc[train_index_4], train_data.iloc[test_index_4] 71 | y_train_4, y_test_4 = value4preds.iloc[train_index_4], value4preds.iloc[test_index_4] 72 | gbdt_model_4 = GradientBoostingRegressor(learning_rate=0.01, n_estimators=1419, max_depth=5, subsample=0.7, 73 | random_state=1, verbose=0, min_samples_leaf=50) 74 | gbdt_model_4.fit(x_train_4, y_train_4) 75 | scores[test_index_4] = np.exp(gbdt_model_4.predict(x_test_4)) 76 | submission_scores[:, 3] = gbdt_model_4.predict(true_test) 77 | print('the score is: ', eval_metric(scores[test_index_4], np.exp(y_test_4))) 78 | print('第4次训练结束') 79 | del train_index_4 80 | del test_index_4 81 | del gbdt_model_4 82 | print('*******************************************************************') 83 | train_index_5, test_index_5 = five_fold_index[4] 84 | print('第5次训练...') 85 | x_train_5, x_test_5 = train_data.iloc[train_index_5], train_data.iloc[test_index_5] 86 | y_train_5, y_test_5 = value4preds.iloc[train_index_5], value4preds.iloc[test_index_5] 87 | gbdt_model_5 = GradientBoostingRegressor(learning_rate=0.01, n_estimators=1800, max_depth=5, subsample=0.7, 88 | random_state=1, verbose=0, min_samples_leaf=50) 89 | gbdt_model_5.fit(x_train_5, y_train_5) 90 | scores[test_index_5] = np.exp(gbdt_model_5.predict(x_test_5)) 91 | submission_scores[:, 4] = gbdt_model_5.predict(true_test) 92 | print('the score is: ', eval_metric(scores[test_index_5], np.exp(y_test_5))) 93 | print('第5次训练结束') 94 | del train_index_5 95 | del test_index_5 96 | del gbdt_model_5 97 | print('*******************************************************************') 98 | submission_data[label] = np.exp(np.mean(submission_scores, axis=1)).round(3) 99 | return eval_metric(scores, np.exp(value4preds)) 100 | 101 | 102 | # b-board 103 | # 1777 1795 1800 1419 1800 104 | # 'tl': 0.086486107163495141 105 | if __name__ == "__main__": 106 | train = odps.get_table('tl_juz_train_6_6_snp_onehot_22').to_df().to_pandas() 107 | test = odps.get_table('tl_juz_test_6_6_snp_onehot_22').to_df().to_pandas() 108 | print(train.shape) 109 | print(test.shape) 
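# The five fold blocks in gbdt_model above are identical except for
# n_estimators (the tuned per-fold rounds recorded in the "b-board" comment
# above). A loop-based equivalent, kept as a comment-only sketch and not the
# code that produced the submitted scores, would be:
#
#     best_rounds = [1777, 1795, 1800, 1419, 1800]
#     for fold, (tr_idx, te_idx) in enumerate(five_fold_index):
#         model = GradientBoostingRegressor(
#             learning_rate=0.01, n_estimators=best_rounds[fold], max_depth=5,
#             subsample=0.7, random_state=1, verbose=0, min_samples_leaf=50)
#         model.fit(train_data.iloc[tr_idx], value4preds.iloc[tr_idx])
#         scores[te_idx] = np.exp(model.predict(train_data.iloc[te_idx]))
#         submission_scores[:, fold] = model.predict(true_test)
#
# This script is also the only one of the five that del's its per-fold
# indices and models, presumably to keep peak memory down on the larger
# tl feature table.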
110 | predict_features = ['sys', 'dia', 'tl', 'hdl', 'ldl'] 111 | use_features = [t for t in train.columns if t != 'vid' and t not in predict_features] 112 | test_data = test.loc[:, use_features] 113 | 114 | submission = test.loc[:, ['vid', 'tl']] 115 | base_line_score = np.zeros(5) 116 | start = time.time() 117 | for i, j in enumerate(predict_features): 118 | if j in ['tl']: 119 | base_line_score[i] = gbdt_model(train, j, use_features, test_data, submission) 120 | print(dict(zip(predict_features, base_line_score))) 121 | print('CV训练用时{}秒'.format(time.time() - start)) 122 | print('scores:', np.mean(base_line_score)) 123 | sub_final = DataFrame(submission) 124 | sub_final.persist('tl_jz_5_fold_6_6_submit_22') -------------------------------------------------------------------------------- /round2_rank10/data_pre_process/get_num_features.py: -------------------------------------------------------------------------------- 1 | import time 2 | import re 3 | import pandas as pd 4 | from odps import ODPS 5 | from odps.df import DataFrame 6 | import numpy as np 7 | from collections import Iterable 8 | 9 | import sys 10 | reload(sys) 11 | sys.setdefaultencoding('utf8') 12 | 13 | def extract_num_norm(df): 14 | if isinstance(df, Iterable): 15 | temp = re.findall(r'\-*\d+(?:\.\d+)?', df) 16 | if temp: 17 | return np.mean([float(i.replace('--', '')) for i in temp]) 18 | else: 19 | return np.nan 20 | else: 21 | return np.nan 22 | 23 | def transform_0424(df): 24 | if isinstance(df, Iterable): 25 | temp = re.findall(r'\-*\d+(?:\.\d+)?', df) 26 | if temp: 27 | return np.mean([float(i.replace('--', '')) for i in temp]) 28 | else: 29 | if '常' in df: 30 | return 75 31 | elif '过速' in df: 32 | return 100 33 | elif '过缓' in df: 34 | return 50 35 | else: 36 | return np.nan 37 | else: 38 | return np.nan 39 | 40 | def transform_0425(df): 41 | if isinstance(df, Iterable): 42 | temp = re.findall(r'\-*\d+(?:\.\d+)?', df) 43 | if temp: 44 | return np.mean([float(i.replace('--', '')) for i in temp]) 45 | else: 46 | # median 47 | if '常' in df: 48 | return 17 49 | # min 50 | elif '粗糙' in df: 51 | return 14 52 | else: 53 | return np.nan 54 | else: 55 | return np.nan 56 | 57 | def transform_1308(df): 58 | if isinstance(df, Iterable): 59 | if '裸眼' in df: 60 | temp1 = re.findall(r'\-*\d+(?:\.\d+)?', df) 61 | if temp1: 62 | luo_yan = np.mean([float(i.replace('--', '')) for i in temp1]) 63 | if luo_yan >= 1: 64 | return 4 65 | else: 66 | return 3 67 | elif '矫正' in df: 68 | temp2 = re.findall(r'\-*\d+(?:\.\d+)?', df) 69 | if temp2: 70 | jiao_zheng = np.mean([float(i.replace('--', '')) for i in temp2]) 71 | if jiao_zheng >= 1: 72 | return 2 73 | else: 74 | return 1 75 | else: 76 | return np.nan 77 | else: 78 | return np.nan 79 | 80 | def transform_1321_1322(df): 81 | if isinstance(df, Iterable): 82 | temp = re.findall(r'\-*\d+(?:\.\d+)?', df) 83 | if temp: 84 | return np.mean([float(i.replace('--', '')) for i in temp]) 85 | else: 86 | if '失明' in df or '义眼' in df: 87 | return 0 88 | elif '指数' in df: 89 | return 0.003 90 | elif '手动' in df: 91 | return 0.002 92 | elif '光感' in df: 93 | return 0.001 94 | else: 95 | return np.nan 96 | else: 97 | return np.nan 98 | 99 | def calc_voice_area(df, desc): 100 | if isinstance(df, Iterable) and desc in df: 101 | temp = re.findall(r'\-*\d+(?:\.\d+)?', df) 102 | if temp: 103 | if 'cm' in df: 104 | if len(temp) == 2: 105 | return float(temp[0]) * float(temp[1]) * 100 106 | if len(temp) == 4: 107 | return (float(temp[0]) * float(temp[1]) + float(temp[2]) * float(temp[3]))*100.0/2 108 | if 'mm' in 
df: 109 | if len(temp) == 2: 110 | return float(temp[0]) * float(temp[1]) 111 | if len(temp) == 4: 112 | return (float(temp[0]) * float(temp[1]) + float(temp[2]) * float(temp[3]))*1.0/2 113 | else: 114 | return 0 115 | else: 116 | return np.nan 117 | 118 | # 眼压 119 | def transform_1319_1320(df): 120 | if isinstance(df, Iterable): 121 | temp = re.findall(r'\-*\d+(?:\.\d+)?', df) 122 | if temp: 123 | return float(temp[0]) 124 | else: 125 | if '正常' in df: 126 | return 15 127 | elif '偏高' in df: 128 | return 22 129 | else: 130 | return np.nan 131 | else: 132 | return np.nan 133 | 134 | def get_pure_num_features(data_frame, threshold): 135 | pure_num_list = ['vid'] 136 | for c in data_frame.columns: 137 | if c != 'vid': 138 | data_frame[c] = pd.to_numeric(data_frame[c], errors='ignore') 139 | if data_frame[c].dtypes != 'object' and (data_frame[c].isnull().sum() * 1.0 / data_frame.shape[0] <= threshold): 140 | #if np.abs(ex_num[c].skew()) <= pian_tai: 141 | pure_num_list.append(c) 142 | return data_frame.loc[:, pure_num_list] 143 | 144 | def split_data(data_series, desc): 145 | check_array = ['' for _ in range(data_series.shape[0])] 146 | for pos, j in enumerate(data_series): 147 | if isinstance(j, Iterable): 148 | tmp = set(j.split('$')) 149 | for t in tmp: 150 | if isinstance(t, Iterable) and desc in t: 151 | check_array[pos] = t 152 | return check_array 153 | 154 | def qian_lie_xian(df, pos): 155 | if isinstance(df, Iterable): 156 | temp = re.findall(r'\-*\d+(?:\.\d+)?', df) 157 | if temp: 158 | if 'cm' in df: 159 | if len(temp) >= 3: 160 | return float(temp[pos]) * 10 161 | if 'mm' in df: 162 | if len(temp) >= 3: 163 | return float(temp[pos]) 164 | else: 165 | return np.nan 166 | 167 | def dpm_check(df): 168 | if isinstance(df, Iterable): 169 | temp = re.findall(r'\-*\d+(?:\.\d+)?', df) 170 | if temp and 'dpm' in df: 171 | return float(temp[0]) 172 | else: 173 | return np.nan 174 | 175 | def ex_num_from_str(data_frame): 176 | word_096_norm = ['004997', '0107', '100008', '100013', '100014','10002', '10003', '1106', '1107', '1110','1112', '1115', 177 | '1117', '1345', '139', '141', '143', '1474', '155', '1814', '1815', '183', '1845', '1850','1873', '191', 178 | '192', '193', '20002', '2165', '2174', '2371', '2376', '2390', '2403', '2404', '2405', '2406','2420', 179 | '269011', '300001', '300021', '300035', '300051','300069', '300070', '300073', '300074', '300076', '300078', 180 | '300093', '300113', '300119', '300125', '300129', '314','3193', '321', '3804', '3807', '669003', '809021', 181 | '979001', '979002', '979003', 'a701', 'a703'] 182 | shili = ['1308','1319','1320','1321','1322'] 183 | heart = ['0424','0425', 'vid'] 184 | for w in word_096_norm: 185 | data_frame[w] = data_frame[w].apply(extract_num_norm) 186 | data_frame['0424'] = data_frame['0424'].apply(transform_0424) 187 | data_frame['0425'] = data_frame['0425'].apply(transform_0425) 188 | data_frame['1308'] = data_frame['1308'].apply(transform_1308) 189 | data_frame['1319'] = data_frame['1319'].apply(transform_1319_1320) 190 | data_frame['1320'] = data_frame['1320'].apply(transform_1319_1320) 191 | data_frame['1321'] = data_frame['1321'].apply(transform_1321_1322) 192 | data_frame['1322'] = data_frame['1322'].apply(transform_1321_1322) 193 | data_frame['left_shen_no_voice'] = data_frame['left_shen'].apply(calc_voice_area, args=('无回声',)) 194 | #data_frame['left_shen_strong_voice'] = data_frame['left_shen'].apply(calc_voice_area, args=('强回声',)) 195 | data_frame['right_shen_no_voice'] = data_frame['right_shen'].apply(calc_voice_area, 
args=('无回声',)) 196 | data_frame['right_shen_strong_voice'] = data_frame['right_shen'].apply(calc_voice_area, args=('强回声',)) 197 | data_frame['jzx_no_voice_area'] = data_frame['jia_zx'].apply(calc_voice_area, args=('无回声区',)) 198 | data_frame['jzx_no_voice_jiejie'] = data_frame['jia_zx'].apply(calc_voice_area, args=('无回声结节',)) 199 | data_frame['jzx_low_voice_area'] = data_frame['jia_zx'].apply(calc_voice_area, args=('低回声区',)) 200 | data_frame['jzx_low_voice_jiejie'] = data_frame['jia_zx'].apply(calc_voice_area, args=('低回声结节',)) 201 | data_frame['liver_no_voice'] = data_frame['0113'].apply(calc_voice_area, args=('无回声',)) 202 | data_frame['liver_strong_voice'] = data_frame['0113'].apply(calc_voice_area, args=('强回声',)) 203 | data_frame['dan_strong_voice'] = data_frame['0114'].apply(calc_voice_area, args=('强回声',)) 204 | data_frame['qian_lie_xian_1'] = data_frame['0120'].apply(qian_lie_xian, args=(0,)) 205 | data_frame['qian_lie_xian_2'] = data_frame['0120'].apply(qian_lie_xian, args=(1,)) 206 | data_frame['qian_lie_xian_3'] = data_frame['0120'].apply(qian_lie_xian, args=(2,)) 207 | data_frame['dpm_from_3301'] = data_frame['3301'].apply(dpm_check) 208 | huishen = ['left_shen_no_voice','right_shen_no_voice','right_shen_strong_voice','jzx_no_voice_area','qian_lie_xian_2','qian_lie_xian_3','dpm_from_3301', 209 | 'jzx_no_voice_jiejie','jzx_low_voice_area','jzx_low_voice_jiejie','liver_no_voice','liver_strong_voice','dan_strong_voice','qian_lie_xian_1'] 210 | total = word_096_norm + shili + heart + huishen 211 | num_ex_str = data_frame.loc[:, total] 212 | return num_ex_str 213 | 214 | if __name__ == "__main__": 215 | part_1_2 = odps.get_table('origin_data_combine_part1_part2').to_df().to_pandas() 216 | part_1_2['jia_zx'] = split_data(part_1_2['0101'], '甲状腺') 217 | part_1_2['left_shen'] = split_data(part_1_2['0117'], '左肾') 218 | part_1_2['right_shen'] = split_data(part_1_2['0118'], '右肾') 219 | part_1_2_copy = part_1_2.copy(deep=True) 220 | ex_num_data = ex_num_from_str(part_1_2) 221 | print('the shape of the num_data get from word: ', ex_num_data.shape) 222 | pure_num_data = get_pure_num_features(part_1_2_copy, 0.96) 223 | pure_columns = [p for p in pure_num_data.columns if p != 'vid'] 224 | ex_num_columns = [i for i in ex_num_data.columns if i not in ['vid', '314','1308','1319','1320','1321','1322','0424','0425']] 225 | print('the shape of origin num data: ', pure_num_data.shape) 226 | numeric_data = pd.merge(pure_num_data, ex_num_data, on='vid', how='inner') 227 | exm_drop = [] 228 | for w in pure_columns + ex_num_columns: 229 | if np.abs(numeric_data[w].skew()) > 12: 230 | exm_drop.append(w) 231 | print(exm_drop) 232 | numeric_data.drop(exm_drop, axis=1, inplace=True) 233 | print('total data shape: ', numeric_data.shape) 234 | juz_num_data = DataFrame(numeric_data) 235 | juz_num_data.persist('juz_num_data_5_31') -------------------------------------------------------------------------------- /round2_rank10/data_pre_process/get_word_features.py: -------------------------------------------------------------------------------- 1 | import time 2 | import re 3 | import pandas as pd 4 | from odps import ODPS 5 | from odps.df import DataFrame 6 | import numpy as np 7 | from collections import Iterable 8 | 9 | import sys 10 | reload(sys) 11 | sys.setdefaultencoding('utf8') 12 | 13 | def transform_2302(df): 14 | try: 15 | if '健康' in df: 16 | if '亚健康' in df: 17 | return 1 18 | else: 19 | return 0 20 | elif '疾病' in df: 21 | return 2 22 | except Exception: 23 | return df 24 | 25 | 26 | def high_sugar(df): 27 | 
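# high_sugar and the sibling flags that follow (high_fat, high_pressure, ...)
# scan the concatenated free-text exam fields: 1 if a keyword matches,
# 0 if text is present without a match, NaN when the field is empty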
if df: 28 | if '血糖偏高' in df or '降糖' in df or '血糖' in df: 29 | return 1 30 | else: 31 | return 0 32 | else: 33 | return np.nan 34 | 35 | 36 | def high_fat(df): 37 | if df: 38 | if '血脂偏高' in df or '低脂' in df or '血脂' in df: 39 | return 1 40 | else: 41 | return 0 42 | else: 43 | return np.nan 44 | 45 | 46 | def high_pressure(df): 47 | if df: 48 | if '血压偏高' in df or '降压' in df or '血压' in df: 49 | return 1 50 | else: 51 | return 0 52 | else: 53 | return np.nan 54 | 55 | 56 | def higher_pressure(df): 57 | if df: 58 | if '血压偏高' not in df: 59 | if '高血压' in df: 60 | return 1 61 | else: 62 | return 0 63 | else: 64 | return np.nan 65 | 66 | 67 | def higher_fat(df): 68 | if df: 69 | if '血脂偏高' not in df: 70 | if '高血脂' in df: 71 | return 1 72 | else: 73 | return 0 74 | else: 75 | return np.nan 76 | 77 | 78 | def higher_sugar(df): 79 | if df: 80 | if '血糖偏高' not in df: 81 | if '高血糖' in df or '糖尿病' in df: 82 | return 1 83 | else: 84 | return 0 85 | else: 86 | return np.nan 87 | 88 | def coronary_heart_disease(df): 89 | if df: 90 | if '冠心病' in df or '冠状' in df: 91 | return 1 92 | else: 93 | return 0 94 | else: 95 | return np.nan 96 | 97 | 98 | def kidney(df): 99 | if df: 100 | if '肾' in df: 101 | return 1 102 | else: 103 | return 0 104 | else: 105 | return np.nan 106 | 107 | 108 | def smoke(df): 109 | if df: 110 | if '烟' in df: 111 | return 1 112 | else: 113 | return 0 114 | else: 115 | return np.nan 116 | 117 | def blood_pipe_style(df): 118 | try: 119 | if '良好' in df or '正常' in df: 120 | return 0 121 | elif '趋势' in df: 122 | return 1 123 | elif '轻度' in df: 124 | return 2 125 | elif '中度' in df: 126 | return 3 127 | elif '重度' in df: 128 | return 4 129 | elif '硬化' in df: 130 | return 5 131 | else: 132 | return np.nan 133 | except Exception: 134 | return df 135 | 136 | def ying_yang(df): 137 | try: 138 | if '+' in df and '-' in df: 139 | return 1 140 | elif '+' in df and '-' not in df: 141 | return 2 142 | elif ('-' in df or '阴' in df or '正常' in df or 'Normal' in df) and '+' not in df: 143 | return 0 144 | else: 145 | return 0 146 | except Exception: 147 | return df 148 | 149 | def HP_yy(df): 150 | try: 151 | if '阳' in df: 152 | return 1 153 | else: 154 | return 0 155 | except Exception: 156 | return df 157 | 158 | # 尿 159 | def urine(df): 160 | try: 161 | if '>=' in df: 162 | return 1 163 | else: 164 | return 0 165 | except Exception: 166 | return df 167 | 168 | def heart_rate(df): 169 | try: 170 | if df != '强弱不等': 171 | if '弱' in df or '远' in df or '低' in df: 172 | return 1 173 | elif '强' in df or '力' in df: 174 | return 3 175 | else: 176 | return 0 177 | else: 178 | return 2 179 | except Exception: 180 | return df 181 | 182 | def transform_421(df): 183 | try: 184 | if '齐' in df and '不' not in df: 185 | return 0 186 | else: 187 | return 1 188 | except Exception: 189 | return df 190 | 191 | def transform_403(df): 192 | try: 193 | if '大' in df and '无' not in df: 194 | return 1 195 | else: 196 | return 0 197 | except Exception: 198 | return df 199 | 200 | def transform_3399(df): 201 | try: 202 | if df == '黄色' or df == 'yellow': 203 | return 2 204 | elif df == '淡黄色' or df == '浅黄色': 205 | return 1 206 | elif df == '无色': 207 | return 0 208 | elif '红' in df: 209 | return 3 210 | elif df == '混浊': 211 | return 4 212 | else: 213 | return 5 214 | except Exception: 215 | return df 216 | 217 | def lung_voice(df): 218 | try: 219 | if '干啰' in df: 220 | return 1 221 | elif '湿啰' in df: 222 | return 2 223 | elif '哮鸣' in df: 224 | return 3 225 | elif '湿鸣' in df: 226 | return 4 227 | else: 228 | return 0 229 | except Exception: 
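# non-string input (e.g. NaN) falls through and is returned unchanged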
230 | return df 231 | 232 | def get_num_from_102_front(df): 233 | try: 234 | temp_x = re.findall('(\d+)/(\d+)', df) 235 | if temp_x: 236 | return float(temp_x[0][0]) 237 | except Exception: 238 | return np.nan 239 | 240 | 241 | def get_num_from_102_back(df): 242 | try: 243 | temp_x = re.findall('(\d+)/(\d+)', df) 244 | if temp_x: 245 | return float(temp_x[0][1]) 246 | except Exception: 247 | return np.nan 248 | 249 | def dannan_xirou(df): 250 | if df: 251 | if '胆囊息肉' in df: 252 | return 1 253 | else: 254 | return 0 255 | else: 256 | return np.nan 257 | 258 | 259 | def dannan_jieshi(df): 260 | if df: 261 | if '胆囊结石' in df: 262 | return 1 263 | else: 264 | return 0 265 | else: 266 | return np.nan 267 | 268 | 269 | def shen_jieshi(df): 270 | if df: 271 | if '肾结石' in df: 272 | return 1 273 | else: 274 | return 0 275 | else: 276 | return np.nan 277 | 278 | 279 | def shen_nangzhong(df): 280 | if df: 281 | if '肾囊肿' in df: 282 | return 1 283 | else: 284 | return 0 285 | else: 286 | return np.nan 287 | 288 | 289 | def gan_nangzhong(df): 290 | if df: 291 | if '肝囊肿' in df: 292 | return 1 293 | else: 294 | return 0 295 | else: 296 | return np.nan 297 | 298 | def map_deal_0113(temp): 299 | try: 300 | if isnan(float(temp)): 301 | return -1 302 | else: 303 | return float(temp) 304 | except Exception: 305 | temp = str(temp) 306 | value = 0 307 | if "弥漫性" in temp: 308 | value = 5 309 | if "欠清晰" in temp: 310 | value += 2 311 | if "粗" in temp: 312 | value += 0.5 313 | if "多发" in temp: 314 | value += 0.5 315 | if "斑点状" in temp: 316 | value += 1 317 | if "回声区" in temp: 318 | value += 1 319 | return value 320 | 321 | def gan_ying_hua(df): 322 | if df: 323 | if '肝脏' in df: 324 | return 1 325 | else: 326 | return 0 327 | else: 328 | return np.nan 329 | 330 | def strQ2B(ustring): 331 | """全角转半角""" 332 | ustring = str(ustring) 333 | rstring = "" 334 | for uchar in ustring: 335 | inside_code=ord(uchar) 336 | if inside_code == 12288: 337 | inside_code = 32 338 | elif (inside_code >= 65281 and inside_code <= 65374): 339 | inside_code -= 65248 340 | 341 | rstring += chr(inside_code) 342 | return rstring 343 | 344 | def extract_num_norm(df): 345 | if isinstance(df, Iterable): 346 | temp = re.findall(r'\-*\d+(?:\.\d+)?', df) 347 | if temp: 348 | return np.mean([float(i.replace('--', '')) for i in temp]) 349 | else: 350 | return np.nan 351 | else: 352 | return np.nan 353 | 354 | 355 | def is_sex(x): 356 | x = str(x) 357 | if ('阴道' in x)|('子宫' in x)|('妇' in x)|('乳' in x)|('孕' in x)|('卵巢' in x)|('女' in x)|('宫颈' in x)|('妊娠' in x)|('剖腹产' in x): 358 | return 1 359 | elif ('前列腺' in x)|('包皮' in x)|('包茎' in x)|('男' in x)|('阴茎' in x)|('睾丸' in x): 360 | return 2 361 | else: 362 | return 0 363 | 364 | def word2num(data_frame): 365 | one_hot_list = ['0101', '0102', '0113', '0409', '0413', '0434', '0439', 'a201', 'a202', '4001', '0705', 'a301', '0709', 366 | '0985', 'a705'] 367 | data_frame.loc[:, one_hot_list] = data_frame.loc[:, one_hot_list].fillna('') 368 | frame_409_434 = data_frame['0409'] + data_frame['0434'] + data_frame['0413'] + data_frame['4001'] + \ 369 | data_frame['a201'] + data_frame['a301'] + data_frame['a202'] + data_frame['0705'] + \ 370 | data_frame['0709'] + data_frame['0985'] + data_frame['0439'] 371 | data_frame['xue_ya_pian_gao'] = frame_409_434.apply(high_pressure) 372 | data_frame['gan_by_ts'] = data_frame['0113'].apply(map_deal_0113) 373 | data_frame['xue_zhi_pian_gao'] = frame_409_434.apply(high_fat) 374 | data_frame['xue_tang_pian_gao'] = frame_409_434.apply(high_sugar) 375 | data_frame['high_sugar'] = 
frame_409_434.apply(higher_sugar) 376 | data_frame['guan_xin_bin'] = frame_409_434.apply(coronary_heart_disease) 377 | data_frame['shen'] = frame_409_434.apply(kidney) 378 | data_frame['smoke'] = frame_409_434.apply(smoke) 379 | fat_liver_num = data_frame['0101'] + data_frame['0102'] + data_frame['0113'] + data_frame['a202'] 380 | data_frame['dannan_jieshi'] = fat_liver_num.apply(dannan_jieshi) 381 | data_frame['dannan_xirou'] = fat_liver_num.apply(dannan_xirou) 382 | data_frame['shen_jieshi'] = fat_liver_num.apply(shen_jieshi) 383 | data_frame['shen_nanz'] = fat_liver_num.apply(shen_nangzhong) 384 | data_frame['gan_nanz'] = fat_liver_num.apply(gan_nangzhong) 385 | data_frame['gan_ying_hua'] = data_frame['a705'].apply(gan_ying_hua) 386 | yy_list = ['3190', '3191', '3192', '3194', '3195', '3197', '3430', '100010'] 387 | for y in yy_list: 388 | data_frame[y] = data_frame[y].apply(ying_yang) 389 | data_frame['niao'] = data_frame['3193'].apply(urine) 390 | data_frame['heart_rate'] = data_frame['0420'].apply(heart_rate) 391 | data_frame['3399_w'] = data_frame['3399'].apply(transform_3399) 392 | data_frame['3301_w'] = data_frame['3301'].apply(HP_yy) 393 | data_frame['0403_w'] = data_frame['0403'].apply(transform_403) 394 | data_frame['0421_w'] = data_frame['0421'].apply(transform_421) 395 | data_frame['0405_w'] = data_frame['0405'].apply(lung_voice) 396 | data_frame['blood_pipe_style'] = data_frame['4001'].apply(blood_pipe_style) 397 | data_frame['health'] = data_frame['2302'].apply(transform_2302) 398 | data_frame['pres_front'] = data_frame['0102'].apply(get_num_from_102_front) 399 | data_frame['pres_back'] = data_frame['0102'].apply(get_num_from_102_back) 400 | data_frame['heart_times'] = data_frame['1001'].apply(extract_num_norm) 401 | 402 | data_frame['all_result'] = '_' 403 | for p in data_frame.columns: 404 | if p != 'vid': 405 | data_frame['all_result'] = data_frame['all_result'] + '_' + data_frame[p].astype('str') 406 | 407 | data_frame['gender'] = data_frame['all_result'].apply(is_sex) 408 | del data_frame['all_result'] 409 | 410 | new_add = ['xue_ya_pian_gao', 'xue_zhi_pian_gao', 'xue_tang_pian_gao', 'high_sugar', 'guan_xin_bin', 'shen', 'smoke','niao', 'heart_rate', '3399_w', 411 | '3301_w', '0403_w', '0421_w', '0405_w', 'gender','blood_pipe_style', 'health','pres_front', 'pres_back','heart_times', 'vid', 'dannan_jieshi', 412 | 'dannan_xirou', 'shen_jieshi', 'shen_nanz', 'gan_nanz','gan_ying_hua'] 413 | yy_list.extend(new_add) 414 | return data_frame.loc[:, yy_list] 415 | 416 | 417 | if __name__ == "__main__": 418 | part_1_2 = odps.get_table('origin_data_combine_part1_part2').to_df().to_pandas() 419 | word_data = word2num(part_1_2) 420 | print('the shape of word_data: ',word_data.shape) 421 | juz_word_data = DataFrame(word_data) 422 | juz_word_data.persist('juz_word_data_5_30') -------------------------------------------------------------------------------- /round1_rank2/team/team_feature_work.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2018/5/8 0008 下午 16:40 4 | # @Author : Juzphy 5 | 6 | import time 7 | import pandas as pd 8 | from math import isnan 9 | start_time=time.time() 10 | 11 | def filter_None(data): 12 | data=data[data['field_results']!=''] 13 | data=data[data['field_results']!='未查'] 14 | return data 15 | 16 | # 重复数据的拼接操作 17 | def merge_table(df): 18 | df['field_results'] = df['field_results'].astype(str) 19 | if df.shape[0] > 1: 20 | merge_df = " 
".join(list(df['field_results'])) 21 | else: 22 | merge_df = df['field_results'].values[0] 23 | return merge_df 24 | 25 | # 删除掉一些出现次数低,缺失比例大的字段,保留超过阈值的特征 26 | def remain_feat(df,thresh=0.9): 27 | exclude_feats = [] 28 | print('----------移除数据缺失多的字段-----------') 29 | print('移除之前总的字段数量',len(df.columns)) 30 | num_rows = df.shape[0] 31 | for c in df.columns: 32 | num_missing = df[c].isnull().sum() 33 | if num_missing == 0: 34 | continue 35 | missing_percent = num_missing / float(num_rows) 36 | if missing_percent > thresh: 37 | exclude_feats.append(c) 38 | print("移除缺失数据的字段数量: %s" % len(exclude_feats)) 39 | # 保留超过阈值的特征 40 | feats = [] 41 | for c in df.columns: 42 | if c not in exclude_feats: 43 | feats.append(c) 44 | print('剩余的字段数量',len(feats)) 45 | return feats 46 | 47 | def map_deal_3601(temp): 48 | try: 49 | if isnan(float(temp)): 50 | return -1 51 | else: 52 | return float(temp) 53 | except Exception: 54 | temp = str(temp) 55 | if "严重" in temp: 56 | return 4 57 | elif "中度" in temp: 58 | return 3 59 | elif "减少" in temp or "降低" in temp or "疏松" in temp: 60 | return 2 61 | else: 62 | return 1 63 | 64 | def map_deal_0102(temp): 65 | try: 66 | if isnan(float(temp)): 67 | return -1 68 | else: 69 | return float(temp) 70 | except Exception: 71 | temp = str(temp) 72 | value = 0 73 | if "脂肪" in temp: 74 | if "重" in temp: 75 | value = 4 76 | elif "中" in temp: 77 | value = 3 78 | elif "轻" in temp: 79 | value = 2 80 | else: 81 | value = 1 82 | else: 83 | value = 0.0 84 | if "多发" in temp: 85 | value += 0.5 86 | return value 87 | 88 | def map_deal_0113(temp): 89 | try: 90 | if isnan(float(temp)): 91 | return -1 92 | else: 93 | return float(temp) 94 | except Exception: 95 | temp = str(temp) 96 | value = 0 97 | if "弥漫性" in temp: 98 | value = 5 99 | if "欠清晰" in temp: 100 | value += 2 101 | if "粗" in temp: 102 | value += 0.5 103 | if "多发" in temp: 104 | value += 0.5 105 | if "斑点状" in temp: 106 | value += 1 107 | if "回声区" in temp: 108 | value += 1 109 | return value 110 | 111 | def map_deal_0114(temp): 112 | try: 113 | if isnan(float(temp)): 114 | return -1 115 | else: 116 | return float(temp) 117 | except Exception: 118 | temp = str(temp) 119 | value = 0 120 | if "毛糙" in temp: 121 | value = 4 122 | if "强回声" in temp: 123 | value += 1 124 | return value 125 | 126 | def map_deal_0115(temp): 127 | try: 128 | if isnan(float(temp)): 129 | return -1 130 | else: 131 | return float(temp) 132 | except Exception: 133 | temp = str(temp) 134 | value = 0 135 | if "不清晰" in temp: 136 | value = 4 137 | if "增强" in temp: 138 | value += 1 139 | return value 140 | 141 | def map_deal_0115(temp): 142 | try: 143 | if isnan(float(temp)): 144 | return -1 145 | else: 146 | return float(temp) 147 | except Exception: 148 | temp = str(temp) 149 | value = 0 150 | if "不清晰" in temp: 151 | value = 4 152 | if "增强" in temp: 153 | value += 1 154 | return value 155 | 156 | def map_deal_0116(temp): 157 | try: 158 | if isnan(float(temp)): 159 | return -1 160 | else: 161 | return float(temp) 162 | except Exception: 163 | temp = str(temp) 164 | value = 0 165 | if "不清晰" in temp: 166 | value = 4 167 | if "增强" in temp: 168 | value += 1 169 | return value 170 | 171 | def map_deal_0117(temp): 172 | try: 173 | if isnan(float(temp)): 174 | return -1 175 | else: 176 | return float(temp) 177 | except Exception: 178 | temp = str(temp) 179 | value = 0 180 | if "强回声" in temp: 181 | value = 4 182 | if "无回声" in temp: 183 | value += 1 184 | if "欠均匀" in temp: 185 | value += 1 186 | return value 187 | 188 | def map_deal_0118(temp): 189 | try: 190 | if isnan(float(temp)): 191 | 
return -1 192 | else: 193 | return float(temp) 194 | except Exception: 195 | temp = str(temp) 196 | value = 0 197 | if "强回声" in temp: 198 | value = 4 199 | if "无回声" in temp: 200 | value += 1 201 | if "欠均匀" in temp: 202 | value += 1 203 | return value 204 | 205 | def map_deal_0118(temp): 206 | try: 207 | if isnan(float(temp)): 208 | return -1 209 | else: 210 | return float(temp) 211 | except Exception: 212 | temp = str(temp) 213 | value = 0 214 | if "强回声" in temp: 215 | value = 4 216 | if "无回声" in temp: 217 | value += 1 218 | if "欠均匀" in temp: 219 | value += 1 220 | return value 221 | 222 | def map_deal_0503(temp): 223 | try: 224 | if isnan(float(temp)): 225 | return -1 226 | else: 227 | return float(temp) 228 | except Exception: 229 | temp = str(temp) 230 | value = 0 231 | if "分泌物多" in temp: 232 | value = 8 233 | if "分泌物中" in temp: 234 | value = 5 235 | if "分泌物少" in temp: 236 | value = 3 237 | if "浓性" in temp: 238 | value += 1 239 | if "充血" in temp: 240 | value += 1 241 | if "黄色" in temp: 242 | value += 0.5 243 | return value 244 | 245 | def map_deal_0509(temp): 246 | try: 247 | if isnan(float(temp)): 248 | return -1 249 | else: 250 | return float(temp) 251 | except Exception: 252 | temp = str(temp) 253 | value = 0 254 | if "充血" in temp: 255 | value = 8 256 | if "肥大" in temp: 257 | value = 5 258 | if "轻糜" in temp: 259 | value += 1 260 | if "中糜" in temp: 261 | value += 1.5 262 | if "囊" in temp: 263 | value += 0.5 264 | return value 265 | 266 | def map_deal_0516(temp): 267 | try: 268 | if isnan(float(temp)): 269 | return -1 270 | else: 271 | return float(temp) 272 | except Exception: 273 | temp = str(temp) 274 | value = 0 275 | if "前位" in temp: 276 | value = 8 277 | if "后位" in temp: 278 | value = 5 279 | if "平位" in temp: 280 | value = 3 281 | if "增大" in temp: 282 | value += 1 283 | if "硬" in temp: 284 | value += 0.5 285 | return value 286 | 287 | def map_deal_0539(temp): 288 | try: 289 | if isnan(float(temp)): 290 | return -1 291 | else: 292 | return float(temp) 293 | except Exception: 294 | temp = str(temp) 295 | value = 0 296 | if "分泌物" in temp: 297 | value += 1 298 | if "肥大" in temp: 299 | value += 2 300 | if "充血" in temp: 301 | value += 3 302 | if "炎" in temp: 303 | value += 0.5 304 | return value 305 | 306 | def map_deal_2302(temp): 307 | try: 308 | if isnan(float(temp)): 309 | return -1 310 | else: 311 | return float(temp) 312 | except Exception: 313 | temp = str(temp) 314 | value = 0 315 | if "亚健康" in temp: 316 | value = 3 317 | else: 318 | value = 1 319 | return value 320 | 321 | def map_deal_1316(temp): 322 | try: 323 | if isnan(float(temp)): 324 | return -1 325 | else: 326 | return float(temp) 327 | except Exception: 328 | temp = str(temp) 329 | value = 0 330 | if "正常" in temp or "未见" in temp: 331 | pass 332 | else: 333 | value += 2 334 | return value 335 | 336 | def map_deal_0101(temp): 337 | try: 338 | if isnan(float(temp)): 339 | return -1 340 | else: 341 | return float(temp) 342 | except Exception: 343 | temp = str(temp) 344 | value = 0 345 | if "低回声" in temp or "回声区" in temp: 346 | value += 1 347 | return value 348 | 349 | def map_deal_0119(temp): 350 | try: 351 | if isnan(float(temp)): 352 | return -1 353 | else: 354 | return float(temp) 355 | except Exception: 356 | temp = str(temp) 357 | value = 0 358 | if "欠佳" in temp: 359 | value = 2 360 | return value 361 | 362 | def map_deal_0121(temp): 363 | try: 364 | if isnan(float(temp)): 365 | return -1 366 | else: 367 | return float(temp) 368 | except Exception: 369 | temp = str(temp) 370 | value = 0 371 | if "低回声" in temp or 
"回声区" in temp: 372 | value += 1 373 | return value 374 | 375 | def map_deal_0122(temp): 376 | try: 377 | if isnan(float(temp)): 378 | return -1 379 | else: 380 | return float(temp) 381 | except Exception: 382 | temp = str(temp) 383 | value = 0 384 | if "回声团" in temp or "回声区" in temp: 385 | value += 1 386 | return value 387 | 388 | def map_deal_0123(temp): 389 | try: 390 | if isnan(float(temp)): 391 | return -1 392 | else: 393 | return float(temp) 394 | except Exception: 395 | temp = str(temp) 396 | value = 0 397 | if "回声团" in temp or "回声区" in temp: 398 | value += 1 399 | return value 400 | 401 | def map_deal_A705(temp): 402 | try: 403 | if isnan(float(temp)): 404 | return -1 405 | else: 406 | return float(temp) 407 | except Exception: 408 | temp = str(temp) 409 | value = 0 410 | if "衰减" in temp: 411 | value += 5 412 | return value 413 | 414 | def map_deal_0911(temp): 415 | try: 416 | if isnan(float(temp)): 417 | return -1 418 | else: 419 | return float(temp) 420 | except Exception: 421 | temp = str(temp) 422 | value = 0 423 | if "肿大" in temp: 424 | value += 2 425 | return value 426 | 427 | def map_deal_0912(temp): 428 | try: 429 | if isnan(float(temp)): 430 | return -1 431 | else: 432 | return float(temp) 433 | except Exception: 434 | temp = str(temp) 435 | value = 0 436 | if "无肿大" in temp or "未见" in temp: 437 | pass 438 | else: 439 | value += 2 440 | return value 441 | 442 | def map_deal_0929(temp): 443 | try: 444 | if isnan(float(temp)): 445 | return -1 446 | else: 447 | return float(temp) 448 | except Exception: 449 | temp = str(temp) 450 | value = 0 451 | if "不全" in temp: 452 | value = 3 453 | if "增生" in temp: 454 | value = 6 455 | return value 456 | 457 | def map_deal_A202(temp): 458 | try: 459 | if isnan(float(temp)): 460 | return -1 461 | else: 462 | return float(temp) 463 | except Exception: 464 | temp = str(temp) 465 | value = 0 466 | if "陈旧" in temp: 467 | value = 5 468 | if "灶" in temp: 469 | value += 1 470 | return value 471 | 472 | def map_deal_1102(temp): 473 | try: 474 | if isnan(float(temp)): 475 | return -1 476 | else: 477 | return float(temp) 478 | except Exception: 479 | temp = str(temp) 480 | value = 0 481 | if "增生" in temp: 482 | value += 1 483 | return value 484 | 485 | def map_deal_0208(temp): 486 | try: 487 | if isnan(float(temp)): 488 | return -1 489 | else: 490 | return float(temp) 491 | except Exception: 492 | temp = str(temp) 493 | value = 0 494 | if "正常" in temp or "未见" in temp: 495 | pass 496 | else: 497 | value += 1 498 | return value 499 | 500 | def map_deal_0209(temp): 501 | try: 502 | if isnan(float(temp)): 503 | return -1 504 | else: 505 | return float(temp) 506 | except Exception: 507 | temp = str(temp) 508 | value = 0 509 | if "正常" in temp or "未见" in temp: 510 | pass 511 | else: 512 | value += 1 513 | return value 514 | 515 | def map_deal_0210(temp): 516 | try: 517 | if isnan(float(temp)): 518 | return -1 519 | else: 520 | return float(temp) 521 | except Exception: 522 | temp = str(temp) 523 | value = 0 524 | if "正常" in temp or "未见" in temp: 525 | pass 526 | else: 527 | value += 1 528 | return value 529 | 530 | def map_deal_0215(temp): 531 | try: 532 | if isnan(float(temp)): 533 | return -1 534 | else: 535 | return float(temp) 536 | except Exception: 537 | temp = str(temp) 538 | value = 0 539 | if "充血" in temp: 540 | value = 5 541 | 542 | if "正常" in temp or "未见" in temp: 543 | pass 544 | else: 545 | value += 1 546 | return value 547 | 548 | def map_deal_0217(temp): 549 | try: 550 | if isnan(float(temp)): 551 | return -1 552 | else: 553 | return 
float(temp) 554 | except Exception: 555 | temp = str(temp) 556 | value = 0 557 | if "肿" in temp: 558 | value = 3 559 | return value 560 | 561 | def map_deal_4001(temp): 562 | try: 563 | if isnan(float(temp)): 564 | return -1 565 | else: 566 | return float(temp) 567 | except Exception: 568 | temp = str(temp) 569 | value = 0 570 | if "轻度" in temp: 571 | value = 3 572 | if "中度" in temp: 573 | value = 5 574 | if "重度" in temp: 575 | value = 8 576 | return value 577 | 578 | def map_deal_1001(temp): 579 | try: 580 | if isnan(float(temp)): 581 | return -1 582 | else: 583 | return float(temp) 584 | except Exception: 585 | temp = str(temp) 586 | value = 0 587 | if "过缓" in temp or "不齐" in temp or "偏" in temp: 588 | value += 3 589 | return value 590 | 591 | def map_deal_0409(temp): 592 | try: 593 | if isnan(float(temp)): 594 | return -1 595 | else: 596 | return float(temp) 597 | except Exception: 598 | temp = str(temp) 599 | value = 0 600 | if "血压" in temp: 601 | value += 9 602 | if "糖尿" in temp: 603 | value += 3 604 | if "脂肪" in temp: 605 | value += 5 606 | return value 607 | 608 | def map_deal_0421(temp): 609 | try: 610 | if isnan(float(temp)): 611 | return -1 612 | else: 613 | return float(temp) 614 | except Exception: 615 | temp = str(temp) 616 | value = 0 617 | if "不齐" in temp: 618 | value += 3 619 | return value 620 | 621 | def map_deal_0424(temp): 622 | try: 623 | if isnan(float(temp)): 624 | return -1 625 | else: 626 | return float(temp) 627 | except Exception: 628 | temp = str(temp) 629 | value = 0 630 | if "次" in temp: 631 | if "70" in temp: 632 | value = 70 633 | else: 634 | value = 80 635 | return value 636 | 637 | def map_deal_0434(temp): 638 | try: 639 | if isnan(float(temp)): 640 | return -1 641 | else: 642 | return float(temp) 643 | except Exception: 644 | temp = str(temp) 645 | value = 0 646 | if "血压" in temp: 647 | value += 9 648 | if "糖尿" in temp: 649 | value += 3 650 | if "脂肪" in temp: 651 | value += 5 652 | if "心" in temp: 653 | value += 1 654 | return value 655 | 656 | def map_deal_1402(temp): 657 | try: 658 | if isnan(float(temp)): 659 | return -1 660 | else: 661 | return float(temp) 662 | except Exception: 663 | temp = str(temp) 664 | value = 0 665 | if "硬" in temp: 666 | value += 5 667 | if "低" in temp: 668 | value += 1 669 | if "慢" in temp: 670 | value += 1 671 | return value 672 | 673 | def map_deal_0120(temp): 674 | try: 675 | if isnan(float(temp)): 676 | return -1 677 | else: 678 | return float(temp) 679 | except Exception: 680 | temp = str(temp) 681 | value = 0 682 | if "强回声" in temp: 683 | value += 5 684 | if "低" in temp: 685 | value += 1 686 | return value 687 | 688 | def map_deal_0984(temp): 689 | try: 690 | if isnan(float(temp)): 691 | return -1 692 | else: 693 | return float(temp) 694 | except Exception: 695 | temp = str(temp) 696 | value = 0 697 | if "增" in temp: 698 | value += 5 699 | return value 700 | 701 | def map_deal_100010(temp): 702 | try: 703 | if isnan(float(temp)): 704 | return -1 705 | else: 706 | return float(temp) 707 | except Exception: 708 | temp = str(temp) 709 | value = 0 710 | if "+" in temp: 711 | value += 5 712 | return value 713 | 714 | def map_deal_3190(temp): 715 | try: 716 | if isnan(float(temp)): 717 | return -1 718 | else: 719 | return float(temp) 720 | except Exception: 721 | temp = str(temp) 722 | value = 0 723 | if "+" in temp: 724 | value += 5 725 | return value 726 | 727 | def map_deal_3191(temp): 728 | try: 729 | if isnan(float(temp)): 730 | return -1 731 | else: 732 | return float(temp) 733 | except Exception: 734 | temp = str(temp) 
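# dipstick-style text result: any '+' scores 5, everything else scores 0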
735 | value = 0 736 | if "+" in temp: 737 | value += 5 738 | return value 739 | 740 | def map_deal_3192(temp): 741 | try: 742 | if isnan(float(temp)): 743 | return -1 744 | else: 745 | return float(temp) 746 | except Exception: 747 | temp = str(temp) 748 | value = 0 749 | if "+" in temp: 750 | value += 5 751 | return value 752 | 753 | 754 | def map_deal_3195(temp): 755 | try: 756 | if isnan(float(temp)): 757 | return -1 758 | else: 759 | return float(temp) 760 | except Exception: 761 | temp = str(temp) 762 | value = 0 763 | if "+" in temp: 764 | value += 5 765 | return value 766 | 767 | 768 | def map_deal_3196(temp): 769 | try: 770 | if isnan(float(temp)): 771 | return -1 772 | else: 773 | return float(temp) 774 | except Exception: 775 | temp = str(temp) 776 | value = 0 777 | if "+" in temp: 778 | value += 5 779 | return value 780 | 781 | 782 | def map_deal_3197(temp): 783 | try: 784 | if isnan(float(temp)): 785 | return -1 786 | else: 787 | return float(temp) 788 | except Exception: 789 | temp = str(temp) 790 | value = 0 791 | if "+" in temp: 792 | value += 5 793 | return value 794 | 795 | 796 | def map_deal_3430(temp): 797 | value = 0 798 | try: 799 | if isnan(float(temp)): 800 | return -1 801 | else: 802 | return float(temp) 803 | except Exception: 804 | temp = str(temp) 805 | if "+" in temp: 806 | value += 5 807 | return value 808 | 809 | 810 | def map_deal_3399(temp): 811 | try: 812 | if isnan(float(temp)): 813 | return -1 814 | else: 815 | return float(temp) 816 | except Exception: 817 | temp = str(temp) 818 | value = 0 819 | if "淡" in temp: 820 | value += 5 821 | return value 822 | 823 | 824 | item_list = ['3601', '0102', '0113', '0114', '0115', '0116', 825 | '0117', '0118', '0503', '0509', '0516', '0539', 826 | '2302', '1316', '0101', '0119', '0121', '0122', 827 | '0123', 'A705', '0911', '0912', '0929', 'A202', 828 | '1102', '0208', '0209', '0210', '0215', '0217', 829 | '4001', '1001', '0409', '0421', '0424', '0434', 830 | '1402', '0120', '0984', '100010', '3190', '3191', 831 | '3192', '3195', '3196', '3197', '3430', '3399'] 832 | 833 | map_list = [map_deal_3601, map_deal_0102, map_deal_0113, map_deal_0114, map_deal_0115, map_deal_0116, 834 | map_deal_0117, map_deal_0118, map_deal_0503, map_deal_0509, map_deal_0516, map_deal_0539, 835 | map_deal_2302, map_deal_1316, map_deal_0101, map_deal_0119, map_deal_0121, map_deal_0122, 836 | map_deal_0123, map_deal_A705, map_deal_0911, map_deal_0912, map_deal_0929, map_deal_A202, 837 | map_deal_1102, map_deal_0208, map_deal_0209, map_deal_0210, map_deal_0215, map_deal_0217, 838 | map_deal_4001, map_deal_1001, map_deal_0409, map_deal_0421, map_deal_0424, map_deal_0434, 839 | map_deal_1402, map_deal_0120, map_deal_0984, map_deal_100010, map_deal_3190, map_deal_3191, 840 | map_deal_3192, map_deal_3195, map_deal_3196, map_deal_3197, map_deal_3430, map_deal_3399 841 | ] 842 | 843 | 844 | def get_file(): 845 | train = pd.read_csv('../data/meinian_round1_train_20180408.csv', sep=',', encoding='gbk') 846 | test = pd.read_csv('../data//meinian_round1_test_b_20180505.csv', sep=',', encoding='gbk') 847 | data_part1 = pd.read_csv('../data/meinian_round1_data_part1_20180408.txt', sep='$', encoding='utf-8') 848 | data_part2 = pd.read_csv('../data/meinian_round1_data_part2_20180408.txt', sep='$', encoding='utf-8') 849 | 850 | # data_part1和data_part2进行合并,并剔除掉与train、test不相关vid所在的行 851 | part1_2 = pd.concat([data_part1, data_part2], axis=0) # {0/'index', 1/'columns'}, default 0 852 | part1_2 = pd.DataFrame(part1_2).sort_values('vid').reset_index(drop=True) 853 
844 | def get_file():
845 |     train = pd.read_csv('../data/meinian_round1_train_20180408.csv', sep=',', encoding='gbk')
846 |     test = pd.read_csv('../data/meinian_round1_test_b_20180505.csv', sep=',', encoding='gbk')
847 |     data_part1 = pd.read_csv('../data/meinian_round1_data_part1_20180408.txt', sep='$', encoding='utf-8')
848 |     data_part2 = pd.read_csv('../data/meinian_round1_data_part2_20180408.txt', sep='$', encoding='utf-8')
849 | 
850 |     # merge data_part1 and data_part2, then drop rows whose vid does not appear in train/test
851 |     part1_2 = pd.concat([data_part1, data_part2], axis=0)  # axis: {0/'index', 1/'columns'}, default 0
852 |     part1_2 = pd.DataFrame(part1_2).sort_values('vid').reset_index(drop=True)
853 |     vid_set = pd.concat([train['vid'], test['vid']], axis=0)
854 |     vid_set = pd.DataFrame(vid_set).sort_values('vid').reset_index(drop=True)
855 |     part1_2 = part1_2[part1_2['vid'].isin(vid_set['vid'])]
856 |     # filter out examination items (table_id) judged useless by common sense
857 |     part1_2 = filter_None(part1_2)
858 |     # light data processing
859 |     print(part1_2.shape)
860 |     vid_tabid_group = part1_2.groupby(['vid', 'table_id']).size().reset_index()
861 |     print('------------------------------dedupe and merge-----------------------------')
862 |     vid_tabid_group['new_index'] = vid_tabid_group['vid'] + '_' + vid_tabid_group['table_id']
863 |     vid_tabid_group_dup = vid_tabid_group[vid_tabid_group[0] > 1]['new_index']
864 | 
865 |     # print(vid_tabid_group_dup.head())  # 000330ad1f424114719b7525f400660b_0102
866 |     part1_2['new_index'] = part1_2['vid'] + '_' + part1_2['table_id']
867 | 
868 |     dup_part = part1_2[part1_2['new_index'].isin(list(vid_tabid_group_dup))]
869 |     dup_part = dup_part.sort_values(['vid', 'table_id'])
870 |     unique_part = part1_2[~part1_2['new_index'].isin(list(vid_tabid_group_dup))]
871 | 
872 |     part1_2_dup = dup_part.groupby(['vid', 'table_id']).apply(merge_table).reset_index()
873 |     part1_2_dup.rename(columns={0: 'field_results'}, inplace=True)
874 |     part1_2_res = pd.concat([part1_2_dup, unique_part[['vid', 'table_id', 'field_results']]])
875 | 
876 |     # rows-to-columns pivot
877 |     print('--------------------------rebuild index and columns---------------------------')
878 |     merge_part1_2 = part1_2_res.pivot(index='vid', values='field_results', columns='table_id')
879 |     merge_part1_2.to_csv('../data/merge_part1_2.csv', encoding='utf-8')
880 |     del merge_part1_2
881 |     time.sleep(10)
882 |     print('------------------------reload merge_part1_2 from disk--------------------------')
883 |     merge_part1_2 = pd.read_csv('../data/merge_part1_2.csv', sep=',', encoding='utf-8')
884 |     print('--------------new part1_2 assembled----------')
885 |     print(merge_part1_2.shape)
886 |     feats = remain_feat(merge_part1_2, thresh=0.96)  # the 96% missing-value cut-off described in the README
887 |     merge_part1_2 = merge_part1_2[feats]
888 | 
889 |     for i in range(len(item_list)):
890 |         merge_part1_2[item_list[i]] = merge_part1_2[item_list[i]].apply(map_list[i])
891 | 
892 |     tran_kind_dict = {}
893 |     for x in merge_part1_2.columns:
894 |         if merge_part1_2[x].dtype == 'object':
895 |             a = len(merge_part1_2[x].unique())
896 |             tran_kind_dict[x] = a
897 | 
898 |     drop_list = []
899 |     onehot_list = []
900 |     for x in tran_kind_dict.keys():
901 | 
902 |         if tran_kind_dict[x] <= 200:
903 |             onehot_list.append(x)
904 |         else:
905 |             if x != 'vid':
906 |                 drop_list.append(x)
907 | 
908 |     from sklearn import preprocessing
909 |     lbl = preprocessing.LabelEncoder()
910 |     for x in onehot_list:
911 |         merge_part1_2[x] = lbl.fit_transform(merge_part1_2[x].map(lambda x: str(x)))
912 | 
913 |     merge_part1_2.drop(drop_list, axis=1, inplace=True)
914 |     merge_part1_2 = merge_part1_2.convert_objects(convert_numeric=True)  # deprecated in newer pandas; pd.to_numeric is the modern route
915 |     train_of_part = merge_part1_2[merge_part1_2['vid'].isin(train['vid'])]
916 |     test_of_part = merge_part1_2[merge_part1_2['vid'].isin(test['vid'])]
917 |     train = pd.merge(train, train_of_part, on='vid')
918 |     test = pd.merge(test, test_of_part, on='vid')
919 |     return train, test
920 | 
921 | 
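# remain_feat (called at line 886 above) is defined earlier in this file;
# a minimal sketch of the behaviour the call site relies on, assuming it
# keeps the columns whose missing-value share does not exceed `thresh`
# (the 96%/98% cut-offs mentioned in the README):
def remain_feat_sketch(df, thresh=0.96):
    missing_share = df.isnull().mean()  # per-column NaN fraction
    return list(missing_share[missing_share <= thresh].index)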
922 | def do_map(merge_part1_2):
923 |     for i in range(len(item_list)):
924 |         merge_part1_2[item_list[i]] = merge_part1_2[item_list[i]].apply(map_list[i])
925 | 
926 |     merge_part1_2.info()
927 |     tran_kind_dict = {}
928 |     for x in merge_part1_2.columns:
929 |         if merge_part1_2[x].dtype == 'object':
930 |             a = len(merge_part1_2[x].unique())
931 |             tran_kind_dict[x] = a
932 | 
933 |     drop_list = []
934 |     onehot_list = []
935 |     for x in tran_kind_dict.keys():
936 | 
937 |         if tran_kind_dict[x] <= 200:
938 |             onehot_list.append(x)
939 |         else:
940 |             if x != 'vid':
941 |                 drop_list.append(x)
942 | 
943 |     from sklearn import preprocessing
944 |     lbl = preprocessing.LabelEncoder()
945 |     for x in onehot_list:
946 |         merge_part1_2[x] = lbl.fit_transform(merge_part1_2[x].map(lambda x: str(x)))
947 | 
948 |     merge_part1_2.drop(drop_list, axis=1, inplace=True)
949 |     merge_part1_2 = merge_part1_2.convert_objects(convert_numeric=True)  # deprecated in newer pandas; pd.to_numeric is the modern route
950 |     return merge_part1_2
951 | 
--------------------------------------------------------------------------------
/round1_rank2/code/main.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | # @Time : 2018/5/7 13:02
4 | # @Author : Juzphy
5 | import time
6 | import pandas as pd
7 | import lightgbm as lgb
8 | import numpy as np
9 | import re
10 | from collections import Iterable  # moved to collections.abc in newer Python
11 | from sklearn.model_selection import KFold
12 | from sklearn.metrics import classification_report
13 | import warnings
14 | from team.team_feature_work import get_file
15 | 
16 | warnings.filterwarnings('ignore')
17 | 
18 | 
19 | class DataPreProcess(object):
20 |     def __init__(self, threshold):
21 |         self.thresh = threshold
22 | 
23 |     def pre_process(self):
24 |         # filter out useless table_id rows
25 |         def filter_none(data):
26 |             data = data[data['field_results'] != '']
27 |             data = data[data['field_results'] != '未查']  # '未查' = "not examined"
28 |             return data
29 | 
30 |         # concatenate duplicated records for the same (vid, table_id)
31 |         def merge_table(df):
32 |             df['field_results'] = df['field_results'].astype(str)
33 |             if df.shape[0] > 1:
34 |                 merge_df = " ".join(list(df['field_results']))
35 |             else:
36 |                 merge_df = df['field_results'].values[0]
37 |             return merge_df
38 | 
39 |         # drop rarely filled fields with a high missing ratio; keep features that pass the threshold
40 |         def get_remain_feats(df):
41 |             exclude_feats = set()
42 |             print('----------current missing-value removal threshold: {}-----------'.format(self.thresh))
43 |             print('----------removing fields with heavy missing data-----------')
44 |             print('number of fields before removal', len(df.columns))
45 |             num_rows = df.shape[0]
46 |             for c in df.columns:
47 |                 num_missing = df[c].isnull().sum()
48 |                 if num_missing == 0:
49 |                     continue
50 |                 missing_percent = num_missing / float(num_rows)
51 |                 if missing_percent > self.thresh:
52 |                     exclude_feats.add(c)
53 |             print("number of fields removed: %s" % len(exclude_feats))
54 |             # keep the features that survive the threshold
55 |             remain_feats = set(df.columns) - exclude_feats
56 |             print('number of fields remaining', len(remain_feats))
57 |             return list(remain_feats)
58 | 
59 |         origin_train = pd.read_csv('../data/meinian_round1_train_20180408.csv', sep=',', encoding='gbk')
60 |         origin_test = pd.read_csv('../data/meinian_round1_test_b_20180505.csv', sep=',', encoding='gbk')
61 |         data_part1 = pd.read_csv('../data/meinian_round1_data_part1_20180408.txt', sep='$', encoding='utf-8')
62 |         data_part2 = pd.read_csv('../data/meinian_round1_data_part2_20180408.txt', sep='$', encoding='utf-8')
63 |         # merge data_part1 and data_part2, then drop rows whose vid does not appear in train/test
64 |         # axis: {0/'index', 1/'columns'}, default 0
65 |         part1_2 = pd.concat([data_part1, data_part2], axis=0)
66 |         part1_2 = pd.DataFrame(part1_2).sort_values('vid').reset_index(drop=True)
67 |         vid_set = pd.concat([origin_train['vid'], origin_test['vid']], axis=0)
68 |         vid_set = pd.DataFrame(vid_set).sort_values('vid').reset_index(drop=True)
69 |         part1_2 = part1_2[part1_2['vid'].isin(vid_set['vid'])]
70 |         part1_2 = filter_none(part1_2)
71 |         print(part1_2.shape)
72 |         vid_tabid_group = part1_2.groupby(['vid', 'table_id']).size().reset_index()
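        # The block below collapses duplicate (vid, table_id) records
        # before the pivot. Hypothetical example of what merge_table does:
        #
        #   vid     table_id  field_results        ->  field_results
        #   abc001  0102      125/80                   125/80 130/85
        #   abc001  0102      130/85
        #
        # Without this step DataFrame.pivot would raise on duplicate
        # (index, column) pairs.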
73 |         print('------------------------------dedupe and merge-----------------------------')
74 |         vid_tabid_group['new_index'] = vid_tabid_group['vid'] + '_' + vid_tabid_group['table_id']
75 |         vid_tabid_group_dup = vid_tabid_group[vid_tabid_group[0] > 1]['new_index']
76 |         part1_2['new_index'] = part1_2['vid'] + '_' + part1_2['table_id']
77 |         dup_part = part1_2[part1_2['new_index'].isin(list(vid_tabid_group_dup))]
78 |         dup_part = dup_part.sort_values(['vid', 'table_id'])
79 |         unique_part = part1_2[~part1_2['new_index'].isin(list(vid_tabid_group_dup))]
80 |         part1_2_dup = dup_part.groupby(['vid', 'table_id']).apply(merge_table).reset_index()
81 |         part1_2_dup.rename(columns={0: 'field_results'}, inplace=True)
82 |         part1_2_res = pd.concat([part1_2_dup, unique_part[['vid', 'table_id', 'field_results']]])
83 | 
84 |         # rows-to-columns pivot
85 |         print('--------------------------rebuild index and columns---------------------------')
86 |         merge_part1_2 = part1_2_res.pivot(index='vid', values='field_results', columns='table_id')
87 |         merge_part1_2.to_csv('../data/merge_part1_2_{}.csv'.format(self.thresh), encoding='utf-8')
88 |         del merge_part1_2
89 |         time.sleep(10)
90 |         print('------------------------reload merge_part1_2 from disk--------------------------')
91 |         merge_part1_2 = pd.read_csv('../data/merge_part1_2_{}.csv'.format(self.thresh), sep=',', encoding='utf-8')
92 |         print('--------------new part1_2 assembled----------')
93 |         print(merge_part1_2.shape)
94 |         feats = get_remain_feats(merge_part1_2)
95 |         return merge_part1_2[feats]
96 | 
97 | 
98 | class FeatureWork(object):
99 |     def __init__(self, thresh_num):
100 |         self.thresh_num = thresh_num
101 | 
102 |     def get_features(self):
103 |         # fatty liver severity, graded 0-4
104 |         def transform_101_102_113(df):
105 |             if df:
106 |                 if '脂肪肝趋势' in df:
107 |                     return 1
108 |                 elif '轻度' in df:
109 |                     if '中' not in df:
110 |                         return 2
111 |                     else:
112 |                         return 3
113 |                 elif '中度' in df:
114 |                     if '重' not in df:
115 |                         return 3
116 |                     else:
117 |                         return 4
118 |                 elif '重度' in df:
119 |                     return 4
120 |                 else:
121 |                     return 0
122 |             else:
123 |                 return np.nan
124 | 
125 |         def transform_2302(df):
126 |             try:
127 |                 if '健康' in df:
128 |                     if '亚健康' in df:
129 |                         return 1
130 |                     else:
131 |                         return 0
132 |                 elif '疾病' in df:
133 |                     return 2
134 |             except Exception:
135 |                 return df
136 | 
137 |         def high_sugar(df):
138 |             if df:
139 |                 if '血糖偏高' in df or '降糖' in df or '血糖' in df:
140 |                     return 1
141 |                 else:
142 |                     return 0
143 |             else:
144 |                 return np.nan
145 | 
146 |         def high_fat(df):
147 |             if df:
148 |                 if '血脂偏高' in df or '低脂' in df or '血脂' in df:
149 |                     return 1
150 |                 else:
151 |                     return 0
152 |             else:
153 |                 return np.nan
154 | 
155 |         def high_pressure(df):
156 |             if df:
157 |                 if '血压偏高' in df or '降压' in df or '血压' in df:
158 |                     return 1
159 |                 else:
160 |                     return 0
161 |             else:
162 |                 return np.nan
163 | 
164 |         def higher_pressure(df):
165 |             if df:
166 |                 if '血压偏高' not in df:
167 |                     if '高血压' in df:
168 |                         return 1
169 |                     else:
170 |                         return 0
171 |             else:
172 |                 return np.nan
173 | 
174 |         def higher_fat(df):
175 |             if df:
176 |                 if '血脂偏高' not in df:
177 |                     if '高血脂' in df:
178 |                         return 1
179 |                     else:
180 |                         return 0
181 |             else:
182 |                 return np.nan
183 | 
184 |         def higher_sugar(df):
185 |             if df:
186 |                 if '血糖偏高' not in df:
187 |                     if '高血糖' in df or '糖尿病' in df:
188 |                         return 1
189 |                     else:
190 |                         return 0
191 |             else:
192 |                 return np.nan
193 | 
194 |         def fatty_liver(df):
195 |             if df:
196 |                 if '脂肪肝' in df:
197 |                     return 1
198 |                 else:
199 |                     return 0
200 |             else:
201 |                 return np.nan
202 | 
203 |         def coronary_heart_disease(df):
204 |             if df:
205 |                 if '冠心病' in df or '冠状' in df:
206 |                     return 1
207 |                 else:
208 |                     return 0
209 |             else:
210 |                 return np.nan
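        # high_sugar ... smoke (below) all flag keyword hits in the free
        # text; a compact factory sketch of that shared shape
        # (illustrative only -- the explicit versions are what run):
        def make_keyword_flag(*keywords):
            def flag(text):
                if not text:
                    return np.nan
                return 1 if any(k in text for k in keywords) else 0
            return flag
        # e.g. fatty_liver == make_keyword_flag('脂肪肝')
        #      coronary_heart_disease == make_keyword_flag('冠心病', '冠状')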
211 | 
212 |         def kidney(df):
213 |             if df:
214 |                 if '肾' in df:
215 |                     return 1
216 |                 else:
217 |                     return 0
218 |             else:
219 |                 return np.nan
220 | 
221 |         def smoke(df):
222 |             if df:
223 |                 if '烟' in df:
224 |                     return 1
225 |                 else:
226 |                     return 0
227 |             else:
228 |                 return np.nan
229 | 
230 |         def strQ2B(df):
231 |             """Convert full-width characters to half-width."""
232 |             if isinstance(df, Iterable):
233 |                 rstring = ""
234 |                 for uchar in df:
235 |                     inside_code = ord(uchar)
236 |                     # a full-width space converts directly
237 |                     if inside_code == 12288:
238 |                         inside_code = 32
239 |                     elif 65281 <= inside_code <= 65374:
240 |                         inside_code -= 65248
241 |                     rstring += chr(inside_code)
242 |                 return rstring
243 |             else:
244 |                 return df
245 | 
246 |         def extract_num(df):
247 |             try:
248 |                 df = float(df)
249 |                 if df <= 0:
250 |                     return np.nan
251 |                 return df
252 |             except Exception:
253 |                 if '.' in df:
254 |                     temp = re.findall(r'(\d+\.\d+)', df)
255 |                 else:
256 |                     temp = re.findall(r'(\d+)', df)
257 |                 if temp:
258 |                     return float(temp[0])
259 |                 else:
260 |                     return np.nan
261 | 
262 |         def blood_pipe_style(df):
263 |             try:
264 |                 if '良好' in df or '正常' in df:
265 |                     return 0
266 |                 elif '趋势' in df:
267 |                     return 1
268 |                 elif '轻度' in df:
269 |                     return 2
270 |                 elif '中度' in df:
271 |                     return 3
272 |                 elif '重度' in df:
273 |                     return 4
274 |                 elif '硬化' in df:
275 |                     return 5
276 |                 else:
277 |                     return np.nan
278 |             except Exception:
279 |                 return df
280 | 
281 |         def ying_yang(df):
282 |             try:
283 |                 if '+' in df and '-' in df:
284 |                     return 1
285 |                 elif '+' in df and '-' not in df:
286 |                     return 2
287 |                 elif ('-' in df or '阴' in df or '正常' in df or 'Normal' in df) and '+' not in df:
288 |                     return 0
289 |                 else:
290 |                     return 0
291 |             except Exception:
292 |                 return df
293 | 
294 |         def HP_yy(df):
295 |             try:
296 |                 if '阳' in df:
297 |                     return 1
298 |                 else:
299 |                     return 0
300 |             except Exception:
301 |                 return df
302 | 
303 |         # urine specific gravity flag (applied to field 3193 below)
304 |         def urine(df):
305 |             try:
306 |                 if '>=' in df:
307 |                     return 1
308 |                 else:
309 |                     return 0
310 |             except Exception:
311 |                 return df
312 | 
313 |         def heart_rate(df):
314 |             try:
315 |                 if df != '强弱不等':
316 |                     if '弱' in df or '远' in df or '低' in df:
317 |                         return 1
318 |                     elif '强' in df or '力' in df:
319 |                         return 3
320 |                     else:
321 |                         return 0
322 |                 else:
323 |                     return 2
324 |             except Exception:
325 |                 return df
326 | 
327 |         def transform_421(df):
328 |             try:
329 |                 if '齐' in df and '不' not in df:
330 |                     return 0
331 |                 else:
332 |                     return 1
333 |             except Exception:
334 |                 return df
335 | 
336 |         def transform_430(df):
337 |             try:
338 |                 if df == '软':
339 |                     return 1
340 |                 elif df == '中':
341 |                     return 2
342 |                 elif df == '硬':
343 |                     return 3
344 |                 else:
345 |                     return 0
346 |             except Exception:
347 |                 return df
348 | 
349 |         def transform_403(df):
350 |             try:
351 |                 if '大' in df and '无' not in df:
352 |                     return 1
353 |                 else:
354 |                     return 0
355 |             except Exception:
356 |                 return df
357 | 
358 |         def transform_3399(df):
359 |             try:
360 |                 if df == '黄色' or df == 'yellow':
361 |                     return 2
362 |                 elif df == '淡黄色' or df == '浅黄色':
363 |                     return 1
364 |                 elif df == '无色':
365 |                     return 0
366 |                 elif '红' in df:
367 |                     return 3
368 |                 elif df == '混浊':
369 |                     return 4
370 |                 else:
371 |                     return 5
372 |             except Exception:
373 |                 return df
374 | 
375 |         def lung_voice(df):
376 |             try:
377 |                 if '干啰' in df:
378 |                     return 1
379 |                 elif '湿啰' in df:
380 |                     return 2
381 |                 elif '哮鸣' in df:
382 |                     return 3
383 |                 elif '湿鸣' in df:
384 |                     return 4
385 |                 else:
386 |                     return 0
387 |             except Exception:
388 |                 return df
389 | 
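        # ying_yang above encodes qualitative +/- lab strings ordinally,
        # e.g. (hypothetical raw values):
        #   '阴性' / '-' / 'Normal' -> 0    '+-' -> 1    '+' / '++' -> 2
        # Non-string values fall into the except branch and pass through
        # unchanged.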
390 |         def one_hot(data_frame):
391 |             one_hot_list = ['101', '102', '113', '409', '413', '434', '439', 'A201', 'A202', '4001', '705', 'A301', '709', '985']
392 |             data_frame.loc[:, one_hot_list] = data_frame.loc[:, one_hot_list].fillna('')
393 |             data_frame['4001'] = data_frame['4001'].astype(str)
394 |             data_frame['705'] = data_frame['705'].astype(str)
395 |             data_frame['709'] = data_frame['709'].astype(str)
396 |             data_frame['A301'] = data_frame['A301'].astype(str)
397 |             data_frame['985'] = data_frame['985'].astype(str)
398 |             data_frame['439'] = data_frame['439'].astype(str)
399 |             frame_409_434 = data_frame['409'] + data_frame['434'] + data_frame['413'] + data_frame['4001'] + \
400 |                             data_frame['A201'] + data_frame['A301'] + data_frame['A202'] + data_frame['705'] + \
401 |                             data_frame['709'] + data_frame['985'] + data_frame['439']
402 |             data_frame['血压偏高'] = frame_409_434.apply(high_pressure)
403 |             data_frame['血脂偏高'] = frame_409_434.apply(high_fat)
404 |             data_frame['血糖偏高'] = frame_409_434.apply(high_sugar)
405 |             data_frame['高血糖'] = frame_409_434.apply(higher_sugar)
406 |             data_frame['高血脂'] = frame_409_434.apply(higher_fat)
407 |             data_frame['高血压'] = frame_409_434.apply(higher_pressure)
408 |             data_frame['脂肪肝'] = frame_409_434.apply(fatty_liver)
409 |             data_frame['冠心病'] = frame_409_434.apply(coronary_heart_disease)
410 |             data_frame['肾问题'] = frame_409_434.apply(kidney)
411 |             data_frame['吸烟'] = frame_409_434.apply(smoke)
412 |             fat_liver_num = data_frame['101'] + data_frame['102'] + data_frame['113']
413 |             data_frame['脂肪肝程度'] = fat_liver_num.apply(transform_101_102_113)
414 | 
415 |         def cm2mm(df):
416 |             try:
417 |                 if 'cm' in df:
418 |                     temp_cm = re.findall(r'(\d+(?:\.\d+)?).*?x(\d+(?:\.\d+)?)', df)  # capture the two dimensions around the 'x'
419 |                     if temp_cm:
420 |                         return float(temp_cm[0][0]) * float(temp_cm[0][1]) * 100  # cm * cm -> mm^2
421 |                 elif 'mm' in df:
422 |                     temp_mm = re.findall(r'(\d+(?:\.\d+)?).*?x(\d+(?:\.\d+)?)', df)
423 |                     if temp_mm:
424 |                         return float(temp_mm[0][0]) * float(temp_mm[0][1])
425 |                 else:
426 |                     return np.nan
427 |             except Exception:
428 |                 return np.nan
429 | 
430 |         def get_num_from_102_front(df):
431 |             try:
432 |                 temp_x = re.findall(r'(\d+)/(\d+)', df)
433 |                 if temp_x:
434 |                     return float(temp_x[0][0])
435 |             except Exception:
436 |                 return np.nan
437 | 
438 |         def get_num_from_102_back(df):
439 |             try:
440 |                 temp_x = re.findall(r'(\d+)/(\d+)', df)
441 |                 if temp_x:
442 |                     return float(temp_x[0][1])
443 |             except Exception:
444 |                 return np.nan
445 | 
446 |         def word2num(data_frame):
447 |             drop_list = ['3193', '420', '431', '976', '429', '422', '423', '426', '3400', '3485', '3486', '30007']
448 |             drop_list2 = ['101', '102', '113', '409', '413', '434', 'A201', 'A202', '4001', '705', 'A301', '709']
449 |             drop_list3 = ['1001', '114', '116', '117', '118', '121', '985', '439']
450 |             drop_list.extend(drop_list2)
451 |             drop_list.extend(drop_list3)
452 |             yy_list = ['3190', '3191', '3192', '3194', '3195', '3196', '3197', '3430', '100010']
453 |             for y in yy_list:
454 |                 data_frame[y] = data_frame[y].apply(ying_yang)
455 |             data_frame['尿比重'] = data_frame['3193'].apply(urine)
456 |             data_frame['心音'] = data_frame['420'].apply(heart_rate)
457 |             data_frame['430'] = data_frame['430'].apply(transform_430)
458 |             data_frame['3399'] = data_frame['3399'].apply(transform_3399)
459 |             data_frame['3301'] = data_frame['3301'].apply(HP_yy)
460 |             data_frame['403'] = data_frame['403'].apply(transform_403)
461 |             data_frame['421'] = data_frame['421'].apply(transform_421)
462 |             data_frame['405'] = data_frame['405'].apply(lung_voice)
463 |             data_frame['gender'] = data_frame['121'].apply(lambda n: 1 if isinstance(n, Iterable) else 0)  # presence of field 121 serves as a gender proxy
464 |             data_frame['血管弹性'] = data_frame['4001'].apply(blood_pipe_style)
465 |             data_frame['2302'] = data_frame['2302'].apply(transform_2302)
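            # cm2mm worked example (hypothetical strings, after the loop
            # below lower-cases and normalises '×'/'*' to 'x'):
            #   '1.5cmx2.0cm' -> 1.5 * 2.0 * 100 = 300.0   (cm^2 in mm^2)
            #   '15mmx20mm'   -> 15.0 * 20.0    = 300.0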
466 |             one_hot(data_frame)
467 |             for x, y in zip(['113', '114', '116', '117', '118'], ['肝脏回声', '胆囊回声', '脾脏回声', '左肾回声', '右肾回声']):
468 |                 data_frame[x] = data_frame[x].apply(strQ2B)
469 |                 data_frame[x] = data_frame[x].apply(lambda n: n.lower().replace('×', 'x').replace('*', 'x') if
470 |                                                     isinstance(n, Iterable) else n)
471 |                 data_frame[y] = data_frame[x].apply(cm2mm)
472 |             data_frame['血压_front'] = data_frame['102'].apply(get_num_from_102_front)
473 |             data_frame['血压_back'] = data_frame['102'].apply(get_num_from_102_back)
474 |             data_frame['心跳次数'] = data_frame['1001'].apply(extract_num)
475 |             data_frame.drop(drop_list, axis=1, inplace=True)
476 |             return data_frame
477 | 
478 |         def file_split(data_frame, path):
479 |             with open(path, encoding='utf8') as f:
480 |                 feature_list = [i for i in f.read().split(', ')]
481 |             features = data_frame[feature_list]
482 |             return features
483 | 
484 |         def save_all_num(data_frame):
485 |             for c in data_frame.columns:
486 |                 if c != 'vid':
487 |                     data_frame[c] = data_frame[c].apply(extract_num)
488 |                     q_num = data_frame[c].quantile(0.9) * 1.5
489 |                     data_frame[c] = data_frame[c].apply(lambda x: x if x < q_num else np.nan)  # clip outliers above 1.5x the 90th percentile
490 |             return data_frame
491 | 
492 |         # merge in the teammate's engineered features
493 |         def add_new_feature(df_mine, df_team, save_path):
494 |             columns = list(set(df_team.columns) - set(df_mine.columns) -
495 |                            set(['舒张压', '收缩压', '血清高密度脂蛋白', '血清低密度脂蛋白', '血清甘油三酯']))
496 |             columns.append('vid')
497 |             new_data = df_team[columns]
498 |             final_data = pd.merge(df_mine, new_data, on='vid')
499 |             final_data.to_csv(save_path, encoding='utf8', index=False)
500 | 
501 |         dpp = DataPreProcess(threshold=self.thresh_num)
502 |         all_data = dpp.pre_process()
503 |         all_data.columns = [a[1:] if a.startswith('0') else a for a in all_data.columns]  # strip the leading zero so ids match the feature lists ('0102' -> '102')
504 |         train_set = pd.read_csv('../data/meinian_round1_train_20180408.csv', sep=',', encoding='gbk')
505 |         for t in train_set.columns:
506 |             if t != 'vid':
507 |                 train_set[t] = train_set[t].apply(extract_num)
508 |         test_set = pd.read_csv('../data/meinian_round1_test_b_20180505.csv', sep=',', encoding='gbk')
509 |         num_data_temp = file_split(all_data, '../features/num_label.txt')
510 |         word_data_temp = file_split(all_data, '../features/word_label.txt')
511 |         num_data_temp.to_csv('../data/num_data.csv', encoding='utf8', index=False)
512 |         word_data_temp.to_csv('../data/word_data.csv', encoding='utf8', index=False)
513 |         num_data = pd.read_csv('../data/num_data.csv', encoding='utf8')
514 |         word_data = pd.read_csv('../data/word_data.csv', encoding='utf8')
515 |         num_data = save_all_num(num_data)
516 |         word_data = word2num(word_data)
517 |         transform_data = pd.merge(num_data, word_data, on='vid')
518 |         train_of_part = transform_data[transform_data['vid'].isin(train_set['vid'])]
519 |         test_of_part = transform_data[transform_data['vid'].isin(test_set['vid'])]
520 |         train_set = pd.merge(train_set, train_of_part, on='vid')
521 |         train_set.loc[train_set['vid'] == '7685d48685028a006c84070f68854ce1', '舒张压'] = 64  # manual fixes for mis-entered labels
522 |         train_set.loc[train_set['vid'] == 'fa04c8db6d201b9f705a00c3086481b0', '舒张压'] = 74
523 |         train_set.loc[train_set['vid'] == 'de82a4130c4907cff4bfb96736674bbc', '血清低密度脂蛋白'] = 1.22
524 |         train_set.loc[train_set['vid'] == 'd9919661f0a45fbcacc4aa2c1119c3d2', '血清低密度脂蛋白'] = 0.12
525 |         train_set.loc[train_set['vid'] == '798d859a63044a8a5addf1f8c528629e', '血清低密度脂蛋白'] = 0.06
526 |         test_set = pd.merge(test_set, test_of_part, on='vid')
527 |         team_train, team_test = get_file()
528 |         add_new_feature(train_set, team_train, '../data/train_set_merge.csv')
529 |         add_new_feature(test_set, team_test, '../data/test_set_merge.csv')
530 |         print('*************************train and test sets written successfully*************************')
531 | 
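# The competition metric implemented by eval_metric/eval_error below is
# the mean squared logarithmic error:
#     score = (1/n) * sum_i (log(pred_i + 1) - log(y_i + 1))**2
# averaged over the five targets. 'reg_sqrt': True in the LightGBM params
# fits sqrt-transformed labels (predictions are squared back
# automatically), a common way to damp large label values under a
# log-style metric.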
532 | 
533 | class LGBRegression(object):
534 |     def __init__(self):
535 |         self.params = {
536 |             'learning_rate': 0.01,
537 |             'boosting_type': 'gbdt',
538 |             'objective': 'mse',
539 |             'num_leaves': 62,
540 |             'reg_sqrt': True,
541 |             'feature_fraction': 0.8,
542 |             'bagging_fraction': 0.8,
543 |             'bagging_freq': 2,
544 |             'num_threads': -1,
545 |             'min_data_in_leaf': 5,
546 |             'verbose': -1
547 |         }
548 | 
549 |     def eval_metric(self, pred, labels):
550 |         return np.mean(np.power(np.log(pred + 1) - np.log(labels + 1), 2))
551 | 
552 |     def eval_error(self, pred, train_data):
553 |         labels = train_data.get_label()
554 |         score = np.mean(np.power(np.log(pred + 1) - np.log(labels + 1), 2))
555 |         return 'meinian', score, False
556 | 
557 |     def lgb_regression_model(self, df, label, use_feature, true_test, submission_data):
558 |         print("lightgbm: start training on label {}...".format(label))
559 |         value4preds = df[label]
560 |         value4preds = value4preds[value4preds.notnull()]
561 |         df = df.iloc[value4preds.index]  # positions match labels here because train carries a default RangeIndex
562 |         train_data = df.loc[:, use_feature]
563 |         print(train_data.shape)
564 |         scores = np.zeros(len(value4preds))
565 |         submission_scores = np.zeros((len(submission_data), 5))
566 |         num_round = 8000
567 |         kf = KFold(n_splits=5, shuffle=True, random_state=1024)
568 |         for t, (train_index, test_index) in enumerate(kf.split(train_data, value4preds), start=1):
569 |             print('training fold {}...'.format(t))
570 |             x_train, x_test = train_data.iloc[train_index], train_data.iloc[test_index]
571 |             y_train, y_test = value4preds.iloc[train_index], value4preds.iloc[test_index]
572 |             lgb_train = lgb.Dataset(x_train, y_train)
573 |             lgb_test = lgb.Dataset(x_test, y_test)
574 |             gbm = lgb.train(self.params,
575 |                             lgb_train,
576 |                             num_boost_round=num_round,
577 |                             valid_sets=lgb_test,
578 |                             verbose_eval=100,
579 |                             feval=self.eval_error,
580 |                             early_stopping_rounds=100)
581 |             scores[test_index] = gbm.predict(x_test)
582 |             submission_scores[:, t - 1] = gbm.predict(true_test)
583 |         submission_data[label] = np.mean(submission_scores, axis=1).round(3)
584 |         return self.eval_metric(scores, value4preds)
585 | 
586 | 
587 | class LGBClassification(object):
588 |     def __init__(self):
589 |         self.params = {
590 |             'learning_rate': 0.01,
591 |             'boosting_type': 'gbdt',
592 |             'objective': 'binary',
593 |             'metric': 'auc',
594 |             'num_leaves': 62,
595 |             'feature_fraction': 0.8,
596 |             'bagging_fraction': 0.8,
597 |             'bagging_freq': 2,
598 |             'verbose': -1,
599 |             'min_data_in_leaf': 5,
600 |         }
601 | 
602 |     # pos: classification cut-off
603 |     # df: training set
604 |     # label: mainly 血清甘油三酯 (cut-off 4) and 血清低密度脂蛋白 (cut-off 5)
605 |     # use_feature: features used for training
606 |     # save_path: where the classification result is saved
607 |     def lgb_classification_model(self, pos, df, label, use_feature, test_class, save_path):
608 |         print("start training with cut-off {}...".format(pos))
609 |         df['pos_{}'.format(pos)] = df[label].apply(lambda x: 1 if x > pos else 0)
610 |         test_preds = df['pos_{}'.format(pos)]
611 |         test4lgb = test_class.loc[:, use_feature]
612 |         train_preds = df[use_feature]
613 |         kf = KFold(n_splits=5, random_state=1024, shuffle=True)
614 |         pred_labels = np.zeros(df.shape[0])
615 |         submission_label = np.zeros((test4lgb.shape[0], 5))
616 |         for t, (train_index, test_index) in enumerate(kf.split(train_preds, test_preds), start=1):
617 |             print('training fold {}...'.format(t))
618 |             X_train, X_test = train_preds.iloc[train_index], train_preds.iloc[test_index]
619 |             y_train, y_test = test_preds.iloc[train_index], test_preds.iloc[test_index]
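            # scale_pos_weight note: the LightGBM convention is
            # n_negative / n_positive, which up-weights a rare positive
            # class; pos_weight below is instead the fold's positive-class
            # share (y_train.sum() / y_train.size), which down-weights the
            # positives.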
620 |             pos_weight = y_train.sum() / y_train.size
621 |             print(pos_weight)
622 |             self.params.update({'scale_pos_weight': pos_weight})
623 |             lgb_train = lgb.Dataset(X_train, y_train)
624 |             lgb_test = lgb.Dataset(X_test, y_test)
625 |             gbm = lgb.train(self.params,
626 |                             lgb_train,
627 |                             num_boost_round=8000,
628 |                             valid_sets=lgb_test,
629 |                             verbose_eval=100,
630 |                             early_stopping_rounds=100)
631 |             pred_labels[X_test.index] = np.where(gbm.predict(X_test) > 0.5, 1, 0)
632 |             self.params.pop('scale_pos_weight')
633 |             submission_label[:, t - 1] = np.where(gbm.predict(test4lgb) > 0.5, 1, 0)
634 |         test_class['pos_{}'.format(pos)] = np.where(np.sum(submission_label, axis=1) >= 1, 1, 0)  # positive if any fold votes positive
635 |         print(classification_report(test_preds, pred_labels))  # (y_true, y_pred)
636 |         test_class.to_csv(save_path, index=False, encoding='utf8')
637 | 
638 | 
639 | if __name__ == "__main__":
640 |     fw = FeatureWork(thresh_num=0.9)
641 |     fw.get_features()
642 |     train = pd.read_csv('../data/train_set_merge.csv', encoding='utf8', low_memory=False)
643 |     test = pd.read_csv('../data/test_set_merge.csv', encoding='utf8', low_memory=False)
644 |     print(train.shape, test.shape)
645 |     predict_features = ['舒张压', '收缩压', '血清高密度脂蛋白', '血清低密度脂蛋白', '血清甘油三酯']
646 |     train[predict_features] = train[predict_features]  # no-op
647 |     test[predict_features] = test[predict_features]  # no-op
648 |     use_features = [t for t in test.columns if t != 'vid' and t not in predict_features]
649 |     test_data = test.loc[:, use_features]
650 |     submission = test.loc[:, ['vid', '收缩压', '舒张压', '血清甘油三酯', '血清高密度脂蛋白', '血清低密度脂蛋白']]
651 |     base_line_score = np.zeros(5)
652 |     start = time.time()
653 |     lgb_reg = LGBRegression()
654 |     for i, j in enumerate(predict_features):
655 |         base_line_score[i] = lgb_reg.lgb_regression_model(train, j, use_features, test_data, submission)
656 |     print(dict(zip(predict_features, base_line_score)))
657 |     print('CV training took {} seconds'.format(time.time() - start))
658 |     print('offline score:', np.mean(base_line_score))
659 |     date1 = time.strftime('%Y%m%d_%H%M%S')
660 |     submission.to_csv('../submit/submit_{}.csv'.format(date1), index=None, header=None, encoding='utf8')
661 |     time.sleep(10)
662 |     lgr_class = LGBClassification()
663 |     lgr_class.lgb_classification_model(4, train, '血清甘油三酯', use_features, test, '../data/fat_class_pos4.csv')
664 |     time.sleep(10)
665 |     reg_test = pd.read_csv('../data/fat_class_pos4.csv', encoding='utf8', low_memory=False)
666 |     pos_eq_1 = reg_test[reg_test['pos_4'] == 1]
667 |     test_eq_1 = pos_eq_1.loc[:, use_features]
668 |     submission_gt_4 = pos_eq_1.loc[:, ['vid', '血清甘油三酯']]
669 |     train_gt_4 = train[train['血清甘油三酯'] >= 4]
670 |     train_gt_4.index = list(range(train_gt_4.shape[0]))
671 |     lgb_reg.lgb_regression_model(train_gt_4, '血清甘油三酯', use_features, test_eq_1, submission_gt_4)
672 |     submission_gt_4.to_csv('../data/submit_gt_4.csv', index=None, header=None, encoding='utf8')
673 |     gt_4_index = submission[submission['vid'].isin(submission_gt_4['vid'])].index
674 |     submission_temp = submission.loc[gt_4_index, ['vid', '血清甘油三酯']]
675 |     merge_fat = pd.merge(submission_temp, submission_gt_4, on='vid')
676 |     temp_columns = [tc for tc in merge_fat.columns if tc != 'vid']
677 |     replace_num = np.max(merge_fat.loc[:, temp_columns], axis=1)  # keep the larger of the stage-1 and stage-2 predictions
678 |     submission.loc[gt_4_index, '血清甘油三酯'] = replace_num.values
679 |     date2 = time.strftime('%Y%m%d_%H%M%S')
680 |     submission.to_csv('../submit/submit_{}.csv'.format(date2), index=None, encoding='utf8')
681 | 
--------------------------------------------------------------------------------
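The SQL below trains one XGBoost model per fold (tables tl_xgb_train_1..5 built by split_5_fold_data_xgb.py) and scores the full test table juz_test_6_7_xgb into jz_xgb_pred_val_1..5. A minimal sketch of combining those five outputs — calc_xgb_test_loss_and_save.py holds the repo's actual logic; this sketch assumes, as in the other ODPS scripts, a preconfigured global `odps` client and that the `result` column carries log-scale predictions:

import numpy as np
import pandas as pd

# pull the five per-fold prediction tables produced by the SQL below
folds = [odps.get_table('jz_xgb_pred_val_{}'.format(i)).to_df().to_pandas()
         for i in range(1, 6)]
# one column of log-scale predictions per fold, aligned on vid
pred = pd.concat([f.set_index('vid')['result'] for f in folds], axis=1)
# average in log space, then invert the log transform applied in
# add_prefix_for_xgb_model.py (log_tl = np.log(tl))
tl_pred = np.exp(pred.mean(axis=1))

--------------------------------------------------------------------------------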
/round2_rank10/xgb_model/xgb_in_odps.sql: -------------------------------------------------------------------------------- 1 | drop table if exists jz_xgb_pred_val_1; 2 | drop table if exists jz_xgb_pred_val_2; 3 | drop table if exists jz_xgb_pred_val_3; 4 | drop table if exists jz_xgb_pred_val_4; 5 | drop table if exists jz_xgb_pred_val_5; 6 | DROP OFFLINEMODEL IF EXISTS jz_xgb_model_1; 7 | DROP OFFLINEMODEL IF EXISTS jz_xgb_model_2; 8 | DROP OFFLINEMODEL IF EXISTS jz_xgb_model_3; 9 | DROP OFFLINEMODEL IF EXISTS jz_xgb_model_4; 10 | DROP OFFLINEMODEL IF EXISTS jz_xgb_model_5; 11 | 12 | 13 | -- train-fold-1 14 | PAI 15 | -name xgboost 16 | -project algo_public 17 | -DinputTableName="tl_xgb_train_1" 18 | -DmodelName="jz_xgb_model_1" 19 | -Deta="0.01" 20 | -Dobjective="reg:linear" 21 | -DitemDelimiter="," 22 | -Dseed="1024" 23 | -Dnum_round="1800" 24 | -DlabelColName="log_tl" 25 | -DenableSparse="false" 26 | -Dmax_depth="5" 27 | -Dsubsample="0.7" 28 | -Dcolsample_bytree="0.7" 29 | -Dgamma="0" 30 | -Dlambda="50" 31 | -DfeatureColNames="snp357_cc,snp357_cg,snp357_gg,snp191_cc,snp191_cg,snp191_gg,snp277_aa,snp277_ag,snp277_gg,snp39_cc,snp39_ct,snp39_tt,snp149_aa,snp149_ag,snp149_gg,snp298_cc,snp298_cg,snp298_gg,snp198_gg,snp198_gt,snp198_tt,snp234_aa,snp234_ag,snp234_gg,snp115_cc,snp115_ct,snp115_tt,snp291_aa,snp291_ag,snp291_gg,snp208_ac,snp208_at,snp208_cc,snp208_ct,snp208_tt,snp44_aa,snp44_ac,snp44_cc,snp223_gg,snp223_gt,snp223_tt,snp47_cc,snp47_ct,snp47_tt,jz_10009,jz_10012,jz_10013,jz_1124,jz_1125,jz_1844,jz_2168,jz_2387,jz_269026,jz_279006,jz_300006,jz_300007,jz_300009,jz_30006,jz_300066,jz_300092,jz_300117,jz_300150,jz_300151,jz_300152,jz_319100,jz_319298,jz_3801,jz_3805,jz_459154,jz_459155,jz_459156,jz_459158,jz_459159,jz_459206,jz_459207,jz_459210,jz_459211,jz_709001,jz_709003,jz_809020,jz_809022,jz_809024,jz_809027,jz_809037,jz_809038,jz_809039,jz_809040,jz_809041,jz_809042,jz_809043,jz_809044,jz_809045,jz_809046,jz_809047,jz_809048,jz_809049,jz_809050,jz_809051,jz_809052,jz_809053,jz_809054,jz_809055,jz_809056,jz_809057,jz_809058,jz_809059,jz_809060,jz_809061,jz_989001,jz_989002,jz_004997,jz_0107,jz_2165,jz_2376,jz_2390,jz_300051,jz_300069,jz_300070,jz_300073,jz_300074,jz_300076,jz_300078,jz_300113,jz_300119,jz_300125,jz_321,jz_3804,jz_3807,jz_1308,jz_1319,jz_1320,jz_1321,jz_1322,jz_0424,jz_0425_x,jz_left_shen_no_voice,jz_right_shen_no_voice,jz_right_shen_strong_voice,jz_jzx_no_voice_area,jz_qian_lie_xian_2,jz_qian_lie_xian_3,jz_dpm_from_3301,jz_jzx_no_voice_jiejie,jz_jzx_low_voice_area,jz_jzx_low_voice_jiejie,jz_liver_no_voice,jz_liver_strong_voice,jz_dan_strong_voice,jz_qian_lie_xian_1,jz_3190_x,jz_3194,jz_3195_x,jz_xue_ya_pian_gao,jz_xue_zhi_pian_gao,jz_xue_tang_pian_gao,jz_high_sugar,jz_guan_xin_bin,jz_shen,jz_smoke,jz_niao,jz_heart_rate,jz_3399_w,jz_3301_w,jz_0403_w,jz_0421_w,jz_0405_w,jz_gender,jz_blood_pipe_style,jz_health,jz_pres_front,jz_pres_back,jz_heart_times,jz_dannan_jieshi,jz_dannan_xirou,jz_shen_jieshi,jz_shen_nanz,jz_gan_nanz,jz_gan_ying_hua,jz_skin,jz_skin_dis,jz_liver,jz_liver_dis,jz_liver_fatty,jz_liver_hepa,jz_carotid_artery,jz_carotid_artery_dis,jz_carotid_artery_mild,jz_carotid_artery_hard,jz_carotid_artery_high_sugar,jz_carotid_artery_high_cho,jz_carotid_artery_heart,jz_thyroid,jz_thyroid_dis,jz_thyroid_up,jz_thyroid_down,jz_bile,jz_bile_dis,jz_bile_infla,jz_bile_cut,jz_kidney,jz_kidney_dis,jz_kidney_infla,jz_kidney_cut,jz_pancreatic,jz_pancreatic_dis,jz_lung,jz_lung_dis,jz_lung_infla,jz_lung_kerl,jz_lung_cut,jz_cervical,jz_cervical_dis,jz_cervical_infla,jz_cer
vical_cut,jz_cervical_end,jz_uterus,jz_uterus_dis,jz_uterus_infla,jz_uterus_cut,jz_uterus_end,jz_ovary,jz_ovary_dis,jz_ovary_infla,jz_ovary_cut,jz_vagina,jz_vagina_dis,jz_vagina_infla,jz_vagina_end,jz_bladder,jz_bladder_dis,jz_gland,jz_gland_dis,jz_gland_infla,jz_gland_cut,jz_anus,jz_anus_dis,jz_lympha,jz_lympha_dis,jz_lympha_infla,jz_mam,jz_mam_dis,jz_mam_infla,jz_mam_end,jz_mam_cut,jz_bone,jz_bone_less,jz_teeth,jz_teeth_dis,jz_teeth_smoke,jz_teeth_old,jz_tonsil,jz_tonsil_dis,jz_tonsil_infla,jz_tonsil_cut,jz_throat,jz_throat_dis,jz_nerve,jz_nerve_dis,jz_digestion_dis,jz_high_fat,jz_women,jz_men,jz_menopause,jz_old,jz_histroy_val,jz_31,jz_193,jz_2406,jz_269017,jz_269024,jz_0112,jz_269019,jz_317,jz_979012,jz_269013,jz_979019,jz_669009,jz_1840,jz_300129,jz_2372,jz_269025,jz_300035,jz_979021,jz_979022,jz_1873,jz_669021,jz_100005,jz_100006,jz_300093,jz_2386,jz_100010,jz_192,jz_269008,jz_100008,jz_979006,jz_1127,jz_269003,jz_269004,jz_191,jz_0105,jz_a703,jz_1814,jz_269005,jz_183,jz_809008,jz_979007,jz_300011,jz_669005,jz_269006,jz_269016,jz_143,jz_378,jz_a701,jz_1474,jz_2404,jz_979017,jz_979023,jz_3197,jz_316,jz_709038,jz_979013,jz_809017,jz_33,jz_100013,jz_269020,jz_979018,jz_979005,jz_3193,jz_979020,jz_139,jz_809009,jz_10003,jz_190,jz_979001,jz_100012,jz_0108,jz_0106,jz_141,jz_10002,jz_300008,jz_979004,jz_315,jz_1850,jz_269010,jz_979009,jz_38,jz_269018,jz_809004,jz_809025,jz_1107,jz_2174,jz_659025,jz_809013,jz_269021,jz_2405,jz_269012,jz_809023,jz_809010,jz_809026,jz_32,jz_2403,jz_10004,jz_269014,jz_3191,jz_100014,jz_0111,jz_269015,jz_320,jz_313,jz_300109,jz_2986,jz_2420,jz_669002,jz_269022,jz_39,jz_0109,jz_300012,jz_314,jz_300014,jz_1112,jz_669008,jz_669003,jz_37,jz_100007,jz_312,jz_979015,jz_669004,jz_269009,jz_1845,jz_3430,jz_669007,jz_709049,jz_979008,jz_979003,jz_809001,jz_300001,jz_1110,jz_1815,jz_1345,jz_979011,jz_300021,jz_0104,jz_979016,jz_809021,jz_1117,jz_34,jz_2371,jz_300013,jz_269007,jz_1115,jz_979014,jz_3192,jz_319,jz_269011,jz_269023,jz_20002,jz_979002,jz_1106,jz_0435,jz_1328,jz_0405,jz_3196,jz_0425_y,jz_0201,jz_1305,jz_3195_y,jz_1303,jz_3190_y,jz_0707,jz_1315,jz_0973,jz_3730,jz_0433,jz_1313,jz_0212,jz_0407,jz_0901,jz_0976,jz_3207,jz_3399,jz_0207,jz_0431,jz_0420,jz_1304,jz_0216,jz_0413,jz_0406,jz_0206,jz_0979,jz_0423,jz_2302,jz_0430,jz_3400,jz_0980,jz_0432,jz_0977,jz_sugar_high_related" 32 | -Dbase_score="0.2" 33 | -Dmin_child_weight="20" 34 | -DkvDelimiter=":"; 35 | 36 | -- predict-fold-1 37 | PAI 38 | -name prediction 39 | -project algo_public 40 | -DinputTableName="juz_test_6_7_xgb" 41 | -DappendColNames="vid,log_tl" 42 | -DmodelName="jz_xgb_model_1" 43 | -DitemDelimiter="," 44 | -DresultColName="result" 45 | -Dlifecycle="28" 46 | -DoutputTableName="jz_xgb_pred_val_1" 47 | -DkvDelimiter=":" 48 | -DenableSparse="false" 49 | 
-DfeatureColNames="snp357_cc,snp357_cg,snp357_gg,snp191_cc,snp191_cg,snp191_gg,snp277_aa,snp277_ag,snp277_gg,snp39_cc,snp39_ct,snp39_tt,snp149_aa,snp149_ag,snp149_gg,snp298_cc,snp298_cg,snp298_gg,snp198_gg,snp198_gt,snp198_tt,snp234_aa,snp234_ag,snp234_gg,snp115_cc,snp115_ct,snp115_tt,snp291_aa,snp291_ag,snp291_gg,snp208_ac,snp208_at,snp208_cc,snp208_ct,snp208_tt,snp44_aa,snp44_ac,snp44_cc,snp223_gg,snp223_gt,snp223_tt,snp47_cc,snp47_ct,snp47_tt,jz_10009,jz_10012,jz_10013,jz_1124,jz_1125,jz_1844,jz_2168,jz_2387,jz_269026,jz_279006,jz_300006,jz_300007,jz_300009,jz_30006,jz_300066,jz_300092,jz_300117,jz_300150,jz_300151,jz_300152,jz_319100,jz_319298,jz_3801,jz_3805,jz_459154,jz_459155,jz_459156,jz_459158,jz_459159,jz_459206,jz_459207,jz_459210,jz_459211,jz_709001,jz_709003,jz_809020,jz_809022,jz_809024,jz_809027,jz_809037,jz_809038,jz_809039,jz_809040,jz_809041,jz_809042,jz_809043,jz_809044,jz_809045,jz_809046,jz_809047,jz_809048,jz_809049,jz_809050,jz_809051,jz_809052,jz_809053,jz_809054,jz_809055,jz_809056,jz_809057,jz_809058,jz_809059,jz_809060,jz_809061,jz_989001,jz_989002,jz_004997,jz_0107,jz_2165,jz_2376,jz_2390,jz_300051,jz_300069,jz_300070,jz_300073,jz_300074,jz_300076,jz_300078,jz_300113,jz_300119,jz_300125,jz_321,jz_3804,jz_3807,jz_1308,jz_1319,jz_1320,jz_1321,jz_1322,jz_0424,jz_0425_x,jz_left_shen_no_voice,jz_right_shen_no_voice,jz_right_shen_strong_voice,jz_jzx_no_voice_area,jz_qian_lie_xian_2,jz_qian_lie_xian_3,jz_dpm_from_3301,jz_jzx_no_voice_jiejie,jz_jzx_low_voice_area,jz_jzx_low_voice_jiejie,jz_liver_no_voice,jz_liver_strong_voice,jz_dan_strong_voice,jz_qian_lie_xian_1,jz_3190_x,jz_3194,jz_3195_x,jz_xue_ya_pian_gao,jz_xue_zhi_pian_gao,jz_xue_tang_pian_gao,jz_high_sugar,jz_guan_xin_bin,jz_shen,jz_smoke,jz_niao,jz_heart_rate,jz_3399_w,jz_3301_w,jz_0403_w,jz_0421_w,jz_0405_w,jz_gender,jz_blood_pipe_style,jz_health,jz_pres_front,jz_pres_back,jz_heart_times,jz_dannan_jieshi,jz_dannan_xirou,jz_shen_jieshi,jz_shen_nanz,jz_gan_nanz,jz_gan_ying_hua,jz_skin,jz_skin_dis,jz_liver,jz_liver_dis,jz_liver_fatty,jz_liver_hepa,jz_carotid_artery,jz_carotid_artery_dis,jz_carotid_artery_mild,jz_carotid_artery_hard,jz_carotid_artery_high_sugar,jz_carotid_artery_high_cho,jz_carotid_artery_heart,jz_thyroid,jz_thyroid_dis,jz_thyroid_up,jz_thyroid_down,jz_bile,jz_bile_dis,jz_bile_infla,jz_bile_cut,jz_kidney,jz_kidney_dis,jz_kidney_infla,jz_kidney_cut,jz_pancreatic,jz_pancreatic_dis,jz_lung,jz_lung_dis,jz_lung_infla,jz_lung_kerl,jz_lung_cut,jz_cervical,jz_cervical_dis,jz_cervical_infla,jz_cervical_cut,jz_cervical_end,jz_uterus,jz_uterus_dis,jz_uterus_infla,jz_uterus_cut,jz_uterus_end,jz_ovary,jz_ovary_dis,jz_ovary_infla,jz_ovary_cut,jz_vagina,jz_vagina_dis,jz_vagina_infla,jz_vagina_end,jz_bladder,jz_bladder_dis,jz_gland,jz_gland_dis,jz_gland_infla,jz_gland_cut,jz_anus,jz_anus_dis,jz_lympha,jz_lympha_dis,jz_lympha_infla,jz_mam,jz_mam_dis,jz_mam_infla,jz_mam_end,jz_mam_cut,jz_bone,jz_bone_less,jz_teeth,jz_teeth_dis,jz_teeth_smoke,jz_teeth_old,jz_tonsil,jz_tonsil_dis,jz_tonsil_infla,jz_tonsil_cut,jz_throat,jz_throat_dis,jz_nerve,jz_nerve_dis,jz_digestion_dis,jz_high_fat,jz_women,jz_men,jz_menopause,jz_old,jz_histroy_val,jz_31,jz_193,jz_2406,jz_269017,jz_269024,jz_0112,jz_269019,jz_317,jz_979012,jz_269013,jz_979019,jz_669009,jz_1840,jz_300129,jz_2372,jz_269025,jz_300035,jz_979021,jz_979022,jz_1873,jz_669021,jz_100005,jz_100006,jz_300093,jz_2386,jz_100010,jz_192,jz_269008,jz_100008,jz_979006,jz_1127,jz_269003,jz_269004,jz_191,jz_0105,jz_a703,jz_1814,jz_269005,jz_183,jz_809008,jz_979007,jz_300011,jz_669005,j
z_269006,jz_269016,jz_143,jz_378,jz_a701,jz_1474,jz_2404,jz_979017,jz_979023,jz_3197,jz_316,jz_709038,jz_979013,jz_809017,jz_33,jz_100013,jz_269020,jz_979018,jz_979005,jz_3193,jz_979020,jz_139,jz_809009,jz_10003,jz_190,jz_979001,jz_100012,jz_0108,jz_0106,jz_141,jz_10002,jz_300008,jz_979004,jz_315,jz_1850,jz_269010,jz_979009,jz_38,jz_269018,jz_809004,jz_809025,jz_1107,jz_2174,jz_659025,jz_809013,jz_269021,jz_2405,jz_269012,jz_809023,jz_809010,jz_809026,jz_32,jz_2403,jz_10004,jz_269014,jz_3191,jz_100014,jz_0111,jz_269015,jz_320,jz_313,jz_300109,jz_2986,jz_2420,jz_669002,jz_269022,jz_39,jz_0109,jz_300012,jz_314,jz_300014,jz_1112,jz_669008,jz_669003,jz_37,jz_100007,jz_312,jz_979015,jz_669004,jz_269009,jz_1845,jz_3430,jz_669007,jz_709049,jz_979008,jz_979003,jz_809001,jz_300001,jz_1110,jz_1815,jz_1345,jz_979011,jz_300021,jz_0104,jz_979016,jz_809021,jz_1117,jz_34,jz_2371,jz_300013,jz_269007,jz_1115,jz_979014,jz_3192,jz_319,jz_269011,jz_269023,jz_20002,jz_979002,jz_1106,jz_0435,jz_1328,jz_0405,jz_3196,jz_0425_y,jz_0201,jz_1305,jz_3195_y,jz_1303,jz_3190_y,jz_0707,jz_1315,jz_0973,jz_3730,jz_0433,jz_1313,jz_0212,jz_0407,jz_0901,jz_0976,jz_3207,jz_3399,jz_0207,jz_0431,jz_0420,jz_1304,jz_0216,jz_0413,jz_0406,jz_0206,jz_0979,jz_0423,jz_2302,jz_0430,jz_3400,jz_0980,jz_0432,jz_0977,jz_sugar_high_related"; 50 | 51 | -- train-fold-2 52 | PAI 53 | -name xgboost 54 | -project algo_public 55 | -DinputTableName="tl_xgb_train_2" 56 | -DmodelName="jz_xgb_model_2" 57 | -Deta="0.01" 58 | -Dobjective="reg:linear" 59 | -DitemDelimiter="," 60 | -Dseed="1024" 61 | -Dnum_round="1800" 62 | -DlabelColName="log_tl" 63 | -DenableSparse="false" 64 | -Dmax_depth="5" 65 | -Dsubsample="0.7" 66 | -Dcolsample_bytree="0.7" 67 | -Dgamma="0" 68 | -Dlambda="50" 69 | -DfeatureColNames="snp357_cc,snp357_cg,snp357_gg,snp191_cc,snp191_cg,snp191_gg,snp277_aa,snp277_ag,snp277_gg,snp39_cc,snp39_ct,snp39_tt,snp149_aa,snp149_ag,snp149_gg,snp298_cc,snp298_cg,snp298_gg,snp198_gg,snp198_gt,snp198_tt,snp234_aa,snp234_ag,snp234_gg,snp115_cc,snp115_ct,snp115_tt,snp291_aa,snp291_ag,snp291_gg,snp208_ac,snp208_at,snp208_cc,snp208_ct,snp208_tt,snp44_aa,snp44_ac,snp44_cc,snp223_gg,snp223_gt,snp223_tt,snp47_cc,snp47_ct,snp47_tt,jz_10009,jz_10012,jz_10013,jz_1124,jz_1125,jz_1844,jz_2168,jz_2387,jz_269026,jz_279006,jz_300006,jz_300007,jz_300009,jz_30006,jz_300066,jz_300092,jz_300117,jz_300150,jz_300151,jz_300152,jz_319100,jz_319298,jz_3801,jz_3805,jz_459154,jz_459155,jz_459156,jz_459158,jz_459159,jz_459206,jz_459207,jz_459210,jz_459211,jz_709001,jz_709003,jz_809020,jz_809022,jz_809024,jz_809027,jz_809037,jz_809038,jz_809039,jz_809040,jz_809041,jz_809042,jz_809043,jz_809044,jz_809045,jz_809046,jz_809047,jz_809048,jz_809049,jz_809050,jz_809051,jz_809052,jz_809053,jz_809054,jz_809055,jz_809056,jz_809057,jz_809058,jz_809059,jz_809060,jz_809061,jz_989001,jz_989002,jz_004997,jz_0107,jz_2165,jz_2376,jz_2390,jz_300051,jz_300069,jz_300070,jz_300073,jz_300074,jz_300076,jz_300078,jz_300113,jz_300119,jz_300125,jz_321,jz_3804,jz_3807,jz_1308,jz_1319,jz_1320,jz_1321,jz_1322,jz_0424,jz_0425_x,jz_left_shen_no_voice,jz_right_shen_no_voice,jz_right_shen_strong_voice,jz_jzx_no_voice_area,jz_qian_lie_xian_2,jz_qian_lie_xian_3,jz_dpm_from_3301,jz_jzx_no_voice_jiejie,jz_jzx_low_voice_area,jz_jzx_low_voice_jiejie,jz_liver_no_voice,jz_liver_strong_voice,jz_dan_strong_voice,jz_qian_lie_xian_1,jz_3190_x,jz_3194,jz_3195_x,jz_xue_ya_pian_gao,jz_xue_zhi_pian_gao,jz_xue_tang_pian_gao,jz_high_sugar,jz_guan_xin_bin,jz_shen,jz_smoke,jz_niao,jz_heart_rate,jz_3399_w,jz_3301_w,jz_0403_w,jz_04
21_w,jz_0405_w,jz_gender,jz_blood_pipe_style,jz_health,jz_pres_front,jz_pres_back,jz_heart_times,jz_dannan_jieshi,jz_dannan_xirou,jz_shen_jieshi,jz_shen_nanz,jz_gan_nanz,jz_gan_ying_hua,jz_skin,jz_skin_dis,jz_liver,jz_liver_dis,jz_liver_fatty,jz_liver_hepa,jz_carotid_artery,jz_carotid_artery_dis,jz_carotid_artery_mild,jz_carotid_artery_hard,jz_carotid_artery_high_sugar,jz_carotid_artery_high_cho,jz_carotid_artery_heart,jz_thyroid,jz_thyroid_dis,jz_thyroid_up,jz_thyroid_down,jz_bile,jz_bile_dis,jz_bile_infla,jz_bile_cut,jz_kidney,jz_kidney_dis,jz_kidney_infla,jz_kidney_cut,jz_pancreatic,jz_pancreatic_dis,jz_lung,jz_lung_dis,jz_lung_infla,jz_lung_kerl,jz_lung_cut,jz_cervical,jz_cervical_dis,jz_cervical_infla,jz_cervical_cut,jz_cervical_end,jz_uterus,jz_uterus_dis,jz_uterus_infla,jz_uterus_cut,jz_uterus_end,jz_ovary,jz_ovary_dis,jz_ovary_infla,jz_ovary_cut,jz_vagina,jz_vagina_dis,jz_vagina_infla,jz_vagina_end,jz_bladder,jz_bladder_dis,jz_gland,jz_gland_dis,jz_gland_infla,jz_gland_cut,jz_anus,jz_anus_dis,jz_lympha,jz_lympha_dis,jz_lympha_infla,jz_mam,jz_mam_dis,jz_mam_infla,jz_mam_end,jz_mam_cut,jz_bone,jz_bone_less,jz_teeth,jz_teeth_dis,jz_teeth_smoke,jz_teeth_old,jz_tonsil,jz_tonsil_dis,jz_tonsil_infla,jz_tonsil_cut,jz_throat,jz_throat_dis,jz_nerve,jz_nerve_dis,jz_digestion_dis,jz_high_fat,jz_women,jz_men,jz_menopause,jz_old,jz_histroy_val,jz_31,jz_193,jz_2406,jz_269017,jz_269024,jz_0112,jz_269019,jz_317,jz_979012,jz_269013,jz_979019,jz_669009,jz_1840,jz_300129,jz_2372,jz_269025,jz_300035,jz_979021,jz_979022,jz_1873,jz_669021,jz_100005,jz_100006,jz_300093,jz_2386,jz_100010,jz_192,jz_269008,jz_100008,jz_979006,jz_1127,jz_269003,jz_269004,jz_191,jz_0105,jz_a703,jz_1814,jz_269005,jz_183,jz_809008,jz_979007,jz_300011,jz_669005,jz_269006,jz_269016,jz_143,jz_378,jz_a701,jz_1474,jz_2404,jz_979017,jz_979023,jz_3197,jz_316,jz_709038,jz_979013,jz_809017,jz_33,jz_100013,jz_269020,jz_979018,jz_979005,jz_3193,jz_979020,jz_139,jz_809009,jz_10003,jz_190,jz_979001,jz_100012,jz_0108,jz_0106,jz_141,jz_10002,jz_300008,jz_979004,jz_315,jz_1850,jz_269010,jz_979009,jz_38,jz_269018,jz_809004,jz_809025,jz_1107,jz_2174,jz_659025,jz_809013,jz_269021,jz_2405,jz_269012,jz_809023,jz_809010,jz_809026,jz_32,jz_2403,jz_10004,jz_269014,jz_3191,jz_100014,jz_0111,jz_269015,jz_320,jz_313,jz_300109,jz_2986,jz_2420,jz_669002,jz_269022,jz_39,jz_0109,jz_300012,jz_314,jz_300014,jz_1112,jz_669008,jz_669003,jz_37,jz_100007,jz_312,jz_979015,jz_669004,jz_269009,jz_1845,jz_3430,jz_669007,jz_709049,jz_979008,jz_979003,jz_809001,jz_300001,jz_1110,jz_1815,jz_1345,jz_979011,jz_300021,jz_0104,jz_979016,jz_809021,jz_1117,jz_34,jz_2371,jz_300013,jz_269007,jz_1115,jz_979014,jz_3192,jz_319,jz_269011,jz_269023,jz_20002,jz_979002,jz_1106,jz_0435,jz_1328,jz_0405,jz_3196,jz_0425_y,jz_0201,jz_1305,jz_3195_y,jz_1303,jz_3190_y,jz_0707,jz_1315,jz_0973,jz_3730,jz_0433,jz_1313,jz_0212,jz_0407,jz_0901,jz_0976,jz_3207,jz_3399,jz_0207,jz_0431,jz_0420,jz_1304,jz_0216,jz_0413,jz_0406,jz_0206,jz_0979,jz_0423,jz_2302,jz_0430,jz_3400,jz_0980,jz_0432,jz_0977,jz_sugar_high_related" 70 | -Dbase_score="0.2" 71 | -Dmin_child_weight="20" 72 | -DkvDelimiter=":"; 73 | 74 | -- predict-fold-2 75 | PAI 76 | -name prediction 77 | -project algo_public 78 | -DinputTableName="juz_test_6_7_xgb" 79 | -DappendColNames="vid,log_tl" 80 | -DmodelName="jz_xgb_model_2" 81 | -DitemDelimiter="," 82 | -DresultColName="result" 83 | -Dlifecycle="28" 84 | -DoutputTableName="jz_xgb_pred_val_2" 85 | -DkvDelimiter=":" 86 | -DenableSparse="false" 87 | 
-DfeatureColNames="snp357_cc,snp357_cg,snp357_gg,snp191_cc,snp191_cg,snp191_gg,snp277_aa,snp277_ag,snp277_gg,snp39_cc,snp39_ct,snp39_tt,snp149_aa,snp149_ag,snp149_gg,snp298_cc,snp298_cg,snp298_gg,snp198_gg,snp198_gt,snp198_tt,snp234_aa,snp234_ag,snp234_gg,snp115_cc,snp115_ct,snp115_tt,snp291_aa,snp291_ag,snp291_gg,snp208_ac,snp208_at,snp208_cc,snp208_ct,snp208_tt,snp44_aa,snp44_ac,snp44_cc,snp223_gg,snp223_gt,snp223_tt,snp47_cc,snp47_ct,snp47_tt,jz_10009,jz_10012,jz_10013,jz_1124,jz_1125,jz_1844,jz_2168,jz_2387,jz_269026,jz_279006,jz_300006,jz_300007,jz_300009,jz_30006,jz_300066,jz_300092,jz_300117,jz_300150,jz_300151,jz_300152,jz_319100,jz_319298,jz_3801,jz_3805,jz_459154,jz_459155,jz_459156,jz_459158,jz_459159,jz_459206,jz_459207,jz_459210,jz_459211,jz_709001,jz_709003,jz_809020,jz_809022,jz_809024,jz_809027,jz_809037,jz_809038,jz_809039,jz_809040,jz_809041,jz_809042,jz_809043,jz_809044,jz_809045,jz_809046,jz_809047,jz_809048,jz_809049,jz_809050,jz_809051,jz_809052,jz_809053,jz_809054,jz_809055,jz_809056,jz_809057,jz_809058,jz_809059,jz_809060,jz_809061,jz_989001,jz_989002,jz_004997,jz_0107,jz_2165,jz_2376,jz_2390,jz_300051,jz_300069,jz_300070,jz_300073,jz_300074,jz_300076,jz_300078,jz_300113,jz_300119,jz_300125,jz_321,jz_3804,jz_3807,jz_1308,jz_1319,jz_1320,jz_1321,jz_1322,jz_0424,jz_0425_x,jz_left_shen_no_voice,jz_right_shen_no_voice,jz_right_shen_strong_voice,jz_jzx_no_voice_area,jz_qian_lie_xian_2,jz_qian_lie_xian_3,jz_dpm_from_3301,jz_jzx_no_voice_jiejie,jz_jzx_low_voice_area,jz_jzx_low_voice_jiejie,jz_liver_no_voice,jz_liver_strong_voice,jz_dan_strong_voice,jz_qian_lie_xian_1,jz_3190_x,jz_3194,jz_3195_x,jz_xue_ya_pian_gao,jz_xue_zhi_pian_gao,jz_xue_tang_pian_gao,jz_high_sugar,jz_guan_xin_bin,jz_shen,jz_smoke,jz_niao,jz_heart_rate,jz_3399_w,jz_3301_w,jz_0403_w,jz_0421_w,jz_0405_w,jz_gender,jz_blood_pipe_style,jz_health,jz_pres_front,jz_pres_back,jz_heart_times,jz_dannan_jieshi,jz_dannan_xirou,jz_shen_jieshi,jz_shen_nanz,jz_gan_nanz,jz_gan_ying_hua,jz_skin,jz_skin_dis,jz_liver,jz_liver_dis,jz_liver_fatty,jz_liver_hepa,jz_carotid_artery,jz_carotid_artery_dis,jz_carotid_artery_mild,jz_carotid_artery_hard,jz_carotid_artery_high_sugar,jz_carotid_artery_high_cho,jz_carotid_artery_heart,jz_thyroid,jz_thyroid_dis,jz_thyroid_up,jz_thyroid_down,jz_bile,jz_bile_dis,jz_bile_infla,jz_bile_cut,jz_kidney,jz_kidney_dis,jz_kidney_infla,jz_kidney_cut,jz_pancreatic,jz_pancreatic_dis,jz_lung,jz_lung_dis,jz_lung_infla,jz_lung_kerl,jz_lung_cut,jz_cervical,jz_cervical_dis,jz_cervical_infla,jz_cervical_cut,jz_cervical_end,jz_uterus,jz_uterus_dis,jz_uterus_infla,jz_uterus_cut,jz_uterus_end,jz_ovary,jz_ovary_dis,jz_ovary_infla,jz_ovary_cut,jz_vagina,jz_vagina_dis,jz_vagina_infla,jz_vagina_end,jz_bladder,jz_bladder_dis,jz_gland,jz_gland_dis,jz_gland_infla,jz_gland_cut,jz_anus,jz_anus_dis,jz_lympha,jz_lympha_dis,jz_lympha_infla,jz_mam,jz_mam_dis,jz_mam_infla,jz_mam_end,jz_mam_cut,jz_bone,jz_bone_less,jz_teeth,jz_teeth_dis,jz_teeth_smoke,jz_teeth_old,jz_tonsil,jz_tonsil_dis,jz_tonsil_infla,jz_tonsil_cut,jz_throat,jz_throat_dis,jz_nerve,jz_nerve_dis,jz_digestion_dis,jz_high_fat,jz_women,jz_men,jz_menopause,jz_old,jz_histroy_val,jz_31,jz_193,jz_2406,jz_269017,jz_269024,jz_0112,jz_269019,jz_317,jz_979012,jz_269013,jz_979019,jz_669009,jz_1840,jz_300129,jz_2372,jz_269025,jz_300035,jz_979021,jz_979022,jz_1873,jz_669021,jz_100005,jz_100006,jz_300093,jz_2386,jz_100010,jz_192,jz_269008,jz_100008,jz_979006,jz_1127,jz_269003,jz_269004,jz_191,jz_0105,jz_a703,jz_1814,jz_269005,jz_183,jz_809008,jz_979007,jz_300011,jz_669005,j
z_269006,jz_269016,jz_143,jz_378,jz_a701,jz_1474,jz_2404,jz_979017,jz_979023,jz_3197,jz_316,jz_709038,jz_979013,jz_809017,jz_33,jz_100013,jz_269020,jz_979018,jz_979005,jz_3193,jz_979020,jz_139,jz_809009,jz_10003,jz_190,jz_979001,jz_100012,jz_0108,jz_0106,jz_141,jz_10002,jz_300008,jz_979004,jz_315,jz_1850,jz_269010,jz_979009,jz_38,jz_269018,jz_809004,jz_809025,jz_1107,jz_2174,jz_659025,jz_809013,jz_269021,jz_2405,jz_269012,jz_809023,jz_809010,jz_809026,jz_32,jz_2403,jz_10004,jz_269014,jz_3191,jz_100014,jz_0111,jz_269015,jz_320,jz_313,jz_300109,jz_2986,jz_2420,jz_669002,jz_269022,jz_39,jz_0109,jz_300012,jz_314,jz_300014,jz_1112,jz_669008,jz_669003,jz_37,jz_100007,jz_312,jz_979015,jz_669004,jz_269009,jz_1845,jz_3430,jz_669007,jz_709049,jz_979008,jz_979003,jz_809001,jz_300001,jz_1110,jz_1815,jz_1345,jz_979011,jz_300021,jz_0104,jz_979016,jz_809021,jz_1117,jz_34,jz_2371,jz_300013,jz_269007,jz_1115,jz_979014,jz_3192,jz_319,jz_269011,jz_269023,jz_20002,jz_979002,jz_1106,jz_0435,jz_1328,jz_0405,jz_3196,jz_0425_y,jz_0201,jz_1305,jz_3195_y,jz_1303,jz_3190_y,jz_0707,jz_1315,jz_0973,jz_3730,jz_0433,jz_1313,jz_0212,jz_0407,jz_0901,jz_0976,jz_3207,jz_3399,jz_0207,jz_0431,jz_0420,jz_1304,jz_0216,jz_0413,jz_0406,jz_0206,jz_0979,jz_0423,jz_2302,jz_0430,jz_3400,jz_0980,jz_0432,jz_0977,jz_sugar_high_related"; 88 | 89 | -- train-fold-3 90 | PAI 91 | -name xgboost 92 | -project algo_public 93 | -DinputTableName="tl_xgb_train_3" 94 | -DmodelName="jz_xgb_model_3" 95 | -Deta="0.01" 96 | -Dobjective="reg:linear" 97 | -DitemDelimiter="," 98 | -Dseed="1024" 99 | -Dnum_round="1800" 100 | -DlabelColName="log_tl" 101 | -DenableSparse="false" 102 | -Dmax_depth="5" 103 | -Dsubsample="0.7" 104 | -Dcolsample_bytree="0.7" 105 | -Dgamma="0" 106 | -Dlambda="50" 107 | -DfeatureColNames="snp357_cc,snp357_cg,snp357_gg,snp191_cc,snp191_cg,snp191_gg,snp277_aa,snp277_ag,snp277_gg,snp39_cc,snp39_ct,snp39_tt,snp149_aa,snp149_ag,snp149_gg,snp298_cc,snp298_cg,snp298_gg,snp198_gg,snp198_gt,snp198_tt,snp234_aa,snp234_ag,snp234_gg,snp115_cc,snp115_ct,snp115_tt,snp291_aa,snp291_ag,snp291_gg,snp208_ac,snp208_at,snp208_cc,snp208_ct,snp208_tt,snp44_aa,snp44_ac,snp44_cc,snp223_gg,snp223_gt,snp223_tt,snp47_cc,snp47_ct,snp47_tt,jz_10009,jz_10012,jz_10013,jz_1124,jz_1125,jz_1844,jz_2168,jz_2387,jz_269026,jz_279006,jz_300006,jz_300007,jz_300009,jz_30006,jz_300066,jz_300092,jz_300117,jz_300150,jz_300151,jz_300152,jz_319100,jz_319298,jz_3801,jz_3805,jz_459154,jz_459155,jz_459156,jz_459158,jz_459159,jz_459206,jz_459207,jz_459210,jz_459211,jz_709001,jz_709003,jz_809020,jz_809022,jz_809024,jz_809027,jz_809037,jz_809038,jz_809039,jz_809040,jz_809041,jz_809042,jz_809043,jz_809044,jz_809045,jz_809046,jz_809047,jz_809048,jz_809049,jz_809050,jz_809051,jz_809052,jz_809053,jz_809054,jz_809055,jz_809056,jz_809057,jz_809058,jz_809059,jz_809060,jz_809061,jz_989001,jz_989002,jz_004997,jz_0107,jz_2165,jz_2376,jz_2390,jz_300051,jz_300069,jz_300070,jz_300073,jz_300074,jz_300076,jz_300078,jz_300113,jz_300119,jz_300125,jz_321,jz_3804,jz_3807,jz_1308,jz_1319,jz_1320,jz_1321,jz_1322,jz_0424,jz_0425_x,jz_left_shen_no_voice,jz_right_shen_no_voice,jz_right_shen_strong_voice,jz_jzx_no_voice_area,jz_qian_lie_xian_2,jz_qian_lie_xian_3,jz_dpm_from_3301,jz_jzx_no_voice_jiejie,jz_jzx_low_voice_area,jz_jzx_low_voice_jiejie,jz_liver_no_voice,jz_liver_strong_voice,jz_dan_strong_voice,jz_qian_lie_xian_1,jz_3190_x,jz_3194,jz_3195_x,jz_xue_ya_pian_gao,jz_xue_zhi_pian_gao,jz_xue_tang_pian_gao,jz_high_sugar,jz_guan_xin_bin,jz_shen,jz_smoke,jz_niao,jz_heart_rate,jz_3399_w,jz_3301_w,jz_0403
_w,jz_0421_w,jz_0405_w,jz_gender,jz_blood_pipe_style,jz_health,jz_pres_front,jz_pres_back,jz_heart_times,jz_dannan_jieshi,jz_dannan_xirou,jz_shen_jieshi,jz_shen_nanz,jz_gan_nanz,jz_gan_ying_hua,jz_skin,jz_skin_dis,jz_liver,jz_liver_dis,jz_liver_fatty,jz_liver_hepa,jz_carotid_artery,jz_carotid_artery_dis,jz_carotid_artery_mild,jz_carotid_artery_hard,jz_carotid_artery_high_sugar,jz_carotid_artery_high_cho,jz_carotid_artery_heart,jz_thyroid,jz_thyroid_dis,jz_thyroid_up,jz_thyroid_down,jz_bile,jz_bile_dis,jz_bile_infla,jz_bile_cut,jz_kidney,jz_kidney_dis,jz_kidney_infla,jz_kidney_cut,jz_pancreatic,jz_pancreatic_dis,jz_lung,jz_lung_dis,jz_lung_infla,jz_lung_kerl,jz_lung_cut,jz_cervical,jz_cervical_dis,jz_cervical_infla,jz_cervical_cut,jz_cervical_end,jz_uterus,jz_uterus_dis,jz_uterus_infla,jz_uterus_cut,jz_uterus_end,jz_ovary,jz_ovary_dis,jz_ovary_infla,jz_ovary_cut,jz_vagina,jz_vagina_dis,jz_vagina_infla,jz_vagina_end,jz_bladder,jz_bladder_dis,jz_gland,jz_gland_dis,jz_gland_infla,jz_gland_cut,jz_anus,jz_anus_dis,jz_lympha,jz_lympha_dis,jz_lympha_infla,jz_mam,jz_mam_dis,jz_mam_infla,jz_mam_end,jz_mam_cut,jz_bone,jz_bone_less,jz_teeth,jz_teeth_dis,jz_teeth_smoke,jz_teeth_old,jz_tonsil,jz_tonsil_dis,jz_tonsil_infla,jz_tonsil_cut,jz_throat,jz_throat_dis,jz_nerve,jz_nerve_dis,jz_digestion_dis,jz_high_fat,jz_women,jz_men,jz_menopause,jz_old,jz_histroy_val,jz_31,jz_193,jz_2406,jz_269017,jz_269024,jz_0112,jz_269019,jz_317,jz_979012,jz_269013,jz_979019,jz_669009,jz_1840,jz_300129,jz_2372,jz_269025,jz_300035,jz_979021,jz_979022,jz_1873,jz_669021,jz_100005,jz_100006,jz_300093,jz_2386,jz_100010,jz_192,jz_269008,jz_100008,jz_979006,jz_1127,jz_269003,jz_269004,jz_191,jz_0105,jz_a703,jz_1814,jz_269005,jz_183,jz_809008,jz_979007,jz_300011,jz_669005,jz_269006,jz_269016,jz_143,jz_378,jz_a701,jz_1474,jz_2404,jz_979017,jz_979023,jz_3197,jz_316,jz_709038,jz_979013,jz_809017,jz_33,jz_100013,jz_269020,jz_979018,jz_979005,jz_3193,jz_979020,jz_139,jz_809009,jz_10003,jz_190,jz_979001,jz_100012,jz_0108,jz_0106,jz_141,jz_10002,jz_300008,jz_979004,jz_315,jz_1850,jz_269010,jz_979009,jz_38,jz_269018,jz_809004,jz_809025,jz_1107,jz_2174,jz_659025,jz_809013,jz_269021,jz_2405,jz_269012,jz_809023,jz_809010,jz_809026,jz_32,jz_2403,jz_10004,jz_269014,jz_3191,jz_100014,jz_0111,jz_269015,jz_320,jz_313,jz_300109,jz_2986,jz_2420,jz_669002,jz_269022,jz_39,jz_0109,jz_300012,jz_314,jz_300014,jz_1112,jz_669008,jz_669003,jz_37,jz_100007,jz_312,jz_979015,jz_669004,jz_269009,jz_1845,jz_3430,jz_669007,jz_709049,jz_979008,jz_979003,jz_809001,jz_300001,jz_1110,jz_1815,jz_1345,jz_979011,jz_300021,jz_0104,jz_979016,jz_809021,jz_1117,jz_34,jz_2371,jz_300013,jz_269007,jz_1115,jz_979014,jz_3192,jz_319,jz_269011,jz_269023,jz_20002,jz_979002,jz_1106,jz_0435,jz_1328,jz_0405,jz_3196,jz_0425_y,jz_0201,jz_1305,jz_3195_y,jz_1303,jz_3190_y,jz_0707,jz_1315,jz_0973,jz_3730,jz_0433,jz_1313,jz_0212,jz_0407,jz_0901,jz_0976,jz_3207,jz_3399,jz_0207,jz_0431,jz_0420,jz_1304,jz_0216,jz_0413,jz_0406,jz_0206,jz_0979,jz_0423,jz_2302,jz_0430,jz_3400,jz_0980,jz_0432,jz_0977,jz_sugar_high_related" 108 | -Dbase_score="0.2" 109 | -Dmin_child_weight="20" 110 | -DkvDelimiter=":"; 111 | 112 | -- predict-fold-3 113 | PAI 114 | -name prediction 115 | -project algo_public 116 | -DinputTableName="juz_test_6_7_xgb" 117 | -DappendColNames="vid,log_tl" 118 | -DmodelName="jz_xgb_model_3" 119 | -DitemDelimiter="," 120 | -DresultColName="result" 121 | -Dlifecycle="28" 122 | -DoutputTableName="jz_xgb_pred_val_3" 123 | -DkvDelimiter=":" 124 | -DenableSparse="false" 125 | 
-DfeatureColNames="snp357_cc,snp357_cg,snp357_gg,snp191_cc,snp191_cg,snp191_gg,snp277_aa,snp277_ag,snp277_gg,snp39_cc,snp39_ct,snp39_tt,snp149_aa,snp149_ag,snp149_gg,snp298_cc,snp298_cg,snp298_gg,snp198_gg,snp198_gt,snp198_tt,snp234_aa,snp234_ag,snp234_gg,snp115_cc,snp115_ct,snp115_tt,snp291_aa,snp291_ag,snp291_gg,snp208_ac,snp208_at,snp208_cc,snp208_ct,snp208_tt,snp44_aa,snp44_ac,snp44_cc,snp223_gg,snp223_gt,snp223_tt,snp47_cc,snp47_ct,snp47_tt,jz_10009,jz_10012,jz_10013,jz_1124,jz_1125,jz_1844,jz_2168,jz_2387,jz_269026,jz_279006,jz_300006,jz_300007,jz_300009,jz_30006,jz_300066,jz_300092,jz_300117,jz_300150,jz_300151,jz_300152,jz_319100,jz_319298,jz_3801,jz_3805,jz_459154,jz_459155,jz_459156,jz_459158,jz_459159,jz_459206,jz_459207,jz_459210,jz_459211,jz_709001,jz_709003,jz_809020,jz_809022,jz_809024,jz_809027,jz_809037,jz_809038,jz_809039,jz_809040,jz_809041,jz_809042,jz_809043,jz_809044,jz_809045,jz_809046,jz_809047,jz_809048,jz_809049,jz_809050,jz_809051,jz_809052,jz_809053,jz_809054,jz_809055,jz_809056,jz_809057,jz_809058,jz_809059,jz_809060,jz_809061,jz_989001,jz_989002,jz_004997,jz_0107,jz_2165,jz_2376,jz_2390,jz_300051,jz_300069,jz_300070,jz_300073,jz_300074,jz_300076,jz_300078,jz_300113,jz_300119,jz_300125,jz_321,jz_3804,jz_3807,jz_1308,jz_1319,jz_1320,jz_1321,jz_1322,jz_0424,jz_0425_x,jz_left_shen_no_voice,jz_right_shen_no_voice,jz_right_shen_strong_voice,jz_jzx_no_voice_area,jz_qian_lie_xian_2,jz_qian_lie_xian_3,jz_dpm_from_3301,jz_jzx_no_voice_jiejie,jz_jzx_low_voice_area,jz_jzx_low_voice_jiejie,jz_liver_no_voice,jz_liver_strong_voice,jz_dan_strong_voice,jz_qian_lie_xian_1,jz_3190_x,jz_3194,jz_3195_x,jz_xue_ya_pian_gao,jz_xue_zhi_pian_gao,jz_xue_tang_pian_gao,jz_high_sugar,jz_guan_xin_bin,jz_shen,jz_smoke,jz_niao,jz_heart_rate,jz_3399_w,jz_3301_w,jz_0403_w,jz_0421_w,jz_0405_w,jz_gender,jz_blood_pipe_style,jz_health,jz_pres_front,jz_pres_back,jz_heart_times,jz_dannan_jieshi,jz_dannan_xirou,jz_shen_jieshi,jz_shen_nanz,jz_gan_nanz,jz_gan_ying_hua,jz_skin,jz_skin_dis,jz_liver,jz_liver_dis,jz_liver_fatty,jz_liver_hepa,jz_carotid_artery,jz_carotid_artery_dis,jz_carotid_artery_mild,jz_carotid_artery_hard,jz_carotid_artery_high_sugar,jz_carotid_artery_high_cho,jz_carotid_artery_heart,jz_thyroid,jz_thyroid_dis,jz_thyroid_up,jz_thyroid_down,jz_bile,jz_bile_dis,jz_bile_infla,jz_bile_cut,jz_kidney,jz_kidney_dis,jz_kidney_infla,jz_kidney_cut,jz_pancreatic,jz_pancreatic_dis,jz_lung,jz_lung_dis,jz_lung_infla,jz_lung_kerl,jz_lung_cut,jz_cervical,jz_cervical_dis,jz_cervical_infla,jz_cervical_cut,jz_cervical_end,jz_uterus,jz_uterus_dis,jz_uterus_infla,jz_uterus_cut,jz_uterus_end,jz_ovary,jz_ovary_dis,jz_ovary_infla,jz_ovary_cut,jz_vagina,jz_vagina_dis,jz_vagina_infla,jz_vagina_end,jz_bladder,jz_bladder_dis,jz_gland,jz_gland_dis,jz_gland_infla,jz_gland_cut,jz_anus,jz_anus_dis,jz_lympha,jz_lympha_dis,jz_lympha_infla,jz_mam,jz_mam_dis,jz_mam_infla,jz_mam_end,jz_mam_cut,jz_bone,jz_bone_less,jz_teeth,jz_teeth_dis,jz_teeth_smoke,jz_teeth_old,jz_tonsil,jz_tonsil_dis,jz_tonsil_infla,jz_tonsil_cut,jz_throat,jz_throat_dis,jz_nerve,jz_nerve_dis,jz_digestion_dis,jz_high_fat,jz_women,jz_men,jz_menopause,jz_old,jz_histroy_val,jz_31,jz_193,jz_2406,jz_269017,jz_269024,jz_0112,jz_269019,jz_317,jz_979012,jz_269013,jz_979019,jz_669009,jz_1840,jz_300129,jz_2372,jz_269025,jz_300035,jz_979021,jz_979022,jz_1873,jz_669021,jz_100005,jz_100006,jz_300093,jz_2386,jz_100010,jz_192,jz_269008,jz_100008,jz_979006,jz_1127,jz_269003,jz_269004,jz_191,jz_0105,jz_a703,jz_1814,jz_269005,jz_183,jz_809008,jz_979007,jz_300011,jz_669005,j
z_269006,jz_269016,jz_143,jz_378,jz_a701,jz_1474,jz_2404,jz_979017,jz_979023,jz_3197,jz_316,jz_709038,jz_979013,jz_809017,jz_33,jz_100013,jz_269020,jz_979018,jz_979005,jz_3193,jz_979020,jz_139,jz_809009,jz_10003,jz_190,jz_979001,jz_100012,jz_0108,jz_0106,jz_141,jz_10002,jz_300008,jz_979004,jz_315,jz_1850,jz_269010,jz_979009,jz_38,jz_269018,jz_809004,jz_809025,jz_1107,jz_2174,jz_659025,jz_809013,jz_269021,jz_2405,jz_269012,jz_809023,jz_809010,jz_809026,jz_32,jz_2403,jz_10004,jz_269014,jz_3191,jz_100014,jz_0111,jz_269015,jz_320,jz_313,jz_300109,jz_2986,jz_2420,jz_669002,jz_269022,jz_39,jz_0109,jz_300012,jz_314,jz_300014,jz_1112,jz_669008,jz_669003,jz_37,jz_100007,jz_312,jz_979015,jz_669004,jz_269009,jz_1845,jz_3430,jz_669007,jz_709049,jz_979008,jz_979003,jz_809001,jz_300001,jz_1110,jz_1815,jz_1345,jz_979011,jz_300021,jz_0104,jz_979016,jz_809021,jz_1117,jz_34,jz_2371,jz_300013,jz_269007,jz_1115,jz_979014,jz_3192,jz_319,jz_269011,jz_269023,jz_20002,jz_979002,jz_1106,jz_0435,jz_1328,jz_0405,jz_3196,jz_0425_y,jz_0201,jz_1305,jz_3195_y,jz_1303,jz_3190_y,jz_0707,jz_1315,jz_0973,jz_3730,jz_0433,jz_1313,jz_0212,jz_0407,jz_0901,jz_0976,jz_3207,jz_3399,jz_0207,jz_0431,jz_0420,jz_1304,jz_0216,jz_0413,jz_0406,jz_0206,jz_0979,jz_0423,jz_2302,jz_0430,jz_3400,jz_0980,jz_0432,jz_0977,jz_sugar_high_related"; 126 | 127 | -- train-fold-4 128 | PAI 129 | -name xgboost 130 | -project algo_public 131 | -DinputTableName="tl_xgb_train_4" 132 | -DmodelName="jz_xgb_model_4" 133 | -Deta="0.01" 134 | -Dobjective="reg:linear" 135 | -DitemDelimiter="," 136 | -Dseed="1024" 137 | -Dnum_round="1800" 138 | -DlabelColName="log_tl" 139 | -DenableSparse="false" 140 | -Dmax_depth="5" 141 | -Dsubsample="0.7" 142 | -Dcolsample_bytree="0.7" 143 | -Dgamma="0" 144 | -Dlambda="50" 145 | -DfeatureColNames="snp357_cc,snp357_cg,snp357_gg,snp191_cc,snp191_cg,snp191_gg,snp277_aa,snp277_ag,snp277_gg,snp39_cc,snp39_ct,snp39_tt,snp149_aa,snp149_ag,snp149_gg,snp298_cc,snp298_cg,snp298_gg,snp198_gg,snp198_gt,snp198_tt,snp234_aa,snp234_ag,snp234_gg,snp115_cc,snp115_ct,snp115_tt,snp291_aa,snp291_ag,snp291_gg,snp208_ac,snp208_at,snp208_cc,snp208_ct,snp208_tt,snp44_aa,snp44_ac,snp44_cc,snp223_gg,snp223_gt,snp223_tt,snp47_cc,snp47_ct,snp47_tt,jz_10009,jz_10012,jz_10013,jz_1124,jz_1125,jz_1844,jz_2168,jz_2387,jz_269026,jz_279006,jz_300006,jz_300007,jz_300009,jz_30006,jz_300066,jz_300092,jz_300117,jz_300150,jz_300151,jz_300152,jz_319100,jz_319298,jz_3801,jz_3805,jz_459154,jz_459155,jz_459156,jz_459158,jz_459159,jz_459206,jz_459207,jz_459210,jz_459211,jz_709001,jz_709003,jz_809020,jz_809022,jz_809024,jz_809027,jz_809037,jz_809038,jz_809039,jz_809040,jz_809041,jz_809042,jz_809043,jz_809044,jz_809045,jz_809046,jz_809047,jz_809048,jz_809049,jz_809050,jz_809051,jz_809052,jz_809053,jz_809054,jz_809055,jz_809056,jz_809057,jz_809058,jz_809059,jz_809060,jz_809061,jz_989001,jz_989002,jz_004997,jz_0107,jz_2165,jz_2376,jz_2390,jz_300051,jz_300069,jz_300070,jz_300073,jz_300074,jz_300076,jz_300078,jz_300113,jz_300119,jz_300125,jz_321,jz_3804,jz_3807,jz_1308,jz_1319,jz_1320,jz_1321,jz_1322,jz_0424,jz_0425_x,jz_left_shen_no_voice,jz_right_shen_no_voice,jz_right_shen_strong_voice,jz_jzx_no_voice_area,jz_qian_lie_xian_2,jz_qian_lie_xian_3,jz_dpm_from_3301,jz_jzx_no_voice_jiejie,jz_jzx_low_voice_area,jz_jzx_low_voice_jiejie,jz_liver_no_voice,jz_liver_strong_voice,jz_dan_strong_voice,jz_qian_lie_xian_1,jz_3190_x,jz_3194,jz_3195_x,jz_xue_ya_pian_gao,jz_xue_zhi_pian_gao,jz_xue_tang_pian_gao,jz_high_sugar,jz_guan_xin_bin,jz_shen,jz_smoke,jz_niao,jz_heart_rate,jz_3399_w,jz_33
01_w,jz_0403_w,jz_0421_w,jz_0405_w,jz_gender,jz_blood_pipe_style,jz_health,jz_pres_front,jz_pres_back,jz_heart_times,jz_dannan_jieshi,jz_dannan_xirou,jz_shen_jieshi,jz_shen_nanz,jz_gan_nanz,jz_gan_ying_hua,jz_skin,jz_skin_dis,jz_liver,jz_liver_dis,jz_liver_fatty,jz_liver_hepa,jz_carotid_artery,jz_carotid_artery_dis,jz_carotid_artery_mild,jz_carotid_artery_hard,jz_carotid_artery_high_sugar,jz_carotid_artery_high_cho,jz_carotid_artery_heart,jz_thyroid,jz_thyroid_dis,jz_thyroid_up,jz_thyroid_down,jz_bile,jz_bile_dis,jz_bile_infla,jz_bile_cut,jz_kidney,jz_kidney_dis,jz_kidney_infla,jz_kidney_cut,jz_pancreatic,jz_pancreatic_dis,jz_lung,jz_lung_dis,jz_lung_infla,jz_lung_kerl,jz_lung_cut,jz_cervical,jz_cervical_dis,jz_cervical_infla,jz_cervical_cut,jz_cervical_end,jz_uterus,jz_uterus_dis,jz_uterus_infla,jz_uterus_cut,jz_uterus_end,jz_ovary,jz_ovary_dis,jz_ovary_infla,jz_ovary_cut,jz_vagina,jz_vagina_dis,jz_vagina_infla,jz_vagina_end,jz_bladder,jz_bladder_dis,jz_gland,jz_gland_dis,jz_gland_infla,jz_gland_cut,jz_anus,jz_anus_dis,jz_lympha,jz_lympha_dis,jz_lympha_infla,jz_mam,jz_mam_dis,jz_mam_infla,jz_mam_end,jz_mam_cut,jz_bone,jz_bone_less,jz_teeth,jz_teeth_dis,jz_teeth_smoke,jz_teeth_old,jz_tonsil,jz_tonsil_dis,jz_tonsil_infla,jz_tonsil_cut,jz_throat,jz_throat_dis,jz_nerve,jz_nerve_dis,jz_digestion_dis,jz_high_fat,jz_women,jz_men,jz_menopause,jz_old,jz_histroy_val,jz_31,jz_193,jz_2406,jz_269017,jz_269024,jz_0112,jz_269019,jz_317,jz_979012,jz_269013,jz_979019,jz_669009,jz_1840,jz_300129,jz_2372,jz_269025,jz_300035,jz_979021,jz_979022,jz_1873,jz_669021,jz_100005,jz_100006,jz_300093,jz_2386,jz_100010,jz_192,jz_269008,jz_100008,jz_979006,jz_1127,jz_269003,jz_269004,jz_191,jz_0105,jz_a703,jz_1814,jz_269005,jz_183,jz_809008,jz_979007,jz_300011,jz_669005,jz_269006,jz_269016,jz_143,jz_378,jz_a701,jz_1474,jz_2404,jz_979017,jz_979023,jz_3197,jz_316,jz_709038,jz_979013,jz_809017,jz_33,jz_100013,jz_269020,jz_979018,jz_979005,jz_3193,jz_979020,jz_139,jz_809009,jz_10003,jz_190,jz_979001,jz_100012,jz_0108,jz_0106,jz_141,jz_10002,jz_300008,jz_979004,jz_315,jz_1850,jz_269010,jz_979009,jz_38,jz_269018,jz_809004,jz_809025,jz_1107,jz_2174,jz_659025,jz_809013,jz_269021,jz_2405,jz_269012,jz_809023,jz_809010,jz_809026,jz_32,jz_2403,jz_10004,jz_269014,jz_3191,jz_100014,jz_0111,jz_269015,jz_320,jz_313,jz_300109,jz_2986,jz_2420,jz_669002,jz_269022,jz_39,jz_0109,jz_300012,jz_314,jz_300014,jz_1112,jz_669008,jz_669003,jz_37,jz_100007,jz_312,jz_979015,jz_669004,jz_269009,jz_1845,jz_3430,jz_669007,jz_709049,jz_979008,jz_979003,jz_809001,jz_300001,jz_1110,jz_1815,jz_1345,jz_979011,jz_300021,jz_0104,jz_979016,jz_809021,jz_1117,jz_34,jz_2371,jz_300013,jz_269007,jz_1115,jz_979014,jz_3192,jz_319,jz_269011,jz_269023,jz_20002,jz_979002,jz_1106,jz_0435,jz_1328,jz_0405,jz_3196,jz_0425_y,jz_0201,jz_1305,jz_3195_y,jz_1303,jz_3190_y,jz_0707,jz_1315,jz_0973,jz_3730,jz_0433,jz_1313,jz_0212,jz_0407,jz_0901,jz_0976,jz_3207,jz_3399,jz_0207,jz_0431,jz_0420,jz_1304,jz_0216,jz_0413,jz_0406,jz_0206,jz_0979,jz_0423,jz_2302,jz_0430,jz_3400,jz_0980,jz_0432,jz_0977,jz_sugar_high_related" 146 | -Dbase_score="0.2" 147 | -Dmin_child_weight="20" 148 | -DkvDelimiter=":"; 149 | 150 | -- predict-fold-4 151 | PAI 152 | -name prediction 153 | -project algo_public 154 | -DinputTableName="juz_test_6_7_xgb" 155 | -DappendColNames="vid,log_tl" 156 | -DmodelName="jz_xgb_model_4" 157 | -DitemDelimiter="," 158 | -DresultColName="result" 159 | -Dlifecycle="28" 160 | -DoutputTableName="jz_xgb_pred_val_4" 161 | -DkvDelimiter=":" 162 | -DenableSparse="false" 163 | 
-DfeatureColNames="snp357_cc,snp357_cg,snp357_gg,snp191_cc,snp191_cg,snp191_gg,snp277_aa,snp277_ag,snp277_gg,snp39_cc,snp39_ct,snp39_tt,snp149_aa,snp149_ag,snp149_gg,snp298_cc,snp298_cg,snp298_gg,snp198_gg,snp198_gt,snp198_tt,snp234_aa,snp234_ag,snp234_gg,snp115_cc,snp115_ct,snp115_tt,snp291_aa,snp291_ag,snp291_gg,snp208_ac,snp208_at,snp208_cc,snp208_ct,snp208_tt,snp44_aa,snp44_ac,snp44_cc,snp223_gg,snp223_gt,snp223_tt,snp47_cc,snp47_ct,snp47_tt,jz_10009,jz_10012,jz_10013,jz_1124,jz_1125,jz_1844,jz_2168,jz_2387,jz_269026,jz_279006,jz_300006,jz_300007,jz_300009,jz_30006,jz_300066,jz_300092,jz_300117,jz_300150,jz_300151,jz_300152,jz_319100,jz_319298,jz_3801,jz_3805,jz_459154,jz_459155,jz_459156,jz_459158,jz_459159,jz_459206,jz_459207,jz_459210,jz_459211,jz_709001,jz_709003,jz_809020,jz_809022,jz_809024,jz_809027,jz_809037,jz_809038,jz_809039,jz_809040,jz_809041,jz_809042,jz_809043,jz_809044,jz_809045,jz_809046,jz_809047,jz_809048,jz_809049,jz_809050,jz_809051,jz_809052,jz_809053,jz_809054,jz_809055,jz_809056,jz_809057,jz_809058,jz_809059,jz_809060,jz_809061,jz_989001,jz_989002,jz_004997,jz_0107,jz_2165,jz_2376,jz_2390,jz_300051,jz_300069,jz_300070,jz_300073,jz_300074,jz_300076,jz_300078,jz_300113,jz_300119,jz_300125,jz_321,jz_3804,jz_3807,jz_1308,jz_1319,jz_1320,jz_1321,jz_1322,jz_0424,jz_0425_x,jz_left_shen_no_voice,jz_right_shen_no_voice,jz_right_shen_strong_voice,jz_jzx_no_voice_area,jz_qian_lie_xian_2,jz_qian_lie_xian_3,jz_dpm_from_3301,jz_jzx_no_voice_jiejie,jz_jzx_low_voice_area,jz_jzx_low_voice_jiejie,jz_liver_no_voice,jz_liver_strong_voice,jz_dan_strong_voice,jz_qian_lie_xian_1,jz_3190_x,jz_3194,jz_3195_x,jz_xue_ya_pian_gao,jz_xue_zhi_pian_gao,jz_xue_tang_pian_gao,jz_high_sugar,jz_guan_xin_bin,jz_shen,jz_smoke,jz_niao,jz_heart_rate,jz_3399_w,jz_3301_w,jz_0403_w,jz_0421_w,jz_0405_w,jz_gender,jz_blood_pipe_style,jz_health,jz_pres_front,jz_pres_back,jz_heart_times,jz_dannan_jieshi,jz_dannan_xirou,jz_shen_jieshi,jz_shen_nanz,jz_gan_nanz,jz_gan_ying_hua,jz_skin,jz_skin_dis,jz_liver,jz_liver_dis,jz_liver_fatty,jz_liver_hepa,jz_carotid_artery,jz_carotid_artery_dis,jz_carotid_artery_mild,jz_carotid_artery_hard,jz_carotid_artery_high_sugar,jz_carotid_artery_high_cho,jz_carotid_artery_heart,jz_thyroid,jz_thyroid_dis,jz_thyroid_up,jz_thyroid_down,jz_bile,jz_bile_dis,jz_bile_infla,jz_bile_cut,jz_kidney,jz_kidney_dis,jz_kidney_infla,jz_kidney_cut,jz_pancreatic,jz_pancreatic_dis,jz_lung,jz_lung_dis,jz_lung_infla,jz_lung_kerl,jz_lung_cut,jz_cervical,jz_cervical_dis,jz_cervical_infla,jz_cervical_cut,jz_cervical_end,jz_uterus,jz_uterus_dis,jz_uterus_infla,jz_uterus_cut,jz_uterus_end,jz_ovary,jz_ovary_dis,jz_ovary_infla,jz_ovary_cut,jz_vagina,jz_vagina_dis,jz_vagina_infla,jz_vagina_end,jz_bladder,jz_bladder_dis,jz_gland,jz_gland_dis,jz_gland_infla,jz_gland_cut,jz_anus,jz_anus_dis,jz_lympha,jz_lympha_dis,jz_lympha_infla,jz_mam,jz_mam_dis,jz_mam_infla,jz_mam_end,jz_mam_cut,jz_bone,jz_bone_less,jz_teeth,jz_teeth_dis,jz_teeth_smoke,jz_teeth_old,jz_tonsil,jz_tonsil_dis,jz_tonsil_infla,jz_tonsil_cut,jz_throat,jz_throat_dis,jz_nerve,jz_nerve_dis,jz_digestion_dis,jz_high_fat,jz_women,jz_men,jz_menopause,jz_old,jz_histroy_val,jz_31,jz_193,jz_2406,jz_269017,jz_269024,jz_0112,jz_269019,jz_317,jz_979012,jz_269013,jz_979019,jz_669009,jz_1840,jz_300129,jz_2372,jz_269025,jz_300035,jz_979021,jz_979022,jz_1873,jz_669021,jz_100005,jz_100006,jz_300093,jz_2386,jz_100010,jz_192,jz_269008,jz_100008,jz_979006,jz_1127,jz_269003,jz_269004,jz_191,jz_0105,jz_a703,jz_1814,jz_269005,jz_183,jz_809008,jz_979007,jz_300011,jz_669005,j
z_269006,jz_269016,jz_143,jz_378,jz_a701,jz_1474,jz_2404,jz_979017,jz_979023,jz_3197,jz_316,jz_709038,jz_979013,jz_809017,jz_33,jz_100013,jz_269020,jz_979018,jz_979005,jz_3193,jz_979020,jz_139,jz_809009,jz_10003,jz_190,jz_979001,jz_100012,jz_0108,jz_0106,jz_141,jz_10002,jz_300008,jz_979004,jz_315,jz_1850,jz_269010,jz_979009,jz_38,jz_269018,jz_809004,jz_809025,jz_1107,jz_2174,jz_659025,jz_809013,jz_269021,jz_2405,jz_269012,jz_809023,jz_809010,jz_809026,jz_32,jz_2403,jz_10004,jz_269014,jz_3191,jz_100014,jz_0111,jz_269015,jz_320,jz_313,jz_300109,jz_2986,jz_2420,jz_669002,jz_269022,jz_39,jz_0109,jz_300012,jz_314,jz_300014,jz_1112,jz_669008,jz_669003,jz_37,jz_100007,jz_312,jz_979015,jz_669004,jz_269009,jz_1845,jz_3430,jz_669007,jz_709049,jz_979008,jz_979003,jz_809001,jz_300001,jz_1110,jz_1815,jz_1345,jz_979011,jz_300021,jz_0104,jz_979016,jz_809021,jz_1117,jz_34,jz_2371,jz_300013,jz_269007,jz_1115,jz_979014,jz_3192,jz_319,jz_269011,jz_269023,jz_20002,jz_979002,jz_1106,jz_0435,jz_1328,jz_0405,jz_3196,jz_0425_y,jz_0201,jz_1305,jz_3195_y,jz_1303,jz_3190_y,jz_0707,jz_1315,jz_0973,jz_3730,jz_0433,jz_1313,jz_0212,jz_0407,jz_0901,jz_0976,jz_3207,jz_3399,jz_0207,jz_0431,jz_0420,jz_1304,jz_0216,jz_0413,jz_0406,jz_0206,jz_0979,jz_0423,jz_2302,jz_0430,jz_3400,jz_0980,jz_0432,jz_0977,jz_sugar_high_related"; 164 | 165 | -- train-fold-5 166 | PAI 167 | -name xgboost 168 | -project algo_public 169 | -DinputTableName="tl_xgb_train_5" 170 | -DmodelName="jz_xgb_model_5" 171 | -Deta="0.01" 172 | -Dobjective="reg:linear" 173 | -DitemDelimiter="," 174 | -Dseed="1024" 175 | -Dnum_round="1800" 176 | -DlabelColName="log_tl" 177 | -DenableSparse="false" 178 | -Dmax_depth="5" 179 | -Dsubsample="0.7" 180 | -Dcolsample_bytree="0.7" 181 | -Dgamma="0" 182 | -Dlambda="50" 183 | -DfeatureColNames="snp357_cc,snp357_cg,snp357_gg,snp191_cc,snp191_cg,snp191_gg,snp277_aa,snp277_ag,snp277_gg,snp39_cc,snp39_ct,snp39_tt,snp149_aa,snp149_ag,snp149_gg,snp298_cc,snp298_cg,snp298_gg,snp198_gg,snp198_gt,snp198_tt,snp234_aa,snp234_ag,snp234_gg,snp115_cc,snp115_ct,snp115_tt,snp291_aa,snp291_ag,snp291_gg,snp208_ac,snp208_at,snp208_cc,snp208_ct,snp208_tt,snp44_aa,snp44_ac,snp44_cc,snp223_gg,snp223_gt,snp223_tt,snp47_cc,snp47_ct,snp47_tt,jz_10009,jz_10012,jz_10013,jz_1124,jz_1125,jz_1844,jz_2168,jz_2387,jz_269026,jz_279006,jz_300006,jz_300007,jz_300009,jz_30006,jz_300066,jz_300092,jz_300117,jz_300150,jz_300151,jz_300152,jz_319100,jz_319298,jz_3801,jz_3805,jz_459154,jz_459155,jz_459156,jz_459158,jz_459159,jz_459206,jz_459207,jz_459210,jz_459211,jz_709001,jz_709003,jz_809020,jz_809022,jz_809024,jz_809027,jz_809037,jz_809038,jz_809039,jz_809040,jz_809041,jz_809042,jz_809043,jz_809044,jz_809045,jz_809046,jz_809047,jz_809048,jz_809049,jz_809050,jz_809051,jz_809052,jz_809053,jz_809054,jz_809055,jz_809056,jz_809057,jz_809058,jz_809059,jz_809060,jz_809061,jz_989001,jz_989002,jz_004997,jz_0107,jz_2165,jz_2376,jz_2390,jz_300051,jz_300069,jz_300070,jz_300073,jz_300074,jz_300076,jz_300078,jz_300113,jz_300119,jz_300125,jz_321,jz_3804,jz_3807,jz_1308,jz_1319,jz_1320,jz_1321,jz_1322,jz_0424,jz_0425_x,jz_left_shen_no_voice,jz_right_shen_no_voice,jz_right_shen_strong_voice,jz_jzx_no_voice_area,jz_qian_lie_xian_2,jz_qian_lie_xian_3,jz_dpm_from_3301,jz_jzx_no_voice_jiejie,jz_jzx_low_voice_area,jz_jzx_low_voice_jiejie,jz_liver_no_voice,jz_liver_strong_voice,jz_dan_strong_voice,jz_qian_lie_xian_1,jz_3190_x,jz_3194,jz_3195_x,jz_xue_ya_pian_gao,jz_xue_zhi_pian_gao,jz_xue_tang_pian_gao,jz_high_sugar,jz_guan_xin_bin,jz_shen,jz_smoke,jz_niao,jz_heart_rate,jz_3399_w,jz_33
01_w,jz_0403_w,jz_0421_w,jz_0405_w,jz_gender,jz_blood_pipe_style,jz_health,jz_pres_front,jz_pres_back,jz_heart_times,jz_dannan_jieshi,jz_dannan_xirou,jz_shen_jieshi,jz_shen_nanz,jz_gan_nanz,jz_gan_ying_hua,jz_skin,jz_skin_dis,jz_liver,jz_liver_dis,jz_liver_fatty,jz_liver_hepa,jz_carotid_artery,jz_carotid_artery_dis,jz_carotid_artery_mild,jz_carotid_artery_hard,jz_carotid_artery_high_sugar,jz_carotid_artery_high_cho,jz_carotid_artery_heart,jz_thyroid,jz_thyroid_dis,jz_thyroid_up,jz_thyroid_down,jz_bile,jz_bile_dis,jz_bile_infla,jz_bile_cut,jz_kidney,jz_kidney_dis,jz_kidney_infla,jz_kidney_cut,jz_pancreatic,jz_pancreatic_dis,jz_lung,jz_lung_dis,jz_lung_infla,jz_lung_kerl,jz_lung_cut,jz_cervical,jz_cervical_dis,jz_cervical_infla,jz_cervical_cut,jz_cervical_end,jz_uterus,jz_uterus_dis,jz_uterus_infla,jz_uterus_cut,jz_uterus_end,jz_ovary,jz_ovary_dis,jz_ovary_infla,jz_ovary_cut,jz_vagina,jz_vagina_dis,jz_vagina_infla,jz_vagina_end,jz_bladder,jz_bladder_dis,jz_gland,jz_gland_dis,jz_gland_infla,jz_gland_cut,jz_anus,jz_anus_dis,jz_lympha,jz_lympha_dis,jz_lympha_infla,jz_mam,jz_mam_dis,jz_mam_infla,jz_mam_end,jz_mam_cut,jz_bone,jz_bone_less,jz_teeth,jz_teeth_dis,jz_teeth_smoke,jz_teeth_old,jz_tonsil,jz_tonsil_dis,jz_tonsil_infla,jz_tonsil_cut,jz_throat,jz_throat_dis,jz_nerve,jz_nerve_dis,jz_digestion_dis,jz_high_fat,jz_women,jz_men,jz_menopause,jz_old,jz_histroy_val,jz_31,jz_193,jz_2406,jz_269017,jz_269024,jz_0112,jz_269019,jz_317,jz_979012,jz_269013,jz_979019,jz_669009,jz_1840,jz_300129,jz_2372,jz_269025,jz_300035,jz_979021,jz_979022,jz_1873,jz_669021,jz_100005,jz_100006,jz_300093,jz_2386,jz_100010,jz_192,jz_269008,jz_100008,jz_979006,jz_1127,jz_269003,jz_269004,jz_191,jz_0105,jz_a703,jz_1814,jz_269005,jz_183,jz_809008,jz_979007,jz_300011,jz_669005,jz_269006,jz_269016,jz_143,jz_378,jz_a701,jz_1474,jz_2404,jz_979017,jz_979023,jz_3197,jz_316,jz_709038,jz_979013,jz_809017,jz_33,jz_100013,jz_269020,jz_979018,jz_979005,jz_3193,jz_979020,jz_139,jz_809009,jz_10003,jz_190,jz_979001,jz_100012,jz_0108,jz_0106,jz_141,jz_10002,jz_300008,jz_979004,jz_315,jz_1850,jz_269010,jz_979009,jz_38,jz_269018,jz_809004,jz_809025,jz_1107,jz_2174,jz_659025,jz_809013,jz_269021,jz_2405,jz_269012,jz_809023,jz_809010,jz_809026,jz_32,jz_2403,jz_10004,jz_269014,jz_3191,jz_100014,jz_0111,jz_269015,jz_320,jz_313,jz_300109,jz_2986,jz_2420,jz_669002,jz_269022,jz_39,jz_0109,jz_300012,jz_314,jz_300014,jz_1112,jz_669008,jz_669003,jz_37,jz_100007,jz_312,jz_979015,jz_669004,jz_269009,jz_1845,jz_3430,jz_669007,jz_709049,jz_979008,jz_979003,jz_809001,jz_300001,jz_1110,jz_1815,jz_1345,jz_979011,jz_300021,jz_0104,jz_979016,jz_809021,jz_1117,jz_34,jz_2371,jz_300013,jz_269007,jz_1115,jz_979014,jz_3192,jz_319,jz_269011,jz_269023,jz_20002,jz_979002,jz_1106,jz_0435,jz_1328,jz_0405,jz_3196,jz_0425_y,jz_0201,jz_1305,jz_3195_y,jz_1303,jz_3190_y,jz_0707,jz_1315,jz_0973,jz_3730,jz_0433,jz_1313,jz_0212,jz_0407,jz_0901,jz_0976,jz_3207,jz_3399,jz_0207,jz_0431,jz_0420,jz_1304,jz_0216,jz_0413,jz_0406,jz_0206,jz_0979,jz_0423,jz_2302,jz_0430,jz_3400,jz_0980,jz_0432,jz_0977,jz_sugar_high_related" 184 | -Dbase_score="0.2" 185 | -Dmin_child_weight="20" 186 | -DkvDelimiter=":"; 187 | 188 | -- predict-fold-5 189 | PAI 190 | -name prediction 191 | -project algo_public 192 | -DinputTableName="juz_test_6_7_xgb" 193 | -DappendColNames="vid,log_tl" 194 | -DmodelName="jz_xgb_model_5" 195 | -DitemDelimiter="," 196 | -DresultColName="result" 197 | -Dlifecycle="28" 198 | -DoutputTableName="jz_xgb_pred_val_5" 199 | -DkvDelimiter=":" 200 | -DenableSparse="false" 201 | 
-DfeatureColNames="snp357_cc,snp357_cg,snp357_gg,snp191_cc,snp191_cg,snp191_gg,snp277_aa,snp277_ag,snp277_gg,snp39_cc,snp39_ct,snp39_tt,snp149_aa,snp149_ag,snp149_gg,snp298_cc,snp298_cg,snp298_gg,snp198_gg,snp198_gt,snp198_tt,snp234_aa,snp234_ag,snp234_gg,snp115_cc,snp115_ct,snp115_tt,snp291_aa,snp291_ag,snp291_gg,snp208_ac,snp208_at,snp208_cc,snp208_ct,snp208_tt,snp44_aa,snp44_ac,snp44_cc,snp223_gg,snp223_gt,snp223_tt,snp47_cc,snp47_ct,snp47_tt,jz_10009,jz_10012,jz_10013,jz_1124,jz_1125,jz_1844,jz_2168,jz_2387,jz_269026,jz_279006,jz_300006,jz_300007,jz_300009,jz_30006,jz_300066,jz_300092,jz_300117,jz_300150,jz_300151,jz_300152,jz_319100,jz_319298,jz_3801,jz_3805,jz_459154,jz_459155,jz_459156,jz_459158,jz_459159,jz_459206,jz_459207,jz_459210,jz_459211,jz_709001,jz_709003,jz_809020,jz_809022,jz_809024,jz_809027,jz_809037,jz_809038,jz_809039,jz_809040,jz_809041,jz_809042,jz_809043,jz_809044,jz_809045,jz_809046,jz_809047,jz_809048,jz_809049,jz_809050,jz_809051,jz_809052,jz_809053,jz_809054,jz_809055,jz_809056,jz_809057,jz_809058,jz_809059,jz_809060,jz_809061,jz_989001,jz_989002,jz_004997,jz_0107,jz_2165,jz_2376,jz_2390,jz_300051,jz_300069,jz_300070,jz_300073,jz_300074,jz_300076,jz_300078,jz_300113,jz_300119,jz_300125,jz_321,jz_3804,jz_3807,jz_1308,jz_1319,jz_1320,jz_1321,jz_1322,jz_0424,jz_0425_x,jz_left_shen_no_voice,jz_right_shen_no_voice,jz_right_shen_strong_voice,jz_jzx_no_voice_area,jz_qian_lie_xian_2,jz_qian_lie_xian_3,jz_dpm_from_3301,jz_jzx_no_voice_jiejie,jz_jzx_low_voice_area,jz_jzx_low_voice_jiejie,jz_liver_no_voice,jz_liver_strong_voice,jz_dan_strong_voice,jz_qian_lie_xian_1,jz_3190_x,jz_3194,jz_3195_x,jz_xue_ya_pian_gao,jz_xue_zhi_pian_gao,jz_xue_tang_pian_gao,jz_high_sugar,jz_guan_xin_bin,jz_shen,jz_smoke,jz_niao,jz_heart_rate,jz_3399_w,jz_3301_w,jz_0403_w,jz_0421_w,jz_0405_w,jz_gender,jz_blood_pipe_style,jz_health,jz_pres_front,jz_pres_back,jz_heart_times,jz_dannan_jieshi,jz_dannan_xirou,jz_shen_jieshi,jz_shen_nanz,jz_gan_nanz,jz_gan_ying_hua,jz_skin,jz_skin_dis,jz_liver,jz_liver_dis,jz_liver_fatty,jz_liver_hepa,jz_carotid_artery,jz_carotid_artery_dis,jz_carotid_artery_mild,jz_carotid_artery_hard,jz_carotid_artery_high_sugar,jz_carotid_artery_high_cho,jz_carotid_artery_heart,jz_thyroid,jz_thyroid_dis,jz_thyroid_up,jz_thyroid_down,jz_bile,jz_bile_dis,jz_bile_infla,jz_bile_cut,jz_kidney,jz_kidney_dis,jz_kidney_infla,jz_kidney_cut,jz_pancreatic,jz_pancreatic_dis,jz_lung,jz_lung_dis,jz_lung_infla,jz_lung_kerl,jz_lung_cut,jz_cervical,jz_cervical_dis,jz_cervical_infla,jz_cervical_cut,jz_cervical_end,jz_uterus,jz_uterus_dis,jz_uterus_infla,jz_uterus_cut,jz_uterus_end,jz_ovary,jz_ovary_dis,jz_ovary_infla,jz_ovary_cut,jz_vagina,jz_vagina_dis,jz_vagina_infla,jz_vagina_end,jz_bladder,jz_bladder_dis,jz_gland,jz_gland_dis,jz_gland_infla,jz_gland_cut,jz_anus,jz_anus_dis,jz_lympha,jz_lympha_dis,jz_lympha_infla,jz_mam,jz_mam_dis,jz_mam_infla,jz_mam_end,jz_mam_cut,jz_bone,jz_bone_less,jz_teeth,jz_teeth_dis,jz_teeth_smoke,jz_teeth_old,jz_tonsil,jz_tonsil_dis,jz_tonsil_infla,jz_tonsil_cut,jz_throat,jz_throat_dis,jz_nerve,jz_nerve_dis,jz_digestion_dis,jz_high_fat,jz_women,jz_men,jz_menopause,jz_old,jz_histroy_val,jz_31,jz_193,jz_2406,jz_269017,jz_269024,jz_0112,jz_269019,jz_317,jz_979012,jz_269013,jz_979019,jz_669009,jz_1840,jz_300129,jz_2372,jz_269025,jz_300035,jz_979021,jz_979022,jz_1873,jz_669021,jz_100005,jz_100006,jz_300093,jz_2386,jz_100010,jz_192,jz_269008,jz_100008,jz_979006,jz_1127,jz_269003,jz_269004,jz_191,jz_0105,jz_a703,jz_1814,jz_269005,jz_183,jz_809008,jz_979007,jz_300011,jz_669005,j
z_269006,jz_269016,jz_143,jz_378,jz_a701,jz_1474,jz_2404,jz_979017,jz_979023,jz_3197,jz_316,jz_709038,jz_979013,jz_809017,jz_33,jz_100013,jz_269020,jz_979018,jz_979005,jz_3193,jz_979020,jz_139,jz_809009,jz_10003,jz_190,jz_979001,jz_100012,jz_0108,jz_0106,jz_141,jz_10002,jz_300008,jz_979004,jz_315,jz_1850,jz_269010,jz_979009,jz_38,jz_269018,jz_809004,jz_809025,jz_1107,jz_2174,jz_659025,jz_809013,jz_269021,jz_2405,jz_269012,jz_809023,jz_809010,jz_809026,jz_32,jz_2403,jz_10004,jz_269014,jz_3191,jz_100014,jz_0111,jz_269015,jz_320,jz_313,jz_300109,jz_2986,jz_2420,jz_669002,jz_269022,jz_39,jz_0109,jz_300012,jz_314,jz_300014,jz_1112,jz_669008,jz_669003,jz_37,jz_100007,jz_312,jz_979015,jz_669004,jz_269009,jz_1845,jz_3430,jz_669007,jz_709049,jz_979008,jz_979003,jz_809001,jz_300001,jz_1110,jz_1815,jz_1345,jz_979011,jz_300021,jz_0104,jz_979016,jz_809021,jz_1117,jz_34,jz_2371,jz_300013,jz_269007,jz_1115,jz_979014,jz_3192,jz_319,jz_269011,jz_269023,jz_20002,jz_979002,jz_1106,jz_0435,jz_1328,jz_0405,jz_3196,jz_0425_y,jz_0201,jz_1305,jz_3195_y,jz_1303,jz_3190_y,jz_0707,jz_1315,jz_0973,jz_3730,jz_0433,jz_1313,jz_0212,jz_0407,jz_0901,jz_0976,jz_3207,jz_3399,jz_0207,jz_0431,jz_0420,jz_1304,jz_0216,jz_0413,jz_0406,jz_0206,jz_0979,jz_0423,jz_2302,jz_0430,jz_3400,jz_0980,jz_0432,jz_0977,jz_sugar_high_related"; 202 | --------------------------------------------------------------------------------
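
Note: the five PAI prediction runs above each score the full test table juz_test_6_7_xgb with one fold model, appending vid and log_tl and writing the model output to the result column of jz_xgb_pred_val_1 through jz_xgb_pred_val_5. Because every fold model is trained on the log_tl label, the five fold outputs still need to be averaged and exponentiated to recover tl. The minimal Python sketch below illustrates that final step; it assumes the same PyODPS environment as split_5_fold_data_xgb.py (a pre-bound odps entry point), and it is only a hedged stand-in for calc_xgb_test_loss_and_save.py, whose actual contents may differ.

import numpy as np
from functools import reduce

# Assumes the PyODPS-provided `odps` object, as in split_5_fold_data_xgb.py.
# Pull the five per-fold prediction tables written by the PAI prediction
# commands above; each holds vid, the appended log_tl, and the model
# output in the "result" column.
preds = []
for t in range(1, 6):
    df = odps.get_table('jz_xgb_pred_val_{}'.format(t)).to_df().to_pandas()
    preds.append(df[['vid', 'log_tl', 'result']]
                 .rename(columns={'result': 'result_{}'.format(t)}))

# Join on vid so the row order of the five tables does not matter.
merged = reduce(
    lambda left, right: left.merge(right.drop('log_tl', axis=1), on='vid'),
    preds)

# Average the five fold predictions (simple 5-fold bagging), then invert
# the log transform applied in add_prefix_for_xgb_model.py to recover tl.
result_cols = ['result_{}'.format(t) for t in range(1, 6)]
merged['log_pred'] = merged[result_cols].mean(axis=1)
merged['tl_pred'] = np.exp(merged['log_pred'])

# Log-scale mean squared error against the appended log_tl column.
mse = ((merged['log_pred'] - merged['log_tl']) ** 2).mean()
print('tl log-scale MSE: {:.5f}'.format(mse))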