├── README.md
├── round1_rank2
│   ├── features
│   │   ├── word_label.txt
│   │   └── num_label.txt
│   ├── version.txt
│   ├── README.md
│   ├── team
│   │   ├── data_process_by_Mongo.py
│   │   └── team_feature_work.py
│   └── code
│       └── main.py
└── round2_rank10
    ├── xgb_model
    │   ├── split_5_fold_data_xgb.py
    │   ├── add_prefix_for_xgb_model.py
    │   ├── calc_xgb_test_loss_and_save.py
    │   └── xgb_in_odps.sql
    ├── submit_result.py
    ├── README.md
    ├── data_pre_process
    │   ├── origin_part1_part2_row2col.py
    │   ├── data_merge_split.py
    │   ├── get_num_features.py
    │   └── get_word_features.py
    ├── feature_selection
    │   ├── select_features_by_model.py
    │   ├── snp_drop_one_hot.py
    │   ├── classification_tl.py
    │   ├── predict_value_tl_gt_4.py
    │   ├── gbdt_log_model.py
    │   └── get_best_rounds.py
    └── every_predict_model
        ├── sys_gbdt_best_rounds.py
        ├── dia_gbdt_best_rounds.py
        ├── ldl_gbdt_best_rounds.py
        ├── hdl_gbdt_best_rounds.py
        └── tl_gbdt_best_rounds.py

/README.md:
--------------------------------------------------------------------------------
### Team: Unreal

### Preliminary round: Rank 2; Final round: Rank 10

### Teammates' GitHub
zhuifeng414: https://github.com/Zhuifeng414

wzm: https://github.com/w-zm
--------------------------------------------------------------------------------
/round1_rank2/features/word_label.txt:
--------------------------------------------------------------------------------
vid, 121, 2302, 709, 3486, 3485, 30007, 3194, 101, 102, 113, 114, 116, 117, 118, 1001, 409, 413, 434, 439, 985, A201, A202, 4001, A301, 705, 3192, 3196, 3190, 3197, 3195, 3430, 100010, 3191, 3193, 426, 420, 421, 423, 430, 431, 976, 3399, 405, 429, 3400, 403, 3301, 422
--------------------------------------------------------------------------------
/round1_rank2/version.txt:
--------------------------------------------------------------------------------
OS and main package versions:
1. main1.py:
OS: Win10
Python: 3.6.1
pandas: 0.20.3
lightgbm: 2.1.0
numpy: 1.13.3
sklearn: 0.19.1

2. main2.ipynb:
OS: Win10
numpy: 1.14.2
pandas: 0.22.0
xgboost: 0.7
scikit-learn: 0.19.1

3. main3.ipynb:
OS: Win10
numpy: 1.14.2
pandas: 0.22.0
xgboost: 0.7
scikit-learn: 0.19.1
--------------------------------------------------------------------------------
/round2_rank10/xgb_model/split_5_fold_data_xgb.py:
--------------------------------------------------------------------------------
from odps import ODPS
import pandas as pd
from odps.df import DataFrame
from sklearn.model_selection import KFold

# `odps` is the entry object injected by the ODPS/DataWorks PyODPS runtime
train_data = odps.get_table('juz_train_6_7_xgb').to_df().to_pandas()
kf = KFold(n_splits=5, shuffle=True, random_state=1024)
for t, (train_index, test_index) in enumerate(kf.split(train_data), start=1):
    print('Split {}...'.format(t))
    x_train, x_test = train_data.iloc[train_index], train_data.iloc[test_index]
    print(x_train.shape, x_test.shape)
    train_odps = DataFrame(x_train)
    test_odps = DataFrame(x_test)
    train_odps.persist('tl_xgb_train_{}'.format(t))
    test_odps.persist('tl_xgb_test_{}'.format(t))
--------------------------------------------------------------------------------
/round1_rank2/features/num_label.txt:
--------------------------------------------------------------------------------
vid, 183, 190, 191, 192, 193, 314, 1115, 1117, 2403, 2404, 2405, 1814, 1815, 1840, 1845, 1850, 10002, 10003, 10004, 100005, 100006, 100007, 2174, 31, 32, 34, 37, 38, 39, 317, 315, 312, 1321, 320, 319, 2372, 33, 316, 313, 2406, 269024, 269005, 269021, 269012, 269019, 269009, 269013, 155, 269023, 269004, 269008, 269003, 269010, 1345, 269022, 300012, 2333, 1127, 269006, 300021, 809009, 809008, 979002, 979019, 1106, 979012, 979005, 269015, 269007, 300019, 669002, 979018, 2376, 269016, 269020, 269017, 269018, 269025, 2420, 269014, 669006, 669009, 979021, 979006, 979014, 979008, 979003, 979009, 979011, 300018, 300017, 2409, 300092, 669021, 979004, 809021, 979007, 979013, 979022, 979016, 669005, 979001, 669004, 1474, 300008, 809023, 139, 300009, 100012, 809010, 809025, 2386, 979015, 809001, 979017, 300011, 1112, 100014, 300013, 669001, 143, 1107, 809004, 10009, 300001, 809026, 979020, 300014, 809017, 100013, 979023, 424, 2177
--------------------------------------------------------------------------------
/round1_rank2/README.md:
--------------------------------------------------------------------------------
### Team: Unreal

### Rank: 2

### Folder layout:
- data: the data folder
- features: hand-curated numeric and text feature lists, num_label.txt and word_label.txt; both files are required during data cleaning
- code: the main entry point; my data is fused with zhuifeng414's, and main.py can be run directly
- team: my teammates' feature-engineering code plus my MongoDB processing code
- submit: the submission output folder



### PS:
My original preprocessing did not use the open-source code shared by 豆腐大佬. Instead, I loaded the raw data into MongoDB and then exported it to CSV. In the raw data, a few records share a vid whose table_id maps to multiple results; I concatenate those results, and the exact logic is in data_process_by_Mongo.py in the team folder of the archive. Since the organizers had a heavy workload and little time was left for tidying the code, my teammates and I ended up using the open-source preprocessing code after all; the two pipelines may produce slightly different results, which might affect the submitted score, hence this note.

**The team currently drops features whose missing rate is 96%.** The best A/B-board scores for diastolic pressure (dia), systolic pressure (sys), and serum HDL are based on the 96% cutoff; serum LDL and serum triglycerides (tl) use 98%. The relevant code is at line 886 of team_feature_work.py.

This code fuses my features with zhuifeng414's; it reaches 0.02817 on the A board and 0.02792 on the B board. The best B-board score, 0.02764, came from blending with xgb; that xgb code was provided by wzm.

### Teammates' GitHub
zhuifeng414: https://github.com/Zhuifeng414

wzm: https://github.com/w-zm
--------------------------------------------------------------------------------
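As a reference for the concatenation logic described in the PS above, here is a minimal pandas sketch with invented toy data (the real implementations are data_process_by_Mongo.py below and, for round 2, origin_part1_part2_row2col.py):

```python
import pandas as pd

# toy records: one vid has two results for the same table_id (invented data)
df = pd.DataFrame({'vid': ['v1', 'v1', 'v2'],
                   'table_id': ['0101', '0101', '0102'],
                   'field_results': ['120', '118', '95']})

# duplicated (vid, table_id) results are joined with '$'; singletons pass through
merged = (df.groupby(['vid', 'table_id'])['field_results']
            .apply(lambda s: '$'.join(s.astype(str)))
            .reset_index())
print(merged)  # v1/0101 -> '120$118', v2/0102 -> '95'
```

The 96%/98% missing-rate cutoffs mentioned above would amount to keeping columns where `df.isnull().mean()` stays below the cutoff; the exact implementation lives in team_feature_work.py, which is not included in this dump.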
/round2_rank10/xgb_model/add_prefix_for_xgb_model.py:
--------------------------------------------------------------------------------
from odps import ODPS
import pandas as pd
from odps.df import DataFrame
from sklearn.model_selection import KFold
import time
import numpy as np

import sys
reload(sys)
sys.setdefaultencoding('utf8')

label = 'tl'
train = odps.get_table('{}_juz_train_6_6_snp_onehot_22'.format(label)).to_df().to_pandas()
test = odps.get_table('{}_juz_test_6_6_snp_onehot_22'.format(label)).to_df().to_pandas()
print(train.shape, test.shape)

train['log_{}'.format(label)] = np.log(train[label])
test['log_{}'.format(label)] = np.log(test[label])
predict_features = ['sys', 'dia', 'tl', 'hdl', 'ldl']

# copy every base feature (non-vid, non-snp, non-log, non-target column) to a
# 'jz_'-prefixed column and collect the original for dropping
for i in train.columns:
    if i != 'vid' and not 'snp' in i and not 'log' in i and i not in predict_features:
        train['jz_{}'.format(i)] = train[i]
        test['jz_{}'.format(i)] = test[i]
        predict_features.append(i)

# drop the targets plus all un-prefixed originals collected above
train.drop(predict_features, axis=1, inplace=True)
test.drop(predict_features, axis=1, inplace=True)
print(train.shape, test.shape)

juz_train = DataFrame(train)
juz_test = DataFrame(test)
juz_train.persist('juz_train_6_7_xgb')
juz_test.persist('juz_test_6_7_xgb')
--------------------------------------------------------------------------------
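A hypothetical mini-example of the renaming scheme above (the column names are invented for illustration): only plain base features pick up the jz_ prefix, while vid, snp columns, the log target, and the five labels keep their names, presumably so this feature set can be joined with teammates' tables without column collisions:

```python
import pandas as pd

# invented columns, mirroring the filter used in add_prefix_for_xgb_model.py
df = pd.DataFrame({'vid': ['a'], 'snp1': [2], 'log_tl': [0.1], 'tl': [1.1], 'feat01': [3.5]})
targets = ['sys', 'dia', 'tl', 'hdl', 'ldl']
keep = lambda c: c == 'vid' or 'snp' in c or 'log' in c or c in targets
df = df.rename(columns={c: 'jz_' + c for c in df.columns if not keep(c)})
print(df.columns.tolist())  # ['vid', 'snp1', 'log_tl', 'tl', 'jz_feat01']
```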
/round2_rank10/submit_result.py:
--------------------------------------------------------------------------------
import pandas as pd
import numpy as np
from odps.df import DataFrame  # needed for the persist() call at the bottom

sys = odps.get_table('sys_jz_5_fold_6_6_submit_22').to_df().to_pandas().loc[:, ['vid', 'sys']]
dia = odps.get_table('dia_jz_5_fold_6_6_submit_22').to_df().to_pandas().loc[:, ['vid', 'dia']]
tl = odps.get_table('tl_jz_5_fold_6_6_submit_22').to_df().to_pandas().loc[:, ['vid', 'tl']]
hdl = odps.get_table('hdl_jz_5_fold_6_6_submit_22').to_df().to_pandas().loc[:, ['vid', 'hdl']]
ldl = odps.get_table('ldl_jz_5_fold_6_6_submit_22').to_df().to_pandas().loc[:, ['vid', 'ldl']]

print(tl.sort_values(by=['tl'], ascending=False).head(15))

# blend the gbdt and xgb tl predictions (weights 0.7 / 0.35, as stated in the README)
tl_xgb = odps.get_table('tl_xgb_result').to_df().to_pandas().loc[:, ['vid', 'tl']]
tl['tl'] = tl['tl']*0.7 + tl_xgb['tl']*0.35

sys_dia = pd.merge(sys, dia, on=['vid'], how='inner')
sys_dia_tl = pd.merge(sys_dia, tl, on=['vid'], how='inner')
sys_dia_tl_hdl = pd.merge(sys_dia_tl, hdl, on=['vid'], how='inner')
submit = pd.merge(sys_dia_tl_hdl, ldl, on=['vid'], how='inner')

# manual override of tl for a single vid
submit.loc[submit['vid'] == '7b437e2632c91be2a0789adabce4b953', 'tl'] = 6
print(submit.describe())
print(submit.head(5))
print(submit.sort_values(by=['tl'], ascending=False).head(15))

juz_submit = DataFrame(submit)
juz_submit.persist('meinian_round2_submit_b')
--------------------------------------------------------------------------------
/round2_rank10/README.md:
--------------------------------------------------------------------------------
### Team: Unreal

### Rank: 10


### Code walkthrough


#### data_pre_process

1. origin_part1_part2_row2col: converts the raw data, including row-to-column pivoting and de-duplication;

2. get_num_features: builds the numeric-feature table;

3. get_word_features: builds the text-feature table;

4. data_merge_split: merges the numeric, text, and snp data.


#### feature_selection

1. For each of sys, dia, tl, hdl, and ldl, run snp_drop_one_hot to produce five label-specific datasets; this step drops the snp features that a pre-trained GBDT marks as unimportant and one-hot encodes the remainder;

2. For each of sys, dia, tl, hdl, and ldl, run get_best_rounds to obtain the optimal five-fold iteration counts for the datasets from step 1.

#### every_predict_model

Run all files to produce the test-set predictions for sys, dia, tl, hdl, and ldl.

#### xgb_model

1. add_prefix_for_xgb_model: produces the feature dataset with prefixed column names;

2. split_5_fold_data_xgb: splits the data for five-fold training;

3. baseline_xgboost_jz: trains the xgb model;

4. calc_xgb_test_loss_and_save: averages the five tl predictions from step 3.


#### submit_result
Produces the final submission: sys, dia, hdl, and ldl come from single GBDT models, while tl is a weighted blend of GBDT and xgb with weights 0.7 and 0.35.

The best single-model GBDT score is 0.0318 on the A board and 0.0321 on the B board; with the weighted tl blend, the best score is 0.0319.

### Teammates' GitHub
zhuifeng414: https://github.com/Zhuifeng414

wzm: https://github.com/w-zm
--------------------------------------------------------------------------------
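The scores quoted above are under the squared-log-error metric that the scripts below implement as `eval_metric` (presumably also the leaderboard metric). In formula form, for predictions $\hat{y}_i$ and labels $y_i$:

$$\text{loss} = \frac{1}{n}\sum_{i=1}^{n}\bigl(\log(\hat{y}_i + 1) - \log(y_i + 1)\bigr)^2$$

The models actually fit $\log y$ and exponentiate on the way out, as the gbdt_log_model and *_best_rounds scripts show.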
/round2_rank10/data_pre_process/origin_part1_part2_row2col.py:
--------------------------------------------------------------------------------
import time
import pandas as pd
from odps import ODPS
from odps.df import DataFrame

import sys
reload(sys)
sys.setdefaultencoding('utf8')

# load the raw data
part_1 = odps.get_table('meinian_round2_data_part1').to_df().to_pandas()
part_2 = odps.get_table('meinian_round2_data_part2').to_df().to_pandas()
part_1_2 = pd.concat([part_1, part_2])
part_1_2 = pd.DataFrame(part_1_2).sort_values('vid').reset_index(drop=True)
begin_time = time.time()

# concatenate the results of duplicated (vid, test_id) rows
def merge_table(df):
    df['results'] = df['results'].astype(str)
    if df.shape[0] > 1:
        merge_df = "$".join(list(df['results']))
    else:
        merge_df = df['results'].values[0]
    return merge_df

# basic processing
print(part_1_2.shape)
is_happen = part_1_2.groupby(['vid', 'test_id']).size().reset_index()
# build a composite index used for de-duplication
is_happen['new_index'] = is_happen['vid'] + '_' + is_happen['test_id']
is_happen_new = is_happen[is_happen[0] > 1]['new_index']

part_1_2['new_index'] = part_1_2['vid'] + '_' + part_1_2['test_id']

unique_part = part_1_2[part_1_2['new_index'].isin(list(is_happen_new))]
unique_part = unique_part.sort_values(['vid', 'test_id'])
no_unique_part = part_1_2[~part_1_2['new_index'].isin(list(is_happen_new))]
print('begin')
part_1_2_not_unique = unique_part.groupby(['vid', 'test_id']).apply(merge_table).reset_index()
part_1_2_not_unique.rename(columns={0: 'results'}, inplace=True)
tmp = pd.concat([part_1_2_not_unique, no_unique_part[['vid', 'test_id', 'results']]])
# pivot rows to columns
print('finish')
tmp = tmp.pivot(index='vid', values='results', columns='test_id')
print(tmp.shape)
combine_data = DataFrame(tmp, unknown_as_string=True)
combine_data.persist('origin_data_combine_part1_part2')
print('total time', time.time() - begin_time)
--------------------------------------------------------------------------------
/round2_rank10/xgb_model/calc_xgb_test_loss_and_save.py:
--------------------------------------------------------------------------------
from odps import ODPS
import pandas as pd
from odps.df import DataFrame
import numpy as np


def eval_metric(pred, labels):
    return np.mean(np.power(np.log(pred + 1) - np.log(labels + 1), 2))


'''
('fold 1: ', 0.088066106616791956)
('fold 2: ', 0.087444759182314105)
('fold 3: ', 0.097941499769017726)
('fold 4: ', 0.078793753494365307)
('fold 5: ', 0.086734232908105002)
('total loss: ', 0.087796043720344913)
'''

# each val_* table below holds predictions for the B-board test set; their mean is saved as the final tl result
# to compute the xgb loss on the validation folds instead, modify the xgb model code and use the commented block at the bottom
val_1 = odps.get_table('jz_xgb_pred_val_1').to_df().to_pandas().loc[:, ['vid', 'log_tl', 'result']]
val_2 = odps.get_table('jz_xgb_pred_val_2').to_df().to_pandas().loc[:, ['vid', 'log_tl', 'result']]
val_3 = odps.get_table('jz_xgb_pred_val_3').to_df().to_pandas().loc[:, ['vid', 'log_tl', 'result']]
val_4 = odps.get_table('jz_xgb_pred_val_4').to_df().to_pandas().loc[:, ['vid', 'log_tl', 'result']]
val_5 = odps.get_table('jz_xgb_pred_val_5').to_df().to_pandas().loc[:, ['vid', 'log_tl', 'result']]

xgb_result = val_1.loc[:, ['vid']]

xgb_result['tl'] = np.exp((val_1['result'] + val_2['result'] + val_3['result'] + val_4['result'] + val_5['result'])/5)
test_odps = DataFrame(xgb_result)
test_odps.persist('tl_xgb_result')

'''
val = pd.concat([val_1, val_2, val_3, val_4, val_5])
print('fold 1: ', eval_metric(np.exp(val_1['result']), np.exp(val_1['log_tl'])))
print('fold 2: ', eval_metric(np.exp(val_2['result']), np.exp(val_2['log_tl'])))
print('fold 3: ', eval_metric(np.exp(val_3['result']), np.exp(val_3['log_tl'])))
print('fold 4: ', eval_metric(np.exp(val_4['result']), np.exp(val_4['log_tl'])))
print('fold 5: ', eval_metric(np.exp(val_5['result']), np.exp(val_5['log_tl'])))
print('total loss: ', eval_metric(np.exp(val['result']), np.exp(val['log_tl'])))
'''
--------------------------------------------------------------------------------
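One detail worth noting: averaging the five folds' log-space results and then exponentiating, as done above, takes the geometric rather than arithmetic mean of the per-fold tl predictions. A quick check with toy numbers:

```python
import numpy as np

preds = np.array([0.8, 1.0, 1.3])          # toy per-fold predictions in the original scale
log_mean = np.exp(np.mean(np.log(preds)))  # what the script computes
geo_mean = np.prod(preds) ** (1.0 / 3)     # geometric mean
print(np.isclose(log_mean, geo_mean))      # True
```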
/round2_rank10/data_pre_process/data_merge_split.py:
--------------------------------------------------------------------------------
import time
import re
import pandas as pd
from odps import ODPS
from odps.df import DataFrame
import numpy as np
from collections import Iterable
from sklearn import preprocessing
from itertools import combinations

import sys
reload(sys)
sys.setdefaultencoding('utf8')

# my full feature set
train = odps.get_table('meinian_round2_train').to_df().to_pandas()
test = odps.get_table('meinian_round2_submit_b').to_df().to_pandas()
num_data = odps.get_table('juz_num_data_5_31').to_df().to_pandas()
word_data = odps.get_table('juz_word_data_5_30').to_df().to_pandas()

# add wl's text features
wl_word = odps.get_table('pre_txt_features_b').to_df().to_pandas()

gene_data = odps.get_table('meinian_round2_snp').to_df().to_pandas()

word_data = pd.merge(word_data, wl_word, on='vid', how='inner')
# fix feature 314: values <= 1 appear to be mis-scaled by a factor of 100
num_data.loc[num_data['314'] <= 1, '314'] = num_data.loc[num_data['314'] <= 1, '314'] * 100

# label-encode every snp column independently
lbl = preprocessing.LabelEncoder()

for c in gene_data.columns:
    if c not in ['vid']:
        gene_data[c] = lbl.fit_transform(gene_data[c])

print('final word data shape: ', word_data.shape)
print('final num data shape: ', num_data.shape)
print('final gene data shape: ', gene_data.shape)

merge_tmp = pd.merge(num_data, word_data, on='vid', how='inner')
merge_tmp = pd.merge(merge_tmp, gene_data, on='vid', how='left')

print('final data shape: ', merge_tmp.shape)

train_merge = pd.merge(train, merge_tmp, on='vid', how='left')
test_merge = pd.merge(test, merge_tmp, on='vid', how='left')

# fix some hdl and dia values for two specific vids
train_merge.loc[train_merge['vid'] == '605ebf5c6173cd3aab071060c9618b79', 'hdl'] = 1.28
train_merge.loc[train_merge['vid'] == 'c6aec5461b1c5cca1c4ead3d4c2b83d9', 'dia'] = 90
train_merge.fillna(-999, inplace=True)
test_merge.fillna(-999, inplace=True)
print('final train shape:{}, test shape:{} '.format(train_merge.shape, test_merge.shape))
juz_train = DataFrame(train_merge)
juz_test = DataFrame(test_merge)
juz_train.persist('juz_train_6_6_final')
juz_test.persist('juz_test_6_6_final')
--------------------------------------------------------------------------------
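For intuition, the per-column LabelEncoder pass above maps each snp column's raw values to small integers, with an encoding fitted independently per column. Genotype-like strings are assumed here purely for illustration; the actual snp values are not shown in this dump:

```python
from sklearn import preprocessing

lbl = preprocessing.LabelEncoder()
# each distinct value gets an integer code in sorted order
print(lbl.fit_transform(['AA', 'AG', 'GG', 'AG']))  # [0 1 2 1]
```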
/round2_rank10/feature_selection/select_features_by_model.py:
--------------------------------------------------------------------------------
import numpy as np
import pandas as pd
from odps.df import DataFrame
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import RidgeCV

train = odps.get_table('jz_combine_tl_train_6_2').to_df().to_pandas()
test = odps.get_table('jz_combine_tl_test_6_2').to_df().to_pandas()

predict_features = ['sys', 'dia', 'tl', 'hdl', 'ldl']
use_features = [t for t in train.columns if t != 'vid' and t not in predict_features]
x_train = train.loc[:, use_features]
label = train['tl']

gbdt = GradientBoostingRegressor(random_state=1)
rf = RandomForestRegressor(random_state=1)
l2 = RidgeCV()

# keep features whose gbdt importance exceeds the threshold
sfm_gbdt = SelectFromModel(gbdt, threshold=0.001)
sfm_gbdt.fit_transform(x_train, label)
gbdt_features = set(x_train.columns[sfm_gbdt.get_support()])
print('*************************************')
print(gbdt_features)


sfm_rf = SelectFromModel(rf, threshold=0.001)
sfm_rf.fit_transform(x_train, label)
rf_features = set(x_train.columns[sfm_rf.get_support()])
print('*************************************')
print(rf_features)

print(gbdt_features & rf_features)
# for the ridge model the threshold applies to absolute coefficients
sfm_l2 = SelectFromModel(l2, threshold=0.5)
sfm_l2.fit_transform(x_train, label)
l2_features = set(x_train.columns[sfm_l2.get_support()])
print('*************************************')
print(l2_features)

# take the union of the three selectors' feature sets
final_features = list(gbdt_features | rf_features | l2_features)
# alternative: intersect the two tree models, then add the ridge picks
# final_features = list((gbdt_features & rf_features) | l2_features)
print('gbdt model has {} features'.format(len(gbdt_features)))
print('rf model has {} features'.format(len(rf_features)))
print('l2 model has {} features'.format(len(l2_features)))
print('final has {} features'.format(len(final_features)))
print('*************************************')
print(final_features)
print('*************************************')

final_features.extend(['vid', 'tl'])
train_final = DataFrame(train.loc[:, final_features])
train_final.persist('combine_tl_train_6_2')
test_final = DataFrame(test.loc[:, final_features])
test_final.persist('combine_tl_test_6_2')
--------------------------------------------------------------------------------
/round2_rank10/feature_selection/snp_drop_one_hot.py:
--------------------------------------------------------------------------------
import time
import re
import pandas as pd
from odps import ODPS
from odps.df import DataFrame
import numpy as np
from collections import Iterable
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import preprocessing
from itertools import combinations

import sys
reload(sys)
sys.setdefaultencoding('utf8')

# select the top `threshold` snp features (ranked by GBDT importance) for one-hot encoding
def get_one_hot_list(data_frame, pred_feac, threshold=10):
    print('Now we extract snp features for {} ...'.format(pred_feac))
    predict_features = ['sys', 'dia', 'tl', 'hdl', 'ldl', 'vid']
    use_features = [i for i in data_frame.columns if i not in predict_features]
    x_train = data_frame.loc[:, use_features]
    label = data_frame[pred_feac]

    gbdt = GradientBoostingRegressor(random_state=1, n_estimators=100)
    gbdt.fit(x_train, label)
    feature_imp = gbdt.feature_importances_
    df = pd.DataFrame()
    df['feature'] = x_train.columns
    df['imp'] = feature_imp
    df.sort_values(by='imp', ascending=False, inplace=True)
    snp_list = [s for s in df['feature'] if s.startswith('snp')][:threshold]
    return snp_list


# label | one-hot threshold | input table                         | output table
# sys   | 14                | juz_train_6_6_add_wzm_for145_final2 | sys_juz_train_6_6_snp_onehot_22
# dia   | 10                | juz_train_6_6_add_wzm_for145_final2 | dia_juz_train_6_6_snp_onehot_22
# tl    | 14                | juz_train_6_6_add_wzm_onlytl_final  | tl_juz_train_6_6_snp_onehot_22
# hdl   | 10                | juz_train_6_6_add_wzm_for145_final  | ldl_juz_train_6_6_snp_onehot_22
# ldl   | 1                 | juz_train_6_6_add_wzm_for145_final  | hdl_juz_train_6_6_snp_onehot_22


if __name__ == "__main__":
    use_label = 'hdl'
    train = odps.get_table('juz_train_6_6_add_wzm_for145_final').to_df().to_pandas()
    test = odps.get_table('juz_test_6_6_add_wzm_for145_final').to_df().to_pandas()
    print(train.shape, test.shape)
    gene_list = get_one_hot_list(train, use_label, 14)
    train.replace(-999, np.nan, inplace=True)
    test.replace(-999, np.nan, inplace=True)

    # drop all label-encoded snp columns; the selected ones come back one-hot encoded below
    drop_snp = [s for s in train.columns if 'snp' in s]
    train.drop(drop_snp, axis=1, inplace=True)
    test.drop(drop_snp, axis=1, inplace=True)

    gene_data = odps.get_table('meinian_round2_snp').to_df().to_pandas()
    snp_data = pd.get_dummies(gene_data.loc[:, gene_list])
    snp_data['vid'] = gene_data['vid'].values
    for s in snp_data.columns:
        if s != 'vid':
            snp_data[s] = snp_data[s].astype(int)

    train_merge = pd.merge(train, snp_data, on='vid', how='left')
    test_merge = pd.merge(test, snp_data, on='vid', how='left')

    train_merge.fillna(-999, inplace=True)
    test_merge.fillna(-999, inplace=True)
    print('final train shape:{}, test shape:{} '.format(train_merge.shape, test_merge.shape))

    juz_train = DataFrame(train_merge)
    juz_test = DataFrame(test_merge)
    juz_train.persist('{}_juz_train_6_6_snp_onehot_22'.format(use_label))
    juz_test.persist('{}_juz_test_6_6_snp_onehot_22'.format(use_label))
--------------------------------------------------------------------------------
/round1_rank2/team/data_process_by_Mongo.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @Time    : 2018/4/10 9:10 AM
# @Author  : Juzphy

import pandas as pd
import time
from pymongo import MongoClient
from collections import defaultdict


'''
save data to MongoDB and export the mongo data to csv file.
'''


def feature_data():
    df = pd.read_csv('../data/meinian_round1_data_part1_20180408.txt', delimiter='$')
    df2 = pd.read_csv('./data/meinian_round1_data_part2_20180408.txt', delimiter='$')
    df = df.append(df2, ignore_index=True)
    df = df.fillna('')
    return df


def data_load(path):
    data = pd.read_csv(path, encoding='gbk')
    data = data.fillna('')
    return data


def match(feature, data_frame, hostname, db_name, set_name, port=27017):
    count = 0
    time_start = time.time()
    df_group = (d for d in feature.groupby(by='vid'))
    vid_value = set(data_frame['vid'].values)
    mongo_conn = MongoClient(hostname, port)
    db_set = mongo_conn[db_name][set_name]
    for j in df_group:
        if j[0] in vid_value:
            count += 1
            temp_dict = data_frame[data_frame['vid'] == j[0]].to_dict()
            temp_dict = {'_id' if k == 'vid' else k: n for k, v in temp_dict.items() for n in v.values()}
            # a vid with repeated table_ids gets its duplicated results '$'-joined
            if len(j[1]['table_id']) > len(j[1]['table_id'].unique()):
                j[1].index = range(j[1].shape[0])
                table_dict = defaultdict(int)
                for t in j[1]['table_id']:
                    table_dict[t] += 1
                beyond_one = [k for k, v in table_dict.items() if v > 1]
                other = [k for k, v in table_dict.items() if v == 1]
                other_index = j[1][j[1]['table_id'].isin(other)].index
                temp = dict(zip(j[1]['table_id'].iloc[other_index], j[1]['field_results'].iloc[other_index]))
                beyond_dict = {k: '$'.join(j[1]['field_results'].iloc[j[1][j[1]['table_id'] == k].index].fillna('')) for
                               k in beyond_one}
                temp.update(beyond_dict)
            else:
                temp = dict(zip(j[1]['table_id'], j[1]['field_results']))
            temp_dict.update(temp)
            db_set.save(temp_dict)
            print('vid of {} has been written.'.format(j[0]))
    print("total written {0} records, spent {1} s.".format(count, round(time.time() - time_start, 2)))


def feature_count(hostname, db_name, set_name, port=27017, count_threshold=0.4, name='train_set'):
    mongo_conn = MongoClient(hostname, port)
    mongo_set = mongo_conn[db_name][set_name]
    cursor = mongo_set.find()
    feature_dict = defaultdict(list)
    size = 0
    for c in cursor:
        size += 1
        for k in c.keys():
            if k != '_id':
                feature_dict[k].append(c['_id'])

    feature_lt_threshold = [code for code, b_list in feature_dict.items() if len(b_list)/size < count_threshold]
    feature_gt_threshold = set(feature_dict.keys()) - set(feature_lt_threshold)
    barcode_gt_threshold = list({f for fd in feature_gt_threshold for f in feature_dict[fd]})
    temp = {t: 0 for t in feature_lt_threshold}
    data = pd.DataFrame(list(mongo_set.find({"_id": {"$in": barcode_gt_threshold}}, temp)))
    data.to_csv('../data/{}.csv'.format(name), index=None, encoding='gbk')
    print('{}.csv has been successfully saved.'.format(name))


def mongo2csv(hostname, db_name, set_name, port=27017, name='train_set'):
    mongo_conn = MongoClient(hostname, port)
    mongo_set = mongo_conn[db_name][set_name]
    data = pd.DataFrame(list(mongo_set.find()))
    data.to_csv('./data/{}.csv'.format(name), index=None, encoding='gbk')
    print('{}.csv has been successfully saved.'.format(name))


if __name__ == '__main__':
    all_features = feature_data()
    # train = data_load('./data/origin_train.csv')
    # a_test = data_load('./data/origin_test_a.csv')
    b_test = data_load('./data/origin_test_b.csv')
    # print(train.shape, a_test.shape)
    match(all_features, b_test, 'localhost', 'meinian', 'test_b')
    # match(feature_data, a_test, 'localhost', 'meinian', 'test_data')
    # feature_count('10.10.0.7', 'meinian', 'train_data', name='new_meinian_train')
    # feature_count('10.10.0.7', 'meinian', 'test_data', name='new_meinian_test')
    mongo2csv('localhost', 'meinian', 'test_b', name='meinian_test_b')
--------------------------------------------------------------------------------
/round2_rank10/feature_selection/classification_tl.py:
--------------------------------------------------------------------------------
from odps import ODPS
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
from odps.df import DataFrame
from sklearn.model_selection import KFold
import time
import numpy as np


def gbdt_model(df, label, use_feature, true_test, submission_data, gbdt_model):
    print(submission_data.head())
    print("GBDT: training label {}...".format(label))
    value4preds = df['pos_4']
    train_data = df.loc[:, use_feature]
    print(train_data.shape, true_test.shape)
    pred_labels = np.zeros(df.shape[0])
    submission_label = np.zeros((true_test.shape[0], 5))
    kf = KFold(n_splits=5, shuffle=True, random_state=1024)
    # the fold block below was originally written out long-hand five times
    for fold, (train_index, test_index) in enumerate(kf.split(train_data, value4preds)):
        print('Fold {} training...'.format(fold + 1))
        x_train, x_test = train_data.iloc[train_index], train_data.iloc[test_index]
        y_train, y_test = value4preds.iloc[train_index], value4preds.iloc[test_index]
        gbdt_model.fit(x_train, y_train)
        pred_labels[x_test.index] = np.where(gbdt_model.predict(x_test) > 0.5, 1, 0)
        submission_label[:, fold] = np.where(gbdt_model.predict(true_test) > 0.5, 1, 0)
        print('Fold {} done'.format(fold + 1))
        print('*******************************************************************')
    # flag a vid as tl > 4 when any of the five fold models predicts positive
    submission_data['pos_4'] = np.where(np.sum(submission_label, axis=1) >= 1, 1, 0)
    # classification_report expects (y_true, y_pred)
    print(classification_report(value4preds, pred_labels))
    print(submission_data[submission_data['pos_4'] == 1])
    sub_class = DataFrame(submission_data[submission_data['pos_4'] == 1], unknown_as_string=True)
    sub_class.persist('tl_gt_4_vid_6_6')

# the A board used the high/low tl classification; the B board did not
if __name__ == "__main__":
    train = odps.get_table('juz_train_6_6_final').to_df().to_pandas()
    train['pos_4'] = train['tl'].apply(lambda x: 1 if x > 4 else 0)
    test = odps.get_table('juz_test_6_6_final').to_df().to_pandas()
    class_result = test.loc[:, ['vid', 'tl']]
    predict_features = ['sys', 'dia', 'tl', 'hdl', 'ldl']
    # exclude the helper pos_4 label from the features (it does not exist in the test table)
    use_features = [t for t in train.columns if t != 'vid' and t not in predict_features and t != 'pos_4']
    test_data = test.loc[:, use_features]
    start = time.time()
    model = GradientBoostingClassifier(learning_rate=0.01, n_estimators=1500, max_depth=5, subsample=0.7,
                                       random_state=1, verbose=0, min_samples_leaf=50)
    for i, j in enumerate(predict_features):
        if j in ['tl']:
            gbdt_model(train, j, use_features, test_data, class_result, model)
--------------------------------------------------------------------------------
/round2_rank10/feature_selection/predict_value_tl_gt_4.py:
--------------------------------------------------------------------------------
from odps import ODPS
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import BayesianRidge
from odps.df import DataFrame
from sklearn.model_selection import KFold
import time
import numpy as np


def eval_metric(pred, labels):
    return np.mean(np.power(np.log(pred + 1) - np.log(labels + 1), 2))

def gbdt_model(df, label, use_feature, true_test, submission_data, gbdt_model):
    print("GBDT: training label {}...".format(label))
    value4preds = np.log(df[label])
    train_data = df.loc[:, use_feature]
    print(train_data.shape)
    scores = np.zeros(len(value4preds))
    submission_scores = np.zeros((len(submission_data), 5))
    kf = KFold(n_splits=5, shuffle=True, random_state=1024)
    # the fold block below was originally written out long-hand five times
    for fold, (train_index, test_index) in enumerate(kf.split(train_data, value4preds)):
        print('Fold {} training...'.format(fold + 1))
        x_train, x_test = train_data.iloc[train_index], train_data.iloc[test_index]
        y_train, y_test = value4preds.iloc[train_index], value4preds.iloc[test_index]
        gbdt_model.fit(x_train, y_train)
        scores[test_index] = np.exp(gbdt_model.predict(x_test))
        submission_scores[:, fold] = gbdt_model.predict(true_test)
        print('Fold {} done'.format(fold + 1))
        print('*******************************************************************')
    submission_data[label] = np.exp(np.mean(submission_scores, axis=1)).round(3)


# the A board used the high/low tl classification; the B board did not
if __name__ == "__main__":
    train = odps.get_table('juz_train_6_6').to_df().to_pandas()
    test = odps.get_table('juz_test_6_6').to_df().to_pandas()
    submission = odps.get_table('tl_jz_5_fold_6_6_submit_22').to_df().to_pandas()
    vid_gt_4 = odps.get_table('tl_gt_4_vid_6_6').to_df().to_pandas()['vid']
    predict_features = ['sys', 'dia', 'tl', 'hdl', 'ldl']
    use_features = [t for t in train.columns if t != 'vid' and t not in predict_features and t != 'pos_4' and not 'log' in t]
    pos_eq_1 = test[test['vid'].isin(vid_gt_4)]
    test_eq_1 = pos_eq_1.loc[:, use_features]
    submission_gt_4 = pos_eq_1.loc[:, ['vid', 'tl']]
    train_gt_4 = train[train['tl'] >= 4]
    train_gt_4.index = list(range(train_gt_4.shape[0]))
    model = GradientBoostingRegressor(learning_rate=0.01, n_estimators=800, max_depth=5, subsample=0.8,
                                      random_state=1, verbose=1, min_samples_leaf=20)
    gbdt_model(train_gt_4, 'tl', use_features, test_eq_1, submission_gt_4, model)
    # for vids flagged as tl > 4, keep the larger of the regular and high-value predictions
    gt_4_index = submission[submission['vid'].isin(submission_gt_4['vid'])].index
    submission_temp = submission.loc[gt_4_index, ['vid', 'tl']]
    merge_fat = pd.merge(submission_temp, submission_gt_4, on='vid')
    temp_columns = [tc for tc in merge_fat.columns if tc != 'vid']
    replace_num = np.max(merge_fat.loc[:, temp_columns], axis=1)
    submission.loc[gt_4_index, 'tl'] = replace_num.values
    print(submission.sort_values(by=['tl'], ascending=False))
    sub_final = DataFrame(submission)
    sub_final.persist('tl_jz_5_fold_6_6_22_submit_modified_high_value')
--------------------------------------------------------------------------------
/round2_rank10/feature_selection/gbdt_log_model.py:
--------------------------------------------------------------------------------
from odps import ODPS
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from odps.df import DataFrame
from sklearn.model_selection import KFold
import time
import numpy as np


def eval_metric(pred, labels):
    return np.mean(np.power(np.log(pred + 1) - np.log(labels + 1), 2))

def gbdt_model(df, label, use_feature, true_test, submission_data, gbdt_model):
    print("GBDT: training label {}...".format(label))
    value4preds = np.log(df[label])
    train_data = df.loc[:, use_feature]
    print(train_data.shape)
    scores = np.zeros(len(value4preds))
    submission_scores = np.zeros((len(submission_data), 5))
    kf = KFold(n_splits=5, shuffle=True, random_state=1024)
    # the fold block below was originally written out long-hand five times
    for fold, (train_index, test_index) in enumerate(kf.split(train_data, value4preds)):
        print('Fold {} training...'.format(fold + 1))
        x_train, x_test = train_data.iloc[train_index], train_data.iloc[test_index]
        y_train, y_test = value4preds.iloc[train_index], value4preds.iloc[test_index]
        gbdt_model.fit(x_train, y_train)
        scores[test_index] = np.exp(gbdt_model.predict(x_test))
        submission_scores[:, fold] = gbdt_model.predict(true_test)
        print('the score is: ', eval_metric(scores[test_index], np.exp(y_test)))
        print('Fold {} done'.format(fold + 1))
        print('*******************************************************************')
    submission_data[label] = np.exp(np.mean(submission_scores, axis=1)).round(3)
    return eval_metric(scores, np.exp(value4preds))


# label | snp one-hot | CV score             | input table                         | output table
# sys   | 14          | 0.013770852754101864 | juz_train_6_6_add_wzm_for145_final2 | sys_juz_train_6_6_snp_onehot_22
# dia   | 10          | 0.01811632794174798  | juz_train_6_6_add_wzm_for145_final2 | dia_juz_train_6_6_snp_onehot_22
# tl    | 14          | 0.088753086260020458 | juz_train_6_6_add_wzm_onlytl_final  | tl_juz_train_6_6_snp_onehot_22
# hdl   | 10          | 0.011026393729362835 | juz_train_6_6_add_wzm_for145_final  | ldl_juz_train_6_6_snp_onehot_22
# ldl   | 1           | 0.033357708093281487 | juz_train_6_6_add_wzm_for145_final  | hdl_juz_train_6_6_snp_onehot_22
if __name__ == "__main__":
    use_label = 'dia'
    train = odps.get_table('{}_juz_train_6_6_snp_onehot_22'.format(use_label)).to_df().to_pandas()
    test = odps.get_table('{}_juz_test_6_6_snp_onehot_22'.format(use_label)).to_df().to_pandas()

    print(train.shape)
    print(test.shape)
    predict_features = ['sys', 'dia', 'tl', 'hdl', 'ldl']
    use_features = [t for t in train.columns if t != 'vid' and t not in predict_features]
    test_data = test.loc[:, use_features]

    submission = test.loc[:, ['vid', use_label]]
    base_line_score = np.zeros(5)
    start = time.time()
    model = GradientBoostingRegressor(learning_rate=0.01, n_estimators=800, max_depth=5, subsample=0.7,
                                      random_state=1, verbose=0, min_samples_leaf=50)
    for i, j in enumerate(predict_features):
        if j in [use_label]:
            base_line_score[i] = gbdt_model(train, j, use_features, test_data, submission, model)
    print(dict(zip(predict_features, base_line_score)))
    print('CV training took {} seconds'.format(time.time() - start))
    print('scores:', np.mean(base_line_score))
--------------------------------------------------------------------------------
/round2_rank10/feature_selection/get_best_rounds.py:
--------------------------------------------------------------------------------
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import KFold
import time
import numpy as np
import pandas as pd

def eval_metric(pred, labels):
    return np.mean(np.power(np.log(pred + 1) - np.log(labels + 1), 2))

def gbdt_model(df, label, use_feature, true_test, submission_data, gbdt_model):
    print("GBDT: training label {}...".format(label))
    value4preds = np.log(df[label])
    train_data = df.loc[:, use_feature]
    print(train_data.shape)
    scores = np.zeros(len(value4preds))
    submission_scores = np.zeros((len(submission_data), 5))
    kf = KFold(n_splits=5, shuffle=True, random_state=1024)
    # the fold block below was originally written out long-hand five times
    for fold, (train_index, test_index) in enumerate(kf.split(train_data, value4preds)):
        print('Fold {} training...'.format(fold + 1))
        x_train, x_test = train_data.iloc[train_index], train_data.iloc[test_index]
        y_train, y_test = value4preds.iloc[train_index], value4preds.iloc[test_index]
        gbdt_model.fit(x_train, y_train)
        # walk the staged predictions to find the iteration count with the lowest error
        errors = [eval_metric(np.exp(y_test), np.exp(y_pred))
                  for y_pred in gbdt_model.staged_predict(x_test)]
        best_n_estimators = np.argmin(errors) + 1
        print("best number of estimators_{} is : ".format(fold + 1), best_n_estimators)
        scores[test_index] = np.exp(gbdt_model.predict(x_test))
        print('the score is: ', eval_metric(scores[test_index], np.exp(y_test)))
        print('Fold {} done'.format(fold + 1))
        print('*******************************************************************')
    return eval_metric(scores, np.exp(value4preds))


# find the best iteration count for each predicted label within 2000 rounds
if __name__ == "__main__":
    use_label = 'sys'
    train = odps.get_table('{}_juz_train_6_6_snp_onehot_22'.format(use_label)).to_df().to_pandas()
    test = odps.get_table('{}_juz_test_6_6_snp_onehot_22'.format(use_label)).to_df().to_pandas()
    print(train.shape)
    print(test.shape)
    predict_features = ['sys', 'dia', 'tl', 'hdl', 'ldl']
    use_features = [t for t in train.columns if t != 'vid' and t not in predict_features]
    test_data = test.loc[:, use_features]
    submission = test.loc[:, ['vid', use_label]]
    base_line_score = np.zeros(5)
    start = time.time()
    model = GradientBoostingRegressor(learning_rate=0.01, n_estimators=2000, max_depth=5, subsample=0.7,
                                      random_state=1, verbose=0, min_samples_leaf=50)
    for i, j in enumerate(predict_features):
        if j in [use_label]:
            base_line_score[i] = gbdt_model(train, j, use_features, test_data, submission, model)
    print(dict(zip(predict_features, base_line_score)))
    print('CV training took {} seconds'.format(time.time() - start))
    print('scores:', np.mean(base_line_score))
--------------------------------------------------------------------------------
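The per-fold optima printed by this script are what the every_predict_model files below hard-code into their five GradientBoostingRegressor instances. A hypothetical glue snippet, not part of the repo itself; the sys and dia numbers are taken from the comments in those files:

```python
from sklearn.ensemble import GradientBoostingRegressor

# per-label best rounds copied from the get_best_rounds logs (see the comments below)
BEST_ROUNDS = {
    'sys': [624, 902, 911, 535, 533],
    'dia': [739, 601, 887, 709, 1221],
}

def make_fold_models(label):
    # one regressor per fold, identical except for its tuned n_estimators
    return [GradientBoostingRegressor(learning_rate=0.01, n_estimators=n, max_depth=5,
                                      subsample=0.7, random_state=1, verbose=0,
                                      min_samples_leaf=50)
            for n in BEST_ROUNDS[label]]
```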
/round2_rank10/every_predict_model/sys_gbdt_best_rounds.py:
--------------------------------------------------------------------------------
from odps import ODPS
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from odps.df import DataFrame
from sklearn.model_selection import KFold
import time
import numpy as np


def eval_metric(pred, labels):
    return np.mean(np.power(np.log(pred + 1) - np.log(labels + 1), 2))

def gbdt_model(df, label, use_feature, true_test, submission_data):
    print("GBDT: training label {}...".format(label))
    value4preds = np.log(df[label])
    train_data = df.loc[:, use_feature]
    print(train_data.shape)
    scores = np.zeros(len(value4preds))
    submission_scores = np.zeros((len(submission_data), 5))
    kf = KFold(n_splits=5, shuffle=True, random_state=1024)
    # per-fold best iteration counts found by feature_selection/get_best_rounds.py;
    # the fold block below was originally written out long-hand five times
    best_rounds = [624, 902, 911, 535, 533]
    for fold, (train_index, test_index) in enumerate(kf.split(train_data, value4preds)):
        print('Fold {} training...'.format(fold + 1))
        x_train, x_test = train_data.iloc[train_index], train_data.iloc[test_index]
        y_train, y_test = value4preds.iloc[train_index], value4preds.iloc[test_index]
        fold_model = GradientBoostingRegressor(learning_rate=0.01, n_estimators=best_rounds[fold], max_depth=5,
                                               subsample=0.7, random_state=1, verbose=0, min_samples_leaf=50)
        fold_model.fit(x_train, y_train)
        scores[test_index] = np.exp(fold_model.predict(x_test))
        submission_scores[:, fold] = fold_model.predict(true_test)
        print('the score is: ', eval_metric(scores[test_index], np.exp(y_test)))
        print('Fold {} done'.format(fold + 1))
        print('*******************************************************************')
    submission_data[label] = np.exp(np.mean(submission_scores, axis=1)).round(3)
    return eval_metric(scores, np.exp(value4preds))


# B board
# per-fold best rounds: 624 902 911 535 533
# 'sys': 0.013720952232896292
if __name__ == "__main__":
    train = odps.get_table('sys_juz_train_6_6_snp_onehot_22').to_df().to_pandas()
    test = odps.get_table('sys_juz_test_6_6_snp_onehot_22').to_df().to_pandas()
    print(train.shape)
    print(test.shape)
    predict_features = ['sys', 'dia', 'tl', 'hdl', 'ldl']
    use_features = [t for t in train.columns if t != 'vid' and t not in predict_features]
    test_data = test.loc[:, use_features]

    submission = test.loc[:, ['vid', 'sys']]
    base_line_score = np.zeros(5)
    start = time.time()
    for i, j in enumerate(predict_features):
        if j in ['sys']:
            base_line_score[i] = gbdt_model(train, j, use_features, test_data, submission)
    print(dict(zip(predict_features, base_line_score)))
    print('CV training took {} seconds'.format(time.time() - start))
    print('scores:', np.mean(base_line_score))
    sub_final = DataFrame(submission)
    sub_final.persist('sys_jz_5_fold_6_6_submit_22')
--------------------------------------------------------------------------------
GradientBoostingRegressor(learning_rate=0.01, n_estimators=887, max_depth=5, subsample=0.7, 52 | random_state=1, verbose=0, min_samples_leaf=50) 53 | gbdt_model_3.fit(x_train_3, y_train_3) 54 | scores[test_index_3] = np.exp(gbdt_model_3.predict(x_test_3)) 55 | submission_scores[:, 2] = gbdt_model_3.predict(true_test) 56 | print('the score is: ', eval_metric(scores[test_index_3], np.exp(y_test_3))) 57 | print('第3次训练结束') 58 | print('*******************************************************************') 59 | train_index_4, test_index_4 = five_fold_index[3] 60 | print('第4次训练...') 61 | x_train_4, x_test_4 = train_data.iloc[train_index_4], train_data.iloc[test_index_4] 62 | y_train_4, y_test_4 = value4preds.iloc[train_index_4], value4preds.iloc[test_index_4] 63 | gbdt_model_4 = GradientBoostingRegressor(learning_rate=0.01, n_estimators=709, max_depth=5, subsample=0.7, 64 | random_state=1, verbose=0, min_samples_leaf=50) 65 | gbdt_model_4.fit(x_train_4, y_train_4) 66 | scores[test_index_4] = np.exp(gbdt_model_4.predict(x_test_4)) 67 | submission_scores[:, 3] = gbdt_model_4.predict(true_test) 68 | print('the score is: ', eval_metric(scores[test_index_4], np.exp(y_test_4))) 69 | print('第4次训练结束') 70 | print('*******************************************************************') 71 | train_index_5, test_index_5 = five_fold_index[4] 72 | print('第5次训练...') 73 | x_train_5, x_test_5 = train_data.iloc[train_index_5], train_data.iloc[test_index_5] 74 | y_train_5, y_test_5 = value4preds.iloc[train_index_5], value4preds.iloc[test_index_5] 75 | gbdt_model_5 = GradientBoostingRegressor(learning_rate=0.01, n_estimators=1221, max_depth=5, subsample=0.7, 76 | random_state=1, verbose=0, min_samples_leaf=50) 77 | gbdt_model_5.fit(x_train_5, y_train_5) 78 | scores[test_index_5] = np.exp(gbdt_model_5.predict(x_test_5)) 79 | submission_scores[:, 4] = gbdt_model_5.predict(true_test) 80 | print('the score is: ', eval_metric(scores[test_index_5], np.exp(y_test_5))) 81 | print('第5次训练结束') 82 | print('*******************************************************************') 83 | submission_data[label] = np.exp(np.mean(submission_scores, axis=1)).round(3) 84 | return eval_metric(scores, np.exp(value4preds)) 85 | 86 | 87 | # b-board 88 | # 739 601 887 709 1221 89 | # 'dia': 0.018069628693683809 90 | if __name__ == "__main__": 91 | train = odps.get_table('dia_juz_train_6_6_snp_onehot_22').to_df().to_pandas() 92 | test = odps.get_table('dia_juz_test_6_6_snp_onehot_22').to_df().to_pandas() 93 | print(train.shape) 94 | print(test.shape) 95 | predict_features = ['sys', 'dia', 'tl', 'hdl', 'ldl'] 96 | use_features = [t for t in train.columns if t != 'vid' and t not in predict_features] 97 | test_data = test.loc[:, use_features] 98 | 99 | submission = test.loc[:, ['vid', 'dia']] 100 | base_line_score = np.zeros(5) 101 | start = time.time() 102 | for i, j in enumerate(predict_features): 103 | if j in ['dia']: 104 | base_line_score[i] = gbdt_model(train, j, use_features, test_data, submission) 105 | print(dict(zip(predict_features, base_line_score))) 106 | print('CV训练用时{}秒'.format(time.time() - start)) 107 | print('scores:', np.mean(base_line_score)) 108 | sub_final = DataFrame(submission) 109 | sub_final.persist('dia_jz_5_fold_6_6_submit_22') -------------------------------------------------------------------------------- /round2_rank10/every_predict_model/ldl_gbdt_best_rounds.py: -------------------------------------------------------------------------------- 1 | from odps import ODPS 2 | import pandas as pd 3 | from sklearn.ensemble 
import GradientBoostingRegressor 4 | from odps.df import DataFrame 5 | from sklearn.model_selection import KFold 6 | import time 7 | import numpy as np 8 | 9 | 10 | def eval_metric(pred, labels): 11 | return np.mean(np.power(np.log(pred + 1) - np.log(labels + 1), 2)) 12 | 13 | def gbdt_model(df, label, use_feature, true_test, submission_data): 14 | print("基于GBDT: 开始训练 label 为{}...".format(label)) 15 | value4preds = np.log(df[label]) 16 | train_data = df.loc[:, use_feature] 17 | print(train_data.shape) 18 | scores = np.zeros(len(value4preds)) 19 | submission_scores = np.zeros((len(submission_data), 5)) 20 | kf = KFold(n_splits=5, shuffle=True, random_state=1024) 21 | five_fold_index = list(kf.split(train_data, value4preds)) 22 | 23 | train_index_1, test_index_1 = five_fold_index[0] 24 | print('第1次训练...') 25 | x_train_1, x_test_1 = train_data.iloc[train_index_1], train_data.iloc[test_index_1] 26 | y_train_1, y_test_1 = value4preds.iloc[train_index_1], value4preds.iloc[test_index_1] 27 | gbdt_model_1 = GradientBoostingRegressor(learning_rate=0.01, n_estimators=718, max_depth=5, subsample=0.7, 28 | random_state=1, verbose=0, min_samples_leaf=50) 29 | gbdt_model_1.fit(x_train_1, y_train_1) 30 | scores[test_index_1] = np.exp(gbdt_model_1.predict(x_test_1)) 31 | submission_scores[:, 0] = gbdt_model_1.predict(true_test) 32 | print('the score is: ', eval_metric(scores[test_index_1], np.exp(y_test_1))) 33 | print('第1次训练结束') 34 | print('*******************************************************************') 35 | train_index_2, test_index_2 = five_fold_index[1] 36 | print('第2次训练...') 37 | x_train_2, x_test_2 = train_data.iloc[train_index_2], train_data.iloc[test_index_2] 38 | y_train_2, y_test_2 = value4preds.iloc[train_index_2], value4preds.iloc[test_index_2] 39 | gbdt_model_2 = GradientBoostingRegressor(learning_rate=0.01, n_estimators=968, max_depth=5, subsample=0.7, 40 | random_state=1, verbose=0, min_samples_leaf=50) 41 | gbdt_model_2.fit(x_train_2, y_train_2) 42 | scores[test_index_2] = np.exp(gbdt_model_2.predict(x_test_2)) 43 | submission_scores[:, 1] = gbdt_model_2.predict(true_test) 44 | print('the score is: ', eval_metric(scores[test_index_2], np.exp(y_test_2))) 45 | print('第2次训练结束') 46 | print('*******************************************************************') 47 | train_index_3, test_index_3 = five_fold_index[2] 48 | print('第3次训练...') 49 | x_train_3, x_test_3 = train_data.iloc[train_index_3], train_data.iloc[test_index_3] 50 | y_train_3, y_test_3 = value4preds.iloc[train_index_3], value4preds.iloc[test_index_3] 51 | gbdt_model_3 = GradientBoostingRegressor(learning_rate=0.01, n_estimators=993, max_depth=5, subsample=0.7, 52 | random_state=1, verbose=0, min_samples_leaf=50) 53 | gbdt_model_3.fit(x_train_3, y_train_3) 54 | scores[test_index_3] = np.exp(gbdt_model_3.predict(x_test_3)) 55 | submission_scores[:, 2] = gbdt_model_3.predict(true_test) 56 | print('the score is: ', eval_metric(scores[test_index_3], np.exp(y_test_3))) 57 | print('第3次训练结束') 58 | print('*******************************************************************') 59 | train_index_4, test_index_4 = five_fold_index[3] 60 | print('第4次训练...') 61 | x_train_4, x_test_4 = train_data.iloc[train_index_4], train_data.iloc[test_index_4] 62 | y_train_4, y_test_4 = value4preds.iloc[train_index_4], value4preds.iloc[test_index_4] 63 | gbdt_model_4 = GradientBoostingRegressor(learning_rate=0.01, n_estimators=1499, max_depth=5, subsample=0.7, 64 | random_state=1, verbose=0, min_samples_leaf=50) 65 | gbdt_model_4.fit(x_train_4, y_train_4) 66 
| scores[test_index_4] = np.exp(gbdt_model_4.predict(x_test_4)) 67 | submission_scores[:, 3] = gbdt_model_4.predict(true_test) 68 | print('the score is: ', eval_metric(scores[test_index_4], np.exp(y_test_4))) 69 | print('第4次训练结束') 70 | print('*******************************************************************') 71 | train_index_5, test_index_5 = five_fold_index[4] 72 | print('第5次训练...') 73 | x_train_5, x_test_5 = train_data.iloc[train_index_5], train_data.iloc[test_index_5] 74 | y_train_5, y_test_5 = value4preds.iloc[train_index_5], value4preds.iloc[test_index_5] 75 | gbdt_model_5 = GradientBoostingRegressor(learning_rate=0.01, n_estimators=923, max_depth=5, subsample=0.7, 76 | random_state=1, verbose=0, min_samples_leaf=50) 77 | gbdt_model_5.fit(x_train_5, y_train_5) 78 | scores[test_index_5] = np.exp(gbdt_model_5.predict(x_test_5)) 79 | submission_scores[:, 4] = gbdt_model_5.predict(true_test) 80 | print('the score is: ', eval_metric(scores[test_index_5], np.exp(y_test_5))) 81 | print('第5次训练结束') 82 | print('*******************************************************************') 83 | submission_data[label] = np.exp(np.mean(submission_scores, axis=1)).round(3) 84 | return eval_metric(scores, np.exp(value4preds)) 85 | 86 | 87 | # b-board 88 | # 718 968 993 1499 923 89 | # 'ldl': 0.033119396519559752 90 | if __name__ == "__main__": 91 | train = odps.get_table('ldl_juz_train_6_6_snp_onehot_22').to_df().to_pandas() 92 | test = odps.get_table('ldl_juz_test_6_6_snp_onehot_22').to_df().to_pandas() 93 | print(train.shape) 94 | print(test.shape) 95 | predict_features = ['sys', 'dia', 'tl', 'hdl', 'ldl'] 96 | use_features = [t for t in train.columns if t != 'vid' and t not in predict_features] 97 | test_data = test.loc[:, use_features] 98 | 99 | submission = test.loc[:, ['vid', 'ldl']] 100 | base_line_score = np.zeros(5) 101 | start = time.time() 102 | for i, j in enumerate(predict_features): 103 | if j in ['ldl']: 104 | base_line_score[i] = gbdt_model(train, j, use_features, test_data, submission) 105 | print(dict(zip(predict_features, base_line_score))) 106 | print('CV训练用时{}秒'.format(time.time() - start)) 107 | print('scores:', np.mean(base_line_score)) 108 | sub_final = DataFrame(submission) 109 | sub_final.persist('ldl_jz_5_fold_6_6_submit_22') -------------------------------------------------------------------------------- /round2_rank10/every_predict_model/hdl_gbdt_best_rounds.py: -------------------------------------------------------------------------------- 1 | from odps import ODPS 2 | import pandas as pd 3 | from sklearn.ensemble import GradientBoostingRegressor 4 | from odps.df import DataFrame 5 | from sklearn.model_selection import KFold 6 | import time 7 | import numpy as np 8 | 9 | 10 | def eval_metric(pred, labels): 11 | return np.mean(np.power(np.log(pred + 1) - np.log(labels + 1), 2)) 12 | 13 | def gbdt_model(df, label, use_feature, true_test, submission_data): 14 | print("基于GBDT: 开始训练 label 为{}...".format(label)) 15 | value4preds = np.log(df[label]) 16 | train_data = df.loc[:, use_feature] 17 | print(train_data.shape) 18 | scores = np.zeros(len(value4preds)) 19 | submission_scores = np.zeros((len(submission_data), 5)) 20 | kf = KFold(n_splits=5, shuffle=True, random_state=1024) 21 | five_fold_index = list(kf.split(train_data, value4preds)) 22 | 23 | train_index_1, test_index_1 = five_fold_index[0] 24 | print('第1次训练...') 25 | x_train_1, x_test_1 = train_data.iloc[train_index_1], train_data.iloc[test_index_1] 26 | y_train_1, y_test_1 = value4preds.iloc[train_index_1], 
value4preds.iloc[test_index_1] 27 | gbdt_model_1 = GradientBoostingRegressor(learning_rate=0.01, n_estimators=1972, max_depth=5, subsample=0.7, 28 | random_state=1, verbose=0, min_samples_leaf=50) 29 | gbdt_model_1.fit(x_train_1, y_train_1) 30 | scores[test_index_1] = np.exp(gbdt_model_1.predict(x_test_1)) 31 | submission_scores[:, 0] = gbdt_model_1.predict(true_test) 32 | print('the score is: ', eval_metric(scores[test_index_1], np.exp(y_test_1))) 33 | print('第1次训练结束') 34 | print('*******************************************************************') 35 | train_index_2, test_index_2 = five_fold_index[1] 36 | print('第2次训练...') 37 | x_train_2, x_test_2 = train_data.iloc[train_index_2], train_data.iloc[test_index_2] 38 | y_train_2, y_test_2 = value4preds.iloc[train_index_2], value4preds.iloc[test_index_2] 39 | gbdt_model_2 = GradientBoostingRegressor(learning_rate=0.01, n_estimators=2000, max_depth=5, subsample=0.7, 40 | random_state=1, verbose=0, min_samples_leaf=50) 41 | gbdt_model_2.fit(x_train_2, y_train_2) 42 | scores[test_index_2] = np.exp(gbdt_model_2.predict(x_test_2)) 43 | submission_scores[:, 1] = gbdt_model_2.predict(true_test) 44 | print('the score is: ', eval_metric(scores[test_index_2], np.exp(y_test_2))) 45 | print('第2次训练结束') 46 | print('*******************************************************************') 47 | train_index_3, test_index_3 = five_fold_index[2] 48 | print('第3次训练...') 49 | x_train_3, x_test_3 = train_data.iloc[train_index_3], train_data.iloc[test_index_3] 50 | y_train_3, y_test_3 = value4preds.iloc[train_index_3], value4preds.iloc[test_index_3] 51 | gbdt_model_3 = GradientBoostingRegressor(learning_rate=0.01, n_estimators=2000, max_depth=5, subsample=0.7, 52 | random_state=1, verbose=0, min_samples_leaf=50) 53 | gbdt_model_3.fit(x_train_3, y_train_3) 54 | scores[test_index_3] = np.exp(gbdt_model_3.predict(x_test_3)) 55 | submission_scores[:, 2] = gbdt_model_3.predict(true_test) 56 | print('the score is: ', eval_metric(scores[test_index_3], np.exp(y_test_3))) 57 | print('第3次训练结束') 58 | print('*******************************************************************') 59 | train_index_4, test_index_4 = five_fold_index[3] 60 | print('第4次训练...') 61 | x_train_4, x_test_4 = train_data.iloc[train_index_4], train_data.iloc[test_index_4] 62 | y_train_4, y_test_4 = value4preds.iloc[train_index_4], value4preds.iloc[test_index_4] 63 | gbdt_model_4 = GradientBoostingRegressor(learning_rate=0.01, n_estimators=1492, max_depth=5, subsample=0.7, 64 | random_state=1, verbose=0, min_samples_leaf=50) 65 | gbdt_model_4.fit(x_train_4, y_train_4) 66 | scores[test_index_4] = np.exp(gbdt_model_4.predict(x_test_4)) 67 | submission_scores[:, 3] = gbdt_model_4.predict(true_test) 68 | print('the score is: ', eval_metric(scores[test_index_4], np.exp(y_test_4))) 69 | print('第4次训练结束') 70 | print('*******************************************************************') 71 | train_index_5, test_index_5 = five_fold_index[4] 72 | print('第5次训练...') 73 | x_train_5, x_test_5 = train_data.iloc[train_index_5], train_data.iloc[test_index_5] 74 | y_train_5, y_test_5 = value4preds.iloc[train_index_5], value4preds.iloc[test_index_5] 75 | gbdt_model_5 = GradientBoostingRegressor(learning_rate=0.01, n_estimators=1954, max_depth=5, subsample=0.7, 76 | random_state=1, verbose=0, min_samples_leaf=50) 77 | gbdt_model_5.fit(x_train_5, y_train_5) 78 | scores[test_index_5] = np.exp(gbdt_model_5.predict(x_test_5)) 79 | submission_scores[:, 4] = gbdt_model_5.predict(true_test) 80 | print('the score is: ', 
eval_metric(scores[test_index_5], np.exp(y_test_5))) 81 | print('第5次训练结束') 82 | print('*******************************************************************') 83 | submission_data[label] = np.exp(np.mean(submission_scores, axis=1)).round(3) 84 | return eval_metric(scores, np.exp(value4preds)) 85 | 86 | 87 | # b-board 88 | # 1972 2000 2000 1492 1954 89 | # 'hdl': 0.010681349752220548 90 | if __name__ == "__main__": 91 | train = odps.get_table('hdl_juz_train_6_6_snp_onehot_22').to_df().to_pandas() 92 | test = odps.get_table('hdl_juz_test_6_6_snp_onehot_22').to_df().to_pandas() 93 | print(train.shape) 94 | print(test.shape) 95 | predict_features = ['sys', 'dia', 'tl', 'hdl', 'ldl'] 96 | use_features = [t for t in train.columns if t != 'vid' and t not in predict_features] 97 | test_data = test.loc[:, use_features] 98 | 99 | submission = test.loc[:, ['vid', 'hdl']] 100 | base_line_score = np.zeros(5) 101 | start = time.time() 102 | for i, j in enumerate(predict_features): 103 | if j in ['hdl']: 104 | base_line_score[i] = gbdt_model(train, j, use_features, test_data, submission) 105 | print(dict(zip(predict_features, base_line_score))) 106 | print('CV训练用时{}秒'.format(time.time() - start)) 107 | print('scores:', np.mean(base_line_score)) 108 | sub_final = DataFrame(submission) 109 | sub_final.persist('hdl_jz_5_fold_6_6_submit_22') -------------------------------------------------------------------------------- /round2_rank10/every_predict_model/tl_gbdt_best_rounds.py: -------------------------------------------------------------------------------- 1 | from odps import ODPS 2 | import pandas as pd 3 | from sklearn.ensemble import GradientBoostingRegressor 4 | from odps.df import DataFrame 5 | from sklearn.model_selection import KFold 6 | import time 7 | import numpy as np 8 | 9 | 10 | def eval_metric(pred, labels): 11 | return np.mean(np.power(np.log(pred + 1) - np.log(labels + 1), 2)) 12 | 13 | def gbdt_model(df, label, use_feature, true_test, submission_data): 14 | print("基于GBDT: 开始训练 label 为{}...".format(label)) 15 | value4preds = np.log(df[label]) 16 | train_data = df.loc[:, use_feature] 17 | print(train_data.shape) 18 | scores = np.zeros(len(value4preds)) 19 | submission_scores = np.zeros((len(submission_data), 5)) 20 | kf = KFold(n_splits=5, shuffle=True, random_state=1024) 21 | five_fold_index = list(kf.split(train_data, value4preds)) 22 | 23 | train_index_1, test_index_1 = five_fold_index[0] 24 | print('第1次训练...') 25 | x_train_1, x_test_1 = train_data.iloc[train_index_1], train_data.iloc[test_index_1] 26 | y_train_1, y_test_1 = value4preds.iloc[train_index_1], value4preds.iloc[test_index_1] 27 | gbdt_model_1 = GradientBoostingRegressor(learning_rate=0.01, n_estimators=1777, max_depth=5, subsample=0.7, 28 | random_state=1, verbose=0, min_samples_leaf=50) 29 | gbdt_model_1.fit(x_train_1, y_train_1) 30 | scores[test_index_1] = np.exp(gbdt_model_1.predict(x_test_1)) 31 | submission_scores[:, 0] = gbdt_model_1.predict(true_test) 32 | print('the score is: ', eval_metric(scores[test_index_1], np.exp(y_test_1))) 33 | print('第1次训练结束') 34 | del train_index_1 35 | del test_index_1 36 | del gbdt_model_1 37 | print('*******************************************************************') 38 | train_index_2, test_index_2 = five_fold_index[1] 39 | print('第2次训练...') 40 | x_train_2, x_test_2 = train_data.iloc[train_index_2], train_data.iloc[test_index_2] 41 | y_train_2, y_test_2 = value4preds.iloc[train_index_2], value4preds.iloc[test_index_2] 42 | gbdt_model_2 = GradientBoostingRegressor(learning_rate=0.01, 
n_estimators=1795, max_depth=5, subsample=0.7, 43 | random_state=1, verbose=0, min_samples_leaf=50) 44 | gbdt_model_2.fit(x_train_2, y_train_2) 45 | scores[test_index_2] = np.exp(gbdt_model_2.predict(x_test_2)) 46 | submission_scores[:, 1] = gbdt_model_2.predict(true_test) 47 | print('the score is: ', eval_metric(scores[test_index_2], np.exp(y_test_2))) 48 | print('第2次训练结束') 49 | del train_index_2 50 | del test_index_2 51 | del gbdt_model_2 52 | print('*******************************************************************') 53 | train_index_3, test_index_3 = five_fold_index[2] 54 | print('第3次训练...') 55 | x_train_3, x_test_3 = train_data.iloc[train_index_3], train_data.iloc[test_index_3] 56 | y_train_3, y_test_3 = value4preds.iloc[train_index_3], value4preds.iloc[test_index_3] 57 | gbdt_model_3 = GradientBoostingRegressor(learning_rate=0.01, n_estimators=1800, max_depth=5, subsample=0.7, 58 | random_state=1, verbose=0, min_samples_leaf=50) 59 | gbdt_model_3.fit(x_train_3, y_train_3) 60 | scores[test_index_3] = np.exp(gbdt_model_3.predict(x_test_3)) 61 | submission_scores[:, 2] = gbdt_model_3.predict(true_test) 62 | print('the score is: ', eval_metric(scores[test_index_3], np.exp(y_test_3))) 63 | print('第3次训练结束') 64 | del train_index_3 65 | del test_index_3 66 | del gbdt_model_3 67 | print('*******************************************************************') 68 | train_index_4, test_index_4 = five_fold_index[3] 69 | print('第4次训练...') 70 | x_train_4, x_test_4 = train_data.iloc[train_index_4], train_data.iloc[test_index_4] 71 | y_train_4, y_test_4 = value4preds.iloc[train_index_4], value4preds.iloc[test_index_4] 72 | gbdt_model_4 = GradientBoostingRegressor(learning_rate=0.01, n_estimators=1419, max_depth=5, subsample=0.7, 73 | random_state=1, verbose=0, min_samples_leaf=50) 74 | gbdt_model_4.fit(x_train_4, y_train_4) 75 | scores[test_index_4] = np.exp(gbdt_model_4.predict(x_test_4)) 76 | submission_scores[:, 3] = gbdt_model_4.predict(true_test) 77 | print('the score is: ', eval_metric(scores[test_index_4], np.exp(y_test_4))) 78 | print('第4次训练结束') 79 | del train_index_4 80 | del test_index_4 81 | del gbdt_model_4 82 | print('*******************************************************************') 83 | train_index_5, test_index_5 = five_fold_index[4] 84 | print('第5次训练...') 85 | x_train_5, x_test_5 = train_data.iloc[train_index_5], train_data.iloc[test_index_5] 86 | y_train_5, y_test_5 = value4preds.iloc[train_index_5], value4preds.iloc[test_index_5] 87 | gbdt_model_5 = GradientBoostingRegressor(learning_rate=0.01, n_estimators=1800, max_depth=5, subsample=0.7, 88 | random_state=1, verbose=0, min_samples_leaf=50) 89 | gbdt_model_5.fit(x_train_5, y_train_5) 90 | scores[test_index_5] = np.exp(gbdt_model_5.predict(x_test_5)) 91 | submission_scores[:, 4] = gbdt_model_5.predict(true_test) 92 | print('the score is: ', eval_metric(scores[test_index_5], np.exp(y_test_5))) 93 | print('第5次训练结束') 94 | del train_index_5 95 | del test_index_5 96 | del gbdt_model_5 97 | print('*******************************************************************') 98 | submission_data[label] = np.exp(np.mean(submission_scores, axis=1)).round(3) 99 | return eval_metric(scores, np.exp(value4preds)) 100 | 101 | 102 | # b-board 103 | # 1777 1795 1800 1419 1800 104 | # 'tl': 0.086486107163495141 105 | if __name__ == "__main__": 106 | train = odps.get_table('tl_juz_train_6_6_snp_onehot_22').to_df().to_pandas() 107 | test = odps.get_table('tl_juz_test_6_6_snp_onehot_22').to_df().to_pandas() 108 | print(train.shape) 109 | print(test.shape) 
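# The five fold blocks in gbdt_model above are identical except for
# n_estimators (the tuned per-fold rounds recorded in the "b-board" comment
# above). A loop-based equivalent, kept as a comment-only sketch and not the
# code that produced the submitted scores, would be:
#
#     best_rounds = [1777, 1795, 1800, 1419, 1800]
#     for fold, (tr_idx, te_idx) in enumerate(five_fold_index):
#         model = GradientBoostingRegressor(
#             learning_rate=0.01, n_estimators=best_rounds[fold], max_depth=5,
#             subsample=0.7, random_state=1, verbose=0, min_samples_leaf=50)
#         model.fit(train_data.iloc[tr_idx], value4preds.iloc[tr_idx])
#         scores[te_idx] = np.exp(model.predict(train_data.iloc[te_idx]))
#         submission_scores[:, fold] = model.predict(true_test)
#
# This script is also the only one of the five that del's its per-fold
# indices and models, presumably to keep peak memory down on the larger
# tl feature table.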
110 | predict_features = ['sys', 'dia', 'tl', 'hdl', 'ldl'] 111 | use_features = [t for t in train.columns if t != 'vid' and t not in predict_features] 112 | test_data = test.loc[:, use_features] 113 | 114 | submission = test.loc[:, ['vid', 'tl']] 115 | base_line_score = np.zeros(5) 116 | start = time.time() 117 | for i, j in enumerate(predict_features): 118 | if j in ['tl']: 119 | base_line_score[i] = gbdt_model(train, j, use_features, test_data, submission) 120 | print(dict(zip(predict_features, base_line_score))) 121 | print('CV训练用时{}秒'.format(time.time() - start)) 122 | print('scores:', np.mean(base_line_score)) 123 | sub_final = DataFrame(submission) 124 | sub_final.persist('tl_jz_5_fold_6_6_submit_22') -------------------------------------------------------------------------------- /round2_rank10/data_pre_process/get_num_features.py: -------------------------------------------------------------------------------- 1 | import time 2 | import re 3 | import pandas as pd 4 | from odps import ODPS 5 | from odps.df import DataFrame 6 | import numpy as np 7 | from collections import Iterable 8 | 9 | import sys 10 | reload(sys) 11 | sys.setdefaultencoding('utf8') 12 | 13 | def extract_num_norm(df): 14 | if isinstance(df, Iterable): 15 | temp = re.findall(r'\-*\d+(?:\.\d+)?', df) 16 | if temp: 17 | return np.mean([float(i.replace('--', '')) for i in temp]) 18 | else: 19 | return np.nan 20 | else: 21 | return np.nan 22 | 23 | def transform_0424(df): 24 | if isinstance(df, Iterable): 25 | temp = re.findall(r'\-*\d+(?:\.\d+)?', df) 26 | if temp: 27 | return np.mean([float(i.replace('--', '')) for i in temp]) 28 | else: 29 | if '常' in df: 30 | return 75 31 | elif '过速' in df: 32 | return 100 33 | elif '过缓' in df: 34 | return 50 35 | else: 36 | return np.nan 37 | else: 38 | return np.nan 39 | 40 | def transform_0425(df): 41 | if isinstance(df, Iterable): 42 | temp = re.findall(r'\-*\d+(?:\.\d+)?', df) 43 | if temp: 44 | return np.mean([float(i.replace('--', '')) for i in temp]) 45 | else: 46 | # median 47 | if '常' in df: 48 | return 17 49 | # min 50 | elif '粗糙' in df: 51 | return 14 52 | else: 53 | return np.nan 54 | else: 55 | return np.nan 56 | 57 | def transform_1308(df): 58 | if isinstance(df, Iterable): 59 | if '裸眼' in df: 60 | temp1 = re.findall(r'\-*\d+(?:\.\d+)?', df) 61 | if temp1: 62 | luo_yan = np.mean([float(i.replace('--', '')) for i in temp1]) 63 | if luo_yan >= 1: 64 | return 4 65 | else: 66 | return 3 67 | elif '矫正' in df: 68 | temp2 = re.findall(r'\-*\d+(?:\.\d+)?', df) 69 | if temp2: 70 | jiao_zheng = np.mean([float(i.replace('--', '')) for i in temp2]) 71 | if jiao_zheng >= 1: 72 | return 2 73 | else: 74 | return 1 75 | else: 76 | return np.nan 77 | else: 78 | return np.nan 79 | 80 | def transform_1321_1322(df): 81 | if isinstance(df, Iterable): 82 | temp = re.findall(r'\-*\d+(?:\.\d+)?', df) 83 | if temp: 84 | return np.mean([float(i.replace('--', '')) for i in temp]) 85 | else: 86 | if '失明' in df or '义眼' in df: 87 | return 0 88 | elif '指数' in df: 89 | return 0.003 90 | elif '手动' in df: 91 | return 0.002 92 | elif '光感' in df: 93 | return 0.001 94 | else: 95 | return np.nan 96 | else: 97 | return np.nan 98 | 99 | def calc_voice_area(df, desc): 100 | if isinstance(df, Iterable) and desc in df: 101 | temp = re.findall(r'\-*\d+(?:\.\d+)?', df) 102 | if temp: 103 | if 'cm' in df: 104 | if len(temp) == 2: 105 | return float(temp[0]) * float(temp[1]) * 100 106 | if len(temp) == 4: 107 | return (float(temp[0]) * float(temp[1]) + float(temp[2]) * float(temp[3]))*100.0/2 108 | if 'mm' in 
df: 109 | if len(temp) == 2: 110 | return float(temp[0]) * float(temp[1]) 111 | if len(temp) == 4: 112 | return (float(temp[0]) * float(temp[1]) + float(temp[2]) * float(temp[3]))*1.0/2 113 | else: 114 | return 0 115 | else: 116 | return np.nan 117 | 118 | # 眼压 119 | def transform_1319_1320(df): 120 | if isinstance(df, Iterable): 121 | temp = re.findall(r'\-*\d+(?:\.\d+)?', df) 122 | if temp: 123 | return float(temp[0]) 124 | else: 125 | if '正常' in df: 126 | return 15 127 | elif '偏高' in df: 128 | return 22 129 | else: 130 | return np.nan 131 | else: 132 | return np.nan 133 | 134 | def get_pure_num_features(data_frame, threshold): 135 | pure_num_list = ['vid'] 136 | for c in data_frame.columns: 137 | if c != 'vid': 138 | data_frame[c] = pd.to_numeric(data_frame[c], errors='ignore') 139 | if data_frame[c].dtypes != 'object' and (data_frame[c].isnull().sum() * 1.0 / data_frame.shape[0] <= threshold): 140 | #if np.abs(ex_num[c].skew()) <= pian_tai: 141 | pure_num_list.append(c) 142 | return data_frame.loc[:, pure_num_list] 143 | 144 | def split_data(data_series, desc): 145 | check_array = ['' for _ in range(data_series.shape[0])] 146 | for pos, j in enumerate(data_series): 147 | if isinstance(j, Iterable): 148 | tmp = set(j.split('$')) 149 | for t in tmp: 150 | if isinstance(t, Iterable) and desc in t: 151 | check_array[pos] = t 152 | return check_array 153 | 154 | def qian_lie_xian(df, pos): 155 | if isinstance(df, Iterable): 156 | temp = re.findall(r'\-*\d+(?:\.\d+)?', df) 157 | if temp: 158 | if 'cm' in df: 159 | if len(temp) >= 3: 160 | return float(temp[pos]) * 10 161 | if 'mm' in df: 162 | if len(temp) >= 3: 163 | return float(temp[pos]) 164 | else: 165 | return np.nan 166 | 167 | def dpm_check(df): 168 | if isinstance(df, Iterable): 169 | temp = re.findall(r'\-*\d+(?:\.\d+)?', df) 170 | if temp and 'dpm' in df: 171 | return float(temp[0]) 172 | else: 173 | return np.nan 174 | 175 | def ex_num_from_str(data_frame): 176 | word_096_norm = ['004997', '0107', '100008', '100013', '100014','10002', '10003', '1106', '1107', '1110','1112', '1115', 177 | '1117', '1345', '139', '141', '143', '1474', '155', '1814', '1815', '183', '1845', '1850','1873', '191', 178 | '192', '193', '20002', '2165', '2174', '2371', '2376', '2390', '2403', '2404', '2405', '2406','2420', 179 | '269011', '300001', '300021', '300035', '300051','300069', '300070', '300073', '300074', '300076', '300078', 180 | '300093', '300113', '300119', '300125', '300129', '314','3193', '321', '3804', '3807', '669003', '809021', 181 | '979001', '979002', '979003', 'a701', 'a703'] 182 | shili = ['1308','1319','1320','1321','1322'] 183 | heart = ['0424','0425', 'vid'] 184 | for w in word_096_norm: 185 | data_frame[w] = data_frame[w].apply(extract_num_norm) 186 | data_frame['0424'] = data_frame['0424'].apply(transform_0424) 187 | data_frame['0425'] = data_frame['0425'].apply(transform_0425) 188 | data_frame['1308'] = data_frame['1308'].apply(transform_1308) 189 | data_frame['1319'] = data_frame['1319'].apply(transform_1319_1320) 190 | data_frame['1320'] = data_frame['1320'].apply(transform_1319_1320) 191 | data_frame['1321'] = data_frame['1321'].apply(transform_1321_1322) 192 | data_frame['1322'] = data_frame['1322'].apply(transform_1321_1322) 193 | data_frame['left_shen_no_voice'] = data_frame['left_shen'].apply(calc_voice_area, args=('无回声',)) 194 | #data_frame['left_shen_strong_voice'] = data_frame['left_shen'].apply(calc_voice_area, args=('强回声',)) 195 | data_frame['right_shen_no_voice'] = data_frame['right_shen'].apply(calc_voice_area, 
args=('无回声',)) 196 | data_frame['right_shen_strong_voice'] = data_frame['right_shen'].apply(calc_voice_area, args=('强回声',)) 197 | data_frame['jzx_no_voice_area'] = data_frame['jia_zx'].apply(calc_voice_area, args=('无回声区',)) 198 | data_frame['jzx_no_voice_jiejie'] = data_frame['jia_zx'].apply(calc_voice_area, args=('无回声结节',)) 199 | data_frame['jzx_low_voice_area'] = data_frame['jia_zx'].apply(calc_voice_area, args=('低回声区',)) 200 | data_frame['jzx_low_voice_jiejie'] = data_frame['jia_zx'].apply(calc_voice_area, args=('低回声结节',)) 201 | data_frame['liver_no_voice'] = data_frame['0113'].apply(calc_voice_area, args=('无回声',)) 202 | data_frame['liver_strong_voice'] = data_frame['0113'].apply(calc_voice_area, args=('强回声',)) 203 | data_frame['dan_strong_voice'] = data_frame['0114'].apply(calc_voice_area, args=('强回声',)) 204 | data_frame['qian_lie_xian_1'] = data_frame['0120'].apply(qian_lie_xian, args=(0,)) 205 | data_frame['qian_lie_xian_2'] = data_frame['0120'].apply(qian_lie_xian, args=(1,)) 206 | data_frame['qian_lie_xian_3'] = data_frame['0120'].apply(qian_lie_xian, args=(2,)) 207 | data_frame['dpm_from_3301'] = data_frame['3301'].apply(dpm_check) 208 | huishen = ['left_shen_no_voice','right_shen_no_voice','right_shen_strong_voice','jzx_no_voice_area','qian_lie_xian_2','qian_lie_xian_3','dpm_from_3301', 209 | 'jzx_no_voice_jiejie','jzx_low_voice_area','jzx_low_voice_jiejie','liver_no_voice','liver_strong_voice','dan_strong_voice','qian_lie_xian_1'] 210 | total = word_096_norm + shili + heart + huishen 211 | num_ex_str = data_frame.loc[:, total] 212 | return num_ex_str 213 | 214 | if __name__ == "__main__": 215 | part_1_2 = odps.get_table('origin_data_combine_part1_part2').to_df().to_pandas() 216 | part_1_2['jia_zx'] = split_data(part_1_2['0101'], '甲状腺') 217 | part_1_2['left_shen'] = split_data(part_1_2['0117'], '左肾') 218 | part_1_2['right_shen'] = split_data(part_1_2['0118'], '右肾') 219 | part_1_2_copy = part_1_2.copy(deep=True) 220 | ex_num_data = ex_num_from_str(part_1_2) 221 | print('the shape of the num_data get from word: ', ex_num_data.shape) 222 | pure_num_data = get_pure_num_features(part_1_2_copy, 0.96) 223 | pure_columns = [p for p in pure_num_data.columns if p != 'vid'] 224 | ex_num_columns = [i for i in ex_num_data.columns if i not in ['vid', '314','1308','1319','1320','1321','1322','0424','0425']] 225 | print('the shape of origin num data: ', pure_num_data.shape) 226 | numeric_data = pd.merge(pure_num_data, ex_num_data, on='vid', how='inner') 227 | exm_drop = [] 228 | for w in pure_columns + ex_num_columns: 229 | if np.abs(numeric_data[w].skew()) > 12: 230 | exm_drop.append(w) 231 | print(exm_drop) 232 | numeric_data.drop(exm_drop, axis=1, inplace=True) 233 | print('total data shape: ', numeric_data.shape) 234 | juz_num_data = DataFrame(numeric_data) 235 | juz_num_data.persist('juz_num_data_5_31') -------------------------------------------------------------------------------- /round2_rank10/data_pre_process/get_word_features.py: -------------------------------------------------------------------------------- 1 | import time 2 | import re 3 | import pandas as pd 4 | from odps import ODPS 5 | from odps.df import DataFrame 6 | import numpy as np 7 | from collections import Iterable 8 | 9 | import sys 10 | reload(sys) 11 | sys.setdefaultencoding('utf8') 12 | 13 | def transform_2302(df): 14 | try: 15 | if '健康' in df: 16 | if '亚健康' in df: 17 | return 1 18 | else: 19 | return 0 20 | elif '疾病' in df: 21 | return 2 22 | except Exception: 23 | return df 24 | 25 | 26 | def high_sugar(df): 27 | 
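# high_sugar and the sibling flags that follow (high_fat, high_pressure, ...)
# scan the concatenated free-text exam fields: 1 if a keyword matches,
# 0 if text is present without a match, NaN when the field is empty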
if df: 28 | if '血糖偏高' in df or '降糖' in df or '血糖' in df: 29 | return 1 30 | else: 31 | return 0 32 | else: 33 | return np.nan 34 | 35 | 36 | def high_fat(df): 37 | if df: 38 | if '血脂偏高' in df or '低脂' in df or '血脂' in df: 39 | return 1 40 | else: 41 | return 0 42 | else: 43 | return np.nan 44 | 45 | 46 | def high_pressure(df): 47 | if df: 48 | if '血压偏高' in df or '降压' in df or '血压' in df: 49 | return 1 50 | else: 51 | return 0 52 | else: 53 | return np.nan 54 | 55 | 56 | def higher_pressure(df): 57 | if df: 58 | if '血压偏高' not in df: 59 | if '高血压' in df: 60 | return 1 61 | else: 62 | return 0 63 | else: 64 | return np.nan 65 | 66 | 67 | def higher_fat(df): 68 | if df: 69 | if '血脂偏高' not in df: 70 | if '高血脂' in df: 71 | return 1 72 | else: 73 | return 0 74 | else: 75 | return np.nan 76 | 77 | 78 | def higher_sugar(df): 79 | if df: 80 | if '血糖偏高' not in df: 81 | if '高血糖' in df or '糖尿病' in df: 82 | return 1 83 | else: 84 | return 0 85 | else: 86 | return np.nan 87 | 88 | def coronary_heart_disease(df): 89 | if df: 90 | if '冠心病' in df or '冠状' in df: 91 | return 1 92 | else: 93 | return 0 94 | else: 95 | return np.nan 96 | 97 | 98 | def kidney(df): 99 | if df: 100 | if '肾' in df: 101 | return 1 102 | else: 103 | return 0 104 | else: 105 | return np.nan 106 | 107 | 108 | def smoke(df): 109 | if df: 110 | if '烟' in df: 111 | return 1 112 | else: 113 | return 0 114 | else: 115 | return np.nan 116 | 117 | def blood_pipe_style(df): 118 | try: 119 | if '良好' in df or '正常' in df: 120 | return 0 121 | elif '趋势' in df: 122 | return 1 123 | elif '轻度' in df: 124 | return 2 125 | elif '中度' in df: 126 | return 3 127 | elif '重度' in df: 128 | return 4 129 | elif '硬化' in df: 130 | return 5 131 | else: 132 | return np.nan 133 | except Exception: 134 | return df 135 | 136 | def ying_yang(df): 137 | try: 138 | if '+' in df and '-' in df: 139 | return 1 140 | elif '+' in df and '-' not in df: 141 | return 2 142 | elif ('-' in df or '阴' in df or '正常' in df or 'Normal' in df) and '+' not in df: 143 | return 0 144 | else: 145 | return 0 146 | except Exception: 147 | return df 148 | 149 | def HP_yy(df): 150 | try: 151 | if '阳' in df: 152 | return 1 153 | else: 154 | return 0 155 | except Exception: 156 | return df 157 | 158 | # 尿 159 | def urine(df): 160 | try: 161 | if '>=' in df: 162 | return 1 163 | else: 164 | return 0 165 | except Exception: 166 | return df 167 | 168 | def heart_rate(df): 169 | try: 170 | if df != '强弱不等': 171 | if '弱' in df or '远' in df or '低' in df: 172 | return 1 173 | elif '强' in df or '力' in df: 174 | return 3 175 | else: 176 | return 0 177 | else: 178 | return 2 179 | except Exception: 180 | return df 181 | 182 | def transform_421(df): 183 | try: 184 | if '齐' in df and '不' not in df: 185 | return 0 186 | else: 187 | return 1 188 | except Exception: 189 | return df 190 | 191 | def transform_403(df): 192 | try: 193 | if '大' in df and '无' not in df: 194 | return 1 195 | else: 196 | return 0 197 | except Exception: 198 | return df 199 | 200 | def transform_3399(df): 201 | try: 202 | if df == '黄色' or df == 'yellow': 203 | return 2 204 | elif df == '淡黄色' or df == '浅黄色': 205 | return 1 206 | elif df == '无色': 207 | return 0 208 | elif '红' in df: 209 | return 3 210 | elif df == '混浊': 211 | return 4 212 | else: 213 | return 5 214 | except Exception: 215 | return df 216 | 217 | def lung_voice(df): 218 | try: 219 | if '干啰' in df: 220 | return 1 221 | elif '湿啰' in df: 222 | return 2 223 | elif '哮鸣' in df: 224 | return 3 225 | elif '湿鸣' in df: 226 | return 4 227 | else: 228 | return 0 229 | except Exception: 
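# non-string input (e.g. NaN) falls through and is returned unchanged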
230 | return df 231 | 232 | def get_num_from_102_front(df): 233 | try: 234 | temp_x = re.findall('(\d+)/(\d+)', df) 235 | if temp_x: 236 | return float(temp_x[0][0]) 237 | except Exception: 238 | return np.nan 239 | 240 | 241 | def get_num_from_102_back(df): 242 | try: 243 | temp_x = re.findall('(\d+)/(\d+)', df) 244 | if temp_x: 245 | return float(temp_x[0][1]) 246 | except Exception: 247 | return np.nan 248 | 249 | def dannan_xirou(df): 250 | if df: 251 | if '胆囊息肉' in df: 252 | return 1 253 | else: 254 | return 0 255 | else: 256 | return np.nan 257 | 258 | 259 | def dannan_jieshi(df): 260 | if df: 261 | if '胆囊结石' in df: 262 | return 1 263 | else: 264 | return 0 265 | else: 266 | return np.nan 267 | 268 | 269 | def shen_jieshi(df): 270 | if df: 271 | if '肾结石' in df: 272 | return 1 273 | else: 274 | return 0 275 | else: 276 | return np.nan 277 | 278 | 279 | def shen_nangzhong(df): 280 | if df: 281 | if '肾囊肿' in df: 282 | return 1 283 | else: 284 | return 0 285 | else: 286 | return np.nan 287 | 288 | 289 | def gan_nangzhong(df): 290 | if df: 291 | if '肝囊肿' in df: 292 | return 1 293 | else: 294 | return 0 295 | else: 296 | return np.nan 297 | 298 | def map_deal_0113(temp): 299 | try: 300 | if isnan(float(temp)): 301 | return -1 302 | else: 303 | return float(temp) 304 | except Exception: 305 | temp = str(temp) 306 | value = 0 307 | if "弥漫性" in temp: 308 | value = 5 309 | if "欠清晰" in temp: 310 | value += 2 311 | if "粗" in temp: 312 | value += 0.5 313 | if "多发" in temp: 314 | value += 0.5 315 | if "斑点状" in temp: 316 | value += 1 317 | if "回声区" in temp: 318 | value += 1 319 | return value 320 | 321 | def gan_ying_hua(df): 322 | if df: 323 | if '肝脏' in df: 324 | return 1 325 | else: 326 | return 0 327 | else: 328 | return np.nan 329 | 330 | def strQ2B(ustring): 331 | """全角转半角""" 332 | ustring = str(ustring) 333 | rstring = "" 334 | for uchar in ustring: 335 | inside_code=ord(uchar) 336 | if inside_code == 12288: 337 | inside_code = 32 338 | elif (inside_code >= 65281 and inside_code <= 65374): 339 | inside_code -= 65248 340 | 341 | rstring += chr(inside_code) 342 | return rstring 343 | 344 | def extract_num_norm(df): 345 | if isinstance(df, Iterable): 346 | temp = re.findall(r'\-*\d+(?:\.\d+)?', df) 347 | if temp: 348 | return np.mean([float(i.replace('--', '')) for i in temp]) 349 | else: 350 | return np.nan 351 | else: 352 | return np.nan 353 | 354 | 355 | def is_sex(x): 356 | x = str(x) 357 | if ('阴道' in x)|('子宫' in x)|('妇' in x)|('乳' in x)|('孕' in x)|('卵巢' in x)|('女' in x)|('宫颈' in x)|('妊娠' in x)|('剖腹产' in x): 358 | return 1 359 | elif ('前列腺' in x)|('包皮' in x)|('包茎' in x)|('男' in x)|('阴茎' in x)|('睾丸' in x): 360 | return 2 361 | else: 362 | return 0 363 | 364 | def word2num(data_frame): 365 | one_hot_list = ['0101', '0102', '0113', '0409', '0413', '0434', '0439', 'a201', 'a202', '4001', '0705', 'a301', '0709', 366 | '0985', 'a705'] 367 | data_frame.loc[:, one_hot_list] = data_frame.loc[:, one_hot_list].fillna('') 368 | frame_409_434 = data_frame['0409'] + data_frame['0434'] + data_frame['0413'] + data_frame['4001'] + \ 369 | data_frame['a201'] + data_frame['a301'] + data_frame['a202'] + data_frame['0705'] + \ 370 | data_frame['0709'] + data_frame['0985'] + data_frame['0439'] 371 | data_frame['xue_ya_pian_gao'] = frame_409_434.apply(high_pressure) 372 | data_frame['gan_by_ts'] = data_frame['0113'].apply(map_deal_0113) 373 | data_frame['xue_zhi_pian_gao'] = frame_409_434.apply(high_fat) 374 | data_frame['xue_tang_pian_gao'] = frame_409_434.apply(high_sugar) 375 | data_frame['high_sugar'] = 
frame_409_434.apply(higher_sugar) 376 | data_frame['guan_xin_bin'] = frame_409_434.apply(coronary_heart_disease) 377 | data_frame['shen'] = frame_409_434.apply(kidney) 378 | data_frame['smoke'] = frame_409_434.apply(smoke) 379 | fat_liver_num = data_frame['0101'] + data_frame['0102'] + data_frame['0113'] + data_frame['a202'] 380 | data_frame['dannan_jieshi'] = fat_liver_num.apply(dannan_jieshi) 381 | data_frame['dannan_xirou'] = fat_liver_num.apply(dannan_xirou) 382 | data_frame['shen_jieshi'] = fat_liver_num.apply(shen_jieshi) 383 | data_frame['shen_nanz'] = fat_liver_num.apply(shen_nangzhong) 384 | data_frame['gan_nanz'] = fat_liver_num.apply(gan_nangzhong) 385 | data_frame['gan_ying_hua'] = data_frame['a705'].apply(gan_ying_hua) 386 | yy_list = ['3190', '3191', '3192', '3194', '3195', '3197', '3430', '100010'] 387 | for y in yy_list: 388 | data_frame[y] = data_frame[y].apply(ying_yang) 389 | data_frame['niao'] = data_frame['3193'].apply(urine) 390 | data_frame['heart_rate'] = data_frame['0420'].apply(heart_rate) 391 | data_frame['3399_w'] = data_frame['3399'].apply(transform_3399) 392 | data_frame['3301_w'] = data_frame['3301'].apply(HP_yy) 393 | data_frame['0403_w'] = data_frame['0403'].apply(transform_403) 394 | data_frame['0421_w'] = data_frame['0421'].apply(transform_421) 395 | data_frame['0405_w'] = data_frame['0405'].apply(lung_voice) 396 | data_frame['blood_pipe_style'] = data_frame['4001'].apply(blood_pipe_style) 397 | data_frame['health'] = data_frame['2302'].apply(transform_2302) 398 | data_frame['pres_front'] = data_frame['0102'].apply(get_num_from_102_front) 399 | data_frame['pres_back'] = data_frame['0102'].apply(get_num_from_102_back) 400 | data_frame['heart_times'] = data_frame['1001'].apply(extract_num_norm) 401 | 402 | data_frame['all_result'] = '_' 403 | for p in data_frame.columns: 404 | if p != 'vid': 405 | data_frame['all_result'] = data_frame['all_result'] + '_' + data_frame[p].astype('str') 406 | 407 | data_frame['gender'] = data_frame['all_result'].apply(is_sex) 408 | del data_frame['all_result'] 409 | 410 | new_add = ['xue_ya_pian_gao', 'xue_zhi_pian_gao', 'xue_tang_pian_gao', 'high_sugar', 'guan_xin_bin', 'shen', 'smoke','niao', 'heart_rate', '3399_w', 411 | '3301_w', '0403_w', '0421_w', '0405_w', 'gender','blood_pipe_style', 'health','pres_front', 'pres_back','heart_times', 'vid', 'dannan_jieshi', 412 | 'dannan_xirou', 'shen_jieshi', 'shen_nanz', 'gan_nanz','gan_ying_hua'] 413 | yy_list.extend(new_add) 414 | return data_frame.loc[:, yy_list] 415 | 416 | 417 | if __name__ == "__main__": 418 | part_1_2 = odps.get_table('origin_data_combine_part1_part2').to_df().to_pandas() 419 | word_data = word2num(part_1_2) 420 | print('the shape of word_data: ',word_data.shape) 421 | juz_word_data = DataFrame(word_data) 422 | juz_word_data.persist('juz_word_data_5_30') -------------------------------------------------------------------------------- /round1_rank2/team/team_feature_work.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2018/5/8 0008 下午 16:40 4 | # @Author : Juzphy 5 | 6 | import time 7 | import pandas as pd 8 | from math import isnan 9 | start_time=time.time() 10 | 11 | def filter_None(data): 12 | data=data[data['field_results']!=''] 13 | data=data[data['field_results']!='未查'] 14 | return data 15 | 16 | # 重复数据的拼接操作 17 | def merge_table(df): 18 | df['field_results'] = df['field_results'].astype(str) 19 | if df.shape[0] > 1: 20 | merge_df = " 
".join(list(df['field_results'])) 21 | else: 22 | merge_df = df['field_results'].values[0] 23 | return merge_df 24 | 25 | # 删除掉一些出现次数低,缺失比例大的字段,保留超过阈值的特征 26 | def remain_feat(df,thresh=0.9): 27 | exclude_feats = [] 28 | print('----------移除数据缺失多的字段-----------') 29 | print('移除之前总的字段数量',len(df.columns)) 30 | num_rows = df.shape[0] 31 | for c in df.columns: 32 | num_missing = df[c].isnull().sum() 33 | if num_missing == 0: 34 | continue 35 | missing_percent = num_missing / float(num_rows) 36 | if missing_percent > thresh: 37 | exclude_feats.append(c) 38 | print("移除缺失数据的字段数量: %s" % len(exclude_feats)) 39 | # 保留超过阈值的特征 40 | feats = [] 41 | for c in df.columns: 42 | if c not in exclude_feats: 43 | feats.append(c) 44 | print('剩余的字段数量',len(feats)) 45 | return feats 46 | 47 | def map_deal_3601(temp): 48 | try: 49 | if isnan(float(temp)): 50 | return -1 51 | else: 52 | return float(temp) 53 | except Exception: 54 | temp = str(temp) 55 | if "严重" in temp: 56 | return 4 57 | elif "中度" in temp: 58 | return 3 59 | elif "减少" in temp or "降低" in temp or "疏松" in temp: 60 | return 2 61 | else: 62 | return 1 63 | 64 | def map_deal_0102(temp): 65 | try: 66 | if isnan(float(temp)): 67 | return -1 68 | else: 69 | return float(temp) 70 | except Exception: 71 | temp = str(temp) 72 | value = 0 73 | if "脂肪" in temp: 74 | if "重" in temp: 75 | value = 4 76 | elif "中" in temp: 77 | value = 3 78 | elif "轻" in temp: 79 | value = 2 80 | else: 81 | value = 1 82 | else: 83 | value = 0.0 84 | if "多发" in temp: 85 | value += 0.5 86 | return value 87 | 88 | def map_deal_0113(temp): 89 | try: 90 | if isnan(float(temp)): 91 | return -1 92 | else: 93 | return float(temp) 94 | except Exception: 95 | temp = str(temp) 96 | value = 0 97 | if "弥漫性" in temp: 98 | value = 5 99 | if "欠清晰" in temp: 100 | value += 2 101 | if "粗" in temp: 102 | value += 0.5 103 | if "多发" in temp: 104 | value += 0.5 105 | if "斑点状" in temp: 106 | value += 1 107 | if "回声区" in temp: 108 | value += 1 109 | return value 110 | 111 | def map_deal_0114(temp): 112 | try: 113 | if isnan(float(temp)): 114 | return -1 115 | else: 116 | return float(temp) 117 | except Exception: 118 | temp = str(temp) 119 | value = 0 120 | if "毛糙" in temp: 121 | value = 4 122 | if "强回声" in temp: 123 | value += 1 124 | return value 125 | 126 | def map_deal_0115(temp): 127 | try: 128 | if isnan(float(temp)): 129 | return -1 130 | else: 131 | return float(temp) 132 | except Exception: 133 | temp = str(temp) 134 | value = 0 135 | if "不清晰" in temp: 136 | value = 4 137 | if "增强" in temp: 138 | value += 1 139 | return value 140 | 141 | def map_deal_0115(temp): 142 | try: 143 | if isnan(float(temp)): 144 | return -1 145 | else: 146 | return float(temp) 147 | except Exception: 148 | temp = str(temp) 149 | value = 0 150 | if "不清晰" in temp: 151 | value = 4 152 | if "增强" in temp: 153 | value += 1 154 | return value 155 | 156 | def map_deal_0116(temp): 157 | try: 158 | if isnan(float(temp)): 159 | return -1 160 | else: 161 | return float(temp) 162 | except Exception: 163 | temp = str(temp) 164 | value = 0 165 | if "不清晰" in temp: 166 | value = 4 167 | if "增强" in temp: 168 | value += 1 169 | return value 170 | 171 | def map_deal_0117(temp): 172 | try: 173 | if isnan(float(temp)): 174 | return -1 175 | else: 176 | return float(temp) 177 | except Exception: 178 | temp = str(temp) 179 | value = 0 180 | if "强回声" in temp: 181 | value = 4 182 | if "无回声" in temp: 183 | value += 1 184 | if "欠均匀" in temp: 185 | value += 1 186 | return value 187 | 188 | def map_deal_0118(temp): 189 | try: 190 | if isnan(float(temp)): 191 | 
return -1 192 | else: 193 | return float(temp) 194 | except Exception: 195 | temp = str(temp) 196 | value = 0 197 | if "强回声" in temp: 198 | value = 4 199 | if "无回声" in temp: 200 | value += 1 201 | if "欠均匀" in temp: 202 | value += 1 203 | return value 204 | 205 | def map_deal_0118(temp): 206 | try: 207 | if isnan(float(temp)): 208 | return -1 209 | else: 210 | return float(temp) 211 | except Exception: 212 | temp = str(temp) 213 | value = 0 214 | if "强回声" in temp: 215 | value = 4 216 | if "无回声" in temp: 217 | value += 1 218 | if "欠均匀" in temp: 219 | value += 1 220 | return value 221 | 222 | def map_deal_0503(temp): 223 | try: 224 | if isnan(float(temp)): 225 | return -1 226 | else: 227 | return float(temp) 228 | except Exception: 229 | temp = str(temp) 230 | value = 0 231 | if "分泌物多" in temp: 232 | value = 8 233 | if "分泌物中" in temp: 234 | value = 5 235 | if "分泌物少" in temp: 236 | value = 3 237 | if "浓性" in temp: 238 | value += 1 239 | if "充血" in temp: 240 | value += 1 241 | if "黄色" in temp: 242 | value += 0.5 243 | return value 244 | 245 | def map_deal_0509(temp): 246 | try: 247 | if isnan(float(temp)): 248 | return -1 249 | else: 250 | return float(temp) 251 | except Exception: 252 | temp = str(temp) 253 | value = 0 254 | if "充血" in temp: 255 | value = 8 256 | if "肥大" in temp: 257 | value = 5 258 | if "轻糜" in temp: 259 | value += 1 260 | if "中糜" in temp: 261 | value += 1.5 262 | if "囊" in temp: 263 | value += 0.5 264 | return value 265 | 266 | def map_deal_0516(temp): 267 | try: 268 | if isnan(float(temp)): 269 | return -1 270 | else: 271 | return float(temp) 272 | except Exception: 273 | temp = str(temp) 274 | value = 0 275 | if "前位" in temp: 276 | value = 8 277 | if "后位" in temp: 278 | value = 5 279 | if "平位" in temp: 280 | value = 3 281 | if "增大" in temp: 282 | value += 1 283 | if "硬" in temp: 284 | value += 0.5 285 | return value 286 | 287 | def map_deal_0539(temp): 288 | try: 289 | if isnan(float(temp)): 290 | return -1 291 | else: 292 | return float(temp) 293 | except Exception: 294 | temp = str(temp) 295 | value = 0 296 | if "分泌物" in temp: 297 | value += 1 298 | if "肥大" in temp: 299 | value += 2 300 | if "充血" in temp: 301 | value += 3 302 | if "炎" in temp: 303 | value += 0.5 304 | return value 305 | 306 | def map_deal_2302(temp): 307 | try: 308 | if isnan(float(temp)): 309 | return -1 310 | else: 311 | return float(temp) 312 | except Exception: 313 | temp = str(temp) 314 | value = 0 315 | if "亚健康" in temp: 316 | value = 3 317 | else: 318 | value = 1 319 | return value 320 | 321 | def map_deal_1316(temp): 322 | try: 323 | if isnan(float(temp)): 324 | return -1 325 | else: 326 | return float(temp) 327 | except Exception: 328 | temp = str(temp) 329 | value = 0 330 | if "正常" in temp or "未见" in temp: 331 | pass 332 | else: 333 | value += 2 334 | return value 335 | 336 | def map_deal_0101(temp): 337 | try: 338 | if isnan(float(temp)): 339 | return -1 340 | else: 341 | return float(temp) 342 | except Exception: 343 | temp = str(temp) 344 | value = 0 345 | if "低回声" in temp or "回声区" in temp: 346 | value += 1 347 | return value 348 | 349 | def map_deal_0119(temp): 350 | try: 351 | if isnan(float(temp)): 352 | return -1 353 | else: 354 | return float(temp) 355 | except Exception: 356 | temp = str(temp) 357 | value = 0 358 | if "欠佳" in temp: 359 | value = 2 360 | return value 361 | 362 | def map_deal_0121(temp): 363 | try: 364 | if isnan(float(temp)): 365 | return -1 366 | else: 367 | return float(temp) 368 | except Exception: 369 | temp = str(temp) 370 | value = 0 371 | if "低回声" in temp or 
"回声区" in temp: 372 | value += 1 373 | return value 374 | 375 | def map_deal_0122(temp): 376 | try: 377 | if isnan(float(temp)): 378 | return -1 379 | else: 380 | return float(temp) 381 | except Exception: 382 | temp = str(temp) 383 | value = 0 384 | if "回声团" in temp or "回声区" in temp: 385 | value += 1 386 | return value 387 | 388 | def map_deal_0123(temp): 389 | try: 390 | if isnan(float(temp)): 391 | return -1 392 | else: 393 | return float(temp) 394 | except Exception: 395 | temp = str(temp) 396 | value = 0 397 | if "回声团" in temp or "回声区" in temp: 398 | value += 1 399 | return value 400 | 401 | def map_deal_A705(temp): 402 | try: 403 | if isnan(float(temp)): 404 | return -1 405 | else: 406 | return float(temp) 407 | except Exception: 408 | temp = str(temp) 409 | value = 0 410 | if "衰减" in temp: 411 | value += 5 412 | return value 413 | 414 | def map_deal_0911(temp): 415 | try: 416 | if isnan(float(temp)): 417 | return -1 418 | else: 419 | return float(temp) 420 | except Exception: 421 | temp = str(temp) 422 | value = 0 423 | if "肿大" in temp: 424 | value += 2 425 | return value 426 | 427 | def map_deal_0912(temp): 428 | try: 429 | if isnan(float(temp)): 430 | return -1 431 | else: 432 | return float(temp) 433 | except Exception: 434 | temp = str(temp) 435 | value = 0 436 | if "无肿大" in temp or "未见" in temp: 437 | pass 438 | else: 439 | value += 2 440 | return value 441 | 442 | def map_deal_0929(temp): 443 | try: 444 | if isnan(float(temp)): 445 | return -1 446 | else: 447 | return float(temp) 448 | except Exception: 449 | temp = str(temp) 450 | value = 0 451 | if "不全" in temp: 452 | value = 3 453 | if "增生" in temp: 454 | value = 6 455 | return value 456 | 457 | def map_deal_A202(temp): 458 | try: 459 | if isnan(float(temp)): 460 | return -1 461 | else: 462 | return float(temp) 463 | except Exception: 464 | temp = str(temp) 465 | value = 0 466 | if "陈旧" in temp: 467 | value = 5 468 | if "灶" in temp: 469 | value += 1 470 | return value 471 | 472 | def map_deal_1102(temp): 473 | try: 474 | if isnan(float(temp)): 475 | return -1 476 | else: 477 | return float(temp) 478 | except Exception: 479 | temp = str(temp) 480 | value = 0 481 | if "增生" in temp: 482 | value += 1 483 | return value 484 | 485 | def map_deal_0208(temp): 486 | try: 487 | if isnan(float(temp)): 488 | return -1 489 | else: 490 | return float(temp) 491 | except Exception: 492 | temp = str(temp) 493 | value = 0 494 | if "正常" in temp or "未见" in temp: 495 | pass 496 | else: 497 | value += 1 498 | return value 499 | 500 | def map_deal_0209(temp): 501 | try: 502 | if isnan(float(temp)): 503 | return -1 504 | else: 505 | return float(temp) 506 | except Exception: 507 | temp = str(temp) 508 | value = 0 509 | if "正常" in temp or "未见" in temp: 510 | pass 511 | else: 512 | value += 1 513 | return value 514 | 515 | def map_deal_0210(temp): 516 | try: 517 | if isnan(float(temp)): 518 | return -1 519 | else: 520 | return float(temp) 521 | except Exception: 522 | temp = str(temp) 523 | value = 0 524 | if "正常" in temp or "未见" in temp: 525 | pass 526 | else: 527 | value += 1 528 | return value 529 | 530 | def map_deal_0215(temp): 531 | try: 532 | if isnan(float(temp)): 533 | return -1 534 | else: 535 | return float(temp) 536 | except Exception: 537 | temp = str(temp) 538 | value = 0 539 | if "充血" in temp: 540 | value = 5 541 | 542 | if "正常" in temp or "未见" in temp: 543 | pass 544 | else: 545 | value += 1 546 | return value 547 | 548 | def map_deal_0217(temp): 549 | try: 550 | if isnan(float(temp)): 551 | return -1 552 | else: 553 | return 
float(temp) 554 | except Exception: 555 | temp = str(temp) 556 | value = 0 557 | if "肿" in temp: 558 | value = 3 559 | return value 560 | 561 | def map_deal_4001(temp): 562 | try: 563 | if isnan(float(temp)): 564 | return -1 565 | else: 566 | return float(temp) 567 | except Exception: 568 | temp = str(temp) 569 | value = 0 570 | if "轻度" in temp: 571 | value = 3 572 | if "中度" in temp: 573 | value = 5 574 | if "重度" in temp: 575 | value = 8 576 | return value 577 | 578 | def map_deal_1001(temp): 579 | try: 580 | if isnan(float(temp)): 581 | return -1 582 | else: 583 | return float(temp) 584 | except Exception: 585 | temp = str(temp) 586 | value = 0 587 | if "过缓" in temp or "不齐" in temp or "偏" in temp: 588 | value += 3 589 | return value 590 | 591 | def map_deal_0409(temp): 592 | try: 593 | if isnan(float(temp)): 594 | return -1 595 | else: 596 | return float(temp) 597 | except Exception: 598 | temp = str(temp) 599 | value = 0 600 | if "血压" in temp: 601 | value += 9 602 | if "糖尿" in temp: 603 | value += 3 604 | if "脂肪" in temp: 605 | value += 5 606 | return value 607 | 608 | def map_deal_0421(temp): 609 | try: 610 | if isnan(float(temp)): 611 | return -1 612 | else: 613 | return float(temp) 614 | except Exception: 615 | temp = str(temp) 616 | value = 0 617 | if "不齐" in temp: 618 | value += 3 619 | return value 620 | 621 | def map_deal_0424(temp): 622 | try: 623 | if isnan(float(temp)): 624 | return -1 625 | else: 626 | return float(temp) 627 | except Exception: 628 | temp = str(temp) 629 | value = 0 630 | if "次" in temp: 631 | if "70" in temp: 632 | value = 70 633 | else: 634 | value = 80 635 | return value 636 | 637 | def map_deal_0434(temp): 638 | try: 639 | if isnan(float(temp)): 640 | return -1 641 | else: 642 | return float(temp) 643 | except Exception: 644 | temp = str(temp) 645 | value = 0 646 | if "血压" in temp: 647 | value += 9 648 | if "糖尿" in temp: 649 | value += 3 650 | if "脂肪" in temp: 651 | value += 5 652 | if "心" in temp: 653 | value += 1 654 | return value 655 | 656 | def map_deal_1402(temp): 657 | try: 658 | if isnan(float(temp)): 659 | return -1 660 | else: 661 | return float(temp) 662 | except Exception: 663 | temp = str(temp) 664 | value = 0 665 | if "硬" in temp: 666 | value += 5 667 | if "低" in temp: 668 | value += 1 669 | if "慢" in temp: 670 | value += 1 671 | return value 672 | 673 | def map_deal_0120(temp): 674 | try: 675 | if isnan(float(temp)): 676 | return -1 677 | else: 678 | return float(temp) 679 | except Exception: 680 | temp = str(temp) 681 | value = 0 682 | if "强回声" in temp: 683 | value += 5 684 | if "低" in temp: 685 | value += 1 686 | return value 687 | 688 | def map_deal_0984(temp): 689 | try: 690 | if isnan(float(temp)): 691 | return -1 692 | else: 693 | return float(temp) 694 | except Exception: 695 | temp = str(temp) 696 | value = 0 697 | if "增" in temp: 698 | value += 5 699 | return value 700 | 701 | def map_deal_100010(temp): 702 | try: 703 | if isnan(float(temp)): 704 | return -1 705 | else: 706 | return float(temp) 707 | except Exception: 708 | temp = str(temp) 709 | value = 0 710 | if "+" in temp: 711 | value += 5 712 | return value 713 | 714 | def map_deal_3190(temp): 715 | try: 716 | if isnan(float(temp)): 717 | return -1 718 | else: 719 | return float(temp) 720 | except Exception: 721 | temp = str(temp) 722 | value = 0 723 | if "+" in temp: 724 | value += 5 725 | return value 726 | 727 | def map_deal_3191(temp): 728 | try: 729 | if isnan(float(temp)): 730 | return -1 731 | else: 732 | return float(temp) 733 | except Exception: 734 | temp = str(temp) 
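# dipstick-style text result: any '+' scores 5, everything else scores 0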
735 | value = 0 736 | if "+" in temp: 737 | value += 5 738 | return value 739 | 740 | def map_deal_3192(temp): 741 | try: 742 | if isnan(float(temp)): 743 | return -1 744 | else: 745 | return float(temp) 746 | except Exception: 747 | temp = str(temp) 748 | value = 0 749 | if "+" in temp: 750 | value += 5 751 | return value 752 | 753 | 754 | def map_deal_3195(temp): 755 | try: 756 | if isnan(float(temp)): 757 | return -1 758 | else: 759 | return float(temp) 760 | except Exception: 761 | temp = str(temp) 762 | value = 0 763 | if "+" in temp: 764 | value += 5 765 | return value 766 | 767 | 768 | def map_deal_3196(temp): 769 | try: 770 | if isnan(float(temp)): 771 | return -1 772 | else: 773 | return float(temp) 774 | except Exception: 775 | temp = str(temp) 776 | value = 0 777 | if "+" in temp: 778 | value += 5 779 | return value 780 | 781 | 782 | def map_deal_3197(temp): 783 | try: 784 | if isnan(float(temp)): 785 | return -1 786 | else: 787 | return float(temp) 788 | except Exception: 789 | temp = str(temp) 790 | value = 0 791 | if "+" in temp: 792 | value += 5 793 | return value 794 | 795 | 796 | def map_deal_3430(temp): 797 | value = 0 798 | try: 799 | if isnan(float(temp)): 800 | return -1 801 | else: 802 | return float(temp) 803 | except Exception: 804 | temp = str(temp) 805 | if "+" in temp: 806 | value += 5 807 | return value 808 | 809 | 810 | def map_deal_3399(temp): 811 | try: 812 | if isnan(float(temp)): 813 | return -1 814 | else: 815 | return float(temp) 816 | except Exception: 817 | temp = str(temp) 818 | value = 0 819 | if "淡" in temp: 820 | value += 5 821 | return value 822 | 823 | 824 | item_list = ['3601', '0102', '0113', '0114', '0115', '0116', 825 | '0117', '0118', '0503', '0509', '0516', '0539', 826 | '2302', '1316', '0101', '0119', '0121', '0122', 827 | '0123', 'A705', '0911', '0912', '0929', 'A202', 828 | '1102', '0208', '0209', '0210', '0215', '0217', 829 | '4001', '1001', '0409', '0421', '0424', '0434', 830 | '1402', '0120', '0984', '100010', '3190', '3191', 831 | '3192', '3195', '3196', '3197', '3430', '3399'] 832 | 833 | map_list = [map_deal_3601, map_deal_0102, map_deal_0113, map_deal_0114, map_deal_0115, map_deal_0116, 834 | map_deal_0117, map_deal_0118, map_deal_0503, map_deal_0509, map_deal_0516, map_deal_0539, 835 | map_deal_2302, map_deal_1316, map_deal_0101, map_deal_0119, map_deal_0121, map_deal_0122, 836 | map_deal_0123, map_deal_A705, map_deal_0911, map_deal_0912, map_deal_0929, map_deal_A202, 837 | map_deal_1102, map_deal_0208, map_deal_0209, map_deal_0210, map_deal_0215, map_deal_0217, 838 | map_deal_4001, map_deal_1001, map_deal_0409, map_deal_0421, map_deal_0424, map_deal_0434, 839 | map_deal_1402, map_deal_0120, map_deal_0984, map_deal_100010, map_deal_3190, map_deal_3191, 840 | map_deal_3192, map_deal_3195, map_deal_3196, map_deal_3197, map_deal_3430, map_deal_3399 841 | ] 842 | 843 | 844 | def get_file(): 845 | train = pd.read_csv('../data/meinian_round1_train_20180408.csv', sep=',', encoding='gbk') 846 | test = pd.read_csv('../data//meinian_round1_test_b_20180505.csv', sep=',', encoding='gbk') 847 | data_part1 = pd.read_csv('../data/meinian_round1_data_part1_20180408.txt', sep='$', encoding='utf-8') 848 | data_part2 = pd.read_csv('../data/meinian_round1_data_part2_20180408.txt', sep='$', encoding='utf-8') 849 | 850 | # data_part1和data_part2进行合并,并剔除掉与train、test不相关vid所在的行 851 | part1_2 = pd.concat([data_part1, data_part2], axis=0) # {0/'index', 1/'columns'}, default 0 852 | part1_2 = pd.DataFrame(part1_2).sort_values('vid').reset_index(drop=True) 853 
844 | def get_file():
845 |     train = pd.read_csv('../data/meinian_round1_train_20180408.csv', sep=',', encoding='gbk')
846 |     test = pd.read_csv('../data/meinian_round1_test_b_20180505.csv', sep=',', encoding='gbk')
847 |     data_part1 = pd.read_csv('../data/meinian_round1_data_part1_20180408.txt', sep='$', encoding='utf-8')
848 |     data_part2 = pd.read_csv('../data/meinian_round1_data_part2_20180408.txt', sep='$', encoding='utf-8')
849 | 
850 |     # merge data_part1 and data_part2, then drop rows whose vid does not appear in train/test
851 |     part1_2 = pd.concat([data_part1, data_part2], axis=0)  # axis: {0/'index', 1/'columns'}, default 0
852 |     part1_2 = pd.DataFrame(part1_2).sort_values('vid').reset_index(drop=True)
853 |     vid_set = pd.concat([train['vid'], test['vid']], axis=0)
854 |     vid_set = pd.DataFrame(vid_set).sort_values('vid').reset_index(drop=True)
855 |     part1_2 = part1_2[part1_2['vid'].isin(vid_set['vid'])]
856 |     # filter out examination items (table_id) judged useless by common sense
857 |     part1_2 = filter_None(part1_2)
858 |     # light data processing
859 |     print(part1_2.shape)
860 |     vid_tabid_group = part1_2.groupby(['vid', 'table_id']).size().reset_index()
861 |     print('------------------------------dedupe and merge-----------------------------')
862 |     vid_tabid_group['new_index'] = vid_tabid_group['vid'] + '_' + vid_tabid_group['table_id']
863 |     vid_tabid_group_dup = vid_tabid_group[vid_tabid_group[0] > 1]['new_index']
864 | 
865 |     # print(vid_tabid_group_dup.head())  # 000330ad1f424114719b7525f400660b_0102
866 |     part1_2['new_index'] = part1_2['vid'] + '_' + part1_2['table_id']
867 | 
868 |     dup_part = part1_2[part1_2['new_index'].isin(list(vid_tabid_group_dup))]
869 |     dup_part = dup_part.sort_values(['vid', 'table_id'])
870 |     unique_part = part1_2[~part1_2['new_index'].isin(list(vid_tabid_group_dup))]
871 | 
872 |     part1_2_dup = dup_part.groupby(['vid', 'table_id']).apply(merge_table).reset_index()
873 |     part1_2_dup.rename(columns={0: 'field_results'}, inplace=True)
874 |     part1_2_res = pd.concat([part1_2_dup, unique_part[['vid', 'table_id', 'field_results']]])
875 | 
876 |     # rows-to-columns pivot
877 |     print('--------------------------rebuild index and columns---------------------------')
878 |     merge_part1_2 = part1_2_res.pivot(index='vid', values='field_results', columns='table_id')
879 |     merge_part1_2.to_csv('../data/merge_part1_2.csv', encoding='utf-8')
880 |     del merge_part1_2
881 |     time.sleep(10)
882 |     print('------------------------reload merge_part1_2 from disk--------------------------')
883 |     merge_part1_2 = pd.read_csv('../data/merge_part1_2.csv', sep=',', encoding='utf-8')
884 |     print('--------------new part1_2 assembled----------')
885 |     print(merge_part1_2.shape)
886 |     feats = remain_feat(merge_part1_2, thresh=0.96)  # the 96% missing-value cut-off described in the README
887 |     merge_part1_2 = merge_part1_2[feats]
888 | 
889 |     for i in range(len(item_list)):
890 |         merge_part1_2[item_list[i]] = merge_part1_2[item_list[i]].apply(map_list[i])
891 | 
892 |     tran_kind_dict = {}
893 |     for x in merge_part1_2.columns:
894 |         if merge_part1_2[x].dtype == 'object':
895 |             a = len(merge_part1_2[x].unique())
896 |             tran_kind_dict[x] = a
897 | 
898 |     drop_list = []
899 |     onehot_list = []
900 |     for x in tran_kind_dict.keys():
901 | 
902 |         if tran_kind_dict[x] <= 200:
903 |             onehot_list.append(x)
904 |         else:
905 |             if x != 'vid':
906 |                 drop_list.append(x)
907 | 
908 |     from sklearn import preprocessing
909 |     lbl = preprocessing.LabelEncoder()
910 |     for x in onehot_list:
911 |         merge_part1_2[x] = lbl.fit_transform(merge_part1_2[x].map(lambda x: str(x)))
912 | 
913 |     merge_part1_2.drop(drop_list, axis=1, inplace=True)
914 |     merge_part1_2 = merge_part1_2.convert_objects(convert_numeric=True)  # deprecated in newer pandas; pd.to_numeric is the modern route
915 |     train_of_part = merge_part1_2[merge_part1_2['vid'].isin(train['vid'])]
916 |     test_of_part = merge_part1_2[merge_part1_2['vid'].isin(test['vid'])]
917 |     train = pd.merge(train, train_of_part, on='vid')
918 |     test = pd.merge(test, test_of_part, on='vid')
919 |     return train, test
920 | 
921 | 
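# remain_feat (called at line 886 above) is defined earlier in this file;
# a minimal sketch of the behaviour the call site relies on, assuming it
# keeps the columns whose missing-value share does not exceed `thresh`
# (the 96%/98% cut-offs mentioned in the README):
def remain_feat_sketch(df, thresh=0.96):
    missing_share = df.isnull().mean()  # per-column NaN fraction
    return list(missing_share[missing_share <= thresh].index)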
922 | def do_map(merge_part1_2):
923 |     for i in range(len(item_list)):
924 |         merge_part1_2[item_list[i]] = merge_part1_2[item_list[i]].apply(map_list[i])
925 | 
926 |     merge_part1_2.info()
927 |     tran_kind_dict = {}
928 |     for x in merge_part1_2.columns:
929 |         if merge_part1_2[x].dtype == 'object':
930 |             a = len(merge_part1_2[x].unique())
931 |             tran_kind_dict[x] = a
932 | 
933 |     drop_list = []
934 |     onehot_list = []
935 |     for x in tran_kind_dict.keys():
936 | 
937 |         if tran_kind_dict[x] <= 200:
938 |             onehot_list.append(x)
939 |         else:
940 |             if x != 'vid':
941 |                 drop_list.append(x)
942 | 
943 |     from sklearn import preprocessing
944 |     lbl = preprocessing.LabelEncoder()
945 |     for x in onehot_list:
946 |         merge_part1_2[x] = lbl.fit_transform(merge_part1_2[x].map(lambda x: str(x)))
947 | 
948 |     merge_part1_2.drop(drop_list, axis=1, inplace=True)
949 |     merge_part1_2 = merge_part1_2.convert_objects(convert_numeric=True)  # deprecated in newer pandas; pd.to_numeric is the modern route
950 |     return merge_part1_2
951 | 
--------------------------------------------------------------------------------
/round1_rank2/code/main.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | # @Time : 2018/5/7 13:02
4 | # @Author : Juzphy
5 | import time
6 | import pandas as pd
7 | import lightgbm as lgb
8 | import numpy as np
9 | import re
10 | from collections import Iterable  # moved to collections.abc in newer Python
11 | from sklearn.model_selection import KFold
12 | from sklearn.metrics import classification_report
13 | import warnings
14 | from team.team_feature_work import get_file
15 | 
16 | warnings.filterwarnings('ignore')
17 | 
18 | 
19 | class DataPreProcess(object):
20 |     def __init__(self, threshold):
21 |         self.thresh = threshold
22 | 
23 |     def pre_process(self):
24 |         # filter out useless table_id rows
25 |         def filter_none(data):
26 |             data = data[data['field_results'] != '']
27 |             data = data[data['field_results'] != '未查']  # '未查' = "not examined"
28 |             return data
29 | 
30 |         # concatenate duplicated records for the same (vid, table_id)
31 |         def merge_table(df):
32 |             df['field_results'] = df['field_results'].astype(str)
33 |             if df.shape[0] > 1:
34 |                 merge_df = " ".join(list(df['field_results']))
35 |             else:
36 |                 merge_df = df['field_results'].values[0]
37 |             return merge_df
38 | 
39 |         # drop rarely filled fields with a high missing ratio; keep features that pass the threshold
40 |         def get_remain_feats(df):
41 |             exclude_feats = set()
42 |             print('----------current missing-value removal threshold: {}-----------'.format(self.thresh))
43 |             print('----------removing fields with heavy missing data-----------')
44 |             print('number of fields before removal', len(df.columns))
45 |             num_rows = df.shape[0]
46 |             for c in df.columns:
47 |                 num_missing = df[c].isnull().sum()
48 |                 if num_missing == 0:
49 |                     continue
50 |                 missing_percent = num_missing / float(num_rows)
51 |                 if missing_percent > self.thresh:
52 |                     exclude_feats.add(c)
53 |             print("number of fields removed: %s" % len(exclude_feats))
54 |             # keep the features that survive the threshold
55 |             remain_feats = set(df.columns) - exclude_feats
56 |             print('number of fields remaining', len(remain_feats))
57 |             return list(remain_feats)
58 | 
59 |         origin_train = pd.read_csv('../data/meinian_round1_train_20180408.csv', sep=',', encoding='gbk')
60 |         origin_test = pd.read_csv('../data/meinian_round1_test_b_20180505.csv', sep=',', encoding='gbk')
61 |         data_part1 = pd.read_csv('../data/meinian_round1_data_part1_20180408.txt', sep='$', encoding='utf-8')
62 |         data_part2 = pd.read_csv('../data/meinian_round1_data_part2_20180408.txt', sep='$', encoding='utf-8')
63 |         # merge data_part1 and data_part2, then drop rows whose vid does not appear in train/test
64 |         # axis: {0/'index', 1/'columns'}, default 0
65 |         part1_2 = pd.concat([data_part1, data_part2], axis=0)
66 |         part1_2 = pd.DataFrame(part1_2).sort_values('vid').reset_index(drop=True)
67 |         vid_set = pd.concat([origin_train['vid'], origin_test['vid']], axis=0)
68 |         vid_set = pd.DataFrame(vid_set).sort_values('vid').reset_index(drop=True)
69 |         part1_2 = part1_2[part1_2['vid'].isin(vid_set['vid'])]
70 |         part1_2 = filter_none(part1_2)
71 |         print(part1_2.shape)
72 |         vid_tabid_group = part1_2.groupby(['vid', 'table_id']).size().reset_index()
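        # The block below collapses duplicate (vid, table_id) records
        # before the pivot. Hypothetical example of what merge_table does:
        #
        #   vid     table_id  field_results        ->  field_results
        #   abc001  0102      125/80                   125/80 130/85
        #   abc001  0102      130/85
        #
        # Without this step DataFrame.pivot would raise on duplicate
        # (index, column) pairs.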
73 |         print('------------------------------dedupe and merge-----------------------------')
74 |         vid_tabid_group['new_index'] = vid_tabid_group['vid'] + '_' + vid_tabid_group['table_id']
75 |         vid_tabid_group_dup = vid_tabid_group[vid_tabid_group[0] > 1]['new_index']
76 |         part1_2['new_index'] = part1_2['vid'] + '_' + part1_2['table_id']
77 |         dup_part = part1_2[part1_2['new_index'].isin(list(vid_tabid_group_dup))]
78 |         dup_part = dup_part.sort_values(['vid', 'table_id'])
79 |         unique_part = part1_2[~part1_2['new_index'].isin(list(vid_tabid_group_dup))]
80 |         part1_2_dup = dup_part.groupby(['vid', 'table_id']).apply(merge_table).reset_index()
81 |         part1_2_dup.rename(columns={0: 'field_results'}, inplace=True)
82 |         part1_2_res = pd.concat([part1_2_dup, unique_part[['vid', 'table_id', 'field_results']]])
83 | 
84 |         # rows-to-columns pivot
85 |         print('--------------------------rebuild index and columns---------------------------')
86 |         merge_part1_2 = part1_2_res.pivot(index='vid', values='field_results', columns='table_id')
87 |         merge_part1_2.to_csv('../data/merge_part1_2_{}.csv'.format(self.thresh), encoding='utf-8')
88 |         del merge_part1_2
89 |         time.sleep(10)
90 |         print('------------------------reload merge_part1_2 from disk--------------------------')
91 |         merge_part1_2 = pd.read_csv('../data/merge_part1_2_{}.csv'.format(self.thresh), sep=',', encoding='utf-8')
92 |         print('--------------new part1_2 assembled----------')
93 |         print(merge_part1_2.shape)
94 |         feats = get_remain_feats(merge_part1_2)
95 |         return merge_part1_2[feats]
96 | 
97 | 
98 | class FeatureWork(object):
99 |     def __init__(self, thresh_num):
100 |         self.thresh_num = thresh_num
101 | 
102 |     def get_features(self):
103 |         # fatty liver severity, graded 0-4
104 |         def transform_101_102_113(df):
105 |             if df:
106 |                 if '脂肪肝趋势' in df:
107 |                     return 1
108 |                 elif '轻度' in df:
109 |                     if '中' not in df:
110 |                         return 2
111 |                     else:
112 |                         return 3
113 |                 elif '中度' in df:
114 |                     if '重' not in df:
115 |                         return 3
116 |                     else:
117 |                         return 4
118 |                 elif '重度' in df:
119 |                     return 4
120 |                 else:
121 |                     return 0
122 |             else:
123 |                 return np.nan
124 | 
125 |         def transform_2302(df):
126 |             try:
127 |                 if '健康' in df:
128 |                     if '亚健康' in df:
129 |                         return 1
130 |                     else:
131 |                         return 0
132 |                 elif '疾病' in df:
133 |                     return 2
134 |             except Exception:
135 |                 return df
136 | 
137 |         def high_sugar(df):
138 |             if df:
139 |                 if '血糖偏高' in df or '降糖' in df or '血糖' in df:
140 |                     return 1
141 |                 else:
142 |                     return 0
143 |             else:
144 |                 return np.nan
145 | 
146 |         def high_fat(df):
147 |             if df:
148 |                 if '血脂偏高' in df or '低脂' in df or '血脂' in df:
149 |                     return 1
150 |                 else:
151 |                     return 0
152 |             else:
153 |                 return np.nan
154 | 
155 |         def high_pressure(df):
156 |             if df:
157 |                 if '血压偏高' in df or '降压' in df or '血压' in df:
158 |                     return 1
159 |                 else:
160 |                     return 0
161 |             else:
162 |                 return np.nan
163 | 
164 |         def higher_pressure(df):
165 |             if df:
166 |                 if '血压偏高' not in df:
167 |                     if '高血压' in df:
168 |                         return 1
169 |                     else:
170 |                         return 0
171 |             else:
172 |                 return np.nan
173 | 
174 |         def higher_fat(df):
175 |             if df:
176 |                 if '血脂偏高' not in df:
177 |                     if '高血脂' in df:
178 |                         return 1
179 |                     else:
180 |                         return 0
181 |             else:
182 |                 return np.nan
183 | 
184 |         def higher_sugar(df):
185 |             if df:
186 |                 if '血糖偏高' not in df:
187 |                     if '高血糖' in df or '糖尿病' in df:
188 |                         return 1
189 |                     else:
190 |                         return 0
191 |             else:
192 |                 return np.nan
193 | 
194 |         def fatty_liver(df):
195 |             if df:
196 |                 if '脂肪肝' in df:
197 |                     return 1
198 |                 else:
199 |                     return 0
200 |             else:
201 |                 return np.nan
202 | 
203 |         def coronary_heart_disease(df):
204 |             if df:
205 |                 if '冠心病' in df or '冠状' in df:
206 |                     return 1
207 |                 else:
208 |                     return 0
209 |             else:
210 |                 return np.nan
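        # high_sugar ... smoke (below) all flag keyword hits in the free
        # text; a compact factory sketch of that shared shape
        # (illustrative only -- the explicit versions are what run):
        def make_keyword_flag(*keywords):
            def flag(text):
                if not text:
                    return np.nan
                return 1 if any(k in text for k in keywords) else 0
            return flag
        # e.g. fatty_liver == make_keyword_flag('脂肪肝')
        #      coronary_heart_disease == make_keyword_flag('冠心病', '冠状')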
211 | 
212 |         def kidney(df):
213 |             if df:
214 |                 if '肾' in df:
215 |                     return 1
216 |                 else:
217 |                     return 0
218 |             else:
219 |                 return np.nan
220 | 
221 |         def smoke(df):
222 |             if df:
223 |                 if '烟' in df:
224 |                     return 1
225 |                 else:
226 |                     return 0
227 |             else:
228 |                 return np.nan
229 | 
230 |         def strQ2B(df):
231 |             """Convert full-width characters to half-width."""
232 |             if isinstance(df, Iterable):
233 |                 rstring = ""
234 |                 for uchar in df:
235 |                     inside_code = ord(uchar)
236 |                     # a full-width space converts directly
237 |                     if inside_code == 12288:
238 |                         inside_code = 32
239 |                     elif 65281 <= inside_code <= 65374:
240 |                         inside_code -= 65248
241 |                     rstring += chr(inside_code)
242 |                 return rstring
243 |             else:
244 |                 return df
245 | 
246 |         def extract_num(df):
247 |             try:
248 |                 df = float(df)
249 |                 if df <= 0:
250 |                     return np.nan
251 |                 return df
252 |             except Exception:
253 |                 if '.' in df:
254 |                     temp = re.findall(r'(\d+\.\d+)', df)
255 |                 else:
256 |                     temp = re.findall(r'(\d+)', df)
257 |                 if temp:
258 |                     return float(temp[0])
259 |                 else:
260 |                     return np.nan
261 | 
262 |         def blood_pipe_style(df):
263 |             try:
264 |                 if '良好' in df or '正常' in df:
265 |                     return 0
266 |                 elif '趋势' in df:
267 |                     return 1
268 |                 elif '轻度' in df:
269 |                     return 2
270 |                 elif '中度' in df:
271 |                     return 3
272 |                 elif '重度' in df:
273 |                     return 4
274 |                 elif '硬化' in df:
275 |                     return 5
276 |                 else:
277 |                     return np.nan
278 |             except Exception:
279 |                 return df
280 | 
281 |         def ying_yang(df):
282 |             try:
283 |                 if '+' in df and '-' in df:
284 |                     return 1
285 |                 elif '+' in df and '-' not in df:
286 |                     return 2
287 |                 elif ('-' in df or '阴' in df or '正常' in df or 'Normal' in df) and '+' not in df:
288 |                     return 0
289 |                 else:
290 |                     return 0
291 |             except Exception:
292 |                 return df
293 | 
294 |         def HP_yy(df):
295 |             try:
296 |                 if '阳' in df:
297 |                     return 1
298 |                 else:
299 |                     return 0
300 |             except Exception:
301 |                 return df
302 | 
303 |         # urine specific gravity flag (applied to field 3193 below)
304 |         def urine(df):
305 |             try:
306 |                 if '>=' in df:
307 |                     return 1
308 |                 else:
309 |                     return 0
310 |             except Exception:
311 |                 return df
312 | 
313 |         def heart_rate(df):
314 |             try:
315 |                 if df != '强弱不等':
316 |                     if '弱' in df or '远' in df or '低' in df:
317 |                         return 1
318 |                     elif '强' in df or '力' in df:
319 |                         return 3
320 |                     else:
321 |                         return 0
322 |                 else:
323 |                     return 2
324 |             except Exception:
325 |                 return df
326 | 
327 |         def transform_421(df):
328 |             try:
329 |                 if '齐' in df and '不' not in df:
330 |                     return 0
331 |                 else:
332 |                     return 1
333 |             except Exception:
334 |                 return df
335 | 
336 |         def transform_430(df):
337 |             try:
338 |                 if df == '软':
339 |                     return 1
340 |                 elif df == '中':
341 |                     return 2
342 |                 elif df == '硬':
343 |                     return 3
344 |                 else:
345 |                     return 0
346 |             except Exception:
347 |                 return df
348 | 
349 |         def transform_403(df):
350 |             try:
351 |                 if '大' in df and '无' not in df:
352 |                     return 1
353 |                 else:
354 |                     return 0
355 |             except Exception:
356 |                 return df
357 | 
358 |         def transform_3399(df):
359 |             try:
360 |                 if df == '黄色' or df == 'yellow':
361 |                     return 2
362 |                 elif df == '淡黄色' or df == '浅黄色':
363 |                     return 1
364 |                 elif df == '无色':
365 |                     return 0
366 |                 elif '红' in df:
367 |                     return 3
368 |                 elif df == '混浊':
369 |                     return 4
370 |                 else:
371 |                     return 5
372 |             except Exception:
373 |                 return df
374 | 
375 |         def lung_voice(df):
376 |             try:
377 |                 if '干啰' in df:
378 |                     return 1
379 |                 elif '湿啰' in df:
380 |                     return 2
381 |                 elif '哮鸣' in df:
382 |                     return 3
383 |                 elif '湿鸣' in df:
384 |                     return 4
385 |                 else:
386 |                     return 0
387 |             except Exception:
388 |                 return df
389 | 
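        # ying_yang above encodes qualitative +/- lab strings ordinally,
        # e.g. (hypothetical raw values):
        #   '阴性' / '-' / 'Normal' -> 0    '+-' -> 1    '+' / '++' -> 2
        # Non-string values fall into the except branch and pass through
        # unchanged.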
390 |         def one_hot(data_frame):
391 |             one_hot_list = ['101', '102', '113', '409', '413', '434', '439', 'A201', 'A202', '4001', '705', 'A301', '709', '985']
392 |             data_frame.loc[:, one_hot_list] = data_frame.loc[:, one_hot_list].fillna('')
393 |             data_frame['4001'] = data_frame['4001'].astype(str)
394 |             data_frame['705'] = data_frame['705'].astype(str)
395 |             data_frame['709'] = data_frame['709'].astype(str)
396 |             data_frame['A301'] = data_frame['A301'].astype(str)
397 |             data_frame['985'] = data_frame['985'].astype(str)
398 |             data_frame['439'] = data_frame['439'].astype(str)
399 |             frame_409_434 = data_frame['409'] + data_frame['434'] + data_frame['413'] + data_frame['4001'] + \
400 |                             data_frame['A201'] + data_frame['A301'] + data_frame['A202'] + data_frame['705'] + \
401 |                             data_frame['709'] + data_frame['985'] + data_frame['439']
402 |             data_frame['血压偏高'] = frame_409_434.apply(high_pressure)
403 |             data_frame['血脂偏高'] = frame_409_434.apply(high_fat)
404 |             data_frame['血糖偏高'] = frame_409_434.apply(high_sugar)
405 |             data_frame['高血糖'] = frame_409_434.apply(higher_sugar)
406 |             data_frame['高血脂'] = frame_409_434.apply(higher_fat)
407 |             data_frame['高血压'] = frame_409_434.apply(higher_pressure)
408 |             data_frame['脂肪肝'] = frame_409_434.apply(fatty_liver)
409 |             data_frame['冠心病'] = frame_409_434.apply(coronary_heart_disease)
410 |             data_frame['肾问题'] = frame_409_434.apply(kidney)
411 |             data_frame['吸烟'] = frame_409_434.apply(smoke)
412 |             fat_liver_num = data_frame['101'] + data_frame['102'] + data_frame['113']
413 |             data_frame['脂肪肝程度'] = fat_liver_num.apply(transform_101_102_113)
414 | 
415 |         def cm2mm(df):
416 |             try:
417 |                 if 'cm' in df:
418 |                     temp_cm = re.findall(r'(\d+(?:\.\d+)?).*?x(\d+(?:\.\d+)?)', df)  # capture the two dimensions around the 'x'
419 |                     if temp_cm:
420 |                         return float(temp_cm[0][0]) * float(temp_cm[0][1]) * 100  # cm * cm -> mm^2
421 |                 elif 'mm' in df:
422 |                     temp_mm = re.findall(r'(\d+(?:\.\d+)?).*?x(\d+(?:\.\d+)?)', df)
423 |                     if temp_mm:
424 |                         return float(temp_mm[0][0]) * float(temp_mm[0][1])
425 |                 else:
426 |                     return np.nan
427 |             except Exception:
428 |                 return np.nan
429 | 
430 |         def get_num_from_102_front(df):
431 |             try:
432 |                 temp_x = re.findall(r'(\d+)/(\d+)', df)
433 |                 if temp_x:
434 |                     return float(temp_x[0][0])
435 |             except Exception:
436 |                 return np.nan
437 | 
438 |         def get_num_from_102_back(df):
439 |             try:
440 |                 temp_x = re.findall(r'(\d+)/(\d+)', df)
441 |                 if temp_x:
442 |                     return float(temp_x[0][1])
443 |             except Exception:
444 |                 return np.nan
445 | 
446 |         def word2num(data_frame):
447 |             drop_list = ['3193', '420', '431', '976', '429', '422', '423', '426', '3400', '3485', '3486', '30007']
448 |             drop_list2 = ['101', '102', '113', '409', '413', '434', 'A201', 'A202', '4001', '705', 'A301', '709']
449 |             drop_list3 = ['1001', '114', '116', '117', '118', '121', '985', '439']
450 |             drop_list.extend(drop_list2)
451 |             drop_list.extend(drop_list3)
452 |             yy_list = ['3190', '3191', '3192', '3194', '3195', '3196', '3197', '3430', '100010']
453 |             for y in yy_list:
454 |                 data_frame[y] = data_frame[y].apply(ying_yang)
455 |             data_frame['尿比重'] = data_frame['3193'].apply(urine)
456 |             data_frame['心音'] = data_frame['420'].apply(heart_rate)
457 |             data_frame['430'] = data_frame['430'].apply(transform_430)
458 |             data_frame['3399'] = data_frame['3399'].apply(transform_3399)
459 |             data_frame['3301'] = data_frame['3301'].apply(HP_yy)
460 |             data_frame['403'] = data_frame['403'].apply(transform_403)
461 |             data_frame['421'] = data_frame['421'].apply(transform_421)
462 |             data_frame['405'] = data_frame['405'].apply(lung_voice)
463 |             data_frame['gender'] = data_frame['121'].apply(lambda n: 1 if isinstance(n, Iterable) else 0)  # presence of field 121 serves as a gender proxy
464 |             data_frame['血管弹性'] = data_frame['4001'].apply(blood_pipe_style)
465 |             data_frame['2302'] = data_frame['2302'].apply(transform_2302)
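            # cm2mm worked example (hypothetical strings, after the loop
            # below lower-cases and normalises '×'/'*' to 'x'):
            #   '1.5cmx2.0cm' -> 1.5 * 2.0 * 100 = 300.0   (cm^2 in mm^2)
            #   '15mmx20mm'   -> 15.0 * 20.0    = 300.0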
466 |             one_hot(data_frame)
467 |             for x, y in zip(['113', '114', '116', '117', '118'], ['肝脏回声', '胆囊回声', '脾脏回声', '左肾回声', '右肾回声']):
468 |                 data_frame[x] = data_frame[x].apply(strQ2B)
469 |                 data_frame[x] = data_frame[x].apply(lambda n: n.lower().replace('×', 'x').replace('*', 'x') if
470 |                                                     isinstance(n, Iterable) else n)
471 |                 data_frame[y] = data_frame[x].apply(cm2mm)
472 |             data_frame['血压_front'] = data_frame['102'].apply(get_num_from_102_front)
473 |             data_frame['血压_back'] = data_frame['102'].apply(get_num_from_102_back)
474 |             data_frame['心跳次数'] = data_frame['1001'].apply(extract_num)
475 |             data_frame.drop(drop_list, axis=1, inplace=True)
476 |             return data_frame
477 | 
478 |         def file_split(data_frame, path):
479 |             with open(path, encoding='utf8') as f:
480 |                 feature_list = [i for i in f.read().split(', ')]
481 |             features = data_frame[feature_list]
482 |             return features
483 | 
484 |         def save_all_num(data_frame):
485 |             for c in data_frame.columns:
486 |                 if c != 'vid':
487 |                     data_frame[c] = data_frame[c].apply(extract_num)
488 |                     q_num = data_frame[c].quantile(0.9) * 1.5
489 |                     data_frame[c] = data_frame[c].apply(lambda x: x if x < q_num else np.nan)  # clip outliers above 1.5x the 90th percentile
490 |             return data_frame
491 | 
492 |         # merge in the teammate's engineered features
493 |         def add_new_feature(df_mine, df_team, save_path):
494 |             columns = list(set(df_team.columns) - set(df_mine.columns) -
495 |                            set(['舒张压', '收缩压', '血清高密度脂蛋白', '血清低密度脂蛋白', '血清甘油三酯']))
496 |             columns.append('vid')
497 |             new_data = df_team[columns]
498 |             final_data = pd.merge(df_mine, new_data, on='vid')
499 |             final_data.to_csv(save_path, encoding='utf8', index=False)
500 | 
501 |         dpp = DataPreProcess(threshold=self.thresh_num)
502 |         all_data = dpp.pre_process()
503 |         all_data.columns = [a[1:] if a.startswith('0') else a for a in all_data.columns]  # strip the leading zero so ids match the feature lists ('0102' -> '102')
504 |         train_set = pd.read_csv('../data/meinian_round1_train_20180408.csv', sep=',', encoding='gbk')
505 |         for t in train_set.columns:
506 |             if t != 'vid':
507 |                 train_set[t] = train_set[t].apply(extract_num)
508 |         test_set = pd.read_csv('../data/meinian_round1_test_b_20180505.csv', sep=',', encoding='gbk')
509 |         num_data_temp = file_split(all_data, '../features/num_label.txt')
510 |         word_data_temp = file_split(all_data, '../features/word_label.txt')
511 |         num_data_temp.to_csv('../data/num_data.csv', encoding='utf8', index=False)
512 |         word_data_temp.to_csv('../data/word_data.csv', encoding='utf8', index=False)
513 |         num_data = pd.read_csv('../data/num_data.csv', encoding='utf8')
514 |         word_data = pd.read_csv('../data/word_data.csv', encoding='utf8')
515 |         num_data = save_all_num(num_data)
516 |         word_data = word2num(word_data)
517 |         transform_data = pd.merge(num_data, word_data, on='vid')
518 |         train_of_part = transform_data[transform_data['vid'].isin(train_set['vid'])]
519 |         test_of_part = transform_data[transform_data['vid'].isin(test_set['vid'])]
520 |         train_set = pd.merge(train_set, train_of_part, on='vid')
521 |         train_set.loc[train_set['vid'] == '7685d48685028a006c84070f68854ce1', '舒张压'] = 64  # manual fixes for mis-entered labels
522 |         train_set.loc[train_set['vid'] == 'fa04c8db6d201b9f705a00c3086481b0', '舒张压'] = 74
523 |         train_set.loc[train_set['vid'] == 'de82a4130c4907cff4bfb96736674bbc', '血清低密度脂蛋白'] = 1.22
524 |         train_set.loc[train_set['vid'] == 'd9919661f0a45fbcacc4aa2c1119c3d2', '血清低密度脂蛋白'] = 0.12
525 |         train_set.loc[train_set['vid'] == '798d859a63044a8a5addf1f8c528629e', '血清低密度脂蛋白'] = 0.06
526 |         test_set = pd.merge(test_set, test_of_part, on='vid')
527 |         team_train, team_test = get_file()
528 |         add_new_feature(train_set, team_train, '../data/train_set_merge.csv')
529 |         add_new_feature(test_set, team_test, '../data/test_set_merge.csv')
530 |         print('*************************train and test sets written successfully*************************')
531 | 
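# The competition metric implemented by eval_metric/eval_error below is
# the mean squared logarithmic error:
#     score = (1/n) * sum_i (log(pred_i + 1) - log(y_i + 1))**2
# averaged over the five targets. 'reg_sqrt': True in the LightGBM params
# fits sqrt-transformed labels (predictions are squared back
# automatically), a common way to damp large label values under a
# log-style metric.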
532 | 
533 | class LGBRegression(object):
534 |     def __init__(self):
535 |         self.params = {
536 |             'learning_rate': 0.01,
537 |             'boosting_type': 'gbdt',
538 |             'objective': 'mse',
539 |             'num_leaves': 62,
540 |             'reg_sqrt': True,
541 |             'feature_fraction': 0.8,
542 |             'bagging_fraction': 0.8,
543 |             'bagging_freq': 2,
544 |             'num_threads': -1,
545 |             'min_data_in_leaf': 5,
546 |             'verbose': -1
547 |         }
548 | 
549 |     def eval_metric(self, pred, labels):
550 |         return np.mean(np.power(np.log(pred + 1) - np.log(labels + 1), 2))
551 | 
552 |     def eval_error(self, pred, train_data):
553 |         labels = train_data.get_label()
554 |         score = np.mean(np.power(np.log(pred + 1) - np.log(labels + 1), 2))
555 |         return 'meinian', score, False
556 | 
557 |     def lgb_regression_model(self, df, label, use_feature, true_test, submission_data):
558 |         print("lightgbm: start training on label {}...".format(label))
559 |         value4preds = df[label]
560 |         value4preds = value4preds[value4preds.notnull()]
561 |         df = df.iloc[value4preds.index]  # positions match labels here because train carries a default RangeIndex
562 |         train_data = df.loc[:, use_feature]
563 |         print(train_data.shape)
564 |         scores = np.zeros(len(value4preds))
565 |         submission_scores = np.zeros((len(submission_data), 5))
566 |         num_round = 8000
567 |         kf = KFold(n_splits=5, shuffle=True, random_state=1024)
568 |         for t, (train_index, test_index) in enumerate(kf.split(train_data, value4preds), start=1):
569 |             print('training fold {}...'.format(t))
570 |             x_train, x_test = train_data.iloc[train_index], train_data.iloc[test_index]
571 |             y_train, y_test = value4preds.iloc[train_index], value4preds.iloc[test_index]
572 |             lgb_train = lgb.Dataset(x_train, y_train)
573 |             lgb_test = lgb.Dataset(x_test, y_test)
574 |             gbm = lgb.train(self.params,
575 |                             lgb_train,
576 |                             num_boost_round=num_round,
577 |                             valid_sets=lgb_test,
578 |                             verbose_eval=100,
579 |                             feval=self.eval_error,
580 |                             early_stopping_rounds=100)
581 |             scores[test_index] = gbm.predict(x_test)
582 |             submission_scores[:, t - 1] = gbm.predict(true_test)
583 |         submission_data[label] = np.mean(submission_scores, axis=1).round(3)
584 |         return self.eval_metric(scores, value4preds)
585 | 
586 | 
587 | class LGBClassification(object):
588 |     def __init__(self):
589 |         self.params = {
590 |             'learning_rate': 0.01,
591 |             'boosting_type': 'gbdt',
592 |             'objective': 'binary',
593 |             'metric': 'auc',
594 |             'num_leaves': 62,
595 |             'feature_fraction': 0.8,
596 |             'bagging_fraction': 0.8,
597 |             'bagging_freq': 2,
598 |             'verbose': -1,
599 |             'min_data_in_leaf': 5,
600 |         }
601 | 
602 |     # pos: classification cut-off
603 |     # df: training set
604 |     # label: mainly 血清甘油三酯 (cut-off 4) and 血清低密度脂蛋白 (cut-off 5)
605 |     # use_feature: features used for training
606 |     # save_path: where the classification result is saved
607 |     def lgb_classification_model(self, pos, df, label, use_feature, test_class, save_path):
608 |         print("start training with cut-off {}...".format(pos))
609 |         df['pos_{}'.format(pos)] = df[label].apply(lambda x: 1 if x > pos else 0)
610 |         test_preds = df['pos_{}'.format(pos)]
611 |         test4lgb = test_class.loc[:, use_feature]
612 |         train_preds = df[use_feature]
613 |         kf = KFold(n_splits=5, random_state=1024, shuffle=True)
614 |         pred_labels = np.zeros(df.shape[0])
615 |         submission_label = np.zeros((test4lgb.shape[0], 5))
616 |         for t, (train_index, test_index) in enumerate(kf.split(train_preds, test_preds), start=1):
617 |             print('training fold {}...'.format(t))
618 |             X_train, X_test = train_preds.iloc[train_index], train_preds.iloc[test_index]
619 |             y_train, y_test = test_preds.iloc[train_index], test_preds.iloc[test_index]
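            # scale_pos_weight note: the LightGBM convention is
            # n_negative / n_positive, which up-weights a rare positive
            # class; pos_weight below is instead the fold's positive-class
            # share (y_train.sum() / y_train.size), which down-weights the
            # positives.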
620 |             pos_weight = y_train.sum() / y_train.size
621 |             print(pos_weight)
622 |             self.params.update({'scale_pos_weight': pos_weight})
623 |             lgb_train = lgb.Dataset(X_train, y_train)
624 |             lgb_test = lgb.Dataset(X_test, y_test)
625 |             gbm = lgb.train(self.params,
626 |                             lgb_train,
627 |                             num_boost_round=8000,
628 |                             valid_sets=lgb_test,
629 |                             verbose_eval=100,
630 |                             early_stopping_rounds=100)
631 |             pred_labels[X_test.index] = np.where(gbm.predict(X_test) > 0.5, 1, 0)
632 |             self.params.pop('scale_pos_weight')
633 |             submission_label[:, t - 1] = np.where(gbm.predict(test4lgb) > 0.5, 1, 0)
634 |         test_class['pos_{}'.format(pos)] = np.where(np.sum(submission_label, axis=1) >= 1, 1, 0)  # positive if any fold votes positive
635 |         print(classification_report(test_preds, pred_labels))  # (y_true, y_pred)
636 |         test_class.to_csv(save_path, index=False, encoding='utf8')
637 | 
638 | 
639 | if __name__ == "__main__":
640 |     fw = FeatureWork(thresh_num=0.9)
641 |     fw.get_features()
642 |     train = pd.read_csv('../data/train_set_merge.csv', encoding='utf8', low_memory=False)
643 |     test = pd.read_csv('../data/test_set_merge.csv', encoding='utf8', low_memory=False)
644 |     print(train.shape, test.shape)
645 |     predict_features = ['舒张压', '收缩压', '血清高密度脂蛋白', '血清低密度脂蛋白', '血清甘油三酯']
646 |     train[predict_features] = train[predict_features]  # no-op
647 |     test[predict_features] = test[predict_features]  # no-op
648 |     use_features = [t for t in test.columns if t != 'vid' and t not in predict_features]
649 |     test_data = test.loc[:, use_features]
650 |     submission = test.loc[:, ['vid', '收缩压', '舒张压', '血清甘油三酯', '血清高密度脂蛋白', '血清低密度脂蛋白']]
651 |     base_line_score = np.zeros(5)
652 |     start = time.time()
653 |     lgb_reg = LGBRegression()
654 |     for i, j in enumerate(predict_features):
655 |         base_line_score[i] = lgb_reg.lgb_regression_model(train, j, use_features, test_data, submission)
656 |     print(dict(zip(predict_features, base_line_score)))
657 |     print('CV training took {} seconds'.format(time.time() - start))
658 |     print('offline score:', np.mean(base_line_score))
659 |     date1 = time.strftime('%Y%m%d_%H%M%S')
660 |     submission.to_csv('../submit/submit_{}.csv'.format(date1), index=None, header=None, encoding='utf8')
661 |     time.sleep(10)
662 |     lgr_class = LGBClassification()
663 |     lgr_class.lgb_classification_model(4, train, '血清甘油三酯', use_features, test, '../data/fat_class_pos4.csv')
664 |     time.sleep(10)
665 |     reg_test = pd.read_csv('../data/fat_class_pos4.csv', encoding='utf8', low_memory=False)
666 |     pos_eq_1 = reg_test[reg_test['pos_4'] == 1]
667 |     test_eq_1 = pos_eq_1.loc[:, use_features]
668 |     submission_gt_4 = pos_eq_1.loc[:, ['vid', '血清甘油三酯']]
669 |     train_gt_4 = train[train['血清甘油三酯'] >= 4]
670 |     train_gt_4.index = list(range(train_gt_4.shape[0]))
671 |     lgb_reg.lgb_regression_model(train_gt_4, '血清甘油三酯', use_features, test_eq_1, submission_gt_4)
672 |     submission_gt_4.to_csv('../data/submit_gt_4.csv', index=None, header=None, encoding='utf8')
673 |     gt_4_index = submission[submission['vid'].isin(submission_gt_4['vid'])].index
674 |     submission_temp = submission.loc[gt_4_index, ['vid', '血清甘油三酯']]
675 |     merge_fat = pd.merge(submission_temp, submission_gt_4, on='vid')
676 |     temp_columns = [tc for tc in merge_fat.columns if tc != 'vid']
677 |     replace_num = np.max(merge_fat.loc[:, temp_columns], axis=1)  # keep the larger of the stage-1 and stage-2 predictions
678 |     submission.loc[gt_4_index, '血清甘油三酯'] = replace_num.values
679 |     date2 = time.strftime('%Y%m%d_%H%M%S')
680 |     submission.to_csv('../submit/submit_{}.csv'.format(date2), index=None, encoding='utf8')
681 | 
--------------------------------------------------------------------------------
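The SQL below trains one XGBoost model per fold (tables tl_xgb_train_1..5 built by split_5_fold_data_xgb.py) and scores the full test table juz_test_6_7_xgb into jz_xgb_pred_val_1..5. A minimal sketch of combining those five outputs — calc_xgb_test_loss_and_save.py holds the repo's actual logic; this sketch assumes, as in the other ODPS scripts, a preconfigured global `odps` client and that the `result` column carries log-scale predictions:

import numpy as np
import pandas as pd

# pull the five per-fold prediction tables produced by the SQL below
folds = [odps.get_table('jz_xgb_pred_val_{}'.format(i)).to_df().to_pandas()
         for i in range(1, 6)]
# one column of log-scale predictions per fold, aligned on vid
pred = pd.concat([f.set_index('vid')['result'] for f in folds], axis=1)
# average in log space, then invert the log transform applied in
# add_prefix_for_xgb_model.py (log_tl = np.log(tl))
tl_pred = np.exp(pred.mean(axis=1))

--------------------------------------------------------------------------------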
/round2_rank10/xgb_model/xgb_in_odps.sql: -------------------------------------------------------------------------------- 1 | drop table if exists jz_xgb_pred_val_1; 2 | drop table if exists jz_xgb_pred_val_2; 3 | drop table if exists jz_xgb_pred_val_3; 4 | drop table if exists jz_xgb_pred_val_4; 5 | drop table if exists jz_xgb_pred_val_5; 6 | DROP OFFLINEMODEL IF EXISTS jz_xgb_model_1; 7 | DROP OFFLINEMODEL IF EXISTS jz_xgb_model_2; 8 | DROP OFFLINEMODEL IF EXISTS jz_xgb_model_3; 9 | DROP OFFLINEMODEL IF EXISTS jz_xgb_model_4; 10 | DROP OFFLINEMODEL IF EXISTS jz_xgb_model_5; 11 | 12 | 13 | -- train-fold-1 14 | PAI 15 | -name xgboost 16 | -project algo_public 17 | -DinputTableName="tl_xgb_train_1" 18 | -DmodelName="jz_xgb_model_1" 19 | -Deta="0.01" 20 | -Dobjective="reg:linear" 21 | -DitemDelimiter="," 22 | -Dseed="1024" 23 | -Dnum_round="1800" 24 | -DlabelColName="log_tl" 25 | -DenableSparse="false" 26 | -Dmax_depth="5" 27 | -Dsubsample="0.7" 28 | -Dcolsample_bytree="0.7" 29 | -Dgamma="0" 30 | -Dlambda="50" 31 | -DfeatureColNames="snp357_cc,snp357_cg,snp357_gg,snp191_cc,snp191_cg,snp191_gg,snp277_aa,snp277_ag,snp277_gg,snp39_cc,snp39_ct,snp39_tt,snp149_aa,snp149_ag,snp149_gg,snp298_cc,snp298_cg,snp298_gg,snp198_gg,snp198_gt,snp198_tt,snp234_aa,snp234_ag,snp234_gg,snp115_cc,snp115_ct,snp115_tt,snp291_aa,snp291_ag,snp291_gg,snp208_ac,snp208_at,snp208_cc,snp208_ct,snp208_tt,snp44_aa,snp44_ac,snp44_cc,snp223_gg,snp223_gt,snp223_tt,snp47_cc,snp47_ct,snp47_tt,jz_10009,jz_10012,jz_10013,jz_1124,jz_1125,jz_1844,jz_2168,jz_2387,jz_269026,jz_279006,jz_300006,jz_300007,jz_300009,jz_30006,jz_300066,jz_300092,jz_300117,jz_300150,jz_300151,jz_300152,jz_319100,jz_319298,jz_3801,jz_3805,jz_459154,jz_459155,jz_459156,jz_459158,jz_459159,jz_459206,jz_459207,jz_459210,jz_459211,jz_709001,jz_709003,jz_809020,jz_809022,jz_809024,jz_809027,jz_809037,jz_809038,jz_809039,jz_809040,jz_809041,jz_809042,jz_809043,jz_809044,jz_809045,jz_809046,jz_809047,jz_809048,jz_809049,jz_809050,jz_809051,jz_809052,jz_809053,jz_809054,jz_809055,jz_809056,jz_809057,jz_809058,jz_809059,jz_809060,jz_809061,jz_989001,jz_989002,jz_004997,jz_0107,jz_2165,jz_2376,jz_2390,jz_300051,jz_300069,jz_300070,jz_300073,jz_300074,jz_300076,jz_300078,jz_300113,jz_300119,jz_300125,jz_321,jz_3804,jz_3807,jz_1308,jz_1319,jz_1320,jz_1321,jz_1322,jz_0424,jz_0425_x,jz_left_shen_no_voice,jz_right_shen_no_voice,jz_right_shen_strong_voice,jz_jzx_no_voice_area,jz_qian_lie_xian_2,jz_qian_lie_xian_3,jz_dpm_from_3301,jz_jzx_no_voice_jiejie,jz_jzx_low_voice_area,jz_jzx_low_voice_jiejie,jz_liver_no_voice,jz_liver_strong_voice,jz_dan_strong_voice,jz_qian_lie_xian_1,jz_3190_x,jz_3194,jz_3195_x,jz_xue_ya_pian_gao,jz_xue_zhi_pian_gao,jz_xue_tang_pian_gao,jz_high_sugar,jz_guan_xin_bin,jz_shen,jz_smoke,jz_niao,jz_heart_rate,jz_3399_w,jz_3301_w,jz_0403_w,jz_0421_w,jz_0405_w,jz_gender,jz_blood_pipe_style,jz_health,jz_pres_front,jz_pres_back,jz_heart_times,jz_dannan_jieshi,jz_dannan_xirou,jz_shen_jieshi,jz_shen_nanz,jz_gan_nanz,jz_gan_ying_hua,jz_skin,jz_skin_dis,jz_liver,jz_liver_dis,jz_liver_fatty,jz_liver_hepa,jz_carotid_artery,jz_carotid_artery_dis,jz_carotid_artery_mild,jz_carotid_artery_hard,jz_carotid_artery_high_sugar,jz_carotid_artery_high_cho,jz_carotid_artery_heart,jz_thyroid,jz_thyroid_dis,jz_thyroid_up,jz_thyroid_down,jz_bile,jz_bile_dis,jz_bile_infla,jz_bile_cut,jz_kidney,jz_kidney_dis,jz_kidney_infla,jz_kidney_cut,jz_pancreatic,jz_pancreatic_dis,jz_lung,jz_lung_dis,jz_lung_infla,jz_lung_kerl,jz_lung_cut,jz_cervical,jz_cervical_dis,jz_cervical_infla,jz_cer
vical_cut,jz_cervical_end,jz_uterus,jz_uterus_dis,jz_uterus_infla,jz_uterus_cut,jz_uterus_end,jz_ovary,jz_ovary_dis,jz_ovary_infla,jz_ovary_cut,jz_vagina,jz_vagina_dis,jz_vagina_infla,jz_vagina_end,jz_bladder,jz_bladder_dis,jz_gland,jz_gland_dis,jz_gland_infla,jz_gland_cut,jz_anus,jz_anus_dis,jz_lympha,jz_lympha_dis,jz_lympha_infla,jz_mam,jz_mam_dis,jz_mam_infla,jz_mam_end,jz_mam_cut,jz_bone,jz_bone_less,jz_teeth,jz_teeth_dis,jz_teeth_smoke,jz_teeth_old,jz_tonsil,jz_tonsil_dis,jz_tonsil_infla,jz_tonsil_cut,jz_throat,jz_throat_dis,jz_nerve,jz_nerve_dis,jz_digestion_dis,jz_high_fat,jz_women,jz_men,jz_menopause,jz_old,jz_histroy_val,jz_31,jz_193,jz_2406,jz_269017,jz_269024,jz_0112,jz_269019,jz_317,jz_979012,jz_269013,jz_979019,jz_669009,jz_1840,jz_300129,jz_2372,jz_269025,jz_300035,jz_979021,jz_979022,jz_1873,jz_669021,jz_100005,jz_100006,jz_300093,jz_2386,jz_100010,jz_192,jz_269008,jz_100008,jz_979006,jz_1127,jz_269003,jz_269004,jz_191,jz_0105,jz_a703,jz_1814,jz_269005,jz_183,jz_809008,jz_979007,jz_300011,jz_669005,jz_269006,jz_269016,jz_143,jz_378,jz_a701,jz_1474,jz_2404,jz_979017,jz_979023,jz_3197,jz_316,jz_709038,jz_979013,jz_809017,jz_33,jz_100013,jz_269020,jz_979018,jz_979005,jz_3193,jz_979020,jz_139,jz_809009,jz_10003,jz_190,jz_979001,jz_100012,jz_0108,jz_0106,jz_141,jz_10002,jz_300008,jz_979004,jz_315,jz_1850,jz_269010,jz_979009,jz_38,jz_269018,jz_809004,jz_809025,jz_1107,jz_2174,jz_659025,jz_809013,jz_269021,jz_2405,jz_269012,jz_809023,jz_809010,jz_809026,jz_32,jz_2403,jz_10004,jz_269014,jz_3191,jz_100014,jz_0111,jz_269015,jz_320,jz_313,jz_300109,jz_2986,jz_2420,jz_669002,jz_269022,jz_39,jz_0109,jz_300012,jz_314,jz_300014,jz_1112,jz_669008,jz_669003,jz_37,jz_100007,jz_312,jz_979015,jz_669004,jz_269009,jz_1845,jz_3430,jz_669007,jz_709049,jz_979008,jz_979003,jz_809001,jz_300001,jz_1110,jz_1815,jz_1345,jz_979011,jz_300021,jz_0104,jz_979016,jz_809021,jz_1117,jz_34,jz_2371,jz_300013,jz_269007,jz_1115,jz_979014,jz_3192,jz_319,jz_269011,jz_269023,jz_20002,jz_979002,jz_1106,jz_0435,jz_1328,jz_0405,jz_3196,jz_0425_y,jz_0201,jz_1305,jz_3195_y,jz_1303,jz_3190_y,jz_0707,jz_1315,jz_0973,jz_3730,jz_0433,jz_1313,jz_0212,jz_0407,jz_0901,jz_0976,jz_3207,jz_3399,jz_0207,jz_0431,jz_0420,jz_1304,jz_0216,jz_0413,jz_0406,jz_0206,jz_0979,jz_0423,jz_2302,jz_0430,jz_3400,jz_0980,jz_0432,jz_0977,jz_sugar_high_related" 32 | -Dbase_score="0.2" 33 | -Dmin_child_weight="20" 34 | -DkvDelimiter=":"; 35 | 36 | -- predict-fold-1 37 | PAI 38 | -name prediction 39 | -project algo_public 40 | -DinputTableName="juz_test_6_7_xgb" 41 | -DappendColNames="vid,log_tl" 42 | -DmodelName="jz_xgb_model_1" 43 | -DitemDelimiter="," 44 | -DresultColName="result" 45 | -Dlifecycle="28" 46 | -DoutputTableName="jz_xgb_pred_val_1" 47 | -DkvDelimiter=":" 48 | -DenableSparse="false" 49 | 
-DfeatureColNames="snp357_cc,snp357_cg,snp357_gg,snp191_cc,snp191_cg,snp191_gg,snp277_aa,snp277_ag,snp277_gg,snp39_cc,snp39_ct,snp39_tt,snp149_aa,snp149_ag,snp149_gg,snp298_cc,snp298_cg,snp298_gg,snp198_gg,snp198_gt,snp198_tt,snp234_aa,snp234_ag,snp234_gg,snp115_cc,snp115_ct,snp115_tt,snp291_aa,snp291_ag,snp291_gg,snp208_ac,snp208_at,snp208_cc,snp208_ct,snp208_tt,snp44_aa,snp44_ac,snp44_cc,snp223_gg,snp223_gt,snp223_tt,snp47_cc,snp47_ct,snp47_tt,jz_10009,jz_10012,jz_10013,jz_1124,jz_1125,jz_1844,jz_2168,jz_2387,jz_269026,jz_279006,jz_300006,jz_300007,jz_300009,jz_30006,jz_300066,jz_300092,jz_300117,jz_300150,jz_300151,jz_300152,jz_319100,jz_319298,jz_3801,jz_3805,jz_459154,jz_459155,jz_459156,jz_459158,jz_459159,jz_459206,jz_459207,jz_459210,jz_459211,jz_709001,jz_709003,jz_809020,jz_809022,jz_809024,jz_809027,jz_809037,jz_809038,jz_809039,jz_809040,jz_809041,jz_809042,jz_809043,jz_809044,jz_809045,jz_809046,jz_809047,jz_809048,jz_809049,jz_809050,jz_809051,jz_809052,jz_809053,jz_809054,jz_809055,jz_809056,jz_809057,jz_809058,jz_809059,jz_809060,jz_809061,jz_989001,jz_989002,jz_004997,jz_0107,jz_2165,jz_2376,jz_2390,jz_300051,jz_300069,jz_300070,jz_300073,jz_300074,jz_300076,jz_300078,jz_300113,jz_300119,jz_300125,jz_321,jz_3804,jz_3807,jz_1308,jz_1319,jz_1320,jz_1321,jz_1322,jz_0424,jz_0425_x,jz_left_shen_no_voice,jz_right_shen_no_voice,jz_right_shen_strong_voice,jz_jzx_no_voice_area,jz_qian_lie_xian_2,jz_qian_lie_xian_3,jz_dpm_from_3301,jz_jzx_no_voice_jiejie,jz_jzx_low_voice_area,jz_jzx_low_voice_jiejie,jz_liver_no_voice,jz_liver_strong_voice,jz_dan_strong_voice,jz_qian_lie_xian_1,jz_3190_x,jz_3194,jz_3195_x,jz_xue_ya_pian_gao,jz_xue_zhi_pian_gao,jz_xue_tang_pian_gao,jz_high_sugar,jz_guan_xin_bin,jz_shen,jz_smoke,jz_niao,jz_heart_rate,jz_3399_w,jz_3301_w,jz_0403_w,jz_0421_w,jz_0405_w,jz_gender,jz_blood_pipe_style,jz_health,jz_pres_front,jz_pres_back,jz_heart_times,jz_dannan_jieshi,jz_dannan_xirou,jz_shen_jieshi,jz_shen_nanz,jz_gan_nanz,jz_gan_ying_hua,jz_skin,jz_skin_dis,jz_liver,jz_liver_dis,jz_liver_fatty,jz_liver_hepa,jz_carotid_artery,jz_carotid_artery_dis,jz_carotid_artery_mild,jz_carotid_artery_hard,jz_carotid_artery_high_sugar,jz_carotid_artery_high_cho,jz_carotid_artery_heart,jz_thyroid,jz_thyroid_dis,jz_thyroid_up,jz_thyroid_down,jz_bile,jz_bile_dis,jz_bile_infla,jz_bile_cut,jz_kidney,jz_kidney_dis,jz_kidney_infla,jz_kidney_cut,jz_pancreatic,jz_pancreatic_dis,jz_lung,jz_lung_dis,jz_lung_infla,jz_lung_kerl,jz_lung_cut,jz_cervical,jz_cervical_dis,jz_cervical_infla,jz_cervical_cut,jz_cervical_end,jz_uterus,jz_uterus_dis,jz_uterus_infla,jz_uterus_cut,jz_uterus_end,jz_ovary,jz_ovary_dis,jz_ovary_infla,jz_ovary_cut,jz_vagina,jz_vagina_dis,jz_vagina_infla,jz_vagina_end,jz_bladder,jz_bladder_dis,jz_gland,jz_gland_dis,jz_gland_infla,jz_gland_cut,jz_anus,jz_anus_dis,jz_lympha,jz_lympha_dis,jz_lympha_infla,jz_mam,jz_mam_dis,jz_mam_infla,jz_mam_end,jz_mam_cut,jz_bone,jz_bone_less,jz_teeth,jz_teeth_dis,jz_teeth_smoke,jz_teeth_old,jz_tonsil,jz_tonsil_dis,jz_tonsil_infla,jz_tonsil_cut,jz_throat,jz_throat_dis,jz_nerve,jz_nerve_dis,jz_digestion_dis,jz_high_fat,jz_women,jz_men,jz_menopause,jz_old,jz_histroy_val,jz_31,jz_193,jz_2406,jz_269017,jz_269024,jz_0112,jz_269019,jz_317,jz_979012,jz_269013,jz_979019,jz_669009,jz_1840,jz_300129,jz_2372,jz_269025,jz_300035,jz_979021,jz_979022,jz_1873,jz_669021,jz_100005,jz_100006,jz_300093,jz_2386,jz_100010,jz_192,jz_269008,jz_100008,jz_979006,jz_1127,jz_269003,jz_269004,jz_191,jz_0105,jz_a703,jz_1814,jz_269005,jz_183,jz_809008,jz_979007,jz_300011,jz_669005,j
z_269006,jz_269016,jz_143,jz_378,jz_a701,jz_1474,jz_2404,jz_979017,jz_979023,jz_3197,jz_316,jz_709038,jz_979013,jz_809017,jz_33,jz_100013,jz_269020,jz_979018,jz_979005,jz_3193,jz_979020,jz_139,jz_809009,jz_10003,jz_190,jz_979001,jz_100012,jz_0108,jz_0106,jz_141,jz_10002,jz_300008,jz_979004,jz_315,jz_1850,jz_269010,jz_979009,jz_38,jz_269018,jz_809004,jz_809025,jz_1107,jz_2174,jz_659025,jz_809013,jz_269021,jz_2405,jz_269012,jz_809023,jz_809010,jz_809026,jz_32,jz_2403,jz_10004,jz_269014,jz_3191,jz_100014,jz_0111,jz_269015,jz_320,jz_313,jz_300109,jz_2986,jz_2420,jz_669002,jz_269022,jz_39,jz_0109,jz_300012,jz_314,jz_300014,jz_1112,jz_669008,jz_669003,jz_37,jz_100007,jz_312,jz_979015,jz_669004,jz_269009,jz_1845,jz_3430,jz_669007,jz_709049,jz_979008,jz_979003,jz_809001,jz_300001,jz_1110,jz_1815,jz_1345,jz_979011,jz_300021,jz_0104,jz_979016,jz_809021,jz_1117,jz_34,jz_2371,jz_300013,jz_269007,jz_1115,jz_979014,jz_3192,jz_319,jz_269011,jz_269023,jz_20002,jz_979002,jz_1106,jz_0435,jz_1328,jz_0405,jz_3196,jz_0425_y,jz_0201,jz_1305,jz_3195_y,jz_1303,jz_3190_y,jz_0707,jz_1315,jz_0973,jz_3730,jz_0433,jz_1313,jz_0212,jz_0407,jz_0901,jz_0976,jz_3207,jz_3399,jz_0207,jz_0431,jz_0420,jz_1304,jz_0216,jz_0413,jz_0406,jz_0206,jz_0979,jz_0423,jz_2302,jz_0430,jz_3400,jz_0980,jz_0432,jz_0977,jz_sugar_high_related"; 50 | 51 | -- train-fold-2 52 | PAI 53 | -name xgboost 54 | -project algo_public 55 | -DinputTableName="tl_xgb_train_2" 56 | -DmodelName="jz_xgb_model_2" 57 | -Deta="0.01" 58 | -Dobjective="reg:linear" 59 | -DitemDelimiter="," 60 | -Dseed="1024" 61 | -Dnum_round="1800" 62 | -DlabelColName="log_tl" 63 | -DenableSparse="false" 64 | -Dmax_depth="5" 65 | -Dsubsample="0.7" 66 | -Dcolsample_bytree="0.7" 67 | -Dgamma="0" 68 | -Dlambda="50" 69 | -DfeatureColNames="snp357_cc,snp357_cg,snp357_gg,snp191_cc,snp191_cg,snp191_gg,snp277_aa,snp277_ag,snp277_gg,snp39_cc,snp39_ct,snp39_tt,snp149_aa,snp149_ag,snp149_gg,snp298_cc,snp298_cg,snp298_gg,snp198_gg,snp198_gt,snp198_tt,snp234_aa,snp234_ag,snp234_gg,snp115_cc,snp115_ct,snp115_tt,snp291_aa,snp291_ag,snp291_gg,snp208_ac,snp208_at,snp208_cc,snp208_ct,snp208_tt,snp44_aa,snp44_ac,snp44_cc,snp223_gg,snp223_gt,snp223_tt,snp47_cc,snp47_ct,snp47_tt,jz_10009,jz_10012,jz_10013,jz_1124,jz_1125,jz_1844,jz_2168,jz_2387,jz_269026,jz_279006,jz_300006,jz_300007,jz_300009,jz_30006,jz_300066,jz_300092,jz_300117,jz_300150,jz_300151,jz_300152,jz_319100,jz_319298,jz_3801,jz_3805,jz_459154,jz_459155,jz_459156,jz_459158,jz_459159,jz_459206,jz_459207,jz_459210,jz_459211,jz_709001,jz_709003,jz_809020,jz_809022,jz_809024,jz_809027,jz_809037,jz_809038,jz_809039,jz_809040,jz_809041,jz_809042,jz_809043,jz_809044,jz_809045,jz_809046,jz_809047,jz_809048,jz_809049,jz_809050,jz_809051,jz_809052,jz_809053,jz_809054,jz_809055,jz_809056,jz_809057,jz_809058,jz_809059,jz_809060,jz_809061,jz_989001,jz_989002,jz_004997,jz_0107,jz_2165,jz_2376,jz_2390,jz_300051,jz_300069,jz_300070,jz_300073,jz_300074,jz_300076,jz_300078,jz_300113,jz_300119,jz_300125,jz_321,jz_3804,jz_3807,jz_1308,jz_1319,jz_1320,jz_1321,jz_1322,jz_0424,jz_0425_x,jz_left_shen_no_voice,jz_right_shen_no_voice,jz_right_shen_strong_voice,jz_jzx_no_voice_area,jz_qian_lie_xian_2,jz_qian_lie_xian_3,jz_dpm_from_3301,jz_jzx_no_voice_jiejie,jz_jzx_low_voice_area,jz_jzx_low_voice_jiejie,jz_liver_no_voice,jz_liver_strong_voice,jz_dan_strong_voice,jz_qian_lie_xian_1,jz_3190_x,jz_3194,jz_3195_x,jz_xue_ya_pian_gao,jz_xue_zhi_pian_gao,jz_xue_tang_pian_gao,jz_high_sugar,jz_guan_xin_bin,jz_shen,jz_smoke,jz_niao,jz_heart_rate,jz_3399_w,jz_3301_w,jz_0403_w,jz_04
21_w,jz_0405_w,jz_gender,jz_blood_pipe_style,jz_health,jz_pres_front,jz_pres_back,jz_heart_times,jz_dannan_jieshi,jz_dannan_xirou,jz_shen_jieshi,jz_shen_nanz,jz_gan_nanz,jz_gan_ying_hua,jz_skin,jz_skin_dis,jz_liver,jz_liver_dis,jz_liver_fatty,jz_liver_hepa,jz_carotid_artery,jz_carotid_artery_dis,jz_carotid_artery_mild,jz_carotid_artery_hard,jz_carotid_artery_high_sugar,jz_carotid_artery_high_cho,jz_carotid_artery_heart,jz_thyroid,jz_thyroid_dis,jz_thyroid_up,jz_thyroid_down,jz_bile,jz_bile_dis,jz_bile_infla,jz_bile_cut,jz_kidney,jz_kidney_dis,jz_kidney_infla,jz_kidney_cut,jz_pancreatic,jz_pancreatic_dis,jz_lung,jz_lung_dis,jz_lung_infla,jz_lung_kerl,jz_lung_cut,jz_cervical,jz_cervical_dis,jz_cervical_infla,jz_cervical_cut,jz_cervical_end,jz_uterus,jz_uterus_dis,jz_uterus_infla,jz_uterus_cut,jz_uterus_end,jz_ovary,jz_ovary_dis,jz_ovary_infla,jz_ovary_cut,jz_vagina,jz_vagina_dis,jz_vagina_infla,jz_vagina_end,jz_bladder,jz_bladder_dis,jz_gland,jz_gland_dis,jz_gland_infla,jz_gland_cut,jz_anus,jz_anus_dis,jz_lympha,jz_lympha_dis,jz_lympha_infla,jz_mam,jz_mam_dis,jz_mam_infla,jz_mam_end,jz_mam_cut,jz_bone,jz_bone_less,jz_teeth,jz_teeth_dis,jz_teeth_smoke,jz_teeth_old,jz_tonsil,jz_tonsil_dis,jz_tonsil_infla,jz_tonsil_cut,jz_throat,jz_throat_dis,jz_nerve,jz_nerve_dis,jz_digestion_dis,jz_high_fat,jz_women,jz_men,jz_menopause,jz_old,jz_histroy_val,jz_31,jz_193,jz_2406,jz_269017,jz_269024,jz_0112,jz_269019,jz_317,jz_979012,jz_269013,jz_979019,jz_669009,jz_1840,jz_300129,jz_2372,jz_269025,jz_300035,jz_979021,jz_979022,jz_1873,jz_669021,jz_100005,jz_100006,jz_300093,jz_2386,jz_100010,jz_192,jz_269008,jz_100008,jz_979006,jz_1127,jz_269003,jz_269004,jz_191,jz_0105,jz_a703,jz_1814,jz_269005,jz_183,jz_809008,jz_979007,jz_300011,jz_669005,jz_269006,jz_269016,jz_143,jz_378,jz_a701,jz_1474,jz_2404,jz_979017,jz_979023,jz_3197,jz_316,jz_709038,jz_979013,jz_809017,jz_33,jz_100013,jz_269020,jz_979018,jz_979005,jz_3193,jz_979020,jz_139,jz_809009,jz_10003,jz_190,jz_979001,jz_100012,jz_0108,jz_0106,jz_141,jz_10002,jz_300008,jz_979004,jz_315,jz_1850,jz_269010,jz_979009,jz_38,jz_269018,jz_809004,jz_809025,jz_1107,jz_2174,jz_659025,jz_809013,jz_269021,jz_2405,jz_269012,jz_809023,jz_809010,jz_809026,jz_32,jz_2403,jz_10004,jz_269014,jz_3191,jz_100014,jz_0111,jz_269015,jz_320,jz_313,jz_300109,jz_2986,jz_2420,jz_669002,jz_269022,jz_39,jz_0109,jz_300012,jz_314,jz_300014,jz_1112,jz_669008,jz_669003,jz_37,jz_100007,jz_312,jz_979015,jz_669004,jz_269009,jz_1845,jz_3430,jz_669007,jz_709049,jz_979008,jz_979003,jz_809001,jz_300001,jz_1110,jz_1815,jz_1345,jz_979011,jz_300021,jz_0104,jz_979016,jz_809021,jz_1117,jz_34,jz_2371,jz_300013,jz_269007,jz_1115,jz_979014,jz_3192,jz_319,jz_269011,jz_269023,jz_20002,jz_979002,jz_1106,jz_0435,jz_1328,jz_0405,jz_3196,jz_0425_y,jz_0201,jz_1305,jz_3195_y,jz_1303,jz_3190_y,jz_0707,jz_1315,jz_0973,jz_3730,jz_0433,jz_1313,jz_0212,jz_0407,jz_0901,jz_0976,jz_3207,jz_3399,jz_0207,jz_0431,jz_0420,jz_1304,jz_0216,jz_0413,jz_0406,jz_0206,jz_0979,jz_0423,jz_2302,jz_0430,jz_3400,jz_0980,jz_0432,jz_0977,jz_sugar_high_related" 70 | -Dbase_score="0.2" 71 | -Dmin_child_weight="20" 72 | -DkvDelimiter=":"; 73 | 74 | -- predict-fold-2 75 | PAI 76 | -name prediction 77 | -project algo_public 78 | -DinputTableName="juz_test_6_7_xgb" 79 | -DappendColNames="vid,log_tl" 80 | -DmodelName="jz_xgb_model_2" 81 | -DitemDelimiter="," 82 | -DresultColName="result" 83 | -Dlifecycle="28" 84 | -DoutputTableName="jz_xgb_pred_val_2" 85 | -DkvDelimiter=":" 86 | -DenableSparse="false" 87 | 
-DfeatureColNames="snp357_cc,snp357_cg,snp357_gg,snp191_cc,snp191_cg,snp191_gg,snp277_aa,snp277_ag,snp277_gg,snp39_cc,snp39_ct,snp39_tt,snp149_aa,snp149_ag,snp149_gg,snp298_cc,snp298_cg,snp298_gg,snp198_gg,snp198_gt,snp198_tt,snp234_aa,snp234_ag,snp234_gg,snp115_cc,snp115_ct,snp115_tt,snp291_aa,snp291_ag,snp291_gg,snp208_ac,snp208_at,snp208_cc,snp208_ct,snp208_tt,snp44_aa,snp44_ac,snp44_cc,snp223_gg,snp223_gt,snp223_tt,snp47_cc,snp47_ct,snp47_tt,jz_10009,jz_10012,jz_10013,jz_1124,jz_1125,jz_1844,jz_2168,jz_2387,jz_269026,jz_279006,jz_300006,jz_300007,jz_300009,jz_30006,jz_300066,jz_300092,jz_300117,jz_300150,jz_300151,jz_300152,jz_319100,jz_319298,jz_3801,jz_3805,jz_459154,jz_459155,jz_459156,jz_459158,jz_459159,jz_459206,jz_459207,jz_459210,jz_459211,jz_709001,jz_709003,jz_809020,jz_809022,jz_809024,jz_809027,jz_809037,jz_809038,jz_809039,jz_809040,jz_809041,jz_809042,jz_809043,jz_809044,jz_809045,jz_809046,jz_809047,jz_809048,jz_809049,jz_809050,jz_809051,jz_809052,jz_809053,jz_809054,jz_809055,jz_809056,jz_809057,jz_809058,jz_809059,jz_809060,jz_809061,jz_989001,jz_989002,jz_004997,jz_0107,jz_2165,jz_2376,jz_2390,jz_300051,jz_300069,jz_300070,jz_300073,jz_300074,jz_300076,jz_300078,jz_300113,jz_300119,jz_300125,jz_321,jz_3804,jz_3807,jz_1308,jz_1319,jz_1320,jz_1321,jz_1322,jz_0424,jz_0425_x,jz_left_shen_no_voice,jz_right_shen_no_voice,jz_right_shen_strong_voice,jz_jzx_no_voice_area,jz_qian_lie_xian_2,jz_qian_lie_xian_3,jz_dpm_from_3301,jz_jzx_no_voice_jiejie,jz_jzx_low_voice_area,jz_jzx_low_voice_jiejie,jz_liver_no_voice,jz_liver_strong_voice,jz_dan_strong_voice,jz_qian_lie_xian_1,jz_3190_x,jz_3194,jz_3195_x,jz_xue_ya_pian_gao,jz_xue_zhi_pian_gao,jz_xue_tang_pian_gao,jz_high_sugar,jz_guan_xin_bin,jz_shen,jz_smoke,jz_niao,jz_heart_rate,jz_3399_w,jz_3301_w,jz_0403_w,jz_0421_w,jz_0405_w,jz_gender,jz_blood_pipe_style,jz_health,jz_pres_front,jz_pres_back,jz_heart_times,jz_dannan_jieshi,jz_dannan_xirou,jz_shen_jieshi,jz_shen_nanz,jz_gan_nanz,jz_gan_ying_hua,jz_skin,jz_skin_dis,jz_liver,jz_liver_dis,jz_liver_fatty,jz_liver_hepa,jz_carotid_artery,jz_carotid_artery_dis,jz_carotid_artery_mild,jz_carotid_artery_hard,jz_carotid_artery_high_sugar,jz_carotid_artery_high_cho,jz_carotid_artery_heart,jz_thyroid,jz_thyroid_dis,jz_thyroid_up,jz_thyroid_down,jz_bile,jz_bile_dis,jz_bile_infla,jz_bile_cut,jz_kidney,jz_kidney_dis,jz_kidney_infla,jz_kidney_cut,jz_pancreatic,jz_pancreatic_dis,jz_lung,jz_lung_dis,jz_lung_infla,jz_lung_kerl,jz_lung_cut,jz_cervical,jz_cervical_dis,jz_cervical_infla,jz_cervical_cut,jz_cervical_end,jz_uterus,jz_uterus_dis,jz_uterus_infla,jz_uterus_cut,jz_uterus_end,jz_ovary,jz_ovary_dis,jz_ovary_infla,jz_ovary_cut,jz_vagina,jz_vagina_dis,jz_vagina_infla,jz_vagina_end,jz_bladder,jz_bladder_dis,jz_gland,jz_gland_dis,jz_gland_infla,jz_gland_cut,jz_anus,jz_anus_dis,jz_lympha,jz_lympha_dis,jz_lympha_infla,jz_mam,jz_mam_dis,jz_mam_infla,jz_mam_end,jz_mam_cut,jz_bone,jz_bone_less,jz_teeth,jz_teeth_dis,jz_teeth_smoke,jz_teeth_old,jz_tonsil,jz_tonsil_dis,jz_tonsil_infla,jz_tonsil_cut,jz_throat,jz_throat_dis,jz_nerve,jz_nerve_dis,jz_digestion_dis,jz_high_fat,jz_women,jz_men,jz_menopause,jz_old,jz_histroy_val,jz_31,jz_193,jz_2406,jz_269017,jz_269024,jz_0112,jz_269019,jz_317,jz_979012,jz_269013,jz_979019,jz_669009,jz_1840,jz_300129,jz_2372,jz_269025,jz_300035,jz_979021,jz_979022,jz_1873,jz_669021,jz_100005,jz_100006,jz_300093,jz_2386,jz_100010,jz_192,jz_269008,jz_100008,jz_979006,jz_1127,jz_269003,jz_269004,jz_191,jz_0105,jz_a703,jz_1814,jz_269005,jz_183,jz_809008,jz_979007,jz_300011,jz_669005,j
z_269006,jz_269016,jz_143,jz_378,jz_a701,jz_1474,jz_2404,jz_979017,jz_979023,jz_3197,jz_316,jz_709038,jz_979013,jz_809017,jz_33,jz_100013,jz_269020,jz_979018,jz_979005,jz_3193,jz_979020,jz_139,jz_809009,jz_10003,jz_190,jz_979001,jz_100012,jz_0108,jz_0106,jz_141,jz_10002,jz_300008,jz_979004,jz_315,jz_1850,jz_269010,jz_979009,jz_38,jz_269018,jz_809004,jz_809025,jz_1107,jz_2174,jz_659025,jz_809013,jz_269021,jz_2405,jz_269012,jz_809023,jz_809010,jz_809026,jz_32,jz_2403,jz_10004,jz_269014,jz_3191,jz_100014,jz_0111,jz_269015,jz_320,jz_313,jz_300109,jz_2986,jz_2420,jz_669002,jz_269022,jz_39,jz_0109,jz_300012,jz_314,jz_300014,jz_1112,jz_669008,jz_669003,jz_37,jz_100007,jz_312,jz_979015,jz_669004,jz_269009,jz_1845,jz_3430,jz_669007,jz_709049,jz_979008,jz_979003,jz_809001,jz_300001,jz_1110,jz_1815,jz_1345,jz_979011,jz_300021,jz_0104,jz_979016,jz_809021,jz_1117,jz_34,jz_2371,jz_300013,jz_269007,jz_1115,jz_979014,jz_3192,jz_319,jz_269011,jz_269023,jz_20002,jz_979002,jz_1106,jz_0435,jz_1328,jz_0405,jz_3196,jz_0425_y,jz_0201,jz_1305,jz_3195_y,jz_1303,jz_3190_y,jz_0707,jz_1315,jz_0973,jz_3730,jz_0433,jz_1313,jz_0212,jz_0407,jz_0901,jz_0976,jz_3207,jz_3399,jz_0207,jz_0431,jz_0420,jz_1304,jz_0216,jz_0413,jz_0406,jz_0206,jz_0979,jz_0423,jz_2302,jz_0430,jz_3400,jz_0980,jz_0432,jz_0977,jz_sugar_high_related"; 88 | 89 | -- train-fold-3 90 | PAI 91 | -name xgboost 92 | -project algo_public 93 | -DinputTableName="tl_xgb_train_3" 94 | -DmodelName="jz_xgb_model_3" 95 | -Deta="0.01" 96 | -Dobjective="reg:linear" 97 | -DitemDelimiter="," 98 | -Dseed="1024" 99 | -Dnum_round="1800" 100 | -DlabelColName="log_tl" 101 | -DenableSparse="false" 102 | -Dmax_depth="5" 103 | -Dsubsample="0.7" 104 | -Dcolsample_bytree="0.7" 105 | -Dgamma="0" 106 | -Dlambda="50" 107 | -DfeatureColNames="snp357_cc,snp357_cg,snp357_gg,snp191_cc,snp191_cg,snp191_gg,snp277_aa,snp277_ag,snp277_gg,snp39_cc,snp39_ct,snp39_tt,snp149_aa,snp149_ag,snp149_gg,snp298_cc,snp298_cg,snp298_gg,snp198_gg,snp198_gt,snp198_tt,snp234_aa,snp234_ag,snp234_gg,snp115_cc,snp115_ct,snp115_tt,snp291_aa,snp291_ag,snp291_gg,snp208_ac,snp208_at,snp208_cc,snp208_ct,snp208_tt,snp44_aa,snp44_ac,snp44_cc,snp223_gg,snp223_gt,snp223_tt,snp47_cc,snp47_ct,snp47_tt,jz_10009,jz_10012,jz_10013,jz_1124,jz_1125,jz_1844,jz_2168,jz_2387,jz_269026,jz_279006,jz_300006,jz_300007,jz_300009,jz_30006,jz_300066,jz_300092,jz_300117,jz_300150,jz_300151,jz_300152,jz_319100,jz_319298,jz_3801,jz_3805,jz_459154,jz_459155,jz_459156,jz_459158,jz_459159,jz_459206,jz_459207,jz_459210,jz_459211,jz_709001,jz_709003,jz_809020,jz_809022,jz_809024,jz_809027,jz_809037,jz_809038,jz_809039,jz_809040,jz_809041,jz_809042,jz_809043,jz_809044,jz_809045,jz_809046,jz_809047,jz_809048,jz_809049,jz_809050,jz_809051,jz_809052,jz_809053,jz_809054,jz_809055,jz_809056,jz_809057,jz_809058,jz_809059,jz_809060,jz_809061,jz_989001,jz_989002,jz_004997,jz_0107,jz_2165,jz_2376,jz_2390,jz_300051,jz_300069,jz_300070,jz_300073,jz_300074,jz_300076,jz_300078,jz_300113,jz_300119,jz_300125,jz_321,jz_3804,jz_3807,jz_1308,jz_1319,jz_1320,jz_1321,jz_1322,jz_0424,jz_0425_x,jz_left_shen_no_voice,jz_right_shen_no_voice,jz_right_shen_strong_voice,jz_jzx_no_voice_area,jz_qian_lie_xian_2,jz_qian_lie_xian_3,jz_dpm_from_3301,jz_jzx_no_voice_jiejie,jz_jzx_low_voice_area,jz_jzx_low_voice_jiejie,jz_liver_no_voice,jz_liver_strong_voice,jz_dan_strong_voice,jz_qian_lie_xian_1,jz_3190_x,jz_3194,jz_3195_x,jz_xue_ya_pian_gao,jz_xue_zhi_pian_gao,jz_xue_tang_pian_gao,jz_high_sugar,jz_guan_xin_bin,jz_shen,jz_smoke,jz_niao,jz_heart_rate,jz_3399_w,jz_3301_w,jz_0403
_w,jz_0421_w,jz_0405_w,jz_gender,jz_blood_pipe_style,jz_health,jz_pres_front,jz_pres_back,jz_heart_times,jz_dannan_jieshi,jz_dannan_xirou,jz_shen_jieshi,jz_shen_nanz,jz_gan_nanz,jz_gan_ying_hua,jz_skin,jz_skin_dis,jz_liver,jz_liver_dis,jz_liver_fatty,jz_liver_hepa,jz_carotid_artery,jz_carotid_artery_dis,jz_carotid_artery_mild,jz_carotid_artery_hard,jz_carotid_artery_high_sugar,jz_carotid_artery_high_cho,jz_carotid_artery_heart,jz_thyroid,jz_thyroid_dis,jz_thyroid_up,jz_thyroid_down,jz_bile,jz_bile_dis,jz_bile_infla,jz_bile_cut,jz_kidney,jz_kidney_dis,jz_kidney_infla,jz_kidney_cut,jz_pancreatic,jz_pancreatic_dis,jz_lung,jz_lung_dis,jz_lung_infla,jz_lung_kerl,jz_lung_cut,jz_cervical,jz_cervical_dis,jz_cervical_infla,jz_cervical_cut,jz_cervical_end,jz_uterus,jz_uterus_dis,jz_uterus_infla,jz_uterus_cut,jz_uterus_end,jz_ovary,jz_ovary_dis,jz_ovary_infla,jz_ovary_cut,jz_vagina,jz_vagina_dis,jz_vagina_infla,jz_vagina_end,jz_bladder,jz_bladder_dis,jz_gland,jz_gland_dis,jz_gland_infla,jz_gland_cut,jz_anus,jz_anus_dis,jz_lympha,jz_lympha_dis,jz_lympha_infla,jz_mam,jz_mam_dis,jz_mam_infla,jz_mam_end,jz_mam_cut,jz_bone,jz_bone_less,jz_teeth,jz_teeth_dis,jz_teeth_smoke,jz_teeth_old,jz_tonsil,jz_tonsil_dis,jz_tonsil_infla,jz_tonsil_cut,jz_throat,jz_throat_dis,jz_nerve,jz_nerve_dis,jz_digestion_dis,jz_high_fat,jz_women,jz_men,jz_menopause,jz_old,jz_histroy_val,jz_31,jz_193,jz_2406,jz_269017,jz_269024,jz_0112,jz_269019,jz_317,jz_979012,jz_269013,jz_979019,jz_669009,jz_1840,jz_300129,jz_2372,jz_269025,jz_300035,jz_979021,jz_979022,jz_1873,jz_669021,jz_100005,jz_100006,jz_300093,jz_2386,jz_100010,jz_192,jz_269008,jz_100008,jz_979006,jz_1127,jz_269003,jz_269004,jz_191,jz_0105,jz_a703,jz_1814,jz_269005,jz_183,jz_809008,jz_979007,jz_300011,jz_669005,jz_269006,jz_269016,jz_143,jz_378,jz_a701,jz_1474,jz_2404,jz_979017,jz_979023,jz_3197,jz_316,jz_709038,jz_979013,jz_809017,jz_33,jz_100013,jz_269020,jz_979018,jz_979005,jz_3193,jz_979020,jz_139,jz_809009,jz_10003,jz_190,jz_979001,jz_100012,jz_0108,jz_0106,jz_141,jz_10002,jz_300008,jz_979004,jz_315,jz_1850,jz_269010,jz_979009,jz_38,jz_269018,jz_809004,jz_809025,jz_1107,jz_2174,jz_659025,jz_809013,jz_269021,jz_2405,jz_269012,jz_809023,jz_809010,jz_809026,jz_32,jz_2403,jz_10004,jz_269014,jz_3191,jz_100014,jz_0111,jz_269015,jz_320,jz_313,jz_300109,jz_2986,jz_2420,jz_669002,jz_269022,jz_39,jz_0109,jz_300012,jz_314,jz_300014,jz_1112,jz_669008,jz_669003,jz_37,jz_100007,jz_312,jz_979015,jz_669004,jz_269009,jz_1845,jz_3430,jz_669007,jz_709049,jz_979008,jz_979003,jz_809001,jz_300001,jz_1110,jz_1815,jz_1345,jz_979011,jz_300021,jz_0104,jz_979016,jz_809021,jz_1117,jz_34,jz_2371,jz_300013,jz_269007,jz_1115,jz_979014,jz_3192,jz_319,jz_269011,jz_269023,jz_20002,jz_979002,jz_1106,jz_0435,jz_1328,jz_0405,jz_3196,jz_0425_y,jz_0201,jz_1305,jz_3195_y,jz_1303,jz_3190_y,jz_0707,jz_1315,jz_0973,jz_3730,jz_0433,jz_1313,jz_0212,jz_0407,jz_0901,jz_0976,jz_3207,jz_3399,jz_0207,jz_0431,jz_0420,jz_1304,jz_0216,jz_0413,jz_0406,jz_0206,jz_0979,jz_0423,jz_2302,jz_0430,jz_3400,jz_0980,jz_0432,jz_0977,jz_sugar_high_related" 108 | -Dbase_score="0.2" 109 | -Dmin_child_weight="20" 110 | -DkvDelimiter=":"; 111 | 112 | -- predict-fold-3 113 | PAI 114 | -name prediction 115 | -project algo_public 116 | -DinputTableName="juz_test_6_7_xgb" 117 | -DappendColNames="vid,log_tl" 118 | -DmodelName="jz_xgb_model_3" 119 | -DitemDelimiter="," 120 | -DresultColName="result" 121 | -Dlifecycle="28" 122 | -DoutputTableName="jz_xgb_pred_val_3" 123 | -DkvDelimiter=":" 124 | -DenableSparse="false" 125 | 
-DfeatureColNames="snp357_cc,snp357_cg,snp357_gg,snp191_cc,snp191_cg,snp191_gg,snp277_aa,snp277_ag,snp277_gg,snp39_cc,snp39_ct,snp39_tt,snp149_aa,snp149_ag,snp149_gg,snp298_cc,snp298_cg,snp298_gg,snp198_gg,snp198_gt,snp198_tt,snp234_aa,snp234_ag,snp234_gg,snp115_cc,snp115_ct,snp115_tt,snp291_aa,snp291_ag,snp291_gg,snp208_ac,snp208_at,snp208_cc,snp208_ct,snp208_tt,snp44_aa,snp44_ac,snp44_cc,snp223_gg,snp223_gt,snp223_tt,snp47_cc,snp47_ct,snp47_tt,jz_10009,jz_10012,jz_10013,jz_1124,jz_1125,jz_1844,jz_2168,jz_2387,jz_269026,jz_279006,jz_300006,jz_300007,jz_300009,jz_30006,jz_300066,jz_300092,jz_300117,jz_300150,jz_300151,jz_300152,jz_319100,jz_319298,jz_3801,jz_3805,jz_459154,jz_459155,jz_459156,jz_459158,jz_459159,jz_459206,jz_459207,jz_459210,jz_459211,jz_709001,jz_709003,jz_809020,jz_809022,jz_809024,jz_809027,jz_809037,jz_809038,jz_809039,jz_809040,jz_809041,jz_809042,jz_809043,jz_809044,jz_809045,jz_809046,jz_809047,jz_809048,jz_809049,jz_809050,jz_809051,jz_809052,jz_809053,jz_809054,jz_809055,jz_809056,jz_809057,jz_809058,jz_809059,jz_809060,jz_809061,jz_989001,jz_989002,jz_004997,jz_0107,jz_2165,jz_2376,jz_2390,jz_300051,jz_300069,jz_300070,jz_300073,jz_300074,jz_300076,jz_300078,jz_300113,jz_300119,jz_300125,jz_321,jz_3804,jz_3807,jz_1308,jz_1319,jz_1320,jz_1321,jz_1322,jz_0424,jz_0425_x,jz_left_shen_no_voice,jz_right_shen_no_voice,jz_right_shen_strong_voice,jz_jzx_no_voice_area,jz_qian_lie_xian_2,jz_qian_lie_xian_3,jz_dpm_from_3301,jz_jzx_no_voice_jiejie,jz_jzx_low_voice_area,jz_jzx_low_voice_jiejie,jz_liver_no_voice,jz_liver_strong_voice,jz_dan_strong_voice,jz_qian_lie_xian_1,jz_3190_x,jz_3194,jz_3195_x,jz_xue_ya_pian_gao,jz_xue_zhi_pian_gao,jz_xue_tang_pian_gao,jz_high_sugar,jz_guan_xin_bin,jz_shen,jz_smoke,jz_niao,jz_heart_rate,jz_3399_w,jz_3301_w,jz_0403_w,jz_0421_w,jz_0405_w,jz_gender,jz_blood_pipe_style,jz_health,jz_pres_front,jz_pres_back,jz_heart_times,jz_dannan_jieshi,jz_dannan_xirou,jz_shen_jieshi,jz_shen_nanz,jz_gan_nanz,jz_gan_ying_hua,jz_skin,jz_skin_dis,jz_liver,jz_liver_dis,jz_liver_fatty,jz_liver_hepa,jz_carotid_artery,jz_carotid_artery_dis,jz_carotid_artery_mild,jz_carotid_artery_hard,jz_carotid_artery_high_sugar,jz_carotid_artery_high_cho,jz_carotid_artery_heart,jz_thyroid,jz_thyroid_dis,jz_thyroid_up,jz_thyroid_down,jz_bile,jz_bile_dis,jz_bile_infla,jz_bile_cut,jz_kidney,jz_kidney_dis,jz_kidney_infla,jz_kidney_cut,jz_pancreatic,jz_pancreatic_dis,jz_lung,jz_lung_dis,jz_lung_infla,jz_lung_kerl,jz_lung_cut,jz_cervical,jz_cervical_dis,jz_cervical_infla,jz_cervical_cut,jz_cervical_end,jz_uterus,jz_uterus_dis,jz_uterus_infla,jz_uterus_cut,jz_uterus_end,jz_ovary,jz_ovary_dis,jz_ovary_infla,jz_ovary_cut,jz_vagina,jz_vagina_dis,jz_vagina_infla,jz_vagina_end,jz_bladder,jz_bladder_dis,jz_gland,jz_gland_dis,jz_gland_infla,jz_gland_cut,jz_anus,jz_anus_dis,jz_lympha,jz_lympha_dis,jz_lympha_infla,jz_mam,jz_mam_dis,jz_mam_infla,jz_mam_end,jz_mam_cut,jz_bone,jz_bone_less,jz_teeth,jz_teeth_dis,jz_teeth_smoke,jz_teeth_old,jz_tonsil,jz_tonsil_dis,jz_tonsil_infla,jz_tonsil_cut,jz_throat,jz_throat_dis,jz_nerve,jz_nerve_dis,jz_digestion_dis,jz_high_fat,jz_women,jz_men,jz_menopause,jz_old,jz_histroy_val,jz_31,jz_193,jz_2406,jz_269017,jz_269024,jz_0112,jz_269019,jz_317,jz_979012,jz_269013,jz_979019,jz_669009,jz_1840,jz_300129,jz_2372,jz_269025,jz_300035,jz_979021,jz_979022,jz_1873,jz_669021,jz_100005,jz_100006,jz_300093,jz_2386,jz_100010,jz_192,jz_269008,jz_100008,jz_979006,jz_1127,jz_269003,jz_269004,jz_191,jz_0105,jz_a703,jz_1814,jz_269005,jz_183,jz_809008,jz_979007,jz_300011,jz_669005,j
z_269006,jz_269016,jz_143,jz_378,jz_a701,jz_1474,jz_2404,jz_979017,jz_979023,jz_3197,jz_316,jz_709038,jz_979013,jz_809017,jz_33,jz_100013,jz_269020,jz_979018,jz_979005,jz_3193,jz_979020,jz_139,jz_809009,jz_10003,jz_190,jz_979001,jz_100012,jz_0108,jz_0106,jz_141,jz_10002,jz_300008,jz_979004,jz_315,jz_1850,jz_269010,jz_979009,jz_38,jz_269018,jz_809004,jz_809025,jz_1107,jz_2174,jz_659025,jz_809013,jz_269021,jz_2405,jz_269012,jz_809023,jz_809010,jz_809026,jz_32,jz_2403,jz_10004,jz_269014,jz_3191,jz_100014,jz_0111,jz_269015,jz_320,jz_313,jz_300109,jz_2986,jz_2420,jz_669002,jz_269022,jz_39,jz_0109,jz_300012,jz_314,jz_300014,jz_1112,jz_669008,jz_669003,jz_37,jz_100007,jz_312,jz_979015,jz_669004,jz_269009,jz_1845,jz_3430,jz_669007,jz_709049,jz_979008,jz_979003,jz_809001,jz_300001,jz_1110,jz_1815,jz_1345,jz_979011,jz_300021,jz_0104,jz_979016,jz_809021,jz_1117,jz_34,jz_2371,jz_300013,jz_269007,jz_1115,jz_979014,jz_3192,jz_319,jz_269011,jz_269023,jz_20002,jz_979002,jz_1106,jz_0435,jz_1328,jz_0405,jz_3196,jz_0425_y,jz_0201,jz_1305,jz_3195_y,jz_1303,jz_3190_y,jz_0707,jz_1315,jz_0973,jz_3730,jz_0433,jz_1313,jz_0212,jz_0407,jz_0901,jz_0976,jz_3207,jz_3399,jz_0207,jz_0431,jz_0420,jz_1304,jz_0216,jz_0413,jz_0406,jz_0206,jz_0979,jz_0423,jz_2302,jz_0430,jz_3400,jz_0980,jz_0432,jz_0977,jz_sugar_high_related"; 126 | 127 | -- train-fold-4 128 | PAI 129 | -name xgboost 130 | -project algo_public 131 | -DinputTableName="tl_xgb_train_4" 132 | -DmodelName="jz_xgb_model_4" 133 | -Deta="0.01" 134 | -Dobjective="reg:linear" 135 | -DitemDelimiter="," 136 | -Dseed="1024" 137 | -Dnum_round="1800" 138 | -DlabelColName="log_tl" 139 | -DenableSparse="false" 140 | -Dmax_depth="5" 141 | -Dsubsample="0.7" 142 | -Dcolsample_bytree="0.7" 143 | -Dgamma="0" 144 | -Dlambda="50" 145 | -DfeatureColNames="snp357_cc,snp357_cg,snp357_gg,snp191_cc,snp191_cg,snp191_gg,snp277_aa,snp277_ag,snp277_gg,snp39_cc,snp39_ct,snp39_tt,snp149_aa,snp149_ag,snp149_gg,snp298_cc,snp298_cg,snp298_gg,snp198_gg,snp198_gt,snp198_tt,snp234_aa,snp234_ag,snp234_gg,snp115_cc,snp115_ct,snp115_tt,snp291_aa,snp291_ag,snp291_gg,snp208_ac,snp208_at,snp208_cc,snp208_ct,snp208_tt,snp44_aa,snp44_ac,snp44_cc,snp223_gg,snp223_gt,snp223_tt,snp47_cc,snp47_ct,snp47_tt,jz_10009,jz_10012,jz_10013,jz_1124,jz_1125,jz_1844,jz_2168,jz_2387,jz_269026,jz_279006,jz_300006,jz_300007,jz_300009,jz_30006,jz_300066,jz_300092,jz_300117,jz_300150,jz_300151,jz_300152,jz_319100,jz_319298,jz_3801,jz_3805,jz_459154,jz_459155,jz_459156,jz_459158,jz_459159,jz_459206,jz_459207,jz_459210,jz_459211,jz_709001,jz_709003,jz_809020,jz_809022,jz_809024,jz_809027,jz_809037,jz_809038,jz_809039,jz_809040,jz_809041,jz_809042,jz_809043,jz_809044,jz_809045,jz_809046,jz_809047,jz_809048,jz_809049,jz_809050,jz_809051,jz_809052,jz_809053,jz_809054,jz_809055,jz_809056,jz_809057,jz_809058,jz_809059,jz_809060,jz_809061,jz_989001,jz_989002,jz_004997,jz_0107,jz_2165,jz_2376,jz_2390,jz_300051,jz_300069,jz_300070,jz_300073,jz_300074,jz_300076,jz_300078,jz_300113,jz_300119,jz_300125,jz_321,jz_3804,jz_3807,jz_1308,jz_1319,jz_1320,jz_1321,jz_1322,jz_0424,jz_0425_x,jz_left_shen_no_voice,jz_right_shen_no_voice,jz_right_shen_strong_voice,jz_jzx_no_voice_area,jz_qian_lie_xian_2,jz_qian_lie_xian_3,jz_dpm_from_3301,jz_jzx_no_voice_jiejie,jz_jzx_low_voice_area,jz_jzx_low_voice_jiejie,jz_liver_no_voice,jz_liver_strong_voice,jz_dan_strong_voice,jz_qian_lie_xian_1,jz_3190_x,jz_3194,jz_3195_x,jz_xue_ya_pian_gao,jz_xue_zhi_pian_gao,jz_xue_tang_pian_gao,jz_high_sugar,jz_guan_xin_bin,jz_shen,jz_smoke,jz_niao,jz_heart_rate,jz_3399_w,jz_33
01_w,jz_0403_w,jz_0421_w,jz_0405_w,jz_gender,jz_blood_pipe_style,jz_health,jz_pres_front,jz_pres_back,jz_heart_times,jz_dannan_jieshi,jz_dannan_xirou,jz_shen_jieshi,jz_shen_nanz,jz_gan_nanz,jz_gan_ying_hua,jz_skin,jz_skin_dis,jz_liver,jz_liver_dis,jz_liver_fatty,jz_liver_hepa,jz_carotid_artery,jz_carotid_artery_dis,jz_carotid_artery_mild,jz_carotid_artery_hard,jz_carotid_artery_high_sugar,jz_carotid_artery_high_cho,jz_carotid_artery_heart,jz_thyroid,jz_thyroid_dis,jz_thyroid_up,jz_thyroid_down,jz_bile,jz_bile_dis,jz_bile_infla,jz_bile_cut,jz_kidney,jz_kidney_dis,jz_kidney_infla,jz_kidney_cut,jz_pancreatic,jz_pancreatic_dis,jz_lung,jz_lung_dis,jz_lung_infla,jz_lung_kerl,jz_lung_cut,jz_cervical,jz_cervical_dis,jz_cervical_infla,jz_cervical_cut,jz_cervical_end,jz_uterus,jz_uterus_dis,jz_uterus_infla,jz_uterus_cut,jz_uterus_end,jz_ovary,jz_ovary_dis,jz_ovary_infla,jz_ovary_cut,jz_vagina,jz_vagina_dis,jz_vagina_infla,jz_vagina_end,jz_bladder,jz_bladder_dis,jz_gland,jz_gland_dis,jz_gland_infla,jz_gland_cut,jz_anus,jz_anus_dis,jz_lympha,jz_lympha_dis,jz_lympha_infla,jz_mam,jz_mam_dis,jz_mam_infla,jz_mam_end,jz_mam_cut,jz_bone,jz_bone_less,jz_teeth,jz_teeth_dis,jz_teeth_smoke,jz_teeth_old,jz_tonsil,jz_tonsil_dis,jz_tonsil_infla,jz_tonsil_cut,jz_throat,jz_throat_dis,jz_nerve,jz_nerve_dis,jz_digestion_dis,jz_high_fat,jz_women,jz_men,jz_menopause,jz_old,jz_histroy_val,jz_31,jz_193,jz_2406,jz_269017,jz_269024,jz_0112,jz_269019,jz_317,jz_979012,jz_269013,jz_979019,jz_669009,jz_1840,jz_300129,jz_2372,jz_269025,jz_300035,jz_979021,jz_979022,jz_1873,jz_669021,jz_100005,jz_100006,jz_300093,jz_2386,jz_100010,jz_192,jz_269008,jz_100008,jz_979006,jz_1127,jz_269003,jz_269004,jz_191,jz_0105,jz_a703,jz_1814,jz_269005,jz_183,jz_809008,jz_979007,jz_300011,jz_669005,jz_269006,jz_269016,jz_143,jz_378,jz_a701,jz_1474,jz_2404,jz_979017,jz_979023,jz_3197,jz_316,jz_709038,jz_979013,jz_809017,jz_33,jz_100013,jz_269020,jz_979018,jz_979005,jz_3193,jz_979020,jz_139,jz_809009,jz_10003,jz_190,jz_979001,jz_100012,jz_0108,jz_0106,jz_141,jz_10002,jz_300008,jz_979004,jz_315,jz_1850,jz_269010,jz_979009,jz_38,jz_269018,jz_809004,jz_809025,jz_1107,jz_2174,jz_659025,jz_809013,jz_269021,jz_2405,jz_269012,jz_809023,jz_809010,jz_809026,jz_32,jz_2403,jz_10004,jz_269014,jz_3191,jz_100014,jz_0111,jz_269015,jz_320,jz_313,jz_300109,jz_2986,jz_2420,jz_669002,jz_269022,jz_39,jz_0109,jz_300012,jz_314,jz_300014,jz_1112,jz_669008,jz_669003,jz_37,jz_100007,jz_312,jz_979015,jz_669004,jz_269009,jz_1845,jz_3430,jz_669007,jz_709049,jz_979008,jz_979003,jz_809001,jz_300001,jz_1110,jz_1815,jz_1345,jz_979011,jz_300021,jz_0104,jz_979016,jz_809021,jz_1117,jz_34,jz_2371,jz_300013,jz_269007,jz_1115,jz_979014,jz_3192,jz_319,jz_269011,jz_269023,jz_20002,jz_979002,jz_1106,jz_0435,jz_1328,jz_0405,jz_3196,jz_0425_y,jz_0201,jz_1305,jz_3195_y,jz_1303,jz_3190_y,jz_0707,jz_1315,jz_0973,jz_3730,jz_0433,jz_1313,jz_0212,jz_0407,jz_0901,jz_0976,jz_3207,jz_3399,jz_0207,jz_0431,jz_0420,jz_1304,jz_0216,jz_0413,jz_0406,jz_0206,jz_0979,jz_0423,jz_2302,jz_0430,jz_3400,jz_0980,jz_0432,jz_0977,jz_sugar_high_related" 146 | -Dbase_score="0.2" 147 | -Dmin_child_weight="20" 148 | -DkvDelimiter=":"; 149 | 150 | -- predict-fold-4 151 | PAI 152 | -name prediction 153 | -project algo_public 154 | -DinputTableName="juz_test_6_7_xgb" 155 | -DappendColNames="vid,log_tl" 156 | -DmodelName="jz_xgb_model_4" 157 | -DitemDelimiter="," 158 | -DresultColName="result" 159 | -Dlifecycle="28" 160 | -DoutputTableName="jz_xgb_pred_val_4" 161 | -DkvDelimiter=":" 162 | -DenableSparse="false" 163 | 
-DfeatureColNames="snp357_cc,snp357_cg,snp357_gg,snp191_cc,snp191_cg,snp191_gg,snp277_aa,snp277_ag,snp277_gg,snp39_cc,snp39_ct,snp39_tt,snp149_aa,snp149_ag,snp149_gg,snp298_cc,snp298_cg,snp298_gg,snp198_gg,snp198_gt,snp198_tt,snp234_aa,snp234_ag,snp234_gg,snp115_cc,snp115_ct,snp115_tt,snp291_aa,snp291_ag,snp291_gg,snp208_ac,snp208_at,snp208_cc,snp208_ct,snp208_tt,snp44_aa,snp44_ac,snp44_cc,snp223_gg,snp223_gt,snp223_tt,snp47_cc,snp47_ct,snp47_tt,jz_10009,jz_10012,jz_10013,jz_1124,jz_1125,jz_1844,jz_2168,jz_2387,jz_269026,jz_279006,jz_300006,jz_300007,jz_300009,jz_30006,jz_300066,jz_300092,jz_300117,jz_300150,jz_300151,jz_300152,jz_319100,jz_319298,jz_3801,jz_3805,jz_459154,jz_459155,jz_459156,jz_459158,jz_459159,jz_459206,jz_459207,jz_459210,jz_459211,jz_709001,jz_709003,jz_809020,jz_809022,jz_809024,jz_809027,jz_809037,jz_809038,jz_809039,jz_809040,jz_809041,jz_809042,jz_809043,jz_809044,jz_809045,jz_809046,jz_809047,jz_809048,jz_809049,jz_809050,jz_809051,jz_809052,jz_809053,jz_809054,jz_809055,jz_809056,jz_809057,jz_809058,jz_809059,jz_809060,jz_809061,jz_989001,jz_989002,jz_004997,jz_0107,jz_2165,jz_2376,jz_2390,jz_300051,jz_300069,jz_300070,jz_300073,jz_300074,jz_300076,jz_300078,jz_300113,jz_300119,jz_300125,jz_321,jz_3804,jz_3807,jz_1308,jz_1319,jz_1320,jz_1321,jz_1322,jz_0424,jz_0425_x,jz_left_shen_no_voice,jz_right_shen_no_voice,jz_right_shen_strong_voice,jz_jzx_no_voice_area,jz_qian_lie_xian_2,jz_qian_lie_xian_3,jz_dpm_from_3301,jz_jzx_no_voice_jiejie,jz_jzx_low_voice_area,jz_jzx_low_voice_jiejie,jz_liver_no_voice,jz_liver_strong_voice,jz_dan_strong_voice,jz_qian_lie_xian_1,jz_3190_x,jz_3194,jz_3195_x,jz_xue_ya_pian_gao,jz_xue_zhi_pian_gao,jz_xue_tang_pian_gao,jz_high_sugar,jz_guan_xin_bin,jz_shen,jz_smoke,jz_niao,jz_heart_rate,jz_3399_w,jz_3301_w,jz_0403_w,jz_0421_w,jz_0405_w,jz_gender,jz_blood_pipe_style,jz_health,jz_pres_front,jz_pres_back,jz_heart_times,jz_dannan_jieshi,jz_dannan_xirou,jz_shen_jieshi,jz_shen_nanz,jz_gan_nanz,jz_gan_ying_hua,jz_skin,jz_skin_dis,jz_liver,jz_liver_dis,jz_liver_fatty,jz_liver_hepa,jz_carotid_artery,jz_carotid_artery_dis,jz_carotid_artery_mild,jz_carotid_artery_hard,jz_carotid_artery_high_sugar,jz_carotid_artery_high_cho,jz_carotid_artery_heart,jz_thyroid,jz_thyroid_dis,jz_thyroid_up,jz_thyroid_down,jz_bile,jz_bile_dis,jz_bile_infla,jz_bile_cut,jz_kidney,jz_kidney_dis,jz_kidney_infla,jz_kidney_cut,jz_pancreatic,jz_pancreatic_dis,jz_lung,jz_lung_dis,jz_lung_infla,jz_lung_kerl,jz_lung_cut,jz_cervical,jz_cervical_dis,jz_cervical_infla,jz_cervical_cut,jz_cervical_end,jz_uterus,jz_uterus_dis,jz_uterus_infla,jz_uterus_cut,jz_uterus_end,jz_ovary,jz_ovary_dis,jz_ovary_infla,jz_ovary_cut,jz_vagina,jz_vagina_dis,jz_vagina_infla,jz_vagina_end,jz_bladder,jz_bladder_dis,jz_gland,jz_gland_dis,jz_gland_infla,jz_gland_cut,jz_anus,jz_anus_dis,jz_lympha,jz_lympha_dis,jz_lympha_infla,jz_mam,jz_mam_dis,jz_mam_infla,jz_mam_end,jz_mam_cut,jz_bone,jz_bone_less,jz_teeth,jz_teeth_dis,jz_teeth_smoke,jz_teeth_old,jz_tonsil,jz_tonsil_dis,jz_tonsil_infla,jz_tonsil_cut,jz_throat,jz_throat_dis,jz_nerve,jz_nerve_dis,jz_digestion_dis,jz_high_fat,jz_women,jz_men,jz_menopause,jz_old,jz_histroy_val,jz_31,jz_193,jz_2406,jz_269017,jz_269024,jz_0112,jz_269019,jz_317,jz_979012,jz_269013,jz_979019,jz_669009,jz_1840,jz_300129,jz_2372,jz_269025,jz_300035,jz_979021,jz_979022,jz_1873,jz_669021,jz_100005,jz_100006,jz_300093,jz_2386,jz_100010,jz_192,jz_269008,jz_100008,jz_979006,jz_1127,jz_269003,jz_269004,jz_191,jz_0105,jz_a703,jz_1814,jz_269005,jz_183,jz_809008,jz_979007,jz_300011,jz_669005,j
z_269006,jz_269016,jz_143,jz_378,jz_a701,jz_1474,jz_2404,jz_979017,jz_979023,jz_3197,jz_316,jz_709038,jz_979013,jz_809017,jz_33,jz_100013,jz_269020,jz_979018,jz_979005,jz_3193,jz_979020,jz_139,jz_809009,jz_10003,jz_190,jz_979001,jz_100012,jz_0108,jz_0106,jz_141,jz_10002,jz_300008,jz_979004,jz_315,jz_1850,jz_269010,jz_979009,jz_38,jz_269018,jz_809004,jz_809025,jz_1107,jz_2174,jz_659025,jz_809013,jz_269021,jz_2405,jz_269012,jz_809023,jz_809010,jz_809026,jz_32,jz_2403,jz_10004,jz_269014,jz_3191,jz_100014,jz_0111,jz_269015,jz_320,jz_313,jz_300109,jz_2986,jz_2420,jz_669002,jz_269022,jz_39,jz_0109,jz_300012,jz_314,jz_300014,jz_1112,jz_669008,jz_669003,jz_37,jz_100007,jz_312,jz_979015,jz_669004,jz_269009,jz_1845,jz_3430,jz_669007,jz_709049,jz_979008,jz_979003,jz_809001,jz_300001,jz_1110,jz_1815,jz_1345,jz_979011,jz_300021,jz_0104,jz_979016,jz_809021,jz_1117,jz_34,jz_2371,jz_300013,jz_269007,jz_1115,jz_979014,jz_3192,jz_319,jz_269011,jz_269023,jz_20002,jz_979002,jz_1106,jz_0435,jz_1328,jz_0405,jz_3196,jz_0425_y,jz_0201,jz_1305,jz_3195_y,jz_1303,jz_3190_y,jz_0707,jz_1315,jz_0973,jz_3730,jz_0433,jz_1313,jz_0212,jz_0407,jz_0901,jz_0976,jz_3207,jz_3399,jz_0207,jz_0431,jz_0420,jz_1304,jz_0216,jz_0413,jz_0406,jz_0206,jz_0979,jz_0423,jz_2302,jz_0430,jz_3400,jz_0980,jz_0432,jz_0977,jz_sugar_high_related"; 164 | 165 | -- train-fold-5 166 | PAI 167 | -name xgboost 168 | -project algo_public 169 | -DinputTableName="tl_xgb_train_5" 170 | -DmodelName="jz_xgb_model_5" 171 | -Deta="0.01" 172 | -Dobjective="reg:linear" 173 | -DitemDelimiter="," 174 | -Dseed="1024" 175 | -Dnum_round="1800" 176 | -DlabelColName="log_tl" 177 | -DenableSparse="false" 178 | -Dmax_depth="5" 179 | -Dsubsample="0.7" 180 | -Dcolsample_bytree="0.7" 181 | -Dgamma="0" 182 | -Dlambda="50" 183 | -DfeatureColNames="snp357_cc,snp357_cg,snp357_gg,snp191_cc,snp191_cg,snp191_gg,snp277_aa,snp277_ag,snp277_gg,snp39_cc,snp39_ct,snp39_tt,snp149_aa,snp149_ag,snp149_gg,snp298_cc,snp298_cg,snp298_gg,snp198_gg,snp198_gt,snp198_tt,snp234_aa,snp234_ag,snp234_gg,snp115_cc,snp115_ct,snp115_tt,snp291_aa,snp291_ag,snp291_gg,snp208_ac,snp208_at,snp208_cc,snp208_ct,snp208_tt,snp44_aa,snp44_ac,snp44_cc,snp223_gg,snp223_gt,snp223_tt,snp47_cc,snp47_ct,snp47_tt,jz_10009,jz_10012,jz_10013,jz_1124,jz_1125,jz_1844,jz_2168,jz_2387,jz_269026,jz_279006,jz_300006,jz_300007,jz_300009,jz_30006,jz_300066,jz_300092,jz_300117,jz_300150,jz_300151,jz_300152,jz_319100,jz_319298,jz_3801,jz_3805,jz_459154,jz_459155,jz_459156,jz_459158,jz_459159,jz_459206,jz_459207,jz_459210,jz_459211,jz_709001,jz_709003,jz_809020,jz_809022,jz_809024,jz_809027,jz_809037,jz_809038,jz_809039,jz_809040,jz_809041,jz_809042,jz_809043,jz_809044,jz_809045,jz_809046,jz_809047,jz_809048,jz_809049,jz_809050,jz_809051,jz_809052,jz_809053,jz_809054,jz_809055,jz_809056,jz_809057,jz_809058,jz_809059,jz_809060,jz_809061,jz_989001,jz_989002,jz_004997,jz_0107,jz_2165,jz_2376,jz_2390,jz_300051,jz_300069,jz_300070,jz_300073,jz_300074,jz_300076,jz_300078,jz_300113,jz_300119,jz_300125,jz_321,jz_3804,jz_3807,jz_1308,jz_1319,jz_1320,jz_1321,jz_1322,jz_0424,jz_0425_x,jz_left_shen_no_voice,jz_right_shen_no_voice,jz_right_shen_strong_voice,jz_jzx_no_voice_area,jz_qian_lie_xian_2,jz_qian_lie_xian_3,jz_dpm_from_3301,jz_jzx_no_voice_jiejie,jz_jzx_low_voice_area,jz_jzx_low_voice_jiejie,jz_liver_no_voice,jz_liver_strong_voice,jz_dan_strong_voice,jz_qian_lie_xian_1,jz_3190_x,jz_3194,jz_3195_x,jz_xue_ya_pian_gao,jz_xue_zhi_pian_gao,jz_xue_tang_pian_gao,jz_high_sugar,jz_guan_xin_bin,jz_shen,jz_smoke,jz_niao,jz_heart_rate,jz_3399_w,jz_33
01_w,jz_0403_w,jz_0421_w,jz_0405_w,jz_gender,jz_blood_pipe_style,jz_health,jz_pres_front,jz_pres_back,jz_heart_times,jz_dannan_jieshi,jz_dannan_xirou,jz_shen_jieshi,jz_shen_nanz,jz_gan_nanz,jz_gan_ying_hua,jz_skin,jz_skin_dis,jz_liver,jz_liver_dis,jz_liver_fatty,jz_liver_hepa,jz_carotid_artery,jz_carotid_artery_dis,jz_carotid_artery_mild,jz_carotid_artery_hard,jz_carotid_artery_high_sugar,jz_carotid_artery_high_cho,jz_carotid_artery_heart,jz_thyroid,jz_thyroid_dis,jz_thyroid_up,jz_thyroid_down,jz_bile,jz_bile_dis,jz_bile_infla,jz_bile_cut,jz_kidney,jz_kidney_dis,jz_kidney_infla,jz_kidney_cut,jz_pancreatic,jz_pancreatic_dis,jz_lung,jz_lung_dis,jz_lung_infla,jz_lung_kerl,jz_lung_cut,jz_cervical,jz_cervical_dis,jz_cervical_infla,jz_cervical_cut,jz_cervical_end,jz_uterus,jz_uterus_dis,jz_uterus_infla,jz_uterus_cut,jz_uterus_end,jz_ovary,jz_ovary_dis,jz_ovary_infla,jz_ovary_cut,jz_vagina,jz_vagina_dis,jz_vagina_infla,jz_vagina_end,jz_bladder,jz_bladder_dis,jz_gland,jz_gland_dis,jz_gland_infla,jz_gland_cut,jz_anus,jz_anus_dis,jz_lympha,jz_lympha_dis,jz_lympha_infla,jz_mam,jz_mam_dis,jz_mam_infla,jz_mam_end,jz_mam_cut,jz_bone,jz_bone_less,jz_teeth,jz_teeth_dis,jz_teeth_smoke,jz_teeth_old,jz_tonsil,jz_tonsil_dis,jz_tonsil_infla,jz_tonsil_cut,jz_throat,jz_throat_dis,jz_nerve,jz_nerve_dis,jz_digestion_dis,jz_high_fat,jz_women,jz_men,jz_menopause,jz_old,jz_histroy_val,jz_31,jz_193,jz_2406,jz_269017,jz_269024,jz_0112,jz_269019,jz_317,jz_979012,jz_269013,jz_979019,jz_669009,jz_1840,jz_300129,jz_2372,jz_269025,jz_300035,jz_979021,jz_979022,jz_1873,jz_669021,jz_100005,jz_100006,jz_300093,jz_2386,jz_100010,jz_192,jz_269008,jz_100008,jz_979006,jz_1127,jz_269003,jz_269004,jz_191,jz_0105,jz_a703,jz_1814,jz_269005,jz_183,jz_809008,jz_979007,jz_300011,jz_669005,jz_269006,jz_269016,jz_143,jz_378,jz_a701,jz_1474,jz_2404,jz_979017,jz_979023,jz_3197,jz_316,jz_709038,jz_979013,jz_809017,jz_33,jz_100013,jz_269020,jz_979018,jz_979005,jz_3193,jz_979020,jz_139,jz_809009,jz_10003,jz_190,jz_979001,jz_100012,jz_0108,jz_0106,jz_141,jz_10002,jz_300008,jz_979004,jz_315,jz_1850,jz_269010,jz_979009,jz_38,jz_269018,jz_809004,jz_809025,jz_1107,jz_2174,jz_659025,jz_809013,jz_269021,jz_2405,jz_269012,jz_809023,jz_809010,jz_809026,jz_32,jz_2403,jz_10004,jz_269014,jz_3191,jz_100014,jz_0111,jz_269015,jz_320,jz_313,jz_300109,jz_2986,jz_2420,jz_669002,jz_269022,jz_39,jz_0109,jz_300012,jz_314,jz_300014,jz_1112,jz_669008,jz_669003,jz_37,jz_100007,jz_312,jz_979015,jz_669004,jz_269009,jz_1845,jz_3430,jz_669007,jz_709049,jz_979008,jz_979003,jz_809001,jz_300001,jz_1110,jz_1815,jz_1345,jz_979011,jz_300021,jz_0104,jz_979016,jz_809021,jz_1117,jz_34,jz_2371,jz_300013,jz_269007,jz_1115,jz_979014,jz_3192,jz_319,jz_269011,jz_269023,jz_20002,jz_979002,jz_1106,jz_0435,jz_1328,jz_0405,jz_3196,jz_0425_y,jz_0201,jz_1305,jz_3195_y,jz_1303,jz_3190_y,jz_0707,jz_1315,jz_0973,jz_3730,jz_0433,jz_1313,jz_0212,jz_0407,jz_0901,jz_0976,jz_3207,jz_3399,jz_0207,jz_0431,jz_0420,jz_1304,jz_0216,jz_0413,jz_0406,jz_0206,jz_0979,jz_0423,jz_2302,jz_0430,jz_3400,jz_0980,jz_0432,jz_0977,jz_sugar_high_related" 184 | -Dbase_score="0.2" 185 | -Dmin_child_weight="20" 186 | -DkvDelimiter=":"; 187 | 188 | -- predict-fold-5 189 | PAI 190 | -name prediction 191 | -project algo_public 192 | -DinputTableName="juz_test_6_7_xgb" 193 | -DappendColNames="vid,log_tl" 194 | -DmodelName="jz_xgb_model_5" 195 | -DitemDelimiter="," 196 | -DresultColName="result" 197 | -Dlifecycle="28" 198 | -DoutputTableName="jz_xgb_pred_val_5" 199 | -DkvDelimiter=":" 200 | -DenableSparse="false" 201 | 
-DfeatureColNames="snp357_cc,snp357_cg,snp357_gg,snp191_cc,snp191_cg,snp191_gg,snp277_aa,snp277_ag,snp277_gg,snp39_cc,snp39_ct,snp39_tt,snp149_aa,snp149_ag,snp149_gg,snp298_cc,snp298_cg,snp298_gg,snp198_gg,snp198_gt,snp198_tt,snp234_aa,snp234_ag,snp234_gg,snp115_cc,snp115_ct,snp115_tt,snp291_aa,snp291_ag,snp291_gg,snp208_ac,snp208_at,snp208_cc,snp208_ct,snp208_tt,snp44_aa,snp44_ac,snp44_cc,snp223_gg,snp223_gt,snp223_tt,snp47_cc,snp47_ct,snp47_tt,jz_10009,jz_10012,jz_10013,jz_1124,jz_1125,jz_1844,jz_2168,jz_2387,jz_269026,jz_279006,jz_300006,jz_300007,jz_300009,jz_30006,jz_300066,jz_300092,jz_300117,jz_300150,jz_300151,jz_300152,jz_319100,jz_319298,jz_3801,jz_3805,jz_459154,jz_459155,jz_459156,jz_459158,jz_459159,jz_459206,jz_459207,jz_459210,jz_459211,jz_709001,jz_709003,jz_809020,jz_809022,jz_809024,jz_809027,jz_809037,jz_809038,jz_809039,jz_809040,jz_809041,jz_809042,jz_809043,jz_809044,jz_809045,jz_809046,jz_809047,jz_809048,jz_809049,jz_809050,jz_809051,jz_809052,jz_809053,jz_809054,jz_809055,jz_809056,jz_809057,jz_809058,jz_809059,jz_809060,jz_809061,jz_989001,jz_989002,jz_004997,jz_0107,jz_2165,jz_2376,jz_2390,jz_300051,jz_300069,jz_300070,jz_300073,jz_300074,jz_300076,jz_300078,jz_300113,jz_300119,jz_300125,jz_321,jz_3804,jz_3807,jz_1308,jz_1319,jz_1320,jz_1321,jz_1322,jz_0424,jz_0425_x,jz_left_shen_no_voice,jz_right_shen_no_voice,jz_right_shen_strong_voice,jz_jzx_no_voice_area,jz_qian_lie_xian_2,jz_qian_lie_xian_3,jz_dpm_from_3301,jz_jzx_no_voice_jiejie,jz_jzx_low_voice_area,jz_jzx_low_voice_jiejie,jz_liver_no_voice,jz_liver_strong_voice,jz_dan_strong_voice,jz_qian_lie_xian_1,jz_3190_x,jz_3194,jz_3195_x,jz_xue_ya_pian_gao,jz_xue_zhi_pian_gao,jz_xue_tang_pian_gao,jz_high_sugar,jz_guan_xin_bin,jz_shen,jz_smoke,jz_niao,jz_heart_rate,jz_3399_w,jz_3301_w,jz_0403_w,jz_0421_w,jz_0405_w,jz_gender,jz_blood_pipe_style,jz_health,jz_pres_front,jz_pres_back,jz_heart_times,jz_dannan_jieshi,jz_dannan_xirou,jz_shen_jieshi,jz_shen_nanz,jz_gan_nanz,jz_gan_ying_hua,jz_skin,jz_skin_dis,jz_liver,jz_liver_dis,jz_liver_fatty,jz_liver_hepa,jz_carotid_artery,jz_carotid_artery_dis,jz_carotid_artery_mild,jz_carotid_artery_hard,jz_carotid_artery_high_sugar,jz_carotid_artery_high_cho,jz_carotid_artery_heart,jz_thyroid,jz_thyroid_dis,jz_thyroid_up,jz_thyroid_down,jz_bile,jz_bile_dis,jz_bile_infla,jz_bile_cut,jz_kidney,jz_kidney_dis,jz_kidney_infla,jz_kidney_cut,jz_pancreatic,jz_pancreatic_dis,jz_lung,jz_lung_dis,jz_lung_infla,jz_lung_kerl,jz_lung_cut,jz_cervical,jz_cervical_dis,jz_cervical_infla,jz_cervical_cut,jz_cervical_end,jz_uterus,jz_uterus_dis,jz_uterus_infla,jz_uterus_cut,jz_uterus_end,jz_ovary,jz_ovary_dis,jz_ovary_infla,jz_ovary_cut,jz_vagina,jz_vagina_dis,jz_vagina_infla,jz_vagina_end,jz_bladder,jz_bladder_dis,jz_gland,jz_gland_dis,jz_gland_infla,jz_gland_cut,jz_anus,jz_anus_dis,jz_lympha,jz_lympha_dis,jz_lympha_infla,jz_mam,jz_mam_dis,jz_mam_infla,jz_mam_end,jz_mam_cut,jz_bone,jz_bone_less,jz_teeth,jz_teeth_dis,jz_teeth_smoke,jz_teeth_old,jz_tonsil,jz_tonsil_dis,jz_tonsil_infla,jz_tonsil_cut,jz_throat,jz_throat_dis,jz_nerve,jz_nerve_dis,jz_digestion_dis,jz_high_fat,jz_women,jz_men,jz_menopause,jz_old,jz_histroy_val,jz_31,jz_193,jz_2406,jz_269017,jz_269024,jz_0112,jz_269019,jz_317,jz_979012,jz_269013,jz_979019,jz_669009,jz_1840,jz_300129,jz_2372,jz_269025,jz_300035,jz_979021,jz_979022,jz_1873,jz_669021,jz_100005,jz_100006,jz_300093,jz_2386,jz_100010,jz_192,jz_269008,jz_100008,jz_979006,jz_1127,jz_269003,jz_269004,jz_191,jz_0105,jz_a703,jz_1814,jz_269005,jz_183,jz_809008,jz_979007,jz_300011,jz_669005,j
z_269006,jz_269016,jz_143,jz_378,jz_a701,jz_1474,jz_2404,jz_979017,jz_979023,jz_3197,jz_316,jz_709038,jz_979013,jz_809017,jz_33,jz_100013,jz_269020,jz_979018,jz_979005,jz_3193,jz_979020,jz_139,jz_809009,jz_10003,jz_190,jz_979001,jz_100012,jz_0108,jz_0106,jz_141,jz_10002,jz_300008,jz_979004,jz_315,jz_1850,jz_269010,jz_979009,jz_38,jz_269018,jz_809004,jz_809025,jz_1107,jz_2174,jz_659025,jz_809013,jz_269021,jz_2405,jz_269012,jz_809023,jz_809010,jz_809026,jz_32,jz_2403,jz_10004,jz_269014,jz_3191,jz_100014,jz_0111,jz_269015,jz_320,jz_313,jz_300109,jz_2986,jz_2420,jz_669002,jz_269022,jz_39,jz_0109,jz_300012,jz_314,jz_300014,jz_1112,jz_669008,jz_669003,jz_37,jz_100007,jz_312,jz_979015,jz_669004,jz_269009,jz_1845,jz_3430,jz_669007,jz_709049,jz_979008,jz_979003,jz_809001,jz_300001,jz_1110,jz_1815,jz_1345,jz_979011,jz_300021,jz_0104,jz_979016,jz_809021,jz_1117,jz_34,jz_2371,jz_300013,jz_269007,jz_1115,jz_979014,jz_3192,jz_319,jz_269011,jz_269023,jz_20002,jz_979002,jz_1106,jz_0435,jz_1328,jz_0405,jz_3196,jz_0425_y,jz_0201,jz_1305,jz_3195_y,jz_1303,jz_3190_y,jz_0707,jz_1315,jz_0973,jz_3730,jz_0433,jz_1313,jz_0212,jz_0407,jz_0901,jz_0976,jz_3207,jz_3399,jz_0207,jz_0431,jz_0420,jz_1304,jz_0216,jz_0413,jz_0406,jz_0206,jz_0979,jz_0423,jz_2302,jz_0430,jz_3400,jz_0980,jz_0432,jz_0977,jz_sugar_high_related"; 202 | --------------------------------------------------------------------------------
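
Note: the five PAI prediction runs above each score the full test table juz_test_6_7_xgb with one fold model, appending vid and log_tl and writing the model output to the result column of jz_xgb_pred_val_1 through jz_xgb_pred_val_5. Because every fold model is trained on the log_tl label, the five fold outputs still need to be averaged and exponentiated to recover tl. The minimal Python sketch below illustrates that final step; it assumes the same PyODPS environment as split_5_fold_data_xgb.py (a pre-bound odps entry point), and it is only a hedged stand-in for calc_xgb_test_loss_and_save.py, whose actual contents may differ.

import numpy as np
from functools import reduce

# Assumes the PyODPS-provided `odps` object, as in split_5_fold_data_xgb.py.
# Pull the five per-fold prediction tables written by the PAI prediction
# commands above; each holds vid, the appended log_tl, and the model
# output in the "result" column.
preds = []
for t in range(1, 6):
    df = odps.get_table('jz_xgb_pred_val_{}'.format(t)).to_df().to_pandas()
    preds.append(df[['vid', 'log_tl', 'result']]
                 .rename(columns={'result': 'result_{}'.format(t)}))

# Join on vid so the row order of the five tables does not matter.
merged = reduce(
    lambda left, right: left.merge(right.drop('log_tl', axis=1), on='vid'),
    preds)

# Average the five fold predictions (simple 5-fold bagging), then invert
# the log transform applied in add_prefix_for_xgb_model.py to recover tl.
result_cols = ['result_{}'.format(t) for t in range(1, 6)]
merged['log_pred'] = merged[result_cols].mean(axis=1)
merged['tl_pred'] = np.exp(merged['log_pred'])

# Log-scale mean squared error against the appended log_tl column.
mse = ((merged['log_pred'] - merged['log_tl']) ** 2).mean()
print('tl log-scale MSE: {:.5f}'.format(mse))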