├── chizhu
│   ├── single_model
│   │   ├── config.py
│   │   ├── .DS_Store
│   │   ├── data
│   │   │   └── .DS_Store
│   │   ├── user_behavior.py
│   │   ├── get_nn_feat.py
│   │   ├── xgb.py
│   │   └── xgb_nb.py
│   ├── stacking
│   │   ├── .DS_Store
│   │   ├── all_feat
│   │   │   └── .DS_Store
│   │   └── nurbs_feat
│   │       ├── .DS_Store
│   │       ├── xgb_22.py
│   │       └── xgb__nurbs_nb.py
│   ├── readme.txt
│   └── util
│       ├── get_nn_res.py
│       └── bagging.py
├── nb_cz_lwl_wcm
│   ├── 运行说明.txt
│   ├── 2_get_feature_brand.py
│   ├── 6_get_feature_device_start_close.py
│   ├── 7_get_feature_w2v.py
│   ├── 13_last_get_all_feature.py
│   ├── 1_get_age_reg.py
│   └── 4_get_feature_device_start_close_tfidf_1_2.py
├── linwangli
│   ├── 融合思路.pptx
│   ├── result
│   │   └── .DS_Store
│   ├── readme.txt
│   └── code
│       ├── utils.py
│       ├── lgb_allfeat_22.py
│       └── lgb_allfeat_condProb.py
├── 2018易观A10大数据应用峰会-RNG_终极版.pptx
├── README.md
└── THLUO
    ├── 代码运行.bat
    ├── 28.final.py
    ├── readme.md
    ├── 24.thluo_22_lgb.py
    ├── 3.w2c_all_emb.py
    ├── 1.w2c_model_start.py
    ├── 2.w2c_model_close.py
    ├── 3.w2c_model_all.py
    ├── 3.device_quchong_start_app_w2c.py
    ├── 11.hcc_device_brand_age_sex.py
    ├── 25.thluo_22_xgb.py
    ├── 14.device_start_GRU_pred_age.py
    ├── 21.tfidf_lr_sex_age_prob_oof.py
    ├── 26.thluo_nb_lgb.py
    ├── 13.device_start_GRU_pred.py
    ├── 15.device_all_GRU_pred.py
    ├── 16.device_start_capsule_pred.py
    ├── 17.device_start_textcnn_pred.py
    ├── 19.device_start_lstm_pred.py
    └── 18.device_start_text_dpcnn_pred.py

--------------------------------------------------------------------------------
/chizhu/single_model/config.py:
--------------------------------------------------------------------------------
1 | path = "/Users/chizhu/data/competition_data/易观/"
--------------------------------------------------------------------------------
/nb_cz_lwl_wcm/运行说明.txt:
--------------------------------------------------------------------------------
1 | The Demo folder holds the raw competition data set
2 | Run the numbered scripts in order (1, 2, 3, ...); the final output feature_nurbs.csv is generated under the feature folder
--------------------------------------------------------------------------------
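For reference, a minimal driver for the run order described in the note above — a sketch, not part of the original repo. It assumes it is run from inside nb_cz_lwl_wcm/ and lists only the numbered scripts visible in the tree above:

import subprocess

# numbered feature scripts from nb_cz_lwl_wcm, in ascending order
scripts = [
    "1_get_age_reg.py",
    "2_get_feature_brand.py",
    "4_get_feature_device_start_close_tfidf_1_2.py",
    "6_get_feature_device_start_close.py",
    "7_get_feature_w2v.py",
    "13_last_get_all_feature.py",  # the final merge script runs last
]
for script in scripts:
    subprocess.run(["python", script], check=True)  # abort on the first failure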
/linwangli/融合思路.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chizhu/yiguan_sex_age_predict_1st_solution/HEAD/linwangli/融合思路.pptx
--------------------------------------------------------------------------------
/chizhu/stacking/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chizhu/yiguan_sex_age_predict_1st_solution/HEAD/chizhu/stacking/.DS_Store
--------------------------------------------------------------------------------
/linwangli/result/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chizhu/yiguan_sex_age_predict_1st_solution/HEAD/linwangli/result/.DS_Store
--------------------------------------------------------------------------------
/2018易观A10大数据应用峰会-RNG_终极版.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chizhu/yiguan_sex_age_predict_1st_solution/HEAD/2018易观A10大数据应用峰会-RNG_终极版.pptx
--------------------------------------------------------------------------------
/chizhu/single_model/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chizhu/yiguan_sex_age_predict_1st_solution/HEAD/chizhu/single_model/.DS_Store
--------------------------------------------------------------------------------
/chizhu/single_model/data/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chizhu/yiguan_sex_age_predict_1st_solution/HEAD/chizhu/single_model/data/.DS_Store
--------------------------------------------------------------------------------
/chizhu/stacking/all_feat/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chizhu/yiguan_sex_age_predict_1st_solution/HEAD/chizhu/stacking/all_feat/.DS_Store
--------------------------------------------------------------------------------
/chizhu/stacking/nurbs_feat/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chizhu/yiguan_sex_age_predict_1st_solution/HEAD/chizhu/stacking/nurbs_feat/.DS_Store
--------------------------------------------------------------------------------
/linwangli/readme.txt:
--------------------------------------------------------------------------------
1 | |—— code
2 |     |—— lgb_allfeat_22.py: trains LightGBM on [all features] and outputs its result
3 |     |—— lgb_allfeat_condProb.py: trains LightGBM on [all features + conditional probabilities] and outputs its result
4 |     |—— utils.py: helper functions, e.g. weighted fusion / correlation evaluation of submissions
5 | |—— dataset
6 |     |—— deviceid_train.tsv: file provided by the organizers
7 |     |—— all_feat.csv: all features extracted by the team
8 | |—— result: holds the various submission files
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # yiguan_sex_age_predict_1st_solution
2 | First-place solution to the Yiguan (Analysys) sex & age prediction competition
3 | 
4 | ##### [Competition link](https://www.tinymind.cn/competitions/43)
5 | --------
6 | 
7 | Team members worked separately and merged their work afterwards, so feature files overlap across the folders. The main approach is stacking different models: the generated feature space is very high-dimensional, and stacking the probability outputs of different models reduces that dimensionality without losing much information.
8 | 
9 | Run the code in the following order:
10 | 
11 | * 1. Generate the feature files
12 | 
13 | > Following the run notes in the nb_cz_lwl_wcm folder, run all the scripts there to produce the feature file feature_one.csv
14 | > Following the run notes in the thluo folder, run the code there to generate thluo_train_best_feat.csv
15 | 
16 | * 2. Model weighting
17 | Note: the resulting model outputs are in the linwangli folder
18 | 
19 | > Running all the code in the thluo folder produces thluo_prob
20 | > With the models under linwangli/code and the feature files above you can produce the corresponding probability files; see the fusion PPT (融合思路.pptx) in the linwangli folder for the weighting scheme over those probability files
21 | 
22 | 
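To make the stacking idea above concrete, here is a minimal sketch — the model choices and variable names are illustrative, not the team's exact pipeline. Each base model contributes 22-dimensional out-of-fold probabilities, which become the compact input of a second-stage model:

```python
# A sketch of the stacking scheme described above; everything named here
# is illustrative, not the exact competition pipeline.
import numpy as np
from sklearn.model_selection import StratifiedKFold

def oof_probs(model, X, y, X_test, n_classes=22, seed=1024):
    """Out-of-fold train probabilities plus fold-averaged test probabilities."""
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
    oof = np.zeros((len(X), n_classes))
    test = np.zeros((len(X_test), n_classes))
    for tr_idx, va_idx in kf.split(X, y):          # X, y as numpy arrays
        model.fit(X[tr_idx], y[tr_idx])
        oof[va_idx] = model.predict_proba(X[va_idx])
        test += model.predict_proba(X_test) / kf.n_splits
    return oof, test

# stacked_train = np.hstack([oof_probs(m, X, y, X_test)[0] for m in base_models])
# The 22 * len(base_models) stacked columns replace the raw high-dimensional
# features as input to the second-stage (stacker) model.
```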
23 |
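Several models in this repo (chizhu's xgb_nb.py, THLUO's 26/27.thluo_nb_*.py) use the conditional decomposition p(sex, age) = p(sex) * p(age | sex). A minimal sketch of how the two probability tables combine into the 22 submission columns — the frame and column layouts are assumptions for illustration:

```python
# Sketch: fold p(sex) and p(age | sex) into the 22 submission columns.
import pandas as pd

def combine_conditional(sex_prob: pd.DataFrame, age_given_sex: dict) -> pd.DataFrame:
    # sex_prob: columns ['DeviceID', '1', '2']; age_given_sex['1'] / ['2']:
    # columns '0'..'10' (assumed layouts, rows aligned with sex_prob)
    out = sex_prob[['DeviceID']].copy()
    for s in ['1', '2']:
        for a in [str(i) for i in range(11)]:
            # p(sex=s, age=a) = p(sex=s) * p(age=a | sex=s)
            out[f'{s}-{a}'] = sex_prob[s] * age_given_sex[s][a]
    return out
```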
24 | 
25 | CONTRIBUTORS:[THLUO](https://github.com/THLUO) [WangliLin](https://github.com/WangliLin) [Puck Wang](https://github.com/PuckWong) [chizhu](https://github.com/chizhu) [NURBS](https://github.com/suncostanx)
26 | 
27 | 
28 | 
29 | 
30 | 
31 | 
--------------------------------------------------------------------------------
/chizhu/readme.txt:
--------------------------------------------------------------------------------
1 | |-single_model/
2 |     |-data/  processed features and data are stored here
3 |     |-model/  model files
4 |     |-submit  model probability files, usable as stacking inputs
5 |     |-config.py  configures the raw data path
6 |     |-user_behavior.py  builds the user_behavior feature set
7 |     |-get_nn_feat.py  builds the statistical feature input for the nn models
8 |     |-lgb.py
9 |     |-xgb.py
10 |     |-xgb_nb.py  conditional probability
11 |     |-cnn.py
12 |     |-deepnn.py
13 |     |-yg_best_nn.py
14 | |-stacking/
15 |     |-all_feat/  xgb conditional probability over all probability files
16 |     |-nurbs_feat/  xgb 22-class and conditional probability over the nurbs probability files
17 |         |-xgb__nurbs_nb.py  conditional probability
18 |         |-xgb_22.py  22-class classification
19 | |-util/
20 |     |-bagging.py  weighted-fusion script
21 |     |-get_nn_res.py  builds the nn probability file and a submittable result
22 | 
23 | 
24 | Usage:
25 | single_model: 1) first set the data path in config.py
26 |     2) run user_behavior.py
27 |     3) run get_nn_feat.py
28 |     4) then run the nn or tree models one by one; the probability files land in submit/
29 | 
30 | stacking: these scripts cannot be run directly because they need the probability files (about 2 GB, not included; ask us if you need them)
31 | util: for the weighting step, average the outputs of xgb_22.py and xgb__nurbs_nb.py under stacking/nurbs_feat into one file, xgb_22_nb.csv
--------------------------------------------------------------------------------
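Line 31 of the note above describes averaging the two stacking/nurbs_feat outputs into xgb_22_nb.csv; a minimal sketch, assuming equal weights — xgb_nurbs_22.csv is the name xgb_22.py actually writes, while the nb file name is an assumption:

import pandas as pd

a = pd.read_csv("xgb_nurbs_22.csv")    # written by xgb_22.py
b = pd.read_csv("xgb_nurbs_nb.csv")    # assumed output name of xgb__nurbs_nb.py
cols = [c for c in a.columns if c != "DeviceID"]
avg = a[["DeviceID"]].copy()
avg[cols] = (a[cols] + b[cols]) / 2.0  # plain mean of the two probability files
avg.to_csv("xgb_22_nb.csv", index=False)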
/nb_cz_lwl_wcm/2_get_feature_brand.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | 
3 | import pandas as pd
4 | import numpy as np
5 | from sklearn import preprocessing
6 | 
7 | train = pd.read_csv('Demo/deviceid_train.tsv', sep='\t', header=None)
8 | test = pd.read_csv('Demo/deviceid_test.tsv', sep='\t', header=None)
9 | 
10 | data_all = pd.concat([train, test], axis=0)
11 | data_all = data_all.rename({0: 'id'}, axis=1)
12 | del data_all[1], data_all[2]
13 | deviced_brand = pd.read_csv('Demo/deviceid_brand.tsv', sep='\t', header=None)
14 | deviced_brand = deviced_brand.rename({0: 'id'}, axis=1)
15 | data_all = pd.merge(data_all, deviced_brand, on='id', how='left')
16 | print(data_all)
17 | # label-encode the two brand columns directly; cast to str so devices
18 | # without a brand (NaN after the left join) can still be encoded
19 | feature = pd.DataFrame()
20 | label_encoder = preprocessing.LabelEncoder()
21 | feature['phone_type'] = label_encoder.fit_transform(data_all[1].astype(str))
22 | feature['phone_type_detail'] = label_encoder.fit_transform(data_all[2].astype(str))
23 | feature.to_csv('feature/deviceid_brand_feature.csv', index=False)
--------------------------------------------------------------------------------
/THLUO/代码运行.bat:
--------------------------------------------------------------------------------
1 | python 1.w2c_model_start.py
2 | python 2.w2c_model_close.py
3 | python 3.w2c_model_all.py
4 | python 3.device_quchong_start_app_w2c.py
5 | python 3.w2c_all_emb.py
6 | python 4.device_age_prob_oof.py
7 | python 5.device_sex_prob_oof.py
8 | python 6.start_close_age_prob_oof.py
9 | python 7.start_close_sex_prob_oof.py
10 | python 9.sex_age_bin_prob_oof.py
11 | python 10.age_bin_prob_oof.py
12 | python 11.hcc_device_brand_age_sex.py
13 | python 12.device_age_regression_prob_oof.py
14 | python 13.device_start_GRU_pred.py
15 | python 14.device_start_GRU_pred_age.py
16 | python 15.device_all_GRU_pred.py
17 | python 16.device_start_capsule_pred.py
18 | python 17.device_start_textcnn_pred.py
19 | python 18.device_start_text_dpcnn_pred.py
20 | python 19.device_start_lstm_pred.py
21 | python 20.lgb_sex_age_prob_oof.py
22 | python 21.tfidf_lr_sex_age_prob_oof.py
23 | python 22.base_feat.py
24 | python 23.ATT_v6.py
25 | python 24.thluo_22_lgb.py
26 | python 25.thluo_22_xgb.py
27 | python 26.thluo_nb_lgb.py
28 | python 27.thluo_nb_xgb.py
29 | python 28.final.py
--------------------------------------------------------------------------------
/THLUO/28.final.py:
--------------------------------------------------------------------------------
1 | 
2 | # coding: utf-8
3 | 
4 | # In[1]:
5 | 
6 | 
7 | import numpy as np
8 | import pandas as pd
9 | 
10 | 
11 | # In[2]:
12 | 
13 | 
14 | th_22_results_lgb = pd.read_csv('th_22_results_lgb.csv')
15 | th_22_results_xgb = pd.read_csv('th_22_results_xgb.csv')
16 | th_lgb_nb = pd.read_csv('th_lgb_nb.csv')
17 | th_xgb_nb = pd.read_csv('th_xgb_nb.csv')
18 | 
19 | 
20 | # In[5]:
21 | 
22 | 
23 | # direct 22-class models: blend lgb and xgb with weights 0.55 / 0.45
24 | results_22 = pd.DataFrame(th_22_results_lgb.values[:,1:] * 0.55 + th_22_results_xgb.values[:,1:] * 0.45)
25 | results_22.columns = th_22_results_lgb.columns[1:]
26 | results_22['DeviceID'] = th_22_results_lgb['DeviceID']
27 | 
28 | 
29 | # In[6]:
30 | 
31 | 
32 | # conditional-probability models: blend xgb and lgb with weights 0.65 / 0.35
33 | results_nb = pd.DataFrame(th_xgb_nb.values[:,1:] * 0.65 + th_lgb_nb.values[:,1:] * 0.35)
34 | results_nb.columns = th_xgb_nb.columns[1:]
35 | results_nb['DeviceID'] = th_xgb_nb['DeviceID']
36 | 
37 | 
38 | # In[ ]:
39 | 
40 | 
41 | # blend the two blended results once more, 0.65 / 0.35
42 | results_final = pd.DataFrame(results_22.values[:,1:] * 0.65 + results_nb.values[:,1:] * 0.35)
43 | results_final.columns = results_22.columns[1:]
44 | results_final['DeviceID'] = results_22['DeviceID']
45 | 
46 | 
47 | # In[ ]:
48 | 
49 | 
50 | results_final.to_csv('result/thluo_final.csv', index=None)
--------------------------------------------------------------------------------
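28.final.py above and chizhu/util/bagging.py below both hard-code pairwise weights; for reference, a small generalization to any number of probability files — a sketch with illustrative paths, assuming all files share the same DeviceID row order:

import pandas as pd

def blend(paths, weights):
    # weighted sum of probability files; weights are expected to sum to 1
    assert abs(sum(weights) - 1.0) < 1e-9
    frames = [pd.read_csv(p) for p in paths]
    cols = [c for c in frames[0].columns if c != 'DeviceID']
    out = frames[0][['DeviceID']].copy()
    out[cols] = sum(w * f[cols] for w, f in zip(weights, frames))
    return out

# e.g. blend(['th_22_results_lgb.csv', 'th_22_results_xgb.csv'], [0.55, 0.45])
# reproduces the first blending step of 28.final.py above.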
pd.read_csv(path+"deviceid_test.tsv", sep="\t", names=['DeviceID']) 9 | pred = pd.read_csv(path+"nn_feat_v6.csv") 10 | 11 | lgb1 = pd.read_csv(path+"th_results_ems_22_nb_5400.csv") # 576 12 | lgb1 = pd.merge(test, lgb1, on="DeviceID", how="left") 13 | submit = lgb1.copy() 14 | 15 | nn1 = pd.read_csv(path+"xgb_and_nurbs.csv") # 573 16 | nn1 = pd.merge(test, nn1, on="DeviceID", how="left") 17 | 18 | # nn2=pd.read_csv(path+"th_results_ems_2547.csv")##574 19 | # nn2=pd.merge(test,nn2,on="DeviceID",how="left") 20 | 21 | # lgb2=pd.read_csv(path+"th_results_ems_2.549.csv")##570 22 | # lgb2=pd.merge(test,lgb2,on="DeviceID",how="left") 23 | 24 | # lgb3=pd.read_csv(path+"th_results_ems_2547.csv")##547 25 | # lgb3=pd.merge(test,lgb3,on="DeviceID",how="left") 26 | 27 | 28 | for i in['1-0', '1-1', '1-2', '1-3', '1-4', '1-5', '1-6', 29 | '1-7', '1-8', '1-9', '1-10', '2-0', '2-1', '2-2', '2-3', '2-4', 30 | '2-5', '2-6', '2-7', '2-8', '2-9', '2-10']: 31 | # submit[i]=(lgb1[i]+lgb2[i]+nn1[i]+nn2[i])/4.0 32 | submit[i] = 0.75*lgb1[i]+0.25*nn1[i] 33 | # submit[i]=0.1*lgb1[i]+0.1*nn1[i]+0.2*nn2[i]+0.2*lgb2[i]+0.4*lgb3[i] 34 | 35 | submit.to_csv(path+"th_nurbs_7525.csv", index=False) 36 | -------------------------------------------------------------------------------- /linwangli/code/utils.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | def weights_ensemble(results, weights): 5 | ''' 6 | 针对此次比赛的按权重进行模型融合的函数脚本 7 | results: list,存放所有需要融合的结果路径 8 | weights: list, 存放各个结果的权重 9 | return: 可以直接to_csv提交的结果 10 | ''' 11 | for i in range(len(results)): 12 | if i == 0: 13 | sub = pd.read_csv(results[0]) 14 | final_cols = list(sub.columns) 15 | cols = list(sub.columns) 16 | cols[1:] = [col + '_0' for col in cols[1:]] 17 | sub.columns = cols 18 | else: 19 | result = pd.read_csv(results[i]) 20 | cols = list(result.columns) 21 | cols[1:] = [col + '_' + str(i) for col in cols[1:]] 22 | result.columns = cols 23 | sub = pd.merge(left=sub, right=result, on='DeviceID') 24 | for i in range(len(weights)): 25 | for col in final_cols[1:]: 26 | if col not in sub.columns: 27 | sub[col] = weights[i] * sub[col + '_' + str(i)] 28 | else: 29 | sub[col] = sub[col] + weights[i] * sub[col + '_' + str(i)] 30 | sub = sub[final_cols] 31 | return sub 32 | 33 | def result_corr(path1, path2): 34 | ''' 35 | 根据此次比赛写的评测不同提交结果相关性文件 36 | path1: 结果1的路径 37 | path2: 结果2的路径 38 | return: 返回不同提交结果的相关性 39 | ''' 40 | result_1 = pd.read_csv(path1) 41 | result_2 = pd.read_csv(path2) 42 | result = pd.merge(left=result_1, right=result_2, on='DeviceID', suffixes=('_x', '_y')) 43 | cols = result_1.columns[1:] 44 | col_list = [] 45 | for col in cols: 46 | col_pair = [col + '_x', col + '_y'] 47 | col_list.append(result[col_pair].corr().loc[col + '_x', col + '_y']) 48 | 49 | return np.mean(col_list) -------------------------------------------------------------------------------- /nb_cz_lwl_wcm/6_get_feature_device_start_close.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | import pandas as pd 4 | import numpy as np 5 | from sklearn import preprocessing 6 | 7 | train = pd.read_csv('Demo/deviceid_train.tsv', sep='\t', header=None) 8 | test = pd.read_csv('Demo/deviceid_test.tsv', sep='\t', header=None) 9 | 10 | data_all = pd.concat([train, test], axis=0) 11 | data_all = data_all.rename({0:'id'}, axis=1) 12 | del data_all[1],data_all[2] 13 | 14 | start_close_time = 
/nb_cz_lwl_wcm/6_get_feature_device_start_close.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | 
3 | import pandas as pd
4 | import numpy as np
5 | from sklearn import preprocessing
6 | 
7 | train = pd.read_csv('Demo/deviceid_train.tsv', sep='\t', header=None)
8 | test = pd.read_csv('Demo/deviceid_test.tsv', sep='\t', header=None)
9 | 
10 | data_all = pd.concat([train, test], axis=0)
11 | data_all = data_all.rename({0: 'id'}, axis=1)
12 | del data_all[1], data_all[2]
13 | 
14 | start_close_time = pd.read_csv('Demo/deviceid_package_start_close.tsv', sep='\t', header=None)
15 | start_close_time = start_close_time.rename({0: 'id', 1: 'app_name', 2: 'start_time', 3: 'close_time'}, axis=1)
16 | 
17 | # session length in seconds (timestamps are in milliseconds)
18 | start_close_time['diff_time'] = (start_close_time['close_time'] - start_close_time['start_time'])/1000
19 | 
20 | print('converting timestamps to hour of day')
21 | import time
22 | start_close_time['close_time'] = start_close_time['close_time'].apply(lambda row: int(time.localtime(row/1000).tm_hour))
23 | start_close_time['start_time'] = start_close_time['start_time'].apply(lambda row: int(time.localtime(row/1000).tm_hour))
24 | 
25 | # total number of usage records per device in this table
26 | print('total record count per device')
27 | feature = pd.DataFrame()
28 | feature['start_close_count'] = pd.merge(data_all, start_close_time.groupby('id').size().reset_index(), on='id', how='left')[0]
29 | 
30 | # usage count between 0:00 and 5:00
31 | temp = start_close_time[(start_close_time['close_time'] >= 0) & (start_close_time['close_time'] <= 5)]
32 | temp = temp.groupby('id').size().reset_index()
33 | feature['zero_five_count'] = pd.merge(data_all, temp, on='id', how='left').fillna(0)[0]
34 | 
35 | # label-encode the name of the app with the longest total usage time
36 | def get_max_label(row):
37 |     row_name = list(row['app_name'])
38 |     row_diff_time = list(row['diff_time'])
39 |     return row_name[np.argmax(row_diff_time)]
40 | 
41 | start_close_max_name = start_close_time.groupby('id').apply(lambda row: get_max_label(row)).reset_index()
42 | label_encoder = preprocessing.LabelEncoder()
43 | # cast to str so devices with no records (filled with 0 after the left join) encode cleanly
44 | feature['start_close_max_name'] = label_encoder.fit_transform(pd.merge(data_all, start_close_max_name, on='id', how='left').fillna(0)[0].astype(str))
45 | 
46 | feature.to_csv('feature/feature_start_close.csv', index=False)
--------------------------------------------------------------------------------
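The 0:00-5:00 count above generalizes to the other parts of the day; a sketch over the start_close_time frame prepared above — the 6-hour bucket edges are an assumption:

import pandas as pd

# counts per 6-hour bucket of the closing hour, one column per bucket
buckets = pd.cut(start_close_time['close_time'], bins=[-1, 5, 11, 17, 23],
                 labels=['h0_5', 'h6_11', 'h12_17', 'h18_23'])
cnt = start_close_time.groupby(['id', buckets]).size().unstack(fill_value=0).reset_index()
bucket_feats = pd.merge(data_all, cnt, on='id', how='left').fillna(0)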
/nb_cz_lwl_wcm/7_get_feature_w2v.py:
--------------------------------------------------------------------------------
1 | from gensim.models import Word2Vec
2 | import pandas as pd
3 | path = "Demo/"
4 | packages = pd.read_csv(path+"deviceid_packages.tsv",
5 |                        sep="\t", names=['id', 'app_list'])
6 | packages['app_count'] = packages['app_list'].apply(
7 |     lambda x: len(x.split(",")), 1)
8 | documents = packages['app_list'].values.tolist()
9 | texts = [[word for word in str(document).split(',')] for document in documents]
10 | # frequency = defaultdict(int)
11 | # for text in texts:
12 | #     for token in text:
13 | #         frequency[token] += 1
14 | # texts = [[token for token in text if frequency[token] >= 5] for text in texts]
15 | w2v = Word2Vec(texts, size=128, window=10, iter=45,
16 |                workers=12, seed=1017, min_count=5)
17 | w2v.wv.save_word2vec_format('./w2v_128.txt')
18 | 
19 | import gensim
20 | import numpy as np
21 | 
22 | 
23 | def get_w2v_avg(text, w2v_out_path, word2vec_Path):
24 |     texts = []
25 |     w2v_dim = 128
26 |     data = text
27 |     # data = pd.read_csv(text_path)
28 |     data['app_list'] = data['app_list'].apply(
29 |         lambda x: x.strip().split(","), 1)
30 |     texts = data['app_list'].values.tolist()
31 | 
32 |     model = gensim.models.KeyedVectors.load_word2vec_format(
33 |         word2vec_Path, binary=False)
34 |     vacab = model.vocab.keys()
35 | 
36 |     w2v_feature = np.zeros((len(texts), w2v_dim))
37 |     w2v_feature_avg = np.zeros((len(texts), w2v_dim))
38 | 
39 |     for i, line in enumerate(texts):
40 |         num = 0
41 |         if line == '':
42 |             w2v_feature_avg[i, :] = np.zeros(w2v_dim)
43 |         else:
44 |             for word in line:
45 |                 num += 1
46 |                 vec = model[word] if word in vacab else np.zeros(w2v_dim)
47 |                 w2v_feature[i, :] += vec
48 |             w2v_feature_avg[i, :] = w2v_feature[i, :] / num
49 |     w2v_avg = pd.DataFrame(w2v_feature_avg)
50 |     w2v_avg.columns = ['w2v_avg_' + str(i) for i in w2v_avg.columns]
51 |     w2v_avg['id'] = data['id']
52 |     w2v_avg.to_csv(w2v_out_path, encoding='utf-8', index=None)
53 |     return w2v_avg
54 | 
55 | 
56 | w2v_feat = get_w2v_avg(packages, "feature/w2v_avg.csv", "w2v_128.txt")
--------------------------------------------------------------------------------
/nb_cz_lwl_wcm/13_last_get_all_feature.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | 
3 | import pandas as pd
4 | 
5 | df_brand = pd.read_csv('feature/deviceid_brand_feature.csv')
6 | df_lr = pd.read_csv('feature/tfidf_lr_error_single_classfiy.csv')
7 | df_pac = pd.read_csv('feature/tfidf_pac_error_single_classfiy.csv')
8 | df_sgd = pd.read_csv('feature/tfidf_sgd_error_single_classfiy.csv')
9 | df_ridge = pd.read_csv('feature/tfidf_ridge_error_single_classfiy.csv')
10 | df_bnb = pd.read_csv('feature/tfidf_bnb_error_single_classfiy.csv')
11 | df_mnb = pd.read_csv('feature/tfidf_mnb_error_single_classfiy.csv')
12 | df_lsvc = pd.read_csv('feature/tfidf_lsvc_error_single_classfiy.csv')
13 | df_lr_2 = pd.read_csv('feature/tfidf_lr_1_3_error_single_classfiy.csv')
14 | df_pac_2 = pd.read_csv('feature/tfidf_pac_1_3_error_single_classfiy.csv')
15 | df_sgd_2 = pd.read_csv('feature/tfidf_sgd_1_3_error_single_classfiy.csv')
16 | df_ridge_2 = pd.read_csv('feature/tfidf_ridge_1_3_error_single_classfiy.csv')
17 | df_bnb_2 = pd.read_csv('feature/tfidf_bnb_1_3_error_single_classfiy.csv')
18 | df_mnb_2 = pd.read_csv('feature/tfidf_mnb_1_3_error_single_classfiy.csv')
19 | df_lsvc_2 = pd.read_csv('feature/tfidf_lsvc_2_error_single_classfiy.csv')
20 | df_kmeans_2 = pd.read_csv('feature/cluster_2_tfidf_feature.csv')
21 | df_start_close = pd.read_csv('feature/feature_start_close.csv')
22 | df_ling_reg = pd.read_csv('feature/tfidf_ling_reg.csv')
23 | df_par_reg = pd.read_csv('feature/tfidf_par_reg.csv')
24 | df_svr_reg = pd.read_csv('feature/tfidf_svr_reg.csv')
25 | df_w2v = pd.read_csv('feature/w2v_avg.csv')
26 | del df_w2v['id']  # the device-id column written by 7_get_feature_w2v.py
27 | df_best_nn = pd.read_csv('feature/yg_best_nn.csv')
28 | del df_best_nn['DeviceID']
29 | df_chizhu_lgb = pd.read_csv('feature/lgb_feat_chizhu.csv')
30 | del df_chizhu_lgb['DeviceID']
31 | df_chizhu_nn = pd.read_csv('feature/nn_feat.csv')
32 | del df_chizhu_nn['DeviceID']
33 | df_lwl_lgb = pd.read_csv('feature/feat_lwl.csv')
34 | del df_lwl_lgb['DeviceID']
35 | df_feature = pd.concat([
36 |     df_brand,
37 |     df_lr, df_pac, df_sgd,
38 |     df_ridge, df_bnb, df_mnb, df_lsvc,
39 |     df_start_close, df_ling_reg, df_par_reg, df_svr_reg,
40 |     df_lr_2, df_pac_2, df_sgd_2, df_ridge_2, df_bnb_2, df_mnb_2,
41 |     df_lsvc_2, df_kmeans_2, df_w2v, df_best_nn, df_chizhu_lgb, df_chizhu_nn,
42 |     df_lwl_lgb
43 | ], axis=1)
44 | 
45 | df_feature.to_csv('feature/feature_one.csv', encoding='utf8', index=None)
--------------------------------------------------------------------------------
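Since 13_last_get_all_feature.py concatenates some 25 feature frames column-wise, a quick consistency check before the concat can catch misaligned inputs early — a sketch over the frames loaded above:

# every frame must have the same row count for an axis=1 concat to line up
frames = [df_brand, df_lr, df_pac, df_sgd, df_ridge, df_bnb, df_mnb, df_lsvc,
          df_start_close, df_ling_reg, df_par_reg, df_svr_reg,
          df_lr_2, df_pac_2, df_sgd_2, df_ridge_2, df_bnb_2, df_mnb_2,
          df_lsvc_2, df_kmeans_2, df_w2v, df_best_nn, df_chizhu_lgb,
          df_chizhu_nn, df_lwl_lgb]
row_counts = {len(f) for f in frames}
assert len(row_counts) == 1, f"row counts differ across feature files: {row_counts}"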
/THLUO/readme.md:
--------------------------------------------------------------------------------
1 | This code was run on Windows 10 with 48 GB RAM and a 1070 Ti GPU; since there are many py files, a full run takes quite a long time
2 | 
3 | Folders:
4 | > cache holds the model outputs
5 | > embedding holds the w2c (word2vec) embeddings
6 | > input holds the competition data
7 | > result holds THLUO's final result
8 | 
9 | What each py file does:
10 | * 1.w2c_model_start.py  Sorts each device's apps by open time into an app_list; treating apps as words and each device_id as a document, trains app embeddings
11 | * 2.w2c_model_close.py  Same, but with the apps sorted by close time
12 | * 3.w2c_model_all.py  Same, with open and close times merged into a single ordering
13 | * 4.device_quchong_start_app_w2c.py  Same as 1, but with duplicates removed from the app_list before training the embeddings
14 | * 5.device_age_prob_oof.py  Predicts the user's age on its own
15 | * 6.device_sex_prob_oof.py  Predicts the user's sex on its own
16 | * 7.start_close_age_prob_oof.py  Predicts the age probabilities associated with each app
17 | * 8.start_close_sex_prob_oof.py  Predicts the sex probabilities associated with each app
18 | * 9.sex_age_bin_prob_oof.py  Uses binary classification to predict the probability that a user falls in each sex-age bucket
19 | * 10.age_bin_prob_oof.py  Uses binary classification to predict the probability that a user falls in each age bucket
20 | * 11.hcc_device_brand_age_sex.py  Phone brand and phone model are high-cardinality categoricals; following the paper "A Preprocessing Scheme for High-Cardinality Categorical Attributes in Classification and Prediction Problems", predicts the sex/age probabilities associated with brand and model
21 | * 12.device_age_regression_prob_oof.py  Uses regression to predict the probability that a user falls in each age bucket
22 | * 13.device_start_GRU_pred.py  Sorts each device's apps by open time into an app_list (apps as words, device_id as document) and runs a GRU text model to predict the sex-age probabilities
23 | * 14.device_start_GRU_pred_age.py  The same GRU setup, predicting the age probabilities only
24 | * 15.device_all_GRU_pred.py  A GRU text model over the merged open/close ordering, predicting the sex-age probabilities
25 | * 16.device_start_capsule_pred.py  A capsule network predicting the sex-age probabilities
26 | * 17.device_start_textcnn_pred.py  A TextCNN model predicting the sex-age probabilities
27 | * 18.device_start_text_dpcnn_pred.py  A DPCNN model predicting the sex-age probabilities
28 | * 19.device_start_lstm_pred.py  An LSTM model predicting the sex-age probabilities
29 | * 20.lgb_sex_age_prob_oof.py  A basic model predicting the sex-age probabilities
30 | * 21.tfidf_lr_sex_age_prob_oof.py  TF-IDF over the apps plus logistic regression to predict the sex-age probabilities
31 | * 22.base_feat.py  Builds the basic hand-crafted features plus the probability features produced by the scripts above
32 | * 23.ATT_v6.py  Trains an attention model on the features from 22.base_feat.py to predict the sex-age probabilities
33 | * 24.thluo_22_lgb.py  Trains a 22-class lgb model and writes the test probability file
34 | * 25.thluo_22_xgb.py  Trains a 22-class xgb model and writes the test probability file
35 | * 26.thluo_nb_lgb.py  Trains a conditional lgb model and writes the test probability file; "conditional" means first predicting p(sex), then p(age|sex), so that p(sex, age) = p(sex) * p(age|sex)
36 | * 27.thluo_nb_xgb.py  The same conditional scheme with xgb, writing the test probability file
37 | * 28.final.py  Linearly blends the outputs of the four models above into THLUO's individual final result
38 | * TextModel.py contains the text models used in this competition
39 | * util.py contains some shared helper functions
40 | 
41 | 
42 | 
43 | 
44 | 
45 | > note: the code was packaged in a hurry at the end of the competition (it had lived in notebooks until then), so contact the team if anything fails to run
--------------------------------------------------------------------------------
/THLUO/24.thluo_22_lgb.py:
--------------------------------------------------------------------------------
1 | 
2 | # coding: utf-8
3 | 
4 | # In[1]:
5 | 
6 | 
7 | import pandas as pd
8 | import seaborn as sns
9 | import numpy as np
10 | from tqdm import tqdm
11 | from sklearn.decomposition import LatentDirichletAllocation
12 | from sklearn.cross_validation import train_test_split
13 | from sklearn.metrics import accuracy_score
14 | import lightgbm as lgb
15 | from datetime import datetime,timedelta
16 | import time
17 | from sklearn.feature_extraction.text import TfidfTransformer
18 | from sklearn.feature_extraction.text import CountVectorizer
19 | from sklearn.preprocessing import LabelEncoder
20 | import gc
21 | 
22 | 
23 | 
24 | # In[24]:
25 | 
26 | 
27 | df_train_w2v = pd.read_csv('thluo_train_best_feat.csv')
28 | df_att_nn_feat_v6 = pd.read_csv('att_nn_feat_v6.csv')
29 | df_att_nn_feat_v6.columns = ['device_id'] + ['att_nn_feat_' + str(i) for i in range(22)]
30 | df_train_w2v = df_train_w2v.merge(df_att_nn_feat_v6, on='device_id', how='left')
31 | 
32 | 
33 | # In[ ]:
34 | 
35 | 
36 | df_train_w2v.to_csv('thluo_train_best_feat.csv', index=None)
37 | 
38 | 
39 | # In[26]:
40 | 
41 | 
42 | train = df_train_w2v[df_train_w2v['sex'].notnull()]
43 | test = 
df_train_w2v[df_train_w2v['sex'].isnull()] 44 | 45 | X = train.drop(['sex','age','sex_age','device_id'],axis=1) 46 | Y = train['sex_age'] 47 | Y_CAT = pd.Categorical(Y) 48 | Y = pd.Series(Y_CAT.codes) 49 | 50 | 51 | # In[28]: 52 | 53 | 54 | from sklearn.model_selection import KFold, StratifiedKFold 55 | gc.collect() 56 | seed = 666 57 | num_folds = 5 58 | folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=seed) 59 | 60 | sub_list = [] 61 | 62 | cate_feat = ['device_type','device_brand'] 63 | 64 | for n_fold, (train_idx, valid_idx) in enumerate(folds.split(X, Y)): 65 | train_x, train_y = X.iloc[train_idx], Y.iloc[train_idx] 66 | valid_x, valid_y = X.iloc[valid_idx], Y.iloc[valid_idx] 67 | 68 | lgb_train=lgb.Dataset(train_x,label=train_y) 69 | lgb_eval = lgb.Dataset(valid_x, valid_y, reference=lgb_train) 70 | params = { 71 | 'boosting_type': 'gbdt', 72 | #'learning_rate' : 0.02, 73 | 'learning_rate' : 0.01, 74 | 'max_depth':5, 75 | 'num_leaves' : 2 ** 4, 76 | 'metric': {'multi_logloss'}, 77 | 'num_class' : 22, 78 | 'objective' : 'multiclass', 79 | 'random_state' : 2018, 80 | 'bagging_freq' : 5, 81 | 'feature_fraction' : 0.7, 82 | 'bagging_fraction' : 0.7, 83 | 'min_split_gain' : 0.0970905919552776, 84 | 'min_child_weight' : 9.42012323936088, 85 | } 86 | 87 | gbm = lgb.train(params, 88 | lgb_train, 89 | num_boost_round=1000, 90 | valid_sets=lgb_eval, 91 | early_stopping_rounds=200, verbose_eval=100) 92 | 93 | sub = pd.DataFrame(gbm.predict(test[X.columns.values],num_iteration=gbm.best_iteration)) 94 | sub_list.append(sub) 95 | 96 | 97 | # In[29]: 98 | 99 | 100 | sub = (sub_list[0] + sub_list[1] + sub_list[2] + sub_list[3] + sub_list[4]) / num_folds 101 | 102 | 103 | # In[31]: 104 | 105 | 106 | sub.columns=Y_CAT.categories 107 | sub['DeviceID']=test['device_id'].values 108 | sub=sub[['DeviceID', '1-0', '1-1', '1-2', '1-3', '1-4', '1-5', '1-6', '1-7','1-8', '1-9', '1-10', '2-0', '2-1', '2-2', '2-3', '2-4', '2-5', '2-6', '2-7', '2-8', '2-9', '2-10']] 109 | 110 | 111 | # In[32]: 112 | 113 | 114 | sub.to_csv('th_22_results_lgb.csv',index=False) 115 | 116 | -------------------------------------------------------------------------------- /linwangli/code/lgb_allfeat_22.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | from catboost import Pool, CatBoostClassifier, cv 5 | import pandas as pd 6 | import seaborn as sns 7 | import numpy as np 8 | from tqdm import tqdm 9 | from sklearn.decomposition import LatentDirichletAllocation 10 | from sklearn.model_selection import train_test_split 11 | from sklearn.metrics import accuracy_score 12 | import lightgbm as lgb 13 | from datetime import datetime,timedelta 14 | import matplotlib.pyplot as plt 15 | import time 16 | 17 | import gc 18 | from sklearn import preprocessing 19 | from sklearn.feature_extraction.text import TfidfVectorizer 20 | 21 | from scipy.sparse import hstack, vstack 22 | from sklearn.model_selection import StratifiedKFold 23 | from sklearn.model_selection import cross_val_score 24 | from skopt.space import Integer, Categorical, Real, Log10 25 | from skopt.utils import use_named_args 26 | from skopt import gp_minimize 27 | import re 28 | 29 | 30 | train = pd.read_csv('../dataset/deviceid_train.tsv', sep='\t', names=['device_id', 'sex', 'age']) 31 | all_feat = pd.read_csv('../dataset/all_feat.csv') 32 | 33 | train['label'] = train['sex'].astype(str) + '-' + train['age'].astype(str) 34 | label_le = preprocessing.LabelEncoder() 35 | 
train['label'] = label_le.fit_transform(train['label']) 36 | data_all = pd.merge(left=all_feat, right=train, on='device_id', how='left') 37 | 38 | 39 | train = data_all[:50000] 40 | test = data_all[50000:] 41 | train = train.fillna(-1) 42 | test = test.fillna(-1) 43 | del data_all 44 | gc.collect() 45 | 46 | use_feats = all_feat.columns[1:] 47 | use_feats 48 | 49 | X_train = train[use_feats] 50 | X_test = test[use_feats] 51 | Y = train['label'] 52 | kfold = StratifiedKFold(n_splits=5, random_state=10, shuffle=True) 53 | sub = np.zeros((X_test.shape[0], 22)) 54 | for i, (train_index, test_index) in enumerate(kfold.split(X_train, Y)): 55 | X_tr, X_vl, y_tr, y_vl = X_train.iloc[train_index], X_train.iloc[test_index], Y.iloc[train_index], Y.iloc[test_index] 56 | dtrain = lgb.Dataset(X_tr, label=y_tr, categorical_feature=[-1]) 57 | dvalid = lgb.Dataset(X_vl, y_vl, reference=dtrain) 58 | params = { 59 | 'boosting_type': 'gbdt', 60 | 'max_depth':6, 61 | 'metric': {'multi_logloss'}, 62 | 'num_class':22, 63 | 'objective':'multiclass', 64 | 'num_leaves':7, 65 | 'subsample': 0.9, 66 | 'colsample_bytree': 0.2, 67 | 'lambda_l1':0.0001, 68 | 'lambda_l2':0.00111, 69 | 'subsample_freq':12, 70 | 'learning_rate': 0.012, 71 | 'min_child_weight':12 72 | 73 | } 74 | 75 | model = lgb.train(params, 76 | dtrain, 77 | num_boost_round=6000, 78 | valid_sets=dvalid, 79 | early_stopping_rounds=100, 80 | verbose_eval=100) 81 | 82 | 83 | sub += model.predict(X_test, num_iteration=model.best_iteration)/kfold.n_splits 84 | 85 | 86 | sub = pd.DataFrame(sub) 87 | cols = [x for x in range(0, 22)] 88 | cols = label_le.inverse_transform(cols) 89 | sub.columns = cols 90 | sub['DeviceID'] = test['device_id'].values 91 | sub = sub[['DeviceID', '1-0', '1-1', '1-2', '1-3', '1-4', '1-5', '1-6', 92 | '1-7','1-8', '1-9', '1-10', '2-0', '2-1', '2-2', '2-3', '2-4', 93 | '2-5', '2-6', '2-7', '2-8', '2-9', '2-10']] 94 | sub.to_csv('lgb_22.csv', index=False) 95 | 96 | 97 | 98 | 99 | 100 | -------------------------------------------------------------------------------- /chizhu/single_model/user_behavior.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | import pandas as pd 5 | import seaborn as sns 6 | import numpy as np 7 | from tqdm import tqdm 8 | from sklearn.decomposition import LatentDirichletAllocation 9 | from sklearn.model_selection import train_test_split 10 | from sklearn.metrics import accuracy_score 11 | import lightgbm as lgb 12 | from datetime import datetime, timedelta 13 | import matplotlib.pyplot as plt 14 | import time 15 | from sklearn.feature_extraction.text import TfidfTransformer 16 | from sklearn.feature_extraction.text import CountVectorizer 17 | # %matplotlib inline 18 | from config import path 19 | #add 20 | import gc 21 | 22 | packtime = pd.read_table(path+'deviceid_package_start_close.tsv', 23 | names=['device_id', 'app', 'start', 'close'], low_memory=True) 24 | # packtime.head() 25 | packtime['peroid'] = (packtime['close'] - packtime['start'])/1000 26 | packtime['start'] = pd.to_datetime(packtime['start'], unit='ms') 27 | #packtime['closetime'] = pd.to_datetime(packtime['close'], unit='ms') 28 | del packtime['close'] 29 | gc.collect() 30 | 31 | #packtime['day'] = packtime['start'].dt.day 32 | #packtime['month'] = packtime['start'].dt.month 33 | packtime['hour'] = packtime['start'].dt.hour 34 | packtime['date'] = packtime['start'].dt.date 35 | packtime['dayofweek'] = packtime['start'].dt.dayofweek 36 | #packtime['hour'] = pd.cut(packtime['hour'], 
bins=4).cat.codes 37 | 38 | #平均每天使用设备时间 39 | dtime = packtime.groupby(['device_id', 'date'])['peroid'].agg('sum') 40 | #不同时间段占比 41 | qtime = packtime.groupby(['device_id', 'hour'])['peroid'].agg('sum') 42 | wtime = packtime.groupby(['device_id', 'dayofweek'])['peroid'].agg('sum') 43 | atime = packtime.groupby(['device_id', 'app'])['peroid'].agg('sum') 44 | 45 | 46 | dapp = packtime[['device_id', 'date', 'app']].drop_duplicates().groupby( 47 | ['device_id', 'date'])['app'].agg(' '.join) 48 | dapp = dapp.reset_index() 49 | dapp['app_len'] = dapp['app'].apply(lambda x: x.split(' ')).apply(len) 50 | dapp_stat = dapp.groupby('device_id')['app_len'].agg( 51 | {'std': 'std', 'mean': 'mean', 'max': 'max'}) 52 | dapp_stat = dapp_stat.reset_index() 53 | dapp_stat.columns = ['device_id', 'app_len_std', 'app_len_mean', 'app_len_max'] 54 | # dapp_stat.head() 55 | 56 | dtime = dtime.reset_index() 57 | dtime_stat = dtime.groupby(['device_id'])['peroid'].agg( 58 | {'sum': 'sum', 'mean': 'mean', 'std': 'std', 'max': 'max'}).reset_index() 59 | dtime_stat.columns = ['device_id', 'date_sum', 60 | 'date_mean', 'date_std', 'date_max'] 61 | # dtime_stat.head() 62 | 63 | qtime = qtime.reset_index() 64 | ftime = qtime.pivot(index='device_id', columns='hour', 65 | values='peroid').fillna(0) 66 | ftime.columns = ['h%s' % i for i in range(24)] 67 | ftime.reset_index(inplace=True) 68 | # ftime.head() 69 | 70 | wtime = wtime.reset_index() 71 | weektime = wtime.pivot( 72 | index='device_id', columns='dayofweek', values='peroid').fillna(0) 73 | weektime.columns = ['w0', 'w1', 'w2', 'w3', 'w4', 'w5', 'w6'] 74 | weektime.reset_index(inplace=True) 75 | # weektime.head() 76 | 77 | atime = atime.reset_index() 78 | app = atime.groupby(['device_id'])['peroid'].idxmax() 79 | 80 | #dapp_stat.shape, dtime_stat.shape, ftime.shape, weektime.shape, app.shape 81 | 82 | user = pd.merge(dapp_stat, dtime_stat, on='device_id', how='left') 83 | user = pd.merge(user, ftime, on='device_id', how='left') 84 | user = pd.merge(user, weektime, on='device_id', how='left') 85 | user = pd.merge(user, atime.iloc[app], on='device_id', how='left') 86 | 87 | app_cat = pd.read_table(path+'package_label.tsv', 88 | names=['app', 'category', 'app_name']) 89 | 90 | cat_enc = pd.DataFrame(app_cat['category'].value_counts()) 91 | cat_enc['idx'] = range(45) 92 | 93 | app_cat['cat_enc'] = app_cat['category'].map(cat_enc['idx']) 94 | app_cat.set_index(['app'], inplace=True) 95 | 96 | atime['app_cat_enc'] = atime['app'].map(app_cat['cat_enc']).fillna(45) 97 | 98 | cat_num = atime.groupby(['device_id', 'app_cat_enc'])[ 99 | 'app'].agg('count').reset_index() 100 | cat_time = atime.groupby(['device_id', 'app_cat_enc'])[ 101 | 'peroid'].agg('sum').reset_index() 102 | 103 | app_cat_num = cat_num.pivot( 104 | index='device_id', columns='app_cat_enc', values='app').fillna(0) 105 | app_cat_num.columns = ['cat%s' % i for i in range(46)] 106 | app_cat_time = cat_time.pivot( 107 | index='device_id', columns='app_cat_enc', values='peroid').fillna(0) 108 | app_cat_time.columns = ['time%s' % i for i in range(46)] 109 | 110 | user = pd.merge(user, app_cat_num, on='device_id', how='left') 111 | user = pd.merge(user, app_cat_time, on='device_id', how='left') 112 | user.to_csv('data/user_behavior.csv', index=False) 113 | 114 | 115 | -------------------------------------------------------------------------------- /THLUO/3.w2c_all_emb.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | import pandas 
as pd 8 | import seaborn as sns 9 | import numpy as np 10 | from tqdm import tqdm 11 | from sklearn.decomposition import LatentDirichletAllocation 12 | from sklearn.cross_validation import train_test_split 13 | from sklearn.metrics import accuracy_score 14 | import lightgbm as lgb 15 | from datetime import datetime,timedelta 16 | import time 17 | from sklearn.feature_extraction.text import TfidfTransformer 18 | from sklearn.feature_extraction.text import CountVectorizer 19 | from sklearn.preprocessing import LabelEncoder 20 | import gc 21 | 22 | 23 | 24 | # In[2]: 25 | 26 | 27 | path='input/' 28 | data=pd.DataFrame() 29 | #sex_age=pd.read_excel('./data/性别年龄对照表.xlsx') 30 | 31 | 32 | # In[3]: 33 | 34 | 35 | deviceid_packages=pd.read_csv(path+'deviceid_packages.tsv',sep='\t',names=['device_id','apps']) 36 | deviceid_test=pd.read_csv(path+'deviceid_test.tsv',sep='\t',names=['device_id']) 37 | deviceid_train=pd.read_csv(path+'deviceid_train.tsv',sep='\t',names=['device_id','sex','age']) 38 | deviceid_brand = pd.read_csv(path+'deviceid_brand.tsv',sep='\t', names=['device_id','device_brand', 'device_type']) 39 | deviceid_package_start_close = pd.read_csv(path+'deviceid_package_start_close.tsv',sep='\t', names=['device_id','app_id','start_time','close_time']) 40 | package_label = pd.read_csv(path+'package_label.tsv',sep='\t',names=['app_id','app_parent_type', 'app_child_type']) 41 | 42 | 43 | deviceid_brand['device_brand'] = deviceid_brand.device_brand.apply(lambda x : str(x).split(' ')[0]) 44 | 45 | df_temp = deviceid_brand.groupby('device_brand')['device_id'].count().reset_index().rename(columns={'device_id':'brand_counts'}) 46 | one_time_brand = df_temp[df_temp.brand_counts == 1].device_brand.values 47 | deviceid_brand['device_brand'] = deviceid_brand.device_brand.apply(lambda x : 'other' if x in one_time_brand else x) 48 | 49 | df_temp = deviceid_brand.groupby('device_brand')['device_id'].count().reset_index().rename(columns={'device_id':'brand_counts'}) 50 | one_time_brand = df_temp[df_temp.brand_counts == 2].device_brand.values 51 | deviceid_brand['device_brand'] = deviceid_brand.device_brand.apply(lambda x : 'other_2' if x in one_time_brand else x) 52 | 53 | df_temp = deviceid_brand.groupby('device_brand')['device_id'].count().reset_index().rename(columns={'device_id':'brand_counts'}) 54 | one_time_brand = df_temp[df_temp.brand_counts == 3].device_brand.values 55 | deviceid_brand['device_brand'] = deviceid_brand.device_brand.apply(lambda x : 'other_3' if x in one_time_brand else x) 56 | 57 | 58 | #转换成对应的数字 59 | lbl = LabelEncoder() 60 | lbl.fit(list(deviceid_brand.device_brand.values)) 61 | deviceid_brand['device_brand'] = lbl.transform(list(deviceid_brand.device_brand.values)) 62 | 63 | lbl = LabelEncoder() 64 | lbl.fit(list(deviceid_brand.device_type.values)) 65 | deviceid_brand['device_type'] = lbl.transform(list(deviceid_brand.device_type.values)) 66 | 67 | #转换成对应的数字 68 | lbl = LabelEncoder() 69 | lbl.fit(list(package_label.app_parent_type.values)) 70 | package_label['app_parent_type'] = lbl.transform(list(package_label.app_parent_type.values)) 71 | 72 | lbl = LabelEncoder() 73 | lbl.fit(list(package_label.app_child_type.values)) 74 | package_label['app_child_type'] = lbl.transform(list(package_label.app_child_type.values)) 75 | 76 | 77 | # In[4]: 78 | 79 | 80 | deviceid_package_start = deviceid_package_start_close[['device_id', 'app_id', 'start_time']] 81 | deviceid_package_start.columns = ['device_id', 'app_id', 'all_time'] 82 | deviceid_package_close = 
deviceid_package_start_close[['device_id', 'app_id', 'close_time']] 83 | deviceid_package_close.columns = ['device_id', 'app_id', 'all_time'] 84 | deviceid_package_all = pd.concat([deviceid_package_start, deviceid_package_close]) 85 | 86 | 87 | # In[6]: 88 | 89 | 90 | df_sorted = deviceid_package_all.sort_values(by='all_time') 91 | 92 | 93 | # In[8]: 94 | 95 | 96 | df_device_start_app_list = df_sorted.groupby('device_id').apply(lambda x : list(x.app_id)).reset_index().rename(columns = {0 : 'app_list'}) 97 | df_device_start_app_list 98 | 99 | 100 | # In[9]: 101 | 102 | 103 | app_list = list(df_device_start_app_list.app_list.values) 104 | 105 | 106 | # In[10]: 107 | 108 | 109 | from gensim.test.utils import common_texts, get_tmpfile 110 | from gensim.models import Word2Vec 111 | 112 | 113 | # In[11]: 114 | 115 | 116 | word_dim = 200 117 | model = Word2Vec(app_list, size=word_dim, window=20, min_count=2, workers=4) 118 | model.save("word2vec.model") 119 | 120 | 121 | # In[13]: 122 | 123 | 124 | vocab = list(model.wv.vocab.keys()) 125 | 126 | w2c_arr = [] 127 | 128 | for v in vocab : 129 | w2c_arr.append(list(model.wv[v])) 130 | 131 | 132 | # In[14]: 133 | 134 | 135 | df_w2c_start = pd.DataFrame() 136 | df_w2c_start['app_id'] = vocab 137 | df_w2c_start = pd.concat([df_w2c_start, pd.DataFrame(w2c_arr)], axis=1) 138 | df_w2c_start.columns = ['app_id'] + ['w2c_all_app_' + str(i) for i in range(word_dim)] 139 | 140 | 141 | # In[16]: 142 | 143 | 144 | df_w2c_start.to_csv('w2c_all_emb.csv', index=None) 145 | 146 | -------------------------------------------------------------------------------- /chizhu/stacking/nurbs_feat/xgb_22.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[2]: 5 | 6 | 7 | import pandas as pd 8 | import seaborn as sns 9 | import numpy as np 10 | from tqdm import tqdm 11 | from sklearn.decomposition import LatentDirichletAllocation 12 | from sklearn.model_selection import train_test_split 13 | from sklearn.metrics import accuracy_score 14 | import lightgbm as lgb 15 | from datetime import datetime,timedelta 16 | import matplotlib.pyplot as plt 17 | import time 18 | from sklearn.feature_extraction.text import TfidfTransformer 19 | from sklearn.feature_extraction.text import CountVectorizer 20 | # get_ipython().run_line_magic('matplotlib', 'inline') 21 | 22 | #add 23 | import gc 24 | from sklearn import preprocessing 25 | from sklearn.feature_extraction.text import TfidfVectorizer 26 | 27 | from scipy.sparse import hstack, vstack 28 | from sklearn.model_selection import StratifiedKFold 29 | from sklearn.model_selection import cross_val_score 30 | # from skopt.space import Integer, Categorical, Real, Log10 31 | # from skopt.utils import use_named_args 32 | # from skopt import gp_minimize 33 | from gensim.models import Word2Vec, FastText 34 | import gensim 35 | import re 36 | import os 37 | path="./feature/"###nurbs概率文件路径 38 | o_path="/dev/shm/chizhu_data/data/"###原始文件路径 39 | os.listdir(path) 40 | 41 | 42 | # In[4]: 43 | 44 | 45 | 46 | all_feat=pd.read_csv(path+"feature_22_all.csv") 47 | train_id=pd.read_csv(o_path+"deviceid_train.tsv",sep="\t",names=['device_id','sex','age']) 48 | test_id=pd.read_csv(o_path+"deviceid_test.tsv",sep="\t",names=['device_id']) 49 | all_id=pd.concat([train_id[['device_id']],test_id[['device_id']]]) 50 | all_id.index=range(len(all_id)) 51 | all_feat['device_id']=all_id 52 | # deepnn_feat=pd.read_csv(path+"deepnn_fix.csv") 53 | # deepnn_feat['device_id']=deepnn_feat['DeviceID'] 54 | # 
del deepnn_feat['DeviceID'] 55 | 56 | 57 | # In[9]: 58 | 59 | 60 | train=pd.merge(train_id,all_feat,on="device_id",how="left") 61 | # train=pd.merge(train,deepnn_feat,on="device_id",how="left") 62 | test=pd.merge(test_id,all_feat,on="device_id",how="left") 63 | # test=pd.merge(test,deepnn_feat,on="device_id",how="left") 64 | 65 | 66 | # In[10]: 67 | 68 | 69 | train['sex-age']=train.apply(lambda x:str(x['sex'])+"-"+str(x['age']),1) 70 | 71 | 72 | # In[11]: 73 | 74 | 75 | features = [x for x in train.columns if x not in ['device_id',"sex",'age','sex-age']] 76 | label="sex-age" 77 | 78 | 79 | # In[12]: 80 | 81 | 82 | Y_CAT=pd.Categorical(train[label]) 83 | 84 | 85 | # In[13]: 86 | 87 | 88 | import lightgbm as lgb 89 | import xgboost as xgb 90 | from sklearn.metrics import auc, log_loss, roc_auc_score,f1_score,recall_score,precision_score 91 | from sklearn.cross_validation import StratifiedKFold 92 | 93 | kf = StratifiedKFold(Y_CAT, n_folds=5, shuffle=True, random_state=1024) 94 | params={ 95 | 'booster':'gbtree', 96 | "tree_method":"gpu_hist", 97 | "gpu_id":"1", 98 | 'objective': 'multi:softprob', 99 | # 'is_unbalance':'True', 100 | # 'scale_pos_weight': 1500.0/13458.0, 101 | 'eval_metric': "mlogloss", 102 | 'num_class':22, 103 | 'gamma':0.1,#0.2 is ok 104 | 'max_depth':6, 105 | # 'lambda':20, 106 | # "alpha":5, 107 | 'subsample':0.7, 108 | 'colsample_bytree':0.4 , 109 | # 'min_child_weight':2.5, 110 | 'eta': 0.01, 111 | # 'learning_rate':0.01, 112 | "silent":1, 113 | 'seed':1024, 114 | 'nthread':12, 115 | 116 | } 117 | num_round = 3500 118 | early_stopping_rounds = 100 119 | 120 | 121 | # In[14]: 122 | 123 | 124 | aus = [] 125 | sub2 = np.zeros((len(test),22 )) 126 | pred_oob2=np.zeros((len(train),22)) 127 | models=[] 128 | iters=[] 129 | for i,(train_index,test_index) in enumerate(kf): 130 | 131 | tr_x = train[features].reindex(index=train_index, copy=False) 132 | tr_y = Y_CAT.codes[train_index] 133 | te_x = train[features].reindex(index=test_index, copy=False) 134 | te_y = Y_CAT.codes[test_index] 135 | 136 | # tr_y=tr_y.apply(lambda x:1 if x>0 else 0) 137 | # te_y=te_y.apply(lambda x:1 if x>0 else 0) 138 | d_tr = xgb.DMatrix(tr_x, label=tr_y) 139 | d_te = xgb.DMatrix(te_x, label=te_y) 140 | watchlist = [(d_tr,'train'), 141 | (d_te,'val') 142 | ] 143 | model = xgb.train(params, d_tr, num_boost_round=5500, 144 | evals=watchlist,verbose_eval=200, 145 | early_stopping_rounds=100) 146 | models.append(model) 147 | iters.append(model.best_iteration) 148 | pred = model.predict(d_te,ntree_limit=model.best_iteration) 149 | pred_oob2[test_index] =pred 150 | # te_y=te_y.apply(lambda x:1 if x>0 else 0) 151 | a = log_loss(te_y, pred) 152 | 153 | sub2 += model.predict(xgb.DMatrix(test[features]),ntree_limit=model.best_iteration)/5 154 | 155 | 156 | print ("idx: ", i) 157 | print (" loss: %.5f" % a) 158 | # print " gini: %.5f" % g 159 | aus.append(a) 160 | 161 | print ("mean") 162 | print ("loss: %s" % (sum(aus) / 5.0)) 163 | 164 | 165 | # In[15]: 166 | 167 | 168 | res=np.vstack((pred_oob2,sub2)) 169 | res = pd.DataFrame(res,columns=Y_CAT.categories) 170 | res['DeviceID']=all_id 171 | res=res[['DeviceID', '1-0', '1-1', '1-2', '1-3', '1-4', '1-5', '1-6', '1-7','1-8', '1-9', '1-10', '2-0', '2-1', '2-2', '2-3', '2-4', '2-5', '2-6', '2-7', '2-8', '2-9', '2-10']] 172 | 173 | res.to_csv("xgb_nurbs_22_feat.csv",index=False) 174 | 175 | 176 | # In[16]: 177 | 178 | 179 | test['DeviceID']=test['device_id'] 180 | sub=pd.merge(test[['DeviceID']],res,on="DeviceID",how="left") 181 | 
sub.to_csv("xgb_nurbs_22.csv",index=False) 182 | 183 | -------------------------------------------------------------------------------- /THLUO/1.w2c_model_start.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | import pandas as pd 8 | import seaborn as sns 9 | import numpy as np 10 | from tqdm import tqdm 11 | from sklearn.decomposition import LatentDirichletAllocation 12 | from sklearn.cross_validation import train_test_split 13 | from sklearn.metrics import accuracy_score 14 | import lightgbm as lgb 15 | from datetime import datetime,timedelta 16 | import time 17 | from sklearn.feature_extraction.text import TfidfTransformer 18 | from sklearn.feature_extraction.text import CountVectorizer 19 | from sklearn.preprocessing import LabelEncoder 20 | import gc 21 | from gensim.test.utils import common_texts, get_tmpfile 22 | from gensim.models import Word2Vec 23 | 24 | 25 | # In[2]: 26 | 27 | 28 | path='input/' 29 | data=pd.DataFrame() 30 | print ('1.w2c_model_start.py') 31 | 32 | # In[3]: 33 | 34 | 35 | deviceid_packages=pd.read_csv(path+'deviceid_packages.tsv',sep='\t',names=['device_id','apps']) 36 | deviceid_test=pd.read_csv(path+'deviceid_test.tsv',sep='\t',names=['device_id']) 37 | deviceid_train=pd.read_csv(path+'deviceid_train.tsv',sep='\t',names=['device_id','sex','age']) 38 | deviceid_brand = pd.read_csv(path+'deviceid_brand.tsv',sep='\t', names=['device_id','device_brand', 'device_type']) 39 | deviceid_package_start_close = pd.read_csv(path+'deviceid_package_start_close.tsv',sep='\t', names=['device_id','app_id','start_time','close_time']) 40 | package_label = pd.read_csv(path+'package_label.tsv',sep='\t',names=['app_id','app_parent_type', 'app_child_type']) 41 | 42 | 43 | deviceid_brand['device_brand'] = deviceid_brand.device_brand.apply(lambda x : str(x).split(' ')[0]) 44 | 45 | df_temp = deviceid_brand.groupby('device_brand')['device_id'].count().reset_index().rename(columns={'device_id':'brand_counts'}) 46 | one_time_brand = df_temp[df_temp.brand_counts == 1].device_brand.values 47 | deviceid_brand['device_brand'] = deviceid_brand.device_brand.apply(lambda x : 'other' if x in one_time_brand else x) 48 | 49 | df_temp = deviceid_brand.groupby('device_brand')['device_id'].count().reset_index().rename(columns={'device_id':'brand_counts'}) 50 | one_time_brand = df_temp[df_temp.brand_counts == 2].device_brand.values 51 | deviceid_brand['device_brand'] = deviceid_brand.device_brand.apply(lambda x : 'other_2' if x in one_time_brand else x) 52 | 53 | df_temp = deviceid_brand.groupby('device_brand')['device_id'].count().reset_index().rename(columns={'device_id':'brand_counts'}) 54 | one_time_brand = df_temp[df_temp.brand_counts == 3].device_brand.values 55 | deviceid_brand['device_brand'] = deviceid_brand.device_brand.apply(lambda x : 'other_3' if x in one_time_brand else x) 56 | 57 | 58 | #转换成对应的数字 59 | lbl = LabelEncoder() 60 | lbl.fit(list(deviceid_brand.device_brand.values)) 61 | deviceid_brand['device_brand'] = lbl.transform(list(deviceid_brand.device_brand.values)) 62 | 63 | lbl = LabelEncoder() 64 | lbl.fit(list(deviceid_brand.device_type.values)) 65 | deviceid_brand['device_type'] = lbl.transform(list(deviceid_brand.device_type.values)) 66 | 67 | #转换成对应的数字 68 | lbl = LabelEncoder() 69 | lbl.fit(list(package_label.app_parent_type.values)) 70 | package_label['app_parent_type'] = lbl.transform(list(package_label.app_parent_type.values)) 71 | 72 | lbl = LabelEncoder() 73 | 
lbl.fit(list(package_label.app_child_type.values)) 74 | package_label['app_child_type'] = lbl.transform(list(package_label.app_child_type.values)) 75 | 76 | 77 | # In[4]: 78 | 79 | 80 | df_sorted = deviceid_package_start_close.sort_values(by='start_time') 81 | 82 | 83 | # In[20]: 84 | 85 | 86 | df_results = df_sorted.groupby('device_id')['app_id'].apply(lambda x:' '.join(x)).reset_index().rename(columns = {'app_id' : 'app_list'}) 87 | df_results.to_csv('01.device_click_app_sorted_by_start.csv', index=None) 88 | del df_results 89 | 90 | 91 | # In[5]: 92 | 93 | 94 | df_device_start_app_list = df_sorted.groupby('device_id').apply(lambda x : list(x.app_id)).reset_index().rename(columns = {0 : 'app_list'}) 95 | 96 | 97 | # In[7]: 98 | 99 | 100 | app_list = list(df_device_start_app_list.app_list.values) 101 | 102 | 103 | # In[9]: 104 | 105 | 106 | model = Word2Vec(app_list, size=10, window=10, min_count=2, workers=4) 107 | model.save("word2vec.model") 108 | 109 | 110 | # In[10]: 111 | 112 | 113 | vocab = list(model.wv.vocab.keys()) 114 | 115 | w2c_arr = [] 116 | 117 | for v in vocab : 118 | w2c_arr.append(list(model.wv[v])) 119 | 120 | 121 | # In[11]: 122 | 123 | 124 | df_w2c_start = pd.DataFrame() 125 | df_w2c_start['app_id'] = vocab 126 | df_w2c_start = pd.concat([df_w2c_start, pd.DataFrame(w2c_arr)], axis=1) 127 | df_w2c_start.columns = ['app_id'] + ['w2c_start_app_' + str(i) for i in range(10)] 128 | 129 | 130 | # In[13]: 131 | 132 | 133 | w2c_nums = 10 134 | agg = {} 135 | for l in ['w2c_start_app_' + str(i) for i in range(w2c_nums)] : 136 | agg[l] = ['mean', 'std', 'max', 'min'] 137 | 138 | 139 | # In[14]: 140 | 141 | 142 | deviceid_package_start_close = deviceid_package_start_close.merge(df_w2c_start, on='app_id', how='left') 143 | 144 | 145 | # In[15]: 146 | 147 | 148 | df_agg = deviceid_package_start_close.groupby('device_id').agg(agg) 149 | df_agg.columns = pd.Index(['device_' + e[0] + "_" + e[1].upper() for e in df_agg.columns.tolist()]) 150 | df_agg = df_agg.reset_index() 151 | df_agg.to_csv('device_start_app_w2c.csv', index=None) 152 | 153 | 154 | # In[16]: 155 | 156 | 157 | df_results = deviceid_package_start_close.groupby(['device_id', 'app_id'])['start_time'].mean().reset_index() 158 | df_results = df_results.merge(df_w2c_start, on='app_id', how='left') 159 | 160 | 161 | # In[18]: 162 | 163 | 164 | df_agg = df_results.groupby('device_id').agg(agg) 165 | df_agg.columns = pd.Index(['device_app_unique_' + e[0] + "_" + e[1].upper() for e in df_agg.columns.tolist()]) 166 | df_agg = df_agg.reset_index() 167 | 168 | 169 | # In[24]: 170 | 171 | 172 | df_agg.to_csv('device_app_unique_start_app_w2c.csv', index=None) 173 | print ('success.....') 174 | -------------------------------------------------------------------------------- /THLUO/2.w2c_model_close.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | import pandas as pd 8 | import seaborn as sns 9 | import numpy as np 10 | from tqdm import tqdm 11 | from sklearn.decomposition import LatentDirichletAllocation 12 | from sklearn.cross_validation import train_test_split 13 | from sklearn.metrics import accuracy_score 14 | import lightgbm as lgb 15 | from datetime import datetime,timedelta 16 | import time 17 | from sklearn.feature_extraction.text import TfidfTransformer 18 | from sklearn.feature_extraction.text import CountVectorizer 19 | from sklearn.preprocessing import LabelEncoder 20 | import gc 21 | 22 | 23 | 24 | 25 | # In[2]: 26 | print 
('2.w2c_model_close.py') 27 | 28 | path='input/' 29 | data=pd.DataFrame() 30 | #sex_age=pd.read_excel('./data/性别年龄对照表.xlsx') 31 | 32 | 33 | # In[3]: 34 | 35 | 36 | deviceid_packages=pd.read_csv(path+'deviceid_packages.tsv',sep='\t',names=['device_id','apps']) 37 | deviceid_test=pd.read_csv(path+'deviceid_test.tsv',sep='\t',names=['device_id']) 38 | deviceid_train=pd.read_csv(path+'deviceid_train.tsv',sep='\t',names=['device_id','sex','age']) 39 | deviceid_brand = pd.read_csv(path+'deviceid_brand.tsv',sep='\t', names=['device_id','device_brand', 'device_type']) 40 | deviceid_package_start_close = pd.read_csv(path+'deviceid_package_start_close.tsv',sep='\t', names=['device_id','app_id','start_time','close_time']) 41 | package_label = pd.read_csv(path+'package_label.tsv',sep='\t',names=['app_id','app_parent_type', 'app_child_type']) 42 | 43 | 44 | deviceid_brand['device_brand'] = deviceid_brand.device_brand.apply(lambda x : str(x).split(' ')[0]) 45 | 46 | df_temp = deviceid_brand.groupby('device_brand')['device_id'].count().reset_index().rename(columns={'device_id':'brand_counts'}) 47 | one_time_brand = df_temp[df_temp.brand_counts == 1].device_brand.values 48 | deviceid_brand['device_brand'] = deviceid_brand.device_brand.apply(lambda x : 'other' if x in one_time_brand else x) 49 | 50 | df_temp = deviceid_brand.groupby('device_brand')['device_id'].count().reset_index().rename(columns={'device_id':'brand_counts'}) 51 | one_time_brand = df_temp[df_temp.brand_counts == 2].device_brand.values 52 | deviceid_brand['device_brand'] = deviceid_brand.device_brand.apply(lambda x : 'other_2' if x in one_time_brand else x) 53 | 54 | df_temp = deviceid_brand.groupby('device_brand')['device_id'].count().reset_index().rename(columns={'device_id':'brand_counts'}) 55 | one_time_brand = df_temp[df_temp.brand_counts == 3].device_brand.values 56 | deviceid_brand['device_brand'] = deviceid_brand.device_brand.apply(lambda x : 'other_3' if x in one_time_brand else x) 57 | 58 | 59 | #转换成对应的数字 60 | lbl = LabelEncoder() 61 | lbl.fit(list(deviceid_brand.device_brand.values)) 62 | deviceid_brand['device_brand'] = lbl.transform(list(deviceid_brand.device_brand.values)) 63 | 64 | lbl = LabelEncoder() 65 | lbl.fit(list(deviceid_brand.device_type.values)) 66 | deviceid_brand['device_type'] = lbl.transform(list(deviceid_brand.device_type.values)) 67 | 68 | #转换成对应的数字 69 | lbl = LabelEncoder() 70 | lbl.fit(list(package_label.app_parent_type.values)) 71 | package_label['app_parent_type'] = lbl.transform(list(package_label.app_parent_type.values)) 72 | 73 | lbl = LabelEncoder() 74 | lbl.fit(list(package_label.app_child_type.values)) 75 | package_label['app_child_type'] = lbl.transform(list(package_label.app_child_type.values)) 76 | 77 | 78 | # In[4]: 79 | 80 | 81 | df_sorted = deviceid_package_start_close.sort_values(by='close_time') 82 | 83 | 84 | # In[6]: 85 | 86 | 87 | df_results = df_sorted.groupby('device_id')['app_id'].apply(lambda x:' '.join(x)).reset_index().rename(columns = {'app_id' : 'app_list'}) 88 | 89 | 90 | # In[7]: 91 | 92 | 93 | df_results.to_csv('02.device_click_app_sorted_by_close.csv', index=None) 94 | 95 | 96 | # In[6]: 97 | 98 | 99 | df_device_start_app_list = df_sorted.groupby('device_id').apply(lambda x : list(x.app_id)).reset_index().rename(columns = {0 : 'app_list'}) 100 | 101 | 102 | # In[7]: 103 | 104 | 105 | app_list = list(df_device_start_app_list.app_list.values) 106 | 107 | 108 | # In[8]: 109 | 110 | 111 | from gensim.test.utils import common_texts, get_tmpfile 112 | from gensim.models import 
Word2Vec 113 | 114 | 115 | # In[9]: 116 | 117 | 118 | model = Word2Vec(app_list, size=10, window=10, min_count=2, workers=4) 119 | model.save("word2vec.model") 120 | 121 | 122 | # In[11]: 123 | 124 | 125 | vocab = list(model.wv.vocab.keys()) 126 | 127 | w2c_arr = [] 128 | 129 | for v in vocab : 130 | w2c_arr.append(list(model.wv[v])) 131 | 132 | 133 | # In[12]: 134 | 135 | 136 | df_w2c_start = pd.DataFrame() 137 | df_w2c_start['app_id'] = vocab 138 | df_w2c_start = pd.concat([df_w2c_start, pd.DataFrame(w2c_arr)], axis=1) 139 | df_w2c_start.columns = ['app_id'] + ['w2c_close_app_' + str(i) for i in range(10)] 140 | 141 | 142 | # In[ ]: 143 | 144 | 145 | w2c_nums = 10 146 | agg = {} 147 | for l in ['w2c_close_app_' + str(i) for i in range(w2c_nums)] : 148 | agg[l] = ['mean', 'std', 'max', 'min'] 149 | 150 | 151 | # In[14]: 152 | 153 | 154 | deviceid_package_start_close = deviceid_package_start_close.merge(df_w2c_start, on='app_id', how='left') 155 | 156 | 157 | # In[ ]: 158 | 159 | 160 | df_agg = deviceid_package_start_close.groupby('device_id').agg(agg) 161 | df_agg.columns = pd.Index(['device_' + e[0] + "_" + e[1].upper() for e in df_agg.columns.tolist()]) 162 | df_agg = df_agg.reset_index() 163 | df_agg.to_csv('device_close_app_w2c.csv', index=None) 164 | 165 | 166 | # In[14]: 167 | 168 | 169 | df_results = deviceid_package_start_close.groupby(['device_id', 'app_id'])['start_time'].mean().reset_index() 170 | df_results = df_results.merge(df_w2c_start, on='app_id', how='left') 171 | 172 | 173 | # In[17]: 174 | 175 | 176 | df_agg = df_results.groupby('device_id').agg(agg) 177 | df_agg.columns = pd.Index(['device_app_unique_' + e[0] + "_" + e[1].upper() for e in df_agg.columns.tolist()]) 178 | df_agg = df_agg.reset_index() 179 | 180 | 181 | # In[18]: 182 | 183 | 184 | df_agg.to_csv('device_app_unique_close_app_w2c.csv', index=None) 185 | 186 | -------------------------------------------------------------------------------- /THLUO/3.w2c_model_all.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | import pandas as pd 8 | import seaborn as sns 9 | import numpy as np 10 | from tqdm import tqdm 11 | from sklearn.decomposition import LatentDirichletAllocation 12 | from sklearn.cross_validation import train_test_split 13 | from sklearn.metrics import accuracy_score 14 | import lightgbm as lgb 15 | from datetime import datetime,timedelta 16 | import time 17 | from sklearn.feature_extraction.text import TfidfTransformer 18 | from sklearn.feature_extraction.text import CountVectorizer 19 | from sklearn.preprocessing import LabelEncoder 20 | from gensim.test.utils import common_texts, get_tmpfile 21 | from gensim.models import Word2Vec 22 | import gc 23 | 24 | 25 | 26 | # In[2]: 27 | print ('3.w2c_model_all.py') 28 | 29 | path='input/' 30 | data=pd.DataFrame() 31 | #sex_age=pd.read_excel('./data/性别年龄对照表.xlsx') 32 | 33 | 34 | # In[3]: 35 | 36 | 37 | deviceid_packages=pd.read_csv(path+'deviceid_packages.tsv',sep='\t',names=['device_id','apps']) 38 | deviceid_test=pd.read_csv(path+'deviceid_test.tsv',sep='\t',names=['device_id']) 39 | deviceid_train=pd.read_csv(path+'deviceid_train.tsv',sep='\t',names=['device_id','sex','age']) 40 | deviceid_brand = pd.read_csv(path+'deviceid_brand.tsv',sep='\t', names=['device_id','device_brand', 'device_type']) 41 | deviceid_package_start_close = pd.read_csv(path+'deviceid_package_start_close.tsv',sep='\t', names=['device_id','app_id','start_time','close_time']) 42 | package_label = 
pd.read_csv(path+'package_label.tsv',sep='\t',names=['app_id','app_parent_type', 'app_child_type']) 43 | 44 | 45 | deviceid_brand['device_brand'] = deviceid_brand.device_brand.apply(lambda x : str(x).split(' ')[0]) 46 | 47 | df_temp = deviceid_brand.groupby('device_brand')['device_id'].count().reset_index().rename(columns={'device_id':'brand_counts'}) 48 | one_time_brand = df_temp[df_temp.brand_counts == 1].device_brand.values 49 | deviceid_brand['device_brand'] = deviceid_brand.device_brand.apply(lambda x : 'other' if x in one_time_brand else x) 50 | 51 | df_temp = deviceid_brand.groupby('device_brand')['device_id'].count().reset_index().rename(columns={'device_id':'brand_counts'}) 52 | one_time_brand = df_temp[df_temp.brand_counts == 2].device_brand.values 53 | deviceid_brand['device_brand'] = deviceid_brand.device_brand.apply(lambda x : 'other_2' if x in one_time_brand else x) 54 | 55 | df_temp = deviceid_brand.groupby('device_brand')['device_id'].count().reset_index().rename(columns={'device_id':'brand_counts'}) 56 | one_time_brand = df_temp[df_temp.brand_counts == 3].device_brand.values 57 | deviceid_brand['device_brand'] = deviceid_brand.device_brand.apply(lambda x : 'other_3' if x in one_time_brand else x) 58 | 59 | 60 | #转换成对应的数字 61 | lbl = LabelEncoder() 62 | lbl.fit(list(deviceid_brand.device_brand.values)) 63 | deviceid_brand['device_brand'] = lbl.transform(list(deviceid_brand.device_brand.values)) 64 | 65 | lbl = LabelEncoder() 66 | lbl.fit(list(deviceid_brand.device_type.values)) 67 | deviceid_brand['device_type'] = lbl.transform(list(deviceid_brand.device_type.values)) 68 | 69 | #转换成对应的数字 70 | lbl = LabelEncoder() 71 | lbl.fit(list(package_label.app_parent_type.values)) 72 | package_label['app_parent_type'] = lbl.transform(list(package_label.app_parent_type.values)) 73 | 74 | lbl = LabelEncoder() 75 | lbl.fit(list(package_label.app_child_type.values)) 76 | package_label['app_child_type'] = lbl.transform(list(package_label.app_child_type.values)) 77 | 78 | 79 | # In[4]: 80 | 81 | 82 | deviceid_package_start = deviceid_package_start_close[['device_id', 'app_id', 'start_time']] 83 | deviceid_package_start.columns = ['device_id', 'app_id', 'all_time'] 84 | deviceid_package_close = deviceid_package_start_close[['device_id', 'app_id', 'close_time']] 85 | deviceid_package_close.columns = ['device_id', 'app_id', 'all_time'] 86 | deviceid_package_all = pd.concat([deviceid_package_start, deviceid_package_close]) 87 | 88 | 89 | # In[5]: 90 | 91 | 92 | df_sorted = deviceid_package_all.sort_values(by='all_time') 93 | 94 | 95 | # In[7]: 96 | 97 | 98 | df_results = df_sorted.groupby('device_id')['app_id'].apply(lambda x:' '.join(x)).reset_index().rename(columns = {'app_id' : 'app_list'}) 99 | df_results.to_csv('03.device_click_app_sorted_by_all.csv', index=None) 100 | del df_results 101 | 102 | 103 | # In[8]: 104 | 105 | 106 | df_device_start_app_list = df_sorted.groupby('device_id').apply(lambda x : list(x.app_id)).reset_index().rename(columns = {0 : 'app_list'}) 107 | 108 | 109 | # In[9]: 110 | 111 | 112 | app_list = list(df_device_start_app_list.app_list.values) 113 | 114 | 115 | # In[11]: 116 | 117 | 118 | model = Word2Vec(app_list, size=10, window=50, min_count=2, workers=4) 119 | model.save("word2vec.model") 120 | 121 | 122 | # In[12]: 123 | 124 | 125 | vocab = list(model.wv.vocab.keys()) 126 | 127 | w2c_arr = [] 128 | 129 | for v in vocab : 130 | w2c_arr.append(list(model.wv[v])) 131 | 132 | 133 | # In[13]: 134 | 135 | 136 | df_w2c_start = pd.DataFrame() 137 | df_w2c_start['app_id'] 
= vocab 138 | df_w2c_start = pd.concat([df_w2c_start, pd.DataFrame(w2c_arr)], axis=1) 139 | df_w2c_start.columns = ['app_id'] + ['w2c_all_app_' + str(i) for i in range(10)] 140 | 141 | 142 | # In[14]: 143 | 144 | 145 | w2c_nums = 10 146 | agg = {} 147 | for l in ['w2c_all_app_' + str(i) for i in range(w2c_nums)] : 148 | agg[l] = ['mean', 'std', 'max', 'min'] 149 | 150 | 151 | # In[15]: 152 | 153 | 154 | deviceid_package_start_close = deviceid_package_start_close.merge(df_w2c_start, on='app_id', how='left') 155 | 156 | 157 | # In[16]: 158 | 159 | 160 | df_agg = deviceid_package_start_close.groupby('device_id').agg(agg) 161 | df_agg.columns = pd.Index(['device_' + e[0] + "_" + e[1].upper() for e in df_agg.columns.tolist()]) 162 | df_agg = df_agg.reset_index() 163 | df_agg.to_csv('device_all_app_w2c.csv', index=None) 164 | 165 | 166 | # In[18]: 167 | 168 | 169 | df_results = deviceid_package_start_close.groupby(['device_id', 'app_id'])['start_time'].mean().reset_index() 170 | df_results = df_results.merge(df_w2c_start, on='app_id', how='left') 171 | 172 | 173 | # In[22]: 174 | 175 | 176 | df_agg = df_results.groupby('device_id').agg(agg) 177 | df_agg.columns = pd.Index(['device_app_unique' + e[0] + "_" + e[1].upper() for e in df_agg.columns.tolist()]) 178 | df_agg = df_agg.reset_index() 179 | 180 | 181 | # In[20]: 182 | 183 | 184 | df_agg.to_csv('device_app_unique_all_app_w2c.csv', index=None) 185 | 186 | -------------------------------------------------------------------------------- /THLUO/3.device_quchong_start_app_w2c.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | import pandas as pd 8 | import seaborn as sns 9 | import numpy as np 10 | from tqdm import tqdm 11 | from sklearn.decomposition import LatentDirichletAllocation 12 | from sklearn.cross_validation import train_test_split 13 | from sklearn.metrics import accuracy_score 14 | import lightgbm as lgb 15 | from datetime import datetime,timedelta 16 | import time 17 | from sklearn.feature_extraction.text import TfidfTransformer 18 | from sklearn.feature_extraction.text import CountVectorizer 19 | from sklearn.preprocessing import LabelEncoder 20 | import gc 21 | from gensim.test.utils import common_texts, get_tmpfile 22 | from gensim.models import Word2Vec 23 | 24 | 25 | # In[2]: 26 | 27 | print ('8.device_quchong_start_app_w2c.py') 28 | path='input/' 29 | data=pd.DataFrame() 30 | #sex_age=pd.read_excel('./data/性别年龄对照表.xlsx') 31 | 32 | 33 | # In[3]: 34 | 35 | 36 | deviceid_packages=pd.read_csv(path+'deviceid_packages.tsv',sep='\t',names=['device_id','apps']) 37 | deviceid_test=pd.read_csv(path+'deviceid_test.tsv',sep='\t',names=['device_id']) 38 | deviceid_train=pd.read_csv(path+'deviceid_train.tsv',sep='\t',names=['device_id','sex','age']) 39 | deviceid_brand = pd.read_csv(path+'deviceid_brand.tsv',sep='\t', names=['device_id','device_brand', 'device_type']) 40 | deviceid_package_start_close = pd.read_csv(path+'deviceid_package_start_close.tsv',sep='\t', names=['device_id','app_id','start_time','close_time']) 41 | package_label = pd.read_csv(path+'package_label.tsv',sep='\t',names=['app_id','app_parent_type', 'app_child_type']) 42 | 43 | 44 | deviceid_brand['device_brand'] = deviceid_brand.device_brand.apply(lambda x : str(x).split(' ')[0]) 45 | 46 | df_temp = deviceid_brand.groupby('device_brand')['device_id'].count().reset_index().rename(columns={'device_id':'brand_counts'}) 47 | one_time_brand = df_temp[df_temp.brand_counts == 
1].device_brand.values 48 | deviceid_brand['device_brand'] = deviceid_brand.device_brand.apply(lambda x : 'other' if x in one_time_brand else x) 49 | 50 | df_temp = deviceid_brand.groupby('device_brand')['device_id'].count().reset_index().rename(columns={'device_id':'brand_counts'}) 51 | one_time_brand = df_temp[df_temp.brand_counts == 2].device_brand.values 52 | deviceid_brand['device_brand'] = deviceid_brand.device_brand.apply(lambda x : 'other_2' if x in one_time_brand else x) 53 | 54 | df_temp = deviceid_brand.groupby('device_brand')['device_id'].count().reset_index().rename(columns={'device_id':'brand_counts'}) 55 | one_time_brand = df_temp[df_temp.brand_counts == 3].device_brand.values 56 | deviceid_brand['device_brand'] = deviceid_brand.device_brand.apply(lambda x : 'other_3' if x in one_time_brand else x) 57 | 58 | 59 | #转换成对应的数字 60 | lbl = LabelEncoder() 61 | lbl.fit(list(deviceid_brand.device_brand.values)) 62 | deviceid_brand['device_brand'] = lbl.transform(list(deviceid_brand.device_brand.values)) 63 | 64 | lbl = LabelEncoder() 65 | lbl.fit(list(deviceid_brand.device_type.values)) 66 | deviceid_brand['device_type'] = lbl.transform(list(deviceid_brand.device_type.values)) 67 | 68 | #转换成对应的数字 69 | lbl = LabelEncoder() 70 | lbl.fit(list(package_label.app_parent_type.values)) 71 | package_label['app_parent_type'] = lbl.transform(list(package_label.app_parent_type.values)) 72 | 73 | lbl = LabelEncoder() 74 | lbl.fit(list(package_label.app_child_type.values)) 75 | package_label['app_child_type'] = lbl.transform(list(package_label.app_child_type.values)) 76 | 77 | 78 | # In[4]: 79 | 80 | 81 | import time 82 | 83 | # 输入毫秒级的时间,转出正常格式的时间 84 | def timeStamp(timeNum): 85 | timeStamp = float(timeNum/1000) 86 | timeArray = time.localtime(timeStamp) 87 | otherStyleTime = time.strftime("%Y-%m-%d %H:%M:%S", timeArray) 88 | return otherStyleTime 89 | 90 | #解析出具体的时间 91 | deviceid_package_start_close['start_date'] = pd.to_datetime(deviceid_package_start_close.start_time.apply(timeStamp)) 92 | deviceid_package_start_close['end_date'] = pd.to_datetime(deviceid_package_start_close.close_time.apply(timeStamp)) 93 | deviceid_package_start_close['start_hour'] = deviceid_package_start_close.start_date.dt.hour 94 | deviceid_package_start_close['end_hour'] = deviceid_package_start_close.end_date.dt.hour 95 | deviceid_package_start_close['time_gap'] = (deviceid_package_start_close['end_date'] - deviceid_package_start_close['start_date']).astype('timedelta64[s]') 96 | 97 | deviceid_package_start_close = deviceid_package_start_close.merge(package_label, on='app_id', how='left') 98 | deviceid_package_start_close.app_parent_type.fillna(-1, inplace=True) 99 | deviceid_package_start_close.app_child_type.fillna(-1, inplace=True) 100 | deviceid_package_start_close['start_year'] = deviceid_package_start_close.start_date.dt.year 101 | deviceid_package_start_close['end_year'] = deviceid_package_start_close.end_date.dt.year 102 | deviceid_package_start_close['year_gap'] = deviceid_package_start_close['end_year'] - deviceid_package_start_close['start_year'] 103 | 104 | 105 | # In[9]: 106 | 107 | 108 | df_temp = deviceid_package_start_close.groupby(['device_id', 'app_id'])['start_hour'].mean().reset_index() 109 | df_temp 110 | 111 | 112 | # In[10]: 113 | 114 | 115 | df_sorted = df_temp.sort_values(by='start_hour') 116 | 117 | 118 | # In[13]: 119 | 120 | 121 | df_device_start_app_list = df_sorted.groupby('device_id').apply(lambda x : list(x.app_id)).reset_index().rename(columns = {0 : 'app_list'}) 122 | 123 | 124 | # 
In[17]: 125 | 126 | 127 | app_list = list(df_device_start_app_list.app_list.values) 128 | 129 | 130 | # In[35]: 131 | 132 | 133 | model = Word2Vec(app_list, size=10, window=4, min_count=2, workers=4) 134 | model.save("word2vec.model") 135 | 136 | 137 | # In[37]: 138 | 139 | 140 | vocab = list(model.wv.vocab.keys()) 141 | 142 | w2c_arr = [] 143 | 144 | for v in vocab : 145 | w2c_arr.append(list(model.wv[v])) 146 | 147 | 148 | # In[38]: 149 | 150 | 151 | df_w2c_start = pd.DataFrame() 152 | df_w2c_start['app_id'] = vocab 153 | df_w2c_start = pd.concat([df_w2c_start, pd.DataFrame(w2c_arr)], axis=1) 154 | df_w2c_start.columns = ['app_id'] + ['w2c_start_app_' + str(i) for i in range(10)] 155 | 156 | 157 | # In[47]: 158 | 159 | 160 | df_sorted = df_sorted.merge(df_w2c_start, on='app_id', how='left') 161 | df_sorted 162 | 163 | 164 | # In[48]: 165 | 166 | 167 | w2c_nums = 10 168 | agg = {} 169 | for l in ['w2c_start_app_' + str(i) for i in range(w2c_nums)] : 170 | agg[l] = ['mean', 'std', 'max', 'min'] 171 | 172 | 173 | # In[50]: 174 | 175 | 176 | df_agg = df_sorted.groupby('device_id').agg(agg) 177 | df_agg.columns = pd.Index(['device_quchong' + e[0] + "_" + e[1].upper() for e in df_agg.columns.tolist()]) 178 | df_agg = df_agg.reset_index() 179 | 180 | 181 | # In[52]: 182 | 183 | 184 | df_agg.to_csv('device_quchong_start_app_w2c.csv', index=None) 185 | 186 | -------------------------------------------------------------------------------- /chizhu/single_model/get_nn_feat.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import seaborn as sns 3 | import numpy as np 4 | from tqdm import tqdm 5 | from sklearn.decomposition import LatentDirichletAllocation 6 | from sklearn.model_selection import train_test_split 7 | from sklearn.metrics import accuracy_score 8 | import lightgbm as lgb 9 | from datetime import datetime, timedelta 10 | import matplotlib.pyplot as plt 11 | import time 12 | from sklearn.feature_extraction.text import TfidfTransformer 13 | from sklearn.feature_extraction.text import CountVectorizer 14 | %matplotlib inline 15 | 16 | #add 17 | import gc 18 | from sklearn import preprocessing 19 | from sklearn.feature_extraction.text import TfidfVectorizer 20 | 21 | from scipy.sparse import hstack, vstack 22 | from sklearn.model_selection import StratifiedKFold 23 | from sklearn.model_selection import cross_val_score 24 | # from skopt.space import Integer, Categorical, Real, Log10 25 | # from skopt.utils import use_named_args 26 | # from skopt import gp_minimize 27 | from gensim.models import Word2Vec, FastText 28 | import gensim 29 | import re 30 | from config import path 31 | # path = "/dev/shm/chizhu_data/data/" 32 | ###这里是原始文件的地址,务必修改这里的路径 33 | 34 | test = pd.read_csv(path+'deviceid_test.tsv', sep='\t', names=['device_id']) 35 | train = pd.read_csv(path+'deviceid_train.tsv', sep='\t', 36 | names=['device_id', 'sex', 'age']) 37 | brand = pd.read_table(path+'deviceid_brand.tsv', 38 | names=['device_id', 'vendor', 'version']) 39 | packtime = pd.read_table(path+'deviceid_package_start_close.tsv', 40 | names=['device_id', 'app', 'start', 'close']) 41 | packages = pd.read_csv(path+'deviceid_packages.tsv', 42 | sep='\t', names=['device_id', 'apps']) 43 | 44 | packtime['period'] = (packtime['close'] - packtime['start'])/1000 45 | packtime['start'] = pd.to_datetime(packtime['start'], unit='ms') 46 | app_use_time = packtime.groupby(['app'])['period'].agg('sum').reset_index() 47 | app_use_top100 = app_use_time.sort_values( 48 | 
by='period', ascending=False)[:100]['app'] 49 | device_app_use_time = packtime.groupby(['device_id', 'app'])[ 50 | 'period'].agg('sum').reset_index() 51 | use_time_top100_statis = device_app_use_time.set_index( 52 | 'app').loc[list(app_use_top100)].reset_index() 53 | top100_statis = use_time_top100_statis.pivot( 54 | index='device_id', columns='app', values='period').reset_index() 55 | 56 | top100_statis = top100_statis.fillna(0) 57 | 58 | # 手机品牌预处理 59 | brand['vendor'] = brand['vendor'].astype( 60 | str).apply(lambda x: x.split(' ')[0].upper()) 61 | brand['ph_ver'] = brand['vendor'] + '_' + brand['version'] 62 | 63 | ph_ver = brand['ph_ver'].value_counts() 64 | ph_ver_cnt = pd.DataFrame(ph_ver).reset_index() 65 | ph_ver_cnt.columns = ['ph_ver', 'ph_ver_cnt'] 66 | 67 | brand = pd.merge(left=brand, right=ph_ver_cnt, on='ph_ver') 68 | 69 | # 针对长尾分布做的一点处理 70 | mask = (brand.ph_ver_cnt < 100) 71 | brand.loc[mask, 'ph_ver'] = 'other' 72 | 73 | train = pd.merge(brand[['device_id', 'ph_ver']], 74 | train, on='device_id', how='right') 75 | test = pd.merge(brand[['device_id', 'ph_ver']], 76 | test, on='device_id', how='right') 77 | train['ph_ver'] = train['ph_ver'].astype(str) 78 | test['ph_ver'] = test['ph_ver'].astype(str) 79 | 80 | # 将 ph_ver 进行 label encoder 81 | ph_ver_le = preprocessing.LabelEncoder() 82 | train['ph_ver'] = ph_ver_le.fit_transform(train['ph_ver']) 83 | test['ph_ver'] = ph_ver_le.transform(test['ph_ver']) 84 | train['label'] = train['sex'].astype(str) + '-' + train['age'].astype(str) 85 | label_le = preprocessing.LabelEncoder() 86 | train['label'] = label_le.fit_transform(train['label']) 87 | 88 | test['sex'] = -1 89 | test['age'] = -1 90 | test['label'] = -1 91 | data = pd.concat([train, test], ignore_index=True) 92 | # data.shape 93 | 94 | ph_ver_dummy = pd.get_dummies(data['ph_ver']) 95 | ph_ver_dummy.columns = ['ph_ver_' + str(i) 96 | for i in range(ph_ver_dummy.shape[1])] 97 | 98 | data = pd.concat([data, ph_ver_dummy], axis=1) 99 | 100 | del data['ph_ver'] 101 | 102 | train = data[data.sex != -1] 103 | test = data[data.sex == -1] 104 | # train.shape, test.shape 105 | 106 | # 每个app的总使用次数统计 107 | app_num = packtime['app'].value_counts().reset_index() 108 | app_num.columns = ['app', 'app_num'] 109 | packtime = pd.merge(left=packtime, right=app_num, on='app') 110 | # 同样的,针对长尾分布做些处理(尝试过不做处理,或换其他阈值,这个100的阈值最高) 111 | packtime.loc[packtime.app_num < 100, 'app'] = 'other' 112 | 113 | # 统计每台设备的app数量 114 | df_app = packtime[['device_id', 'app']] 115 | apps = df_app.drop_duplicates().groupby(['device_id'])[ 116 | 'app'].apply(' '.join).reset_index() 117 | apps['app_length'] = apps['app'].apply(lambda x: len(x.split(' '))) 118 | 119 | train = pd.merge(train, apps, on='device_id', how='left') 120 | test = pd.merge(test, apps, on='device_id', how='left') 121 | 122 | # packtime['period'] = (packtime['close'] - packtime['start'])/1000 123 | # packtime['start'] = pd.to_datetime(packtime['start'], unit='ms') 124 | packtime['dayofweek'] = packtime['start'].dt.dayofweek 125 | packtime['hour'] = packtime['start'].dt.hour 126 | # packtime = packtime[(packtime['start'] < '2017-03-31 23:59:59') & (packtime['start'] > '2017-03-01 00:00:00')] 127 | 128 | app_use_time = packtime.groupby(['device_id', 'dayofweek'])[ 129 | 'period'].agg('sum').reset_index() 130 | week_app_use = app_use_time.pivot_table( 131 | values='period', columns='dayofweek', index='device_id').reset_index() 132 | week_app_use = week_app_use.fillna(0) 133 | week_app_use.columns = ['device_id'] + \ 134 | ['week_day_' + str(i) for 
i in range(0, 7)] 135 | 136 | week_app_use['week_max'] = week_app_use.max(axis=1) 137 | week_app_use['week_min'] = week_app_use.min(axis=1) 138 | week_app_use['week_sum'] = week_app_use.sum(axis=1) 139 | week_app_use['week_std'] = week_app_use.std(axis=1) 140 | 141 | # ''' 142 | # for i in range(0, 7): 143 | # week_app_use['week_day_' + str(i)] = week_app_use['week_day_' + str(i)] / week_app_use['week_sum'] 144 | # ''' 145 | 146 | user_behavior = pd.read_csv('data/user_behavior.csv') 147 | user_behavior['app_len_max'] = user_behavior['app_len_max'].astype(np.float64) 148 | del user_behavior['app'] 149 | train = pd.merge(train, user_behavior, on='device_id', how='left') 150 | test = pd.merge(test, user_behavior, on='device_id', how='left') 151 | 152 | train = pd.merge(train, week_app_use, on='device_id', how='left') 153 | test = pd.merge(test, week_app_use, on='device_id', how='left') 154 | 155 | top100_statis.columns = ['device_id'] + \ 156 | ['top100_statis_' + str(i) for i in range(0, 100)] 157 | train = pd.merge(train, top100_statis, on='device_id', how='left') 158 | test = pd.merge(test, top100_statis, on='device_id', how='left') 159 | 160 | train.to_csv("data/train_statistic_feat.csv", index=False) 161 | test.to_csv("data/test_statistic_feat.csv", index=False) 162 | -------------------------------------------------------------------------------- /THLUO/11.hcc_device_brand_age_sex.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | import pandas as pd 8 | import seaborn as sns 9 | import numpy as np 10 | from tqdm import tqdm 11 | from sklearn.decomposition import LatentDirichletAllocation 12 | from sklearn.cross_validation import train_test_split 13 | from sklearn.metrics import accuracy_score 14 | import lightgbm as lgb 15 | from datetime import datetime,timedelta 16 | import time 17 | from sklearn.feature_extraction.text import TfidfTransformer 18 | from sklearn.feature_extraction.text import CountVectorizer 19 | from sklearn.preprocessing import LabelEncoder 20 | import gc 21 | from sklearn.model_selection import StratifiedKFold 22 | 23 | 24 | 25 | # In[2]: 26 | 27 | print ('11.hcc_device_brand_age_sex.py') 28 | path='input/' 29 | data=pd.DataFrame() 30 | #sex_age=pd.read_excel('./data/性别年龄对照表.xlsx') 31 | 32 | 33 | # In[3]: 34 | 35 | 36 | deviceid_packages=pd.read_csv(path+'deviceid_packages.tsv',sep='\t',names=['device_id','apps']) 37 | deviceid_test=pd.read_csv(path+'deviceid_test.tsv',sep='\t',names=['device_id']) 38 | deviceid_train=pd.read_csv(path+'deviceid_train.tsv',sep='\t',names=['device_id','sex','age']) 39 | deviceid_brand = pd.read_csv(path+'deviceid_brand.tsv',sep='\t', names=['device_id','device_brand', 'device_type']) 40 | deviceid_package_start_close = pd.read_csv(path+'deviceid_package_start_close.tsv',sep='\t', names=['device_id','app_id','start_time','close_time']) 41 | package_label = pd.read_csv(path+'package_label.tsv',sep='\t',names=['app_id','app_parent_type', 'app_child_type']) 42 | 43 | 44 | #deviceid_brand['device_brand'] = deviceid_brand.device_brand.apply(lambda x : str(x).split(' ')[0]) 45 | 46 | 47 | #转换成对应的数字 48 | lbl = LabelEncoder() 49 | lbl.fit(list(deviceid_brand.device_brand.values)) 50 | deviceid_brand['device_brand'] = lbl.transform(list(deviceid_brand.device_brand.values)) 51 | 52 | lbl = LabelEncoder() 53 | lbl.fit(list(deviceid_brand.device_type.values)) 54 | deviceid_brand['device_type'] = lbl.transform(list(deviceid_brand.device_type.values)) 55 | 56 | 
#转换成对应的数字 57 | lbl = LabelEncoder() 58 | lbl.fit(list(package_label.app_parent_type.values)) 59 | package_label['app_parent_type'] = lbl.transform(list(package_label.app_parent_type.values)) 60 | 61 | lbl = LabelEncoder() 62 | lbl.fit(list(package_label.app_child_type.values)) 63 | package_label['app_child_type'] = lbl.transform(list(package_label.app_child_type.values)) 64 | 65 | 66 | # In[4]: 67 | 68 | 69 | df_train = deviceid_train.merge(deviceid_brand, how='left', on='device_id') 70 | df_train.fillna(-1, inplace=True) 71 | df_test = deviceid_test.merge(deviceid_brand, how='left', on='device_id') 72 | df_test.fillna(-1, inplace=True) 73 | 74 | 75 | # In[5]: 76 | 77 | 78 | df_train['sex'] = df_train.sex.apply(lambda x : 1 if x == 1 else 0) 79 | df_train = df_train.join(pd.get_dummies(df_train["age"], prefix="age").astype(int)) 80 | df_train['sex_age'] = df_train['sex'].map(str) + '_' + df_train['age'].map(str) 81 | Y = df_train['sex_age'] 82 | Y_CAT = pd.Categorical(Y) 83 | df_train['sex_age'] = pd.Series(Y_CAT.codes) 84 | df_train = df_train.join(pd.get_dummies(df_train["sex_age"], prefix="sex_age").astype(int)) 85 | 86 | 87 | # In[6]: 88 | 89 | 90 | sex_age_columns = ['sex_age_' + str(i) for i in range(22)] 91 | sex_age_prior_set = df_train[sex_age_columns].mean().values 92 | age_columns = ['age_' + str(i) for i in range(11)] 93 | age_prior_set = df_train[age_columns].mean().values 94 | sex_prior_prob= df_train.sex.mean() 95 | sex_prior_prob 96 | 97 | 98 | # In[7]: 99 | 100 | 101 | def hcc_encode(train_df, test_df, variable, target, prior_prob, k=5, f=1, g=1, update_df=None): 102 | """ 103 | See "A Preprocessing Scheme for High-Cardinality Categorical Attributes in 104 | Classification and Prediction Problems" by Daniele Micci-Barreca 105 | """ 106 | hcc_name = "_".join(["hcc", variable, target]) 107 | 108 | grouped = train_df.groupby(variable)[target].agg({"size": "size", "mean": "mean"}) 109 | grouped["lambda"] = 1 / (g + np.exp((k - grouped["size"]) / f)) 110 | grouped[hcc_name] = grouped["lambda"] * grouped["mean"] + (1 - grouped["lambda"]) * prior_prob 111 | 112 | df = test_df[[variable]].join(grouped, on=variable, how="left")[hcc_name].fillna(prior_prob) 113 | 114 | if update_df is None: update_df = test_df 115 | if hcc_name not in update_df.columns: update_df[hcc_name] = np.nan 116 | update_df.update(df) 117 | return 118 | 119 | 120 | # In[8]: 121 | 122 | 123 | #拟合年龄 124 | #拟合测试集 125 | # High-Cardinality Categorical encoding 126 | skf = StratifiedKFold(5) 127 | nums = 11 128 | for variable in ['device_brand', 'device_type'] : 129 | for i in range(nums) : 130 | target = age_columns[i] 131 | age_prior_prob = age_prior_set[i] 132 | print (variable, target, age_prior_prob) 133 | hcc_encode(df_train, df_test, variable, target, age_prior_prob, k=5, f=1, g=1, update_df=None) 134 | #拟合验证集 135 | for train, test in skf.split(np.zeros(len(df_train)), df_train['age']): 136 | hcc_encode(df_train.iloc[train], df_train.iloc[test], variable, target, age_prior_prob, k=5, update_df=df_train) 137 | 138 | 139 | # In[9]: 140 | 141 | 142 | #拟合性别 143 | #拟合测试集 144 | # High-Cardinality Categorical encoding 145 | skf = StratifiedKFold(5) 146 | for variable in ['device_brand', 'device_type'] : 147 | target = 'sex' 148 | print (variable, target, sex_prior_prob) 149 | hcc_encode(df_train, df_test, variable, target, sex_prior_prob, k=5, f=1, g=1, update_df=None) 150 | #拟合验证集 151 | for train, test in skf.split(np.zeros(len(df_train)), df_train['age']): 152 | hcc_encode(df_train.iloc[train], 
df_train.iloc[test], variable, target, sex_prior_prob, k=5, f=1, g=1, update_df=df_train) 153 | 154 | 155 | # In[10]: 156 | 157 | 158 | #拟合性别年龄 159 | #拟合测试集 160 | # High-Cardinality Categorical encoding 161 | skf = StratifiedKFold(5) 162 | nums = 22 163 | for variable in ['device_brand', 'device_type'] : 164 | for i in range(nums) : 165 | target = sex_age_columns[i] 166 | sex_age_prior_prob = sex_age_prior_set[i] 167 | print (variable, target, sex_age_prior_prob) 168 | hcc_encode(df_train, df_test, variable, target, sex_age_prior_prob, k=5, f=1, g=1, update_df=None) 169 | #拟合验证集 170 | for train, test in skf.split(np.zeros(len(df_train)), df_train['sex_age']): 171 | hcc_encode(df_train.iloc[train], df_train.iloc[test], variable, target, sex_age_prior_prob, k=5, update_df=df_train) 172 | 173 | 174 | # In[14]: 175 | 176 | 177 | hcc_columns = ['device_id'] + ['hcc_device_brand_age_' + str(i) for i in range(11)] + ['hcc_device_brand_sex'] + ['hcc_device_type_age_' + str(i) for i in range(11)] + ['hcc_device_type_sex'] + ['hcc_device_type_sex_age_' + str(i) for i in range(22)] 178 | df_total = pd.concat([df_train[hcc_columns], df_test[hcc_columns]]) 179 | 180 | 181 | # In[15]: 182 | 183 | 184 | df_total.to_csv('hcc_device_brand_age_sex.csv', index=None) 185 | 186 | -------------------------------------------------------------------------------- /nb_cz_lwl_wcm/1_get_age_reg.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | 4 | ####### 尝试骚操作,单独针对这个表 5 | import pandas as pd 6 | from sklearn.cluster import KMeans 7 | from sklearn.linear_model import LogisticRegression, SGDClassifier, PassiveAggressiveClassifier, RidgeClassifier, Ridge, \ 8 | PassiveAggressiveRegressor 9 | from sklearn.metrics import mean_squared_error 10 | from sklearn.model_selection import KFold 11 | from sklearn.naive_bayes import BernoulliNB, MultinomialNB 12 | from sklearn.svm import LinearSVC, LinearSVR 13 | 14 | train = pd.read_csv('Demo/deviceid_train.tsv', sep='\t', header=None) 15 | test = pd.read_csv('Demo/deviceid_test.tsv', sep='\t', header=None) 16 | test_id = test[0] 17 | def get_label(row): 18 | return row[2] 19 | train['label'] = train.apply(lambda row:get_label(row), axis=1) 20 | data_all = pd.concat([train, test], axis=0) 21 | data_all = data_all.rename({0:'id'}, axis=1) 22 | del data_all[1],data_all[2] 23 | 24 | deviceid_packages = pd.read_csv('Demo/deviceid_packages.tsv', sep='\t', header=None) 25 | deviceid_packages = deviceid_packages.rename({0: 'id', 1: 'packages_names'}, axis=1) 26 | package_label = pd.read_csv('Demo/package_label.tsv', sep='\t', header=None) 27 | package_label = package_label.rename({0:'packages_name', 1:'packages_type'},axis=1) 28 | dict_label = dict(zip(list(package_label['packages_name']), list(package_label['packages_type']))) 29 | 30 | data_all = pd.merge(data_all, deviceid_packages, on='id', how='left') 31 | 32 | feature = pd.DataFrame() 33 | 34 | import numpy as np 35 | 36 | # app个数 37 | # 毒特征? 
38 | # feature['app_count'] = data_all['packages_names'].apply(lambda row: len(str(row).split(','))) 39 | 40 | # 对此数据做countvector,和tfidfvector,并在一起跑几个学习模型 41 | # 引申出来的count和tfidf,跑基本机器学习分类模型 42 | data_all['package_str'] = data_all['packages_names'].apply(lambda row: str(row).replace(',', ' ')) 43 | def get_more_information(row): 44 | result = ' ' 45 | start = True 46 | row_list = row.split(',') 47 | for i in row_list: 48 | try: 49 | if start: 50 | result = dict_label[i] 51 | start = False 52 | else: 53 | result = result + ' ' + dict_label[i] 54 | except KeyError: 55 | pass 56 | return result 57 | data_all['package_str_more_information'] = data_all['packages_names'].apply(lambda row: get_more_information(str(row))) 58 | 59 | print(data_all) 60 | 61 | from sklearn.feature_extraction.text import CountVectorizer 62 | from sklearn.feature_extraction.text import TfidfVectorizer 63 | import scipy.sparse 64 | 65 | count_vec = CountVectorizer() 66 | count_csr_basic = count_vec.fit_transform(data_all['package_str']) 67 | tfidf_vec = TfidfVectorizer() 68 | tfidf_vec_basic = tfidf_vec.fit_transform(data_all['package_str']) 69 | 70 | count_vec = CountVectorizer() 71 | count_csr_more = count_vec.fit_transform(data_all['package_str_more_information']) 72 | 73 | tfidf_vec = TfidfVectorizer() 74 | tfidf_vec_more = tfidf_vec.fit_transform(data_all['package_str_more_information']) 75 | 76 | data_feature = scipy.sparse.csr_matrix(scipy.sparse.hstack([count_csr_basic, tfidf_vec_basic, 77 | count_csr_more, tfidf_vec_more])) 78 | 79 | train_feature = data_feature[:len(train)] 80 | score = train['label'] 81 | test_feature = data_feature[len(train):] 82 | number = len(np.unique(score)) 83 | 84 | X = train_feature 85 | test = test_feature 86 | y = score 87 | 88 | n_flods = 5 89 | kf = KFold(n_splits=n_flods,shuffle=True,random_state=1017) 90 | kf = kf.split(X) 91 | 92 | def xx_mse_s(y_true,y_pre): 93 | y_true = y_true 94 | y_pre = pd.DataFrame({'res': list(y_pre)}) 95 | return mean_squared_error(y_true,y_pre['res'].values) 96 | 97 | ######################## ridge reg #########################3 98 | cv_pred = [] 99 | xx_mse = [] 100 | stack = np.zeros((len(y),1)) 101 | stack_te = np.zeros((len(test_id),1)) 102 | model_1 = Ridge(solver='auto', fit_intercept=True, alpha=0.4, max_iter=250, normalize=False, tol=0.01,random_state=1017) 103 | for i ,(train_fold,test_fold) in enumerate(kf): 104 | X_train, X_validate, label_train, label_validate = X[train_fold, :], X[test_fold, :], y[train_fold], y[test_fold] 105 | model_1.fit(X_train, label_train) 106 | val_ = model_1.predict(X=X_validate) 107 | stack[test_fold] = np.array(val_).reshape(len(val_),1) 108 | print(xx_mse_s(label_validate, val_)) 109 | cv_pred.append(model_1.predict(test)) 110 | xx_mse.append(xx_mse_s(label_validate, val_)) 111 | import numpy as np 112 | print('xx_result',np.mean(xx_mse)) 113 | s = 0 114 | for i in cv_pred: 115 | s = s+i 116 | s = s/n_flods 117 | print(stack) 118 | print(s) 119 | df_stack1 = pd.DataFrame(stack) 120 | df_stack2 = pd.DataFrame(s) 121 | df_stack = pd.concat([df_stack1,df_stack2 122 | ], axis=0) 123 | df_stack.to_csv('feature/tfidf_ling_reg.csv', encoding='utf8', index=None) 124 | 125 | ######################## par reg ######################### 126 | kf = KFold(n_splits=n_flods,shuffle=True,random_state=1017) 127 | kf = kf.split(X) 128 | cv_pred = [] 129 | xx_mse = [] 130 | stack = np.zeros((len(y),1)) 131 | model_1 = PassiveAggressiveRegressor(fit_intercept=True, max_iter=280, tol=0.01,random_state=1017) 132 | for i 
,(train_fold,test_fold) in enumerate(kf): 133 | X_train, X_validate, label_train, label_validate = X[train_fold, :], X[test_fold, :], y[train_fold], y[test_fold] 134 | model_1.fit(X_train, label_train) 135 | val_ = model_1.predict(X=X_validate) 136 | stack[test_fold] = np.array(val_).reshape(len(val_),1) 137 | print(xx_mse_s(label_validate, val_)) 138 | cv_pred.append(model_1.predict(test)) 139 | xx_mse.append(xx_mse_s(label_validate, val_)) 140 | import numpy as np 141 | print('xx_result',np.mean(xx_mse)) 142 | s = 0 143 | for i in cv_pred: 144 | s = s+i 145 | s = s/n_flods 146 | print(stack) 147 | print(s) 148 | df_stack1 = pd.DataFrame(stack) 149 | df_stack2 = pd.DataFrame(s) 150 | df_stack = pd.concat([df_stack1,df_stack2 151 | ], axis=0) 152 | df_stack.to_csv('feature/tfidf_par_reg.csv', encoding='utf8', index=None) 153 | 154 | ######################## svr reg ######################### 155 | kf = KFold(n_splits=n_flods,shuffle=True,random_state=1017) 156 | kf = kf.split(X) 157 | cv_pred = [] 158 | xx_mse = [] 159 | stack = np.zeros((len(y),1)) 160 | model_1 = LinearSVR(random_state=1017) 161 | for i ,(train_fold,test_fold) in enumerate(kf): 162 | X_train, X_validate, label_train, label_validate = X[train_fold, :], X[test_fold, :], y[train_fold], y[test_fold] 163 | model_1.fit(X_train, label_train) 164 | val_ = model_1.predict(X=X_validate) 165 | stack[test_fold] = np.array(val_).reshape(len(val_),1) 166 | print(xx_mse_s(label_validate, val_)) 167 | cv_pred.append(model_1.predict(test)) 168 | xx_mse.append(xx_mse_s(label_validate, val_)) 169 | import numpy as np 170 | print('xx_result',np.mean(xx_mse)) 171 | s = 0 172 | for i in cv_pred: 173 | s = s+i 174 | s = s/n_flods 175 | print(stack) 176 | print(s) 177 | df_stack1 = pd.DataFrame(stack) 178 | df_stack2 = pd.DataFrame(s) 179 | df_stack = pd.concat([df_stack1,df_stack2 180 | ], axis=0) 181 | df_stack.to_csv('feature/tfidf_svr_reg.csv', encoding='utf8', index=None) 182 | 183 | -------------------------------------------------------------------------------- /THLUO/25.thluo_22_xgb.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | import pandas as pd 8 | import seaborn as sns 9 | import numpy as np 10 | from tqdm import tqdm 11 | from sklearn.decomposition import LatentDirichletAllocation 12 | from sklearn.cross_validation import train_test_split 13 | from sklearn.metrics import accuracy_score 14 | import lightgbm as lgb 15 | import xgboost as xgb 16 | from datetime import datetime,timedelta 17 | import time 18 | from sklearn.feature_extraction.text import TfidfTransformer 19 | from sklearn.feature_extraction.text import CountVectorizer 20 | from sklearn.preprocessing import LabelEncoder 21 | import gc 22 | from feat_util import * 23 | 24 | 25 | # In[2]: 26 | 27 | print ('25.thluo_22_xgb.py') 28 | path='input/' 29 | data=pd.DataFrame() 30 | #sex_age=pd.read_excel('./data/性别年龄对照表.xlsx') 31 | 32 | 33 | # In[3]: 34 | 35 | 36 | deviceid_packages=pd.read_csv(path+'deviceid_packages.tsv',sep='\t',names=['device_id','apps']) 37 | deviceid_test=pd.read_csv(path+'deviceid_test.tsv',sep='\t',names=['device_id']) 38 | deviceid_train=pd.read_csv(path+'deviceid_train.tsv',sep='\t',names=['device_id','sex','age']) 39 | 40 | 41 | # In[4]: 42 | 43 | 44 | df_train = pd.concat([deviceid_train, deviceid_test]) 45 | 46 | 47 | # In[5]: 48 | 49 | 50 | df_train 51 | 52 | 53 | # In[6]: 54 | 55 | 56 | df_sex_prob_oof = pd.read_csv('device_sex_prob_oof.csv') 57 | 
df_age_prob_oof = pd.read_csv('device_age_prob_oof.csv') 58 | df_start_close_sex_prob_oof = pd.read_csv('start_close_sex_prob_oof.csv') 59 | #后面两个,线上线下不对应,线下过拟合了 60 | df_start_close_age_prob_oof = pd.read_csv('start_close_age_prob_oof.csv') 61 | df_tfidf_lr_sex_age_prob_oof = pd.read_csv('tfidf_lr_sex_age_prob_oof.csv') 62 | #之前的有用的 63 | df_sex_age_bin_prob_oof = pd.read_csv('sex_age_bin_prob_oof.csv') 64 | 65 | df_age_bin_prob_oof = pd.read_csv('age_bin_prob_oof.csv') 66 | df_hcc_device_brand_age_sex = pd.read_csv('hcc_device_brand_age_sex.csv') 67 | df_device_age_regression_prob_oof = pd.read_csv('device_age_regression_prob_oof.csv') 68 | df_device_start_GRU_pred = pd.read_csv('device_start_GRU_pred.csv') 69 | df_device_start_GRU_pred_age = pd.read_csv('device_start_GRU_pred_age.csv') 70 | df_device_all_GRU_pred = pd.read_csv('device_all_GRU_pred.csv') 71 | df_lgb_sex_age_prob_oof = pd.read_csv('lgb_sex_age_prob_oof.csv') 72 | df_device_start_capsule_pred = pd.read_csv('device_start_capsule_pred.csv') 73 | df_device_start_textcnn_pred = pd.read_csv('device_start_textcnn_pred.csv') 74 | df_device_start_text_dpcnn_pred = pd.read_csv('device_start_text_dpcnn_pred.csv') 75 | df_device_start_lstm_pred = pd.read_csv('device_start_lstm_pred.csv') 76 | df_att_nn_feat_v6 = pd.read_csv('att_nn_feat_v6.csv') 77 | df_att_nn_feat_v6.columns = ['device_id'] + ['att_nn_feat_' + str(i) for i in range(22)] 78 | 79 | #过拟合特征 80 | del df_start_close_age_prob_oof['device_app_groupedstart_close_age_prob_oof_4_MEAN'] 81 | del df_start_close_sex_prob_oof['device_app_groupedstart_close_sex_prob_oof_MIN'] 82 | del df_start_close_sex_prob_oof['device_app_groupedstart_close_sex_prob_oof_MAX'] 83 | 84 | 85 | # In[7]: 86 | 87 | 88 | df_train_w2v = df_train.merge(df_sex_prob_oof, on='device_id', how='left') 89 | df_train_w2v = df_train_w2v.merge(df_age_prob_oof, on='device_id', how='left') 90 | df_train_w2v = df_train_w2v.merge(df_start_close_sex_prob_oof, on='device_id', how='left') 91 | df_train_w2v = df_train_w2v.merge(df_start_close_age_prob_oof, on='device_id', how='left') 92 | df_train_w2v = df_train_w2v.merge(df_sex_age_bin_prob_oof, on='device_id', how='left') 93 | df_train_w2v = df_train_w2v.merge(df_age_bin_prob_oof, on='device_id', how='left') 94 | df_train_w2v = df_train_w2v.merge(df_hcc_device_brand_age_sex, on='device_id', how='left') 95 | df_train_w2v = df_train_w2v.merge(df_device_age_regression_prob_oof, on='device_id', how='left') 96 | df_train_w2v = df_train_w2v.merge(df_device_start_GRU_pred, on='device_id', how='left') 97 | df_train_w2v = df_train_w2v.merge(df_device_start_GRU_pred_age, on='device_id', how='left') 98 | df_train_w2v = df_train_w2v.merge(df_device_all_GRU_pred, on='device_id', how='left') 99 | df_train_w2v = df_train_w2v.merge(df_lgb_sex_age_prob_oof, on='device_id', how='left') 100 | df_train_w2v = df_train_w2v.merge(df_device_start_capsule_pred, on='device_id', how='left') 101 | df_train_w2v = df_train_w2v.merge(df_device_start_textcnn_pred, on='device_id', how='left') 102 | df_train_w2v = df_train_w2v.merge(df_device_start_text_dpcnn_pred, on='device_id', how='left') 103 | df_train_w2v = df_train_w2v.merge(df_device_start_lstm_pred, on='device_id', how='left') 104 | df_train_w2v = df_train_w2v.merge(df_att_nn_feat_v6, on='device_id', how='left') 105 | 106 | 107 | # In[9]: 108 | 109 | 110 | df_train_w2v['sex'] = df_train_w2v['sex'].apply(lambda x:str(x)) 111 | df_train_w2v['age'] = df_train_w2v['age'].apply(lambda x:str(x)) 112 | def tool(x): 113 | if x=='nan': 114 | return x 115 
| else: 116 | return str(int(float(x))) 117 | df_train_w2v['sex']=df_train_w2v['sex'].apply(tool) 118 | df_train_w2v['age']=df_train_w2v['age'].apply(tool) 119 | df_train_w2v['sex_age']=df_train_w2v['sex']+'-'+df_train_w2v['age'] 120 | df_train_w2v = df_train_w2v.replace({'nan':np.NaN,'nan-nan':np.NaN}) 121 | 122 | 123 | # In[11]: 124 | 125 | 126 | train = df_train_w2v[df_train_w2v['sex'].notnull()] 127 | test = df_train_w2v[df_train_w2v['sex'].isnull()] 128 | 129 | X = train.drop(['sex','age','sex_age','device_id'],axis=1) 130 | Y = train['sex_age'] 131 | Y_CAT = pd.Categorical(Y) 132 | Y = pd.Series(Y_CAT.codes) 133 | 134 | 135 | # In[14]: 136 | 137 | 138 | from sklearn.model_selection import KFold, StratifiedKFold 139 | gc.collect() 140 | #seed = 2048 141 | seed = 666 142 | num_folds = 5 143 | folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=seed) 144 | 145 | sub_list = [] 146 | 147 | cate_feat = ['device_type','device_brand'] 148 | 149 | for n_fold, (train_idx, valid_idx) in enumerate(folds.split(X, Y)): 150 | train_x, train_y = X.iloc[train_idx], Y.iloc[train_idx] 151 | valid_x, valid_y = X.iloc[valid_idx], Y.iloc[valid_idx] 152 | 153 | xg_train = xgb.DMatrix(train_x, label=train_y) 154 | xg_val = xgb.DMatrix(valid_x, label=valid_y) 155 | 156 | param = { 157 | 'objective' : 'multi:softprob', 158 | 'eta' : 0.03, 159 | 'max_depth' : 3, 160 | 'num_class' : 22, 161 | 'eval_metric' : 'mlogloss', 162 | 'min_child_weight' : 3, 163 | 'subsample' : 0.7, 164 | 'colsample_bytree' : 0.7, 165 | 'seed' : 2006, 166 | 'nthread' : 5 167 | } 168 | 169 | num_rounds = 1000 170 | 171 | watchlist = [ (xg_train,'train'), (xg_val, 'val') ] 172 | model = xgb.train(param, xg_train, num_rounds, watchlist, early_stopping_rounds=100, verbose_eval=50) 173 | 174 | test_matrix = xgb.DMatrix(test[X.columns.values]) 175 | sub = pd.DataFrame(model.predict(test_matrix)) 176 | sub_list.append(sub) 177 | 178 | 179 | # In[15]: 180 | 181 | 182 | sub = (sub_list[0] + sub_list[1] + sub_list[2] + sub_list[3] + sub_list[4]) / num_folds 183 | sub 184 | 185 | 186 | # In[16]: 187 | 188 | 189 | sub.columns=Y_CAT.categories 190 | sub['DeviceID']=test['device_id'].values 191 | sub=sub[['DeviceID', '1-0', '1-1', '1-2', '1-3', '1-4', '1-5', '1-6', '1-7','1-8', '1-9', '1-10', '2-0', '2-1', '2-2', '2-3', '2-4', '2-5', '2-6', '2-7', '2-8', '2-9', '2-10']] 192 | sub.to_csv('th_22_results_xgb.csv',index=False) 193 | 194 | -------------------------------------------------------------------------------- /linwangli/code/lgb_allfeat_condProb.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | from catboost import Pool, CatBoostClassifier, cv 4 | import pandas as pd 5 | import seaborn as sns 6 | import numpy as np 7 | from tqdm import tqdm 8 | from sklearn.decomposition import LatentDirichletAllocation 9 | from sklearn.model_selection import train_test_split 10 | from sklearn.metrics import accuracy_score 11 | import lightgbm as lgb 12 | from datetime import datetime,timedelta 13 | import matplotlib.pyplot as plt 14 | import time 15 | from sklearn.feature_extraction.text import TfidfTransformer 16 | from sklearn.feature_extraction.text import CountVectorizer 17 | import gc 18 | from sklearn import preprocessing 19 | from sklearn.feature_extraction.text import TfidfVectorizer 20 | 21 | from scipy.sparse import hstack, vstack 22 | from sklearn.model_selection import StratifiedKFold 23 | from sklearn.model_selection import cross_val_score 24 | 
from skopt.space import Integer, Categorical, Real, Log10 25 | from skopt.utils import use_named_args 26 | from skopt import gp_minimize 27 | import re 28 | 29 | 30 | # 读入数据 31 | train = pd.read_csv('../dataset/deviceid_train.tsv', sep='\t', names=['device_id', 'sex', 'age']) 32 | all_feat = pd.read_csv('../dataset/all_feat.csv') 33 | 34 | 35 | data_all = pd.merge(left=all_feat, right=train, on='device_id', how='left') 36 | train = data_all[:50000] 37 | test = data_all[50000:] 38 | train = train.fillna(-1) 39 | test = test.fillna(-1) 40 | del data_all 41 | gc.collect() 42 | use_feats = all_feat.columns[1:] 43 | use_feats 44 | 45 | 46 | # P(age) 47 | 48 | Y = train['sex'] - 1 49 | X_train = train[use_feats] 50 | X_test = test[use_feats] 51 | kfold = StratifiedKFold(n_splits=5, random_state=10, shuffle=True) 52 | oof_preds1 = np.zeros((X_train.shape[0], )) 53 | sub1 = np.zeros((X_test.shape[0], )) 54 | for i, (train_index, test_index) in enumerate(kfold.split(X_train, Y)): 55 | X_tr, X_vl, y_tr, y_vl = X_train.iloc[train_index], X_train.iloc[test_index], Y.iloc[train_index], Y.iloc[test_index] 56 | dtrain = lgb.Dataset(X_tr, label=y_tr) 57 | dvalid = lgb.Dataset(X_vl, y_vl, reference=dtrain) 58 | params = { 59 | 'boosting_type': 'gbdt', 60 | 'max_depth':6, 61 | 'objective':'binary', 62 | 'num_leaves':31, 63 | 'subsample': 0.85, 64 | 'colsample_bytree': 0.2, 65 | 'lambda_l1':0.00007995302080034896, 66 | 'lambda_l2':0.0003648648811380991, 67 | 'subsample_freq':12, 68 | 'learning_rate': 0.012, 69 | 'min_child_weight':5.5 70 | } 71 | 72 | model = lgb.train(params, 73 | dtrain, 74 | num_boost_round=4000, 75 | valid_sets=dvalid, 76 | early_stopping_rounds=100, 77 | verbose_eval=100) 78 | 79 | oof_preds1[test_index] = model.predict(X_vl, num_iteration=model.best_iteration) 80 | sub1 += model.predict(X_test, num_iteration=model.best_iteration)/kfold.n_splits 81 | 82 | 83 | # P(age|sex = 2) 84 | 85 | train['sex_pred'] = train['sex'] 86 | test['sex_pred'] = 1 87 | 88 | use_feats = list(train.columns[1:-3]) 89 | use_feats = use_feats + ['sex_pred'] 90 | 91 | X_train = train[use_feats] 92 | X_test = test[use_feats] 93 | 94 | Y = train['age'] 95 | kfold = StratifiedKFold(n_splits=10, random_state=10, shuffle=True) 96 | oof_preds2_1 = np.zeros((X_train.shape[0], 11)) 97 | sub2_1 = np.zeros((X_test.shape[0], 11)) 98 | for i, (train_index, test_index) in enumerate(kfold.split(X_train, Y)): 99 | X_tr, X_vl, y_tr, y_vl = X_train.iloc[train_index], X_train.iloc[test_index], Y.iloc[train_index], Y.iloc[test_index] 100 | 101 | 102 | dtrain = lgb.Dataset(X_tr, label=y_tr) 103 | dvalid = lgb.Dataset(X_vl, y_vl, reference=dtrain) 104 | params = { 105 | 'boosting_type': 'gbdt', 106 | 'max_depth':6, 107 | 'metric': {'multi_logloss'}, 108 | 'num_class':11, 109 | 'objective':'multiclass', 110 | 'num_leaves':31, 111 | 'subsample': 0.9, 112 | 'colsample_bytree': 0.2, 113 | 'lambda_l1':0.0001, 114 | 'lambda_l2':0.00111, 115 | 'subsample_freq':10, 116 | 'learning_rate': 0.012, 117 | 'min_child_weight':10 118 | } 119 | 120 | model = lgb.train(params, 121 | dtrain, 122 | num_boost_round=4000, 123 | valid_sets=dvalid, 124 | early_stopping_rounds=100, 125 | verbose_eval=100) 126 | 127 | oof_preds2_1[test_index] = model.predict(X_vl, num_iteration=model.best_iteration) 128 | sub2_1 += model.predict(X_test, num_iteration=model.best_iteration)/kfold.n_splits 129 | 130 | 131 | # P(age|sex = 2) 132 | 133 | train['sex_pred'] = train['sex'] 134 | test['sex_pred'] = 2 135 | 136 | use_feats = list(train.columns[1:-3]) 137 | use_feats = 
use_feats + ['sex_pred'] 138 | 139 | X_train = train[use_feats] 140 | X_test = test[use_feats] 141 | 142 | 143 | Y = train['age'] 144 | kfold = StratifiedKFold(n_splits=10, random_state=10, shuffle=True) 145 | oof_preds2_2 = np.zeros((X_train.shape[0], 11)) 146 | sub2_2 = np.zeros((X_test.shape[0], 11)) 147 | for i, (train_index, test_index) in enumerate(kfold.split(X_train, Y)): 148 | X_tr, X_vl, y_tr, y_vl = X_train.iloc[train_index], X_train.iloc[test_index], Y.iloc[train_index], Y.iloc[test_index] 149 | 150 | 151 | dtrain = lgb.Dataset(X_tr, label=y_tr) 152 | dvalid = lgb.Dataset(X_vl, y_vl, reference=dtrain) 153 | params = { 154 | 'boosting_type': 'gbdt', 155 | 'max_depth':6, 156 | 'metric': {'multi_logloss'}, 157 | 'num_class':11, 158 | 'objective':'multiclass', 159 | 'num_leaves':31, 160 | 'subsample': 0.9, 161 | 'colsample_bytree': 0.2, 162 | 'lambda_l1':0.0001, 163 | 'lambda_l2':0.00111, 164 | 'subsample_freq':10, 165 | 'learning_rate': 0.012, 166 | 'min_child_weight':10 167 | } 168 | 169 | model = lgb.train(params, 170 | dtrain, 171 | num_boost_round=4000, 172 | valid_sets=dvalid, 173 | early_stopping_rounds=100, 174 | verbose_eval=100) 175 | 176 | oof_preds2_2[test_index] = model.predict(X_vl, num_iteration=model.best_iteration) 177 | sub2_2 += model.predict(X_test, num_iteration=model.best_iteration)/kfold.n_splits 178 | 179 | 180 | # 保存测试集的预测结果 181 | sub1 = pd.DataFrame(sub1, columns=['sex2']) 182 | 183 | sub1['sex1'] = 1-sub1['sex2'] 184 | sub2 = pd.DataFrame(sub2_1, columns=['age%s'%i for i in range(11)]) 185 | sub = pd.DataFrame(test['device_id'].values, columns=['DeviceID']) 186 | 187 | for i in ['sex1', 'sex2']: 188 | for j in ['age%s'%i for i in range(11)]: 189 | sub[i+'_'+j] = sub1[i] * sub2[j] 190 | sub.columns = ['DeviceID', '1-0', '1-1', '1-2', '1-3', '1-4', '1-5', '1-6', 191 | '1-7','1-8', '1-9', '1-10', '2-0', '2-1', '2-2', '2-3', '2-4', 192 | '2-5', '2-6', '2-7', '2-8', '2-9', '2-10'] 193 | 194 | sub.to_csv('test_pred.csv', index=False) 195 | 196 | 197 | # 保存训练集五折的预测结果 198 | oof_preds1 = pd.DataFrame(oof_preds1, columns=['sex2']) 199 | oof_preds1['sex1'] = 1-oof_preds1['sex2'] 200 | 201 | oof_preds2_1 = pd.DataFrame(oof_preds2_1, columns=['age%s'%i for i in range(11)]) 202 | oof_preds2_2 = pd.DataFrame(oof_preds2_2, columns=['age%s'%i for i in range(11)]) 203 | 204 | oof_preds = train[['device_id']] 205 | oof_preds.columns = ['DeviceID'] 206 | 207 | for i in ['age%s'%i for i in range(11)]: 208 | oof_preds['sex1_'+i] = oof_preds1['sex1'] * oof_preds2_1[i] 209 | for i in ['age%s'%i for i in range(11)]: 210 | oof_preds['sex2_'+i] = oof_preds1['sex2'] * oof_preds2_2[i] 211 | 212 | oof_preds.columns = ['DeviceID', '1-0', '1-1', '1-2', '1-3', '1-4', '1-5', '1-6', 213 | '1-7','1-8', '1-9', '1-10', '2-0', '2-1', '2-2', '2-3', '2-4', 214 | '2-5', '2-6', '2-7', '2-8', '2-9', '2-10'] 215 | 216 | oof_preds.to_csv('train_pred.csv', index=False) 217 | 218 | 219 | 220 | 221 | 222 | -------------------------------------------------------------------------------- /THLUO/14.device_start_GRU_pred_age.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | # coding: utf-8 8 | import feather 9 | import os 10 | import re 11 | import sys 12 | import gc 13 | import random 14 | import pandas as pd 15 | import numpy as np 16 | import gensim 17 | from gensim.models import Word2Vec 18 | from gensim.models.word2vec import LineSentence 19 | from scipy import stats 20 | import tensorflow as tf 21 | import keras 22 
| from keras.layers import * 23 | from keras.models import * 24 | from keras.optimizers import * 25 | from keras.callbacks import * 26 | from keras.preprocessing import text, sequence 27 | from keras.utils import to_categorical 28 | from keras.engine.topology import Layer 29 | from sklearn.preprocessing import LabelEncoder 30 | from keras.utils import np_utils 31 | from keras.utils.training_utils import multi_gpu_model 32 | from sklearn.model_selection import train_test_split 33 | from sklearn.metrics import f1_score 34 | from sklearn.model_selection import KFold 35 | from sklearn.metrics import accuracy_score 36 | from sklearn.preprocessing import LabelEncoder 37 | from sklearn.metrics import f1_score 38 | from TextModel import * 39 | import warnings 40 | warnings.filterwarnings('ignore') 41 | config = tf.ConfigProto() 42 | config.gpu_options.allow_growth = True 43 | session = tf.Session(config=config) 44 | 45 | 46 | # In[2]: 47 | print('14.device_start_GRU_pred_age.py') 48 | 49 | df_doc = pd.read_csv('01.device_click_app_sorted_by_start.csv') 50 | deviceid_test=pd.read_csv('input/deviceid_test.tsv',sep='\t',names=['device_id']) 51 | deviceid_train=pd.read_csv('input/deviceid_train.tsv',sep='\t',names=['device_id','sex','age']) 52 | df_total = pd.concat([deviceid_train, deviceid_test]) 53 | df_doc = df_doc.merge(df_total, on='device_id', how='left') 54 | 55 | 56 | df_wv2_all = pd.read_csv('w2c_all_emb.csv') 57 | 58 | dic_w2c_all = {} 59 | for row in df_wv2_all.values : 60 | app_id = row[0] 61 | vector = row[1:] 62 | dic_w2c_all[app_id] = vector 63 | 64 | 65 | # In[3]: 66 | 67 | 68 | train = df_doc[df_doc['age'].notnull()] 69 | test = df_doc[df_doc['age'].isnull()] 70 | train.reset_index(drop=True, inplace=True) 71 | test.reset_index(drop=True, inplace=True) 72 | 73 | lb = LabelEncoder() 74 | train_label = lb.fit_transform(train['age'].values) 75 | train['class'] = train_label 76 | 77 | 78 | # In[5]: 79 | 80 | 81 | column_name="app_list" 82 | word_seq_len = 900 83 | victor_size = 200 84 | num_words = 35000 85 | batch_size = 64 86 | classification = 11 87 | kfold=10 88 | 89 | 90 | # In[6]: 91 | 92 | 93 | from sklearn.metrics import log_loss 94 | 95 | def get_mut_label(y_label) : 96 | results = [] 97 | for ele in y_label : 98 | results.append(ele.argmax()) 99 | return results 100 | 101 | class RocAucEvaluation(Callback): 102 | def __init__(self, validation_data=(), interval=1): 103 | super(Callback, self).__init__() 104 | 105 | self.interval = interval 106 | self.X_val, self.y_val = validation_data 107 | 108 | def on_epoch_end(self, epoch, logs={}): 109 | if epoch % self.interval == 0: 110 | y_pred = self.model.predict(self.X_val, verbose=0) 111 | val_y = get_mut_label(self.y_val) 112 | score = log_loss(val_y, y_pred) 113 | print("\n mlogloss - epoch: %d - score: %.6f \n" % (epoch+1, score)) 114 | 115 | 116 | # In[7]: 117 | 118 | 119 | #词向量 120 | def w2v_pad(df_train,df_test,col, maxlen_,victor_size, num_words): 121 | 122 | tokenizer = text.Tokenizer(num_words=num_words, lower=False,filters="") 123 | tokenizer.fit_on_texts(list(df_train[col].values)+list(df_test[col].values)) 124 | 125 | train_ = sequence.pad_sequences(tokenizer.texts_to_sequences(df_train[col].values), maxlen=maxlen_) 126 | test_ = sequence.pad_sequences(tokenizer.texts_to_sequences(df_test[col].values), maxlen=maxlen_) 127 | 128 | word_index = tokenizer.word_index 129 | 130 | count = 0 131 | nb_words = len(word_index) 132 | print(nb_words) 133 | all_data=pd.concat([df_train[col],df_test[col]]) 134 | file_name = 
'embedding/' + 'Word2Vec_start_' + col +"_"+ str(victor_size) + '.model' 135 | if not os.path.exists(file_name): 136 | model = Word2Vec([[word for word in document.split(' ')] for document in all_data.values], 137 | size=victor_size, window=5, iter=10, workers=11, seed=2018, min_count=2) 138 | model.save(file_name) 139 | else: 140 | model = Word2Vec.load(file_name) 141 | print("add word2vec finished....") 142 | 143 | 144 | 145 | embedding_word2vec_matrix = np.zeros((nb_words + 1, victor_size)) 146 | for word, i in word_index.items(): 147 | embedding_vector = model[word] if word in model else None 148 | if embedding_vector is not None: 149 | count += 1 150 | embedding_word2vec_matrix[i] = embedding_vector 151 | else: 152 | unk_vec = np.random.random(victor_size) * 0.5 153 | unk_vec = unk_vec - unk_vec.mean() 154 | embedding_word2vec_matrix[i] = unk_vec 155 | 156 | embedding_w2c_all = np.zeros((nb_words + 1, victor_size)) 157 | for word, i in word_index.items(): 158 | embedding_vector = dic_w2c_all[word] 159 | embedding_w2c_all[i] = embedding_vector 160 | 161 | 162 | #embedding_matrix = np.concatenate((embedding_word2vec_matrix,embedding_w2c_all),axis=1) 163 | embedding_matrix = embedding_word2vec_matrix 164 | 165 | return train_, test_, word_index, embedding_matrix 166 | 167 | 168 | # In[8]: 169 | 170 | 171 | train_, test_,word2idx, word_embedding = w2v_pad(train,test,column_name, word_seq_len,victor_size, num_words) 172 | 173 | 174 | # In[11]: 175 | 176 | 177 | my_opt="bi_gru_model" 178 | #参数 179 | Y = train['class'].values 180 | 181 | if not os.path.exists("cache/"+my_opt): 182 | os.mkdir("cache/"+my_opt) 183 | 184 | 185 | # In[17]: 186 | 187 | 188 | from sklearn.model_selection import KFold, StratifiedKFold 189 | gc.collect() 190 | seed = 2006 191 | num_folds = 10 192 | kf = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=seed).split(train_, Y) 193 | 194 | epochs = 4 195 | my_opt=eval(my_opt) 196 | train_model_pred = np.zeros((train_.shape[0], classification)) 197 | test_model_pred = np.zeros((test_.shape[0], classification)) 198 | for i, (train_fold, val_fold) in enumerate(kf): 199 | X_train, X_valid, = train_[train_fold, :], train_[val_fold, :] 200 | y_train, y_valid = Y[train_fold], Y[val_fold] 201 | 202 | y_tra = to_categorical(y_train) 203 | y_val = to_categorical(y_valid) 204 | 205 | #模型 206 | name = str(my_opt.__name__) 207 | 208 | model = my_opt(word_seq_len, word_embedding, classification) 209 | 210 | 211 | RocAuc = RocAucEvaluation(validation_data=(X_valid, y_val), interval=1) 212 | 213 | hist = model.fit(X_train, y_tra, batch_size=batch_size, epochs=epochs, validation_data=(X_valid, y_val), 214 | callbacks=[RocAuc]) 215 | 216 | 217 | train_model_pred[val_fold, :] = model.predict(X_valid) 218 | 219 | 220 | # In[21]: 221 | 222 | 223 | #模型 224 | #用全部的数据预测 225 | train_label = to_categorical(Y) 226 | name = str(my_opt.__name__) 227 | 228 | model = my_opt(word_seq_len, word_embedding, classification) 229 | 230 | 231 | RocAuc = RocAucEvaluation(validation_data=(train_, train_label), interval=1) 232 | 233 | hist = model.fit(train_, train_label, batch_size=batch_size, epochs=epochs, validation_data=(train_, train_label), 234 | callbacks=[RocAuc]) 235 | 236 | 237 | test_model_pred = model.predict(test_) 238 | 239 | 240 | # In[22]: 241 | 242 | 243 | df_train_pred = pd.DataFrame(train_model_pred) 244 | df_test_pred = pd.DataFrame(test_model_pred) 245 | df_train_pred.columns = ['device_start_GRU_pred_age_' + str(i) for i in range(11)] 246 | df_test_pred.columns = 
['device_start_GRU_pred_age_' + str(i) for i in range(11)] 247 | 248 | 249 | # In[23]: 250 | 251 | 252 | df_train_pred = pd.concat([train[['device_id']], df_train_pred], axis=1) 253 | df_test_pred = pd.concat([test[['device_id']], df_test_pred], axis=1) 254 | 255 | 256 | # In[24]: 257 | 258 | 259 | df_results = pd.concat([df_train_pred, df_test_pred]) 260 | df_results.to_csv('device_start_GRU_pred_age.csv', index=None) 261 | 262 | -------------------------------------------------------------------------------- /chizhu/single_model/xgb.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | import pandas as pd 8 | import seaborn as sns 9 | import numpy as np 10 | from tqdm import tqdm 11 | from sklearn.decomposition import LatentDirichletAllocation 12 | from sklearn.model_selection import train_test_split 13 | from sklearn.metrics import accuracy_score 14 | import lightgbm as lgb 15 | from datetime import datetime,timedelta 16 | import matplotlib.pyplot as plt 17 | import time 18 | from sklearn.feature_extraction.text import TfidfTransformer 19 | from sklearn.feature_extraction.text import CountVectorizer 20 | # get_ipython().run_line_magic('matplotlib', 'inline') 21 | 22 | #add 23 | import gc 24 | from sklearn import preprocessing 25 | from sklearn.feature_extraction.text import TfidfVectorizer 26 | 27 | from scipy.sparse import hstack, vstack 28 | from sklearn.model_selection import StratifiedKFold 29 | from sklearn.model_selection import cross_val_score 30 | # from skopt.space import Integer, Categorical, Real, Log10 31 | # from skopt.utils import use_named_args 32 | # from skopt import gp_minimize 33 | from gensim.models import Word2Vec, FastText 34 | import gensim 35 | import re 36 | # path="/dev/shm/chizhu_data/data/" 37 | 38 | 39 | # In[2]: 40 | 41 | 42 | tfidf_feat=pd.read_csv("data/tfidf_classfiy.csv") 43 | tf2=pd.read_csv("data/tfidf_classfiy_package.csv") 44 | train_data=pd.read_csv("data/train_data.csv") 45 | test_data=pd.read_csv("data/test_data.csv") 46 | 47 | 48 | # In[3]: 49 | 50 | 51 | train_data = pd.merge(train_data,tfidf_feat,on="device_id",how="left") 52 | train = pd.merge(train_data,tf2,on="device_id",how="left") 53 | test_data = pd.merge(test_data,tfidf_feat,on="device_id",how="left") 54 | test = pd.merge(test_data,tf2,on="device_id",how="left") 55 | 56 | 57 | # In[4]: 58 | 59 | 60 | features = [x for x in train.columns if x not in ['device_id', 'sex',"age","label","app"]] 61 | Y = train['sex'] - 1 62 | 63 | 64 | # In[19]: 65 | 66 | 67 | import lightgbm as lgb 68 | import xgboost as xgb 69 | from sklearn.metrics import auc, log_loss, roc_auc_score,f1_score,recall_score,precision_score 70 | from sklearn.cross_validation import StratifiedKFold 71 | 72 | kf = StratifiedKFold(Y, n_folds=5, shuffle=True, random_state=1024) 73 | params={ 74 | 'booster':'gbtree', 75 | 76 | 'objective': 'binary:logistic', 77 | # 'is_unbalance':'True', 78 | # 'scale_pos_weight': 1500.0/13458.0, 79 | 'eval_metric': "logloss", 80 | 81 | 'gamma':0.2,#0.2 is ok 82 | 'max_depth':6, 83 | # 'lambda':20, 84 | # "alpha":5, 85 | 'subsample':0.7, 86 | 'colsample_bytree':0.4 , 87 | # 'min_child_weight':2.5, 88 | 'eta': 0.01, 89 | # 'learning_rate':0.01, 90 | "silent":1, 91 | 'seed':1024, 92 | 'nthread':12, 93 | 94 | } 95 | num_round = 3500 96 | early_stopping_rounds = 100 97 | 98 | 99 | # In[20]: 100 | 101 | 102 | aus = [] 103 | sub1 = np.zeros((len(test), )) 104 | pred_oob1=np.zeros((len(train),)) 105 | for 
i,(train_index,test_index) in enumerate(kf): 106 | 107 | tr_x = train[features].reindex(index=train_index, copy=False) 108 | tr_y = Y[train_index] 109 | te_x = train[features].reindex(index=test_index, copy=False) 110 | te_y = Y[test_index] 111 | 112 | # tr_y=tr_y.apply(lambda x:1 if x>0 else 0) 113 | # te_y=te_y.apply(lambda x:1 if x>0 else 0) 114 | d_tr = xgb.DMatrix(tr_x, label=tr_y) 115 | d_te = xgb.DMatrix(te_x, label=te_y) 116 | watchlist = [(d_tr,'train'), 117 | (d_te,'val') 118 | ] 119 | model = xgb.train(params, d_tr, num_boost_round=5500, 120 | evals=watchlist,verbose_eval=200, 121 | early_stopping_rounds=100) 122 | pred = model.predict(d_te) 123 | pred_oob1[test_index] =pred 124 | # te_y=te_y.apply(lambda x:1 if x>0 else 0) 125 | a = log_loss(te_y, pred) 126 | 127 | sub1 += model.predict(xgb.DMatrix(test[features]))/5 128 | 129 | 130 | print ("idx: ", i) 131 | print (" loss: %.5f" % a) 132 | # print " gini: %.5f" % g 133 | aus.append(a) 134 | 135 | print ("mean") 136 | print ("auc: %s" % (sum(aus) / 5.0)) 137 | 138 | 139 | # In[21]: 140 | 141 | 142 | pred_oob1 = pd.DataFrame(pred_oob1, columns=['sex2']) 143 | sub1 = pd.DataFrame(sub1, columns=['sex2']) 144 | res1=pd.concat([pred_oob1,sub1]) 145 | res1['sex1'] = 1-res1['sex2'] 146 | 147 | 148 | # In[22]: 149 | 150 | 151 | import gc 152 | gc.collect() 153 | 154 | 155 | # In[23]: 156 | 157 | 158 | tfidf_feat=pd.read_csv("data/tfidf_age.csv") 159 | tf2=pd.read_csv("data/pack_tfidf_age.csv") 160 | train_data = pd.merge(train_data,tfidf_feat,on="device_id",how="left") 161 | train = pd.merge(train_data,tf2,on="device_id",how="left") 162 | test_data = pd.merge(test_data,tfidf_feat,on="device_id",how="left") 163 | test = pd.merge(test_data,tf2,on="device_id",how="left") 164 | features = [x for x in train.columns if x not in ['device_id',"age","sex","label","app"]] 165 | Y = train['age'] 166 | 167 | 168 | # In[34]: 169 | 170 | 171 | import lightgbm as lgb 172 | import xgboost as xgb 173 | from sklearn.metrics import auc, log_loss, roc_auc_score,f1_score,recall_score,precision_score 174 | from sklearn.cross_validation import StratifiedKFold 175 | 176 | kf = StratifiedKFold(Y, n_folds=5, shuffle=True, random_state=1024) 177 | params={ 178 | 'booster':'gbtree', 179 | 'objective': 'multi:softprob', 180 | # 'is_unbalance':'True', 181 | # 'scale_pos_weight': 1500.0/13458.0, 182 | 'eval_metric': "mlogloss", 183 | 'num_class':11, 184 | 'gamma':0.1,#0.2 is ok 185 | 'max_depth':6, 186 | # 'lambda':20, 187 | # "alpha":5, 188 | 'subsample':0.7, 189 | 'colsample_bytree':0.4 , 190 | # 'min_child_weight':2.5, 191 | 'eta': 0.01, 192 | # 'learning_rate':0.01, 193 | "silent":1, 194 | 'seed':1024, 195 | 'nthread':12, 196 | 197 | } 198 | num_round = 3500 199 | early_stopping_rounds = 100 200 | 201 | 202 | # In[ ]: 203 | 204 | 205 | aus = [] 206 | sub2 = np.zeros((len(test),11 )) 207 | pred_oob2=np.zeros((len(train),11)) 208 | for i,(train_index,test_index) in enumerate(kf): 209 | 210 | tr_x = train[features].reindex(index=train_index, copy=False) 211 | tr_y = Y[train_index] 212 | te_x = train[features].reindex(index=test_index, copy=False) 213 | te_y = Y[test_index] 214 | 215 | # tr_y=tr_y.apply(lambda x:1 if x>0 else 0) 216 | # te_y=te_y.apply(lambda x:1 if x>0 else 0) 217 | d_tr = xgb.DMatrix(tr_x, label=tr_y) 218 | d_te = xgb.DMatrix(te_x, label=te_y) 219 | watchlist = [(d_tr,'train'), 220 | (d_te,'val') 221 | ] 222 | model = xgb.train(params, d_tr, num_boost_round=5500, 223 | evals=watchlist,verbose_eval=200, 224 | early_stopping_rounds=100) 225 | pred = 
model.predict(d_te) 226 | pred_oob2[test_index] =pred 227 | # te_y=te_y.apply(lambda x:1 if x>0 else 0) 228 | a = log_loss(te_y, pred) 229 | 230 | sub2 += model.predict(xgb.DMatrix(test[features]))/5 231 | 232 | 233 | print ("idx: ", i) 234 | print (" loss: %.5f" % a) 235 | # print " gini: %.5f" % g 236 | aus.append(a) 237 | 238 | print ("mean") 239 | print ("auc: %s" % (sum(aus) / 5.0)) 240 | 241 | 242 | # In[ ]: 243 | 244 | 245 | res2_1=np.vstack((pred_oob2,sub2)) 246 | res2_1 = pd.DataFrame(res2_1) 247 | 248 | 249 | # In[ ]: 250 | 251 | 252 | res1.index=range(len(res1)) 253 | res2_1.index=range(len(res2_1)) 254 | final_1=res2_1.copy() 255 | final_2=res2_1.copy() 256 | for i in range(11): 257 | final_1[i]=res1['sex1']*res2_1[i] 258 | final_2[i]=res1['sex2']*res2_1[i] 259 | id_list=pd.concat([train[['device_id']],test[['device_id']]]) 260 | final=id_list 261 | final.index=range(len(final)) 262 | final.columns= ['DeviceID'] 263 | final_pred = pd.concat([final_1,final_2],1) 264 | final=pd.concat([final,final_pred],1) 265 | final.columns = ['DeviceID', '1-0', '1-1', '1-2', '1-3', '1-4', '1-5', '1-6', 266 | '1-7','1-8', '1-9', '1-10', '2-0', '2-1', '2-2', '2-3', '2-4', 267 | '2-5', '2-6', '2-7', '2-8', '2-9', '2-10'] 268 | 269 | final.to_csv('submit/xgb_feat_chizhu.csv', index=False) 270 | 271 | 272 | # In[ ]: 273 | 274 | 275 | test['DeviceID']=test['device_id'] 276 | sub=pd.merge(test[['DeviceID']],final,on="DeviceID",how="left") 277 | sub.to_csv("submit/xgb_chizhu.csv",index=False) 278 | 279 | -------------------------------------------------------------------------------- /THLUO/21.tfidf_lr_sex_age_prob_oof.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | import pandas as pd 8 | import seaborn as sns 9 | import numpy as np 10 | from tqdm import tqdm 11 | from sklearn.decomposition import LatentDirichletAllocation 12 | from sklearn.cross_validation import train_test_split 13 | from sklearn.metrics import accuracy_score 14 | import lightgbm as lgb 15 | from datetime import datetime,timedelta 16 | import time 17 | from sklearn.feature_extraction.text import TfidfTransformer 18 | from sklearn.feature_extraction.text import CountVectorizer 19 | from sklearn.linear_model import LogisticRegression 20 | from sklearn.preprocessing import LabelEncoder 21 | import gc 22 | from sklearn.linear_model import LogisticRegression 23 | from sklearn.metrics import log_loss 24 | 25 | 26 | 27 | # In[2]: 28 | 29 | print('21.tfidf_lr.py') 30 | path='input/' 31 | data=pd.DataFrame() 32 | #sex_age=pd.read_excel('./data/性别年龄对照表.xlsx') 33 | 34 | 35 | # In[3]: 36 | 37 | 38 | deviceid_packages=pd.read_csv(path+'deviceid_packages.tsv',sep='\t',names=['device_id','apps']) 39 | deviceid_test=pd.read_csv(path+'deviceid_test.tsv',sep='\t',names=['device_id']) 40 | deviceid_train=pd.read_csv(path+'deviceid_train.tsv',sep='\t',names=['device_id','sex','age']) 41 | deviceid_brand = pd.read_csv(path+'deviceid_brand.tsv',sep='\t', names=['device_id','device_brand', 'device_type']) 42 | deviceid_package_start_close = pd.read_csv(path+'deviceid_package_start_close.tsv',sep='\t', names=['device_id','app_id','start_time','close_time']) 43 | package_label = pd.read_csv(path+'package_label.tsv',sep='\t',names=['app_id','app_parent_type', 'app_child_type']) 44 | 45 | 46 | deviceid_brand['device_brand'] = deviceid_brand.device_brand.apply(lambda x : str(x).split(' ')[0]) 47 | 48 | df_temp = 
deviceid_brand.groupby('device_brand')['device_id'].count().reset_index().rename(columns={'device_id':'brand_counts'}) 49 | one_time_brand = df_temp[df_temp.brand_counts == 1].device_brand.values 50 | deviceid_brand['device_brand'] = deviceid_brand.device_brand.apply(lambda x : 'other' if x in one_time_brand else x) 51 | 52 | df_temp = deviceid_brand.groupby('device_brand')['device_id'].count().reset_index().rename(columns={'device_id':'brand_counts'}) 53 | one_time_brand = df_temp[df_temp.brand_counts == 2].device_brand.values 54 | deviceid_brand['device_brand'] = deviceid_brand.device_brand.apply(lambda x : 'other_2' if x in one_time_brand else x) 55 | 56 | df_temp = deviceid_brand.groupby('device_brand')['device_id'].count().reset_index().rename(columns={'device_id':'brand_counts'}) 57 | one_time_brand = df_temp[df_temp.brand_counts == 3].device_brand.values 58 | deviceid_brand['device_brand'] = deviceid_brand.device_brand.apply(lambda x : 'other_3' if x in one_time_brand else x) 59 | 60 | 61 | #转换成对应的数字 62 | lbl = LabelEncoder() 63 | lbl.fit(list(deviceid_brand.device_brand.values)) 64 | deviceid_brand['device_brand'] = lbl.transform(list(deviceid_brand.device_brand.values)) 65 | 66 | lbl = LabelEncoder() 67 | lbl.fit(list(deviceid_brand.device_type.values)) 68 | deviceid_brand['device_type'] = lbl.transform(list(deviceid_brand.device_type.values)) 69 | 70 | #转换成对应的数字 71 | lbl = LabelEncoder() 72 | lbl.fit(list(package_label.app_parent_type.values)) 73 | package_label['app_parent_type'] = lbl.transform(list(package_label.app_parent_type.values)) 74 | 75 | lbl = LabelEncoder() 76 | lbl.fit(list(package_label.app_child_type.values)) 77 | package_label['app_child_type'] = lbl.transform(list(package_label.app_child_type.values)) 78 | 79 | deviceid_train = pd.concat([deviceid_train, deviceid_test]) 80 | 81 | 82 | # In[4]: 83 | 84 | 85 | deviceid_package_start = deviceid_package_start_close[['device_id', 'app_id', 'start_time']] 86 | deviceid_package_start.columns = ['device_id', 'app_id', 'all_time'] 87 | deviceid_package_close = deviceid_package_start_close[['device_id', 'app_id', 'close_time']] 88 | deviceid_package_close.columns = ['device_id', 'app_id', 'all_time'] 89 | deviceid_package_all = pd.concat([deviceid_package_start, deviceid_package_close]) 90 | deviceid_package_all = deviceid_package_all.sort_values(by='all_time') 91 | #deviceid_package_all = deviceid_package_all.merge(deviceid_train, on='device_id', how='left') 92 | 93 | 94 | # In[5]: 95 | 96 | 97 | df = deviceid_package_all.groupby('device_id').apply(lambda x : list(x.app_id)).reset_index().rename(columns = {0 : 'app_list'}) 98 | 99 | 100 | # In[6]: 101 | 102 | 103 | df_sex_prob_oof = pd.read_csv('device_sex_prob_oof.csv') 104 | df_age_prob_oof = pd.read_csv('device_age_prob_oof.csv') 105 | df_start_close_sex_prob_oof = pd.read_csv('start_close_sex_prob_oof.csv') 106 | df_start_close_age_prob_oof = pd.read_csv('start_close_age_prob_oof.csv') 107 | df_start_close_sex_age_prob_oof = pd.read_csv('start_close_sex_age_prob_oof.csv') 108 | 109 | 110 | gc.collect() 111 | df = df.merge(df_sex_prob_oof, on='device_id', how='left') 112 | df = df.merge(df_age_prob_oof, on='device_id', how='left') 113 | df = df.merge(df_start_close_sex_prob_oof, on='device_id', how='left') 114 | df = df.merge(df_start_close_age_prob_oof, on='device_id', how='left') 115 | df = df.merge(df_start_close_sex_age_prob_oof, on='device_id', how='left') 116 | df.fillna(0, inplace=True) 117 | apps = df['app_list'].apply(lambda x:' '.join(x)).tolist() 118 | del 
df['app_list'] 119 | 120 | 121 | df = df.merge(deviceid_train, on='device_id', how='left') 122 | 123 | 124 | # In[8]: 125 | 126 | 127 | vectorizer=CountVectorizer() 128 | transformer=TfidfTransformer() 129 | cntTf = vectorizer.fit_transform(apps) 130 | tfidf=transformer.fit_transform(cntTf) 131 | word=vectorizer.get_feature_names() 132 | weight=tfidf.toarray() 133 | df_weight=pd.DataFrame(weight) 134 | feature=df_weight.columns 135 | 136 | 137 | # In[9]: 138 | 139 | 140 | for i in df.columns.values: 141 | df_weight[i] = df[i] 142 | df_weight[i] = df[i] 143 | 144 | 145 | # In[11]: 146 | 147 | 148 | df_weight['sex'] = df_weight['sex'].apply(lambda x:str(x)) 149 | df_weight['age'] = df_weight['age'].apply(lambda x:str(x)) 150 | def tool(x): 151 | if x == 'nan': 152 | return x 153 | else: 154 | return str(int(float(x))) 155 | df_weight['sex'] = df_weight['sex'].apply(tool) 156 | df_weight['age'] = df_weight['age'].apply(tool) 157 | df_weight['sex_age'] = df_weight['sex']+'-'+df_weight['age'] 158 | df_weight['sex_age'] = df_weight.sex_age.replace({'nan':np.NaN,'nan-nan':np.NaN}) 159 | 160 | 161 | # In[12]: 162 | 163 | 164 | train = df_weight[df_weight.sex_age.notnull()] 165 | train.reset_index(drop=True, inplace=True) 166 | test = df_weight[df_weight.sex_age.isnull()] 167 | test.reset_index(drop=True, inplace=True) 168 | gc.collect() 169 | 170 | 171 | # In[16]: 172 | 173 | 174 | X = train.drop(['sex','age','sex_age','device_id'],axis=1) 175 | Y = train['sex_age'] 176 | Y_CAT = pd.Categorical(Y) 177 | Y = pd.Series(Y_CAT.codes) 178 | 179 | 180 | # In[18]: 181 | 182 | 183 | from sklearn.model_selection import KFold, StratifiedKFold 184 | gc.collect() 185 | seed = 666 186 | num_folds = 5 187 | folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=seed) 188 | 189 | oof_preds = np.zeros([train.shape[0], 22]) 190 | 191 | for n_fold, (train_idx, valid_idx) in enumerate(folds.split(X, Y)): 192 | train_x, train_y = X.iloc[train_idx], Y.iloc[train_idx] 193 | valid_x, valid_y = X.iloc[valid_idx], Y.iloc[valid_idx] 194 | 195 | 196 | clf = LogisticRegression(C=4) 197 | clf.fit(train_x, train_y) 198 | valid_preds=clf.predict_proba(valid_x) 199 | train_preds=clf.predict_proba(train_x) 200 | 201 | oof_preds[valid_idx] = valid_preds 202 | 203 | print (log_loss(train_y.values, train_preds), log_loss(valid_y.values, valid_preds)) 204 | 205 | 206 | oof_train = pd.DataFrame(oof_preds) 207 | oof_train.columns = ['tfidf_lr_sex_age_prob_oof_' + str(i) for i in range(22)] 208 | train_temp = pd.concat([train[['device_id']], oof_train], axis=1) 209 | 210 | 211 | # In[20]: 212 | 213 | 214 | #用全部的数据预测 215 | clf = LogisticRegression(C=4) 216 | clf.fit(X, Y) 217 | train_preds=clf.predict_proba(X) 218 | test_preds=clf.predict_proba(test[X.columns]) 219 | print (log_loss(Y.values, train_preds)) 220 | 221 | oof_test = pd.DataFrame(test_preds) 222 | oof_test.columns = ['tfidf_lr_sex_age_prob_oof_' + str(i) for i in range(22)] 223 | 224 | 225 | # In[24]: 226 | 227 | 228 | oof_test 229 | 230 | 231 | # In[25]: 232 | 233 | 234 | test_temp = pd.concat([test[['device_id']], oof_test], axis=1) 235 | test_temp 236 | 237 | 238 | # In[26]: 239 | 240 | 241 | sex_age_oof = pd.concat([train_temp, test_temp]) 242 | sex_age_oof 243 | 244 | 245 | # In[29]: 246 | 247 | 248 | sex_age_oof.to_csv('tfidf_lr_sex_age_prob_oof.csv', index=None) 249 | 250 | -------------------------------------------------------------------------------- /THLUO/26.thluo_nb_lgb.py: 
-------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | # coding: utf-8 8 | 9 | # In[1]: 10 | 11 | from sklearn.metrics import log_loss 12 | import pandas as pd 13 | import seaborn as sns 14 | import numpy as np 15 | from tqdm import tqdm 16 | from sklearn.decomposition import LatentDirichletAllocation 17 | from sklearn.model_selection import train_test_split 18 | from sklearn.metrics import accuracy_score 19 | import lightgbm as lgb 20 | from datetime import datetime,timedelta 21 | import time 22 | from sklearn.feature_extraction.text import TfidfTransformer 23 | from sklearn.feature_extraction.text import CountVectorizer 24 | # get_ipython().run_line_magic('matplotlib', 'inline')  # IPython-only magic; commented out so the script also runs under plain python 25 | 26 | #add 27 | import gc 28 | from sklearn import preprocessing 29 | from sklearn.feature_extraction.text import TfidfVectorizer 30 | 31 | from scipy.sparse import hstack, vstack 32 | from sklearn.model_selection import StratifiedKFold 33 | from sklearn.model_selection import cross_val_score 34 | # from skopt.space import Integer, Categorical, Real, Log10 35 | # from skopt.utils import use_named_args 36 | # from skopt import gp_minimize 37 | from gensim.models import Word2Vec, FastText 38 | import gensim 39 | import re 40 | import os 41 | path="./" 42 | os.listdir(path) 43 | 44 | 45 | # In[2]: 46 | print ('26.thluo_nb_lgb.py') 47 | 48 | train_id=pd.read_csv("input/deviceid_train.tsv",sep="\t",names=['device_id','sex','age']) 49 | test_id=pd.read_csv("input/deviceid_test.tsv",sep="\t",names=['device_id']) 50 | all_id=pd.concat([train_id[['device_id']],test_id[['device_id']]]) 51 | #nurbs=pd.read_csv("nurbs_feature_all.csv") 52 | #nurbs.columns=["nurbs_"+str(i) for i in nurbs.columns] 53 | thluo = pd.read_csv("thluo_train_best_feat.csv") 54 | del thluo['age'] 55 | del thluo['sex'] 56 | del thluo['sex_age'] 57 | 58 | 59 | # In[7]: 60 | 61 | 62 | feat = thluo.copy() 63 | 64 | 65 | # In[8]: 66 | 67 | 68 | train=pd.merge(train_id,feat,on="device_id",how="left") 69 | test=pd.merge(test_id,feat,on="device_id",how="left") 70 | 71 | 72 | # In[11]: 73 | 74 | 75 | features = [x for x in train.columns if x not in ['device_id', 'sex',"age",]] 76 | Y = train['sex'] - 1 77 | 78 | 79 | # In[12]: 80 | 81 | 82 | from sklearn.model_selection import KFold, StratifiedKFold 83 | gc.collect() 84 | seed = 1024 85 | num_folds = 5 86 | folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=seed) 87 | 88 | 89 | # In[13]: 90 | 91 | 92 | params = { 93 | 'boosting_type': 'gbdt', 94 | 'learning_rate' : 0.02, 95 | #'max_depth':5, 96 | 'num_leaves' : 2 ** 5, 97 | 'metric': {'binary_logloss'}, 98 | #'num_class' : 22, 99 | 'objective' : 'binary', 100 | 'random_state' : 6666, 101 | 'bagging_freq' : 5, 102 | 'feature_fraction' : 0.7, 103 | 'bagging_fraction' : 0.7, 104 | 'min_split_gain' : 0.0970905919552776, 105 | 'min_child_weight' : 9.42012323936088, 106 | } 107 | 108 | 109 | # In[14]: 110 | 111 | 112 | # predict sex (binary): 5-fold out-of-fold probabilities for train 113 | aus = [] 114 | sub1 = np.zeros((len(test), )) 115 | pred_oob1=np.zeros((len(train),)) 116 | for i,(train_index,test_index) in enumerate(folds.split(train[features], Y)): 117 | 118 | tr_x = train[features].reindex(index=train_index, copy=False) 119 | tr_y = Y[train_index] 120 | te_x = train[features].reindex(index=test_index, copy=False) 121 | te_y = Y[test_index] 122 | 123 | lgb_train=lgb.Dataset(tr_x,label=tr_y) 124 | lgb_eval = lgb.Dataset(te_x, te_y, reference=lgb_train) 125 | 126 | gbm = lgb.train(params, lgb_train, num_boost_round=300, 127 | valid_sets=[lgb_train, lgb_eval], verbose_eval=100) 128 | 129 | pred = gbm.predict(te_x[tr_x.columns.values]) 130 | pred_oob1[test_index] =pred 131 | # te_y=te_y.apply(lambda x:1 if x>0 else 0) 132 | a = log_loss(te_y, pred) 133 | 134 | 135 | print ("idx: ", i) 136 | print (" loss: %.5f" % a) 137 | # print " gini: %.5f" % g 138 | aus.append(a) 139 | 140 | print ("mean") 141 | print ("logloss: %s" % (sum(aus) / 5.0)) 142 | 143 | 144 | # In[15]: 145 | 146 | 147 | # train one lgb on the full data 148 | # use the whole train set to predict test 149 | lgb_train = lgb.Dataset(train[features],label=Y) 150 | 151 | gbm = lgb.train(params, lgb_train, num_boost_round=300, valid_sets=lgb_train, verbose_eval=100) 152 | 153 | sub1 = gbm.predict(test[features]) 154 | 155 | 156 | # In[16]: 157 | 158 | 159 | pred_oob1 = pd.DataFrame(pred_oob1, columns=['sex2']) 160 | sub1 = pd.DataFrame(sub1, columns=['sex2']) 161 | res1=pd.concat([pred_oob1,sub1]) 162 | res1['sex1'] = 1-res1['sex2'] 163 | 164 | 165 | # In[18]: 166 | 167 | 168 | 169 | # In[50]: 170 | 171 | 172 | features = [x for x in train.columns if x not in ['device_id',"age"]] 173 | Y = train['age'] 174 | 175 | 176 | # In[51]: 177 | 178 | 179 | import lightgbm as lgb 180 | import xgboost as xgb 181 | from sklearn.metrics import auc, log_loss, roc_auc_score,f1_score,recall_score,precision_score 182 | 183 | 184 | # In[19]: 185 | 186 | 187 | from sklearn.model_selection import KFold, StratifiedKFold 188 | gc.collect() 189 | seed = 1024 190 | num_folds = 5 191 | folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=seed) 192 | 193 | 194 | # In[20]: 195 | 196 | 197 | params = { 198 | 'boosting_type': 'gbdt', 199 | 'learning_rate' : 0.02, 200 | #'max_depth':5, 201 | 'num_leaves' : 2 ** 5, 202 | 'metric': {'multi_logloss'}, 203 | 'num_class' : 11, 204 | 'objective' : 'multiclass', 205 | 'random_state' : 6666, 206 | 'bagging_freq' : 5, 207 | 'feature_fraction' : 0.7, 208 | 'bagging_fraction' : 0.7, 209 | 'min_split_gain' : 0.0970905919552776, 210 | 'min_child_weight' : 9.42012323936088, 211 | } 212 | 213 | 214 | # In[22]: 215 | 216 | 217 | # predict age (11 classes); note that 'sex' stays in `features` here, which is what makes the conditional-probability trick below work 218 | aus = [] 219 | sub2 = np.zeros((len(test),11 )) 220 | pred_oob2=np.zeros((len(train),11)) 221 | models=[] 222 | iters=[] 223 | for i,(train_index,test_index) in enumerate(folds.split(train[features], Y)): 224 | 225 | tr_x = train[features].reindex(index=train_index, copy=False) 226 | tr_y = Y[train_index] 227 | te_x = train[features].reindex(index=test_index, copy=False) 228 | te_y = Y[test_index] 229 | 230 | lgb_train=lgb.Dataset(tr_x,label=tr_y) 231 | lgb_eval = lgb.Dataset(te_x, te_y, reference=lgb_train) 232 | 233 | gbm = lgb.train(params, lgb_train, num_boost_round=430, 234 | valid_sets=[lgb_train, lgb_eval], verbose_eval=100) 235 | 236 | pred = gbm.predict(te_x[tr_x.columns.values]) 237 | pred_oob2[test_index] = pred 238 | # te_y=te_y.apply(lambda x:1 if x>0 else 0) 239 | a = log_loss(te_y, pred) 240 | 241 | #sub2 += gbm.predict(test[features], num_iteration=gbm.best_iteration) / 5 242 | 243 | models.append(gbm) 244 | iters.append(gbm.best_iteration) 245 | 246 | print ("idx: ", i) 247 | print (" loss: %.5f" % a) 248 | # print " gini: %.5f" % g 249 | aus.append(a) 250 | 251 | print ("mean") 252 | print ("logloss: %s" % (sum(aus) / 5.0)) 253 | 254 | 255 | # In[23]: 256 | 257 | 258 | # conditional probabilities P(age | sex): score test twice, with the sex feature forced to 1 and then to 2 259 | #### sex=1 260 | test['sex']=1 261 | # train one lgb on the full data 262 | # use the whole train set to predict test 263 | lgb_train = lgb.Dataset(train[features],label=Y) 264 | 265 | gbm = lgb.train(params, lgb_train, num_boost_round=430, valid_sets=lgb_train, 
verbose_eval=100) 266 | sub2 = gbm.predict(test[features]) 267 | 268 | res2_1=np.vstack((pred_oob2,sub2)) 269 | res2_1 = pd.DataFrame(res2_1) 270 | 271 | 272 | # In[24]: 273 | 274 | 275 | ###sex2 276 | #预测条件概率 277 | test['sex']=2 278 | 279 | sub2 = np.zeros((len(test),11)) 280 | sub2 = gbm.predict(test[features], num_iteration = gbm.best_iteration) 281 | res2_2=np.vstack((pred_oob2,sub2)) 282 | res2_2 = pd.DataFrame(res2_2) 283 | 284 | 285 | # In[27]: 286 | 287 | 288 | res1.index=range(len(res1)) 289 | res2_1.index=range(len(res2_1)) 290 | res2_2.index=range(len(res2_2)) 291 | final_1=res2_1.copy() 292 | final_2=res2_2.copy() 293 | 294 | 295 | # In[28]: 296 | 297 | 298 | for i in range(11): 299 | final_1[i]=res1['sex1'] * res2_1[i] 300 | final_2[i]=res1['sex2'] * res2_2[i] 301 | id_list = pd.concat([train[['device_id']],test[['device_id']]]) 302 | final = id_list 303 | final.index = range(len(final)) 304 | final.columns = ['DeviceID'] 305 | final_pred = pd.concat([final_1,final_2], 1) 306 | final = pd.concat([final,final_pred],1) 307 | final.columns = ['DeviceID', '1-0', '1-1', '1-2', '1-3', '1-4', '1-5', '1-6', 308 | '1-7','1-8', '1-9', '1-10', '2-0', '2-1', '2-2', '2-3', '2-4', 309 | '2-5', '2-6', '2-7', '2-8', '2-9', '2-10'] 310 | 311 | 312 | # In[30]: 313 | 314 | 315 | test['DeviceID']=test['device_id'] 316 | sub=pd.merge(test[['DeviceID']],final,on="DeviceID",how="left") 317 | sub.to_csv("th_lgb_nb.csv",index=False) 318 | 319 | -------------------------------------------------------------------------------- /THLUO/13.device_start_GRU_pred.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | # coding: utf-8 8 | import feather 9 | import os 10 | import re 11 | import sys 12 | import gc 13 | import random 14 | import pandas as pd 15 | import numpy as np 16 | import gensim 17 | from gensim.models import Word2Vec 18 | from gensim.models.word2vec import LineSentence 19 | from scipy import stats 20 | import tensorflow as tf 21 | import keras 22 | from keras.layers import * 23 | from keras.models import * 24 | from keras.optimizers import * 25 | from keras.callbacks import * 26 | from keras.preprocessing import text, sequence 27 | from keras.utils import to_categorical 28 | from keras.engine.topology import Layer 29 | from sklearn.preprocessing import LabelEncoder 30 | from keras.utils import np_utils 31 | from keras.utils.training_utils import multi_gpu_model 32 | from sklearn.model_selection import train_test_split 33 | from sklearn.metrics import f1_score 34 | from sklearn.model_selection import KFold 35 | from sklearn.metrics import accuracy_score 36 | from sklearn.preprocessing import LabelEncoder 37 | from sklearn.metrics import f1_score 38 | import warnings 39 | from TextModel import * 40 | warnings.filterwarnings('ignore') 41 | config = tf.ConfigProto() 42 | config.gpu_options.allow_growth = True 43 | session = tf.Session(config=config) 44 | 45 | 46 | # In[2]: 47 | print ('13.device_start_GRU_pred.py') 48 | 49 | df_doc = pd.read_csv('01.device_click_app_sorted_by_start.csv') 50 | deviceid_test=pd.read_csv('input/deviceid_test.tsv',sep='\t',names=['device_id']) 51 | deviceid_train=pd.read_csv('input/deviceid_train.tsv',sep='\t',names=['device_id','sex','age']) 52 | df_total = pd.concat([deviceid_train, deviceid_test]) 53 | df_doc = df_doc.merge(df_total, on='device_id', how='left') 54 | 55 | 56 | df_wv2_all = pd.read_csv('w2c_all_emb.csv') 57 | 58 | dic_w2c_all = {} 59 | for row in df_wv2_all.values : 
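# build an app_id -> 200-d embedding-vector lookup from the shared w2c_all_emb.csv
# (presumably the output of 3.w2c_all_emb.py); w2v_pad() below uses it to fill an
# alternative embedding matrix, though the concatenation with the locally trained
# Word2Vec vectors is left commented out.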
60 | app_id = row[0] 61 | vector = row[1:] 62 | dic_w2c_all[app_id] = vector 63 | 64 | 65 | # In[3]: 66 | 67 | 68 | df_doc['sex'] = df_doc['sex'].apply(lambda x:str(x)) 69 | df_doc['age'] = df_doc['age'].apply(lambda x:str(x)) 70 | def tool(x): 71 | if x=='nan': 72 | return x 73 | else: 74 | return str(int(float(x))) 75 | df_doc['sex']=df_doc['sex'].apply(tool) 76 | df_doc['age']=df_doc['age'].apply(tool) 77 | df_doc['sex_age']=df_doc['sex']+'-'+df_doc['age'] 78 | df_doc = df_doc.replace({'nan':np.NaN,'nan-nan':np.NaN}) 79 | train = df_doc[df_doc['sex_age'].notnull()] 80 | test = df_doc[df_doc['sex_age'].isnull()] 81 | train.reset_index(drop=True, inplace=True) 82 | test.reset_index(drop=True, inplace=True) 83 | 84 | lb = LabelEncoder() 85 | train_label = lb.fit_transform(train['sex_age'].values) 86 | train['class'] = train_label 87 | 88 | 89 | # In[5]: 90 | 91 | 92 | column_name="app_list" 93 | word_seq_len = 900 94 | victor_size = 200 95 | num_words = 35000 96 | batch_size = 64 97 | classification = 22 98 | kfold=10 99 | 100 | 101 | # In[6]: 102 | 103 | 104 | from sklearn.metrics import log_loss 105 | 106 | def get_mut_label(y_label) : 107 | results = [] 108 | for ele in y_label : 109 | results.append(ele.argmax()) 110 | return results 111 | 112 | class RocAucEvaluation(Callback): 113 | def __init__(self, validation_data=(), interval=1): 114 | super(Callback, self).__init__() 115 | 116 | self.interval = interval 117 | self.X_val, self.y_val = validation_data 118 | 119 | def on_epoch_end(self, epoch, logs={}): 120 | if epoch % self.interval == 0: 121 | y_pred = self.model.predict(self.X_val, verbose=0) 122 | val_y = get_mut_label(self.y_val) 123 | score = log_loss(val_y, y_pred) 124 | print("\n mlogloss - epoch: %d - score: %.6f \n" % (epoch+1, score)) 125 | 126 | 127 | # In[7]: 128 | 129 | 130 | #词向量 131 | def w2v_pad(df_train,df_test,col, maxlen_,victor_size, num_words): 132 | 133 | tokenizer = text.Tokenizer(num_words=num_words, lower=False,filters="") 134 | tokenizer.fit_on_texts(list(df_train[col].values)+list(df_test[col].values)) 135 | 136 | train_ = sequence.pad_sequences(tokenizer.texts_to_sequences(df_train[col].values), maxlen=maxlen_) 137 | test_ = sequence.pad_sequences(tokenizer.texts_to_sequences(df_test[col].values), maxlen=maxlen_) 138 | 139 | word_index = tokenizer.word_index 140 | 141 | count = 0 142 | nb_words = len(word_index) 143 | print(nb_words) 144 | all_data=pd.concat([df_train[col],df_test[col]]) 145 | file_name = 'embedding/' + 'Word2Vec_start_' + col +"_"+ str(victor_size) + '.model' 146 | if not os.path.exists(file_name): 147 | model = Word2Vec([[word for word in document.split(' ')] for document in all_data.values], 148 | size=victor_size, window=5, iter=10, workers=11, seed=2018, min_count=2) 149 | model.save(file_name) 150 | else: 151 | model = Word2Vec.load(file_name) 152 | print("add word2vec finished....") 153 | 154 | 155 | 156 | embedding_word2vec_matrix = np.zeros((nb_words + 1, victor_size)) 157 | for word, i in word_index.items(): 158 | embedding_vector = model[word] if word in model else None 159 | if embedding_vector is not None: 160 | count += 1 161 | embedding_word2vec_matrix[i] = embedding_vector 162 | else: 163 | unk_vec = np.random.random(victor_size) * 0.5 164 | unk_vec = unk_vec - unk_vec.mean() 165 | embedding_word2vec_matrix[i] = unk_vec 166 | 167 | embedding_w2c_all = np.zeros((nb_words + 1, victor_size)) 168 | for word, i in word_index.items(): 169 | embedding_vector = dic_w2c_all[word] 170 | embedding_w2c_all[i] = embedding_vector 171 
| 172 | 173 | #embedding_matrix = np.concatenate((embedding_word2vec_matrix,embedding_w2c_all),axis=1) 174 | embedding_matrix = embedding_word2vec_matrix 175 | 176 | return train_, test_, word_index, embedding_matrix 177 | 178 | 179 | # In[8]: 180 | 181 | 182 | train_, test_,word2idx, word_embedding = w2v_pad(train,test,column_name, word_seq_len,victor_size, num_words) 183 | 184 | 185 | # In[11]: 186 | 187 | 188 | my_opt="bi_gru_model" 189 | #参数 190 | Y = train['class'].values 191 | 192 | if not os.path.exists("cache/"+my_opt): 193 | os.mkdir("cache/"+my_opt) 194 | 195 | 196 | # In[12]: 197 | 198 | 199 | from sklearn.model_selection import KFold, StratifiedKFold 200 | gc.collect() 201 | seed = 2006 202 | num_folds = 10 203 | kf = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=seed).split(train_, Y) 204 | 205 | 206 | # In[13]: 207 | 208 | 209 | epochs = 4 210 | my_opt=eval(my_opt) 211 | train_model_pred = np.zeros((train_.shape[0], classification)) 212 | test_model_pred = np.zeros((test_.shape[0], classification)) 213 | for i, (train_fold, val_fold) in enumerate(kf): 214 | X_train, X_valid, = train_[train_fold, :], train_[val_fold, :] 215 | y_train, y_valid = Y[train_fold], Y[val_fold] 216 | 217 | y_tra = to_categorical(y_train) 218 | y_val = to_categorical(y_valid) 219 | 220 | #模型 221 | name = str(my_opt.__name__) 222 | 223 | model = my_opt(word_seq_len, word_embedding, classification) 224 | 225 | 226 | RocAuc = RocAucEvaluation(validation_data=(X_valid, y_val), interval=1) 227 | 228 | hist = model.fit(X_train, y_tra, batch_size=batch_size, epochs=epochs, validation_data=(X_valid, y_val), 229 | callbacks=[RocAuc]) 230 | 231 | 232 | train_model_pred[val_fold, :] = model.predict(X_valid) 233 | 234 | 235 | # In[26]: 236 | 237 | 238 | #模型 239 | #用全部的数据预测 240 | train_label = to_categorical(Y) 241 | name = str(my_opt.__name__) 242 | 243 | model = my_opt(word_seq_len, word_embedding, classification) 244 | 245 | 246 | RocAuc = RocAucEvaluation(validation_data=(train_, train_label), interval=1) 247 | 248 | hist = model.fit(train_, train_label, batch_size=batch_size, epochs=epochs, validation_data=(train_, train_label), 249 | callbacks=[RocAuc]) 250 | 251 | 252 | test_model_pred = model.predict(test_) 253 | 254 | 255 | # In[27]: 256 | 257 | 258 | df_train_pred = pd.DataFrame(train_model_pred) 259 | df_test_pred = pd.DataFrame(test_model_pred) 260 | df_train_pred.columns = ['device_start_GRU_pred_' + str(i) for i in range(22)] 261 | df_test_pred.columns = ['device_start_GRU_pred_' + str(i) for i in range(22)] 262 | 263 | 264 | # In[35]: 265 | 266 | 267 | df_train_pred = pd.concat([train[['device_id']], df_train_pred], axis=1) 268 | df_test_pred = pd.concat([test[['device_id']], df_test_pred], axis=1) 269 | 270 | 271 | # In[37]: 272 | 273 | 274 | df_results = pd.concat([df_train_pred, df_test_pred]) 275 | df_results.to_csv('device_start_GRU_pred.csv', index=None) 276 | 277 | -------------------------------------------------------------------------------- /THLUO/15.device_all_GRU_pred.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | # coding: utf-8 8 | import feather 9 | import os 10 | import re 11 | import sys 12 | import gc 13 | import random 14 | import pandas as pd 15 | import numpy as np 16 | import gensim 17 | from gensim.models import Word2Vec 18 | from gensim.models.word2vec import LineSentence 19 | from scipy import stats 20 | import tensorflow as tf 21 | import keras 22 | from 
keras.layers import * 23 | from keras.models import * 24 | from keras.optimizers import * 25 | from keras.callbacks import * 26 | from keras.preprocessing import text, sequence 27 | from keras.utils import to_categorical 28 | from keras.engine.topology import Layer 29 | from sklearn.preprocessing import LabelEncoder 30 | from keras.utils import np_utils 31 | from keras.utils.training_utils import multi_gpu_model 32 | from sklearn.model_selection import train_test_split 33 | from sklearn.metrics import f1_score 34 | from sklearn.model_selection import KFold 35 | from sklearn.metrics import accuracy_score 36 | from sklearn.preprocessing import LabelEncoder 37 | from TextModel import * 38 | from sklearn.metrics import f1_score 39 | import warnings 40 | warnings.filterwarnings('ignore') 41 | config = tf.ConfigProto() 42 | config.gpu_options.allow_growth = True 43 | session = tf.Session(config=config) 44 | 45 | 46 | # In[2]: 47 | print('15.device_all_GRU_pred.py') 48 | 49 | df_doc = pd.read_csv('03.device_click_app_sorted_by_all.csv') 50 | deviceid_test=pd.read_csv('input/deviceid_test.tsv',sep='\t',names=['device_id']) 51 | deviceid_train=pd.read_csv('input/deviceid_train.tsv',sep='\t',names=['device_id','sex','age']) 52 | df_total = pd.concat([deviceid_train, deviceid_test]) 53 | df_doc = df_doc.merge(df_total, on='device_id', how='left') 54 | 55 | 56 | df_wv2_all = pd.read_csv('w2c_all_emb.csv') 57 | 58 | dic_w2c_all = {} 59 | for row in df_wv2_all.values : 60 | app_id = row[0] 61 | vector = row[1:] 62 | dic_w2c_all[app_id] = vector 63 | 64 | # In[3]: 65 | 66 | 67 | df_doc['sex'] = df_doc['sex'].apply(lambda x:str(x)) 68 | df_doc['age'] = df_doc['age'].apply(lambda x:str(x)) 69 | def tool(x): 70 | if x=='nan': 71 | return x 72 | else: 73 | return str(int(float(x))) 74 | df_doc['sex']=df_doc['sex'].apply(tool) 75 | df_doc['age']=df_doc['age'].apply(tool) 76 | df_doc['sex_age']=df_doc['sex']+'-'+df_doc['age'] 77 | df_doc = df_doc.replace({'nan':np.NaN,'nan-nan':np.NaN}) 78 | train = df_doc[df_doc['sex_age'].notnull()] 79 | test = df_doc[df_doc['sex_age'].isnull()] 80 | train.reset_index(drop=True, inplace=True) 81 | test.reset_index(drop=True, inplace=True) 82 | 83 | lb = LabelEncoder() 84 | train_label = lb.fit_transform(train['sex_age'].values) 85 | train['class'] = train_label 86 | 87 | 88 | # In[6]: 89 | 90 | 91 | column_name="app_list" 92 | word_seq_len = 1800 93 | victor_size = 200 94 | num_words = 35000 95 | batch_size = 64 96 | classification = 22 97 | kfold=10 98 | 99 | 100 | # In[7]: 101 | 102 | 103 | from sklearn.metrics import log_loss 104 | 105 | def get_mut_label(y_label) : 106 | results = [] 107 | for ele in y_label : 108 | results.append(ele.argmax()) 109 | return results 110 | 111 | class RocAucEvaluation(Callback): 112 | def __init__(self, validation_data=(), interval=1): 113 | super(Callback, self).__init__() 114 | 115 | self.interval = interval 116 | self.X_val, self.y_val = validation_data 117 | 118 | def on_epoch_end(self, epoch, logs={}): 119 | if epoch % self.interval == 0: 120 | y_pred = self.model.predict(self.X_val, verbose=0) 121 | val_y = get_mut_label(self.y_val) 122 | score = log_loss(val_y, y_pred) 123 | print("\n mlogloss - epoch: %d - score: %.6f \n" % (epoch+1, score)) 124 | 125 | 126 | # In[14]: 127 | 128 | 129 | #词向量 130 | def w2v_pad(df_train,df_test,col, maxlen_,victor_size, num_words): 131 | 132 | tokenizer = text.Tokenizer(num_words=num_words, lower=False,filters="") 133 | tokenizer.fit_on_texts(list(df_train[col].values)+list(df_test[col].values)) 134 
| 135 | train_ = sequence.pad_sequences(tokenizer.texts_to_sequences(df_train[col].values), maxlen=maxlen_) 136 | test_ = sequence.pad_sequences(tokenizer.texts_to_sequences(df_test[col].values), maxlen=maxlen_) 137 | 138 | word_index = tokenizer.word_index 139 | 140 | count = 0 141 | nb_words = len(word_index) 142 | print(nb_words) 143 | all_data=pd.concat([df_train[col],df_test[col]]) 144 | file_name = 'embedding/' + 'Word2Vec_all' + col +"_"+ str(victor_size) + '.model' 145 | if not os.path.exists(file_name): 146 | model = Word2Vec([[word for word in document.split(' ')] for document in all_data.values], 147 | size=victor_size, window=30, iter=10, workers=11, seed=2018, min_count=2) 148 | model.save(file_name) 149 | else: 150 | model = Word2Vec.load(file_name) 151 | print("add word2vec finished....") 152 | 153 | 154 | 155 | embedding_word2vec_matrix = np.zeros((nb_words + 1, victor_size)) 156 | for word, i in word_index.items(): 157 | embedding_vector = model[word] if word in model else None 158 | if embedding_vector is not None: 159 | count += 1 160 | embedding_word2vec_matrix[i] = embedding_vector 161 | else: 162 | unk_vec = np.random.random(victor_size) * 0.5 163 | unk_vec = unk_vec - unk_vec.mean() 164 | embedding_word2vec_matrix[i] = unk_vec 165 | 166 | embedding_w2c_all = np.zeros((nb_words + 1, victor_size)) 167 | for word, i in word_index.items(): 168 | embedding_vector = dic_w2c_all[word] 169 | embedding_w2c_all[i] = embedding_vector 170 | 171 | 172 | #embedding_matrix = np.concatenate((embedding_word2vec_matrix,embedding_w2c_all),axis=1) 173 | embedding_matrix = embedding_word2vec_matrix 174 | 175 | return train_, test_, word_index, embedding_matrix 176 | 177 | 178 | # In[15]: 179 | 180 | 181 | train_, test_,word2idx, word_embedding = w2v_pad(train,test,column_name, word_seq_len,victor_size, num_words) 182 | 183 | 184 | # In[21]: 185 | 186 | 187 | my_opt="bi_gru_model" 188 | #参数 189 | Y = train['class'].values 190 | 191 | if not os.path.exists("cache/"+my_opt): 192 | os.mkdir("cache/"+my_opt) 193 | 194 | 195 | # In[22]: 196 | 197 | 198 | from sklearn.model_selection import KFold, StratifiedKFold 199 | gc.collect() 200 | seed = 2006 201 | num_folds = 10 202 | kf = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=seed).split(train_, Y) 203 | 204 | 205 | # In[23]: 206 | 207 | 208 | epochs = 4 209 | my_opt=eval(my_opt) 210 | train_model_pred = np.zeros((train_.shape[0], classification)) 211 | test_model_pred = np.zeros((test_.shape[0], classification)) 212 | for i, (train_fold, val_fold) in enumerate(kf): 213 | X_train, X_valid, = train_[train_fold, :], train_[val_fold, :] 214 | y_train, y_valid = Y[train_fold], Y[val_fold] 215 | 216 | y_tra = to_categorical(y_train) 217 | y_val = to_categorical(y_valid) 218 | 219 | #模型 220 | name = str(my_opt.__name__) 221 | 222 | model = my_opt(word_seq_len, word_embedding, classification) 223 | 224 | 225 | RocAuc = RocAucEvaluation(validation_data=(X_valid, y_val), interval=1) 226 | 227 | hist = model.fit(X_train, y_tra, batch_size=batch_size, epochs=epochs, validation_data=(X_valid, y_val), 228 | callbacks=[RocAuc]) 229 | 230 | 231 | train_model_pred[val_fold, :] = model.predict(X_valid) 232 | 233 | del model 234 | del hist 235 | gc.collect() 236 | 237 | 238 | # In[27]: 239 | 240 | 241 | #模型 242 | #用全部的数据预测 243 | train_label = to_categorical(Y) 244 | name = str(my_opt.__name__) 245 | 246 | model = my_opt(word_seq_len, word_embedding, classification) 247 | 248 | 249 | RocAuc = RocAucEvaluation(validation_data=(train_, 
train_label), interval=1) 250 | 251 | hist = model.fit(train_, train_label, batch_size=batch_size, epochs=epochs, validation_data=(train_, train_label), 252 | callbacks=[RocAuc]) 253 | 254 | 255 | test_model_pred = model.predict(test_) 256 | 257 | 258 | # In[28]: 259 | 260 | 261 | df_train_pred = pd.DataFrame(train_model_pred) 262 | df_test_pred = pd.DataFrame(test_model_pred) 263 | df_train_pred.columns = ['device_all_GRU_pred_' + str(i) for i in range(22)] 264 | df_test_pred.columns = ['device_all_GRU_pred_' + str(i) for i in range(22)] 265 | 266 | 267 | # In[29]: 268 | 269 | 270 | df_train_pred = pd.concat([train[['device_id']], df_train_pred], axis=1) 271 | df_test_pred = pd.concat([test[['device_id']], df_test_pred], axis=1) 272 | 273 | 274 | # In[30]: 275 | 276 | 277 | df_results = pd.concat([df_train_pred, df_test_pred]) 278 | df_results.to_csv('device_all_GRU_pred.csv', index=None) 279 | 280 | -------------------------------------------------------------------------------- /THLUO/16.device_start_capsule_pred.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | # coding: utf-8 8 | import feather 9 | import os 10 | import re 11 | import sys 12 | import gc 13 | import random 14 | import pandas as pd 15 | import numpy as np 16 | import gensim 17 | from gensim.models import Word2Vec 18 | from gensim.models.word2vec import LineSentence 19 | from scipy import stats 20 | import tensorflow as tf 21 | import keras 22 | from keras.layers import * 23 | from keras.models import * 24 | from keras.optimizers import * 25 | from keras.callbacks import * 26 | from keras.preprocessing import text, sequence 27 | from keras.utils import to_categorical 28 | from keras.engine.topology import Layer 29 | from sklearn.preprocessing import LabelEncoder 30 | from keras.utils import np_utils 31 | from keras.utils.training_utils import multi_gpu_model 32 | from sklearn.model_selection import train_test_split 33 | from sklearn.metrics import f1_score 34 | from sklearn.model_selection import KFold 35 | from sklearn.metrics import accuracy_score 36 | from sklearn.preprocessing import LabelEncoder 37 | from sklearn.metrics import f1_score 38 | import warnings 39 | warnings.filterwarnings('ignore') 40 | config = tf.ConfigProto() 41 | config.gpu_options.allow_growth = True 42 | session = tf.Session(config=config) 43 | 44 | 45 | # In[2]: 46 | print ('16.device_start_capsule_pred.py') 47 | 48 | df_doc = pd.read_csv('01.device_click_app_sorted_by_start.csv') 49 | deviceid_test=pd.read_csv('input/deviceid_test.tsv',sep='\t',names=['device_id']) 50 | deviceid_train=pd.read_csv('input/deviceid_train.tsv',sep='\t',names=['device_id','sex','age']) 51 | df_total = pd.concat([deviceid_train, deviceid_test]) 52 | df_doc = df_doc.merge(df_total, on='device_id', how='left') 53 | 54 | 55 | df_wv2_all = pd.read_csv('w2c_all_emb.csv') 56 | 57 | dic_w2c_all = {} 58 | for row in df_wv2_all.values : 59 | app_id = row[0] 60 | vector = row[1:] 61 | dic_w2c_all[app_id] = vector 62 | 63 | 64 | # In[3]: 65 | 66 | 67 | df_doc['sex'] = df_doc['sex'].apply(lambda x:str(x)) 68 | df_doc['age'] = df_doc['age'].apply(lambda x:str(x)) 69 | def tool(x): 70 | if x=='nan': 71 | return x 72 | else: 73 | return str(int(float(x))) 74 | df_doc['sex']=df_doc['sex'].apply(tool) 75 | df_doc['age']=df_doc['age'].apply(tool) 76 | df_doc['sex_age']=df_doc['sex']+'-'+df_doc['age'] 77 | df_doc = df_doc.replace({'nan':np.NaN,'nan-nan':np.NaN}) 78 | train = 
df_doc[df_doc['sex_age'].notnull()] 79 | test = df_doc[df_doc['sex_age'].isnull()] 80 | train.reset_index(drop=True, inplace=True) 81 | test.reset_index(drop=True, inplace=True) 82 | 83 | lb = LabelEncoder() 84 | train_label = lb.fit_transform(train['sex_age'].values) 85 | train['class'] = train_label 86 | 87 | 88 | # In[5]: 89 | 90 | 91 | column_name="app_list" 92 | word_seq_len = 900 93 | victor_size = 200 94 | num_words = 35000 95 | batch_size = 64 96 | classification = 22 97 | kfold=10 98 | 99 | 100 | # In[6]: 101 | 102 | 103 | from sklearn.metrics import log_loss 104 | 105 | def get_mut_label(y_label) : 106 | results = [] 107 | for ele in y_label : 108 | results.append(ele.argmax()) 109 | return results 110 | 111 | class RocAucEvaluation(Callback): 112 | def __init__(self, validation_data=(), interval=1): 113 | super(Callback, self).__init__() 114 | 115 | self.interval = interval 116 | self.X_val, self.y_val = validation_data 117 | 118 | def on_epoch_end(self, epoch, logs={}): 119 | if epoch % self.interval == 0: 120 | y_pred = self.model.predict(self.X_val, verbose=0) 121 | val_y = get_mut_label(self.y_val) 122 | score = log_loss(val_y, y_pred) 123 | print("\n mlogloss - epoch: %d - score: %.6f \n" % (epoch+1, score)) 124 | 125 | 126 | # In[7]: 127 | 128 | 129 | #词向量 130 | def w2v_pad(df_train,df_test,col, maxlen_,victor_size, num_words): 131 | 132 | tokenizer = text.Tokenizer(num_words=num_words, lower=False,filters="") 133 | tokenizer.fit_on_texts(list(df_train[col].values)+list(df_test[col].values)) 134 | 135 | train_ = sequence.pad_sequences(tokenizer.texts_to_sequences(df_train[col].values), maxlen=maxlen_) 136 | test_ = sequence.pad_sequences(tokenizer.texts_to_sequences(df_test[col].values), maxlen=maxlen_) 137 | 138 | word_index = tokenizer.word_index 139 | 140 | count = 0 141 | nb_words = len(word_index) 142 | print(nb_words) 143 | all_data=pd.concat([df_train[col],df_test[col]]) 144 | file_name = 'embedding/' + 'Word2Vec_start_' + col +"_"+ str(victor_size) + '.model' 145 | if not os.path.exists(file_name): 146 | model = Word2Vec([[word for word in document.split(' ')] for document in all_data.values], 147 | size=victor_size, window=5, iter=10, workers=11, seed=2018, min_count=2) 148 | model.save(file_name) 149 | else: 150 | model = Word2Vec.load(file_name) 151 | print("add word2vec finished....") 152 | 153 | 154 | 155 | embedding_word2vec_matrix = np.zeros((nb_words + 1, victor_size)) 156 | for word, i in word_index.items(): 157 | embedding_vector = model[word] if word in model else None 158 | if embedding_vector is not None: 159 | count += 1 160 | embedding_word2vec_matrix[i] = embedding_vector 161 | else: 162 | unk_vec = np.random.random(victor_size) * 0.5 163 | unk_vec = unk_vec - unk_vec.mean() 164 | embedding_word2vec_matrix[i] = unk_vec 165 | 166 | embedding_w2c_all = np.zeros((nb_words + 1, victor_size)) 167 | for word, i in word_index.items(): 168 | embedding_vector = dic_w2c_all[word] 169 | embedding_w2c_all[i] = embedding_vector 170 | 171 | 172 | #embedding_matrix = np.concatenate((embedding_word2vec_matrix,embedding_w2c_all),axis=1) 173 | embedding_matrix = embedding_word2vec_matrix 174 | 175 | return train_, test_, word_index, embedding_matrix 176 | 177 | 178 | # In[8]: 179 | 180 | 181 | train_, test_,word2idx, word_embedding = w2v_pad(train,test,column_name, word_seq_len,victor_size, num_words) 182 | 183 | 184 | # In[10]: 185 | 186 | 187 | from TextModel import * 188 | 189 | 190 | # In[18]: 191 | 192 | 193 | my_opt="get_text_capsule" 194 | #参数 195 | Y = 
train['class'].values 196 | 197 | if not os.path.exists("cache/"+my_opt): 198 | os.mkdir("cache/"+my_opt) 199 | 200 | 201 | 202 | # In[19]: 203 | 204 | 205 | from sklearn.model_selection import KFold, StratifiedKFold 206 | gc.collect() 207 | seed = 2006 208 | num_folds = 5 209 | kf = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=seed).split(train_, Y) 210 | 211 | 212 | # In[20]: 213 | 214 | 215 | epochs = 10 216 | my_opt=eval(my_opt) 217 | train_model_pred = np.zeros((train_.shape[0], classification)) 218 | test_model_pred = np.zeros((test_.shape[0], classification)) 219 | for i, (train_fold, val_fold) in enumerate(kf): 220 | X_train, X_valid, = train_[train_fold, :], train_[val_fold, :] 221 | y_train, y_valid = Y[train_fold], Y[val_fold] 222 | 223 | y_tra = to_categorical(y_train) 224 | y_val = to_categorical(y_valid) 225 | 226 | #模型 227 | name = str(my_opt.__name__) 228 | 229 | model = my_opt(word_seq_len, word_embedding, classification) 230 | 231 | 232 | RocAuc = RocAucEvaluation(validation_data=(X_valid, y_val), interval=1) 233 | 234 | hist = model.fit(X_train, y_tra, batch_size=batch_size, epochs=epochs, validation_data=(X_valid, y_val), 235 | callbacks=[RocAuc]) 236 | 237 | 238 | train_model_pred[val_fold, :] = model.predict(X_valid) 239 | 240 | 241 | # In[24]: 242 | 243 | 244 | #模型 245 | #用全部的数据预测 246 | train_label = to_categorical(Y) 247 | name = str(my_opt.__name__) 248 | 249 | model = my_opt(word_seq_len, word_embedding, classification) 250 | 251 | 252 | RocAuc = RocAucEvaluation(validation_data=(train_, train_label), interval=1) 253 | 254 | hist = model.fit(train_, train_label, batch_size=batch_size, epochs=epochs, validation_data=(train_, train_label), 255 | callbacks=[RocAuc]) 256 | 257 | 258 | test_model_pred = model.predict(test_) 259 | 260 | 261 | # In[25]: 262 | 263 | 264 | df_train_pred = pd.DataFrame(train_model_pred) 265 | df_test_pred = pd.DataFrame(test_model_pred) 266 | df_train_pred.columns = ['device_start_capsule_pred_' + str(i) for i in range(22)] 267 | df_test_pred.columns = ['device_start_capsule_pred_' + str(i) for i in range(22)] 268 | 269 | 270 | # In[26]: 271 | 272 | 273 | df_train_pred = pd.concat([train[['device_id']], df_train_pred], axis=1) 274 | df_test_pred = pd.concat([test[['device_id']], df_test_pred], axis=1) 275 | 276 | 277 | # In[27]: 278 | 279 | 280 | df_results = pd.concat([df_train_pred, df_test_pred]) 281 | df_results.to_csv('device_start_capsule_pred.csv', index=None) 282 | 283 | -------------------------------------------------------------------------------- /THLUO/17.device_start_textcnn_pred.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | # coding: utf-8 8 | import feather 9 | import os 10 | import re 11 | import sys 12 | import gc 13 | import random 14 | import pandas as pd 15 | import numpy as np 16 | import gensim 17 | from gensim.models import Word2Vec 18 | from gensim.models.word2vec import LineSentence 19 | from scipy import stats 20 | import tensorflow as tf 21 | import keras 22 | from keras.layers import * 23 | from keras.models import * 24 | from keras.optimizers import * 25 | from keras.callbacks import * 26 | from keras.preprocessing import text, sequence 27 | from keras.utils import to_categorical 28 | from keras.engine.topology import Layer 29 | from sklearn.preprocessing import LabelEncoder 30 | from keras.utils import np_utils 31 | from keras.utils.training_utils import multi_gpu_model 32 | from 
sklearn.model_selection import train_test_split 33 | from sklearn.metrics import f1_score 34 | from sklearn.model_selection import KFold 35 | from sklearn.metrics import accuracy_score 36 | from sklearn.preprocessing import LabelEncoder 37 | from sklearn.metrics import f1_score 38 | import warnings 39 | warnings.filterwarnings('ignore') 40 | config = tf.ConfigProto() 41 | config.gpu_options.allow_growth = True 42 | session = tf.Session(config=config) 43 | 44 | 45 | # In[2]: 46 | print ('17.device_start_textcnn_pred.py') 47 | 48 | df_doc = pd.read_csv('01.device_click_app_sorted_by_start.csv') 49 | deviceid_test=pd.read_csv('input/deviceid_test.tsv',sep='\t',names=['device_id']) 50 | deviceid_train=pd.read_csv('input/deviceid_train.tsv',sep='\t',names=['device_id','sex','age']) 51 | df_total = pd.concat([deviceid_train, deviceid_test]) 52 | df_doc = df_doc.merge(df_total, on='device_id', how='left') 53 | 54 | 55 | df_wv2_all = pd.read_csv('w2c_all_emb.csv') 56 | 57 | dic_w2c_all = {} 58 | for row in df_wv2_all.values : 59 | app_id = row[0] 60 | vector = row[1:] 61 | dic_w2c_all[app_id] = vector 62 | 63 | 64 | # In[3]: 65 | 66 | 67 | df_doc['sex'] = df_doc['sex'].apply(lambda x:str(x)) 68 | df_doc['age'] = df_doc['age'].apply(lambda x:str(x)) 69 | def tool(x): 70 | if x=='nan': 71 | return x 72 | else: 73 | return str(int(float(x))) 74 | df_doc['sex']=df_doc['sex'].apply(tool) 75 | df_doc['age']=df_doc['age'].apply(tool) 76 | df_doc['sex_age']=df_doc['sex']+'-'+df_doc['age'] 77 | df_doc = df_doc.replace({'nan':np.NaN,'nan-nan':np.NaN}) 78 | train = df_doc[df_doc['sex_age'].notnull()] 79 | test = df_doc[df_doc['sex_age'].isnull()] 80 | train.reset_index(drop=True, inplace=True) 81 | test.reset_index(drop=True, inplace=True) 82 | 83 | lb = LabelEncoder() 84 | train_label = lb.fit_transform(train['sex_age'].values) 85 | train['class'] = train_label 86 | 87 | 88 | # In[5]: 89 | 90 | 91 | column_name="app_list" 92 | word_seq_len = 900 93 | victor_size = 200 94 | num_words = 35000 95 | batch_size = 64 96 | classification = 22 97 | kfold=10 98 | 99 | 100 | # In[6]: 101 | 102 | 103 | from sklearn.metrics import log_loss 104 | 105 | def get_mut_label(y_label) : 106 | results = [] 107 | for ele in y_label : 108 | results.append(ele.argmax()) 109 | return results 110 | 111 | class RocAucEvaluation(Callback): 112 | def __init__(self, validation_data=(), interval=1): 113 | super(Callback, self).__init__() 114 | 115 | self.interval = interval 116 | self.X_val, self.y_val = validation_data 117 | 118 | def on_epoch_end(self, epoch, logs={}): 119 | if epoch % self.interval == 0: 120 | y_pred = self.model.predict(self.X_val, verbose=0) 121 | val_y = get_mut_label(self.y_val) 122 | score = log_loss(val_y, y_pred) 123 | print("\n mlogloss - epoch: %d - score: %.6f \n" % (epoch+1, score)) 124 | 125 | 126 | # In[7]: 127 | 128 | 129 | #词向量 130 | def w2v_pad(df_train,df_test,col, maxlen_,victor_size, num_words): 131 | 132 | tokenizer = text.Tokenizer(num_words=num_words, lower=False,filters="") 133 | tokenizer.fit_on_texts(list(df_train[col].values)+list(df_test[col].values)) 134 | 135 | train_ = sequence.pad_sequences(tokenizer.texts_to_sequences(df_train[col].values), maxlen=maxlen_) 136 | test_ = sequence.pad_sequences(tokenizer.texts_to_sequences(df_test[col].values), maxlen=maxlen_) 137 | 138 | word_index = tokenizer.word_index 139 | 140 | count = 0 141 | nb_words = len(word_index) 142 | print(nb_words) 143 | all_data=pd.concat([df_train[col],df_test[col]]) 144 | file_name = 'embedding/' + 'Word2Vec_start_' + 
col +"_"+ str(victor_size) + '.model' 145 | if not os.path.exists(file_name): 146 | model = Word2Vec([[word for word in document.split(' ')] for document in all_data.values], 147 | size=victor_size, window=5, iter=10, workers=11, seed=2018, min_count=2) 148 | model.save(file_name) 149 | else: 150 | model = Word2Vec.load(file_name) 151 | print("add word2vec finished....") 152 | 153 | 154 | 155 | embedding_word2vec_matrix = np.zeros((nb_words + 1, victor_size)) 156 | for word, i in word_index.items(): 157 | embedding_vector = model[word] if word in model else None 158 | if embedding_vector is not None: 159 | count += 1 160 | embedding_word2vec_matrix[i] = embedding_vector 161 | else: 162 | unk_vec = np.random.random(victor_size) * 0.5 163 | unk_vec = unk_vec - unk_vec.mean() 164 | embedding_word2vec_matrix[i] = unk_vec 165 | 166 | embedding_w2c_all = np.zeros((nb_words + 1, victor_size)) 167 | for word, i in word_index.items(): 168 | embedding_vector = dic_w2c_all[word] 169 | embedding_w2c_all[i] = embedding_vector 170 | 171 | 172 | #embedding_matrix = np.concatenate((embedding_word2vec_matrix,embedding_w2c_all),axis=1) 173 | embedding_matrix = embedding_word2vec_matrix 174 | 175 | return train_, test_, word_index, embedding_matrix 176 | 177 | 178 | # In[8]: 179 | 180 | 181 | train_, test_,word2idx, word_embedding = w2v_pad(train,test,column_name, word_seq_len,victor_size, num_words) 182 | 183 | 184 | # In[10]: 185 | 186 | 187 | from TextModel import * 188 | 189 | 190 | # In[19]: 191 | 192 | 193 | my_opt="get_text_cnn2" 194 | #参数 195 | Y = train['class'].values 196 | 197 | if not os.path.exists("cache/"+my_opt): 198 | os.mkdir("cache/"+my_opt) 199 | 200 | 201 | 202 | # In[20]: 203 | 204 | 205 | from sklearn.model_selection import KFold, StratifiedKFold 206 | gc.collect() 207 | seed = 2006 208 | num_folds = 5 209 | kf = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=seed).split(train_, Y) 210 | 211 | 212 | # In[21]: 213 | 214 | 215 | epochs = 6 216 | my_opt=eval(my_opt) 217 | train_model_pred = np.zeros((train_.shape[0], classification)) 218 | test_model_pred = np.zeros((test_.shape[0], classification)) 219 | for i, (train_fold, val_fold) in enumerate(kf): 220 | X_train, X_valid, = train_[train_fold, :], train_[val_fold, :] 221 | y_train, y_valid = Y[train_fold], Y[val_fold] 222 | 223 | y_tra = to_categorical(y_train) 224 | y_val = to_categorical(y_valid) 225 | 226 | #模型 227 | name = str(my_opt.__name__) 228 | 229 | model = my_opt(word_seq_len, word_embedding, classification) 230 | 231 | 232 | RocAuc = RocAucEvaluation(validation_data=(X_valid, y_val), interval=1) 233 | 234 | hist = model.fit(X_train, y_tra, batch_size=batch_size, epochs=epochs, validation_data=(X_valid, y_val), 235 | callbacks=[RocAuc]) 236 | 237 | 238 | train_model_pred[val_fold, :] = model.predict(X_valid) 239 | 240 | 241 | # In[25]: 242 | 243 | 244 | #模型 245 | #用全部的数据预测 246 | train_label = to_categorical(Y) 247 | name = str(my_opt.__name__) 248 | 249 | model = my_opt(word_seq_len, word_embedding, classification) 250 | 251 | 252 | RocAuc = RocAucEvaluation(validation_data=(train_, train_label), interval=1) 253 | 254 | hist = model.fit(train_, train_label, batch_size=batch_size, epochs=epochs, validation_data=(train_, train_label), 255 | callbacks=[RocAuc]) 256 | 257 | 258 | test_model_pred = model.predict(test_) 259 | 260 | 261 | # In[26]: 262 | 263 | 264 | df_train_pred = pd.DataFrame(train_model_pred) 265 | df_test_pred = pd.DataFrame(test_model_pred) 266 | df_train_pred.columns = 
['device_start_textcnn_pred_' + str(i) for i in range(22)] 267 | df_test_pred.columns = ['device_start_textcnn_pred_' + str(i) for i in range(22)] 268 | 269 | 270 | # In[27]: 271 | 272 | 273 | df_train_pred = pd.concat([train[['device_id']], df_train_pred], axis=1) 274 | df_test_pred = pd.concat([test[['device_id']], df_test_pred], axis=1) 275 | 276 | 277 | # In[28]: 278 | 279 | 280 | df_results = pd.concat([df_train_pred, df_test_pred]) 281 | df_results.to_csv('device_start_textcnn_pred.csv', index=None) 282 | 283 | -------------------------------------------------------------------------------- /THLUO/19.device_start_lstm_pred.py: -------------------------------------------------------------------------------- 1 | import feather 2 | import os 3 | import re 4 | import sys 5 | import gc 6 | import random 7 | import pandas as pd 8 | import numpy as np 9 | import gensim 10 | from gensim.models import Word2Vec 11 | from gensim.models.word2vec import LineSentence 12 | from scipy import stats 13 | import tensorflow as tf 14 | import keras 15 | from keras.layers import * 16 | from keras.models import * 17 | from keras.optimizers import * 18 | from keras.callbacks import * 19 | from keras.preprocessing import text, sequence 20 | from keras.utils import to_categorical 21 | from keras.engine.topology import Layer 22 | from sklearn.preprocessing import LabelEncoder 23 | from keras.utils import np_utils 24 | from keras.utils.training_utils import multi_gpu_model 25 | from sklearn.model_selection import train_test_split 26 | from sklearn.metrics import f1_score 27 | from sklearn.model_selection import KFold 28 | from sklearn.metrics import accuracy_score 29 | from sklearn.preprocessing import LabelEncoder 30 | from sklearn.metrics import f1_score 31 | import warnings 32 | warnings.filterwarnings('ignore') 33 | config = tf.ConfigProto() 34 | config.gpu_options.allow_growth = True 35 | session = tf.Session(config=config) 36 | 37 | print ('19.lstm...........py') 38 | # In[2]: 39 | 40 | 41 | df_doc = pd.read_csv('01.device_click_app_sorted_by_start.csv') 42 | deviceid_test=pd.read_csv('input/deviceid_test.tsv',sep='\t',names=['device_id']) 43 | deviceid_train=pd.read_csv('input/deviceid_train.tsv',sep='\t',names=['device_id','sex','age']) 44 | df_total = pd.concat([deviceid_train, deviceid_test]) 45 | df_doc = df_doc.merge(df_total, on='device_id', how='left') 46 | 47 | df_wv2_all = pd.read_csv('w2c_all_emb.csv') 48 | 49 | dic_w2c_all = {} 50 | for row in df_wv2_all.values : 51 | app_id = row[0] 52 | vector = row[1:] 53 | dic_w2c_all[app_id] = vector 54 | 55 | 56 | # In[3]: 57 | 58 | 59 | df_doc['sex'] = df_doc['sex'].apply(lambda x:str(x)) 60 | df_doc['age'] = df_doc['age'].apply(lambda x:str(x)) 61 | def tool(x): 62 | if x=='nan': 63 | return x 64 | else: 65 | return str(int(float(x))) 66 | df_doc['sex']=df_doc['sex'].apply(tool) 67 | df_doc['age']=df_doc['age'].apply(tool) 68 | df_doc['sex_age']=df_doc['sex']+'-'+df_doc['age'] 69 | df_doc = df_doc.replace({'nan':np.NaN,'nan-nan':np.NaN}) 70 | train = df_doc[df_doc['sex_age'].notnull()] 71 | test = df_doc[df_doc['sex_age'].isnull()] 72 | train.reset_index(drop=True, inplace=True) 73 | test.reset_index(drop=True, inplace=True) 74 | 75 | lb = LabelEncoder() 76 | train_label = lb.fit_transform(train['sex_age'].values) 77 | train['class'] = train_label 78 | 79 | 80 | # In[5]: 81 | 82 | 83 | column_name="app_list" 84 | word_seq_len = 900 85 | victor_size = 200 86 | num_words = 35000 87 | batch_size = 64 88 | classification = 22 89 | kfold=10 90 | 91 | 92 | # 
In[6]: 93 | 94 | 95 | from sklearn.metrics import log_loss 96 | 97 | def get_mut_label(y_label) : 98 | results = [] 99 | for ele in y_label : 100 | results.append(ele.argmax()) 101 | return results 102 | 103 | class RocAucEvaluation(Callback): 104 | def __init__(self, validation_data=(), interval=1): 105 | super(Callback, self).__init__() 106 | 107 | self.interval = interval 108 | self.X_val, self.y_val = validation_data 109 | 110 | def on_epoch_end(self, epoch, logs={}): 111 | if epoch % self.interval == 0: 112 | y_pred = self.model.predict(self.X_val, verbose=0) 113 | val_y = get_mut_label(self.y_val) 114 | score = log_loss(val_y, y_pred) 115 | print("\n mlogloss - epoch: %d - score: %.6f \n" % (epoch+1, score)) 116 | 117 | 118 | # In[7]: 119 | 120 | 121 | #词向量 122 | def w2v_pad(df_train,df_test,col, maxlen_,victor_size, num_words): 123 | 124 | tokenizer = text.Tokenizer(num_words=num_words, lower=False,filters="") 125 | tokenizer.fit_on_texts(list(df_train[col].values)+list(df_test[col].values)) 126 | 127 | train_ = sequence.pad_sequences(tokenizer.texts_to_sequences(df_train[col].values), maxlen=maxlen_) 128 | test_ = sequence.pad_sequences(tokenizer.texts_to_sequences(df_test[col].values), maxlen=maxlen_) 129 | 130 | word_index = tokenizer.word_index 131 | 132 | count = 0 133 | nb_words = len(word_index) 134 | print(nb_words) 135 | all_data=pd.concat([df_train[col],df_test[col]]) 136 | file_name = 'embedding/' + 'Word2Vec_start_' + col +"_"+ str(victor_size) + '.model' 137 | if not os.path.exists(file_name): 138 | model = Word2Vec([[word for word in document.split(' ')] for document in all_data.values], 139 | size=victor_size, window=5, iter=10, workers=11, seed=2018, min_count=2) 140 | model.save(file_name) 141 | else: 142 | model = Word2Vec.load(file_name) 143 | print("add word2vec finished....") 144 | 145 | 146 | 147 | embedding_word2vec_matrix = np.zeros((nb_words + 1, victor_size)) 148 | for word, i in word_index.items(): 149 | embedding_vector = model[word] if word in model else None 150 | if embedding_vector is not None: 151 | count += 1 152 | embedding_word2vec_matrix[i] = embedding_vector 153 | else: 154 | unk_vec = np.random.random(victor_size) * 0.5 155 | unk_vec = unk_vec - unk_vec.mean() 156 | embedding_word2vec_matrix[i] = unk_vec 157 | 158 | embedding_w2c_all = np.zeros((nb_words + 1, victor_size)) 159 | for word, i in word_index.items(): 160 | embedding_vector = dic_w2c_all[word] 161 | embedding_w2c_all[i] = embedding_vector 162 | 163 | 164 | #embedding_matrix = np.concatenate((embedding_word2vec_matrix,embedding_w2c_all),axis=1) 165 | embedding_matrix = embedding_word2vec_matrix 166 | 167 | return train_, test_, word_index, embedding_matrix 168 | 169 | 170 | # In[8]: 171 | 172 | 173 | train_, test_,word2idx, word_embedding = w2v_pad(train,test,column_name, word_seq_len,victor_size, num_words) 174 | 175 | 176 | # In[10]: 177 | 178 | 179 | from TextModel import * 180 | 181 | 182 | # In[13]: 183 | 184 | 185 | my_opt="get_text_lstm1" 186 | #参数 187 | Y = train['class'].values 188 | 189 | if not os.path.exists("cache/"+my_opt): 190 | os.mkdir("cache/"+my_opt) 191 | 192 | 193 | 194 | # In[14]: 195 | 196 | 197 | from sklearn.model_selection import KFold, StratifiedKFold 198 | gc.collect() 199 | seed = 2006 200 | num_folds = 5 201 | kf = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=seed).split(train_, Y) 202 | 203 | 204 | # In[15]: 205 | 206 | 207 | from keras import backend as K 208 | 209 | epochs = 6 210 | my_opt=eval(my_opt) 211 | train_model_pred = 
np.zeros((train_.shape[0], classification)) 212 | test_model_pred = np.zeros((test_.shape[0], classification)) 213 | for i, (train_fold, val_fold) in enumerate(kf): 214 | X_train, X_valid, = train_[train_fold, :], train_[val_fold, :] 215 | y_train, y_valid = Y[train_fold], Y[val_fold] 216 | 217 | y_tra = to_categorical(y_train) 218 | y_val = to_categorical(y_valid) 219 | 220 | #模型 221 | name = str(my_opt.__name__) 222 | 223 | model = my_opt(word_seq_len, word_embedding, classification) 224 | 225 | 226 | RocAuc = RocAucEvaluation(validation_data=(X_valid, y_val), interval=1) 227 | 228 | hist = model.fit(X_train, y_tra, batch_size=batch_size, epochs=epochs, validation_data=(X_valid, y_val), 229 | callbacks=[RocAuc]) 230 | 231 | 232 | train_model_pred[val_fold, :] = model.predict(X_valid) 233 | 234 | 235 | del model 236 | del hist 237 | gc.collect() 238 | K.clear_session() 239 | tf.reset_default_graph() 240 | 241 | 242 | 243 | # In[19]: 244 | 245 | 246 | #模型 247 | #用全部的数据预测 248 | train_label = to_categorical(Y) 249 | name = str(my_opt.__name__) 250 | 251 | model = my_opt(word_seq_len, word_embedding, classification) 252 | 253 | 254 | RocAuc = RocAucEvaluation(validation_data=(train_, train_label), interval=1) 255 | 256 | hist = model.fit(train_, train_label, batch_size=batch_size, epochs=epochs, validation_data=(train_, train_label), 257 | callbacks=[RocAuc]) 258 | 259 | 260 | test_model_pred = model.predict(test_) 261 | 262 | 263 | # In[20]: 264 | 265 | 266 | df_train_pred = pd.DataFrame(train_model_pred) 267 | df_test_pred = pd.DataFrame(test_model_pred) 268 | df_train_pred.columns = ['device_start_lstm_pred_' + str(i) for i in range(22)] 269 | df_test_pred.columns = ['device_start_lstm_pred_' + str(i) for i in range(22)] 270 | 271 | 272 | # In[21]: 273 | 274 | 275 | df_train_pred = pd.concat([train[['device_id']], df_train_pred], axis=1) 276 | df_test_pred = pd.concat([test[['device_id']], df_test_pred], axis=1) 277 | 278 | 279 | # In[22]: 280 | 281 | 282 | df_results = pd.concat([df_train_pred, df_test_pred]) 283 | df_results.to_csv('device_start_lstm_pred.csv', index=None) 284 | 285 | -------------------------------------------------------------------------------- /THLUO/18.device_start_text_dpcnn_pred.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | # coding: utf-8 8 | import feather 9 | import os 10 | import re 11 | import sys 12 | import gc 13 | import random 14 | import pandas as pd 15 | import numpy as np 16 | import gensim 17 | from gensim.models import Word2Vec 18 | from gensim.models.word2vec import LineSentence 19 | from scipy import stats 20 | import tensorflow as tf 21 | import keras 22 | from keras.layers import * 23 | from keras.models import * 24 | from keras.optimizers import * 25 | from keras.callbacks import * 26 | from keras.preprocessing import text, sequence 27 | from keras.utils import to_categorical 28 | from keras.engine.topology import Layer 29 | from sklearn.preprocessing import LabelEncoder 30 | from keras.utils import np_utils 31 | from keras.utils.training_utils import multi_gpu_model 32 | from sklearn.model_selection import train_test_split 33 | from sklearn.metrics import f1_score 34 | from sklearn.model_selection import KFold 35 | from sklearn.metrics import accuracy_score 36 | from sklearn.preprocessing import LabelEncoder 37 | from sklearn.metrics import f1_score 38 | import warnings 39 | warnings.filterwarnings('ignore') 40 | config = tf.ConfigProto() 41 | 
config.gpu_options.allow_growth = True 42 | session = tf.Session(config=config) 43 | 44 | 45 | # In[2]: 46 | print ('18.device_start_text_dpcnn_pred.py') 47 | 48 | df_doc = pd.read_csv('01.device_click_app_sorted_by_start.csv') 49 | deviceid_test=pd.read_csv('input/deviceid_test.tsv',sep='\t',names=['device_id']) 50 | deviceid_train=pd.read_csv('input/deviceid_train.tsv',sep='\t',names=['device_id','sex','age']) 51 | df_total = pd.concat([deviceid_train, deviceid_test]) 52 | df_doc = df_doc.merge(df_total, on='device_id', how='left') 53 | 54 | 55 | df_wv2_all = pd.read_csv('w2c_all_emb.csv') 56 | 57 | dic_w2c_all = {} 58 | for row in df_wv2_all.values : 59 | app_id = row[0] 60 | vector = row[1:] 61 | dic_w2c_all[app_id] = vector 62 | 63 | 64 | # In[3]: 65 | 66 | 67 | df_doc['sex'] = df_doc['sex'].apply(lambda x:str(x)) 68 | df_doc['age'] = df_doc['age'].apply(lambda x:str(x)) 69 | def tool(x): 70 | if x=='nan': 71 | return x 72 | else: 73 | return str(int(float(x))) 74 | df_doc['sex']=df_doc['sex'].apply(tool) 75 | df_doc['age']=df_doc['age'].apply(tool) 76 | df_doc['sex_age']=df_doc['sex']+'-'+df_doc['age'] 77 | df_doc = df_doc.replace({'nan':np.NaN,'nan-nan':np.NaN}) 78 | train = df_doc[df_doc['sex_age'].notnull()] 79 | test = df_doc[df_doc['sex_age'].isnull()] 80 | train.reset_index(drop=True, inplace=True) 81 | test.reset_index(drop=True, inplace=True) 82 | 83 | lb = LabelEncoder() 84 | train_label = lb.fit_transform(train['sex_age'].values) 85 | train['class'] = train_label 86 | 87 | 88 | # In[5]: 89 | 90 | 91 | column_name="app_list" 92 | word_seq_len = 900 93 | victor_size = 200 94 | num_words = 35000 95 | batch_size = 64 96 | classification = 22 97 | kfold=10 98 | 99 | 100 | # In[6]: 101 | 102 | 103 | from sklearn.metrics import log_loss 104 | 105 | def get_mut_label(y_label) : 106 | results = [] 107 | for ele in y_label : 108 | results.append(ele.argmax()) 109 | return results 110 | 111 | class RocAucEvaluation(Callback): 112 | def __init__(self, validation_data=(), interval=1): 113 | super(Callback, self).__init__() 114 | 115 | self.interval = interval 116 | self.X_val, self.y_val = validation_data 117 | 118 | def on_epoch_end(self, epoch, logs={}): 119 | if epoch % self.interval == 0: 120 | y_pred = self.model.predict(self.X_val, verbose=0) 121 | val_y = get_mut_label(self.y_val) 122 | score = log_loss(val_y, y_pred) 123 | print("\n mlogloss - epoch: %d - score: %.6f \n" % (epoch+1, score)) 124 | 125 | 126 | # In[7]: 127 | 128 | 129 | #词向量 130 | def w2v_pad(df_train,df_test,col, maxlen_,victor_size, num_words): 131 | 132 | tokenizer = text.Tokenizer(num_words=num_words, lower=False,filters="") 133 | tokenizer.fit_on_texts(list(df_train[col].values)+list(df_test[col].values)) 134 | 135 | train_ = sequence.pad_sequences(tokenizer.texts_to_sequences(df_train[col].values), maxlen=maxlen_) 136 | test_ = sequence.pad_sequences(tokenizer.texts_to_sequences(df_test[col].values), maxlen=maxlen_) 137 | 138 | word_index = tokenizer.word_index 139 | 140 | count = 0 141 | nb_words = len(word_index) 142 | print(nb_words) 143 | all_data=pd.concat([df_train[col],df_test[col]]) 144 | file_name = 'embedding/' + 'Word2Vec_start_' + col +"_"+ str(victor_size) + '.model' 145 | if not os.path.exists(file_name): 146 | model = Word2Vec([[word for word in document.split(' ')] for document in all_data.values], 147 | size=victor_size, window=5, iter=10, workers=11, seed=2018, min_count=2) 148 | model.save(file_name) 149 | else: 150 | model = Word2Vec.load(file_name) 151 | print("add word2vec 
finished....") 152 | 153 | 154 | 155 | embedding_word2vec_matrix = np.zeros((nb_words + 1, victor_size)) 156 | for word, i in word_index.items(): 157 | embedding_vector = model[word] if word in model else None 158 | if embedding_vector is not None: 159 | count += 1 160 | embedding_word2vec_matrix[i] = embedding_vector 161 | else: 162 | unk_vec = np.random.random(victor_size) * 0.5 163 | unk_vec = unk_vec - unk_vec.mean() 164 | embedding_word2vec_matrix[i] = unk_vec 165 | 166 | embedding_w2c_all = np.zeros((nb_words + 1, victor_size)) 167 | for word, i in word_index.items(): 168 | embedding_vector = dic_w2c_all[word] 169 | embedding_w2c_all[i] = embedding_vector 170 | 171 | 172 | #embedding_matrix = np.concatenate((embedding_word2vec_matrix,embedding_w2c_all),axis=1) 173 | embedding_matrix = embedding_word2vec_matrix 174 | 175 | return train_, test_, word_index, embedding_matrix 176 | 177 | 178 | # In[8]: 179 | 180 | 181 | train_, test_,word2idx, word_embedding = w2v_pad(train,test,column_name, word_seq_len,victor_size, num_words) 182 | 183 | 184 | # In[10]: 185 | 186 | 187 | from TextModel import * 188 | 189 | 190 | # In[12]: 191 | 192 | 193 | my_opt="get_text_dpcnn" 194 | #参数 195 | Y = train['class'].values 196 | 197 | if not os.path.exists("cache/"+my_opt): 198 | os.mkdir("cache/"+my_opt) 199 | 200 | 201 | 202 | # In[13]: 203 | 204 | 205 | from sklearn.model_selection import KFold, StratifiedKFold 206 | gc.collect() 207 | seed = 2006 208 | num_folds = 5 209 | kf = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=seed).split(train_, Y) 210 | 211 | 212 | # In[14]: 213 | 214 | 215 | from keras import backend as K 216 | 217 | epochs = 6 218 | my_opt=eval(my_opt) 219 | train_model_pred = np.zeros((train_.shape[0], classification)) 220 | test_model_pred = np.zeros((test_.shape[0], classification)) 221 | for i, (train_fold, val_fold) in enumerate(kf): 222 | X_train, X_valid, = train_[train_fold, :], train_[val_fold, :] 223 | y_train, y_valid = Y[train_fold], Y[val_fold] 224 | 225 | y_tra = to_categorical(y_train) 226 | y_val = to_categorical(y_valid) 227 | 228 | #模型 229 | name = str(my_opt.__name__) 230 | 231 | model = my_opt(word_seq_len, word_embedding, classification) 232 | 233 | 234 | RocAuc = RocAucEvaluation(validation_data=(X_valid, y_val), interval=1) 235 | 236 | hist = model.fit(X_train, y_tra, batch_size=batch_size, epochs=epochs, validation_data=(X_valid, y_val), 237 | callbacks=[RocAuc]) 238 | 239 | 240 | train_model_pred[val_fold, :] = model.predict(X_valid) 241 | 242 | 243 | del model 244 | del hist 245 | gc.collect() 246 | K.clear_session() 247 | tf.reset_default_graph() 248 | 249 | 250 | 251 | # In[15]: 252 | 253 | 254 | #模型 255 | #用全部的数据预测 256 | train_label = to_categorical(Y) 257 | name = str(my_opt.__name__) 258 | 259 | model = my_opt(word_seq_len, word_embedding, classification) 260 | 261 | 262 | RocAuc = RocAucEvaluation(validation_data=(train_, train_label), interval=1) 263 | 264 | hist = model.fit(train_, train_label, batch_size=batch_size, epochs=epochs, validation_data=(train_, train_label), 265 | callbacks=[RocAuc]) 266 | 267 | 268 | test_model_pred = model.predict(test_) 269 | 270 | 271 | # In[16]: 272 | 273 | 274 | df_train_pred = pd.DataFrame(train_model_pred) 275 | df_test_pred = pd.DataFrame(test_model_pred) 276 | df_train_pred.columns = ['device_start_text_dpcnn_pred_' + str(i) for i in range(22)] 277 | df_test_pred.columns = ['device_start_text_dpcnn_pred_' + str(i) for i in range(22)] 278 | 279 | 280 | # In[17]: 281 | 282 | 283 | df_train_pred = 
pd.concat([train[['device_id']], df_train_pred], axis=1) 284 | df_test_pred = pd.concat([test[['device_id']], df_test_pred], axis=1) 285 | 286 | 287 | # In[18]: 288 | 289 | 290 | df_results = pd.concat([df_train_pred, df_test_pred]) 291 | df_results.to_csv('device_start_text_dpcnn_pred.csv', index=None) 292 | 293 | -------------------------------------------------------------------------------- /chizhu/stacking/nurbs_feat/xgb__nurbs_nb.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | import pandas as pd 8 | import seaborn as sns 9 | import numpy as np 10 | from tqdm import tqdm 11 | from sklearn.decomposition import LatentDirichletAllocation 12 | from sklearn.model_selection import train_test_split 13 | from sklearn.metrics import accuracy_score 14 | import lightgbm as lgb 15 | from datetime import datetime,timedelta 16 | import matplotlib.pyplot as plt 17 | import time 18 | from sklearn.feature_extraction.text import TfidfTransformer 19 | from sklearn.feature_extraction.text import CountVectorizer 20 | # get_ipython().run_line_magic('matplotlib', 'inline') 21 | 22 | #add 23 | import gc 24 | from sklearn import preprocessing 25 | from sklearn.feature_extraction.text import TfidfVectorizer 26 | 27 | from scipy.sparse import hstack, vstack 28 | from sklearn.model_selection import StratifiedKFold 29 | from sklearn.model_selection import cross_val_score 30 | # from skopt.space import Integer, Categorical, Real, Log10 31 | # from skopt.utils import use_named_args 32 | # from skopt import gp_minimize 33 | from gensim.models import Word2Vec, FastText 34 | import gensim 35 | import re 36 | import os 37 | path="./feature/"## path to the nurbs probability files 38 | o_path="/dev/shm/chizhu_data/data/"### path to the raw data files 39 | os.listdir(path) 40 | 41 | 42 | # In[2]: 43 | 44 | 45 | sex_feat=pd.read_csv(path+"feature_sex_all.csv") 46 | age_feat=pd.read_csv(path+"feature_age_all.csv") 47 | # all_feat=pd.read_csv(path+"feature_22_all.csv") 48 | train_id=pd.read_csv(o_path+"deviceid_train.tsv",sep="\t",names=['device_id','sex','age']) 49 | test_id=pd.read_csv(o_path+"deviceid_test.tsv",sep="\t",names=['device_id']) 50 | all_id=pd.concat([train_id[['device_id']],test_id[['device_id']]]) 51 | all_id.index=range(len(all_id)) 52 | sex_feat['device_id']=all_id 53 | age_feat['device_id']=all_id 54 | # deepnn_feat=pd.read_csv(path+"deepnn_fix.csv") 55 | # deepnn_feat['device_id']=deepnn_feat['DeviceID'] 56 | # del deepnn_feat['DeviceID'] 57 | 58 | 59 | # In[3]: 60 | 61 | 62 | train=pd.merge(train_id,sex_feat,on="device_id",how="left") 63 | # train=pd.merge(train,deepnn_feat,on="device_id",how="left") 64 | test=pd.merge(test_id,sex_feat,on="device_id",how="left") 65 | # test=pd.merge(test,deepnn_feat,on="device_id",how="left") 66 | 67 | 68 | # In[4]: 69 | 70 | 71 | features = [x for x in train.columns if x not in ['device_id', 'sex',"age",]] 72 | Y = train['sex'] - 1 73 | 74 | 75 | # In[5]: 76 | 77 | 78 | 79 | import xgboost as xgb 80 | from sklearn.metrics import auc, log_loss, roc_auc_score,f1_score,recall_score,precision_score 81 | from sklearn.cross_validation import StratifiedKFold 82 | 83 | kf = StratifiedKFold(Y, n_folds=10, shuffle=True, random_state=1024) 84 | params={ 85 | 'booster':'gbtree', 86 | "tree_method":"gpu_hist", 87 | "gpu_id":"2", 88 | 'objective': 'binary:logistic', 89 | # 'is_unbalance':'True', 90 | # 'scale_pos_weight': 1500.0/13458.0, 91 | 'eval_metric': "logloss", 92 | 93 | 'gamma':0.2,#0.2 is ok 94 | 'max_depth':6, 95 | # 'lambda':20, 
96 | # "alpha":5, 97 | 'subsample':0.7, 98 | 'colsample_bytree':0.4 , 99 | # 'min_child_weight':2.5, 100 | 'eta': 0.01, 101 | # 'learning_rate':0.01, 102 | "silent":1, 103 | 'seed':1024, 104 | 'nthread':12, 105 | 106 | } 107 | num_round = 3500 108 | early_stopping_rounds = 100 109 | 110 | 111 | # In[6]: 112 | 113 | 114 | aus = [] 115 | sub1 = np.zeros((len(test), )) 116 | pred_oob1=np.zeros((len(train),)) 117 | for i,(train_index,test_index) in enumerate(kf): 118 | 119 | tr_x = train[features].reindex(index=train_index, copy=False) 120 | tr_y = Y[train_index] 121 | te_x = train[features].reindex(index=test_index, copy=False) 122 | te_y = Y[test_index] 123 | 124 | # tr_y=tr_y.apply(lambda x:1 if x>0 else 0) 125 | # te_y=te_y.apply(lambda x:1 if x>0 else 0) 126 | d_tr = xgb.DMatrix(tr_x, label=tr_y) 127 | d_te = xgb.DMatrix(te_x, label=te_y) 128 | watchlist = [(d_tr,'train'), 129 | (d_te,'val') 130 | ] 131 | model = xgb.train(params, d_tr, num_boost_round=5500, 132 | evals=watchlist,verbose_eval=200, 133 | early_stopping_rounds=100) 134 | pred = model.predict(d_te,ntree_limit=model.best_iteration) 135 | pred_oob1[test_index] =pred 136 | # te_y=te_y.apply(lambda x:1 if x>0 else 0) 137 | a = log_loss(te_y, pred) 138 | 139 | sub1 += model.predict(xgb.DMatrix(test[features]),ntree_limit=model.best_iteration)/10 140 | 141 | 142 | print ("idx: ", i) 143 | print (" loss: %.5f" % a) 144 | # print " gini: %.5f" % g 145 | aus.append(a) 146 | 147 | print ("mean") 148 | print ("auc: %s" % (sum(aus) / 10.0)) 149 | 150 | 151 | # In[7]: 152 | 153 | 154 | pred_oob1 = pd.DataFrame(pred_oob1, columns=['sex2']) 155 | sub1 = pd.DataFrame(sub1, columns=['sex2']) 156 | res1=pd.concat([pred_oob1,sub1]) 157 | res1['sex1'] = 1-res1['sex2'] 158 | 159 | 160 | # In[8]: 161 | 162 | 163 | import gc 164 | gc.collect() 165 | 166 | 167 | # In[9]: 168 | 169 | 170 | train=pd.merge(train_id,age_feat,on="device_id",how="left") 171 | # train=pd.merge(train,deepnn_feat,on="device_id",how="left") 172 | test=pd.merge(test_id,age_feat,on="device_id",how="left") 173 | # test=pd.merge(test,deepnn_feat,on="device_id",how="left") 174 | 175 | 176 | # In[10]: 177 | 178 | 179 | ####sex1 180 | test['sex']=1 181 | 182 | 183 | # In[11]: 184 | 185 | 186 | features = [x for x in train.columns if x not in ['device_id',"age"]] 187 | Y = train['age'] 188 | 189 | 190 | # In[12]: 191 | 192 | 193 | import lightgbm as lgb 194 | import xgboost as xgb 195 | from sklearn.metrics import auc, log_loss, roc_auc_score,f1_score,recall_score,precision_score 196 | from sklearn.cross_validation import StratifiedKFold 197 | 198 | kf = StratifiedKFold(Y, n_folds=10, shuffle=True, random_state=1024) 199 | params={ 200 | 'booster':'gbtree', 201 | "tree_method":"gpu_hist", 202 | "gpu_id":"2", 203 | 'objective': 'multi:softprob', 204 | # 'is_unbalance':'True', 205 | # 'scale_pos_weight': 1500.0/13458.0, 206 | 'eval_metric': "mlogloss", 207 | 'num_class':11, 208 | 'gamma':0.1,#0.2 is ok 209 | 'max_depth':6, 210 | # 'lambda':20, 211 | # "alpha":5, 212 | 'subsample':0.7, 213 | 'colsample_bytree':0.4 , 214 | # 'min_child_weight':2.5, 215 | 'eta': 0.01, 216 | # 'learning_rate':0.01, 217 | "silent":1, 218 | 'seed':1024, 219 | 'nthread':12, 220 | 221 | } 222 | num_round = 3500 223 | early_stopping_rounds = 100 224 | 225 | 226 | # In[13]: 227 | 228 | 229 | aus = [] 230 | sub2 = np.zeros((len(test),11 )) 231 | pred_oob2=np.zeros((len(train),11)) 232 | models=[] 233 | iters=[] 234 | for i,(train_index,test_index) in enumerate(kf): 235 | 236 | tr_x = 
train[features].reindex(index=train_index, copy=False) 237 | tr_y = Y[train_index] 238 | te_x = train[features].reindex(index=test_index, copy=False) 239 | te_y = Y[test_index] 240 | 241 | # tr_y=tr_y.apply(lambda x:1 if x>0 else 0) 242 | # te_y=te_y.apply(lambda x:1 if x>0 else 0) 243 | d_tr = xgb.DMatrix(tr_x, label=tr_y) 244 | d_te = xgb.DMatrix(te_x, label=te_y) 245 | watchlist = [(d_tr,'train'), 246 | (d_te,'val') 247 | ] 248 | model = xgb.train(params, d_tr, num_boost_round=5500, 249 | evals=watchlist,verbose_eval=200, 250 | early_stopping_rounds=100) 251 | models.append(model) 252 | iters.append(model.best_iteration) 253 | pred = model.predict(d_te,ntree_limit=model.best_iteration) 254 | pred_oob2[test_index] =pred 255 | # te_y=te_y.apply(lambda x:1 if x>0 else 0) 256 | a = log_loss(te_y, pred) 257 | 258 | sub2 += model.predict(xgb.DMatrix(test[features]),ntree_limit=model.best_iteration)/10 259 | 260 | 261 | print ("idx: ", i) 262 | print (" loss: %.5f" % a) 263 | # print " gini: %.5f" % g 264 | aus.append(a) 265 | 266 | print ("mean") 267 | print ("auc: %s" % (sum(aus) / 10.0)) 268 | 269 | 270 | # In[14]: 271 | 272 | 273 | res2_1=np.vstack((pred_oob2,sub2)) 274 | res2_1 = pd.DataFrame(res2_1) 275 | 276 | 277 | # In[15]: 278 | 279 | 280 | ###sex2 281 | test['sex']=2 282 | features = [x for x in train.columns if x not in ['device_id',"age"]] 283 | Y = train['age'] 284 | 285 | 286 | # In[16]: 287 | 288 | 289 | aus = [] 290 | sub2 = np.zeros((len(test),11 )) 291 | for model,it in zip(models,iters): 292 | sub2 += model.predict(xgb.DMatrix(test[features]),ntree_limit=it)/10 293 | res2_2=np.vstack((pred_oob2,sub2)) 294 | res2_2 = pd.DataFrame(res2_2) 295 | 296 | 297 | # In[17]: 298 | 299 | 300 | res1.index=range(len(res1)) 301 | res2_1.index=range(len(res2_1)) 302 | res2_2.index=range(len(res2_2)) 303 | final_1=res2_1.copy() 304 | final_2=res2_2.copy() 305 | for i in range(11): 306 | final_1[i]=res1['sex1']*res2_1[i] 307 | final_2[i]=res1['sex2']*res2_2[i] 308 | id_list=pd.concat([train[['device_id']],test[['device_id']]]) 309 | final=id_list 310 | final.index=range(len(final)) 311 | final.columns= ['DeviceID'] 312 | final_pred = pd.concat([final_1,final_2],1) 313 | final=pd.concat([final,final_pred],1) 314 | final.columns = ['DeviceID', '1-0', '1-1', '1-2', '1-3', '1-4', '1-5', '1-6', 315 | '1-7','1-8', '1-9', '1-10', '2-0', '2-1', '2-2', '2-3', '2-4', 316 | '2-5', '2-6', '2-7', '2-8', '2-9', '2-10'] 317 | 318 | final.to_csv('xgb_feat_nurbs_nb_10fold.csv', index=False) 319 | 320 | 321 | # In[18]: 322 | 323 | 324 | test['DeviceID']=test['device_id'] 325 | sub=pd.merge(test[['DeviceID']],final,on="DeviceID",how="left") 326 | sub.to_csv("xgb_nurbs_nb_10fold.csv",index=False) 327 | 328 | -------------------------------------------------------------------------------- /nb_cz_lwl_wcm/4_get_feature_device_start_close_tfidf_1_2.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | import pandas as pd 4 | import scipy.sparse 5 | import numpy as np 6 | from sklearn import preprocessing 7 | from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer 8 | 9 | train = pd.read_csv('Demo/deviceid_train.tsv', sep='\t', header=None) 10 | test = pd.read_csv('Demo/deviceid_test.tsv', sep='\t', header=None) 11 | 12 | data_all = pd.concat([train, test], axis=0) 13 | data_all = data_all.rename({0:'id'}, axis=1) 14 | del data_all[1],data_all[2] 15 | 16 | start_close_time = pd.read_csv('Demo/deviceid_package_start_close.tsv', 
sep='\t', header=None) 17 | start_close_time = start_close_time.rename({0:'id', 1:'app_name', 2:'start_time', 3:'close_time'}, axis=1) 18 | 19 | start_close_time = start_close_time.sort_values(by='start_time') 20 | 21 | start_close_time['start_time'] = map(int,start_close_time['start_time']/1000) 22 | start_close_time['close_time'] = map(int,start_close_time['close_time']/1000) 23 | 24 | unique_app_name = np.unique(start_close_time['app_name']) 25 | dict_label = dict(zip(list(unique_app_name), list(np.arange(0, len(unique_app_name), 1)))) 26 | import time 27 | start_close_time['app_name'] = start_close_time['app_name'].apply(lambda row: str(dict_label[row])) 28 | 29 | del start_close_time['start_time'], start_close_time['close_time'] 30 | 31 | from tqdm import tqdm, tqdm_pandas 32 | tqdm_pandas(tqdm()) 33 | def dealed_row(row): 34 | app_name_list = list(row['app_name']) 35 | return ' '.join(app_name_list) 36 | 37 | data_feature = start_close_time.groupby('id').progress_apply(lambda row:dealed_row(row)).reset_index() 38 | data_feature = pd.merge(data_all, data_feature, on='id', how='left') 39 | del data_feature['id'] 40 | 41 | count_vec = CountVectorizer(ngram_range=(1,3)) 42 | count_csr_basic = count_vec.fit_transform(data_feature[0]) 43 | tfidf_vec = TfidfVectorizer(ngram_range=(1,3)) 44 | tfidf_vec_basic = tfidf_vec.fit_transform(data_feature[0]) 45 | 46 | data_feature = scipy.sparse.csr_matrix(scipy.sparse.hstack([count_csr_basic, tfidf_vec_basic])) 47 | 48 | 49 | from sklearn.cluster import KMeans 50 | from sklearn.linear_model import LogisticRegression, SGDClassifier, PassiveAggressiveClassifier, RidgeClassifier 51 | from sklearn.metrics import mean_squared_error 52 | from sklearn.naive_bayes import BernoulliNB, MultinomialNB 53 | from sklearn.svm import LinearSVC 54 | from sklearn.cross_validation import StratifiedKFold 55 | 56 | train = pd.read_csv('Demo/deviceid_train.tsv', sep='\t', header=None) 57 | test = pd.read_csv('Demo/deviceid_test.tsv', sep='\t', header=None) 58 | def get_label(row): 59 | if row[1] == 1: 60 | return row[2] 61 | else: 62 | return row[2] + 11 63 | train['label'] = train.apply(lambda row:get_label(row), axis=1) 64 | data_all = pd.concat([train, test], axis=0) 65 | data_all = data_all.rename({0:'id'}, axis=1) 66 | del data_all[1],data_all[2] 67 | 68 | train_feature = data_feature[:len(train)] 69 | score = train['label'] 70 | test_feature = data_feature[len(train):] 71 | number = len(np.unique(score)) 72 | 73 | # 五则交叉验证 74 | n_folds = 5 75 | print('处理完毕') 76 | 77 | ########################### lr(LogisticRegression) ################################ 78 | print('lr stacking') 79 | stack_train = np.zeros((len(train), number)) 80 | stack_test = np.zeros((len(test), number)) 81 | score_va = 0 82 | 83 | for i, (tr, va) in enumerate(StratifiedKFold(score, n_folds=n_folds, random_state=1017)): 84 | print('stack:%d/%d' % ((i + 1), n_folds)) 85 | clf = LogisticRegression(random_state=1017, C=8) 86 | clf.fit(train_feature[tr], score[tr]) 87 | score_va = clf.predict_proba(train_feature[va]) 88 | score_te = clf.predict_proba(test_feature) 89 | print('得分' + str(mean_squared_error(score[va], clf.predict(train_feature[va])))) 90 | stack_train[va] += score_va 91 | stack_test += score_te 92 | stack_test /= n_folds 93 | stack = np.vstack([stack_train, stack_test]) 94 | df_stack = pd.DataFrame() 95 | for i in range(stack.shape[1]): 96 | df_stack['tfidf_lr_2_classfiy_{}'.format(i)] = np.around(stack[:, i], 6) 97 | df_stack.to_csv('feature/tfidf_lr_1_3_error_single_classfiy.csv', 
index=None, encoding='utf8') 98 | print('lr特征已保存\n') 99 | 100 | ########################### SGD(随机梯度下降) ################################ 101 | print('sgd stacking') 102 | stack_train = np.zeros((len(train), number)) 103 | stack_test = np.zeros((len(test), number)) 104 | score_va = 0 105 | 106 | for i, (tr, va) in enumerate(StratifiedKFold(score, n_folds=n_folds, random_state=1017)): 107 | print('stack:%d/%d' % ((i + 1), n_folds)) 108 | sgd = SGDClassifier(random_state=1017, loss='log') 109 | sgd.fit(train_feature[tr], score[tr]) 110 | score_va = sgd.predict_proba(train_feature[va]) 111 | score_te = sgd.predict_proba(test_feature) 112 | print('得分' + str(mean_squared_error(score[va], sgd.predict(train_feature[va])))) 113 | stack_train[va] += score_va 114 | stack_test += score_te 115 | stack_test /= n_folds 116 | stack = np.vstack([stack_train, stack_test]) 117 | df_stack = pd.DataFrame() 118 | for i in range(stack.shape[1]): 119 | df_stack['tfidf_2_sgd_classfiy_{}'.format(i)] = np.around(stack[:, i], 6) 120 | df_stack.to_csv('feature/tfidf_sgd_1_3_error_single_classfiy.csv', index=None, encoding='utf8') 121 | print('sgd特征已保存\n') 122 | 123 | ########################### pac(PassiveAggressiveClassifier) ################################ 124 | print('PAC stacking') 125 | stack_train = np.zeros((len(train), number)) 126 | stack_test = np.zeros((len(test), number)) 127 | score_va = 0 128 | 129 | for i, (tr, va) in enumerate(StratifiedKFold(score, n_folds=n_folds, random_state=1017)): 130 | print('stack:%d/%d' % ((i + 1), n_folds)) 131 | pac = PassiveAggressiveClassifier(random_state=1017) 132 | pac.fit(train_feature[tr], score[tr]) 133 | score_va = pac._predict_proba_lr(train_feature[va]) 134 | score_te = pac._predict_proba_lr(test_feature) 135 | print(score_va) 136 | print('得分' + str(mean_squared_error(score[va], pac.predict(train_feature[va])))) 137 | stack_train[va] += score_va 138 | stack_test += score_te 139 | stack_test /= n_folds 140 | stack = np.vstack([stack_train, stack_test]) 141 | df_stack = pd.DataFrame() 142 | for i in range(stack.shape[1]): 143 | df_stack['tfidf_pac_classfiy_{}'.format(i)] = np.around(stack[:, i], 6) 144 | df_stack.to_csv('feature/tfidf_pac_1_3_error_single_classfiy.csv', index=None, encoding='utf8') 145 | print('pac特征已保存\n') 146 | 147 | 148 | ########################### ridge(RidgeClassfiy) ################################ 149 | print('RidgeClassfiy stacking') 150 | stack_train = np.zeros((len(train), number)) 151 | stack_test = np.zeros((len(test), number)) 152 | score_va = 0 153 | 154 | for i, (tr, va) in enumerate(StratifiedKFold(score, n_folds=n_folds, random_state=1017)): 155 | print('stack:%d/%d' % ((i + 1), n_folds)) 156 | ridge = RidgeClassifier(random_state=1017) 157 | ridge.fit(train_feature[tr], score[tr]) 158 | score_va = ridge._predict_proba_lr(train_feature[va]) 159 | score_te = ridge._predict_proba_lr(test_feature) 160 | print(score_va) 161 | print('得分' + str(mean_squared_error(score[va], ridge.predict(train_feature[va])))) 162 | stack_train[va] += score_va 163 | stack_test += score_te 164 | stack_test /= n_folds 165 | stack = np.vstack([stack_train, stack_test]) 166 | df_stack = pd.DataFrame() 167 | for i in range(stack.shape[1]): 168 | df_stack['tfidf_ridge_classfiy_{}'.format(i)] = np.around(stack[:, i], 6) 169 | df_stack.to_csv('feature/tfidf_ridge_1_3_error_single_classfiy.csv', index=None, encoding='utf8') 170 | print('ridge特征已保存\n') 171 | 172 | 173 | ########################### bnb(BernoulliNB) ################################ 174 | 
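# The BernoulliNB section below, like the lr/sgd/pac/ridge sections above
# and the MultinomialNB section that follows, repeats one and the same
# 5-fold out-of-fold stacking loop. For reference, a minimal generic form
# of that loop is sketched here; stack_clf is a hypothetical helper (not
# part of the original pipeline) and assumes the train_feature /
# test_feature / score / number / n_folds globals defined above, plus a
# classifier exposing predict_proba (or, failing that, _predict_proba_lr,
# as the pac/ridge sections use).
def stack_clf(make_clf, prefix):
    stack_train = np.zeros((len(train), number))   # out-of-fold train probabilities
    stack_test = np.zeros((len(test), number))     # test probabilities, averaged over folds
    for i, (tr, va) in enumerate(StratifiedKFold(score, n_folds=n_folds, random_state=1017)):
        print('stack:%d/%d' % ((i + 1), n_folds))
        clf = make_clf()
        clf.fit(train_feature[tr], score[tr])
        proba = clf.predict_proba if hasattr(clf, 'predict_proba') else clf._predict_proba_lr
        stack_train[va] += proba(train_feature[va])  # each validation fold is filled exactly once
        stack_test += proba(test_feature)            # accumulate, then average below
    stack_test /= n_folds
    stack = np.vstack([stack_train, stack_test])     # train rows first, then test rows
    df_stack = pd.DataFrame()
    for j in range(stack.shape[1]):
        df_stack['%s_%d' % (prefix, j)] = np.around(stack[:, j], 6)
    df_stack.to_csv('feature/%s.csv' % prefix, index=None, encoding='utf8')
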
print('BernoulliNB stacking') 175 | stack_train = np.zeros((len(train), number)) 176 | stack_test = np.zeros((len(test), number)) 177 | score_va = 0 178 | 179 | for i, (tr, va) in enumerate(StratifiedKFold(score, n_folds=n_folds, random_state=1017)): 180 | print('stack:%d/%d' % ((i + 1), n_folds)) 181 | bnb = BernoulliNB() 182 | bnb.fit(train_feature[tr], score[tr]) 183 | score_va = bnb.predict_proba(train_feature[va]) 184 | score_te = bnb.predict_proba(test_feature) 185 | print(score_va) 186 | print('得分' + str(mean_squared_error(score[va], bnb.predict(train_feature[va])))) 187 | stack_train[va] += score_va 188 | stack_test += score_te 189 | stack_test /= n_folds 190 | stack = np.vstack([stack_train, stack_test]) 191 | df_stack = pd.DataFrame() 192 | for i in range(stack.shape[1]): 193 | df_stack['tfidf_bnb_classfiy_{}'.format(i)] = np.around(stack[:, i], 6) 194 | df_stack.to_csv('feature/tfidf_bnb_1_3_error_single_classfiy.csv', index=None, encoding='utf8') 195 | print('BernoulliNB特征已保存\n') 196 | 197 | ########################### mnb(MultinomialNB) ################################ 198 | print('MultinomialNB stacking') 199 | stack_train = np.zeros((len(train), number)) 200 | stack_test = np.zeros((len(test), number)) 201 | score_va = 0 202 | 203 | for i, (tr, va) in enumerate(StratifiedKFold(score, n_folds=n_folds, random_state=1017)): 204 | print('stack:%d/%d' % ((i + 1), n_folds)) 205 | mnb = MultinomialNB() 206 | mnb.fit(train_feature[tr], score[tr]) 207 | score_va = mnb.predict_proba(train_feature[va]) 208 | score_te = mnb.predict_proba(test_feature) 209 | print(score_va) 210 | print('得分' + str(mean_squared_error(score[va], mnb.predict(train_feature[va])))) 211 | stack_train[va] += score_va 212 | stack_test += score_te 213 | stack_test /= n_folds 214 | stack = np.vstack([stack_train, stack_test]) 215 | df_stack = pd.DataFrame() 216 | for i in range(stack.shape[1]): 217 | df_stack['tfidf_mnb_classfiy_{}'.format(i)] = np.around(stack[:, i], 6) 218 | df_stack.to_csv('feature/tfidf_mnb_1_3_error_single_classfiy.csv', index=None, encoding='utf8') 219 | print('MultinomialNB特征已保存\n') 220 | -------------------------------------------------------------------------------- /chizhu/single_model/xgb_nb.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | import pandas as pd 8 | import seaborn as sns 9 | import numpy as np 10 | from tqdm import tqdm 11 | from sklearn.decomposition import LatentDirichletAllocation 12 | from sklearn.model_selection import train_test_split 13 | from sklearn.metrics import accuracy_score 14 | import lightgbm as lgb 15 | from datetime import datetime,timedelta 16 | import matplotlib.pyplot as plt 17 | import time 18 | from sklearn.feature_extraction.text import TfidfTransformer 19 | from sklearn.feature_extraction.text import CountVectorizer 20 | # get_ipython().run_line_magic('matplotlib', 'inline') 21 | 22 | #add 23 | import gc 24 | from sklearn import preprocessing 25 | from sklearn.feature_extraction.text import TfidfVectorizer 26 | 27 | from scipy.sparse import hstack, vstack 28 | from sklearn.model_selection import StratifiedKFold 29 | from sklearn.model_selection import cross_val_score 30 | # from skopt.space import Integer, Categorical, Real, Log10 31 | # from skopt.utils import use_named_args 32 | # from skopt import gp_minimize 33 | from gensim.models import Word2Vec, FastText 34 | import gensim 35 | import re 36 | # path="/dev/shm/chizhu_data/data/" 37 | 38 | 39 | # In[2]: 40 
| 41 | 42 | tfidf_feat=pd.read_csv("data/tfidf_classfiy.csv") 43 | tf2=pd.read_csv("data/tfidf_classfiy_package.csv") 44 | train_data=pd.read_csv("data/train_data.csv") 45 | test_data=pd.read_csv("data/test_data.csv") 46 | 47 | 48 | # In[4]: 49 | 50 | 51 | train_data = pd.merge(train_data,tfidf_feat,on="device_id",how="left") 52 | train = pd.merge(train_data,tf2,on="device_id",how="left") 53 | test_data = pd.merge(test_data,tfidf_feat,on="device_id",how="left") 54 | test = pd.merge(test_data,tf2,on="device_id",how="left") 55 | 56 | 57 | # In[5]: 58 | 59 | 60 | features = [x for x in train.columns if x not in ['device_id', 'sex',"age","label","app"]] 61 | Y = train['sex'] - 1 62 | 63 | 64 | # In[5]: 65 | 66 | 67 | import lightgbm as lgb 68 | import xgboost as xgb 69 | from sklearn.metrics import auc, log_loss, roc_auc_score,f1_score,recall_score,precision_score 70 | from sklearn.cross_validation import StratifiedKFold 71 | 72 | kf = StratifiedKFold(Y, n_folds=5, shuffle=True, random_state=1024) 73 | params={ 74 | 'booster':'gbtree', 75 | 'objective': 'binary:logistic', 76 | # 'is_unbalance':'True', 77 | # 'scale_pos_weight': 1500.0/13458.0, 78 | 'eval_metric': "logloss", 79 | 80 | 'gamma':0.2,#0.2 is ok 81 | 'max_depth':6, 82 | # 'lambda':20, 83 | # "alpha":5, 84 | 'subsample':0.7, 85 | 'colsample_bytree':0.4 , 86 | # 'min_child_weight':2.5, 87 | 'eta': 0.01, 88 | # 'learning_rate':0.01, 89 | "silent":1, 90 | 'seed':1024, 91 | 'nthread':12, 92 | 93 | } 94 | num_round = 3500 95 | early_stopping_rounds = 100 96 | 97 | 98 | # In[6]: 99 | 100 | 101 | aus = [] 102 | sub1 = np.zeros((len(test), )) 103 | pred_oob1=np.zeros((len(train),)) 104 | for i,(train_index,test_index) in enumerate(kf): 105 | 106 | tr_x = train[features].reindex(index=train_index, copy=False) 107 | tr_y = Y[train_index] 108 | te_x = train[features].reindex(index=test_index, copy=False) 109 | te_y = Y[test_index] 110 | 111 | # tr_y=tr_y.apply(lambda x:1 if x>0 else 0) 112 | # te_y=te_y.apply(lambda x:1 if x>0 else 0) 113 | d_tr = xgb.DMatrix(tr_x, label=tr_y) 114 | d_te = xgb.DMatrix(te_x, label=te_y) 115 | watchlist = [(d_tr,'train'), 116 | (d_te,'val') 117 | ] 118 | model = xgb.train(params, d_tr, num_boost_round=5500, 119 | evals=watchlist,verbose_eval=200, 120 | early_stopping_rounds=100) 121 | pred = model.predict(d_te) 122 | pred_oob1[test_index] =pred 123 | # te_y=te_y.apply(lambda x:1 if x>0 else 0) 124 | a = log_loss(te_y, pred) 125 | 126 | sub1 += model.predict(xgb.DMatrix(test[features]))/5 127 | 128 | 129 | print ("idx: ", i) 130 | print (" loss: %.5f" % a) 131 | # print " gini: %.5f" % g 132 | aus.append(a) 133 | 134 | print ("mean") 135 | print ("auc: %s" % (sum(aus) / 5.0)) 136 | 137 | 138 | # In[7]: 139 | 140 | 141 | pred_oob1 = pd.DataFrame(pred_oob1, columns=['sex2']) 142 | sub1 = pd.DataFrame(sub1, columns=['sex2']) 143 | res1=pd.concat([pred_oob1,sub1]) 144 | res1['sex1'] = 1-res1['sex2'] 145 | 146 | 147 | # In[8]: 148 | 149 | 150 | import gc 151 | gc.collect() 152 | 153 | 154 | # In[9]: 155 | 156 | 157 | tfidf_feat=pd.read_csv("data/tfidf_age.csv") 158 | tf2=pd.read_csv("data/pack_tfidf_age.csv") 159 | train_data = pd.merge(train_data,tfidf_feat,on="device_id",how="left") 160 | train = pd.merge(train_data,tf2,on="device_id",how="left") 161 | test_data = pd.merge(test_data,tfidf_feat,on="device_id",how="left") 162 | test = pd.merge(test_data,tf2,on="device_id",how="left") 163 | 164 | 165 | # In[10]: 166 | 167 | 168 | ####sex1 169 | test['sex']=1 170 | 171 | 172 | # In[11]: 173 | 174 | 175 | features = [x 
for x in train.columns if x not in ['device_id',"age","label","app"]] 176 | Y = train['age'] 177 | 178 | 179 | # In[12]: 180 | 181 | 182 | import lightgbm as lgb 183 | import xgboost as xgb 184 | from sklearn.metrics import auc, log_loss, roc_auc_score,f1_score,recall_score,precision_score 185 | from sklearn.cross_validation import StratifiedKFold 186 | 187 | kf = StratifiedKFold(Y, n_folds=5, shuffle=True, random_state=1024) 188 | params={ 189 | 'booster':'gbtree', 190 | 'objective': 'multi:softprob', 191 | # 'is_unbalance':'True', 192 | # 'scale_pos_weight': 1500.0/13458.0, 193 | 'eval_metric': "mlogloss", 194 | 'num_class':11, 195 | 'gamma':0.1,#0.2 is ok 196 | 'max_depth':6, 197 | # 'lambda':20, 198 | # "alpha":5, 199 | 'subsample':0.7, 200 | 'colsample_bytree':0.4 , 201 | # 'min_child_weight':2.5, 202 | 'eta': 0.01, 203 | # 'learning_rate':0.01, 204 | "silent":1, 205 | 'seed':1024, 206 | 'nthread':12, 207 | 208 | } 209 | num_round = 3500 210 | early_stopping_rounds = 100 211 | 212 | 213 | # In[13]: 214 | 215 | 216 | aus = [] 217 | sub2 = np.zeros((len(test),11 )) 218 | pred_oob2=np.zeros((len(train),11)) 219 | for i,(train_index,test_index) in enumerate(kf): 220 | 221 | tr_x = train[features].reindex(index=train_index, copy=False) 222 | tr_y = Y[train_index] 223 | te_x = train[features].reindex(index=test_index, copy=False) 224 | te_y = Y[test_index] 225 | 226 | # tr_y=tr_y.apply(lambda x:1 if x>0 else 0) 227 | # te_y=te_y.apply(lambda x:1 if x>0 else 0) 228 | d_tr = xgb.DMatrix(tr_x, label=tr_y) 229 | d_te = xgb.DMatrix(te_x, label=te_y) 230 | watchlist = [(d_tr,'train'), 231 | (d_te,'val') 232 | ] 233 | model = xgb.train(params, d_tr, num_boost_round=5500, 234 | evals=watchlist,verbose_eval=200, 235 | early_stopping_rounds=100) 236 | pred = model.predict(d_te) 237 | pred_oob2[test_index] =pred 238 | # te_y=te_y.apply(lambda x:1 if x>0 else 0) 239 | a = log_loss(te_y, pred) 240 | 241 | sub2 += model.predict(xgb.DMatrix(test[features]))/5 242 | 243 | 244 | print ("idx: ", i) 245 | print (" loss: %.5f" % a) 246 | # print " gini: %.5f" % g 247 | aus.append(a) 248 | 249 | print ("mean") 250 | print ("auc: %s" % (sum(aus) / 5.0)) 251 | 252 | 253 | # In[14]: 254 | 255 | 256 | res2_1=np.vstack((pred_oob2,sub2)) 257 | res2_1 = pd.DataFrame(res2_1) 258 | 259 | 260 | # In[ ]: 261 | 262 | 263 | ###sex2 264 | test['sex']=2 265 | features = [x for x in train.columns if x not in ['device_id',"age","label","app"]] 266 | Y = train['age'] 267 | 268 | 269 | # In[ ]: 270 | 271 | 272 | aus = [] 273 | sub2 = np.zeros((len(test),11 )) 274 | pred_oob2=np.zeros((len(train),11)) 275 | for i,(train_index,test_index) in enumerate(kf): 276 | 277 | tr_x = train[features].reindex(index=train_index, copy=False) 278 | tr_y = Y[train_index] 279 | te_x = train[features].reindex(index=test_index, copy=False) 280 | te_y = Y[test_index] 281 | 282 | # tr_y=tr_y.apply(lambda x:1 if x>0 else 0) 283 | # te_y=te_y.apply(lambda x:1 if x>0 else 0) 284 | d_tr = xgb.DMatrix(tr_x, label=tr_y) 285 | d_te = xgb.DMatrix(te_x, label=te_y) 286 | watchlist = [(d_tr,'train'), 287 | (d_te,'val') 288 | ] 289 | model = xgb.train(params, d_tr, num_boost_round=5500, 290 | evals=watchlist,verbose_eval=200, 291 | early_stopping_rounds=100) 292 | pred = model.predict(d_te) 293 | pred_oob2[test_index] =pred 294 | # te_y=te_y.apply(lambda x:1 if x>0 else 0) 295 | a = log_loss(te_y, pred) 296 | 297 | sub2 += model.predict(xgb.DMatrix(test[features]))/5 298 | 299 | 300 | print ("idx: ", i) 301 | print (" loss: %.5f" % a) 302 | # print " gini: %.5f" 
% g 303 | aus.append(a) 304 | 305 | print ("mean") 306 | print ("auc: %s" % (sum(aus) / 5.0)) 307 | 308 | 309 | # In[ ]: 310 | 311 | 312 | res2_2=np.vstack((pred_oob2,sub2)) 313 | res2_2 = pd.DataFrame(res2_2) 314 | 315 | 316 | # In[ ]: 317 | 318 | 319 | res1.index=range(len(res1)) 320 | res2_1.index=range(len(res2_1)) 321 | res2_2.index=range(len(res2_2)) 322 | final_1=res2_1.copy() 323 | final_2=res2_2.copy() 324 | for i in range(11): 325 | final_1[i]=res1['sex1']*res2_1[i] 326 | final_2[i]=res1['sex2']*res2_2[i] 327 | id_list=pd.concat([train[['device_id']],test[['device_id']]]) 328 | final=id_list 329 | final.index=range(len(final)) 330 | final.columns= ['DeviceID'] 331 | final_pred = pd.concat([final_1,final_2],1) 332 | final=pd.concat([final,final_pred],1) 333 | final.columns = ['DeviceID', '1-0', '1-1', '1-2', '1-3', '1-4', '1-5', '1-6', 334 | '1-7','1-8', '1-9', '1-10', '2-0', '2-1', '2-2', '2-3', '2-4', 335 | '2-5', '2-6', '2-7', '2-8', '2-9', '2-10'] 336 | 337 | final.to_csv('submit/xgb_feat_chizhu_nb.csv', index=False) 338 | 339 | 340 | # In[ ]: 341 | 342 | 343 | test['DeviceID']=test['device_id'] 344 | sub=pd.merge(test[['DeviceID']],final,on="DeviceID",how="left") 345 | sub.to_csv("submit/xgb_chizhu_nb.csv",index=False) 346 | 347 | --------------------------------------------------------------------------------
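Both xgb_nb.py above and xgb__nurbs_nb.py earlier assemble their 22 submission columns the same way: the binary model supplies P(sex), the 11-class age model is scored once with test['sex']=1 and once with test['sex']=2 to supply P(age | sex), and the final columns are the products P(sex) * P(age | sex). A minimal self-contained sketch of that composition (toy arrays stand in for the model outputs; the '1-0' ... '2-10' column names follow the convention used in the scripts):

import numpy as np
import pandas as pd

p_sex2 = np.array([0.3, 0.8, 0.5])                 # P(sex=2) from the binary model, one entry per device
p_age_given_sex1 = np.full((3, 11), 1.0 / 11)      # P(age | sex=1): an 11-way distribution per device
p_age_given_sex2 = np.full((3, 11), 1.0 / 11)      # P(age | sex=2)

p_sex1 = 1.0 - p_sex2
joint = np.hstack([p_age_given_sex1 * p_sex1[:, None],   # columns '1-0' ... '1-10'
                   p_age_given_sex2 * p_sex2[:, None]])  # columns '2-0' ... '2-10'
cols = ['%d-%d' % (s, a) for s in (1, 2) for a in range(11)]
sub = pd.DataFrame(joint, columns=cols)
assert np.allclose(sub.sum(axis=1), 1.0)           # each row is still a proper distribution

Because each conditional age block is itself a distribution over the 11 classes, every row of the product automatically sums to one, which is why the scripts write the result out without renormalizing.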
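One further note on the THLUO Keras scripts above (17, 18 and 19 here): the per-epoch validation callback is named RocAucEvaluation, but what it actually reports is multiclass log loss (the competition metric), and its super(Callback, self).__init__() call skips Callback's own initializer. A minimal corrected sketch under an accurate name (a hypothetical refactor, behaviorally equivalent to the original):

from keras.callbacks import Callback
from sklearn.metrics import log_loss

class LogLossEvaluation(Callback):
    """Prints multiclass log loss on held-out data every `interval` epochs."""
    def __init__(self, validation_data=(), interval=1):
        super(LogLossEvaluation, self).__init__()  # call Callback's initializer, not object's
        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            # y_val is one-hot; log_loss expects class indices for the true labels
            score = log_loss(self.y_val.argmax(axis=1), y_pred)
            print("\n mlogloss - epoch: %d - score: %.6f \n" % (epoch + 1, score))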