├── README.md
├── run.sh
└── src
    ├── blending
    │   └── f1_blend_and_submit.py
    ├── keras
    │   ├── f10_AGE_m11_transformer_lstm_5inputs_train_fold.py
    │   ├── f11_GENDER_m1_transformer_3inputs_train_fold.py
    │   ├── f12_GENDER_m2_transformer_lstm_3inputs_train_fold.py
    │   ├── f13_merge_fold_results.py
    │   ├── f1_save_data.py
    │   ├── f2_save_sequence.py
    │   ├── f3_save_embeddings.py
    │   ├── f4_AGE_m3_lstm_4inputs_train_fold.py
    │   ├── f5_AGE_m4_transformer_4inputs_train_fold.py
    │   ├── f6_AGE_m5_transformer_3inputs_train_fold.py
    │   ├── f7_AGE_m6_transformer_lstm_2inputs_train_fold.py
    │   ├── f8_AGE_m8_transformer_lstm_3inputs_2r_train_fold.py
    │   └── f9_AGE_m10_transformer_lstm_5inputs_train_fold.py
    ├── lgb
    │   ├── f1_save_tfidf_countvec.py
    │   ├── f2_save_target_encoding.py
    │   ├── f3_save_AGE_tf_idf_stacking_feats.py
    │   ├── f4_save_GENDER_tf_idf_stacking_feats.py
    │   └── f5_run_fold_training.py
    ├── stacking
    │   ├── f1_merge_stacking_feats.py
    │   ├── f2_save_embeddings.py
    │   ├── f3_stacking_DNN_120probs_train_fold.py
    │   ├── f4_stacking_transformer_2inputs_90probs_train_fold.py
    │   └── f5_merge_fold_results.py
    └── torch
        ├── f1_save_data.py
        ├── f2_save_embedding_w2v.py
        ├── f3_AGE_m7_lstm_3inputs_train_5fold.py
        ├── f4_AGE_m1_lstm_6inputs_train_5fold.py
        ├── f5_AGE_m9_transformer_3inputs_train_5fold.py
        ├── f6_AGE_m2_transformer_6inputs_train_5fold.py
        ├── f7_save_data.py
        ├── f8_AGE_GENDER_m13_transformer_4inputs_train_5fold.py
        ├── lookahead.py
        ├── m13_transformer_4inputs.py
        ├── m1_lstm_6inputs_age.py
        ├── m2_transformer_6inputs_age.py
        ├── m7_lstm_3inputs_age.py
        ├── m9_transformer_3inputs_age.py
        └── pytorchtools.py

/README.md:
--------------------------------------------------------------------------------
Team: 最后一次打比赛
Members: [jackhuntcn](https://github.com/jackhuntcn), [PandasCute](https://github.com/PandasCute), [LogicJake](https://github.com/LogicJake)

## Environment

### Hardware

- P40 GPU (24 GB VRAM)
- 114 GB RAM or more
- 300 GB disk or more

### Software

run.sh contains the install commands.

- gensim
- torch
- transformers
- keras == 2.3.1
- keras_self_attention
- keras_multi_head
- keras_position_wise_feed_forward
- keras_layer_normalization
- lightgbm

## Directory Layout

```
.
├── data
│   ├── keras
│   ├── lgb
│   └── torch
├── models
├── probs
├── raw_data
│   ├── test
│   ├── train_preliminary
│   └── train_semi_final
├── run.sh
├── src
│   ├── blending
│   │   └── f1_blend_and_submit.py
│   ├── keras
│   │   ├── f10_AGE_m11_transformer_lstm_5inputs_train_fold.py
│   │   ├── f11_GENDER_m1_transformer_3inputs_train_fold.py
│   │   ├── f12_GENDER_m2_transformer_lstm_3inputs_train_fold.py
│   │   ├── f13_merge_fold_results.py
│   │   ├── f1_save_data.py
│   │   ├── f2_save_sequence.py
│   │   ├── f3_save_embeddings.py
│   │   ├── f4_AGE_m3_lstm_4inputs_train_fold.py
│   │   ├── f5_AGE_m4_transformer_4inputs_train_fold.py
│   │   ├── f6_AGE_m5_transformer_3inputs_train_fold.py
│   │   ├── f7_AGE_m6_transformer_lstm_2inputs_train_fold.py
│   │   ├── f8_AGE_m8_transformer_lstm_3inputs_2r_train_fold.py
│   │   └── f9_AGE_m10_transformer_lstm_5inputs_train_fold.py
│   ├── lgb
│   │   ├── f1_save_tfidf_countvec.py
│   │   ├── f2_save_target_encoding.py
│   │   ├── f3_save_AGE_tf_idf_stacking_feats.py
│   │   ├── f4_save_GENDER_tf_idf_stacking_feats.py
│   │   └── f5_run_fold_training.py
│   ├── stacking
│   │   ├── f1_merge_stacking_feats.py
│   │   ├── f2_save_embeddings.py
│   │   ├── f3_stacking_DNN_120probs_train_fold.py
│   │   ├── f4_stacking_transformer_2inputs_90probs_train_fold.py
│   │   └── f5_merge_fold_results.py
│   └── torch
│       ├── f1_save_data.py
│       ├── f2_save_embedding_w2v.py
│       ├── f3_AGE_m7_lstm_3inputs_train_5fold.py
│       ├── f4_AGE_m1_lstm_6inputs_train_5fold.py
│       ├── f5_AGE_m9_transformer_3inputs_train_5fold.py
│       ├── f6_AGE_m2_transformer_6inputs_train_5fold.py
│       ├── f7_save_data.py
│       ├── f8_AGE_GENDER_m13_transformer_4inputs_train_5fold.py
│       ├── lookahead.py
│       ├── m13_transformer_4inputs.py
│       ├── m1_lstm_6inputs_age.py
│       ├── m2_transformer_6inputs_age.py
│       ├── m7_lstm_3inputs_age.py
│       ├── m9_transformer_3inputs_age.py
│       └── pytorchtools.py
└── w2v_models

17 directories, 40 files
```

- src: training code, organized into the three frameworks torch / keras / lgb
- data: preprocessed data
- models: saved model weights
- probs: predicted probabilities produced by each model
- raw_data: the raw competition data, including the preliminary and semi-final rounds
- run.sh: one-click execution script
- w2v_models: saved word2vec embedding models

## Model Notes

## keras

Age and gender are modeled separately.

The variants differ in the number of input id sequences. Details below (all scores are A-leaderboard scores, likewise hereafter):

#### AGE

- LSTM + Attention, 4 inputs, 5-fold: ~0.512 online
- transformer, 4 inputs, 5-fold: 0.516 online
- transformer, 3 inputs, 5-fold: 0.515 online
- transformer + LSTM, 2 inputs, 5-fold: ~0.515 online
- transformer + LSTM, 3 inputs, 5-fold: ~0.515 online
- transformer + LSTM, 4 inputs, 5-fold: 0.517 online
- transformer + LSTM, 5 inputs, 5-fold: ~0.517 online

#### GENDER
- transformer, 3 inputs, 5-fold: 0.9500 online
- transformer + LSTM, 3 inputs, 5-fold: 0.9501 online


## torch

The hand-rolled transformer models age and gender separately.

The huggingface transformers version models both targets at once, with two output heads.

#### AGE

- LSTM + Attention, 6 inputs, 5-fold: 0.513 online
- transformer + LSTM, 6 inputs, 5-fold: 0.516 online
- transformer + LSTM, 3 inputs, 5-fold: 0.514 online

#### AGE & GENDER

- AGE: transformer + LSTM, 4 inputs, 5-fold: 0.519 online
- GENDER: transformer + LSTM, 4 inputs, 5-fold: 0.9468 offline, not tested online

## LGB

LGB uses TF-IDF, CountVectorizer and target-encoding features.
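A condensed sketch of the text-feature construction (the full version is src/lgb/f1_save_tfidf_countvec.py; `data` and its `user_items` column come from the preprocessing in that script):

```
from scipy import sparse
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# one "document" per user: the click sequence flattened into id tokens
docs = data.groupby('user_id')['user_items'].apply(' '.join)

tfidf = TfidfVectorizer(min_df=35).fit_transform(docs)      # TF-IDF features
countvec = CountVectorizer(min_df=30).fit_transform(docs)   # count features
feats = sparse.hstack([tfidf, countvec], 'csr')
```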
Later, probability features produced by linear models were added; the offline AGE score is roughly 0.48.

LGB was not as strong as the NNs in this competition, so only its AGE probabilities are used for stacking.

## stacking

Two stacking schemes were used:

- Probability-only stacking: the probabilities produced by the models above are fed into a DNN in layers, grouped by framework (keras probabilities are merged with keras probabilities, torch with torch; the keras probabilities enter the network first, and the torch probabilities are concatenated in only after the keras ones have passed through a few fully connected layers). Experiments showed this layering avoids the damage that highly correlated probabilities otherwise do to the blend. 5-fold online score: ~0.525.
- Mixed-feature stacking: a transformer takes two id sequences as input, and just before the final fully connected layers its features are concatenated with the probabilities of the nine weakly correlated models above; this works somewhat like a residual connection and curbs overfitting. 5-fold online score: 0.523.

## blending

### AGE

In the last week of the competition we reimplemented the transformer + LSTM model on huggingface transformers. Its 5-fold score is 0.519, and its correlation with the earlier keras and torch implementations is low, only about 0.93-0.94 (versus similarities of up to 0.98 among the keras models and 0.96 among the torch models). We therefore blended this model separately with the two stacking models above, reaching 0.52780 online:


```
0.50 * DNN_stacking + 0.15 * transformer_stacking + 0.35 * age_m13
```

### GENDER

Gender is a roughly equal blend of three transformer models, scoring 0.95048 online:

```
0.35 * gender_m1 + 0.35 * gender_m2 + 0.30 * gender_m3
```
--------------------------------------------------------------------------------
/run.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# install the required packages

pip install gensim
pip install keras==2.3.1
pip install keras_self_attention keras_multi_head keras_position_wise_feed_forward keras_layer_normalization
pip install torch
pip install transformers
pip install lightgbm

# keras

cd src/keras

python f1_save_data.py
python f2_save_sequence.py
python f3_save_embeddings.py
for i in `seq 0 4`; do python f4_AGE_m3_lstm_4inputs_train_fold.py "fold${i}"; done
for i in `seq 0 4`; do python f5_AGE_m4_transformer_4inputs_train_fold.py "fold${i}"; done
for i in `seq 0 4`; do python f6_AGE_m5_transformer_3inputs_train_fold.py "fold${i}"; done
for i in `seq 0 4`; do python f7_AGE_m6_transformer_lstm_2inputs_train_fold.py "fold${i}"; done
for i in `seq 0 4`; do python f8_AGE_m8_transformer_lstm_3inputs_2r_train_fold.py "fold${i}"; done
for i in `seq 0 4`; do python f9_AGE_m10_transformer_lstm_5inputs_train_fold.py "fold${i}"; done
for i in `seq 0 4`; do python f10_AGE_m11_transformer_lstm_5inputs_train_fold.py "fold${i}"; done
for i in `seq 0 4`; do python f11_GENDER_m1_transformer_3inputs_train_fold.py "fold${i}"; done
for i in `seq 0 4`; do python f12_GENDER_m2_transformer_lstm_3inputs_train_fold.py "fold${i}"; done
python f13_merge_fold_results.py 'age_m3_keras'
python f13_merge_fold_results.py 'age_m4_keras'
python f13_merge_fold_results.py 'age_m5_keras'
python f13_merge_fold_results.py 'age_m6_keras'
python f13_merge_fold_results.py 'age_m8_keras'
python f13_merge_fold_results.py 'age_m10_keras'
python f13_merge_fold_results.py 'age_m11_keras'
python f13_merge_fold_results.py 'gender_m1_keras'
python f13_merge_fold_results.py 'gender_m2_keras'

cd ../..

# torch

cd src/torch

python f1_save_data.py
python f2_save_embedding_w2v.py
python f3_AGE_m7_lstm_3inputs_train_5fold.py
python f4_AGE_m1_lstm_6inputs_train_5fold.py
python f5_AGE_m9_transformer_3inputs_train_5fold.py
python f6_AGE_m2_transformer_6inputs_train_5fold.py
python f7_save_data.py
python f8_AGE_GENDER_m13_transformer_4inputs_train_5fold.py

cd ../..

# lgb

cd src/lgb

python f1_save_tfidf_countvec.py
python f2_save_target_encoding.py
python f3_save_AGE_tf_idf_stacking_feats.py
python f4_save_GENDER_tf_idf_stacking_feats.py
python f5_run_fold_training.py

cd ../..

# stacking

cd src/stacking

python f1_merge_stacking_feats.py
python f2_save_embeddings.py
for i in `seq 0 4`; do python f3_stacking_DNN_120probs_train_fold.py "fold${i}"; done
for i in `seq 0 4`; do python f4_stacking_transformer_2inputs_90probs_train_fold.py "fold${i}"; done
python f5_merge_fold_results.py "dnn_stacking"
python f5_merge_fold_results.py "transformer_stacking"

cd ../..

# blend and submit

cd src/blending

python f1_blend_and_submit.py
--------------------------------------------------------------------------------
/src/blending/f1_blend_and_submit.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# coding: utf-8

##########################################################################################
# AGE:    0.50 * DNN_stacking + 0.15 * transformer_stacking + 0.35 * age_m13  score: 0.52780
# GENDER: 0.35 * gender_m1 + 0.35 * gender_m2 + 0.30 * gender_m3              score: 0.95048
##########################################################################################

import warnings
warnings.simplefilter('ignore')

import pandas as pd
import numpy as np

from scipy.special import softmax


# average the five DNN-stacking folds
prob1_age = np.mean([np.load(f'../../probs/sub_age_dnn_stacking_{i}.npy')
                     for i in range(5)], axis=0)

# average the five transformer-stacking folds
prob2_age = np.mean([np.load(f'../../probs/sub_age_transformer_stacking_{i}.npy')
                     for i in range(5)], axis=0)

# the torch m13 output is unnormalised, so run it through softmax first
prob3_age = softmax(np.load('../../probs/sub_age_m13_torch.npy'), axis=1)

prob_age = 0.5 * prob1_age + 0.15 * prob2_age + 0.35 * prob3_age


# the keras gender models store a single probability column
prob1_gender = np.mean([np.load(f'../../probs/sub_gender_m1_keras_{i}.npy')[:, 0]
                        for i in range(5)], axis=0)
prob2_gender = np.mean([np.load(f'../../probs/sub_gender_m2_keras_{i}.npy')[:, 0]
                        for i in range(5)], axis=0)

prob3_gender = softmax(np.load('../../probs/sub_gender_m3_torch.npy'), axis=1)[:, 1]

prob_gender = 0.35 * prob1_gender + 0.35 * prob2_gender + 0.3 * prob3_gender


sub = pd.DataFrame({'user_id': range(3000001, 4000001),
                    'predicted_age': [-1] * 1000000,
                    'predicted_gender': [-1] * 1000000})
sub['predicted_age'] = np.argmax(prob_age, axis=1) + 1

sub['prob_gender'] = prob_gender
sub['predicted_gender'] = sub['prob_gender'].apply(lambda x: 2 if x > 0.5 else 1)
sub.drop(['prob_gender'], axis=1, inplace=True)

sub.to_csv('../../submissions.csv', index=False)
--------------------------------------------------------------------------------
/src/keras/f13_merge_fold_results.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# coding: utf-8

#############################################################################
# Merge the five fold runs into a single oof probability array and an
# averaged sub (test) probability array
#############################################################################

import sys

import warnings
warnings.simplefilter('ignore')

import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score

# the argument is a model tag such as age_m3_keras or gender_m1_keras
name = sys.argv[1]

probs_pth = '../../probs'

# fold4 validated on rows [:600000], fold3 on [600000:1200000], ..., fold0 on
# [2400000:3000000], so concatenating folds 4,3,2,1,0 restores the user order
oof_probs = np.concatenate([np.load(f'{probs_pth}/oof_{name}_{i}.npy')
                            for i in (4, 3, 2, 1, 0)], axis=0)

# the test predictions are simply averaged over the five folds
test_probs = np.mean([np.load(f'{probs_pth}/sub_{name}_{i}.npy')
                      for i in range(5)], axis=0)

np.save(f'{probs_pth}/oof_{name}', oof_probs)
np.save(f'{probs_pth}/sub_{name}', test_probs)
--------------------------------------------------------------------------------
/src/keras/f1_save_data.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# coding: utf-8


import warnings
warnings.simplefilter('ignore')

import numpy as np
import pandas as pd
pd.set_option('max_columns', None)
pd.set_option('max_rows', 1000)
pd.set_option('float_format', lambda x: '%.6f' % x)

import pickle
import gc
import logging

from tqdm.autonotebook import *

import gensim
from gensim.models import FastText, Word2Vec


user_train_1 = pd.read_csv('../../raw_data/train_preliminary/user.csv')
user_train_2 = pd.read_csv('../../raw_data/train_semi_final/user.csv')
user_train = pd.concat([user_train_1, user_train_2])

click_train_1 = pd.read_csv('../../raw_data/train_preliminary/click_log.csv')
click_train_2 = pd.read_csv('../../raw_data/train_semi_final/click_log.csv')
click_train = pd.concat([click_train_1, click_train_2])
click_test = pd.read_csv('../../raw_data/test/click_log.csv')

ad_train_1 = pd.read_csv('../../raw_data/train_preliminary/ad.csv')
ad_train_2 = pd.read_csv('../../raw_data/train_semi_final/ad.csv')
ad_train = pd.concat([ad_train_1, ad_train_2])
ad_test = pd.read_csv('../../raw_data/test/ad.csv')

del user_train_1, user_train_2
del click_train_1, click_train_2
del ad_train_1, ad_train_2
gc.collect()


df_train = pd.merge(click_train, ad_train, on='creative_id')
df_test = pd.merge(click_test, ad_test, on='creative_id')

df_train = df_train.drop_duplicates()
df_test = df_test.drop_duplicates()

del click_train, click_test, ad_train, ad_test
gc.collect()


df_train = pd.merge(df_train, user_train, on='user_id')

df = pd.concat([df_train, df_test])

del df_train, df_test
gc.collect()


df = df.sort_values(by=['user_id', 'time', 'click_times'], ascending=[True, True, True]).reset_index(drop=True)


df.to_pickle('../../data/keras/df_user_click_ad.pickle')
--------------------------------------------------------------------------------
/src/keras/f2_save_sequence.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# coding: utf-8

#############################################################################
# Generate the per-user sequence data for each id field
#############################################################################

import warnings
warnings.simplefilter('ignore')

import numpy as np
import pandas as pd
pd.set_option('max_columns', 1000)
pd.set_option('max_rows', None)
pd.set_option('display.float_format', lambda x: '%.6f' % x)

from pandarallel import pandarallel
pandarallel.initialize()

from gensim.models import Word2Vec

from tqdm import tqdm
tqdm.pandas()

import gc


df = pd.read_pickle('../../data/keras/df_user_click_ad.pickle')


# click_times
df_clicktimes = df.groupby('user_id')['click_times'].agg(clks=list)
df_clicktimes = df_clicktimes.reset_index(drop=True)

df_clicktimes.to_pickle('../../data/keras/df_clicktimes_sequence.pickle')


# creative_id
df_creative = df.groupby('user_id')['creative_id'].agg(cids=list)
df_creative = df_creative.reset_index(drop=True)

df_creative.to_pickle('../../data/keras/df_creative_sequence.pickle')


# advertiser_id
df_advertiser = df.groupby('user_id')['advertiser_id'].agg(advids=list)
df_advertiser = df_advertiser.reset_index(drop=True)

df_advertiser.to_pickle('../../data/keras/df_advertiser_sequence.pickle')


# ad_id
df_ad = df.groupby('user_id')['ad_id'].agg(aids=list)
df_ad = df_ad.reset_index(drop=True)

df_ad.to_pickle('../../data/keras/df_ad_sequence.pickle')


# industry
df_industry = df.groupby('user_id')['industry'].agg(industry=list)
df_industry = df_industry.reset_index(drop=True)

df_industry.to_pickle('../../data/keras/df_industry_sequence.pickle')


# product
df_product = df.groupby('user_id')['product_id'].agg(pids=list)
df_product = df_product.reset_index(drop=True)

df_product.to_pickle('../../data/keras/df_product_sequence.pickle')
--------------------------------------------------------------------------------
/src/keras/f3_save_embeddings.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# coding: utf-8


import warnings
warnings.simplefilter('ignore')

import numpy as np
import pandas as pd
pd.set_option('max_columns', None)
pd.set_option('max_rows', 1000)

import pickle
import gc
import logging

from tqdm.autonotebook import *

import gensim
from gensim.models import FastText, Word2Vec

from keras.preprocessing import text, sequence
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences


window = 8
max_len = 100
min_count = 3
iter_ = 10
emb_dim_cid = 128
emb_dim_aid = 128
emb_dim_advid = 128
emb_dim_pid = 128
emb_dim_ind = 128
emb_dim_clk = 128


# the tokenised sequences (x_*) returned here are rebuilt in the training
# scripts; this file only persists the word2vec vectors
def set_tokenizer(docs, split_char=' '):
    tokenizer = Tokenizer(lower=False, char_level=False, split=split_char)
    tokenizer.fit_on_texts(docs)
    X = tokenizer.texts_to_sequences(docs)
    maxlen = max_len
    X = pad_sequences(X, maxlen=maxlen, value=0)
    word_index = tokenizer.word_index
    return X, word_index


def train_save_word2vec(sentences, emb_dim, save_name='w2v.txt', split_char=' '):
    input_docs = []
    for i in sentences:
        input_docs.append([ii for ii in i])
    logging.basicConfig(
        format='%(asctime)s:%(levelname)s:%(message)s', level=logging.INFO
    )
    w2v = Word2Vec(input_docs,
                   size=emb_dim,
                   sg=1,
                   window=window,
                   seed=2020,
                   workers=18,
                   min_count=min_count,
                   iter=iter_)
    w2v.wv.save_word2vec_format(save_name)
    return w2v


def get_embedding_matrix(word_index, embed_size=128, Emed_path="w2v_300.txt"):
    embeddings_index = gensim.models.KeyedVectors.load_word2vec_format(
        Emed_path, binary=False)
    nb_words = len(word_index)+1
    embedding_matrix = np.zeros((nb_words, embed_size))
    count = 0
    for word, i in tqdm(word_index.items()):
        if i >= nb_words:
            continue
        try:
            embedding_vector = embeddings_index[word]
        except:
            # tokens below min_count have no vector; fall back to zeros
            embedding_vector = np.zeros(embed_size)
            count += 1
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    print("null cnt", count)
    return embedding_matrix



# click_times
df = pd.read_pickle('../../data/keras/df_clicktimes_sequence.pickle')
clk_list = list(df['clks'])

for i in tqdm(range(0, len(clk_list))):
    clk_list[i] = [str(ii) for ii in clk_list[i]]

x_clk, index_clk = set_tokenizer(clk_list, split_char=',')
train_save_word2vec(clk_list,
                    emb_dim_clk,
                    save_name=f'../../w2v_models/clk_w2v_{emb_dim_clk}_win{window}_iter{iter_}_mincount{min_count}.txt',
                    split_char=',')

del df
gc.collect()

# creative_id
df = pd.read_pickle('../../data/keras/df_creative_sequence.pickle')
cid_list = list(df['cids'])

for i in tqdm(range(0, len(cid_list))):
    cid_list[i] = [str(ii) for ii in cid_list[i]]

x_cid, index_cid = set_tokenizer(cid_list, split_char=',')
train_save_word2vec(cid_list,
                    emb_dim_cid,
                    save_name=f'../../w2v_models/cid_w2v_{emb_dim_cid}_win{window}_iter{iter_}_mincount{min_count}.txt',
                    split_char=',')

del df
gc.collect()


# advertiser_id
df = pd.read_pickle('../../data/keras/df_advertiser_sequence.pickle')
advid_list = list(df['advids'])

for i in tqdm(range(0, len(advid_list))):
    advid_list[i] = [str(ii) for ii in advid_list[i]]

x_advid, index_advid = set_tokenizer(advid_list, split_char=',')
train_save_word2vec(advid_list,
                    emb_dim_advid,
                    save_name=f'../../w2v_models/advid_w2v_{emb_dim_advid}_win{window}_iter{iter_}_mincount{min_count}.txt',
                    split_char=',')

del df
gc.collect()


# product_id
df = pd.read_pickle('../../data/keras/df_product_sequence.pickle')
pid_list = list(df['pids'])

for i in tqdm(range(0, len(pid_list))):
    pid_list[i] = [str(ii) for ii in pid_list[i]]

x_pid, index_pid = set_tokenizer(pid_list, split_char=',')
train_save_word2vec(pid_list,
                    emb_dim_pid,
                    save_name=f'../../w2v_models/pid_w2v_{emb_dim_pid}_win{window}_iter{iter_}_mincount{min_count}.txt',
                    split_char=',')

del df
gc.collect()


# ad_id
df = pd.read_pickle('../../data/keras/df_ad_sequence.pickle')
ad_list = list(df['aids'])

for i in tqdm(range(0, len(ad_list))):
    ad_list[i] = [str(ii) for ii in ad_list[i]]

x_adid, index_adid = set_tokenizer(ad_list, split_char=',')
train_save_word2vec(ad_list,
                    emb_dim_aid,
                    save_name=f'../../w2v_models/adid_w2v_{emb_dim_aid}_win{window}_iter{iter_}_mincount{min_count}.txt',
                    split_char=',')

del df
gc.collect()


# industry
df = pd.read_pickle('../../data/keras/df_industry_sequence.pickle')
indid_list = list(df['industry'])

for i in tqdm(range(0, len(indid_list))):
    indid_list[i] = [str(ii) for ii in indid_list[i]]

x_indid, index_indid = set_tokenizer(indid_list, split_char=',')
train_save_word2vec(indid_list,
                    emb_dim_ind,
                    save_name=f'../../w2v_models/indid_w2v_{emb_dim_ind}_win{window}_iter{iter_}_mincount{min_count}.txt',
                    split_char=',')
--------------------------------------------------------------------------------
/src/keras/f7_AGE_m6_transformer_lstm_2inputs_train_fold.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# coding: utf-8

#################################################################################
# AGE model 6: Keras transformer + LSTM, 2 inputs
# score:
#   5-fold: 0.50463 (offline)
# training time: ~3 days
#################################################################################

import numpy as np
import pandas as pd

import sys
import time
import pickle
import gc
import logging

from tqdm import tqdm

import gensim
from gensim.models import FastText, Word2Vec

import keras
from keras import layers
from keras import callbacks

from keras.preprocessing import text, sequence
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences

from keras_multi_head import MultiHead, MultiHeadAttention
from keras_self_attention import SeqSelfAttention
from keras_position_wise_feed_forward import FeedForward
from keras_layer_normalization import LayerNormalization


fold = sys.argv[1]

max_len = 120
emb_dim_cid = 128
emb_dim_advid = 128

batch_size = 1024


def set_tokenizer(docs, split_char=' '):
    tokenizer = Tokenizer(lower=False, char_level=False, split=split_char)
    tokenizer.fit_on_texts(docs)
    X = tokenizer.texts_to_sequences(docs)
    maxlen = max_len
    X = pad_sequences(X, maxlen=maxlen, value=0)
    word_index = tokenizer.word_index
    return X, word_index


def get_embedding_matrix(word_index, embed_size=128, Emed_path="w2v_300.txt"):
    embeddings_index = gensim.models.KeyedVectors.load_word2vec_format(
        Emed_path, binary=False)
    nb_words = len(word_index)+1
    embedding_matrix = np.zeros((nb_words, embed_size))
    count = 0
    for word, i in word_index.items():
        if i >= nb_words:
            continue
        try:
            embedding_vector = embeddings_index[word]
        except:
            embedding_vector = np.zeros(embed_size)
            count += 1
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix



print("loading sequence data and embedding")
start_time = time.time()


print("loading creative id")
df = pd.read_pickle('../../data/keras/df_creative_sequence.pickle')
cid_list = list(df['cids'])
for i in range(0, len(cid_list)):
    cid_list[i] = [str(ii) for ii in cid_list[i]]

x_cid, index_cid = set_tokenizer(cid_list, split_char=',')
emb_cid = get_embedding_matrix(index_cid,
                               embed_size=emb_dim_cid,
                               Emed_path='../../w2v_models/cid_w2v_128_win8_iter10_mincount3.txt')
del df, cid_list, index_cid
gc.collect()


print("loading advertiser id")
df = pd.read_pickle('../../data/keras/df_advertiser_sequence.pickle')
advid_list = list(df['advids'])
for i in range(0, len(advid_list)):
    advid_list[i] = [str(ii) for ii in advid_list[i]]

x_advid, index_advid = set_tokenizer(advid_list, split_char=',')
emb_advid = get_embedding_matrix(index_advid,
                                 embed_size=emb_dim_advid,
                                 Emed_path='../../w2v_models/advid_w2v_128_win8_iter10_mincount3.txt')
del df, advid_list, index_advid
gc.collect()


used_minutes = (time.time() - start_time) / 60
print(f"done, used {used_minutes} minutes")


print("loading labels")
start_time = time.time()

labels_1 = pd.read_csv('../../raw_data/train_preliminary/user.csv')
labels_2 = pd.read_csv('../../raw_data/train_semi_final/user.csv')
labels = pd.concat([labels_1, labels_2])
labels['age'] = labels['age'] - 1
labels['gender'] = labels['gender'] - 1

used_minutes = (time.time() - start_time) / 60
print(f"done, used {used_minutes} minutes")


print("split train, valid and test data")
start_time = time.time()

y = keras.utils.to_categorical(labels['age'])

# the 3,000,000 training users are split into five contiguous blocks of
# 600,000; each fold holds one block out for validation
if fold == "fold0":
    train_cid = x_cid[:2400000]
    valid_cid = x_cid[2400000:3000000]
    train_advid = x_advid[:2400000]
    valid_advid = x_advid[2400000:3000000]
    y_train = y[:2400000]
    y_valid = y[2400000:]
elif fold == "fold1":
    train_cid = np.concatenate((x_cid[:1800000], x_cid[2400000:3000000]), axis=0)
    valid_cid = x_cid[1800000:2400000]
    train_advid = np.concatenate((x_advid[:1800000], x_advid[2400000:3000000]), axis=0)
    valid_advid = x_advid[1800000:2400000]
    y_train = np.concatenate((y[:1800000], y[2400000:3000000]))
    y_valid = y[1800000:2400000]
elif fold == "fold2":
    train_cid = np.concatenate((x_cid[:1200000], x_cid[1800000:3000000]), axis=0)
    valid_cid = x_cid[1200000:1800000]
    train_advid = np.concatenate((x_advid[:1200000], x_advid[1800000:3000000]), axis=0)
    valid_advid = x_advid[1200000:1800000]
    y_train = np.concatenate((y[:1200000], y[1800000:3000000]))
    y_valid = y[1200000:1800000]
elif fold == "fold3":
    train_cid = np.concatenate((x_cid[:600000], x_cid[1200000:3000000]), axis=0)
    valid_cid = x_cid[600000:1200000]
    train_advid = np.concatenate((x_advid[:600000], x_advid[1200000:3000000]), axis=0)
    valid_advid = x_advid[600000:1200000]
    y_train = np.concatenate((y[:600000], y[1200000:3000000]))
    y_valid = y[600000:1200000]
elif fold == "fold4":
    train_cid = x_cid[600000:3000000]
    valid_cid = x_cid[:600000]
    train_advid = x_advid[600000:3000000]
    valid_advid = x_advid[:600000]
    y_train = y[600000:3000000]
    y_valid = y[:600000]
else:
    raise ValueError(f"unknown fold: {fold}")

test_cid = x_cid[3000000:]
test_advid = x_advid[3000000:]

del x_cid, x_advid
del y
gc.collect()

print(train_cid.shape, valid_cid.shape, test_cid.shape)
print(train_advid.shape, valid_advid.shape, test_advid.shape)
print(y_train.shape, y_valid.shape)

used_minutes = (time.time() - start_time) / 60
print(f"done, used {used_minutes} minutes")


print("building model")

start_time = time.time()

def build_model(emb_cid, emb_advid):

    inp1 = layers.Input(shape=(max_len,))
    inp2 = layers.Input(shape=(max_len,))

    # frozen pretrained w2v embeddings for both id sequences
    emb1 = layers.Embedding(
        input_dim=emb_cid.shape[0],
        output_dim=emb_cid.shape[1],
        input_length=max_len,
        weights=[emb_cid],
        trainable=False
    )(inp1)
    emb2 = layers.Embedding(
        input_dim=emb_advid.shape[0],
        output_dim=emb_advid.shape[1],
        input_length=max_len,
        weights=[emb_advid],
        trainable=False
    )(inp2)

    sdrop = layers.SpatialDropout1D(rate=0.2)

    emb1 = sdrop(emb1)
    emb2 = sdrop(emb2)

    content = layers.Concatenate()([emb1, emb2])

    # one transformer block: multi-head attention + feed-forward,
    # each with a residual connection and layer normalization
    mha = MultiHeadAttention(head_num=16)(content)
    mha = layers.Dropout(0.01)(mha)
    mha = layers.Add()([content, mha])
    mha = LayerNormalization()(mha)
    mha = layers.Dropout(0.01)(mha)
    mha_ff = FeedForward(256)(mha)
    mha_out = layers.Add()([mha, mha_ff])
    mha_out = LayerNormalization()(mha_out)

    lstm = layers.Bidirectional(layers.LSTM(128, return_sequences=True))(mha_out)

    avg_pool = layers.GlobalAveragePooling1D()(lstm)
    max_pool = layers.GlobalMaxPool1D()(lstm)

    x = layers.Concatenate()([avg_pool, max_pool])

    x = layers.Dense(128, activation='relu')(x)
    x = layers.BatchNormalization()(x)

    x = layers.Dense(64, activation='relu')(x)
    x = layers.BatchNormalization()(x)

    x = layers.Dropout(0.1)(x)

    out = layers.Dense(10, activation='softmax')(x)
    model = keras.Model(inputs=[inp1, inp2], outputs=out)
    model.compile(loss='categorical_crossentropy',
                  optimizer=keras.optimizers.Adam(1e-3),
                  metrics=['accuracy'])

    return model


model = build_model(emb_cid, emb_advid)

used_minutes = (time.time() - start_time) / 60
print(f"done, used {used_minutes} minutes")



checkpoint = callbacks.ModelCheckpoint(f'../../models/age_m6_{fold}.h5',
                                       monitor='val_accuracy',
                                       verbose=1,
                                       save_best_only=True,
                                       mode='max',
                                       save_weights_only=True)

reduce_lr = callbacks.ReduceLROnPlateau(monitor='val_accuracy',
                                        factor=0.1,
                                        patience=2,
                                        verbose=1,
                                        mode='max',
                                        min_delta=1e-6)

early_stop = callbacks.EarlyStopping(monitor='val_accuracy',
                                     mode='max',
                                     patience=5)


hist = model.fit([train_cid, train_advid],
                 y_train,
                 batch_size=batch_size,
                 epochs=100,
                 validation_data=([valid_cid, valid_advid], y_valid),
                 callbacks=[checkpoint, reduce_lr, early_stop],
                 verbose=1,
                 shuffle=True)


acc = max(hist.history['val_accuracy'])
print(acc)


print("predict start")
start_time = time.time()

model.load_weights(f'../../models/age_m6_{fold}.h5')
preds = model.predict([test_cid, test_advid],
                      batch_size=batch_size,
                      verbose=1)

np.save(f'../../probs/sub_age_m6_keras_{fold}', preds)

used_minutes = (time.time() - start_time) / 60
print(f"done, used {used_minutes} minutes")



print("save oof start")
start_time = time.time()

valid_preds = model.predict([valid_cid, valid_advid],
                            batch_size=batch_size,
                            verbose=1)

np.save(f'../../probs/oof_age_m6_keras_{fold}', valid_preds)

used_minutes = (time.time() - start_time) / 60
print(f"done, used {used_minutes} minutes")
--------------------------------------------------------------------------------
/src/lgb/f1_save_tfidf_countvec.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# coding: utf-8

########################################################################
# Sort the click logs by user_id, time and click_times, aggregate them
# into per-user click sequences, then build TF-IDF and CountVectorizer
# features
########################################################################

import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

import os
import json
import gc
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, KFold, RepeatedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler as std
from sklearn.kernel_ridge import KernelRidge
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import roc_auc_score

import time
import datetime
from datetime import datetime, timedelta

from scipy.signal import hilbert
from scipy.signal import hann
from scipy.signal import convolve
from scipy import stats
from scipy import sparse
import scipy.spatial.distance as dist

from collections import Counter
from statistics import mode

import math
from itertools import product
import ast


# reduce dataframe memory usage by downcasting numeric columns
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

path = '../../raw_data/'

train_log1 = pd.read_csv(path+'train_preliminary/click_log.csv')
train_log2 = pd.read_csv(path+'train_semi_final/click_log.csv')
train_log = pd.concat([train_log1, train_log2])
train_ad1 = pd.read_csv(path+'train_preliminary/ad.csv')
train_ad2 = pd.read_csv(path+'train_semi_final/ad.csv')
train_ad = pd.concat([train_ad1, train_ad2])
train_log = pd.merge(train_log, train_ad, on='creative_id')
train_log = train_log.drop_duplicates()

test_log = pd.read_csv(path+'test/click_log.csv')
test_ad = pd.read_csv(path+'test/ad.csv')
test_log = pd.merge(test_log, test_ad, on='creative_id')

df_log = pd.concat([train_log, test_log])
df_log = df_log.drop_duplicates()
data = df_log.sort_values(by=['user_id', 'time', 'click_times'], ascending=[True, True, True]).reset_index(drop=True)


# each click becomes one token string with a prefix per id field
data['user_items'] = ('tim' + data['time'].astype(str) + ','
                      + 'crea' + data['creative_id'].astype(str) + ','
                      + 'ad' + data['ad_id'].astype(str) + ','
                      + 'prodd' + data['product_id'].astype(str) + ','
                      + 'proc' + data['product_category'].astype(str) + ','
                      + 'adv' + data['advertiser_id'].astype(str) + ','
                      + 'ind' + data['industry'].astype(str) + ',')


train_user1 = pd.read_csv(path+'train_preliminary/user.csv')
train_user2 = pd.read_csv(path+'train_semi_final/user.csv')
train_user = train_user1.append(train_user2).reset_index(drop=True)
train_user = train_user.drop_duplicates(['user_id'])
train_user = train_user.sort_values(by=['user_id'])


df_tmp = data.groupby('user_id')['user_items'].agg(list)
df_tmp = pd.DataFrame(df_tmp)
df_tmp['user_id'] = df_tmp.index
df_tmp = df_tmp.reset_index(drop=True)


train_uid = train_user[['user_id', 'age', 'gender']]
test_uid = pd.DataFrame(list(set(test_log['user_id'])))
test_uid.columns = ['user_id']

# #### TF-IDF features

os.system('mkdir -pv ../../data/lgb/tf_idf_feats')

df_tmp['text'] = df_tmp['user_items'].apply(lambda x: " ".join([str(i) for i in x]))
txt1 = data.groupby('user_id')['user_items'].apply(lambda x: " ".join(x)).reset_index()['user_items']
X = list(txt1.values)
tfv = TfidfVectorizer(min_df=35)
tfv.fit(X)

train_uid = train_uid.merge(df_tmp, on='user_id', how='left')
test_uid = test_uid.merge(df_tmp, on='user_id', how='left')

# save under data/lgb (the directory created above) so f4 and f5 can find them
traintext_tfidf = tfv.transform(train_uid['text'].values)
sparse.save_npz('../../data/lgb/tf_idf_feats/traintext_tfidf3.npz', traintext_tfidf)
testtext_tfidf = tfv.transform(test_uid['text'].values)
sparse.save_npz('../../data/lgb/tf_idf_feats/testtext_tfidf3.npz', testtext_tfidf)

# #### CountVectorizer features

os.system('mkdir -pv ../../data/lgb/countvec_feats')

cv = CountVectorizer(min_df=30)
cv.fit(df_tmp['text'])

train_ta = cv.transform(train_uid['text'])
sparse.save_npz('../../data/lgb/countvec_feats/traintext_countvec2.npz', train_ta)
test_ta = cv.transform(test_uid['text'])
sparse.save_npz('../../data/lgb/countvec_feats/testtext_countvec2.npz', test_ta)
--------------------------------------------------------------------------------
/src/lgb/f2_save_target_encoding.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# coding: utf-8

#####################################################################
# Build target-encoding features
#####################################################################

import warnings
warnings.simplefilter('ignore')

import pandas as pd
import numpy as np

pd.set_option('max_columns', 500)
pd.set_option('max_rows', 500)
pd.set_option('display.float_format', lambda x: '%.4f' % x)

from tqdm import tqdm
tqdm.pandas()

import os
import gc
from sklearn.model_selection import KFold


user_train_1 = pd.read_csv('../../raw_data/train_preliminary/user.csv')
user_train_2 = pd.read_csv('../../raw_data/train_semi_final/user.csv')
user_train = pd.concat([user_train_1, user_train_2])

click_train_1 = pd.read_csv('../../raw_data/train_preliminary/click_log.csv')
click_train_2 = pd.read_csv('../../raw_data/train_semi_final/click_log.csv')
click_train = pd.concat([click_train_1, click_train_2])
click_test = pd.read_csv('../../raw_data/test/click_log.csv')

ad_train_1 = pd.read_csv('../../raw_data/train_preliminary/ad.csv')
ad_train_2 = pd.read_csv('../../raw_data/train_semi_final/ad.csv')
ad_train = pd.concat([ad_train_1, ad_train_2])
ad_test = pd.read_csv('../../raw_data/test/ad.csv')

del user_train_1, user_train_2
del click_train_1, click_train_2
del ad_train_1, ad_train_2
gc.collect()


df_train = pd.merge(click_train, ad_train, on='creative_id')
df_test = pd.merge(click_test, ad_test, on='creative_id')

df_train = df_train.drop_duplicates()
df_test = df_test.drop_duplicates()

del click_train, click_test, ad_train, ad_test
gc.collect()


df_train = pd.merge(df_train, user_train, on='user_id')

df = pd.concat([df_train, df_test])

del df_train, df_test
gc.collect()


df = df.sort_values(by=['user_id', 'time', 'click_times'], ascending=[True, True, True]).reset_index(drop=True)

# target encoding

def stat(df, df_merge, group_by, agg):
    group = df.groupby(group_by).agg(agg)

    columns = []
    for on, methods in agg.items():
        for method in methods:
            columns.append('{}_{}_{}'.format('_'.join(group_by), on, method))
    group.columns = columns
    group.reset_index(inplace=True)
    df_merge = df_merge.merge(group, on=group_by, how='left')

    del (group)
    gc.collect()
    return df_merge


# out-of-fold target encoding: statistics come from df_know only, so no row
# sees its own label
def statis_feat(df_know, df_unknow):
    df_unknow = stat(df_know, df_unknow, ['ad_id'], {'age': ['mean', 'std'], 'gender': ['mean', 'std']})
    df_unknow = stat(df_know, df_unknow, ['creative_id'], {'age': ['mean', 'std'], 'gender': ['mean', 'std']})
    df_unknow = stat(df_know, df_unknow, ['advertiser_id'], {'age': ['mean', 'std'], 'gender': ['mean', 'std']})
    df_unknow = stat(df_know, df_unknow, ['product_id'], {'age': ['mean', 'std'], 'gender': ['mean', 'std']})
    df_unknow = stat(df_know, df_unknow, ['industry'], {'age': ['mean', 'std'], 'gender': ['mean', 'std']})
    df_unknow = stat(df_know, df_unknow, ['product_category'], {'age': ['mean', 'std'], 'gender': ['mean', 'std']})

    return df_unknow


df_train = df[~df['age'].isnull()]
df_train = df_train.reset_index(drop=True)
df_test = df[df['age'].isnull()]

df_stas_feat = None
kf = KFold(n_splits=5, random_state=2020, shuffle=True)
for train_index, val_index in kf.split(df_train):
    df_fold_train = df_train.iloc[train_index]
    df_fold_val = df_train.iloc[val_index]

    df_fold_val = statis_feat(df_fold_train, df_fold_val)
    df_stas_feat = pd.concat([df_stas_feat, df_fold_val], axis=0)

    del(df_fold_train)
    del(df_fold_val)
    gc.collect()


df_test = statis_feat(df_train, df_test)
df = pd.concat([df_stas_feat, df_test], axis=0)

del(df_stas_feat)
del(df_train)
del(df_test)
gc.collect()


df_ = df[[col for col in df.columns if col not in ['time', 'creative_id', 'click_times',
                                                   'ad_id', 'product_id', 'product_category',
                                                   'advertiser_id', 'industry', 'age', 'gender']]]

gc.collect()

# aggregate the click-level encodings up to one row per user
for col in tqdm(['ad_id', 'creative_id', 'advertiser_id',
                 'product_id', 'industry', 'product_category']):
    for method in ['mean', 'std']:
        df_[f'{col}_age_{method}_mean'] = df_.groupby('user_id')[f'{col}_age_{method}'].transform('mean')
        df_[f'{col}_gender_{method}_mean'] = df_.groupby('user_id')[f'{col}_gender_{method}'].transform('mean')


cols = [col for col in df_.columns if col.endswith('_mean_mean')] + [col for col in df_.columns if col.endswith('_std_mean')]

df_ = df_[['user_id', 'ad_id_age_mean_mean', 'ad_id_gender_mean_mean', 'ad_id_age_std_mean',
           'ad_id_gender_std_mean', 'creative_id_age_mean_mean',
           'creative_id_gender_mean_mean', 'creative_id_age_std_mean',
           'creative_id_gender_std_mean', 'advertiser_id_age_mean_mean',
           'advertiser_id_gender_mean_mean', 'advertiser_id_age_std_mean',
           'advertiser_id_gender_std_mean', 'product_id_age_mean_mean',
           'product_id_gender_mean_mean', 'product_id_age_std_mean',
           'product_id_gender_std_mean', 'industry_age_mean_mean',
           'industry_gender_mean_mean', 'industry_age_std_mean',
           'industry_gender_std_mean', 'product_category_age_mean_mean',
           'product_category_gender_mean_mean', 'product_category_age_std_mean',
           'product_category_gender_std_mean']].drop_duplicates(subset=['user_id'])

os.system('mkdir -pv ../../data/lgb/te_feats')

df_.to_pickle('../../data/lgb/te_feats/df_user_target_encoding.pickle')
-------------------------------------------------------------------------------- /src/lgb/f4_save_GENDER_tf_idf_stacking_feats.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | ########################################################################## 5 | # 用 TF-IDF 特征训练 LR model 生成概率当做后面 LGB 的 stacking 特征 6 | ########################################################################## 7 | 8 | import warnings 9 | warnings.filterwarnings("ignore") 10 | 11 | import pandas as pd 12 | import numpy as np 13 | 14 | import sys 15 | import os 16 | import json 17 | import gc 18 | from tqdm import tqdm 19 | 20 | from sklearn.model_selection import train_test_split 21 | from sklearn.model_selection import StratifiedKFold, KFold, RepeatedKFold 22 | from sklearn.preprocessing import LabelEncoder 23 | from sklearn.preprocessing import StandardScaler 24 | from sklearn.metrics import mean_absolute_error 25 | from sklearn.metrics import accuracy_score 26 | from sklearn.linear_model import LinearRegression 27 | from sklearn.preprocessing import StandardScaler as std 28 | from sklearn.kernel_ridge import KernelRidge 29 | from sklearn.feature_extraction.text import TfidfVectorizer 30 | from sklearn.feature_extraction.text import CountVectorizer 31 | from sklearn.decomposition import TruncatedSVD 32 | 33 | from sklearn.linear_model import LogisticRegression 34 | from sklearn.linear_model import SGDClassifier 35 | from sklearn.linear_model import PassiveAggressiveClassifier 36 | from sklearn.linear_model import RidgeClassifier 37 | from sklearn.naive_bayes import BernoulliNB 38 | from sklearn.naive_bayes import MultinomialNB 39 | from sklearn.svm import LinearSVC 40 | 41 | from sklearn.metrics import f1_score 42 | from sklearn.metrics import precision_score, recall_score 43 | from sklearn.metrics import roc_auc_score 44 | from sklearn.metrics import mean_squared_error 45 | 46 | import time 47 | import datetime 48 | from datetime import datetime, timedelta 49 | 50 | from scipy.signal import hilbert 51 | from scipy.signal import hann 52 | from scipy.signal import convolve 53 | from scipy import stats 54 | from scipy import sparse 55 | import scipy.spatial.distance as dist 56 | 57 | from collections import Counter 58 | from statistics import mode 59 | 60 | import math 61 | from itertools import product 62 | import ast 63 | 64 | 65 | def reduce_mem_usage(df, verbose=True): 66 | numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64'] 67 | start_mem = df.memory_usage().sum() / 1024**2 68 | for col in df.columns: 69 | col_type = df[col].dtypes 70 | if col_type in numerics: 71 | c_min = df[col].min() 72 | c_max = df[col].max() 73 | if str(col_type)[:3] == 'int': 74 | if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max: 75 | df[col] = df[col].astype(np.int8) 76 | elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max: 77 | df[col] = df[col].astype(np.int16) 78 | elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max: 79 | df[col] = df[col].astype(np.int32) 80 | elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max: 81 | df[col] = df[col].astype(np.int64) 82 | else: 83 | if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max: 84 | df[col] = df[col].astype(np.float16) 85 | elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max: 86 | df[col] = df[col].astype(np.float32) 87 | else: 88 | df[col] = df[col].astype(np.float64) 
89 | end_mem = df.memory_usage().sum() / 1024**2 90 | if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem)) 91 | return df 92 | 93 | 94 | path = '../../raw_data/' 95 | 96 | train_user1 = pd.read_csv(path+'train_preliminary/user.csv') 97 | train_user2 = pd.read_csv(path+'train_semi_final/user.csv') 98 | train_user = train_user1.append(train_user2).reset_index(drop=True) 99 | train_user = train_user.drop_duplicates(['user_id']) 100 | train_user = train_user.sort_values(by=['user_id']) 101 | test_click = pd.read_csv(path+'test/click_log.csv') 102 | 103 | train_uid= train_user[['user_id','age','gender']] 104 | test_uid= pd.DataFrame(list(set(test_click['user_id']))) 105 | test_uid.columns=['user_id'] 106 | 107 | 108 | train_x = pd.DataFrame() 109 | test_x = pd.DataFrame() 110 | 111 | 112 | train_tfidf= sparse.load_npz('./tf_idf_feats/traintext_tfidf3.npz') 113 | test_tfidf= sparse.load_npz('./tf_idf_feats/testtext_tfidf3.npz') 114 | 115 | 116 | train_data = sparse.hstack((train_x, train_tfidf), 'csr') 117 | test_data = sparse.hstack((test_x, test_tfidf), 'csr') 118 | 119 | 120 | 121 | y2 = train_uid[['gender']] 122 | all_id = train_uid.append(test_uid).reset_index(drop=True) 123 | 124 | 125 | print('开始进行一些前期处理') 126 | train_feature = train_data 127 | test_feature = test_data 128 | # 五则交叉验证 129 | n_folds = 5 130 | print('处理完毕') 131 | df_stack = pd.DataFrame() 132 | df_stack['user_id']=all_id['user_id'] 133 | seed = 1017 134 | folds = StratifiedKFold(n_splits=5, random_state=seed, shuffle=False) 135 | 136 | for label in ['gender']: 137 | score = y2[label]-1 138 | 139 | ########################### lr(LogisticRegression) ################################ 140 | print('lr stacking') 141 | stack_train = np.zeros((len(train_uid), 1)) 142 | stack_test = np.zeros((len(test_uid), 1)) 143 | score_va = 0 144 | for i, (tr, va) in enumerate(folds.split(train_feature,score)): 145 | print('stack:%d/%d' % ((i + 1), n_folds)) 146 | clf = LogisticRegression(random_state=1017, C=8) 147 | clf.fit(train_feature[tr], score[tr]) 148 | score_va = clf.predict_proba(train_feature[va])[:,1] 149 | 150 | score_te = clf.predict_proba(test_feature)[:,1] 151 | print('得分' + str(mean_squared_error(score[va], clf.predict(train_feature[va])))) 152 | stack_train[va,0] = score_va 153 | stack_test[:,0]+= score_te 154 | stack_test /= n_folds 155 | 156 | stack = np.vstack([stack_train, stack_test]) 157 | 158 | 159 | df_stack['pack_tfidf_vec_lr_classfiy_{}'.format(label)] = stack[:, 0] 160 | 161 | 162 | ########################### SGD(随机梯度下降) ################################ 163 | print('sgd stacking') 164 | stack_train = np.zeros((len(train_uid), 1)) 165 | stack_test = np.zeros((len(test_uid), 1)) 166 | score_va = 0 167 | 168 | for i, (tr, va) in enumerate(folds.split(train_feature,score)): 169 | print('stack:%d/%d' % ((i + 1), n_folds)) 170 | sgd = SGDClassifier(random_state=1017, loss='log') 171 | sgd.fit(train_feature[tr], score[tr]) 172 | score_va = sgd.predict_proba(train_feature[va])[:,1] 173 | score_te = sgd.predict_proba(test_feature)[:,1] 174 | print('得分' + str(mean_squared_error(score[va], sgd.predict(train_feature[va])))) 175 | stack_train[va,0] = score_va 176 | stack_test[:,0]+= score_te 177 | stack_test /= n_folds 178 | stack = np.vstack([stack_train, stack_test]) 179 | 180 | df_stack['pack_tfidf_vec_sgd_classfiy_{}'.format(label)] = stack[:, 0] 181 | 182 | ########################### pac(PassiveAggressiveClassifier) 
################################ 183 | print('sgd stacking') 184 | stack_train = np.zeros((len(train_uid), 1)) 185 | stack_test = np.zeros((len(test_uid), 1)) 186 | score_va = 0 187 | for i, (tr, va) in enumerate(folds.split(train_feature,score)): 188 | print('stack:%d/%d' % ((i + 1), n_folds)) 189 | pac = PassiveAggressiveClassifier(random_state=1017) 190 | pac.fit(train_feature[tr], score[tr]) 191 | score_va = pac._predict_proba_lr(train_feature[va])[:,1] 192 | score_te = pac._predict_proba_lr(test_feature)[:,1] 193 | print(score_va) 194 | print('得分' + str(mean_squared_error(score[va], pac.predict(train_feature[va])))) 195 | stack_train[va,0] += score_va 196 | stack_test[:,0] += score_te 197 | stack_test /= n_folds 198 | stack = np.vstack([stack_train, stack_test]) 199 | 200 | df_stack['pack_tfidf_vec_pac_classfiy_{}'.format(label)] = stack[:, 0] 201 | 202 | 203 | ########################### ridge(RidgeClassfiy) ################################ 204 | print('RidgeClassfiy stacking') 205 | stack_train = np.zeros((len(train_uid), 1)) 206 | stack_test = np.zeros((len(test_uid), 1)) 207 | score_va = 0 208 | 209 | for i, (tr, va) in enumerate(folds.split(train_feature,score)): 210 | ridge = RidgeClassifier(random_state=1017) 211 | ridge.fit(train_feature[tr], score[tr]) 212 | score_va = ridge._predict_proba_lr(train_feature[va])[:,1] 213 | score_te = ridge._predict_proba_lr(test_feature)[:,1] 214 | print(score_va) 215 | print('得分' + str(mean_squared_error(score[va], ridge.predict(train_feature[va])))) 216 | stack_train[va,0] += score_va 217 | stack_test[:,0] += score_te 218 | stack_test /= n_folds 219 | stack = np.vstack([stack_train, stack_test]) 220 | 221 | 222 | df_stack['pack_tfidf_vec_ridge_classfiy_{}'.format(label)] = stack[:, 0] 223 | 224 | ########################### bnb(BernoulliNB) ################################ 225 | print('BernoulliNB stacking') 226 | stack_train = np.zeros((len(train_uid), 1)) 227 | stack_test = np.zeros((len(test_uid), 1)) 228 | score_va = 0 229 | 230 | for i, (tr, va) in enumerate(folds.split(train_feature,score)): 231 | print('stack:%d/%d' % ((i + 1), n_folds)) 232 | bnb = BernoulliNB() 233 | bnb.fit(train_feature[tr], score[tr]) 234 | score_va = bnb.predict_proba(train_feature[va])[:,1] 235 | score_te = bnb.predict_proba(test_feature)[:,1] 236 | print(score_va) 237 | print('得分' + str(mean_squared_error(score[va], bnb.predict(train_feature[va])))) 238 | stack_train[va,0] += score_va 239 | stack_test[:,0] += score_te 240 | stack_test /= n_folds 241 | stack = np.vstack([stack_train, stack_test]) 242 | 243 | 244 | df_stack['pack_tfidf_vec_bnb_classfiy_{}'.format(label)] = stack[:, 0] 245 | 246 | ########################### mnb(MultinomialNB) ################################ 247 | print('MultinomialNB stacking') 248 | stack_train = np.zeros((len(train_uid), 1)) 249 | stack_test = np.zeros((len(test_uid), 1)) 250 | score_va = 0 251 | 252 | for i, (tr, va) in enumerate(folds.split(train_feature,score)): 253 | print('stack:%d/%d' % ((i + 1), n_folds)) 254 | mnb = MultinomialNB() 255 | mnb.fit(train_feature[tr], score[tr]) 256 | score_va = mnb.predict_proba(train_feature[va])[:,1] 257 | score_te = mnb.predict_proba(test_feature)[:,1] 258 | print(score_va) 259 | print('得分' + str(mean_squared_error(score[va], mnb.predict(train_feature[va])))) 260 | stack_train[va,0] += score_va 261 | stack_test[:,0] += score_te 262 | stack_test /= n_folds 263 | stack = np.vstack([stack_train, stack_test]) 264 | 265 | 266 | df_stack['pack_tfidf_vec_mnb_classfiy_{}'.format(label)] = 
267 | 
268 | 
269 |     ############################ lsvc(LinearSVC) ################################
270 |     print('LinearSVC stacking')
271 |     stack_train = np.zeros((len(train_uid), 1))
272 |     stack_test = np.zeros((len(test_uid), 1))
273 |     score_va = 0
274 | 
275 |     for i, (tr, va) in enumerate(folds.split(train_feature, score)):
276 |         print('stack:%d/%d' % ((i + 1), n_folds))
277 |         lsvc = LinearSVC(random_state=1017)
278 |         lsvc.fit(train_feature[tr], score[tr])
279 |         score_va = lsvc._predict_proba_lr(train_feature[va])[:,1]
280 |         score_te = lsvc._predict_proba_lr(test_feature)[:,1]
281 |         print(score_va)
282 |         print('score ' + str(mean_squared_error(score[va], lsvc.predict(train_feature[va]))))
283 |         stack_train[va,0] += score_va
284 |         stack_test[:,0] += score_te
285 |     stack_test /= n_folds
286 |     stack = np.vstack([stack_train, stack_test])
287 | 
288 | 
289 |     df_stack['pack_tfidf_vec_lsvc_classfiy_{}'.format(label)] = stack[:, 0]
290 | 
291 | 
292 | ############################################# save ###############################################
293 | os.system('mkdir -pv ../../data/lgb/stacking_feats')
294 | df_stack.to_csv('../../data/lgb/stacking_feats/tfidf_classfiy_package.csv', index=None, encoding='utf8')
295 | 
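All seven classifier blocks in this script (and its age counterpart in `f3_save_AGE_tf_idf_stacking_feats.py`) follow the same out-of-fold recipe: fit on four folds, write validation-fold probabilities into `stack_train`, average the test probabilities over the five folds, then stack train over test. A minimal helper capturing that pattern — the function name and the `predict_proba` fallback are illustrative sketches, not part of the original scripts:

```
def oof_stack(clf, train_feature, test_feature, score, folds, n_folds=5):
    # one column of out-of-fold train predictions stacked over averaged test predictions
    stack_train = np.zeros((train_feature.shape[0], 1))
    stack_test = np.zeros((test_feature.shape[0], 1))
    for i, (tr, va) in enumerate(folds.split(train_feature, score)):
        clf.fit(train_feature[tr], score[tr])
        # margin classifiers (LinearSVC, RidgeClassifier, PAC) expose _predict_proba_lr instead
        proba = getattr(clf, 'predict_proba', None) or clf._predict_proba_lr
        stack_train[va, 0] = proba(train_feature[va])[:, 1]
        stack_test[:, 0] += proba(test_feature)[:, 1]
    stack_test /= n_folds
    return np.vstack([stack_train, stack_test])
```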
-------------------------------------------------------------------------------- /src/lgb/f5_run_fold_training.py: --------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # coding: utf-8
 3 | 
 4 | ##################################################################################
 5 | # LGB model training
 6 | # features used:
 7 | #   1. TF-IDF
 8 | #   2. COUNTVEC
 9 | #   3. LR-model probabilities (the stacking features above)
10 | # output:
11 | #   AGE probabilities
12 | #   GENDER probabilities are not produced; uncomment the binary block below if needed
13 | 
14 | # training time: ~ 5 days
15 | ##################################################################################
16 | 
17 | import warnings
18 | warnings.filterwarnings("ignore")
19 | 
20 | import pandas as pd
21 | import numpy as np
22 | import lightgbm as lgb   # lgb.LGBMClassifier is used below
23 | 
24 | import sys
25 | import os
26 | import json
27 | import gc
28 | from tqdm import tqdm
29 | 
30 | from sklearn.model_selection import train_test_split
31 | from sklearn.model_selection import StratifiedKFold, KFold, RepeatedKFold
32 | from sklearn.preprocessing import LabelEncoder
33 | from sklearn.preprocessing import StandardScaler
34 | from sklearn.metrics import mean_absolute_error
35 | from sklearn.metrics import accuracy_score
36 | from sklearn.linear_model import LinearRegression
37 | from sklearn.preprocessing import StandardScaler as std
38 | from sklearn.kernel_ridge import KernelRidge
39 | from sklearn.feature_extraction.text import TfidfVectorizer
40 | from sklearn.feature_extraction.text import CountVectorizer
41 | from sklearn.decomposition import TruncatedSVD
42 | 
43 | from sklearn.linear_model import LogisticRegression
44 | from sklearn.linear_model import SGDClassifier
45 | from sklearn.linear_model import PassiveAggressiveClassifier
46 | from sklearn.linear_model import RidgeClassifier
47 | from sklearn.naive_bayes import BernoulliNB
48 | from sklearn.naive_bayes import MultinomialNB
49 | from sklearn.svm import LinearSVC
50 | 
51 | from sklearn.metrics import f1_score
52 | from sklearn.metrics import precision_score, recall_score
53 | from sklearn.metrics import roc_auc_score
54 | from sklearn.metrics import mean_squared_error
55 | 
56 | import time
57 | import datetime
58 | from datetime import datetime, timedelta
59 | 
60 | from scipy.signal import hilbert
61 | from scipy.signal import hann
62 | from scipy.signal import convolve
63 | from scipy import stats
64 | from scipy import sparse
65 | import scipy.spatial.distance as dist
66 | 
67 | from collections import Counter
68 | from statistics import mode
69 | 
70 | import math
71 | from itertools import product
72 | import ast
73 | 
74 | 
75 | # memory reduction helper
76 | def reduce_mem_usage(df, verbose=True):
77 |     numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
78 |     start_mem = df.memory_usage().sum() / 1024**2
79 |     for col in df.columns:
80 |         col_type = df[col].dtypes
81 |         if col_type in numerics:
82 |             c_min = df[col].min()
83 |             c_max = df[col].max()
84 |             if str(col_type)[:3] == 'int':
85 |                 if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
86 |                     df[col] = df[col].astype(np.int8)
87 |                 elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
88 |                     df[col] = df[col].astype(np.int16)
89 |                 elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
90 |                     df[col] = df[col].astype(np.int32)
91 |                 elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
92 |                     df[col] = df[col].astype(np.int64)
93 |             else:
94 |                 if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
95 |                     df[col] = df[col].astype(np.float16)
96 |                 elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
97 |                     df[col] = df[col].astype(np.float32)
98 |                 else:
99 |                     df[col] = df[col].astype(np.float64)
100 |     end_mem = df.memory_usage().sum() / 1024**2
101 |     if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
102 |     return df
103 | 
104 | 
105 | # ### Model initialisation
106 | 
107 | lgb_model = lgb.LGBMClassifier(
108 |     boosting_type="gbdt", num_leaves=15, reg_alpha=0, reg_lambda=0.,
109 |     max_depth=-1, n_estimators=1500, objective='multiclass', metric='multi_error',
110 |     subsample=0.95, colsample_bytree=0.95, subsample_freq=1,
111 |     learning_rate=0.1, random_state=2017
112 | )
113 | 
114 | lgb_model_binary = lgb.LGBMClassifier(
115 |     boosting_type="gbdt", num_leaves=150, reg_alpha=0, reg_lambda=0.,
116 |     max_depth=-1, n_estimators=500, objective='binary', metric='error',
117 |     subsample=0.95, colsample_bytree=0.95, subsample_freq=1,
118 |     learning_rate=0.025, random_state=2017
119 | )
120 | 
121 | 
122 | # ### Load data
123 | 
124 | path = '../../raw_data/'
125 | train_user1 = pd.read_csv(path+'train_preliminary/user.csv')
126 | train_user2 = pd.read_csv(path+'train_semi_final/user.csv')
127 | train_user = train_user1.append(train_user2).reset_index(drop=True)
128 | train_user = train_user.drop_duplicates(['user_id'])
129 | train_user = train_user.sort_values(by=['user_id'])
130 | test_click = pd.read_csv(path+'test/click_log.csv')
131 | 
132 | 
133 | train_uid = train_user[['user_id','age','gender']]
134 | test_uid = pd.DataFrame(sorted(set(test_click['user_id'])))  # sorted: deterministic row order, matching the user_id-sorted torch/keras pipelines
135 | test_uid.columns = ['user_id']
136 | 
137 | # ### target_encoding features
138 | 
139 | target_encoding = pd.read_pickle('../../data/lgb/te_feats/df_user_target_encoding.pickle')
140 | 
141 | 
142 | target_feats = [i for i in target_encoding.columns if i not in ['user_id']]
143 | train_uid = train_uid.merge(target_encoding, on='user_id', how='left')
144 | test_uid = test_uid.merge(target_encoding, on='user_id', how='left')
145 | 
146 | 
147 | # ### tfidf stacking features
148 | 
149 | stackingfeats = pd.read_csv('../../data/lgb/stacking_feats/tfidf_classfiy_package.csv')
150 | train_uid = train_uid.merge(stackingfeats, on='user_id', how='left')
151 | test_uid = test_uid.merge(stackingfeats, on='user_id', how='left')
152 | 
153 | stackingfeats_vec = pd.read_csv('../../data/lgb/stacking_feats/tfidf_classfiy_age_package.csv')
154 | train_uid = train_uid.merge(stackingfeats_vec, on='user_id', how='left')
155 | test_uid = test_uid.merge(stackingfeats_vec, on='user_id', how='left')
156 | 
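# a quick guard one might add here (a sketch, not in the original script): the
# user_id joins above should be one-to-one, otherwise the merges silently
# duplicate rows
assert train_uid['user_id'].is_unique and test_uid['user_id'].is_unique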
157 | # ### countvec stacking features
158 | 
159 | normal_feats = [i for i in train_uid.columns if i not in ['user_id', 'age', 'gender']]
160 | 
161 | 
162 | train_x = train_uid[normal_feats]
163 | test_x = test_uid[normal_feats]
164 | 
165 | 
166 | train_tfidf = sparse.load_npz('../../data/lgb/tf_idf_feats/traintext_tfidf3.npz')
167 | test_tfidf = sparse.load_npz('../../data/lgb/tf_idf_feats/testtext_tfidf3.npz')
168 | 
169 | train_vec = sparse.load_npz('../../data/lgb/countvec_feats/traintext_countvec2.npz')
170 | test_vec = sparse.load_npz('../../data/lgb/countvec_feats/testtext_countvec2.npz')
171 | 
172 | train_data = sparse.hstack((train_x, train_tfidf), 'csr')
173 | test_data = sparse.hstack((test_x, test_tfidf), 'csr')
174 | 
175 | train_data = sparse.hstack((train_data, train_vec), 'csr')
176 | test_data = sparse.hstack((test_data, test_vec), 'csr')
177 | 
178 | 
179 | y = train_uid[['age']]
180 | y2 = train_uid[['gender']]
181 | 
182 | 
183 | 
184 | model = lgb.LGBMClassifier(
185 |     boosting_type="gbdt", num_leaves=176, reg_alpha=0.1, reg_lambda=0.1,
186 |     max_depth=-1, n_estimators=55, objective='multiclass', metric='multi_error',
187 |     subsample=0.95, colsample_bytree=0.95, subsample_freq=1,
188 |     learning_rate=0.1, random_state=2017
189 | )
190 | 
191 | 
192 | train_uid['agepre'] = 0
193 | train_uid['genpre'] = 0
194 | 
195 | testrs = test_uid[['user_id']]
196 | testrs['age'] = 0
197 | testrs['gender'] = 0
198 | testrs['genpre'] = 0   # placeholder, filled only if the gender block below is enabled; the save step selects this column
199 | 
200 | # #### Binary (gender) model definition
201 | 
202 | #fold = 0
203 | #n_splits = 5
204 | #testrs['genpre'] = 0
205 | #train_uid['genpre'] = 0
206 | #kfold = KFold(n_splits=5, shuffle=True, random_state=42)
207 | #def run_model_gender():
208 | #    for train_idx, val_idx in kfold.split(train_data):
209 | #        train_x = train_data[train_idx]
210 | #        train_y = y2.loc[train_idx]
211 | #        test_xt = train_data[val_idx]
212 | #        test_yt = y2.loc[val_idx]
213 | #
214 | #        lgb_model_binary.fit(train_x, train_y, eval_set=[(train_x,train_y),(test_xt, test_yt)], early_stopping_rounds=100,
215 | #                             eval_metric='error',
216 | #                             verbose=5)
217 | #
218 | #        train_uid.loc[val_idx, 'genpre'] = lgb_model_binary.predict_proba(test_xt)[:,1]
219 | #        testrs['genpre'] += lgb_model_binary.predict_proba(test_data)[:,1]/n_splits
220 | #
221 | ## #### run the binary model
222 | #
223 | #run_model_gender()
224 | 
225 | 
226 | # #### Multiclass (age) model definition
227 | 
228 | for i in range(1, 11):
229 |     train_uid['age_prob_'+str(i)] = 0
230 |     testrs['age_prob_'+str(i)] = 0
231 | 
232 | n_splits = 5
233 | kfold = KFold(n_splits=5, shuffle=True, random_state=42)   # this kfold drives the split below
234 | seed = 2020
235 | folds = StratifiedKFold(n_splits=5, shuffle=False)         # unused below; random_state removed since it has no effect without shuffle
236 | 
237 | def run_model_age():
238 |     for fold, (train_idx, val_idx) in enumerate(kfold.split(train_data)):  # enumerate supplies the fold counter used in the log line below
239 |         train_x = train_data[train_idx]
240 |         train_y = y.loc[train_idx]
241 |         test_x = train_data[val_idx]
242 |         test_y = y.loc[val_idx]
243 | 
244 |         print('modelrun_fold_{}'.format(fold))
245 |         model.fit(train_x, train_y, eval_set=[(train_x, train_y), (test_x, test_y)], early_stopping_rounds=100,
246 |                   eval_metric='multi_error',
247 |                   verbose=5)
248 |         train_uid.loc[val_idx, ['age_prob_'+str(i) for i in range(1, 11)]] = model.predict_proba(test_x)
249 |         testrs[['age_prob_'+str(i) for i in range(1, 11)]] += model.predict_proba(test_data) / n_splits
250 | 
251 | 
252 | # #### Run the multiclass model
253 | 
254 | run_model_age()
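# optional check (a sketch, not in the original script): report the 5-fold OOF
# accuracy before saving; LGBMClassifier orders predict_proba columns by sorted
# class label, so column k corresponds to age k+1
oof_pred = train_uid[['age_prob_'+str(i) for i in range(1, 11)]].values.argmax(axis=1) + 1
print('age oof acc:', (oof_pred == train_uid['age'].values).mean())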
255 | 
256 | 
257 | # save the AGE probability results
258 | train_uid[['user_id']+['age_prob_'+str(i) for i in range(1,11)]+['genpre']].to_csv('../../probs/oof_age_lgb.csv', index=False)
259 | testrs[['user_id']+['age_prob_'+str(i) for i in range(1,11)]+['genpre']].to_csv('../../probs/sub_age_lgb.csv', index=False)
260 | 
-------------------------------------------------------------------------------- /src/stacking/f1_merge_stacking_feats.py: --------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import pandas as pd
 3 | 
 4 | from scipy.special import softmax
 5 | 
 6 | probs_pth = '../../probs'
 7 | # the torch scripts dump raw logits (hence the softmax); the keras and LGB files already hold probabilities
 8 | x1_oof = softmax(np.load(f'{probs_pth}/oof_age_m1_torch.npy'), axis=1)
 9 | x1_preds = softmax(np.load(f'{probs_pth}/sub_age_m1_torch.npy'), axis=1)
10 | x1 = np.concatenate((x1_oof, x1_preds), axis=0)
11 | 
12 | x2_oof = softmax(np.load(f'{probs_pth}/oof_age_m2_torch.npy'), axis=1)
13 | x2_preds = softmax(np.load(f'{probs_pth}/sub_age_m2_torch.npy'), axis=1)
14 | x2 = np.concatenate((x2_oof, x2_preds), axis=0)
15 | 
16 | x3_oof = np.load(f'{probs_pth}/oof_age_m3_keras.npy')
17 | x3_preds = np.load(f'{probs_pth}/sub_age_m3_keras.npy')
18 | x3 = np.concatenate((x3_oof, x3_preds), axis=0)
19 | 
20 | x4_oof = np.load(f'{probs_pth}/oof_age_m4_keras.npy')
21 | x4_preds = np.load(f'{probs_pth}/sub_age_m4_keras.npy')
22 | x4 = np.concatenate((x4_oof, x4_preds), axis=0)
23 | 
24 | x5_oof = np.load(f'{probs_pth}/oof_age_m5_keras.npy')
25 | x5_preds = np.load(f'{probs_pth}/sub_age_m5_keras.npy')
26 | x5 = np.concatenate((x5_oof, x5_preds), axis=0)
27 | 
28 | x6_oof = np.load(f'{probs_pth}/oof_age_m6_keras.npy')
29 | x6_preds = np.load(f'{probs_pth}/sub_age_m6_keras.npy')
30 | x6 = np.concatenate((x6_oof, x6_preds), axis=0)
31 | 
32 | x7_oof = softmax(np.load(f'{probs_pth}/oof_age_m7_torch.npy'), axis=1)
33 | x7_preds = softmax(np.load(f'{probs_pth}/sub_age_m7_torch.npy'), axis=1)
34 | x7 = np.concatenate((x7_oof, x7_preds), axis=0)
35 | 
36 | x8_oof = np.load(f'{probs_pth}/oof_age_m8_keras.npy')
37 | x8_preds = np.load(f'{probs_pth}/sub_age_m8_keras.npy')
38 | x8 = np.concatenate((x8_oof, x8_preds), axis=0)
39 | 
40 | x9_oof = softmax(np.load(f'{probs_pth}/oof_age_m9_torch.npy'), axis=1)
41 | x9_preds = softmax(np.load(f'{probs_pth}/sub_age_m9_torch.npy'), axis=1)
42 | x9 = np.concatenate((x9_oof, x9_preds), axis=0)
43 | 
44 | x10_oof = np.load(f'{probs_pth}/oof_age_m10_keras.npy')
45 | x10_preds = np.load(f'{probs_pth}/sub_age_m10_keras.npy')
46 | x10 = np.concatenate((x10_oof, x10_preds), axis=0)
47 | 
48 | x11_oof = np.load(f'{probs_pth}/oof_age_m11_keras.npy')
49 | x11_preds = np.load(f'{probs_pth}/sub_age_m11_keras.npy')
50 | x11 = np.concatenate((x11_oof, x11_preds), axis=0)
51 | 
52 | 
53 | lgb_oof = pd.read_csv(f'{probs_pth}/oof_age_lgb.csv')[[f'age_prob_{i}' for i in range(1,11)]].values
54 | lgb_preds = pd.read_csv(f'{probs_pth}/sub_age_lgb.csv')[[f'age_prob_{i}' for i in range(1,11)]].values
55 | lgb_probs = np.concatenate((lgb_oof, lgb_preds), axis=0)
56 | 
57 | x_stacking = np.concatenate((lgb_probs, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11), axis=1)
58 | 
59 | np.save(f"{probs_pth}/x_stacking_120probs", x_stacking)
60 | 
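`x_stacking` lays its rows out as the 3,000,000 training users followed by the test users, and its 120 columns as twelve 10-class probability blocks (LGB first, then m1–m11 in the load order above). A small sanity check one could run after `f1_merge_stacking_feats.py` finishes — a sketch, with the row-count assumption taken from the fold slicing used in the stacking scripts below:

```
import numpy as np

x = np.load('../../probs/x_stacking_120probs.npy')
assert x.shape[1] == 120        # 12 models x 10 age classes
assert x.shape[0] > 3_000_000   # 3M train rows come first, test rows after
# every 10-column block holds class probabilities, so each row should sum to ~1
print(abs(x[:5, :10].sum(axis=1) - 1).max())
```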
-------------------------------------------------------------------------------- /src/stacking/f2_save_embeddings.py: --------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # coding: utf-8
 3 | 
 4 | ####################################################################
 5 | # Generate the sequence features used for stacking
 6 | # they are kept deliberately different from the models that feed
 7 | # the stack, to limit overfitting
 8 | ####################################################################
 9 | 
10 | import warnings
11 | warnings.simplefilter('ignore')
12 | 
13 | import numpy as np
14 | import pandas as pd
15 | pd.set_option('max_columns', None)
16 | pd.set_option('max_rows', 1000)
17 | 
18 | import pickle
19 | import gc
20 | import logging
21 | 
22 | from tqdm.autonotebook import *
23 | 
24 | import gensim
25 | from gensim.models import FastText, Word2Vec
26 | 
27 | from keras.preprocessing import text, sequence
28 | from keras.preprocessing.text import Tokenizer, text_to_word_sequence
29 | from keras.preprocessing.sequence import pad_sequences
30 | 
31 | 
32 | window = 100
33 | max_len = 100
34 | min_count = 1
35 | iter_ = 20
36 | emb_dim_cid = 64
37 | emb_dim_advid = 32
38 | 
39 | 
40 | def set_tokenizer(docs, split_char=' '):
41 |     tokenizer = Tokenizer(lower=False, char_level=False, split=split_char)
42 |     tokenizer.fit_on_texts(docs)
43 |     X = tokenizer.texts_to_sequences(docs)
44 |     maxlen = max_len
45 |     X = pad_sequences(X, maxlen=maxlen, value=0)
46 |     word_index = tokenizer.word_index
47 |     return X, word_index
48 | 
49 | 
50 | def train_save_word2vec(sentences, emb_dim, save_name='w2v.txt', split_char=' '):
51 |     input_docs = []
52 |     for i in sentences:
53 |         input_docs.append([ii for ii in i])
54 |     logging.basicConfig(
55 |         format='%(asctime)s:%(levelname)s:%(message)s', level=logging.INFO
56 |     )
57 |     w2v = Word2Vec(input_docs,
58 |                    size=emb_dim,
59 |                    sg=1,
60 |                    window=window,
61 |                    seed=2020,
62 |                    workers=18,
63 |                    min_count=min_count,
64 |                    iter=iter_)
65 |     w2v.wv.save_word2vec_format(save_name)
66 |     return w2v
67 | 
68 | 
69 | def get_embedding_matrix(word_index, embed_size=128, Emed_path="w2v_300.txt"):
70 |     embeddings_index = gensim.models.KeyedVectors.load_word2vec_format(
71 |         Emed_path, binary=False)
72 |     nb_words = len(word_index)+1
73 |     embedding_matrix = np.zeros((nb_words, embed_size))
74 |     count = 0
75 |     for word, i in tqdm(word_index.items()):
76 |         if i >= nb_words:
77 |             continue
78 |         try:
79 |             embedding_vector = embeddings_index[word]
80 |         except:
81 |             embedding_vector = np.zeros(embed_size)
82 |             count += 1
83 |         if embedding_vector is not None:
84 |             embedding_matrix[i] = embedding_vector
85 |     print("null cnt", count)
86 |     return embedding_matrix
87 | 
88 | 
89 | 
90 | # creative_id
91 | df = pd.read_pickle('../../data/keras/df_creative_sequence.pickle')
92 | cid_list = list(df['cids'])
93 | 
94 | for i in tqdm(range(0, len(cid_list))):
95 |     cid_list[i] = [str(ii) for ii in cid_list[i]]
96 | 
97 | x_cid, index_cid = set_tokenizer(cid_list, split_char=',')
98 | train_save_word2vec(cid_list,
99 |                     emb_dim_cid,
100 |                     save_name=f'../../w2v_models/cid_w2v_{emb_dim_cid}_win{window}_iter{iter_}_mincount{min_count}.txt',
101 |                     split_char=',')
102 | 
103 | del df
104 | gc.collect()
105 | 
106 | 
107 | # advertiser_id
108 | df = pd.read_pickle('../../data/keras/df_advertiser_sequence.pickle')
109 | advid_list = list(df['advids'])
110 | 
111 | for i in tqdm(range(0, len(advid_list))):
112 |     advid_list[i] = [str(ii) for ii in advid_list[i]]
113 | 
114 | x_advid, index_advid = set_tokenizer(advid_list, split_char=',')
115 | train_save_word2vec(advid_list,
116 |                     emb_dim_advid,
117 |                     save_name=f'../../w2v_models/advid_w2v_{emb_dim_advid}_win{window}_iter{iter_}_mincount{min_count}.txt',
118 |                     split_char=',')
119 | 
120 | del df
121 | gc.collect()
122 | 
-------------------------------------------------------------------------------- /src/stacking/f3_stacking_DNN_120probs_train_fold.py:
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | ##################################################################################### 5 | # AGE Stacking: DNN 双路分层残差 6 | # score: 7 | # 五折: 0.52131 (线下) 8 | # 五折: 0.52446 (线上) 9 | # 训练时长: ~ 2 hours 10 | ##################################################################################### 11 | 12 | import warnings 13 | warnings.simplefilter('ignore') 14 | 15 | import numpy as np 16 | import pandas as pd 17 | 18 | import sys 19 | import time 20 | import pickle 21 | import gc 22 | import logging 23 | 24 | from tqdm import tqdm 25 | 26 | import keras 27 | from keras import layers 28 | from keras import callbacks 29 | 30 | 31 | 32 | fold = sys.argv[1] 33 | batch_size = 20480 34 | 35 | 36 | print("loading labels") 37 | start_time = time.time() 38 | 39 | labels_1 = pd.read_csv('../../raw_data/train_preliminary/user.csv') 40 | labels_2 = pd.read_csv('../../raw_data/train_semi_final/user.csv') 41 | labels = pd.concat([labels_1, labels_2]) 42 | labels['age'] = labels['age'] - 1 43 | labels['gender'] = labels['gender'] - 1 44 | 45 | used_minutes = (time.time() - start_time) / 60 46 | print(f"done, used {used_minutes} minutes") 47 | 48 | 49 | print("split train, valid and test data") 50 | start_time = time.time() 51 | 52 | y = keras.utils.to_categorical(labels['age']) 53 | 54 | x_stacking = np.load('../../probs/x_stacking_120probs.npy') 55 | 56 | x1 = x_stacking[:,0:10] # LGB 57 | x2 = x_stacking[:,10:20] # torch 6inputs lstm+attention 58 | x3 = x_stacking[:,20:30] # torch 6inputs transformer 59 | x4 = x_stacking[:,30:40] # keras 4inputs lstm+attention 60 | x5 = x_stacking[:,40:50] # keras 4inputs transformer 61 | x6 = x_stacking[:,50:60] # keras 3inputs transformer 62 | x7 = x_stacking[:,60:70] # keras 2inputs transformer+lstm 63 | x8 = x_stacking[:,70:80] # torch 3inputs lstm+attention 64 | x9 = x_stacking[:,80:90] # keras 3inputs transformer+lstm 65 | x10 = x_stacking[:,90:100] # torch 3inputs transformer+lstm 66 | x11 = x_stacking[:,100:110] # keras 4inputs transformer+lstm 67 | x12 = x_stacking[:,110:120] # keras 5inputs transformer+lstm 68 | 69 | x_lgb = x1 70 | x_keras = np.concatenate((x4, x5, x6, x7, x9, x11, x12), axis=1) 71 | x_torch = np.concatenate((x2, x3, x8, x10), axis=1) 72 | 73 | del x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12 74 | gc.collect() 75 | 76 | if fold == "fold0": 77 | train_lgb = x_lgb[:2400000] 78 | valid_lgb = x_lgb[2400000:3000000] 79 | train_keras = x_keras[:2400000] 80 | valid_keras = x_keras[2400000:3000000] 81 | train_torch = x_torch[:2400000] 82 | valid_torch = x_torch[2400000:3000000] 83 | y_train = y[:2400000] 84 | y_valid = y[2400000:] 85 | elif fold == "fold1": 86 | train_lgb = np.concatenate((x_lgb[:1800000], x_lgb[2400000:3000000]), axis=0) 87 | valid_lgb = x_lgb[1800000:2400000] 88 | train_keras = np.concatenate((x_keras[:1800000], x_keras[2400000:3000000]), axis=0) 89 | valid_keras = x_keras[1800000:2400000] 90 | train_torch = np.concatenate((x_torch[:1800000], x_torch[2400000:3000000]), axis=0) 91 | valid_torch = x_torch[1800000:2400000] 92 | y_train = np.concatenate((y[:1800000], y[2400000:3000000])) 93 | y_valid = y[1800000:2400000] 94 | elif fold == "fold2": 95 | train_lgb = np.concatenate((x_lgb[:1200000], x_lgb[1800000:3000000]), axis=0) 96 | valid_lgb = x_lgb[1200000:1800000] 97 | train_keras = np.concatenate((x_keras[:1200000], x_keras[1800000:3000000]), axis=0) 98 | valid_keras = x_keras[1200000:1800000] 99 | 
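# NOTE: the fold0-fold4 branches implement a manual 5-fold split over the
# 3,000,000 training rows: each branch holds out one contiguous 600,000-row
# validation window, sliding from the tail (fold0) to the head (fold4)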
train_torch = np.concatenate((x_torch[:1200000], x_torch[1800000:3000000]), axis=0)
100 |     valid_torch = x_torch[1200000:1800000]
101 |     y_train = np.concatenate((y[:1200000], y[1800000:3000000]))
102 |     y_valid = y[1200000:1800000]
103 | elif fold == "fold3":
104 |     train_lgb = np.concatenate((x_lgb[:600000], x_lgb[1200000:3000000]), axis=0)
105 |     valid_lgb = x_lgb[600000:1200000]
106 |     train_keras = np.concatenate((x_keras[:600000], x_keras[1200000:3000000]), axis=0)
107 |     valid_keras = x_keras[600000:1200000]
108 |     train_torch = np.concatenate((x_torch[:600000], x_torch[1200000:3000000]), axis=0)
109 |     valid_torch = x_torch[600000:1200000]
110 |     y_train = np.concatenate((y[:600000], y[1200000:3000000]))
111 |     y_valid = y[600000:1200000]
112 | elif fold == "fold4":
113 |     train_lgb = x_lgb[600000:3000000]
114 |     valid_lgb = x_lgb[:600000]
115 |     train_keras = x_keras[600000:3000000]
116 |     valid_keras = x_keras[:600000]
117 |     train_torch = x_torch[600000:3000000]
118 |     valid_torch = x_torch[:600000]
119 |     y_train = y[600000:3000000]
120 |     y_valid = y[:600000]
121 | else:
122 |     pass
123 | 
124 | test_lgb = x_lgb[3000000:]
125 | test_keras = x_keras[3000000:]
126 | test_torch = x_torch[3000000:]
127 | 
128 | del x_stacking, x_lgb, x_keras, x_torch
129 | del y
130 | gc.collect()
131 | 
132 | print(train_lgb.shape, valid_lgb.shape, test_lgb.shape)
133 | print(train_keras.shape, valid_keras.shape, test_keras.shape)
134 | print(train_torch.shape, valid_torch.shape, test_torch.shape)
135 | print(y_train.shape, y_valid.shape)
136 | 
137 | used_minutes = (time.time() - start_time) / 60
138 | print(f"done, used {used_minutes} minutes")
139 | 
140 | 
141 | 
142 | print("building model")
143 | 
144 | lgb_shape = train_lgb.shape[1]
145 | keras_shape = train_keras.shape[1]
146 | torch_shape = train_torch.shape[1]
147 | 
148 | start_time = time.time()
149 | 
150 | def build_model():
151 | 
152 |     inp1 = layers.Input(shape=(lgb_shape,))
153 |     inp2 = layers.Input(shape=(keras_shape,))
154 |     inp3 = layers.Input(shape=(torch_shape,))
155 | 
156 |     x1 = layers.Concatenate()([inp1, inp2])
157 |     x1 = layers.Dense(40, activation='relu')(x1)   # mix the lgb+keras concat before the torch block joins
158 |     x1 = layers.BatchNormalization()(x1)
159 | 
160 |     x2 = layers.Concatenate()([x1, inp3])
161 |     x2 = layers.Dense(40, activation='relu')(x2)
162 |     x2 = layers.BatchNormalization()(x2)
163 | 
164 |     x3 = layers.Dense(40, activation='relu')(x2)
165 |     x3 = layers.BatchNormalization()(x3)
166 | 
167 |     x_all = layers.Concatenate()([inp1, inp2, inp3])
168 |     x_all = layers.Dense(120, activation='relu')(x_all)
169 |     x_all = layers.BatchNormalization()(x_all)
170 | 
171 |     x_all = layers.Dense(80, activation='relu')(x_all)
172 |     x_all = layers.BatchNormalization()(x_all)
173 | 
174 |     x_all = layers.Dense(60, activation='relu')(x_all)
175 |     x_all = layers.BatchNormalization()(x_all)
176 | 
177 |     x = layers.Concatenate()([x3, x_all])
178 | 
179 |     x = layers.Dense(64, activation='relu')(x)
180 |     x = layers.BatchNormalization()(x)
181 |     x = layers.Dropout(0.1)(x)
182 | 
183 |     out = layers.Dense(10, activation='softmax')(x)
184 |     model = keras.Model(inputs=[inp1, inp2, inp3], outputs=out)
185 |     model.compile(loss='categorical_crossentropy',
186 |                   optimizer=keras.optimizers.Adam(),
187 |                   metrics=['accuracy'])
188 | 
189 |     return model
190 | 
191 | 
192 | model = build_model()
193 | 
194 | used_minutes = (time.time() - start_time) / 60
195 | print(f"done, used {used_minutes} minutes")
196 | 
197 | 
198 | 
199 | 
200 | 
201 | model.summary()
202 | 
203 | 
204 | 
205 | 
206 | 
207 | checkpoint =
callbacks.ModelCheckpoint(f'../../models/age_dnn_stacking_{fold}.h5', 208 | monitor='val_accuracy', 209 | verbose=1, 210 | save_best_only=True, 211 | mode='max', 212 | save_weights_only=True) 213 | 214 | reduce_lr = callbacks.ReduceLROnPlateau(monitor='val_accuracy', 215 | factor=0.1, 216 | patience=4, 217 | verbose=1, 218 | mode='max', 219 | epsilon=1e-6) 220 | 221 | early_stop = callbacks.EarlyStopping(monitor='val_accuracy', 222 | mode='max', 223 | patience=10) 224 | 225 | 226 | hist = model.fit([train_lgb, train_keras, train_torch], 227 | y_train, 228 | batch_size=batch_size, 229 | epochs=100, 230 | validation_data=([valid_lgb, valid_keras, valid_torch], y_valid), 231 | callbacks=[checkpoint, reduce_lr, early_stop], 232 | verbose=1, 233 | shuffle=True) 234 | 235 | acc = max(hist.history['val_accuracy']) 236 | print(acc) 237 | 238 | 239 | print("predict start") 240 | start_time = time.time() 241 | 242 | model.load_weights(f'../../models/age_dnn_stacking_{fold}.h5') 243 | preds = model.predict([test_lgb, test_keras, test_torch], 244 | batch_size=batch_size, 245 | verbose=1) 246 | 247 | np.save(f'../../probs/sub_age_dnn_stacking_{fold}', preds) 248 | 249 | used_minutes = (time.time() - start_time) / 60 250 | print(f"done, used {used_minutes} minutes") 251 | 252 | -------------------------------------------------------------------------------- /src/stacking/f4_stacking_transformer_2inputs_90probs_train_fold.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | ##################################################################################### 5 | # AGE Stacking: Keras transformer 2 inputs with 90 prob feats 6 | # score: 7 | # 五折: 0.52018 (线下) 8 | # 五折: 0.52279 (线上) 9 | # 训练时长: ~ 5 hours 10 | ##################################################################################### 11 | 12 | 13 | import warnings 14 | warnings.simplefilter('ignore') 15 | 16 | import numpy as np 17 | import pandas as pd 18 | 19 | import sys 20 | import time 21 | import pickle 22 | import gc 23 | import logging 24 | 25 | from tqdm import tqdm 26 | 27 | import gensim 28 | from gensim.models import FastText, Word2Vec 29 | 30 | from scipy.special import softmax 31 | 32 | import keras 33 | from keras import layers 34 | from keras import callbacks 35 | 36 | from keras.preprocessing import text, sequence 37 | from keras.preprocessing.text import Tokenizer, text_to_word_sequence 38 | from keras.preprocessing.sequence import pad_sequences 39 | 40 | from keras_multi_head import MultiHead, MultiHeadAttention 41 | from keras_self_attention import SeqSelfAttention 42 | from keras_position_wise_feed_forward import FeedForward 43 | from keras_layer_normalization import LayerNormalization 44 | 45 | 46 | fold = sys.argv[1] 47 | 48 | max_len = 120 49 | emb_dim_cid = 64 50 | emb_dim_advid = 32 51 | 52 | batch_size = 1024 53 | 54 | 55 | def set_tokenizer(docs, split_char=' '): 56 | tokenizer = Tokenizer(lower=False, char_level=False, split=split_char) 57 | tokenizer.fit_on_texts(docs) 58 | X = tokenizer.texts_to_sequences(docs) 59 | maxlen = max_len 60 | X = pad_sequences(X, maxlen=maxlen, value=0) 61 | word_index = tokenizer.word_index 62 | return X, word_index 63 | 64 | 65 | def get_embedding_matrix(word_index, embed_size=128, Emed_path="w2v_300.txt"): 66 | embeddings_index = gensim.models.KeyedVectors.load_word2vec_format( 67 | Emed_path, binary=False) 68 | nb_words = len(word_index)+1 69 | embedding_matrix = np.zeros((nb_words, embed_size)) 70 
| count = 0 71 | for word, i in word_index.items(): 72 | if i >= nb_words: 73 | continue 74 | try: 75 | embedding_vector = embeddings_index[word] 76 | except: 77 | embedding_vector = np.zeros(embed_size) 78 | count += 1 79 | if embedding_vector is not None: 80 | embedding_matrix[i] = embedding_vector 81 | return embedding_matrix 82 | 83 | 84 | 85 | print("loading sequence data and embedding") 86 | start_time = time.time() 87 | 88 | print("loading advertiser id") 89 | df = pd.read_pickle('../../data/keras/df_advertiser_sequence.pickle') 90 | advid_list = list(df['advids']) 91 | for i in range(0, len(advid_list)): 92 | advid_list[i] =[str(ii) for ii in advid_list[i]] 93 | 94 | x_advid, index_advid = set_tokenizer(advid_list, split_char=',') 95 | emb_advid = get_embedding_matrix(index_advid, 96 | embed_size=emb_dim_advid, 97 | Emed_path='../../w2v_models/advid_w2v_32_win100_iter20_mincount1.txt') 98 | del df, advid_list, index_advid 99 | gc.collect() 100 | 101 | 102 | print("loading creative id") 103 | df = pd.read_pickle('../../data/keras/df_creative_sequence.pickle') 104 | cid_list = list(df['cids']) 105 | for i in range(0, len(cid_list)): 106 | cid_list[i] =[str(ii) for ii in cid_list[i]] 107 | 108 | x_cid, index_cid = set_tokenizer(cid_list, split_char=',') 109 | emb_cid = get_embedding_matrix(index_cid, 110 | embed_size=emb_dim_cid, 111 | Emed_path='../../w2v_models/cid_w2v_64_win100_iter20_mincount1.txt') 112 | del df, cid_list, index_cid 113 | gc.collect() 114 | 115 | 116 | used_minutes = (time.time() - start_time) / 60 117 | print(f"done, used {used_minutes} minutes") 118 | 119 | 120 | print("loading labels") 121 | start_time = time.time() 122 | 123 | labels_1 = pd.read_csv('../../raw_data/train_preliminary/user.csv') 124 | labels_2 = pd.read_csv('../../raw_data/train_semi_final/user.csv') 125 | labels = pd.concat([labels_1, labels_2]) 126 | labels['age'] = labels['age'] - 1 127 | labels['gender'] = labels['gender'] - 1 128 | 129 | used_minutes = (time.time() - start_time) / 60 130 | print(f"done, used {used_minutes} minutes") 131 | 132 | 133 | 134 | print("split train, valid and test data") 135 | start_time = time.time() 136 | 137 | y = keras.utils.to_categorical(labels['age']) 138 | 139 | x_stacking = np.load('../../probs/x_stacking_120probs.npy')[:, :90] 140 | 141 | if fold == "fold0": 142 | train_cid = x_cid[:2400000] 143 | valid_cid = x_cid[2400000:3000000] 144 | train_advid = x_advid[:2400000] 145 | valid_advid = x_advid[2400000:3000000] 146 | train_stacking = x_stacking[:2400000] 147 | valid_stacking = x_stacking[2400000:3000000] 148 | y_train = y[:2400000] 149 | y_valid = y[2400000:] 150 | elif fold == "fold1": 151 | train_cid = np.concatenate((x_cid[:1800000], x_cid[2400000:3000000]), axis=0) 152 | valid_cid = x_cid[1800000:2400000] 153 | train_advid = np.concatenate((x_advid[:1800000], x_advid[2400000:3000000]), axis=0) 154 | valid_advid = x_advid[1800000:2400000] 155 | train_stacking = np.concatenate((x_stacking[:1800000], x_stacking[2400000:3000000]), axis=0) 156 | valid_stacking = x_stacking[1800000:2400000] 157 | y_train = np.concatenate((y[:1800000], y[2400000:3000000])) 158 | y_valid = y[1800000:2400000] 159 | elif fold == "fold2": 160 | train_cid = np.concatenate((x_cid[:1200000], x_cid[1800000:3000000]), axis=0) 161 | valid_cid = x_cid[1200000:1800000] 162 | train_advid = np.concatenate((x_advid[:1200000], x_advid[1800000:3000000]), axis=0) 163 | valid_advid = x_advid[1200000:1800000] 164 | train_stacking = np.concatenate((x_stacking[:1200000], 
x_stacking[1800000:3000000]), axis=0) 165 | valid_stacking = x_stacking[1200000:1800000] 166 | y_train = np.concatenate((y[:1200000], y[1800000:3000000])) 167 | y_valid = y[1200000:1800000] 168 | elif fold == "fold3": 169 | train_cid = np.concatenate((x_cid[:600000], x_cid[1200000:3000000]), axis=0) 170 | valid_cid = x_cid[600000:1200000] 171 | train_advid = np.concatenate((x_advid[:600000], x_advid[1200000:3000000]), axis=0) 172 | valid_advid = x_advid[600000:1200000] 173 | train_stacking = np.concatenate((x_stacking[:600000], x_stacking[1200000:3000000]), axis=0) 174 | valid_stacking = x_stacking[600000:1200000] 175 | y_train = np.concatenate((y[:600000], y[1200000:3000000])) 176 | y_valid = y[600000:1200000] 177 | elif fold == "fold4": 178 | train_cid = x_cid[600000:3000000] 179 | valid_cid = x_cid[:600000] 180 | train_advid = x_advid[600000:3000000] 181 | valid_advid = x_advid[:600000] 182 | train_stacking = x_stacking[600000:3000000] 183 | valid_stacking= x_stacking[:600000] 184 | y_train = y[600000:3000000] 185 | y_valid = y[:600000] 186 | else: 187 | pass 188 | 189 | test_cid = x_cid[3000000:] 190 | test_advid = x_advid[3000000:] 191 | test_stacking = x_stacking[3000000:] 192 | 193 | del x_cid, x_advid 194 | del x_stacking 195 | del y 196 | gc.collect() 197 | 198 | print(train_cid.shape, valid_cid.shape, test_cid.shape) 199 | print(train_advid.shape, valid_advid.shape, test_advid.shape) 200 | print(train_stacking.shape, valid_stacking.shape, test_stacking.shape) 201 | print(y_train.shape, y_valid.shape) 202 | 203 | used_minutes = (time.time() - start_time) / 60 204 | print(f"done, used {used_minutes} minutes") 205 | 206 | 207 | print("building model") 208 | 209 | stacking_shape = train_stacking.shape[1] 210 | 211 | start_time = time.time() 212 | 213 | def build_model(emb_cid, emb_advid): 214 | 215 | inp1 = layers.Input(shape=(max_len,)) 216 | inp2 = layers.Input(shape=(max_len,)) 217 | inp_stacking = layers.Input(shape=(stacking_shape,)) 218 | 219 | emb1 = layers.Embedding( 220 | input_dim=emb_cid.shape[0], 221 | output_dim=emb_cid.shape[1], 222 | input_length=max_len, 223 | weights=[emb_cid], 224 | trainable=False 225 | )(inp1) 226 | emb2 = layers.Embedding( 227 | input_dim=emb_advid.shape[0], 228 | output_dim=emb_advid.shape[1], 229 | input_length=max_len, 230 | weights=[emb_advid], 231 | trainable=False 232 | )(inp2) 233 | 234 | sdrop = layers.SpatialDropout1D(rate=0.1) 235 | 236 | emb1 = sdrop(emb1) 237 | emb2 = sdrop(emb2) 238 | 239 | content = layers.Concatenate()([emb1, emb2]) 240 | 241 | mha1 = MultiHeadAttention(head_num=8)(content) 242 | # mha1 = layers.Dropout(0.01)(mha1) 243 | mha1 = layers.Add()([content, mha1]) 244 | mha1 = LayerNormalization()(mha1) 245 | # mha1 = layers.Dropout(0.01)(mha1) 246 | mha1_ff = FeedForward(128)(mha1) 247 | mha1_out = layers.Add()([mha1, mha1_ff]) 248 | mha1_out = LayerNormalization()(mha1_out) 249 | 250 | # mha2 = MultiHeadAttention(head_num=8)(mha1_out) 251 | # mha2 = layers.Dropout(0.01)(mha2) 252 | # mha2 = layers.Add()([mha1_out, mha2]) 253 | # mha2 = LayerNormalization()(mha2) 254 | # mha2 = layers.Dropout(0.01)(mha2) 255 | # mha2_ff = FeedForward(128)(mha2) 256 | # mha2_out = layers.Add()([mha2, mha2_ff]) 257 | # mha2_out = LayerNormalization()(mha2_out) 258 | 259 | # mha3 = MultiHeadAttention(head_num=8)(mha2_out) 260 | # mha3 = layers.Dropout(0.01)(mha3) 261 | # mha3 = layers.Add()([mha2_out, mha3]) 262 | # mha3 = LayerNormalization()(mha3) 263 | # mha3 = layers.Dropout(0.01)(mha3) 264 | # mha3_ff = FeedForward(128)(mha3) 265 | # 
mha3_out = layers.Add()([mha3, mha3_ff])
266 |     # mha3_out = LayerNormalization()(mha3_out)
267 | 
268 |     # avg_pool = layers.GlobalAveragePooling1D()(mha3_out)
269 |     max_pool = layers.GlobalMaxPool1D()(mha1_out)
270 | 
271 |     x = layers.Concatenate()([max_pool, inp_stacking])
272 | 
273 |     x = layers.Dense(128, activation='relu')(x)
274 |     # x = layers.BatchNormalization()(x)
275 | 
276 |     x = layers.Dense(64, activation='relu')(x)
277 |     # x = layers.BatchNormalization()(x)
278 | 
279 |     x = layers.Dense(32, activation='relu')(x)
280 |     # x = layers.BatchNormalization()(x)
281 | 
282 |     # x = layers.Dropout(0.1)(x)
283 | 
284 |     out = layers.Dense(10, activation='softmax')(x)
285 |     model = keras.Model(inputs=[inp1, inp2, inp_stacking], outputs=out)
286 |     model.compile(loss='categorical_crossentropy',
287 |                   optimizer=keras.optimizers.Adam(5e-4),
288 |                   metrics=['accuracy'])
289 | 
290 |     return model
291 | 
292 | 
293 | model = build_model(emb_cid, emb_advid)
294 | 
295 | used_minutes = (time.time() - start_time) / 60
296 | print(f"done, used {used_minutes} minutes")
297 | 
298 | 
299 | checkpoint = callbacks.ModelCheckpoint(f'../../models/age_transformer_stacking_{fold}.h5',
300 |                                        monitor='val_accuracy',
301 |                                        verbose=1,
302 |                                        save_best_only=True,
303 |                                        mode='max',
304 |                                        save_weights_only=True)
305 | 
306 | reduce_lr = callbacks.ReduceLROnPlateau(monitor='val_accuracy',
307 |                                         factor=0.1,
308 |                                         patience=4,
309 |                                         verbose=1,
310 |                                         mode='max',
311 |                                         epsilon=1e-6)
312 | 
313 | early_stop = callbacks.EarlyStopping(monitor='val_accuracy',
314 |                                      mode='max',
315 |                                      patience=10)
316 | 
317 | 
318 | hist = model.fit([train_cid, train_advid, train_stacking],
319 |                  y_train,
320 |                  batch_size=batch_size,
321 |                  epochs=100,
322 |                  validation_data=([valid_cid, valid_advid, valid_stacking], y_valid),
323 |                  callbacks=[
324 |                      checkpoint,
325 |                      reduce_lr,
326 |                      early_stop],
327 |                  verbose=1,
328 |                  shuffle=True)
329 | 
330 | 
331 | acc = max(hist.history['val_accuracy'])
332 | print(acc)
333 | 
334 | 
335 | print("predict start")
336 | start_time = time.time()
337 | 
338 | model.load_weights(f'../../models/age_transformer_stacking_{fold}.h5')
339 | preds = model.predict([test_cid, test_advid, test_stacking],
340 |                       batch_size=batch_size,
341 |                       verbose=1)
342 | 
343 | np.save(f'../../probs/sub_age_transformer_stacking_{fold}', preds)
344 | 
345 | used_minutes = (time.time() - start_time) / 60
346 | print(f"done, used {used_minutes} minutes")
347 | 
-------------------------------------------------------------------------------- /src/stacking/f5_merge_fold_results.py: --------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # coding: utf-8
 3 | 
 4 | #############################################################################
 5 | # Merge the five finished fold runs into oof and sub probability arrays
 6 | #############################################################################
 7 | 
 8 | import sys
 9 | 
10 | import warnings
11 | warnings.simplefilter('ignore')
12 | 
13 | import numpy as np
14 | import pandas as pd
15 | 
16 | from sklearn.metrics import accuracy_score
17 | 
18 | # the argument looks like age_m3_keras or gender_m1_keras
19 | name = sys.argv[1]
20 | 
21 | probs_pth = '../../probs'
22 | 
23 | # fold4 validated the first 600k users and fold0 the last 600k, so loading _4 first and _0 last restores the original user order
24 | train_f0 = np.load(f'{probs_pth}/oof_{name}_4.npy')
25 | train_f1 = np.load(f'{probs_pth}/oof_{name}_3.npy')
26 | train_f2 = np.load(f'{probs_pth}/oof_{name}_2.npy')
27 | train_f3 = np.load(f'{probs_pth}/oof_{name}_1.npy')
28 | train_f4 = np.load(f'{probs_pth}/oof_{name}_0.npy')
29 | 
30 | test_f0 = np.load(f'{probs_pth}/sub_{name}_4.npy')
31 | test_f1 = np.load(f'{probs_pth}/sub_{name}_3.npy')
32 | test_f2 = np.load(f'{probs_pth}/sub_{name}_2.npy')
33 | test_f3 = np.load(f'{probs_pth}/sub_{name}_1.npy')
34 | test_f4 = np.load(f'{probs_pth}/sub_{name}_0.npy')
35 | 
36 | 
37 | oof_probs = np.concatenate((train_f0, train_f1, train_f2, train_f3, train_f4), axis=0)
38 | test_probs = (test_f0 + test_f1 + test_f2 + test_f3 + test_f4) / 5
39 | 
40 | 
41 | np.save(f'{probs_pth}/oof_{name}', oof_probs)
42 | np.save(f'{probs_pth}/sub_{name}', test_probs)
43 | 
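`accuracy_score` is imported above but never used; a hedged addition one could append to `f5_merge_fold_results.py` to report the merged OOF accuracy of an age model — a sketch which assumes the labels are loaded exactly as the training scripts load them (preliminary then semi-final, in order):

```
labels = pd.concat([pd.read_csv('../../raw_data/train_preliminary/user.csv'),
                    pd.read_csv('../../raw_data/train_semi_final/user.csv')])

if name.startswith('age'):
    # training shifted age to 0..9, so compare against labels['age'] - 1
    acc = accuracy_score(labels['age'] - 1, oof_probs.argmax(axis=1))
    print(f'{name} oof accuracy: {acc:.5f}')
```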
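A quick way to check the two artifacts written above before the embedding and training stages consume them — a sketch, not part of the pipeline; the assertion lists only the ad-side columns that the torch models below actually read:

```
import pandas as pd

df_log = pd.read_pickle('../../data/torch/log.pkl')
df_feature = pd.read_pickle('../../data/torch/feature.pkl')
print(df_log.shape, df_feature.shape)
for col in ['creative_id', 'ad_id', 'advertiser_id',
            'product_id', 'product_category', 'industry']:
    assert col in df_log.columns, col
```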
-------------------------------------------------------------------------------- /src/torch/f1_save_data.py: --------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # coding: utf-8
 3 | 
 4 | import warnings
 5 | warnings.filterwarnings('ignore')
 6 | 
 7 | import os
 8 | import pandas as pd
 9 | from sklearn.model_selection import GroupKFold
10 | import gc
11 | from tqdm import tqdm
12 | import numpy as np
13 | from collections import defaultdict
14 | import math
15 | import pickle
16 | 
17 | pd.set_option('display.max_columns', None)
18 | pd.set_option('display.max_rows', None)
19 | 
20 | seed = 2020
21 | 
22 | # load the raw datasets
23 | df_train_ad = pd.read_csv('../../raw_data/train_preliminary/ad.csv')
24 | df_train_log = pd.read_csv('../../raw_data/train_preliminary/click_log.csv')
25 | df_train_user = pd.read_csv('../../raw_data/train_preliminary/user.csv')
26 | 
27 | df_test_ad = pd.read_csv('../../raw_data/test/ad.csv')
28 | df_test_log = pd.read_csv('../../raw_data/test/click_log.csv')
29 | 
30 | df_train_semi_final_ad = pd.read_csv('../../raw_data/train_semi_final/ad.csv')
31 | df_train_semi_final_log = pd.read_csv('../../raw_data/train_semi_final/click_log.csv')
32 | df_train_semi_final_user = pd.read_csv('../../raw_data/train_semi_final/user.csv')
33 | 
34 | df_train_user = df_train_user.append(df_train_semi_final_user)
35 | df_train_log = df_train_log.append(df_train_semi_final_log)
36 | df_train_ad = df_train_ad.append(df_train_semi_final_ad)
37 | 
38 | 
39 | # collect all users
40 | df_test_user = df_test_log[['user_id']]
41 | df_test_user.drop_duplicates(inplace=True)
42 | df_feature = pd.concat([df_train_user, df_test_user], sort=False)
43 | 
44 | # click-log data: join each click with its ad attributes
45 | df_ad = pd.concat([df_train_ad, df_test_ad], sort=False)
46 | df_ad.drop_duplicates(inplace=True)
47 | 
48 | df_log = pd.concat([df_train_log, df_test_log], sort=False)
49 | df_log.sort_values(['user_id', 'time'], inplace=True)
50 | 
51 | df_log = df_log.merge(df_ad, how='left', on='creative_id')
52 | 
53 | df_feature.to_pickle('../../data/torch/feature.pkl')
54 | df_log.to_pickle('../../data/torch/log.pkl')
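-------------------------------------------------------------------------------- /src/torch/f2_save_embedding_w2v.py: --------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # coding: utf-8
 3 | 
 4 | import pandas as pd
 5 | import warnings
 6 | warnings.filterwarnings('ignore')
 7 | 
 8 | from sklearn.model_selection import GroupKFold
 9 | import gc
10 | from tqdm import tqdm
11 | import numpy as np
12 | from collections import defaultdict
13 | import math
14 | import pickle
15 | from gensim.models import Word2Vec
16 | import logging
17 | import os
18 | from gensim.models import KeyedVectors
19 | 
20 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
21 |                     level=logging.INFO)
22 | 
23 | pd.set_option('display.max_columns', None)
24 | pd.set_option('display.max_rows', None)
25 | 
26 | 
27 | seed = 2020
28 | 
29 | df_log = pd.read_pickle('../../data/torch/log.pkl')
30 | 
31 | def emb(df, f1, f2, emb_size):
32 |     # group the f2 values of each f1 (user) into a "sentence", train a skip-gram
33 |     # w2v model on those sentences, and return one vector per f2 id
34 |     print(
35 |         '====================================== {} {} ======================================'
36 |         .format(f1, f2))
37 |     tmp = df.groupby(f1, as_index=False)[f2].agg(
38 |         {'{}_{}_list'.format(f1, f2): list})
39 |     sentences = tmp['{}_{}_list'.format(f1, f2)].values.tolist()
40 |     del tmp['{}_{}_list'.format(f1, f2)]
41 | 
42 |     words = []
43 |     for i in range(len(sentences)):
44 |         words += [x for x in sentences[i]]
45 |         sentences[i] = [str(x) for x in sentences[i]]
46 | 
47 |     model = Word2Vec(sentences,
48 |                      size=emb_size,
49 |                      window=10,
50 |                      min_count=1,
51 |                      sg=1,
52 |                      hs=1,
53 |                      workers=30,
54 |                      seed=seed)
55 | 
56 |     emb_matrix = []
57 |     words = list(set(words))
58 |     for w in tqdm(words):
59 |         if str(w) in model:   # gensim 3.x style lookup; use model.wv under gensim 4+
60 |             emb_matrix.append(model[str(w)])
61 |         else:
62 |             emb_matrix.append([0] * emb_size)
63 | 
64 |     df_emb = pd.DataFrame(emb_matrix)
65 |     df_emb.columns = [
66 |         '{}_{}_w2v_{}'.format(f1, f2, i) for i in range(emb_size)
67 |     ]
68 |     df_emb[f2] = words
69 | 
70 |     return df_emb
71 | 
72 | 
73 | # the torch models below also load w2v_creative_id_128.pkl,
74 | # w2v_advertiser_id_128.pkl and w2v_product_category_128.pkl, so all six id
75 | # fields are trained here
76 | for f1, f2, dim in [['user_id', 'creative_id', 128], ['user_id', 'ad_id', 128],
77 |                     ['user_id', 'advertiser_id', 128], ['user_id', 'product_id', 128],
78 |                     ['user_id', 'product_category', 128], ['user_id', 'industry', 128]]:
79 |     df_emb = emb(df_log, f1, f2, dim)
80 | 
81 |     df_emb.to_pickle('../../w2v_models/w2v_{}_{}.pkl'.format(f2, dim))
82 | 
83 |     del df_emb
84 |     gc.collect()
-------------------------------------------------------------------------------- /src/torch/f3_AGE_m7_lstm_3inputs_train_5fold.py: --------------------------------------------------------------------------------
 1 | #################################################################################
 2 | # AGE model 7: Torch LSTM+Attention 3 inputs
 3 | # score:
 4 | # 5-fold: 0.50083 (offline)
 5 | # 5-fold: 0.51446 (online)
 6 | # training time: ~ 4 days
 7 | #################################################################################
 8 | 
 9 | 
10 | import pandas as pd
11 | import warnings
12 | import gc
13 | from tqdm import tqdm
14 | import numpy as np
15 | from collections import defaultdict, OrderedDict
16 | import math
17 | import pickle
18 | import random
19 | import torch
20 | import torch.nn as nn
21 | from sklearn import preprocessing
22 | from pytorchtools import EarlyStopping
23 | from sklearn.model_selection import KFold
24 | from sklearn.metrics import accuracy_score
25 | import os
26 | import torch_optimizer as optim
27 | from m7_lstm_3inputs_age import *
28 | 
29 | pd.set_option('display.max_columns', None)
30 | pd.set_option('display.max_rows', None)
31 | 
32 | warnings.filterwarnings('ignore')
33 | 
34 | torch.cuda.set_device(0)
35 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
36 | 
37 | 
38 | def fix_seed(seed):
39 |     random.seed(seed)
40 |     np.random.seed(seed)
41 |     torch.manual_seed(seed)
42 |     torch.cuda.manual_seed_all(seed)
43 |     torch.backends.cudnn.deterministic = True
44 | 
45 | 
46 | seed = 2020
47 | fix_seed(seed)
48 | 
49 | 
50 | df_log = pd.read_pickle('../../data/torch/log.pkl')
51 | 
52 | 
53 | seq_embedding_features = OrderedDict({
54 |     'creative_id': {
55 |         'embedding_file': '../../w2v_models/w2v_creative_id_128.pkl',
56 |         'embedding_dim': 128,
57 |         'pretrained_embedding': None,
58 |     },
59 |     'advertiser_id': {
60 |         'embedding_file': '../../w2v_models/w2v_advertiser_id_128.pkl',
61 |         'embedding_dim': 128,
62 |         'pretrained_embedding': None,
63 |     },
64 |     'product_id': {
65 |         'embedding_file': '../../w2v_models/w2v_product_id_128.pkl',
66 |         'embedding_dim': 128,
67 |         'pretrained_embedding': None,
68 |     },
69 | })
70 | 
71 | 
72 | for f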
in tqdm(seq_embedding_features.keys()): 73 | le = preprocessing.LabelEncoder() 74 | le.fit(df_log[f].values.tolist()) 75 | 76 | df_emb = pd.read_pickle(seq_embedding_features[f]['embedding_file']) 77 | df_emb = df_emb[df_emb[f].isin(df_log[f].values.tolist())] 78 | assert df_emb.shape[1] == seq_embedding_features[f]['embedding_dim'] + 1 79 | df_emb[f] = le.transform(df_emb[f].values.tolist()) + 1 80 | 81 | # 补上作为序列填补的 0 向量 82 | df_default = pd.DataFrame() 83 | df_default[f] = [0] 84 | df_emb = df_emb.append(df_default) 85 | df_emb.fillna(0, inplace=True) 86 | 87 | # 按 id 排序 88 | df_emb.sort_values([f], inplace=True) 89 | embedding_columns = [c for c in df_emb.columns if c != f] 90 | seq_embedding_features[f]['pretrained_embedding'] = [ 91 | v for v in df_emb[embedding_columns].values 92 | ] 93 | 94 | del df_default, df_emb 95 | gc.collect() 96 | 97 | df_log[f] = le.transform(df_log[f].values.tolist()) + 1 98 | seq_embedding_features[f]['nunique'] = df_log[f].nunique() + 1 99 | 100 | 101 | # # 序列特征 102 | 103 | # ## 序列 id 特征 104 | 105 | seq_len = 128 106 | 107 | def gen_seq_data(data, features, seq_len, prefix=''): 108 | data.sort_values('time', inplace=True) 109 | data_set = OrderedDict() 110 | 111 | user_ids = [] 112 | for user_id, hist in tqdm(data.groupby('user_id')): 113 | user_ids.append(user_id) 114 | 115 | # 取最近的记录 116 | for f in features: 117 | hist_f = hist[f].values 118 | hist_f = hist_f[-seq_len:] 119 | 120 | if f not in data_set: 121 | data_set[f] = [] 122 | 123 | data_set[f].append(hist_f) 124 | 125 | for f in features: 126 | df_context = pd.DataFrame() 127 | df_context['user_id'] = user_ids 128 | df_context['{}_seq'.format(f)] = data_set[f] 129 | 130 | df_context.to_pickle('../../data/torch/{}{}_seqs_{}.pkl'.format(prefix, f, seq_len)) 131 | 132 | 133 | # 是否从本地加载seq数据 134 | load_seq = True 135 | prefix = str(df_log.shape[0]) + '_' if df_log.shape[0] < 10000 else '' 136 | # 不加载本地seq,强制重新生成所有seq 137 | seq_features = [] 138 | if not load_seq: 139 | seq_features = list(seq_embedding_features.keys()) 140 | else: 141 | for f in seq_embedding_features.keys(): 142 | if not os.path.exists('../../data/torch/{}{}_seqs_{}.pkl'.format(prefix, f, seq_len)): 143 | seq_features += [f] 144 | 145 | if len(seq_features) != 0: 146 | df_context = gen_seq_data(df_log, seq_features, seq_len, prefix) 147 | 148 | # 合并序列 149 | all_users = list(df_log['user_id'].unique()) 150 | all_users.sort() 151 | df_context = pd.DataFrame(all_users) 152 | df_context.columns = ['user_id'] 153 | for f in seq_embedding_features.keys(): 154 | df_seq = pd.read_pickle('../../data/torch/{}{}_seqs_{}.pkl'.format(prefix, f, seq_len)) 155 | df_context = df_context.merge(df_seq, how='left') 156 | 157 | 158 | # ## 序列统计特征 159 | 160 | seq_statistics_features = [] 161 | df_statistics_context = None 162 | 163 | 164 | # # 合并其他特征 165 | 166 | # ## 标签 167 | 168 | df_feature = pd.read_pickle('../../data/torch/feature.pkl') 169 | df_feature['age'] = df_feature['age'].astype('float') 170 | df_feature['age'] = df_feature['age'] - 1 171 | del df_feature['gender'] 172 | 173 | user_ids = list(set(df_log['user_id'].values)) 174 | df_feature = df_feature[df_feature['user_id'].isin(user_ids)] 175 | df_feature.sort_values(['user_id'], inplace=True) 176 | df_feature.reset_index(drop=True, inplace=True) 177 | 178 | df_feature = df_feature.merge(df_context, how='left') 179 | 180 | if df_statistics_context: 181 | df_feature = df_feature.merge(df_statistics_context, how='left') 182 | del df_statistics_context 183 | 184 | del df_context 185 | 
gc.collect() 186 | 187 | 188 | # ## target encoder 特征 189 | # 效果不好, 留空 190 | 191 | statistics_features = [] 192 | 193 | 194 | # # 模型训练 195 | 196 | 197 | train_model_inputs = df_feature[df_feature['age'].notnull()].reset_index( 198 | drop=True) 199 | test_model_inputs = df_feature[df_feature['age'].isnull()].reset_index( 200 | drop=True) 201 | 202 | print(train_model_inputs.shape, test_model_inputs.shape) 203 | 204 | kfold = KFold(n_splits=5, shuffle=True, random_state=seed) 205 | for fold_id, (trn_idx, val_idx) in enumerate(kfold.split(train_model_inputs)): 206 | print('\nFold_{} Training ============================================\n'. 207 | format(fold_id + 1)) 208 | 209 | train_data = train_model_inputs.iloc[trn_idx] 210 | val_data = train_model_inputs.iloc[val_idx] 211 | 212 | # 模型定义 213 | model = LSTMCLF(seq_embedding_features=seq_embedding_features, 214 | statistics_features=statistics_features, 215 | seq_statistics_features=seq_statistics_features, 216 | seq_len=seq_len, 217 | device=device).to(device) 218 | 219 | criterion = nn.CrossEntropyLoss().to(device) 220 | optimizer = optim.Ranger(filter(lambda p: p.requires_grad, model.parameters()), lr=1e-2) 221 | scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 222 | mode='max', 223 | factor=0.1, 224 | patience=2, 225 | min_lr=1e-6, 226 | verbose=True) 227 | early_stopping = EarlyStopping( 228 | file_name='../../models/age_m7_checkpoint{}.pt'.format(fold_id), 229 | patience=10, 230 | verbose=True, 231 | delta=0.00000001) 232 | 233 | model.set(criterion, optimizer, scheduler, early_stopping) 234 | 235 | batch_size = 1000 236 | epoches = 10000 237 | best_age_acc = model.model_train(train_data, val_data, epoches, batch_size) 238 | print('age_acc: {}'.format(best_age_acc)) 239 | 240 | test_data = test_model_inputs 241 | 242 | oof_pred_age = np.zeros((train_model_inputs.shape[0], 10)) 243 | test_pred_age = np.zeros((test_data.shape[0], 10)) 244 | 245 | kfold = KFold(n_splits=5, shuffle=True, random_state=seed) 246 | for fold_id, (trn_idx, val_idx) in enumerate(kfold.split(train_model_inputs)): 247 | print('\nFold_{} Training ============================================\n'. 
248 | format(fold_id + 1)) 249 | 250 | model = LSTMCLF(seq_embedding_features=seq_embedding_features, 251 | statistics_features=statistics_features, 252 | seq_statistics_features=seq_statistics_features, 253 | seq_len=seq_len, 254 | device=device).to(device) 255 | model.load_state_dict(torch.load('../../models/age_m7_checkpoint{}.pt'.format(fold_id)), strict=False) 256 | model.eval() 257 | 258 | with torch.no_grad(): 259 | val_data = train_model_inputs.iloc[val_idx] 260 | 261 | # 对训练集预测 262 | model_pred_age, _, _ = model.model_predict(val_data, batch_size, False) 263 | oof_pred_age[val_idx] += model_pred_age 264 | 265 | # 对测试集预测 266 | model_pred_age, _, _ = model.model_predict(test_data, batch_size, False) 267 | test_pred_age += model_pred_age / 5 268 | 269 | df_oof = train_model_inputs[['user_id', 'age']] 270 | df_oof['predicted_age'] = np.argmax(oof_pred_age, axis=1) 271 | acc_age = accuracy_score(df_oof['age'], df_oof['predicted_age']) 272 | print(acc_age) 273 | 274 | np.save('../../probs/sub_age_m7_torch', test_pred_age) 275 | np.save('../../probs/oof_age_m7_torch', oof_pred_age) 276 | 277 | -------------------------------------------------------------------------------- /src/torch/f4_AGE_m1_lstm_6inputs_train_5fold.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | ################################################################################# 5 | # AGE model 1: Torch LSTM+Attention 6 inputs 6 | # score: 7 | # 五折: 0.49961 (线下) 8 | # 五折: 0.51326 (线上) 9 | # 训练时长: ~ 5 days 10 | ################################################################################# 11 | 12 | 13 | import pandas as pd 14 | import warnings 15 | import gc 16 | from tqdm import tqdm 17 | import numpy as np 18 | from collections import defaultdict, OrderedDict 19 | import math 20 | import pickle 21 | import random 22 | import torch 23 | import torch.nn as nn 24 | from sklearn import preprocessing 25 | from pytorchtools import EarlyStopping 26 | from sklearn.model_selection import KFold 27 | from sklearn.metrics import accuracy_score 28 | import os 29 | from lookahead import * 30 | from m1_lstm_6inputs_age import * 31 | 32 | pd.set_option('display.max_columns', None) 33 | pd.set_option('display.max_rows', None) 34 | 35 | warnings.filterwarnings('ignore') 36 | 37 | torch.cuda.set_device(0) 38 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 39 | 40 | 41 | def fix_seed(seed): 42 | random.seed(seed) 43 | np.random.seed(seed) 44 | torch.manual_seed(seed) 45 | torch.cuda.manual_seed_all(seed) 46 | torch.backends.cudnn.deterministic = True 47 | 48 | 49 | seed = 2020 50 | fix_seed(seed) 51 | 52 | 53 | df_log = pd.read_pickle('../../data/torch/log.pkl') 54 | 55 | seq_embedding_features = OrderedDict({ 56 | 'creative_id': { 57 | 'embedding_file': '../../w2v_models/w2v_creative_id_128.pkl', 58 | 'embedding_dim': 128, 59 | 'pretrained_embedding': None, 60 | }, 61 | 'industry': { 62 | 'embedding_file': '../../w2v_models/w2v_industry_128.pkl', 63 | 'embedding_dim': 128, 64 | 'pretrained_embedding': None, 65 | }, 66 | 'ad_id': { 67 | 'embedding_file': '../../w2v_models/w2v_ad_id_128.pkl', 68 | 'embedding_dim': 128, 69 | 'pretrained_embedding': None, 70 | }, 71 | 'advertiser_id': { 72 | 'embedding_file': '../../w2v_models/w2v_advertiser_id_128.pkl', 73 | 'embedding_dim': 128, 74 | 'pretrained_embedding': None, 75 | }, 76 | 'product_category': { 77 | 'embedding_file': '../../w2v_models/w2v_product_category_128.pkl', 78 | 
'embedding_dim': 128, 79 | 'pretrained_embedding': None, 80 | }, 81 | 'product_id': { 82 | 'embedding_file': '../../w2v_models/w2v_product_id_128.pkl', 83 | 'embedding_dim': 128, 84 | 'pretrained_embedding': None, 85 | }, 86 | }) 87 | 88 | 89 | 90 | for f in tqdm(seq_embedding_features.keys()): 91 | le = preprocessing.LabelEncoder() 92 | le.fit(df_log[f].values.tolist()) 93 | 94 | df_emb = pd.read_pickle(seq_embedding_features[f]['embedding_file']) 95 | df_emb = df_emb[df_emb[f].isin(df_log[f].values.tolist())] 96 | assert df_emb.shape[1] == seq_embedding_features[f]['embedding_dim'] + 1 97 | df_emb[f] = le.transform(df_emb[f].values.tolist()) + 1 98 | 99 | # 补上作为序列填补的 0 向量 100 | df_default = pd.DataFrame() 101 | df_default[f] = [0] 102 | df_emb = df_emb.append(df_default) 103 | df_emb.fillna(0, inplace=True) 104 | 105 | # 按 id 排序 106 | df_emb.sort_values([f], inplace=True) 107 | embedding_columns = [c for c in df_emb.columns if c != f] 108 | seq_embedding_features[f]['pretrained_embedding'] = [ 109 | v for v in df_emb[embedding_columns].values 110 | ] 111 | 112 | del df_default, df_emb 113 | gc.collect() 114 | 115 | df_log[f] = le.transform(df_log[f].values.tolist()) + 1 116 | seq_embedding_features[f]['nunique'] = df_log[f].nunique() + 1 117 | 118 | # ## 序列 id 特征 119 | 120 | seq_len = 128 121 | 122 | def gen_seq_data(data, features, seq_len, prefix=''): 123 | data.sort_values('time', inplace=True) 124 | data_set = OrderedDict() 125 | 126 | user_ids = [] 127 | for user_id, hist in tqdm(data.groupby('user_id')): 128 | user_ids.append(user_id) 129 | 130 | # 取最近的记录 131 | for f in features: 132 | hist_f = hist[f].values 133 | hist_f = hist_f[-seq_len:] 134 | 135 | if f not in data_set: 136 | data_set[f] = [] 137 | 138 | data_set[f].append(hist_f) 139 | 140 | for f in features: 141 | df_context = pd.DataFrame() 142 | df_context['user_id'] = user_ids 143 | df_context['{}_seq'.format(f)] = data_set[f] 144 | 145 | df_context.to_pickle('../../data/torch/{}{}_seqs_{}.pkl'.format(prefix, f, seq_len)) 146 | 147 | 148 | # 是否从本地加载seq数据 149 | load_seq = True 150 | prefix = str(df_log.shape[0]) + '_' if df_log.shape[0] < 10000 else '' 151 | # 不加载本地seq,强制重新生成所有seq 152 | seq_features = [] 153 | if not load_seq: 154 | seq_features = list(seq_embedding_features.keys()) 155 | else: 156 | for f in seq_embedding_features.keys(): 157 | if not os.path.exists('../../data/torch/{}{}_seqs_{}.pkl'.format(prefix, f, seq_len)): 158 | seq_features += [f] 159 | print(seq_features) 160 | 161 | if len(seq_features) != 0: 162 | df_context = gen_seq_data(df_log, seq_features, seq_len, prefix) 163 | 164 | # 合并序列 165 | all_users = list(df_log['user_id'].unique()) 166 | all_users.sort() 167 | df_context = pd.DataFrame(all_users) 168 | df_context.columns = ['user_id'] 169 | for f in seq_embedding_features.keys(): 170 | df_seq = pd.read_pickle('../../data/torch/{}{}_seqs_{}.pkl'.format(prefix, f, seq_len)) 171 | df_context = df_context.merge(df_seq, how='left') 172 | 173 | 174 | # ## 序列统计特征 175 | 176 | 177 | seq_statistics_features = [] 178 | df_statistics_context = None 179 | 180 | 181 | df_feature = pd.read_pickle('../../data/torch/feature.pkl') 182 | df_feature['age'] = df_feature['age'].astype('float') 183 | df_feature['age'] = df_feature['age'] - 1 184 | del df_feature['gender'] 185 | 186 | user_ids = list(set(df_log['user_id'].values)) 187 | df_feature = df_feature[df_feature['user_id'].isin(user_ids)] 188 | df_feature.sort_values(['user_id'], inplace=True) 189 | df_feature.reset_index(drop=True, inplace=True) 190 | 191 | 
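# alignment check (a sketch, not in the original): each pretrained_embedding
# list is ordered so that row k holds the vector of encoded id k, with row 0
# the all-zero padding vector appended in the loop above
_emb = np.vstack(seq_embedding_features['creative_id']['pretrained_embedding'])
assert _emb.shape[0] == seq_embedding_features['creative_id']['nunique']
assert not _emb[0].any()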
df_feature = df_feature.merge(df_context, how='left') 192 | 193 | if df_statistics_context: 194 | df_feature = df_feature.merge(df_statistics_context, how='left') 195 | del df_statistics_context 196 | 197 | del df_context 198 | gc.collect() 199 | 200 | 201 | statistics_features = [] 202 | 203 | # # 模型训练 204 | 205 | 206 | train_model_inputs = df_feature[df_feature['age'].notnull()].reset_index( 207 | drop=True) 208 | test_model_inputs = df_feature[df_feature['age'].isnull()].reset_index( 209 | drop=True) 210 | 211 | 212 | kfold = KFold(n_splits=5, shuffle=True, random_state=seed) 213 | for fold_id, (trn_idx, val_idx) in enumerate(kfold.split(train_model_inputs)): 214 | print('\nFold_{} Training ============================================\n'. 215 | format(fold_id + 1)) 216 | 217 | train_data = train_model_inputs.iloc[trn_idx] 218 | val_data = train_model_inputs.iloc[val_idx] 219 | 220 | # 模型定义 221 | model = LSTMCLF(seq_embedding_features=seq_embedding_features, 222 | statistics_features=statistics_features, 223 | seq_statistics_features=seq_statistics_features, 224 | seq_len=seq_len, 225 | device=device).to(device) 226 | 227 | criterion = nn.CrossEntropyLoss().to(device) 228 | optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, 229 | model.parameters()), 230 | lr=1e-2) 231 | optimizer = Lookahead(optimizer, k=5, alpha=0.5) 232 | scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 233 | mode='max', 234 | factor=0.1, 235 | patience=2, 236 | min_lr=1e-6, 237 | verbose=True) 238 | early_stopping = EarlyStopping( 239 | file_name='../../models/age_m1_checkpoint{}.pt'.format(fold_id), 240 | patience=10, 241 | verbose=True, 242 | delta=0.00000001) 243 | 244 | model.set(criterion, optimizer, scheduler, early_stopping) 245 | 246 | batch_size = 1000 247 | # 6000 248 | epoches = 10000 249 | best_age_acc = model.model_train(train_data, val_data, epoches, batch_size) 250 | print('age_acc: {}'.format(best_age_acc)) 251 | 252 | 253 | test_data = test_model_inputs 254 | 255 | oof_pred_age = np.zeros((train_model_inputs.shape[0], 10)) 256 | test_pred_age = np.zeros((test_data.shape[0], 10)) 257 | 258 | kfold = KFold(n_splits=5, shuffle=True, random_state=seed) 259 | for fold_id, (trn_idx, val_idx) in enumerate(kfold.split(train_model_inputs)): 260 | print('\nFold_{} Training ============================================\n'. 
261 | format(fold_id + 1)) 262 | 263 | model = LSTMCLF(seq_embedding_features=seq_embedding_features, 264 | statistics_features=statistics_features, 265 | seq_statistics_features=seq_statistics_features, 266 | seq_len=seq_len, 267 | device=device).to(device) 268 | model.load_state_dict(torch.load('../../models/age_m1_checkpoint{}.pt'.format(fold_id)), strict=False) 269 | model.eval() 270 | 271 | with torch.no_grad(): 272 | val_data = train_model_inputs.iloc[val_idx] 273 | 274 | # 对训练集预测 275 | model_pred_age, _, _ = model.model_predict(val_data, batch_size, False) 276 | oof_pred_age[val_idx] += model_pred_age 277 | 278 | # 对测试集预测 279 | model_pred_age, _, _ = model.model_predict(test_data, batch_size, False) 280 | test_pred_age += model_pred_age / 5 281 | 282 | 283 | df_oof = train_model_inputs[['user_id', 'age']] 284 | df_oof['predicted_age'] = np.argmax(oof_pred_age, axis=1) 285 | acc_age = accuracy_score(df_oof['age'], df_oof['predicted_age']) 286 | print(acc_age) 287 | 288 | 289 | np.save('../../probs/sub_age_m1_torch', test_pred_age) 290 | np.save('../../probs/oof_age_m1_torch', oof_pred_age) 291 | 292 | 293 | -------------------------------------------------------------------------------- /src/torch/f5_AGE_m9_transformer_3inputs_train_5fold.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | ################################################################################# 5 | # AGE model 9: Torch Transformer+LSTM 3 inputs 6 | # score: 7 | # 五折: 0.50078 (线下) 8 | # 五折: 0.51433 (线上) 9 | # 训练时长: ~ 4 days 10 | ################################################################################# 11 | 12 | 13 | import pandas as pd 14 | import warnings 15 | import gc 16 | from tqdm import tqdm 17 | import numpy as np 18 | from collections import defaultdict, OrderedDict 19 | import math 20 | import pickle 21 | import random 22 | import torch 23 | import torch.nn as nn 24 | from sklearn import preprocessing 25 | from pytorchtools import EarlyStopping 26 | import os 27 | from sklearn.model_selection import KFold 28 | import torch_optimizer as optim 29 | from sklearn.metrics import accuracy_score 30 | from m9_transformer_3inputs_age import * 31 | 32 | pd.set_option('display.max_columns', None) 33 | pd.set_option('display.max_rows', None) 34 | 35 | warnings.filterwarnings('ignore') 36 | 37 | torch.cuda.set_device(0) 38 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 39 | 40 | 41 | def fix_seed(seed): 42 | random.seed(seed) 43 | np.random.seed(seed) 44 | torch.manual_seed(seed) 45 | torch.cuda.manual_seed_all(seed) 46 | torch.backends.cudnn.deterministic = True 47 | 48 | seed = 2020 49 | fix_seed(seed) 50 | 51 | df_log = pd.read_pickle('../../data/torch/log.pkl') 52 | 53 | seq_embedding_features = OrderedDict({ 54 | 'creative_id': { 55 | 'embedding_file': '../../w2v_models/w2v_creative_id_128.pkl', 56 | 'embedding_dim': 128, 57 | 'pretrained_embedding': None, 58 | }, 59 | 'advertiser_id': { 60 | 'embedding_file': '../../w2v_models/w2v_advertiser_id_128.pkl', 61 | 'embedding_dim': 128, 62 | 'pretrained_embedding': None, 63 | }, 64 | 'product_id': { 65 | 'embedding_file': '../../w2v_models/w2v_product_id_128.pkl', 66 | 'embedding_dim': 128, 67 | 'pretrained_embedding': None, 68 | }, 69 | }) 70 | 71 | 72 | for f in tqdm(seq_embedding_features.keys()): 73 | le = preprocessing.LabelEncoder() 74 | le.fit(df_log[f].values.tolist()) 75 | 76 | df_emb = 
pd.read_pickle(seq_embedding_features[f]['embedding_file']) 77 | df_emb = df_emb[df_emb[f].isin(df_log[f].values.tolist())] 78 | assert df_emb.shape[1] == seq_embedding_features[f]['embedding_dim'] + 1 79 | df_emb[f] = le.transform(df_emb[f].values.tolist()) + 1 80 | 81 | # 补上作为序列填补的 0 向量 82 | df_default = pd.DataFrame() 83 | df_default[f] = [0] 84 | df_emb = df_emb.append(df_default) 85 | df_emb.fillna(0, inplace=True) 86 | 87 | # 按 id 排序 88 | df_emb.sort_values([f], inplace=True) 89 | embedding_columns = [c for c in df_emb.columns if c != f] 90 | seq_embedding_features[f]['pretrained_embedding'] = [ 91 | v for v in df_emb[embedding_columns].values 92 | ] 93 | 94 | del df_default, df_emb 95 | gc.collect() 96 | 97 | df_log[f] = le.transform(df_log[f].values.tolist()) + 1 98 | seq_embedding_features[f]['nunique'] = df_log[f].nunique() + 1 99 | 100 | 101 | # # 序列特征 102 | # ## 序列 id 特征 103 | seq_len = 128 104 | 105 | def gen_seq_data(data, features, seq_len, prefix=''): 106 | data.sort_values('time', inplace=True) 107 | data_set = OrderedDict() 108 | 109 | user_ids = [] 110 | for user_id, hist in tqdm(data.groupby('user_id')): 111 | user_ids.append(user_id) 112 | 113 | # 取最近的记录 114 | for f in features: 115 | hist_f = hist[f].values 116 | hist_f = hist_f[-seq_len:] 117 | 118 | if f not in data_set: 119 | data_set[f] = [] 120 | 121 | data_set[f].append(hist_f) 122 | 123 | for f in features: 124 | df_context = pd.DataFrame() 125 | df_context['user_id'] = user_ids 126 | df_context['{}_seq'.format(f)] = data_set[f] 127 | 128 | df_context.to_pickle('../../data/torch/{}{}_seqs_{}.pkl'.format(prefix, f, seq_len)) 129 | 130 | 131 | 132 | # 是否从本地加载seq数据 133 | load_seq = True 134 | prefix = str(df_log.shape[0]) + '_' if df_log.shape[0] < 10000 else '' 135 | # 不加载本地seq,强制重新生成所有seq 136 | seq_features = [] 137 | if not load_seq: 138 | seq_features = list(seq_embedding_features.keys()) 139 | else: 140 | for f in seq_embedding_features.keys(): 141 | if not os.path.exists('../../data/torch/{}{}_seqs_{}.pkl'.format(prefix, f, seq_len)): 142 | seq_features += [f] 143 | print(seq_features) 144 | 145 | if len(seq_features) != 0: 146 | gen_seq_data(df_log, seq_features, seq_len, prefix) 147 | 148 | # 合并序列 149 | all_users = list(df_log['user_id'].unique()) 150 | all_users.sort() 151 | df_context = pd.DataFrame(all_users) 152 | df_context.columns = ['user_id'] 153 | for f in seq_embedding_features.keys(): 154 | df_seq = pd.read_pickle('../../data/torch/{}{}_seqs_{}.pkl'.format(prefix, f, seq_len)) 155 | df_context = df_context.merge(df_seq, how='left') 156 | del df_seq 157 | gc.collect() 158 | 159 | 160 | # ## 序列统计特征 161 | 162 | seq_statistics_features = [] 163 | df_statistics_context = None 164 | 165 | 166 | # # 合并其他特征 167 | 168 | # ## 标签 169 | 170 | df_feature = pd.read_pickle('../../data/torch/feature.pkl') 171 | df_feature['age'] = df_feature['age'].astype('float') 172 | df_feature['age'] = df_feature['age'] - 1 173 | del df_feature['gender'] 174 | 175 | user_ids = list(set(df_log['user_id'].values)) 176 | df_feature = df_feature[df_feature['user_id'].isin(user_ids)] 177 | df_feature.sort_values(['user_id'], inplace=True) 178 | df_feature.reset_index(drop=True, inplace=True) 179 | 180 | df_feature = df_feature.merge(df_context, how='left') 181 | 182 | if df_statistics_context: 183 | df_feature = df_feature.merge(df_statistics_context, how='left') 184 | del df_statistics_context 185 | 186 | del df_context 187 | gc.collect() 188 | 189 | del df_log 190 | gc.collect() 191 | 192 | 193 | # ## target encoding 特征 194 | 
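# NOTE: intentionally left empty for this model; dense per-user features such as the
# target encodings produced by src/lgb/f2_save_target_encoding.py could be listed here
# and would flow into the network via the statistics_features argument passed to LSTMCLF below.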
195 | statistics_features = [] 196 | 197 | 198 | # # 模型训练 199 | 200 | train_model_inputs = df_feature[df_feature['age'].notnull()].reset_index( 201 | drop=True) 202 | test_model_inputs = df_feature[df_feature['age'].isnull()].reset_index( 203 | drop=True) 204 | 205 | 206 | kfold = KFold(n_splits=5, shuffle=True, random_state=seed) 207 | for fold_id, (trn_idx, val_idx) in enumerate(kfold.split(train_model_inputs)): 208 | print('\nFold_{} Training ============================================\n'. 209 | format(fold_id + 1)) 210 | 211 | train_data = train_model_inputs.iloc[trn_idx] 212 | val_data = train_model_inputs.iloc[val_idx] 213 | 214 | # 模型定义 215 | model = LSTMCLF(seq_embedding_features=seq_embedding_features, 216 | statistics_features=statistics_features, 217 | seq_statistics_features=seq_statistics_features, 218 | seq_len=seq_len, 219 | device=device).to(device) 220 | 221 | criterion = nn.CrossEntropyLoss().to(device) 222 | optimizer = optim.Ranger(filter(lambda p: p.requires_grad, model.parameters()), lr=5e-4) 223 | scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 224 | mode='max', 225 | factor=0.1, 226 | patience=2, 227 | min_lr=1e-6, 228 | verbose=True) 229 | early_stopping = EarlyStopping( 230 | file_name='../../models/age_m9_checkpoint{}.pt'.format(fold_id), 231 | patience=10, 232 | verbose=True, 233 | delta=0.00000001) 234 | 235 | model.set(criterion, optimizer, scheduler, early_stopping) 236 | 237 | batch_size = 512 238 | # 6000 239 | epoches = 600 240 | best_age_acc = model.model_train(train_data, val_data, epoches, batch_size) 241 | print('age_acc: {}'.format(best_age_acc)) 242 | 243 | 244 | test_data = test_model_inputs 245 | 246 | oof_pred_age = np.zeros((train_model_inputs.shape[0], 10)) 247 | test_pred_age = np.zeros((test_data.shape[0], 10)) 248 | 249 | kfold = KFold(n_splits=5, shuffle=True, random_state=seed) 250 | for fold_id, (trn_idx, val_idx) in enumerate(kfold.split(train_model_inputs)): 251 | print('\nFold_{} Training ============================================\n'. 
252 | format(fold_id + 1)) 253 | 254 | model = LSTMCLF(seq_embedding_features=seq_embedding_features, 255 | statistics_features=statistics_features, 256 | seq_statistics_features=seq_statistics_features, 257 | seq_len=seq_len, 258 | device=device).to(device) 259 | model.load_state_dict(torch.load('../../models/age_m9_checkpoint{}.pt'.format(fold_id)), strict=False) 260 | 261 | model.eval() 262 | 263 | with torch.no_grad(): 264 | val_data = train_model_inputs.iloc[val_idx] 265 | 266 | # 对训练集预测 267 | model_pred_age, _, _ = model.model_predict(val_data, batch_size, False) 268 | oof_pred_age[val_idx] += model_pred_age 269 | 270 | # 对测试集预测 271 | model_pred_age, _, _ = model.model_predict(test_data, batch_size, False) 272 | test_pred_age += model_pred_age / 5 273 | 274 | 275 | df_oof = train_model_inputs[['user_id', 'age']] 276 | df_oof['predicted_age'] = np.argmax(oof_pred_age, axis=1) 277 | acc_age = accuracy_score(df_oof['age'], df_oof['predicted_age']) 278 | print(acc_age) 279 | 280 | 281 | np.save('../../probs/sub_age_m9_torch', test_pred_age) 282 | np.save('../../probs/oof_age_m9_torch', oof_pred_age) 283 | 284 | 285 | 286 | 287 | -------------------------------------------------------------------------------- /src/torch/f6_AGE_m2_transformer_6inputs_train_5fold.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | ################################################################################# 5 | # AGE model 2: Torch Transformer+LSTM 6 inputs 6 | # score: 7 | # 五折: 0.50299 (线下) 8 | # 五折: 0.51589 (线上) 9 | # 训练时长: ~ 5 days 10 | ################################################################################# 11 | 12 | import pandas as pd 13 | import warnings 14 | import gc 15 | from tqdm import tqdm 16 | import numpy as np 17 | from collections import defaultdict, OrderedDict 18 | import math 19 | import pickle 20 | import random 21 | import torch 22 | import torch.nn as nn 23 | from sklearn import preprocessing 24 | from pytorchtools import EarlyStopping 25 | import os 26 | from sklearn.model_selection import KFold 27 | from lookahead import * 28 | from sklearn.metrics import accuracy_score 29 | from m2_transformer_6inputs_age import * 30 | 31 | pd.set_option('display.max_columns', None) 32 | pd.set_option('display.max_rows', None) 33 | 34 | warnings.filterwarnings('ignore') 35 | 36 | torch.cuda.set_device(0) 37 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 38 | 39 | def fix_seed(seed): 40 | random.seed(seed) 41 | np.random.seed(seed) 42 | torch.manual_seed(seed) 43 | torch.cuda.manual_seed_all(seed) 44 | torch.backends.cudnn.deterministic = True 45 | 46 | seed = 2020 47 | fix_seed(seed) 48 | 49 | df_log = pd.read_pickle('../../data/torch/log.pkl') 50 | 51 | 52 | seq_embedding_features = OrderedDict({ 53 | 'creative_id': { 54 | 'embedding_file': '../../w2v_models/w2v_creative_id_128.pkl', 55 | 'embedding_dim': 128, 56 | 'pretrained_embedding': None, 57 | }, 58 | 'industry': { 59 | 'embedding_file': '../../w2v_models/w2v_industry_128.pkl', 60 | 'embedding_dim': 128, 61 | 'pretrained_embedding': None, 62 | }, 63 | 'ad_id': { 64 | 'embedding_file': '../../w2v_models/w2v_ad_id_128.pkl', 65 | 'embedding_dim': 128, 66 | 'pretrained_embedding': None, 67 | }, 68 | 'advertiser_id': { 69 | 'embedding_file': '../../w2v_models/w2v_advertiser_id_128.pkl', 70 | 'embedding_dim': 128, 71 | 'pretrained_embedding': None, 72 | }, 73 | 'product_category': { 74 | 'embedding_file': 
'../../w2v_models/w2v_product_category_128.pkl', 75 | 'embedding_dim': 128, 76 | 'pretrained_embedding': None, 77 | }, 78 | 'product_id': { 79 | 'embedding_file': '../../w2v_models/w2v_product_id_128.pkl', 80 | 'embedding_dim': 128, 81 | 'pretrained_embedding': None, 82 | }, 83 | }) 84 | 85 | 86 | for f in tqdm(seq_embedding_features.keys()): 87 | le = preprocessing.LabelEncoder() 88 | le.fit(df_log[f].values.tolist()) 89 | 90 | df_emb = pd.read_pickle(seq_embedding_features[f]['embedding_file']) 91 | df_emb = df_emb[df_emb[f].isin(df_log[f].values.tolist())] 92 | assert df_emb.shape[1] == seq_embedding_features[f]['embedding_dim'] + 1 93 | df_emb[f] = le.transform(df_emb[f].values.tolist()) + 1 94 | 95 | # 补上作为序列填补的 0 向量 96 | df_default = pd.DataFrame() 97 | df_default[f] = [0] 98 | df_emb = df_emb.append(df_default) 99 | df_emb.fillna(0, inplace=True) 100 | 101 | # 按 id 排序 102 | df_emb.sort_values([f], inplace=True) 103 | embedding_columns = [c for c in df_emb.columns if c != f] 104 | seq_embedding_features[f]['pretrained_embedding'] = [ 105 | v for v in df_emb[embedding_columns].values 106 | ] 107 | 108 | del df_default, df_emb 109 | gc.collect() 110 | 111 | df_log[f] = le.transform(df_log[f].values.tolist()) + 1 112 | seq_embedding_features[f]['nunique'] = df_log[f].nunique() + 1 113 | 114 | 115 | # # 序列特征 116 | # ## 序列 id 特征 117 | 118 | seq_len = 128 119 | 120 | def gen_seq_data(data, features, seq_len, prefix=''): 121 | data.sort_values('time', inplace=True) 122 | data_set = OrderedDict() 123 | 124 | user_ids = [] 125 | for user_id, hist in tqdm(data.groupby('user_id')): 126 | user_ids.append(user_id) 127 | 128 | # 取最近的记录 129 | for f in features: 130 | hist_f = hist[f].values 131 | hist_f = hist_f[-seq_len:] 132 | 133 | if f not in data_set: 134 | data_set[f] = [] 135 | 136 | data_set[f].append(hist_f) 137 | 138 | for f in features: 139 | df_context = pd.DataFrame() 140 | df_context['user_id'] = user_ids 141 | df_context['{}_seq'.format(f)] = data_set[f] 142 | 143 | df_context.to_pickle('../../data/torch/{}{}_seqs_{}.pkl'.format(prefix, f, seq_len)) 144 | 145 | 146 | # 是否从本地加载seq数据 147 | load_seq = True 148 | prefix = str(df_log.shape[0]) + '_' if df_log.shape[0] < 10000 else '' 149 | # 不加载本地seq,强制重新生成所有seq 150 | seq_features = [] 151 | if not load_seq: 152 | seq_features = list(seq_embedding_features.keys()) 153 | else: 154 | for f in seq_embedding_features.keys(): 155 | if not os.path.exists('../../data/torch/{}{}_seqs_{}.pkl'.format(prefix, f, seq_len)): 156 | seq_features += [f] 157 | print(seq_features) 158 | 159 | if len(seq_features) != 0: 160 | gen_seq_data(df_log, seq_features, seq_len, prefix) 161 | 162 | # 合并序列 163 | all_users = list(df_log['user_id'].unique()) 164 | all_users.sort() 165 | df_context = pd.DataFrame(all_users) 166 | df_context.columns = ['user_id'] 167 | for f in seq_embedding_features.keys(): 168 | df_seq = pd.read_pickle('../../data/torch/{}{}_seqs_{}.pkl'.format(prefix, f, seq_len)) 169 | df_context = df_context.merge(df_seq, how='left') 170 | del df_seq 171 | gc.collect() 172 | 173 | 174 | # ## 序列统计特征 175 | 176 | seq_statistics_features = [] 177 | df_statistics_context = None 178 | 179 | 180 | 181 | df_feature = pd.read_pickle('../../data/torch/feature.pkl') 182 | df_feature['age'] = df_feature['age'].astype('float') 183 | df_feature['age'] = df_feature['age'] - 1 184 | del df_feature['gender'] 185 | 186 | user_ids = list(set(df_log['user_id'].values)) 187 | df_feature = df_feature[df_feature['user_id'].isin(user_ids)] 188 | 
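# NOTE: filtering to logged users and then sorting by user_id gives df_feature a
# deterministic row order, so the seed-fixed KFold below assigns the same users to the
# same folds on every run (and, with the same seed, across the other torch model scripts),
# keeping the saved oof/test probability arrays aligned for blending.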
df_feature.sort_values(['user_id'], inplace=True) 189 | df_feature.reset_index(drop=True, inplace=True) 190 | 191 | df_feature = df_feature.merge(df_context, how='left') 192 | 193 | if df_statistics_context: 194 | df_feature = df_feature.merge(df_statistics_context, how='left') 195 | del df_statistics_context 196 | 197 | del df_context 198 | gc.collect() 199 | 200 | 201 | del df_log 202 | gc.collect() 203 | 204 | 205 | # ## target encoding 特征 206 | 207 | statistics_features = [] 208 | 209 | 210 | # # 模型训练 211 | 212 | train_model_inputs = df_feature[df_feature['age'].notnull()].reset_index( 213 | drop=True) 214 | test_model_inputs = df_feature[df_feature['age'].isnull()].reset_index( 215 | drop=True) 216 | 217 | 218 | kfold = KFold(n_splits=5, shuffle=True, random_state=seed) 219 | for fold_id, (trn_idx, val_idx) in enumerate(kfold.split(train_model_inputs)): 220 | print('\nFold_{} Training ============================================\n'. 221 | format(fold_id + 1)) 222 | 223 | train_data = train_model_inputs.iloc[trn_idx] 224 | val_data = train_model_inputs.iloc[val_idx] 225 | 226 | # 模型定义 227 | model = LSTMCLF(seq_embedding_features=seq_embedding_features, 228 | statistics_features=statistics_features, 229 | seq_statistics_features=seq_statistics_features, 230 | seq_len=seq_len, 231 | device=device).to(device) 232 | 233 | criterion = nn.CrossEntropyLoss().to(device) 234 | optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, 235 | model.parameters()), 236 | lr=5e-4) 237 | optimizer = Lookahead(optimizer, k=5, alpha=0.5) 238 | scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 239 | mode='max', 240 | factor=0.1, 241 | patience=2, 242 | min_lr=1e-6, 243 | verbose=True) 244 | early_stopping = EarlyStopping( 245 | file_name='../../models/age_m2_checkpoint{}.pt'.format(fold_id), 246 | patience=10, 247 | verbose=True, 248 | delta=0.00000001) 249 | 250 | model.set(criterion, optimizer, scheduler, early_stopping) 251 | 252 | batch_size = 512 253 | # 6000 254 | epoches = 600 255 | best_age_acc = model.model_train(train_data, val_data, epoches, batch_size) 256 | print('age_acc: {}'.format(best_age_acc)) 257 | 258 | test_data = test_model_inputs 259 | 260 | oof_pred_age = np.zeros((train_model_inputs.shape[0], 10)) 261 | test_pred_age = np.zeros((test_data.shape[0], 10)) 262 | 263 | kfold = KFold(n_splits=5, shuffle=True, random_state=seed) 264 | for fold_id, (trn_idx, val_idx) in enumerate(kfold.split(train_model_inputs)): 265 | print('\nFold_{} Training ============================================\n'. 
266 | format(fold_id + 1)) 267 | 268 | model = LSTMCLF(seq_embedding_features=seq_embedding_features, 269 | statistics_features=statistics_features, 270 | seq_statistics_features=seq_statistics_features, 271 | seq_len=seq_len, 272 | device=device).to(device) 273 | model.load_state_dict(torch.load('../../models/age_m2_checkpoint{}.pt'.format(fold_id)), strict=False) 274 | 275 | model.eval() 276 | 277 | with torch.no_grad(): 278 | val_data = train_model_inputs.iloc[val_idx] 279 | 280 | # 对训练集预测 281 | model_pred_age, _, _ = model.model_predict(val_data, batch_size, False) 282 | oof_pred_age[val_idx] += model_pred_age 283 | 284 | # 对测试集预测 285 | model_pred_age, _, _ = model.model_predict(test_data, batch_size, False) 286 | test_pred_age += model_pred_age / 5 287 | 288 | 289 | 290 | df_oof = train_model_inputs[['user_id', 'age']] 291 | df_oof['predicted_age'] = np.argmax(oof_pred_age, axis=1) 292 | acc_age = accuracy_score(df_oof['age'], df_oof['predicted_age']) 293 | print(acc_age) 294 | 295 | 296 | np.save('../../probs/sub_age_m2_torch', test_pred_age) 297 | np.save('../../probs/oof_age_m2_torch', oof_pred_age) 298 | 299 | 300 | 301 | -------------------------------------------------------------------------------- /src/torch/f7_save_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | 5 | ########################################################################## 6 | # 生成 transformer_v2 所需的数据文件 7 | ########################################################################## 8 | 9 | 10 | import pandas as pd 11 | import warnings 12 | from sklearn.model_selection import GroupKFold 13 | import gc 14 | from tqdm import tqdm 15 | import numpy as np 16 | from collections import defaultdict 17 | import math 18 | import pickle 19 | import os 20 | 21 | pd.set_option('display.max_columns', None) 22 | pd.set_option('display.max_rows', None) 23 | 24 | warnings.filterwarnings('ignore') 25 | 26 | 27 | seed = 2020 28 | 29 | # 读取数据集 30 | df_train_ad = pd.read_csv('../../raw_data/train_preliminary/ad.csv') 31 | df_train_log = pd.read_csv('../../raw_data/train_preliminary/click_log.csv') 32 | df_train_user = pd.read_csv('../../raw_data/train_preliminary/user.csv') 33 | 34 | df_test_ad = pd.read_csv('../../raw_data/test/ad.csv') 35 | df_test_log = pd.read_csv('../../raw_data/test/click_log.csv') 36 | 37 | df_train_semi_final_ad = pd.read_csv('../../raw_data/train_semi_final/ad.csv') 38 | df_train_semi_final_log = pd.read_csv('../../raw_data/train_semi_final/click_log.csv') 39 | df_train_semi_final_user = pd.read_csv('../../raw_data/train_semi_final/user.csv') 40 | 41 | df_train_user = df_train_user.append(df_train_semi_final_user) 42 | df_train_log = df_train_log.append(df_train_semi_final_log) 43 | df_train_ad = df_train_ad.append(df_train_semi_final_ad) 44 | df_train_ad.drop_duplicates(inplace=True) 45 | 46 | 47 | # 提取所有用户 48 | df_test_user = df_test_log[['user_id']] 49 | df_test_user.drop_duplicates(inplace=True) 50 | df_feature = pd.concat([df_train_user, df_test_user], sort=False) 51 | 52 | 53 | # 日志数据 54 | df_ad = pd.concat([df_train_ad, df_test_ad], sort=False) 55 | df_ad.drop_duplicates(inplace=True) 56 | 57 | df_log = pd.concat([df_train_log, df_test_log], sort=False) 58 | df_log.sort_values(['user_id', 'time'], inplace=True) 59 | 60 | df_log = df_log.merge(df_ad, how='left', on='creative_id') 61 | 62 | 63 | # Function to reduce the memory usage 64 | def reduce_mem_usage(df, verbose=True): 65 | numerics = ['int16', 
'int32', 'int64', 'float16', 'float32', 'float64'] 66 | start_mem = df.memory_usage().sum() / 1024**2 67 | for col in tqdm(df.columns): 68 | col_type = df[col].dtypes 69 | if col_type in numerics: 70 | c_min = df[col].min() 71 | c_max = df[col].max() 72 | if str(col_type)[:3] == 'int': 73 | if c_min > np.iinfo(np.int8).min and c_max < np.iinfo( 74 | np.int8).max: 75 | df[col] = df[col].astype(np.int8) 76 | elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo( 77 | np.int16).max: 78 | df[col] = df[col].astype(np.int16) 79 | elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo( 80 | np.int32).max: 81 | df[col] = df[col].astype(np.int32) 82 | elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo( 83 | np.int64).max: 84 | df[col] = df[col].astype(np.int64) 85 | else: 86 | if c_min > np.finfo(np.float16).min and c_max < np.finfo( 87 | np.float16).max: 88 | df[col] = df[col].astype(np.float16) 89 | elif c_min > np.finfo(np.float32).min and c_max < np.finfo( 90 | np.float32).max: 91 | df[col] = df[col].astype(np.float32) 92 | else: 93 | df[col] = df[col].astype(np.float64) 94 | end_mem = df.memory_usage().sum() / 1024**2 95 | if verbose: 96 | print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format( 97 | end_mem, 100 * (start_mem - end_mem) / start_mem)) 98 | return df 99 | 100 | 101 | df_log = reduce_mem_usage(df_log) 102 | 103 | # 把 id 拼接成字符串 104 | for col in tqdm(['creative_id', 'ad_id', 'product_id', 'product_category', 'advertiser_id', 'industry', 'click_times', 'time']): 105 | df_log[col] = df_log[col].astype(str) 106 | 107 | tmp = df_log.sort_values(['user_id', 'time']).groupby('user_id')[col].agg(list).reset_index() 108 | tmp[col] = tmp[col].map(lambda x: ' '.join(x)) 109 | df_feature = df_feature.merge(tmp, how='left') 110 | del tmp 111 | gc.collect() 112 | 113 | 114 | 115 | df_feature.to_pickle('../../data/torch/data.pkl') 116 | 117 | 118 | -------------------------------------------------------------------------------- /src/torch/f8_AGE_GENDER_m13_transformer_4inputs_train_5fold.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | 5 | ################################################################################# 6 | # AGE model 13: Torch huggingface transformer 4 inputs 7 | # score: 8 | # 五折: 0.50699 (线下) 9 | # 五折: 0.51866 (线上) 10 | # GENDER model 3: 11 | # score: 12 | # 五折: 0.94678 (线下) 13 | 14 | # 训练时长: ~ 5 days 15 | ################################################################################# 16 | 17 | import pandas as pd 18 | from keras.preprocessing import text, sequence 19 | import torch 20 | import random 21 | import numpy as np 22 | import os 23 | from gensim.models import Word2Vec 24 | import warnings 25 | from pytorchtools import EarlyStopping 26 | from tqdm import tqdm 27 | import torch_optimizer as optim 28 | from sklearn.metrics import accuracy_score 29 | from sklearn.model_selection import KFold 30 | import gc 31 | from m13_transformer_4inputs import * 32 | 33 | pd.set_option('display.max_columns', None) 34 | pd.set_option('display.max_rows', None) 35 | 36 | warnings.filterwarnings('ignore') 37 | 38 | torch.cuda.set_device(0) 39 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 40 | 41 | def fix_seed(seed): 42 | random.seed(seed) 43 | np.random.seed(seed) 44 | torch.manual_seed(seed) 45 | torch.cuda.manual_seed_all(seed) 46 | torch.backends.cudnn.deterministic = True 47 | 48 | 49 | seed = 2020 50 | fix_seed(seed) 51 | 52 | 53 | df_data 
= pd.read_pickle('../../data/torch/data.pkl') 54 | df_data['age'] = df_data['age'] - 1 55 | df_data['gender'] = df_data['gender'] - 1 56 | 57 | 58 | test = False 59 | df_data = df_data.reset_index(drop=True) 60 | 61 | 62 | seq_len = 128 63 | 64 | def sequence_processing(df_data, col, embedding_dim): 65 | print('Generate {} seqs'.format(col)) 66 | os.makedirs('../../data/torch/seqs', exist_ok=True) 67 | seq_path = '../../data/torch/seqs/seqs_{}_{}.npy'.format(col, seq_len) 68 | word_index_path = '../../data/torch/seqs/word_index_{}_{}.npy'.format(col, seq_len) 69 | if test or not os.path.exists(seq_path) or not os.path.exists(word_index_path): 70 | tokenizer = text.Tokenizer(lower=False) 71 | tokenizer.fit_on_texts(df_data[col].values.tolist()) 72 | seqs = sequence.pad_sequences(tokenizer.texts_to_sequences(df_data[col].values.tolist()), maxlen=seq_len, 73 | padding='post', truncating='pre') 74 | word_index = tokenizer.word_index 75 | 76 | if not test: 77 | np.save(seq_path, seqs) 78 | np.save(word_index_path, word_index) 79 | else: 80 | seqs = np.load(seq_path) 81 | word_index = np.load(word_index_path, allow_pickle=True).item() 82 | 83 | print('Generate {} embedding'.format(col)) 84 | os.makedirs('../../data/torch/embedding', exist_ok=True) 85 | embedding_path = '../../data/torch/embedding/w2v_{}_{}.m'.format(col, embedding_dim) 86 | if test or not os.path.exists(embedding_path): 87 | print('Training {} w2v'.format(col)) 88 | 89 | sentences = [] 90 | for sentence in df_data[col].values: 91 | sentence = sentence.split(' ') 92 | sentences.append(sentence) 93 | 94 | model = Word2Vec(sentences, size=embedding_dim, window=20, workers=32, seed=seed, min_count=1, sg=1, hs=1) 95 | if not test: 96 | model.save(embedding_path) 97 | else: 98 | model = Word2Vec.load(embedding_path) 99 | 100 | embedding = np.zeros((len(word_index)+1, embedding_dim)) 101 | for word, i in tqdm(word_index.items()): 102 | if word in model: 103 | embedding[i] = model[word] 104 | 105 | return seqs, embedding 106 | 107 | 108 | 109 | creative_id_seqs, creative_id_embedding = sequence_processing(df_data, 'creative_id', 128) 110 | ad_id_seqs, ad_id_embedding = sequence_processing(df_data, 'ad_id', 128) 111 | advertiser_id_seqs, advertiser_id_embedding = sequence_processing(df_data, 'advertiser_id', 128) 112 | product_id_seqs, product_id_embedding = sequence_processing(df_data, 'product_id', 128) 113 | 114 | 115 | all_index = df_data[df_data['age'].notnull()].index.tolist() 116 | test_index = df_data[df_data['age'].isnull()].index.tolist() 117 | 118 | 119 | target = df_data[['user_id', 'age', 'gender']].copy(deep=True) 120 | del df_data 121 | gc.collect() 122 | 123 | 124 | kfold = KFold(n_splits=5, shuffle=True, random_state=seed) 125 | for fold_id, (train_index, val_index) in enumerate(kfold.split(all_index)): 126 | model = Model(embeddings=[creative_id_embedding, ad_id_embedding, advertiser_id_embedding, product_id_embedding], 127 | device=device).to(device).to(device) 128 | criterion_age = nn.CrossEntropyLoss().to(device) 129 | criterion_gender = nn.CrossEntropyLoss().to(device) 130 | 131 | optimizer = optim.Ranger(filter(lambda p: p.requires_grad, model.parameters()), lr=5e-4) 132 | scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 133 | mode='max', 134 | factor=0.1, 135 | patience=1, 136 | min_lr=1e-12, 137 | verbose=True) 138 | early_stopping = EarlyStopping( 139 | file_name='../../models/age_m13_gender_m3_checkpoint{}.pt'.format(fold_id), 140 | patience=5, 141 | verbose=True, 142 | delta=0.00000001) 143 | 
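# NOTE: two callbacks cooperate here, both in 'max' mode: ReduceLROnPlateau tracks
# validation age accuracy and cuts the learning rate 10x after one stagnant epoch
# (floor 1e-12), while EarlyStopping tracks the summed age+gender accuracy and ends
# the fold after 5 epochs without improvement, checkpointing the best weights for
# the prediction loop below.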
144 | model.set(criterion_age,criterion_gender, optimizer, scheduler, early_stopping) 145 | 146 | batch_size = 256 147 | # 6000 148 | epoches = 10000 149 | 150 | train_creative_id_seqs = creative_id_seqs[train_index] 151 | train_ad_id_seqs = ad_id_seqs[train_index] 152 | train_advertiser_id_seqs = advertiser_id_seqs[train_index] 153 | train_product_id_seqs = product_id_seqs[train_index] 154 | 155 | train_age = target.loc[train_index]['age'].values 156 | train_gender = target.loc[train_index]['gender'].values 157 | 158 | 159 | best_acc, best_age_acc, best_gender_acc = model.model_train(train_input=[train_creative_id_seqs, 160 | train_ad_id_seqs, 161 | train_advertiser_id_seqs, 162 | train_product_id_seqs], 163 | val_input=[creative_id_seqs[val_index], 164 | ad_id_seqs[val_index], 165 | advertiser_id_seqs[val_index], 166 | product_id_seqs[val_index]], 167 | train_output=[train_age, train_gender], 168 | val_output=[target.loc[val_index]['age'].values, target.loc[val_index]['gender'].values], 169 | epoches=epoches, 170 | batch_size=batch_size) 171 | 172 | 173 | batch_size = 256 174 | 175 | oof_pred_age = np.zeros((len(all_index), 10)) 176 | test_pred_age = np.zeros((len(test_index), 10)) 177 | 178 | oof_pred_gender = np.zeros((len(all_index), 2)) 179 | test_pred_gender = np.zeros((len(test_index), 2)) 180 | 181 | test_creative_id_seqs = creative_id_seqs[test_index] 182 | test_ad_id_seqs = ad_id_seqs[test_index] 183 | test_advertiser_id_seqs = advertiser_id_seqs[test_index] 184 | test_product_id_seqs = product_id_seqs[test_index] 185 | 186 | kfold = KFold(n_splits=5, shuffle=True, random_state=seed) 187 | for fold_id, (train_index, val_index) in enumerate(kfold.split(all_index)): 188 | print('\nFold_{} Training ============================================\n'. 
189 | format(fold_id + 1)) 190 | 191 | model = Model(embeddings=[creative_id_embedding, ad_id_embedding, advertiser_id_embedding, product_id_embedding], 192 | device=device).to(device).to(device) 193 | model.load_state_dict(torch.load('../../models/age_m13_gender_m3_checkpoint{}.pt'.format(fold_id)), strict=False) 194 | model.eval() 195 | 196 | # 对训练集预测 197 | val_creative_id_seqs = creative_id_seqs[val_index] 198 | val_ad_id_seqs = ad_id_seqs[val_index] 199 | val_advertiser_id_seqs = advertiser_id_seqs[val_index] 200 | val_product_id_seqs = product_id_seqs[val_index] 201 | 202 | pred_age, pred_gender = model.model_predict([val_creative_id_seqs, val_ad_id_seqs, 203 | val_advertiser_id_seqs, val_product_id_seqs], 204 | batch_size) 205 | 206 | oof_pred_age[val_index] = pred_age 207 | oof_pred_gender[val_index] = pred_gender 208 | 209 | # 对测试集预测 210 | pred_age, pred_gender = model.model_predict([test_creative_id_seqs, test_ad_id_seqs, 211 | test_advertiser_id_seqs, test_product_id_seqs], 212 | batch_size) 213 | 214 | test_pred_age += pred_age / 5 215 | test_pred_gender += pred_gender / 5 216 | 217 | 218 | df_oof = target.loc[all_index][['user_id', 'age', 'gender']] 219 | df_oof['predicted_age'] = np.argmax(oof_pred_age, axis=1) 220 | df_oof['predicted_gender'] = np.argmax(oof_pred_gender, axis=1) 221 | 222 | acc_age = accuracy_score(df_oof['age'], df_oof['predicted_age']) 223 | acc_gender = accuracy_score(df_oof['gender'], df_oof['predicted_gender']) 224 | acc = acc_age + acc_gender 225 | print(acc, acc_age, acc_gender) 226 | 227 | 228 | np.save('../../probs/sub_age_m13_torch', test_pred_age) 229 | np.save('../../probs/oof_age_m13_torch', oof_pred_age) 230 | 231 | 232 | np.save('../../probs/sub_gender_m3_torch', test_pred_gender) 233 | np.save('../../probs/oof_gender_m3_torch', oof_pred_gender) 234 | -------------------------------------------------------------------------------- /src/torch/lookahead.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from itertools import chain 3 | from torch.optim import Optimizer 4 | import torch 5 | import warnings 6 | 7 | class Lookahead(Optimizer): 8 | def __init__(self, optimizer, k=5, alpha=0.5): 9 | self.optimizer = optimizer 10 | self.k = k 11 | self.alpha = alpha 12 | self.param_groups = self.optimizer.param_groups 13 | self.state = defaultdict(dict) 14 | self.fast_state = self.optimizer.state 15 | for group in self.param_groups: 16 | group["counter"] = 0 17 | 18 | def update(self, group): 19 | for fast in group["params"]: 20 | param_state = self.state[fast] 21 | if "slow_param" not in param_state: 22 | param_state["slow_param"] = torch.zeros_like(fast.data) 23 | param_state["slow_param"].copy_(fast.data) 24 | slow = param_state["slow_param"] 25 | slow += (fast.data - slow) * self.alpha 26 | fast.data.copy_(slow) 27 | 28 | def update_lookahead(self): 29 | for group in self.param_groups: 30 | self.update(group) 31 | 32 | def step(self, closure=None): 33 | loss = self.optimizer.step(closure) 34 | for group in self.param_groups: 35 | if group["counter"] == 0: 36 | self.update(group) 37 | group["counter"] += 1 38 | if group["counter"] >= self.k: 39 | group["counter"] = 0 40 | return loss 41 | 42 | def state_dict(self): 43 | fast_state_dict = self.optimizer.state_dict() 44 | slow_state = { 45 | (id(k) if isinstance(k, torch.Tensor) else k): v 46 | for k, v in self.state.items() 47 | } 48 | fast_state = fast_state_dict["state"] 49 | param_groups = fast_state_dict["param_groups"] 
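# NOTE: both halves of the optimizer state are persisted: 'fast_state' is the wrapped
# optimizer's own state and 'slow_state' holds the slow lookahead parameter copies
# (keyed by tensor id), so a restored Lookahead resumes with its moving average intact.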
50 |         return {
51 |             "fast_state": fast_state,
52 |             "slow_state": slow_state,
53 |             "param_groups": param_groups,
54 |         }
55 | 
56 |     def load_state_dict(self, state_dict):
57 |         slow_state_dict = {
58 |             "state": state_dict["slow_state"],
59 |             "param_groups": state_dict["param_groups"],
60 |         }
61 |         fast_state_dict = {
62 |             "state": state_dict["fast_state"],
63 |             "param_groups": state_dict["param_groups"],
64 |         }
65 |         super(Lookahead, self).load_state_dict(slow_state_dict)
66 |         self.optimizer.load_state_dict(fast_state_dict)
67 |         self.fast_state = self.optimizer.state
68 | 
69 |     def add_param_group(self, param_group):
70 |         param_group["counter"] = 0
71 |         self.optimizer.add_param_group(param_group)
-------------------------------------------------------------------------------- /src/torch/m13_transformer_4inputs.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 | import random
4 | import numpy as np
5 | from transformers.modeling_bert import BertEmbeddings, BertEncoder
6 | from transformers.configuration_bert import BertConfig
7 | 
8 | import torch
9 | import torch.nn as nn
10 | from torch.nn import functional as F
11 | import math
12 | from tqdm import tqdm
13 | from sklearn.metrics import accuracy_score
14 | from pytorchtools import EarlyStopping
15 | def fix_seed(seed):
16 |     random.seed(seed)
17 |     np.random.seed(seed)
18 |     torch.manual_seed(seed)
19 |     torch.cuda.manual_seed_all(seed)
20 |     torch.backends.cudnn.deterministic = True
21 | 
22 | fix_seed(2020)
23 | 
24 | 
25 | class Mish(nn.Module):
26 |     def __init__(self):
27 |         super().__init__()
28 | 
29 |     def forward(self, x):
30 |         x = x * (torch.tanh(F.softplus(x)))
31 |         return x
32 | 
33 | 
34 | class Model(nn.Module):
35 |     def __init__(self, embeddings, device):
36 |         super(Model, self).__init__()
37 |         self.device = device
38 | 
39 |         cid_emb_size = embeddings[0].shape[1]
40 |         creative_id_embedding = nn.Embedding(embeddings[0].shape[0], cid_emb_size)
41 |         creative_id_embedding.weight.data.copy_(torch.from_numpy(embeddings[0]))
42 |         creative_id_embedding.weight.requires_grad = False
43 |         self.creative_id_embedding = creative_id_embedding
44 | 
45 |         aid_emb_size = embeddings[1].shape[1]
46 |         ad_id_embedding = nn.Embedding(embeddings[1].shape[0], aid_emb_size)
47 |         ad_id_embedding.weight.data.copy_(torch.from_numpy(embeddings[1]))
48 |         ad_id_embedding.weight.requires_grad = False
49 |         self.ad_id_embedding = ad_id_embedding
50 | 
51 |         adv_emb_size = embeddings[2].shape[1]
52 |         advertiser_id_embedding = nn.Embedding(embeddings[2].shape[0], adv_emb_size)
53 |         advertiser_id_embedding.weight.data.copy_(torch.from_numpy(embeddings[2]))
54 |         advertiser_id_embedding.weight.requires_grad = False
55 |         self.advertiser_id_embedding = advertiser_id_embedding
56 | 
57 |         pid_emb_size = embeddings[3].shape[1]
58 |         product_id_embedding = nn.Embedding(embeddings[3].shape[0], pid_emb_size)
59 |         product_id_embedding.weight.data.copy_(torch.from_numpy(embeddings[3]))
60 |         product_id_embedding.weight.requires_grad = False
61 |         self.product_id_embedding = product_id_embedding
62 | 
63 |         hidden_size = cid_emb_size + aid_emb_size + adv_emb_size + pid_emb_size
64 | 
65 |         # transformer
66 |         config = BertConfig(num_hidden_layers=3,
67 |                             num_attention_heads=8,
68 |                             hidden_size=hidden_size,
69 |                             layer_norm_eps=1e-12,
70 |                             hidden_dropout_prob=0.2,
71 |                             attention_probs_dropout_prob=0.2,
72 |                             hidden_act='mish')
73 |         self.config = config
74 |         self.bert_encoder = BertEncoder(config)
75 | 
76 |         # DNN layers
77 |         self.linears = nn.Sequential(nn.Linear(config.hidden_size, 1024), Mish(), nn.BatchNorm1d(1024),
78 |                                      nn.Linear(1024, 256), Mish(), nn.BatchNorm1d(256),
79 |                                      nn.Linear(256, 64), Mish(), nn.BatchNorm1d(64),
80 |                                      nn.Linear(64, 16), Mish(), nn.BatchNorm1d(16),
81 |                                      nn.Dropout(0.1))
82 | 
83 |         # output layers
84 |         self.age_output = nn.Linear(16, 10)
85 |         self.gender_output = nn.Linear(16, 2)
86 | 
87 |     def forward(self, seqs, seq_lengths):
88 |         # embedding
89 |         cid_emb = self.creative_id_embedding(seqs[0])
90 |         aid_emb = self.ad_id_embedding(seqs[1])
91 |         advid_emb = self.advertiser_id_embedding(seqs[2])
92 |         pid_emb = self.product_id_embedding(seqs[3])
93 |         conc_emb = torch.cat([cid_emb, aid_emb, advid_emb, pid_emb], 2)
94 | 
95 |         # transformer
96 |         head_mask = [None] * self.config.num_hidden_layers
97 |         bert_ouput = self.bert_encoder(hidden_states=conc_emb, head_mask=head_mask)
98 |         bert_ouput = bert_ouput[0]
99 |         # mask padding
100 |         mask = torch.zeros(bert_ouput.shape).to(self.device)
101 |         for idx, seqlen in enumerate(seq_lengths):
102 |             mask[idx, :seqlen] = 1
103 |         bert_ouput = bert_ouput * mask
104 |         bert_max, _ = torch.max(bert_ouput, dim=1)
105 | 
106 |         # DNN
107 |         dnn_output = self.linears(bert_max)
108 |         age_output = self.age_output(dnn_output)
109 |         gender_output = self.gender_output(dnn_output)
110 | 
111 |         return age_output, gender_output
112 | 
113 |     def set(self, criterion_age, criterion_gender, optimizer, scheduler, early_stopping):
114 |         self.criterion_age = criterion_age
115 |         self.criterion_gender = criterion_gender
116 | 
117 |         self.optimizer = optimizer
118 |         self.scheduler = scheduler
119 |         if early_stopping is None:
120 |             self.early_stopping = EarlyStopping(
121 |                 file_name='model/checkpoint.pt', patience=10, verbose=True)
122 |         else:
123 |             self.early_stopping = early_stopping
124 | 
125 |     def model_train(self, train_input, val_input, train_output, val_output, epoches, batch_size):
126 |         data_size = train_input[0].shape[0]
127 |         n_batches = math.ceil(data_size / batch_size)
128 | 
129 |         # true (unpadded) sequence lengths
130 |         tmp = train_input[0].copy()  # copy so the binarisation below does not overwrite the real id sequences
131 |         tmp[tmp < 0] = 0
132 |         tmp[tmp > 0] = 1
133 |         seq_lengths = tmp.sum(axis=1)
134 | 
135 |         best_age_acc = 0
136 |         best_gender_acc = 0
137 |         best_acc = 0
138 |         for epoch in range(epoches):
139 |             self.train()
140 | 
141 |             train_loss_list = []
142 |             for batch in tqdm(range(n_batches),
143 |                               desc='epoch:{}/{}'.format(epoch, epoches)):
144 |                 start = batch * batch_size
145 |                 end = min((batch + 1) * batch_size, data_size)
146 |                 bs = end - start
147 | 
148 |                 batch_creative_id_seqs = train_input[0][start:end]
149 |                 batch_ad_id_seqs = train_input[1][start:end]
150 |                 batch_advertiser_id_seqs = train_input[2][start:end]
151 |                 batch_product_id_seqs = train_input[3][start:end]
152 |                 batch_creative_id_seqs = torch.LongTensor(batch_creative_id_seqs).to(self.device)
153 |                 batch_ad_id_seqs = torch.LongTensor(batch_ad_id_seqs).to(self.device)
154 |                 batch_advertiser_id_seqs = torch.LongTensor(batch_advertiser_id_seqs).to(self.device)
155 |                 batch_product_id_seqs = torch.LongTensor(batch_product_id_seqs).to(self.device)
156 | 
157 |                 y_age = train_output[0][start:end]
158 |                 y_gender = train_output[1][start:end]
159 |                 y_age = torch.LongTensor(y_age).to(self.device)
160 |                 y_gender = torch.LongTensor(y_gender).to(self.device)
161 | 
162 |                 batch_seq_lengths = seq_lengths[start:end]
163 | 
164 |                 pred_age, pred_gender = self([batch_creative_id_seqs, batch_ad_id_seqs, batch_advertiser_id_seqs, batch_product_id_seqs], batch_seq_lengths)
165 |                 loss_age = self.criterion_age(pred_age, y_age)
166 |                 loss_gender = self.criterion_gender(pred_gender, y_gender)
167 |                 loss = loss_age + 0.1 * loss_gender  # age is the primary target; gender is down-weighted so it regularises without dominating
168 |                 train_loss_list.append(loss.item())
169 | 
170 |                 self.optimizer.zero_grad()
171 |                 loss.backward()
172 |                 self.optimizer.step()
173 | 
174 |                 del batch_creative_id_seqs, batch_ad_id_seqs, batch_advertiser_id_seqs, batch_product_id_seqs, y_age, y_gender, loss, loss_age, loss_gender, pred_age, pred_gender
175 |                 torch.cuda.empty_cache()
176 | 
177 |             train_loss = np.mean(train_loss_list)
178 | 
179 |             # evaluate on the validation set
180 |             self.eval()
181 |             with torch.no_grad():
182 |                 val_pred_age, val_pred_gender = self.model_predict(val_input, batch_size)
183 | 
184 |                 y_age = val_output[0]
185 |                 y_gender = val_output[1]
186 |                 y_age = torch.LongTensor(y_age).to(self.device)
187 |                 y_gender = torch.LongTensor(y_gender).to(self.device)
188 | 
189 |                 loss_age = self.criterion_age(torch.from_numpy(val_pred_age).to(self.device), y_age)
190 |                 loss_gender = self.criterion_gender(torch.from_numpy(val_pred_gender).to(self.device), y_gender)
191 |                 val_loss = loss_age + 0.1 * loss_gender
192 |                 val_loss = val_loss.item()
193 | 
194 |             val_age_acc = accuracy_score(val_output[0], np.argmax(val_pred_age, axis=1))
195 |             val_gender_acc = accuracy_score(val_output[1], np.argmax(val_pred_gender, axis=1))
196 | 
197 |             if self.scheduler:
198 |                 self.scheduler.step(val_age_acc)
199 | 
200 |             val_acc = val_age_acc + val_gender_acc
201 |             if val_acc > best_acc:
202 |                 best_acc = val_acc
203 |                 best_age_acc = val_age_acc
204 |                 best_gender_acc = val_gender_acc
205 | 
206 |             print(
207 |                 'epoch {}/{} train_loss: {:.5f}, val_loss: {:.5f}, val_age_acc: {:.5f}, val_gender_acc: {:.5f}'
208 |                 .format(epoch + 1, epoches, train_loss, val_loss, val_age_acc, val_gender_acc))
209 | 
210 |             self.early_stopping(val_acc, self)
211 |             if self.early_stopping.early_stop:
212 |                 print("Early stopping")
213 |                 break
214 | 
215 |         return best_acc, best_age_acc, best_gender_acc
216 | 
217 |     def model_predict(self, input_data, batch_size):
218 |         self.eval()
219 | 
220 |         # true (unpadded) sequence lengths
221 |         tmp = input_data[0].copy()  # copy so the inputs are not binarised in place
222 |         tmp[tmp < 0] = 0
223 |         tmp[tmp > 0] = 1
224 |         seq_lengths = tmp.sum(axis=1)
225 | 
226 |         data_size = input_data[0].shape[0]
227 |         n_batches = math.ceil(data_size / batch_size)
228 | 
229 |         oof_pred_age = np.zeros((data_size, 10))
230 |         oof_pred_gender = np.zeros((data_size, 2))
231 | 
232 |         for batch in range(n_batches):
233 |             start = batch * batch_size
234 |             end = min((batch + 1) * batch_size, data_size)
235 |             bs = end - start
236 | 
237 |             batch_creative_id_seqs = input_data[0][start:end]
238 |             batch_ad_id_seqs = input_data[1][start:end]
239 |             batch_advertiser_id_seqs = input_data[2][start:end]
240 |             batch_product_id_seqs = input_data[3][start:end]
241 |             batch_creative_id_seqs = torch.LongTensor(batch_creative_id_seqs).to(self.device)
242 |             batch_ad_id_seqs = torch.LongTensor(batch_ad_id_seqs).to(self.device)
243 |             batch_advertiser_id_seqs = torch.LongTensor(batch_advertiser_id_seqs).to(self.device)
244 |             batch_product_id_seqs = torch.LongTensor(batch_product_id_seqs).to(self.device)
245 | 
246 |             batch_seq_lengths = seq_lengths[start:end]
247 | 
248 |             pred_age, pred_gender = self([batch_creative_id_seqs, batch_ad_id_seqs, batch_advertiser_id_seqs, batch_product_id_seqs],
249 |                                          batch_seq_lengths)
250 | 
251 |             oof_pred_age[start:end] = pred_age.cpu().data.numpy()
252 |             oof_pred_gender[start:end] = pred_gender.cpu().data.numpy()
253 | 
254 |             del batch_creative_id_seqs, batch_ad_id_seqs, batch_advertiser_id_seqs, batch_product_id_seqs
255 |             torch.cuda.empty_cache()
256 |         return oof_pred_age, oof_pred_gender
257 | 
258 | 
259 | 
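# A minimal smoke test for the Model above -- a sketch only: the helper name and the
# tiny random embedding tables are hypothetical stand-ins for the real w2v matrices,
# not part of the original pipeline. Defining it is inert; call it manually if needed.
def _smoke_test_model(device=torch.device('cpu')):
    import numpy as np  # local import keeps the sketch self-contained
    vocab, dim, bs, seq_len = 50, 128, 4, 32
    embeddings = [np.random.rand(vocab, dim).astype(np.float32) for _ in range(4)]
    m = Model(embeddings=embeddings, device=device).to(device)
    m.eval()  # BatchNorm layers need eval mode (or a larger batch) for a tiny test
    seqs = [torch.randint(1, vocab, (bs, seq_len)).to(device) for _ in range(4)]
    with torch.no_grad():
        age_logits, gender_logits = m(seqs, [seq_len] * bs)
    # hidden size is 4 * 128 = 512; the two heads emit 10 age logits and 2 gender logits
    assert age_logits.shape == (bs, 10) and gender_logits.shape == (bs, 2)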
-------------------------------------------------------------------------------- /src/torch/m1_lstm_6inputs_age.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 | import math, random
4 | import numpy as np
5 | from torch.nn import functional as F
6 | import torch.nn.utils.rnn as rnn_utils
7 | from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence
8 | import torch, torch.nn as nn
9 | from tqdm import tqdm
10 | def fix_seed(seed):
11 |     random.seed(seed)
12 |     np.random.seed(seed)
13 |     torch.manual_seed(seed)
14 |     torch.cuda.manual_seed_all(seed)
15 |     torch.backends.cudnn.deterministic = True
16 | 
17 | from pytorchtools import EarlyStopping
18 | fix_seed(2020)
19 | 
20 | 
21 | class LSTMCLF(nn.Module):
22 |     def __init__(self, seq_embedding_features, statistics_features,
23 |                  seq_statistics_features, seq_len, device):
24 |         super(LSTMCLF, self).__init__()
25 | 
26 |         self.seq_embedding_features = seq_embedding_features
27 |         self.statistics_features = statistics_features
28 |         self.seq_statistics_features = seq_statistics_features
29 | 
30 |         self.seq_len = seq_len
31 | 
32 |         self.seq_statistics_size = len(seq_statistics_features)
33 |         self.statistics_size = len(statistics_features)
34 | 
35 |         self.device = device
36 | 
37 |         input_size = 0
38 |         self.embeds = nn.ModuleDict()
39 | 
40 |         for f in self.seq_embedding_features:
41 |             embedding_layer = nn.Embedding(
42 |                 self.seq_embedding_features[f]['nunique'],
43 |                 self.seq_embedding_features[f]['embedding_dim'])
44 | 
45 |             pretrained_weight = np.array(
46 |                 self.seq_embedding_features[f]['pretrained_embedding'])
47 |             embedding_layer.weight.data.copy_(
48 |                 torch.from_numpy(pretrained_weight))
49 |             embedding_layer.weight.requires_grad = False
50 |             self.embeds[f] = embedding_layer
51 | 
52 |         # LSTM layer
53 |         for f in self.seq_embedding_features:
54 |             input_size += seq_embedding_features[f]['embedding_dim']
55 |         input_size += self.seq_statistics_size
56 | 
57 | 
58 |         self.lstm = nn.LSTM(input_size,
59 |                             128,
60 |                             batch_first=True,
61 |                             num_layers=2,
62 |                             bidirectional=True)
63 | 
64 |         # attention layer
65 |         attention_input_size = 128 * 2
66 |         self.attention_output_size = attention_input_size
67 |         self.Q_weight = nn.Linear(attention_input_size,
68 |                                   self.attention_output_size)
69 |         self.K_weight = nn.Linear(attention_input_size,
70 |                                   self.attention_output_size)
71 |         self.V_weight = nn.Linear(attention_input_size,
72 |                                   self.attention_output_size)
73 | 
74 |         # DNN layers
75 |         dnn_input_size = self.attention_output_size + attention_input_size + self.statistics_size
76 | 
77 |         self.linears = nn.Sequential(nn.Linear(dnn_input_size, 1024),
78 |                                      nn.LeakyReLU(), nn.BatchNorm1d(1024),
79 |                                      nn.Linear(1024, 256), nn.LeakyReLU(),
80 |                                      nn.BatchNorm1d(256), nn.Linear(256, 64),
81 |                                      nn.LeakyReLU(), nn.BatchNorm1d(64),
82 |                                      nn.Linear(64, 16), nn.LeakyReLU(),
83 |                                      nn.BatchNorm1d(16), nn.Dropout(0.1))
84 | 
85 |         # age output layer
86 |         self.age_output = nn.Linear(16, 10)
87 | 
88 |     def forward(self, seq_id_list, statistics_input, statistics_seq_input_list,
89 |                 seq_lengths):
90 |         batch_size = seq_id_list[0].shape[0]
91 | 
92 |         # embed the sequence id features
93 |         seq_feature_list = []
94 |         for i, seq_id in enumerate(seq_id_list):
95 |             feature_name = list(self.seq_embedding_features.keys())[i]
96 |             embeddings = self.embeds[feature_name](seq_id.to(self.device))
97 |             seq_feature_list.append(embeddings)
98 | 
99 |         seq_input = torch.cat(seq_feature_list, 2)
100 |         seq_input = F.dropout2d(seq_input, 0.1, training=self.training)
101 | 
102 |         # LSTM
103 |         seq_output, _ = self.lstm(seq_input)
104 |         # mask padding
105 |         mask = torch.zeros(seq_output.shape).to(self.device)
106 |         for idx, seqlen in enumerate(seq_lengths):
107 |             mask[idx, :seqlen] = 1
108 | 
109 |         seq_output = seq_output * mask
110 |         lstm_output_max, _ = torch.max(seq_output, dim=1)
111 | 
112 |         # attention
113 |         Q = self.Q_weight(seq_output)
114 |         K = self.K_weight(seq_output)
115 |         V = self.V_weight(seq_output)
116 | 
117 |         tmp = torch.bmm(Q, K.transpose(1, 2))
118 |         tmp = tmp / np.sqrt(self.attention_output_size)
119 |         w = torch.softmax(tmp, 2)
120 |         att_output = torch.bmm(w, V)
121 |         att_output = att_output * mask
122 |         att_max_output, _ = torch.max(att_output, dim=1)
123 | 
124 |         # concatenate the pooled outputs with the statistics features
125 |         cat_output = torch.cat(
126 |             [att_max_output, lstm_output_max, statistics_input], 1)
127 | 
128 |         # DNN
129 |         dnn_output = self.linears(cat_output)
130 |         age_output = self.age_output(dnn_output)
131 | 
132 |         return age_output
133 | 
134 |     def set(self, criterion_age, optimizer, scheduler, early_stopping):
135 |         self.criterion_age = criterion_age
136 |         self.optimizer = optimizer
137 |         self.scheduler = scheduler
138 |         if early_stopping is None:
139 |             self.early_stopping = EarlyStopping(
140 |                 file_name='model/age_checkpoint.pt', patience=10, verbose=True)
141 |         else:
142 |             self.early_stopping = early_stopping
143 | 
144 |         # self.set_embedding()
145 | 
146 |     def set_embedding(self):
147 |         for f in self.seq_embedding_features:
148 |             embedding_layer = nn.Embedding(
149 |                 self.seq_embedding_features[f]['nunique'],
150 |                 self.seq_embedding_features[f]['embedding_dim'])
151 | 
152 |             pretrained_weight = np.array(
153 |                 self.seq_embedding_features[f]['pretrained_embedding'])
154 |             embedding_layer.weight.data.copy_(
155 |                 torch.from_numpy(pretrained_weight))
156 |             embedding_layer.weight.requires_grad = False
157 |             self.embeds[f] = embedding_layer
158 | 
159 |     def gen_data(self, data):
160 |         # sequence id features
161 |         seq_id_list = []
162 |         for f in self.seq_embedding_features.keys():
163 |             vectorized_seqs = data[f + '_seq'].values
164 | 
165 |             seq_lengths = torch.LongTensor(list(map(len, vectorized_seqs)))
166 |             # seq_tensor = torch.zeros((len(vectorized_seqs), seq_lengths.max()))
167 | 
168 |             # for idx, (seq,
169 |             #           seqlen) in enumerate(zip(vectorized_seqs, seq_lengths)):
170 |             #     seq_tensor[idx, :seqlen] = torch.LongTensor(seq)
171 | 
172 |             # seq_tensor = seq_tensor.long()
173 |             # seq_lengths, perm_idx = seq_lengths.sort(0, descending=True)
174 |             seq = [torch.from_numpy(v) for v in vectorized_seqs]
175 |             seq_tensor = pad_sequence(seq, batch_first=True, padding_value=0)
176 |             seq_tensor = seq_tensor.long()
177 |             seq_id_list.append(seq_tensor)
178 | 
179 |         # statistics features
180 |         statistics_input = data[self.statistics_features].values
181 |         statistics_input = torch.Tensor(statistics_input).to(self.device)
182 | 
183 |         # sequence statistics features
184 |         seq_statistics_list = []
185 |         for f in self.seq_statistics_features:
186 |             seq_statistics_input = data[f].values
187 |             seq_statistics_input = torch.Tensor(seq_statistics_input).to(
188 |                 self.device)
189 |             seq_statistics_list.append(seq_statistics_input)
190 | 
191 |         y_age = data['age'].values
192 |         y_age = torch.LongTensor(y_age).to(self.device)
193 | 
194 |         return seq_id_list, statistics_input, seq_statistics_list, y_age, seq_lengths
195 | 
196 |     def model_train(self, train_data, val_data, epoches, batch_size):
197 |         data_size = train_data.shape[0]
198 |         n_batches = math.ceil(data_size / batch_size)
199 | 
200 |         best_age_acc = 0
201 | 
202 |         for epoch in range(epoches):
203 |             self.train()
204 | 
205 |             train_loss_list = []
206 |             for batch in tqdm(range(n_batches),
207 |                               desc='epoch:{}/{}'.format(epoch, epoches)):
208 |                 start = batch * batch_size
209 |                 end = min((batch + 1) * batch_size, data_size)
210 |                 bs = end - start
211 | 
212 |                 seq_id_list, statistics_input, seq_statistics_list, y_age, seq_lengths = self.gen_data(
213 |                     train_data.iloc[start:end])
214 | 
215 |                 pred_age = self(seq_id_list, statistics_input,
216 |                                 seq_statistics_list, seq_lengths)
217 | 
218 |                 loss = self.criterion_age(pred_age, y_age)
219 | 
220 |                 train_loss_list.append(loss.item())
221 | 
222 |                 self.optimizer.zero_grad()
223 |                 loss.backward()
224 |                 self.optimizer.step()
225 | 
226 |                 del seq_id_list, statistics_input, seq_statistics_list, y_age, loss
227 |                 torch.cuda.empty_cache()
228 | 
229 |             train_loss = np.mean(train_loss_list)
230 |             _, val_loss, val_age_acc = self.model_predict(val_data,
231 |                                                           batch_size,
232 |                                                           log=True)
233 |             if self.scheduler:
234 |                 self.scheduler.step(val_age_acc)
235 | 
236 |             if val_age_acc > best_age_acc:
237 |                 best_age_acc = val_age_acc
238 | 
239 |             print(
240 |                 'epoch {}/{} train_loss: {:.5f}, val_loss: {:.5f}, val_age_acc: {:.5f}'
241 |                 .format(epoch + 1, epoches, train_loss, val_loss, val_age_acc))
242 | 
243 |             self.early_stopping(val_age_acc, self)
244 |             if self.early_stopping.early_stop:
245 |                 print("Early stopping")
246 |                 break
247 | 
248 |         return best_age_acc
249 | 
250 |     def model_predict(self, data, batch_size, log):
251 |         self.eval()
252 | 
253 |         data_size = data.shape[0]
254 |         n_batches = math.ceil(data_size / batch_size)
255 | 
256 |         if log:
257 |             age_acc_list = []
258 |             loss_list = []
259 | 
260 |         oof_pred_age = np.zeros((data.shape[0], 10))
261 | 
262 |         for batch in range(n_batches):
263 |             start = batch * batch_size
264 |             end = min((batch + 1) * batch_size, data_size)
265 |             bs = end - start
266 | 
267 |             seq_id_list, statistics_input, seq_statistics_list, y_age, seq_lengths = self.gen_data(
268 |                 data.iloc[start:end])
269 | 
270 |             pred_age = self(seq_id_list, statistics_input,
271 |                             seq_statistics_list, seq_lengths)
272 | 
273 |             oof_pred_age[start:end] = pred_age.cpu().data.numpy()
274 | 
275 |             del seq_id_list, statistics_input, seq_statistics_list
276 | 
277 |             if log:
278 |                 loss = self.criterion_age(pred_age, y_age)
279 | 
280 |                 pred_age_cat = torch.max(pred_age, 1)[1].cpu().data.numpy()
281 | 
282 |                 age_accuracy = float(
283 |                     (pred_age_cat == y_age.cpu().data.numpy()
284 |                      ).astype(int).sum()) / float(y_age.shape[0])
285 | 
286 |                 age_acc_list.append(age_accuracy)
287 |                 loss_list.append(loss.item())
288 | 
289 |                 del y_age, loss, pred_age
290 | 
291 |             torch.cuda.empty_cache()
292 | 
293 |         if log:
294 |             return oof_pred_age, np.mean(loss_list), np.mean(age_acc_list)
295 |         else:
296 |             return oof_pred_age, None, None
297 | 
-------------------------------------------------------------------------------- /src/torch/m2_transformer_6inputs_age.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 | import math, random
4 | import numpy as np
5 | from torch.nn import functional as F
6 | from torch.nn import TransformerEncoder, TransformerEncoderLayer
7 | from torch.nn.init import xavier_uniform_
8 | from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence
9 | import torch, torch.nn as nn
10 | 
11 | def fix_seed(seed):
12 |     random.seed(seed)
13 |     np.random.seed(seed)
14 |     torch.manual_seed(seed)
15 |     torch.cuda.manual_seed_all(seed)
16 |     torch.backends.cudnn.deterministic = True
17 | 
18 | 
19 | fix_seed(2020)
20 | 
21 | 
22 | class PositionalEncoding(nn.Module):
23 | 
24 |     def __init__(self, d_model, dropout=0.1, max_len=5000):
25 | 
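# Standard sinusoidal positional encoding (Vaswani et al., 2017): even embedding
# dimensions receive sine terms and odd dimensions cosine terms, at wavelengths
# forming a geometric progression; the encoding is added to the input and passed
# through dropout in forward().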

class LSTMCLF(nn.Module):
    """Transformer encoder + BiLSTM over id-embedding sequences for the age task."""

    def __init__(self, seq_embedding_features, statistics_features,
                 seq_statistics_features, seq_len, device):
        super(LSTMCLF, self).__init__()

        self.seq_embedding_features = seq_embedding_features
        self.statistics_features = statistics_features
        self.seq_statistics_features = seq_statistics_features

        self.seq_len = seq_len

        self.seq_statistics_size = len(seq_statistics_features)
        self.statistics_size = len(statistics_features)

        self.device = device

        input_size = 0
        self.embeds = nn.ModuleDict()

        # frozen pretrained (w2v) embedding per id sequence feature
        for f in self.seq_embedding_features:
            embedding_layer = nn.Embedding(
                self.seq_embedding_features[f]['nunique'],
                self.seq_embedding_features[f]['embedding_dim'])

            pretrained_weight = np.array(
                self.seq_embedding_features[f]['pretrained_embedding'])
            embedding_layer.weight.data.copy_(
                torch.from_numpy(pretrained_weight))
            embedding_layer.weight.requires_grad = False
            self.embeds[f] = embedding_layer

        for f in self.seq_embedding_features:
            input_size += seq_embedding_features[f]['embedding_dim']
        input_size += self.seq_statistics_size

        encoder_layer = TransformerEncoderLayer(d_model=input_size, nhead=8)
        self.transformer_encoder = TransformerEncoder(encoder_layer, num_layers=2)
        self.lstm = nn.LSTM(input_size,
                            128,
                            bidirectional=True)
        # DNN layers
        dnn_input_size = 128 * 2
        self.linears = nn.Sequential(nn.Linear(dnn_input_size, 1024),
                                     nn.LeakyReLU(), nn.BatchNorm1d(1024),
                                     nn.Linear(1024, 256), nn.LeakyReLU(),
                                     nn.BatchNorm1d(256), nn.Linear(256, 64),
                                     nn.LeakyReLU(), nn.BatchNorm1d(64),
                                     nn.Dropout(0.1))

        # age output layer
        self.age_output = nn.Linear(64, 10)

    def _generate_square_subsequent_mask(self, sz):
        # causal (upper-triangular) attention mask: -inf above the diagonal
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

    def forward(self, seq_id_list, statistics_input, statistics_seq_input_list,
                seq_lengths):
        batch_size = seq_id_list[0].shape[0]

        # sequence id embeddings
        seq_feature_list = []
        for i, seq_id in enumerate(seq_id_list):
            feature_name = list(self.seq_embedding_features.keys())[i]
            embeddings = self.embeds[feature_name](seq_id.to(self.device))
            seq_feature_list.append(embeddings)

        # sequence statistics features
        for i, statistics_seq_input in enumerate(statistics_seq_input_list):
            statistics_seq_input = statistics_seq_input.view(
                batch_size, self.seq_len, -1)
            seq_feature_list.append(statistics_seq_input)

        seq_input = torch.cat(seq_feature_list, 2)

        # NOTE: seq_input is batch-first, while this TransformerEncoder and the
        # LSTM below were created without batch_first=True (they expect
        # (seq_len, batch, d_model)); the square mask is sized from the first
        # dimension of seq_input.
        src_mask = self._generate_square_subsequent_mask(len(seq_input)).to(self.device)
        seq_output = self.transformer_encoder(seq_input, src_mask)
        seq_output, _ = self.lstm(seq_output)

        # zero out padded positions, then max-pool over dim 1
        mask = torch.zeros(seq_output.shape).to(self.device)
        for idx, seqlen in enumerate(seq_lengths):
            mask[idx, :seqlen] = 1
        seq_output = seq_output * mask
        seq_output, _ = torch.max(seq_output, dim=1)

        # DNN
        dnn_output = self.linears(seq_output)
        age_output = self.age_output(dnn_output)

        return age_output
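
    def _encode_seq_first(self, x, seq_lengths):
        # Alternative encoding sketch (not used by forward above): transpose to
        # the (seq_len, batch, d_model) layout that nn.TransformerEncoder
        # expects by default, and ignore padded positions with a key-padding
        # mask rather than the causal mask built above.
        pad_mask = torch.arange(x.size(1), device=x.device)[None, :] \
            >= seq_lengths.to(x.device)[:, None]          # True marks padding
        out = self.transformer_encoder(x.transpose(0, 1),
                                       src_key_padding_mask=pad_mask)
        return out.transpose(0, 1)                        # back to batch-first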

    def set(self, criterion_age, optimizer, scheduler, early_stopping):
        self.criterion_age = criterion_age
        self.optimizer = optimizer
        self.scheduler = scheduler
        if early_stopping is None:
            self.early_stopping = EarlyStopping(
                file_name='model/age_checkpoint.pt', patience=10, verbose=True)
        else:
            self.early_stopping = early_stopping

    def set_embedding(self):
        for f in self.seq_embedding_features:
            embedding_layer = nn.Embedding(
                self.seq_embedding_features[f]['nunique'],
                self.seq_embedding_features[f]['embedding_dim'])

            pretrained_weight = np.array(
                self.seq_embedding_features[f]['pretrained_embedding'])
            embedding_layer.weight.data.copy_(
                torch.from_numpy(pretrained_weight))
            embedding_layer.weight.requires_grad = False
            self.embeds[f] = embedding_layer

    def gen_data(self, data):
        # sequence id features: pad each batch to its longest sequence
        seq_id_list = []
        for f in self.seq_embedding_features.keys():
            vectorized_seqs = data[f + '_seq'].values

            seq_lengths = torch.LongTensor(list(map(len, vectorized_seqs)))
            seq = [torch.from_numpy(v) for v in vectorized_seqs]
            seq_tensor = pad_sequence(seq, batch_first=True, padding_value=0)
            seq_tensor = seq_tensor.long()
            seq_id_list.append(seq_tensor)

        # statistics features
        statistics_input = data[self.statistics_features].values
        statistics_input = torch.Tensor(statistics_input).to(self.device)

        # sequence statistics features
        seq_statistics_list = []
        for f in self.seq_statistics_features:
            seq_statistics_input = data[f].values
            seq_statistics_input = torch.Tensor(seq_statistics_input).to(
                self.device)
            seq_statistics_list.append(seq_statistics_input)

        y_age = data['age'].values
        y_age = torch.LongTensor(y_age).to(self.device)

        return seq_id_list, statistics_input, seq_statistics_list, y_age, seq_lengths
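
    @staticmethod
    def _pad_sequence_demo():
        # Sketch of the padding gen_data relies on (not called in training):
        # shorter sequences are right-padded with 0 up to the batch max length,
        # and the original lengths are kept separately for masking.
        seqs = [torch.tensor([3, 1, 4]), torch.tensor([1, 5])]
        padded = pad_sequence(seqs, batch_first=True, padding_value=0)
        # padded -> tensor([[3, 1, 4],
        #                   [1, 5, 0]])
        return padded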

    def model_train(self, train_data, val_data, epoches, batch_size):
        data_size = train_data.shape[0]
        n_batches = math.ceil(data_size / batch_size)

        best_age_acc = 0

        for epoch in range(epoches):
            self.train()

            train_loss_list = []
            for batch in tqdm(range(n_batches),
                              desc='epoch:{}/{}'.format(epoch, epoches)):
                start = batch * batch_size
                end = min((batch + 1) * batch_size, data_size)

                seq_id_list, statistics_input, seq_statistics_list, y_age, seq_lengths = self.gen_data(
                    train_data.iloc[start:end])

                pred_age = self(seq_id_list, statistics_input,
                                seq_statistics_list, seq_lengths)

                # y_age from gen_data is already a LongTensor on self.device
                loss = self.criterion_age(pred_age, y_age)

                train_loss_list.append(loss.item())

                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()

                del seq_id_list, statistics_input, seq_statistics_list, y_age, loss
                torch.cuda.empty_cache()

            train_loss = np.mean(train_loss_list)
            _, val_loss, val_age_acc = self.model_predict(val_data,
                                                          batch_size,
                                                          log=True)

            if self.scheduler:
                self.scheduler.step(val_age_acc)

            if val_age_acc > best_age_acc:
                best_age_acc = val_age_acc

            print(
                'epoch {}/{} train_loss: {:.5f}, val_loss: {:.5f}, val_age_acc: {:.5f}'
                .format(epoch + 1, epoches, train_loss, val_loss, val_age_acc))

            self.early_stopping(val_age_acc, self)
            if self.early_stopping.early_stop:
                print("Early stopping")
                break

        return best_age_acc

    def model_predict(self, data, batch_size, log):
        self.eval()

        data_size = data.shape[0]
        n_batches = math.ceil(data_size / batch_size)

        if log:
            age_acc_list = []
            loss_list = []

        oof_pred_age = np.zeros((data.shape[0], 10))

        for batch in range(n_batches):
            start = batch * batch_size
            end = min((batch + 1) * batch_size, data_size)

            seq_id_list, statistics_input, seq_statistics_list, y_age, seq_lengths = self.gen_data(
                data.iloc[start:end])

            pred_age = self(seq_id_list, statistics_input,
                            seq_statistics_list, seq_lengths)

            oof_pred_age[start:end] = pred_age.cpu().data.numpy()

            del seq_id_list, statistics_input, seq_statistics_list

            if log:
                loss = self.criterion_age(pred_age, y_age)

                pred_age_cat = torch.max(pred_age, 1)[1].cpu().data.numpy()

                age_accuracy = float(
                    (pred_age_cat == y_age.cpu().data.numpy()
                     ).astype(int).sum()) / float(y_age.shape[0])

                age_acc_list.append(age_accuracy)
                loss_list.append(loss.item())

            torch.cuda.empty_cache()

        if log:
            return oof_pred_age, np.mean(loss_list), np.mean(age_acc_list)
        else:
            return oof_pred_age, None, None
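
For reference, a sketch of how these model classes appear to be driven by the f*_train_5fold.py scripts. The feature-dict layout matches what __init__ and gen_data read; every name, size and hyper-parameter below is an illustrative assumption, not a value from the repository:

```python
import numpy as np
import pandas as pd
import torch
import torch.nn as nn

# assuming the class above is importable from its module
from m2_transformer_6inputs_age import LSTMCLF
from pytorchtools import EarlyStopping

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# one id-sequence feature with a (vocab_size, dim) pretrained w2v matrix
seq_embedding_features = {
    'creative_id': {
        'nunique': 1000,
        'embedding_dim': 64,
        'pretrained_embedding': np.random.rand(1000, 64),
    },
}

# gen_data expects '<feature>_seq' columns holding integer arrays plus an
# integer 'age' label in [0, 10)
df = pd.DataFrame({
    'creative_id_seq': [np.random.randint(1, 1000, size=np.random.randint(5, 21))
                        for _ in range(64)],
    'age': np.random.randint(0, 10, size=64),
})

model = LSTMCLF(seq_embedding_features, statistics_features=[],
                seq_statistics_features=[], seq_len=20, device=device).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(
    filter(lambda p: p.requires_grad, model.parameters()), lr=1e-3)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='max', patience=2)
stopper = EarlyStopping(file_name='age_checkpoint.pt', patience=3, verbose=True)

model.set(criterion, optimizer, scheduler, early_stopping=stopper)
best_acc = model.model_train(df[:48], df[48:], epoches=2, batch_size=16)
```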
-------------------------------------------------------------------------------- /src/torch/m7_lstm_3inputs_age.py: --------------------------------------------------------------------------------

#!/usr/bin/env python
# coding: utf-8

import math
import random

import numpy as np
import torch
import torch.nn as nn
from torch.nn import functional as F
import torch.nn.utils.rnn as rnn_utils
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence
from tqdm import tqdm

from pytorchtools import EarlyStopping


def fix_seed(seed):
    """Make runs reproducible across random, numpy and torch."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True


fix_seed(2020)


class LSTMCLF(nn.Module):
    """Two-layer BiLSTM + single-head self-attention over id-embedding sequences."""

    def __init__(self, seq_embedding_features, statistics_features,
                 seq_statistics_features, seq_len, device):
        super(LSTMCLF, self).__init__()

        self.seq_embedding_features = seq_embedding_features
        self.statistics_features = statistics_features
        self.seq_statistics_features = seq_statistics_features

        self.seq_len = seq_len

        self.seq_statistics_size = len(seq_statistics_features)
        self.statistics_size = len(statistics_features)

        self.device = device

        input_size = 0
        self.embeds = nn.ModuleDict()

        # frozen pretrained (w2v) embedding per id sequence feature
        for f in self.seq_embedding_features:
            embedding_layer = nn.Embedding(
                self.seq_embedding_features[f]['nunique'],
                self.seq_embedding_features[f]['embedding_dim'])

            pretrained_weight = np.array(
                self.seq_embedding_features[f]['pretrained_embedding'])
            embedding_layer.weight.data.copy_(
                torch.from_numpy(pretrained_weight))
            embedding_layer.weight.requires_grad = False
            self.embeds[f] = embedding_layer

        # LSTM layers
        for f in self.seq_embedding_features:
            input_size += seq_embedding_features[f]['embedding_dim']
        input_size += self.seq_statistics_size

        self.lstm = nn.LSTM(input_size,
                            128,
                            batch_first=True,
                            num_layers=2,
                            bidirectional=True)

        # attention layers
        attention_input_size = 128 * 2
        self.attention_output_size = attention_input_size
        self.Q_weight = nn.Linear(attention_input_size,
                                  self.attention_output_size)
        self.K_weight = nn.Linear(attention_input_size,
                                  self.attention_output_size)
        self.V_weight = nn.Linear(attention_input_size,
                                  self.attention_output_size)

        # DNN layers
        dnn_input_size = self.attention_output_size + attention_input_size + self.statistics_size

        self.linears = nn.Sequential(nn.Linear(dnn_input_size, 1024),
                                     nn.LeakyReLU(), nn.BatchNorm1d(1024),
                                     nn.Linear(1024, 256), nn.LeakyReLU(),
                                     nn.BatchNorm1d(256), nn.Linear(256, 64),
                                     nn.LeakyReLU(), nn.BatchNorm1d(64),
                                     nn.Linear(64, 16), nn.LeakyReLU(),
                                     nn.BatchNorm1d(16), nn.Dropout(0.1))

        # age output layer
        self.age_output = nn.Linear(16, 10)

    def forward(self, seq_id_list, statistics_input, statistics_seq_input_list,
                seq_lengths):
        batch_size = seq_id_list[0].shape[0]

        # sequence id embeddings
        seq_feature_list = []
        for i, seq_id in enumerate(seq_id_list):
            feature_name = list(self.seq_embedding_features.keys())[i]
            embeddings = self.embeds[feature_name](seq_id.to(self.device))
            seq_feature_list.append(embeddings)

        seq_input = torch.cat(seq_feature_list, 2)
        # timestep-level dropout: dropout2d zeroes whole positions along dim 1
        seq_input = F.dropout2d(seq_input, 0.1, training=self.training)

        # LSTM
        seq_output, _ = self.lstm(seq_input)
        # zero out padded positions, then max-pool over time
        mask = torch.zeros(seq_output.shape).to(self.device)
        for idx, seqlen in enumerate(seq_lengths):
            mask[idx, :seqlen] = 1

        seq_output = seq_output * mask
        lstm_output_max, _ = torch.max(seq_output, dim=1)

        # single-head scaled dot-product attention over the LSTM outputs
        Q = self.Q_weight(seq_output)
        K = self.K_weight(seq_output)
        V = self.V_weight(seq_output)

        tmp = torch.bmm(Q, K.transpose(1, 2))
        tmp = tmp / np.sqrt(self.attention_output_size)
        w = torch.softmax(tmp, 2)
        att_output = torch.bmm(w, V)
        att_output = att_output * mask
        att_max_output, _ = torch.max(att_output, dim=1)

        # concatenate pooled attention, pooled LSTM and statistics features
        cat_output = torch.cat(
            [att_max_output, lstm_output_max, statistics_input], 1)

        # DNN
        dnn_output = self.linears(cat_output)
        age_output = self.age_output(dnn_output)

        return age_output
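
    @staticmethod
    def _timestep_dropout_demo():
        # Sketch (not called in training) of the F.dropout2d trick used in
        # forward: under the PyTorch 1.x handling of 3-D inputs, dim 1 is
        # treated as the channel dimension, so entire timesteps (whole token
        # embeddings) are zeroed together instead of individual elements.
        x = torch.ones(2, 5, 4)                    # (batch, seq_len, embed)
        y = F.dropout2d(x, 0.5, training=True)
        return (y == 0).all(dim=2)                 # True rows = dropped timesteps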

    def set(self, criterion_age, optimizer, scheduler, early_stopping):
        self.criterion_age = criterion_age
        self.optimizer = optimizer
        self.scheduler = scheduler
        if early_stopping is None:
            self.early_stopping = EarlyStopping(
                file_name='model/age_checkpoint.pt', patience=10, verbose=True)
        else:
            self.early_stopping = early_stopping

    def set_embedding(self):
        for f in self.seq_embedding_features:
            embedding_layer = nn.Embedding(
                self.seq_embedding_features[f]['nunique'],
                self.seq_embedding_features[f]['embedding_dim'])

            pretrained_weight = np.array(
                self.seq_embedding_features[f]['pretrained_embedding'])
            embedding_layer.weight.data.copy_(
                torch.from_numpy(pretrained_weight))
            embedding_layer.weight.requires_grad = False
            self.embeds[f] = embedding_layer

    def gen_data(self, data):
        # sequence id features: pad each batch to its longest sequence
        seq_id_list = []
        for f in self.seq_embedding_features.keys():
            vectorized_seqs = data[f + '_seq'].values

            seq_lengths = torch.LongTensor(list(map(len, vectorized_seqs)))
            seq = [torch.from_numpy(v) for v in vectorized_seqs]
            seq_tensor = pad_sequence(seq, batch_first=True, padding_value=0)
            seq_tensor = seq_tensor.long()
            seq_id_list.append(seq_tensor)

        # statistics features
        statistics_input = data[self.statistics_features].values
        statistics_input = torch.Tensor(statistics_input).to(self.device)

        # sequence statistics features
        seq_statistics_list = []
        for f in self.seq_statistics_features:
            seq_statistics_input = data[f].values
            seq_statistics_input = torch.Tensor(seq_statistics_input).to(
                self.device)
            seq_statistics_list.append(seq_statistics_input)

        y_age = data['age'].values
        y_age = torch.LongTensor(y_age).to(self.device)

        return seq_id_list, statistics_input, seq_statistics_list, y_age, seq_lengths

    def model_train(self, train_data, val_data, epoches, batch_size):
        data_size = train_data.shape[0]
        n_batches = math.ceil(data_size / batch_size)

        best_age_acc = 0

        for epoch in range(epoches):
            self.train()

            train_loss_list = []
            for batch in tqdm(range(n_batches),
                              desc='epoch:{}/{}'.format(epoch, epoches)):
                start = batch * batch_size
                end = min((batch + 1) * batch_size, data_size)

                seq_id_list, statistics_input, seq_statistics_list, y_age, seq_lengths = self.gen_data(
                    train_data.iloc[start:end])

                pred_age = self(seq_id_list, statistics_input,
                                seq_statistics_list, seq_lengths)

                loss = self.criterion_age(pred_age, y_age)

                train_loss_list.append(loss.item())

                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()

                del seq_id_list, statistics_input, seq_statistics_list, y_age, loss
                torch.cuda.empty_cache()

            train_loss = np.mean(train_loss_list)
            _, val_loss, val_age_acc = self.model_predict(val_data,
                                                          batch_size,
                                                          log=True)
            if self.scheduler:
                self.scheduler.step(val_age_acc)

            if val_age_acc > best_age_acc:
                best_age_acc = val_age_acc

            print(
                'epoch {}/{} train_loss: {:.5f}, val_loss: {:.5f}, val_age_acc: {:.5f}'
                .format(epoch + 1, epoches, train_loss, val_loss, val_age_acc))

            self.early_stopping(val_age_acc, self)
            if self.early_stopping.early_stop:
                print("Early stopping")
                break

        return best_age_acc

    def model_predict(self, data, batch_size, log):
        self.eval()

        data_size = data.shape[0]
        n_batches = math.ceil(data_size / batch_size)

        if log:
            age_acc_list = []
            loss_list = []

        oof_pred_age = np.zeros((data.shape[0], 10))

        for batch in range(n_batches):
            start = batch * batch_size
            end = min((batch + 1) * batch_size, data_size)

            seq_id_list, statistics_input, seq_statistics_list, y_age, seq_lengths = self.gen_data(
                data.iloc[start:end])

            pred_age = self(seq_id_list, statistics_input,
                            seq_statistics_list, seq_lengths)

            oof_pred_age[start:end] = pred_age.cpu().data.numpy()

            del seq_id_list, statistics_input, seq_statistics_list

            if log:
                loss = self.criterion_age(pred_age, y_age)

                pred_age_cat = torch.max(pred_age, 1)[1].cpu().data.numpy()

                age_accuracy = float(
                    (pred_age_cat == y_age.cpu().data.numpy()
                     ).astype(int).sum()) / float(y_age.shape[0])

                age_acc_list.append(age_accuracy)
                loss_list.append(loss.item())

                del y_age, loss, pred_age

            torch.cuda.empty_cache()

        if log:
            return oof_pred_age, np.mean(loss_list), np.mean(age_acc_list)
        else:
            return oof_pred_age, None, None

-------------------------------------------------------------------------------- /src/torch/m9_transformer_3inputs_age.py: --------------------------------------------------------------------------------

#!/usr/bin/env python
# coding: utf-8

import math
import random

import numpy as np
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.nn.init import xavier_uniform_
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence
from tqdm import tqdm

from pytorchtools import EarlyStopping


def fix_seed(seed):
    """Make runs reproducible across random, numpy and torch."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True


fix_seed(2020)


class PositionalEncoding(nn.Module):
    """Standard sinusoidal positional encoding; expects (seq_len, batch, d_model)."""

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)


class LSTMCLF(nn.Module):
    """Transformer encoder + BiLSTM over id-embedding sequences for the age task."""

    def __init__(self, seq_embedding_features, statistics_features,
                 seq_statistics_features, seq_len, device):
        super(LSTMCLF, self).__init__()

        self.seq_embedding_features = seq_embedding_features
        self.statistics_features = statistics_features
        self.seq_statistics_features = seq_statistics_features

        self.seq_len = seq_len

        self.seq_statistics_size = len(seq_statistics_features)
        self.statistics_size = len(statistics_features)

        self.device = device

        input_size = 0
        self.embeds = nn.ModuleDict()

        # frozen pretrained (w2v) embedding per id sequence feature
        for f in self.seq_embedding_features:
            embedding_layer = nn.Embedding(
                self.seq_embedding_features[f]['nunique'],
                self.seq_embedding_features[f]['embedding_dim'])

            pretrained_weight = np.array(
                self.seq_embedding_features[f]['pretrained_embedding'])
            embedding_layer.weight.data.copy_(
                torch.from_numpy(pretrained_weight))
            embedding_layer.weight.requires_grad = False
            self.embeds[f] = embedding_layer

        for f in self.seq_embedding_features:
            input_size += seq_embedding_features[f]['embedding_dim']
        input_size += self.seq_statistics_size

        encoder_layer = TransformerEncoderLayer(d_model=input_size, nhead=8)
        self.transformer_encoder = TransformerEncoder(encoder_layer, num_layers=2)
        self.lstm = nn.LSTM(input_size,
                            128,
                            bidirectional=True)
        # DNN layers
        dnn_input_size = 128 * 2
        self.linears = nn.Sequential(nn.Linear(dnn_input_size, 1024),
                                     nn.LeakyReLU(), nn.BatchNorm1d(1024),
                                     nn.Linear(1024, 256), nn.LeakyReLU(),
                                     nn.BatchNorm1d(256), nn.Linear(256, 64),
                                     nn.LeakyReLU(), nn.BatchNorm1d(64),
                                     nn.Dropout(0.1))

        # age output layer
        self.age_output = nn.Linear(64, 10)

    def _generate_square_subsequent_mask(self, sz):
        # causal (upper-triangular) attention mask: -inf above the diagonal
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

    def forward(self, seq_id_list, statistics_input, statistics_seq_input_list,
                seq_lengths):
        batch_size = seq_id_list[0].shape[0]

        # sequence id embeddings
        seq_feature_list = []
        for i, seq_id in enumerate(seq_id_list):
            feature_name = list(self.seq_embedding_features.keys())[i]
            embeddings = self.embeds[feature_name](seq_id.to(self.device))
            seq_feature_list.append(embeddings)

        # sequence statistics features
        for i, statistics_seq_input in enumerate(statistics_seq_input_list):
            statistics_seq_input = statistics_seq_input.view(
                batch_size, self.seq_len, -1)
            seq_feature_list.append(statistics_seq_input)

        seq_input = torch.cat(seq_feature_list, 2)

        # NOTE: seq_input is batch-first, while this TransformerEncoder and the
        # LSTM below were created without batch_first=True; the square mask is
        # sized from the first dimension of seq_input.
        src_mask = self._generate_square_subsequent_mask(len(seq_input)).to(self.device)
        seq_output = self.transformer_encoder(seq_input, src_mask)
        seq_output, _ = self.lstm(seq_output)

        # zero out padded positions, then max-pool over dim 1
        mask = torch.zeros(seq_output.shape).to(self.device)
        for idx, seqlen in enumerate(seq_lengths):
            mask[idx, :seqlen] = 1
        seq_output = seq_output * mask
        seq_output, _ = torch.max(seq_output, dim=1)

        # DNN
        dnn_output = self.linears(seq_output)
        age_output = self.age_output(dnn_output)

        return age_output

    def set(self, criterion_age, optimizer, scheduler, early_stopping):
        self.criterion_age = criterion_age
        self.optimizer = optimizer
        self.scheduler = scheduler
        if early_stopping is None:
            self.early_stopping = EarlyStopping(
                file_name='model/age_checkpoint.pt', patience=10, verbose=True)
        else:
            self.early_stopping = early_stopping

    def set_embedding(self):
        for f in self.seq_embedding_features:
            embedding_layer = nn.Embedding(
                self.seq_embedding_features[f]['nunique'],
                self.seq_embedding_features[f]['embedding_dim'])

            pretrained_weight = np.array(
                self.seq_embedding_features[f]['pretrained_embedding'])
            embedding_layer.weight.data.copy_(
                torch.from_numpy(pretrained_weight))
            embedding_layer.weight.requires_grad = False
            self.embeds[f] = embedding_layer

    def gen_data(self, data):
        # sequence id features: pad each batch to its longest sequence
        seq_id_list = []
        for f in self.seq_embedding_features.keys():
            vectorized_seqs = data[f + '_seq'].values

            seq_lengths = torch.LongTensor(list(map(len, vectorized_seqs)))
            seq = [torch.from_numpy(v) for v in vectorized_seqs]
            seq_tensor = pad_sequence(seq, batch_first=True, padding_value=0)
            seq_tensor = seq_tensor.long()
            seq_id_list.append(seq_tensor)

        # statistics features
        statistics_input = data[self.statistics_features].values
        statistics_input = torch.Tensor(statistics_input).to(self.device)

        # sequence statistics features
        seq_statistics_list = []
        for f in self.seq_statistics_features:
            seq_statistics_input = data[f].values
            seq_statistics_input = torch.Tensor(seq_statistics_input).to(
                self.device)
            seq_statistics_list.append(seq_statistics_input)

        y_age = data['age'].values
        y_age = torch.LongTensor(y_age).to(self.device)

        return seq_id_list, statistics_input, seq_statistics_list, y_age, seq_lengths

    def model_train(self, train_data, val_data, epoches, batch_size):
        data_size = train_data.shape[0]
        n_batches = math.ceil(data_size / batch_size)

        best_age_acc = 0

        for epoch in range(epoches):
            self.train()

            train_loss_list = []
            for batch in tqdm(range(n_batches),
                              desc='epoch:{}/{}'.format(epoch, epoches)):
                start = batch * batch_size
                end = min((batch + 1) * batch_size, data_size)

                seq_id_list, statistics_input, seq_statistics_list, y_age, seq_lengths = self.gen_data(
                    train_data.iloc[start:end])

                pred_age = self(seq_id_list, statistics_input,
                                seq_statistics_list, seq_lengths)

                # y_age from gen_data is already a LongTensor on self.device
                loss = self.criterion_age(pred_age, y_age)

                train_loss_list.append(loss.item())

                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()

                del seq_id_list, statistics_input, seq_statistics_list, y_age, loss
                torch.cuda.empty_cache()

            train_loss = np.mean(train_loss_list)
            _, val_loss, val_age_acc = self.model_predict(val_data,
                                                          batch_size,
                                                          log=True)

            if self.scheduler:
                self.scheduler.step(val_age_acc)

            if val_age_acc > best_age_acc:
                best_age_acc = val_age_acc

            print(
                'epoch {}/{} train_loss: {:.5f}, val_loss: {:.5f}, val_age_acc: {:.5f}'
                .format(epoch + 1, epoches, train_loss, val_loss, val_age_acc))

            self.early_stopping(val_age_acc, self)
            if self.early_stopping.early_stop:
                print("Early stopping")
                break

        return best_age_acc

    def model_predict(self, data, batch_size, log):
        self.eval()

        data_size = data.shape[0]
        n_batches = math.ceil(data_size / batch_size)

        if log:
            age_acc_list = []
            loss_list = []

        oof_pred_age = np.zeros((data.shape[0], 10))

        for batch in range(n_batches):
            start = batch * batch_size
            end = min((batch + 1) * batch_size, data_size)

            seq_id_list, statistics_input, seq_statistics_list, y_age, seq_lengths = self.gen_data(
                data.iloc[start:end])

            pred_age = self(seq_id_list, statistics_input,
                            seq_statistics_list, seq_lengths)

            oof_pred_age[start:end] = pred_age.cpu().data.numpy()

            del seq_id_list, statistics_input, seq_statistics_list

            if log:
                loss = self.criterion_age(pred_age, y_age)

                pred_age_cat = torch.max(pred_age, 1)[1].cpu().data.numpy()

                age_accuracy = float(
                    (pred_age_cat == y_age.cpu().data.numpy()
                     ).astype(int).sum()) / float(y_age.shape[0])

                age_acc_list.append(age_accuracy)
                loss_list.append(loss.item())

            torch.cuda.empty_cache()

        if log:
            return oof_pred_age, np.mean(loss_list), np.mean(age_acc_list)
        else:
            return oof_pred_age, None, None

-------------------------------------------------------------------------------- /src/torch/pytorchtools.py: --------------------------------------------------------------------------------

import numpy as np
import torch


class EarlyStopping:
    """Stops training early if the validation score doesn't improve after a given patience."""

    def __init__(self, file_name='checkpoint.pt', patience=7, verbose=False, delta=0):
        """
        Args:
            file_name (str): Path the best model checkpoint is saved to.
                             Default: 'checkpoint.pt'
            patience (int): How many calls to wait after the last validation score improvement.
                            Default: 7
            verbose (bool): If True, prints a message for each validation score improvement.
                            Default: False
            delta (float): Minimum change in the monitored score to qualify as an improvement.
                           Default: 0
        """
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.val_score_max = -np.Inf  # higher scores are better
        self.delta = delta
        self.file_name = file_name

    def __call__(self, score, model):
        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(score, model)
        elif score < self.best_score + self.delta:
            self.counter += 1
            print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(score, model)
            self.counter = 0

    def save_checkpoint(self, score, model):
        '''Saves the model when the validation score increases.'''
        if self.verbose:
            print(f'Validation score increased ({self.val_score_max:.6f} --> {score:.6f}). Saving model ...')
        torch.save(model.state_dict(), self.file_name)
        self.val_score_max = score

--------------------------------------------------------------------------------
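
A minimal usage sketch for the EarlyStopping helper above. It monitors a score that should increase (the training loops pass validation accuracy), saves a checkpoint on every improvement, and raises the early_stop flag after `patience` epochs without one:

```python
import torch
import torch.nn as nn

from pytorchtools import EarlyStopping

model = nn.Linear(4, 2)                       # stand-in for a real model
stopper = EarlyStopping(file_name='checkpoint.pt', patience=3, verbose=True)

for epoch, val_acc in enumerate([0.50, 0.52, 0.51, 0.51, 0.51]):
    stopper(val_acc, model)                   # saves state_dict on improvement
    if stopper.early_stop:
        print('stopping after epoch', epoch)
        break

model.load_state_dict(torch.load('checkpoint.pt'))   # restore the best weights
```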