├── README.md └── src ├── model.py ├── preprocess.py ├── track2.py └── utils.py /README.md: -------------------------------------------------------------------------------- 1 | # ICME2019短视频内容理解与推荐竞赛Rank 14方案 2 | [比赛链接](https://www.biendata.com/competition/icmechallenge2019/) 3 | 4 | 队员:[Yumho](https://github.com/gyh75520), [byxshr](https://github.com/byxshr) 5 | 6 | ## 文件说明 7 | - `utils.py`: 提供了生成label均值特征的函数 8 | - `preprocess.py`: 视频特征和音频特征预处理 9 | - `model.py`: 模型文件 10 | - `track2.py`: 训练文件 11 | 12 | ## 机器配置 13 | - 内存256G,TITAN Xp 12G显存 * 2 14 | 15 | ## 模型 16 | 基于[DeepCTR](https://github.com/shenweichen/DeepCTR)(版本大概在v0.2.2至v0.3.2之间)的xDeepFM模型,做了些修改来支持视频特征和音频特征的输入。其中视频特征和音频特征通过`128->embedding_size`的神经网络做embedding,拼接到所有特征的embedding向量后面。 17 | 18 | 具体参数设置可查看`track2.py`。 19 | 20 | ## 特征 21 | 1. 原始特征(uid, user_city, item_id, author_id, item_city, music_id, did, video_duration) 22 | 2. 计数特征,即统计某个字段的出现次数(uid, did, item_id, author_id, uid-author_id) 23 | 3. label均值特征,即根据某个字段分组统计每个分组的标签均值(uid, did, item_id, uid-author_id, uid-did, did-channel) 24 | 4. nunique特征,例如uid_item_nunique,是统计每个uid下有多少不同的item_id,等频离散化 25 | - uid_icity_nunique 26 | - uid_item_nunique 27 | - uid_author_nunique 28 | - uid_music_nunique 29 | - item_ucity_nunique 30 | - item_uid_nunique 31 | - author_uid_nunique 32 | 5. 视频特征 33 | 6. 音频特征 34 | 7. 标题特征,提取视频标题的不重复字段,当作序列特征输入,最后做sum pooling得到embedding向量 35 | 36 | ## 成绩 37 | 最终成绩是跑10次取平均; 38 | track1:a/b榜都是15; 39 | track2:a榜第14,分数为0.79405 (0.73,0.93);b榜第14,分数为0.79485 (0.74,0.93) 40 | 41 | ## 参考 42 | - Lian J, Zhou X, Zhang F, et al. xDeepFM: Combining Explicit and Implicit Feature Interactions for Recommender Systems[J]. 
# README.md (continued) -- tail of the reference list:
#   arXiv preprint arXiv:1803.05170, 2018.(https://arxiv.org/pdf/1803.05170.pdf)
#   - [Bytedance_ICME2019_challenge_baseline](https://github.com/shenweichen/Bytedance_ICME2019_challenge_baseline)
#   - [Data-Competition-TopSolution](https://github.com/Smilexuhc/Data-Competition-TopSolution)
# --------------------------------------------------------------------------------
# /src/model.py:
# --------------------------------------------------------------------------------
import tensorflow as tf
from deepctr.input_embedding import preprocess_input_embedding
from deepctr.layers.interaction import CIN
from deepctr.layers.core import MLP, PredictionLayer
from deepctr.layers.utils import concat_fun
from deepctr.utils import check_feature_config_dict
from tensorflow.python.keras.layers import Lambda
from tensorflow.python.keras.regularizers import l2


def _append_projected_input(name, embedding_size, l2_reg_embedding, deep_emb_list, inputs_list):
    """Add a 128-d dense input called *name* and its learned projection.

    The input is projected to ``embedding_size`` with a bias-free Dense layer,
    reshaped to ``(1, embedding_size)`` and appended to ``deep_emb_list``; the
    Input tensor itself is appended to ``inputs_list``. Both lists are mutated
    in place.
    """
    raw_input = tf.keras.layers.Input(shape=(128,), name=name)
    projected = tf.keras.layers.Dense(
        embedding_size, use_bias=False, kernel_regularizer=l2(l2_reg_embedding))(raw_input)
    projected = tf.keras.layers.Reshape((1, embedding_size), input_shape=(embedding_size,))(projected)
    deep_emb_list.append(projected)
    inputs_list.append(raw_input)


def _build_shared_layers(feature_dim_dict, embedding_size, seed, init_std, l2_reg_linear, l2_reg_embedding,
                         cin_layer_size, cin_split_half, cin_activation, use_video, use_audio):
    """Build the parts both model variants share.

    Returns:
        ``(inputs_list, linear_logit, exFM_logit, deep_input)`` where
        ``exFM_logit`` is ``None`` when ``cin_layer_size`` is empty (no CIN
        branch is built) and ``deep_input`` is the flattened concatenation of
        all embeddings.
    """
    deep_emb_list, linear_logit, inputs_list = preprocess_input_embedding(
        feature_dim_dict, embedding_size, l2_reg_embedding, l2_reg_linear, init_std, seed, True)

    # Video first, then audio: this fixes the order of the model inputs, which
    # callers must match when assembling the input arrays.
    if use_video:
        _append_projected_input('video', embedding_size, l2_reg_embedding, deep_emb_list, inputs_list)
    if use_audio:
        _append_projected_input('audio', embedding_size, l2_reg_embedding, deep_emb_list, inputs_list)

    fm_input = concat_fun(deep_emb_list, axis=1)

    exFM_logit = None
    if len(cin_layer_size) > 0:
        exFM_out = CIN(cin_layer_size, cin_activation, cin_split_half, seed)(fm_input)
        exFM_logit = tf.keras.layers.Dense(1, activation=None,)(exFM_out)

    deep_input = tf.keras.layers.Flatten()(fm_input)
    return inputs_list, linear_logit, exFM_logit, deep_input


def _sum_logits(linear_logit, deep_logit, exFM_logit):
    """Add the linear, deep and (when present) CIN logits together."""
    logits = [linear_logit, deep_logit]
    if exFM_logit is not None:
        logits.append(exFM_logit)
    return tf.keras.layers.add(logits)


def xDeepFM_MTL(feature_dim_dict, embedding_size=8, seed=1024, init_std=0.0001, l2_reg_linear=0.00001, l2_reg_embedding=0.00001,
                cin_layer_size=(256, 256), cin_split_half=True, cin_activation='relu',
                hidden_size=(256, 256), activation='relu', keep_prob=1, use_bn=False, l2_reg_deep=0,
                task_net_size=(128,), task_activation='relu', task_keep_prob=1, task_use_bn=False, task_l2=0,
                final_activation='sigmoid', use_video=False, use_audio=False):
    """Build a two-output xDeepFM model with 'finish' and 'like' heads.

    Both heads share the embeddings, the linear logit, the optional CIN logit
    and the deep MLP; each head then has its own task MLP and a bias-free
    Dense(1) logit.

    Args:
        feature_dim_dict: feature configuration, validated by
            ``check_feature_config_dict`` and passed to
            ``preprocess_input_embedding``.
        embedding_size, seed, init_std, l2_reg_linear, l2_reg_embedding:
            passed to ``preprocess_input_embedding``; ``embedding_size`` and
            ``l2_reg_embedding`` are also used for the video/audio projections.
        cin_layer_size, cin_split_half, cin_activation: passed to ``CIN``.
            An empty ``cin_layer_size`` disables the CIN branch.
        hidden_size, activation, keep_prob, use_bn, l2_reg_deep: passed to the
            shared ``MLP``.
        task_net_size, task_activation, task_keep_prob, task_use_bn, task_l2:
            passed to each per-task ``MLP``.
        final_activation: passed to ``PredictionLayer`` for both outputs.
        use_video, use_audio: add a 128-d 'video' / 'audio' model input.

    Returns:
        A ``tf.keras`` Model with outputs ``[finish, like]``.

    Raises:
        ValueError: if ``task_net_size`` is empty.
    """
    check_feature_config_dict(feature_dim_dict)
    if len(task_net_size) < 1:
        raise ValueError('task_net_size must be at least one layer')

    inputs_list, linear_logit, exFM_logit, deep_input = _build_shared_layers(
        feature_dim_dict, embedding_size, seed, init_std, l2_reg_linear, l2_reg_embedding,
        cin_layer_size, cin_split_half, cin_activation, use_video, use_audio)

    deep_out = MLP(hidden_size, activation, l2_reg_deep, keep_prob, use_bn, seed)(deep_input)

    finish_out = MLP(task_net_size, task_activation, task_l2, task_keep_prob, task_use_bn, seed)(deep_out)
    finish_logit = tf.keras.layers.Dense(1, use_bias=False, activation=None)(finish_out)

    like_out = MLP(task_net_size, task_activation, task_l2, task_keep_prob, task_use_bn, seed)(deep_out)
    like_logit = tf.keras.layers.Dense(1, use_bias=False, activation=None)(like_out)

    # Previously exFM_logit was referenced unconditionally and raised
    # NameError when cin_layer_size was empty.
    finish_logit = _sum_logits(linear_logit, finish_logit, exFM_logit)
    like_logit = _sum_logits(linear_logit, like_logit, exFM_logit)

    output_finish = PredictionLayer(final_activation, name='finish')(finish_logit)
    output_like = PredictionLayer(final_activation, name='like')(like_logit)

    model = tf.keras.models.Model(inputs=inputs_list, outputs=[output_finish, output_like])
    return model


def xDeepFM(feature_dim_dict, embedding_size=8, seed=1024, init_std=0.0001, l2_reg_linear=0.00001, l2_reg_embedding=0.00001,
            cin_layer_size=(256, 256), cin_split_half=True, cin_activation='relu',
            hidden_size=(256, 256), activation='relu', keep_prob=1, use_bn=False, l2_reg_deep=0,
            final_activation='sigmoid', use_video=False, use_audio=False):
    """Build a single-output xDeepFM model with optional video/audio inputs.

    The final logit is the sum of the linear logit, the deep MLP logit and,
    when ``cin_layer_size`` is non-empty, the CIN logit.

    Args:
        feature_dim_dict: feature configuration, validated by
            ``check_feature_config_dict`` and passed to
            ``preprocess_input_embedding``.
        embedding_size, seed, init_std, l2_reg_linear, l2_reg_embedding:
            passed to ``preprocess_input_embedding``; ``embedding_size`` and
            ``l2_reg_embedding`` are also used for the video/audio projections.
        cin_layer_size, cin_split_half, cin_activation: passed to ``CIN``.
            An empty ``cin_layer_size`` disables the CIN branch.
        hidden_size, activation, keep_prob, use_bn, l2_reg_deep: passed to the
            deep ``MLP``.
        final_activation: passed to ``PredictionLayer``.
        use_video, use_audio: add a 128-d 'video' / 'audio' model input.

    Returns:
        A ``tf.keras`` Model with a single output named 'output'.
    """
    check_feature_config_dict(feature_dim_dict)

    inputs_list, linear_logit, exFM_logit, deep_input = _build_shared_layers(
        feature_dim_dict, embedding_size, seed, init_std, l2_reg_linear, l2_reg_embedding,
        cin_layer_size, cin_split_half, cin_activation, use_video, use_audio)

    deep_out = MLP(hidden_size, activation, l2_reg_deep, keep_prob, use_bn, seed)(deep_input)
    deep_logit = tf.keras.layers.Dense(1, use_bias=False, activation=None)(deep_out)

    # Previously exFM_logit was referenced unconditionally and raised
    # NameError when cin_layer_size was empty.
    final_logit = _sum_logits(linear_logit, deep_logit, exFM_logit)
    output = PredictionLayer(final_activation, name='output')(final_logit)

    model = tf.keras.models.Model(inputs=inputs_list, outputs=output)
    return model
# --------------------------------------------------------------------------------
# /src/preprocess.py:
# --------------------------------------------------------------------------------
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Convert the audio feature file (JSON lines) into a standard CSV.
audio_feats = pd.read_json('track2/track2_audio_features.txt', lines=True)
# Renumber the rows after de-duplication: audio_df below gets a fresh
# RangeIndex, and the item_id assignment aligns on index labels.
audio_feats = audio_feats.drop_duplicates(subset='item_id').reset_index(drop=True)
audio_df = pd.DataFrame(audio_feats.audio_feature_128_dim.tolist(), columns=['ad' + str(i) for i in range(128)])
# Was `video_feats['item_id']`, which is not defined until the video section
# below (NameError) and would be the wrong table anyway.
audio_df['item_id'] = audio_feats['item_id']
audio_df.to_csv('track2/track2_audio_features.csv', index=False, float_format='%.4f')

# Convert the video feature file (JSON lines) into a standard CSV,
# min-max scaling each of the 128 dimensions.
video_feats = pd.read_json('track2/track2_video_features.txt', lines=True)
video_df = pd.DataFrame(video_feats.video_feature_dim_128.tolist(), columns=['vd' + str(i) for i in range(128)])
video_df['item_id'] = video_feats['item_id']
video_df.fillna(0, inplace=True)
cols = ['vd' + str(i) for i in range(128)]
video_df[cols] = MinMaxScaler().fit_transform(video_df[cols])
video_df.to_csv('track2/track2_video_features_mms.csv', index=False, float_format='%.4f')

# --------------------------------------------------------------------------------
# /src/track2.py:
# --------------------------------------------------------------------------------
import json
import os

import numpy as np
import pandas as pd
import tensorflow as tf
from deepctr import SingleFeat, VarLenFeat
from keras import backend as K
from keras.backend.tensorflow_backend import set_session
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

from model import xDeepFM
from utils import *

# TF1-style session config: allocate GPU memory on demand.
config = tf.ConfigProto()
config.gpu_options.allow_growth=True
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
set_session(tf.Session(config=config))


def get_input(use_count=False, use_unique=False, use_video=False, use_audio=False, use_title=False, ONLINE_FLAG=False, SAMPLE_FLAG=True, VALIDATION_FRAC=0.2, target='finish'):
    """Load the track2 data and build model inputs and labels.

    Args:
        use_count: add binned count features and expanding target-mean features.
        use_unique: add binned nunique features.
        use_video: merge the 128 'vd*' columns and append them as a model input.
        use_audio: merge the 128 'ad*' columns and append them as a model input.
        use_title: add the title sequence feature and the dense 'title_len'.
        ONLINE_FLAG: if True, train on the full training file and predict the
            test file; otherwise hold out the last ``VALIDATION_FRAC`` of the
            training rows (split by row order) for validation.
        SAMPLE_FLAG: read the small 'sample_*' files instead of the full ones.
        VALIDATION_FRAC: validation fraction used when ``ONLINE_FLAG`` is False.
        target: label column returned as train/test labels.

    Returns:
        ``(feature_dim_dict, train_model_input, train_labels, test_model_input,
        test_labels)``, plus the raw ``test_data`` frame as a sixth element
        when ``ONLINE_FLAG`` is True.
    """
    train_file = 'track2/sample_train.txt' if SAMPLE_FLAG else 'track2/final_track2_train.txt'
    test_file = 'track2/sample_test_no_answer.txt' if SAMPLE_FLAG else 'track2/final_track2_test_no_anwser.txt'
    video_file = 'track2/sample_video_features.csv' if SAMPLE_FLAG else 'track2/track2_video_features_mms.csv'
    audio_file = 'track2/sample_audio_features.csv' if SAMPLE_FLAG else 'track2/track2_audio_features.csv'
    title_file = 'track2/sample_title.txt' if SAMPLE_FLAG else 'track2/track2_title.txt'

    column_names = ['uid', 'user_city', 'item_id', 'author_id', 'item_city', 'channel',
                    'finish', 'like', 'music_id', 'did', 'creat_time', 'video_duration']
    vd_cols = ['vd' + str(i) for i in range(128)]
    ad_cols = ['ad' + str(i) for i in range(128)]

    data = pd.read_csv(train_file, sep='\t', names=column_names)
    print('training set read completed.')
    if ONLINE_FLAG:
        test_data = pd.read_csv(test_file, sep='\t', names=column_names)
        train_size = data.shape[0]
        # DataFrame.append was removed in pandas 2.0; concat works on every
        # pandas version.
        data = pd.concat([data, test_data]).reset_index(drop=True)
    else:
        train_size = int(data.shape[0]*(1-VALIDATION_FRAC))
    print('test set read completed.')

    sparse_features = ['uid', 'user_city', 'item_id', 'author_id', 'item_city', 'channel', 'music_id', 'did', ]
    dense_features = []

    # Equal-frequency binning of the video duration.
    data['video_duration'] = pd.qcut(data['video_duration'], q=10, labels=False, duplicates='drop')
    sparse_features.append('video_duration')

    # Hour of day (fractional) derived from the timestamp, then binned.
    data['creat_time'] = data['creat_time'] % (24 * 3600) / 3600
    data['creat_time'] = pd.qcut(data['creat_time'], q=24, labels=False, duplicates='drop')
    sparse_features.append('creat_time')

    if use_count:
        data['uid-author_id'] = data['uid'].astype(str) + '-' + data['author_id'].astype(str)
        data['uid-did'] = data['uid'].astype(str) + '-' + data['did'].astype(str)
        data['did-channel'] = data['did'].astype(str) + '-' + data['channel'].astype(str)

        # Count features: occurrences of each value, binned into deciles.
        cols = ['uid', 'did', 'item_id', 'author_id', 'uid-author_id']
        for c in cols:
            data[c + '_cnt'] = data[c].map(data[c].value_counts())
            data[c + '_cnt'] = pd.qcut(data[c + '_cnt'], q=10, labels=False, duplicates='drop')
            sparse_features.append(c + '_cnt')

        # Target-mean features.
        # NOTE(review): these always use the 'finish' label, even when
        # `target` is 'like' -- confirm this is intended.
        df = get_expanding_mean(data[:train_size], data[train_size:],
                                ['uid-author_id', 'uid-did', 'did-channel', 'uid', 'did', 'item_id'],
                                'finish')
        dense_features += list(df.columns)
        data = pd.concat([data, df], axis=1)

    if use_unique:
        # (group key, counted column, feature name, number of quantile bins)
        nunique_specs = [
            ('uid', 'item_city', 'uid_icity_nunique', 10),
            ('uid', 'item_id', 'uid_item_nunique', 10),
            ('uid', 'author_id', 'uid_author_nunique', 10),
            ('uid', 'music_id', 'uid_music_nunique', 10),
            ('item_id', 'user_city', 'item_ucity_nunique', 10),
            ('item_id', 'uid', 'item_uid_nunique', 30),
            ('author_id', 'uid', 'author_uid_nunique', 20),
        ]
        for key, counted, feat_name, bins in nunique_specs:
            data[feat_name] = data[key].map(data.groupby(key)[counted].nunique())
            data[feat_name] = pd.qcut(data[feat_name], q=bins, labels=False, duplicates='drop')
            sparse_features.append(feat_name)

    print('generate stats feats completed.')

    if use_video:
        video_feats = pd.read_csv(video_file)
        print('video feats read completed.')

        data = pd.merge(data, video_feats, how='left', on='item_id')
        # Items without video features get zeros. Assign the result back
        # instead of a chained inplace fillna, which may act on a copy.
        data[vd_cols] = data[vd_cols].fillna(0)
        print('merge video feats completed.')

    if use_audio:
        audio_feats = pd.read_csv(audio_file)
        print('audio feats read completed.')

        data = pd.merge(data, audio_feats, how='left', on='item_id')
        data[ad_cols] = data[ad_cols].fillna(0)
        print('merge audio feats completed.')

    if use_title:
        max_len = 47
        title_feats = pd.read_json(title_file, lines=True)
        print('title feats read completed')

        def get_title_len(d):
            return sum(d.values())
        title_feats['title_len'] = title_feats['title_features'].apply(get_title_len)
        prior = title_feats['title_len'].mean()

        dense_features.append('title_len')
        title_feats['title_features'] = title_feats['title_features'].apply(lambda x: list(x.keys()))

        data = pd.merge(data, title_feats, how='left', on='item_id')
        # Items without a title get an empty sequence and the mean length.
        for row in data.loc[data.title_features.isna(), 'title_features'].index:
            data.at[row, 'title_features'] = []
        data['title_len'] = data['title_len'].fillna(prior)
        print('merge title feats completed')

    data[sparse_features] = data[sparse_features].fillna('-1', )

    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])

    if len(dense_features) > 0:
        mms = MinMaxScaler(feature_range=(0, 1))
        data[dense_features] = mms.fit_transform(data[dense_features])

    sparse_feature_list = [SingleFeat(feat, data[feat].nunique()) for feat in sparse_features]
    dense_feature_list = [SingleFeat(feat, 0) for feat in dense_features]
    sequence_feature_list = []

    if use_title:
        # NOTE(review): 134545 is presumably the title vocabulary size -- confirm.
        sequence_feature_list.append(VarLenFeat('title', 134545, max_len, 'sum'))

    print('data preprocess completed.')

    train = data.iloc[:train_size]
    test = data.iloc[train_size:]

    # Input order: sparse, dense, title sequence, video, audio.
    train_model_input = [train[feat.name].values for feat in sparse_feature_list] + \
        [train[feat.name].values for feat in dense_feature_list]

    test_model_input = [test[feat.name].values for feat in sparse_feature_list] + \
        [test[feat.name].values for feat in dense_feature_list]

    if use_title:
        train_model_input += [pad_sequences(train['title_features'], maxlen=max_len, padding='post')]
        test_model_input += [pad_sequences(test['title_features'], maxlen=max_len, padding='post')]

    if use_video:
        video_input = data[vd_cols].values
        train_model_input += [video_input[:train_size]]
        test_model_input += [video_input[train_size:]]

    if use_audio:
        audio_input = data[ad_cols].values
        train_model_input += [audio_input[:train_size]]
        test_model_input += [audio_input[train_size:]]

    print('input process completed.')
    print(f'use sparse feats: [{",".join(sparse_features)}]')
    print(f'use dense feats: [{",".join(dense_features)}]')

    train_labels, test_labels = train[target].values, test[target].values
    feature_dim_dict = {"sparse": sparse_feature_list, "dense": dense_feature_list, "sequence": sequence_feature_list}

    if ONLINE_FLAG:
        return feature_dim_dict, train_model_input, train_labels, test_model_input, test_labels, test_data
    return feature_dim_dict, train_model_input, train_labels, test_model_input, test_labels


def auc(y_true, y_pred):
    """Keras metric: ROC AUC via sklearn's ``roc_auc_score`` wrapped in ``tf.py_func``."""
    return tf.py_func(roc_auc_score, (y_true, y_pred), tf.double)


def main():
    """Train xDeepFM several times; in online mode, average and save predictions."""
    input_params = {
        'use_count': False,  # use count and target-mean features
        'use_unique': False,  # use binned nunique features
        'use_video': False,  # use video features
        'use_audio': False,  # use audio features
        'use_title': False,  # use title features
        'ONLINE_FLAG': False,  # True: train on all data and write a submission; False: offline validation
        'SAMPLE_FLAG': False,  # True: run on the small sample files to check the pipeline
        'VALIDATION_FRAC': 0.2,  # offline validation fraction, split by row order
        'target': 'finish'  # label to predict
    }
    online = input_params['ONLINE_FLAG']
    target = input_params['target']
    # Name outputs after the target that was actually predicted; these were
    # hard-coded to 'finish' even when target was 'like'.
    prob_col = f'{target}_probability'

    iterations = 10  # run several times and average the predictions

    if online:
        feature_dim_dict, train_model_input, train_labels, test_model_input, test_labels, test_data = get_input(**input_params)
        result = test_data[['uid', 'item_id', 'finish', 'like']].copy()
        result.rename(columns={'finish': 'finish_probability', 'like': 'like_probability'}, inplace=True)
        result[prob_col] = 0.0  # accumulator for the averaged prediction
        os.makedirs('output', exist_ok=True)  # to_csv does not create directories
    else:
        feature_dim_dict, train_model_input, train_labels, test_model_input, test_labels = get_input(**input_params)

    for i in range(iterations):
        print(f'iteration {i + 1}/{iterations}')

        model = xDeepFM(feature_dim_dict,
                        embedding_size=8, seed=1278, init_std=0.0001, l2_reg_linear=0.00001, l2_reg_embedding=0.00001,
                        cin_layer_size=(256, 256, 256), cin_split_half=True, cin_activation='relu',
                        hidden_size=(256, 256), activation='relu', keep_prob=1, use_bn=False, l2_reg_deep=0,
                        final_activation='sigmoid', use_video=input_params['use_video'], use_audio=input_params['use_audio'])
        model.compile("adagrad", "binary_crossentropy", metrics=[auc])

        if online:
            history = model.fit(train_model_input, train_labels, batch_size=4096, epochs=1, verbose=1)
            pred_ans = model.predict(test_model_input, batch_size=4096)
            auc_train = history.history['auc'][-1]

            tmp_df = pd.DataFrame({'uid': result['uid'], 'item_id': result['item_id']})
            tmp_df[prob_col] = np.ravel(pred_ans)
            tmp_df.to_csv(f'output/{target}_{auc_train:.5f}.csv', index=False, float_format='%.6f')

            result[prob_col] += tmp_df[prob_col] / iterations
        else:
            history = model.fit(train_model_input, train_labels, batch_size=4096, epochs=1, verbose=1,
                                validation_data=(test_model_input, test_labels))

            auc_train, auc_valid = history.history['auc'][-1], history.history['val_auc'][-1]
            print(f'train score = {auc_train:.5f}\nvalid score = {auc_valid:.5f}')

    if online:
        result.to_csv(f'output/{target}_result.csv', index=False, float_format='%.6f')

    print('done.')


if __name__ == '__main__':
    main()

# --------------------------------------------------------------------------------
# /src/utils.py:
# --------------------------------------------------------------------------------
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold


# Positive-sample counts
def gen_pos_counts(X_train, X_test, cols, target):
    """Per-group sum of *target* for each column in *cols*.

    Training rows get the group's sum minus the row's own target; test rows
    get the group's sum from the training data, or the mean of all group sums
    for groups not seen in training. Returns train rows followed by test rows
    with a fresh index.
    """
    train, test = pd.DataFrame(), pd.DataFrame()

    for col in cols:
        new_col = col + '_' + target + '_cnt'
        count_map = X_train.groupby(col)[target].sum()
        train[new_col] = X_train[col].map(count_map) - X_train[target]
        test[new_col] = X_test[col].map(count_map).fillna(np.mean(count_map))
    return pd.concat([train, test], axis=0).reset_index(drop=True)


# K-fold target mean
def get_kfold_mean(X_train, X_test, cols, target, folds=5):
    """Out-of-fold target mean for each column in *cols*.

    Each training row is encoded with group means computed on the other folds
    (stratified on *target*); test rows use means over the whole training set.
    Groups without a mean fall back to the global training mean. Returns train
    rows followed by test rows with a fresh index.
    """
    skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=918)
    new_names = [col + '_' + target + '_mean' for col in cols]

    # Plain arrays written by position: the old chained
    # `train[new_col][val_idx] = series` aligned on index labels and relied on
    # writing through a possibly temporary Series.
    oof = {name: np.full(X_train.shape[0], np.nan) for name in new_names}

    for tr_idx, val_idx in skf.split(X_train, X_train[target]):
        X_tr, X_val = X_train.iloc[tr_idx], X_train.iloc[val_idx]
        for col, name in zip(cols, new_names):
            fold_means = X_val[col].map(X_tr.groupby(col)[target].mean())
            oof[name][val_idx] = fold_means.values

    train, test = pd.DataFrame(oof), pd.DataFrame()

    prior = X_train[target].mean()
    for col, name in zip(cols, new_names):
        target_map = X_train.groupby(col)[target].mean()
        train[name] = train[name].fillna(prior)
        test[name] = X_test[col].map(target_map).fillna(prior)

    return pd.concat([train, test], axis=0).reset_index(drop=True)


# Smoothed target mean
def get_smooth_mean(X_train, X_test, cols, target, m=300):
    """Target mean per group, shrunk towards the global mean with weight *m*.

    For a group with ``count`` rows and mean ``mean`` the encoded value is
    ``(count * mean + m * global_mean) / (count + m)``. Test groups not seen
    in training get the global training mean. Returns train rows followed by
    test rows with a fresh index.
    """
    def get_smooth_mean_map(df, by, on, m=300):
        mean = df[on].mean()
        agg = df.groupby(by)[on].agg(['count', 'mean'])
        counts = agg['count']
        means = agg['mean']
        smooth = (counts * means + m * mean) / (counts + m)
        return smooth

    prior = X_train[target].mean()
    train, test = pd.DataFrame(), pd.DataFrame()

    for col in cols:
        new_col = col + '_' + target + '_mean'
        target_map = get_smooth_mean_map(X_train, by=col, on=target, m=m)
        train[new_col] = X_train[col].map(target_map)
        test[new_col] = X_test[col].map(target_map).fillna(prior)

    return pd.concat([train, test], axis=0).reset_index(drop=True)


# Leave-one-out target mean
def get_loo_mean(X_train, X_test, cols, target):
    """Leave-one-out target mean for each column in *cols*.

    Each training row gets its group's mean computed without that row; groups
    of size one (0/0) fall back to the global training mean. Test rows use the
    plain group mean from training, with the same fallback. Returns train rows
    followed by test rows with a fresh index.
    """
    prior = X_train[target].mean()
    train, test = pd.DataFrame(), pd.DataFrame()

    for col in cols:
        new_col = col + '_' + target + '_mean'

        target_sum = X_train.groupby(col)[target].transform('sum')
        n_objects = X_train.groupby(col)[target].transform('count')

        # Assign the filled Series back; a chained inplace fillna may operate
        # on a temporary copy and leave NaNs behind.
        train[new_col] = ((target_sum - X_train[target]) / (n_objects - 1)).fillna(prior)
        test[new_col] = X_test[col].map(X_train.groupby(col)[target].mean()).fillna(prior)
    return pd.concat([train, test], axis=0).reset_index(drop=True)


# Expanding target mean
def get_expanding_mean(X_train, X_test, cols, target):
    """Expanding (prefix) target mean for each column in *cols*.

    Each training row gets the mean target of the earlier rows of its group,
    in row order; the first row of a group (0/0) falls back to the global
    training mean. Test rows use the plain group mean from training, with the
    same fallback. Returns train rows followed by test rows with a fresh index.
    """
    prior = X_train[target].mean()
    train, test = pd.DataFrame(), pd.DataFrame()

    for col in cols:
        new_col = col + '_' + target + '_mean'

        cumsum = X_train.groupby(col)[target].cumsum() - X_train[target]
        cumcnt = X_train.groupby(col)[target].cumcount()

        # Assign the filled Series back; a chained inplace fillna may operate
        # on a temporary copy and leave NaNs behind.
        train[new_col] = (cumsum / cumcnt).fillna(prior)
        test[new_col] = X_test[col].map(X_train.groupby(col)[target].mean()).fillna(prior)
    return pd.concat([train, test], axis=0).reset_index(drop=True)