├── README.md ├── baseline.py ├── comm.py └── evaluation.py /README.md: -------------------------------------------------------------------------------- 1 | # **2021中国高校计算机大赛-微信大数据挑战赛Baseline** 2 | 3 | 本次比赛基于脱敏和采样后的数据信息,对于给定的一定数量到访过微信视频号“热门推荐”的用户,根据这些用户在视频号内的历史n天的行为数据,通过算法在测试集上预测出这些用户对于不同视频内容的互动行为(包括点赞、点击头像、收藏、转发等)的发生概率。 4 | 5 | 本次比赛以多个行为预测结果的加权uAUC值进行评分。大赛官方网站:https://algo.weixin.qq.com/ 6 | 7 | ## **1. 环境配置** 8 | 9 | - pandas>=1.0.5 10 | - tensorflow>=1.14.0 11 | - python3 12 | 13 | ## **2. 运行配置** 14 | 15 | - CPU/GPU均可 16 | - 最小内存要求 17 | - 特征/样本生成:3G 18 | - 模型训练及评估:6G 19 | 20 | - 耗时 21 | - 测试环境:内存8G,CPU 2.3 GHz 双核Intel Core i5 22 | - 特征/样本生成:226 s 23 | - 模型训练及评估:740 s 24 | 25 | ## **3. 目录结构** 26 | 27 | - comm.py: 数据集生成 28 | - baseline.py: 模型训练,评估,提交 29 | - evaluation.py: uauc 评估 30 | - data/: 数据,特征,模型 31 | - wechat_algo_data1/: 初赛数据集 32 | - feature/: 特征 33 | - offline_train/:离线训练数据集 34 | - online_train/:在线训练数据集 35 | - evaluate/:评估数据集 36 | - submit/:在线预估结果提交 37 | - model/: 模型文件 38 | 39 | ## **4. 运行流程** 40 | - 新建data目录,下载比赛数据集,放在data目录下并解压,得到wechat_algo_data1目录 41 | - 生成特征/样本:python comm.py (自动新建data目录下用于存储特征、样本和模型的各个目录) 42 | - 训练离线模型:python baseline.py offline_train 43 | - 评估离线模型:python baseline.py evaluate (生成data/evaluate/submit_${timestamp}.csv) 44 | - 训练在线模型:python baseline.py online_train 45 | - 生成提交文件:python baseline.py submit (生成data/submit/submit_${timestamp}.csv) 46 | - 评估代码: evaluation.py 47 | 48 | ## **5. 模型及特征** 49 | - 模型:[Wide & Deep](https://dl.acm.org/doi/pdf/10.1145/2988450.2988454) 50 | - 参数: 51 | - batch_size: 128 52 | - emded_dim: 10 53 | - num_epochs: 1 54 | - learning_rate: 0.1 55 | - 特征: 56 | - dnn 特征: userid, feedid, authorid, bgm_singer_id, bgm_song_id 57 | - linear 特征:videoplayseconds, device,用户/feed 历史行为次数 58 | 59 | ## **6. 模型结果** 60 | 61 | |stage |weight_uauc |read_comment|like|click_avatar|forward| 62 | |:---- |:---- |:---- |:---- |:---- |:----| 63 | | 离线 | 0.657003 |0.626822 |0.633864 |0.735366 |0.690416 | 64 | | 在线 | 0.607908| 0.577496 |0.588645 |0.682383 |0.638398 | 65 | 66 | ## **7. 相关文献** 67 | * Cheng, Heng-Tze, et al. "Wide & deep learning for recommender systems." Proceedings of the 1st workshop on deep learning for recommender systems. 2016. 68 | 69 | 70 | 71 | 72 | 73 | -------------------------------------------------------------------------------- /baseline.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import os 4 | import time 5 | import numpy as np 6 | import pandas as pd 7 | import tensorflow.compat.v1 as tf 8 | from tensorflow import feature_column as fc 9 | from comm import ACTION_LIST, STAGE_END_DAY, FEA_COLUMN_LIST 10 | from evaluation import uAUC, compute_weighted_score 11 | 12 | 13 | flags = tf.app.flags 14 | FLAGS = flags.FLAGS 15 | 16 | flags.DEFINE_string('model_checkpoint_dir', './data/model', 'model dir') 17 | flags.DEFINE_string('root_path', './data/', 'data dir') 18 | flags.DEFINE_integer('batch_size', 128, 'batch_size') 19 | flags.DEFINE_integer('embed_dim', 10, 'embed_dim') 20 | flags.DEFINE_float('learning_rate', 0.1, 'learning_rate') 21 | flags.DEFINE_float('embed_l2', None, 'embedding l2 reg') 22 | 23 | SEED = 2021 24 | 25 | 26 | 27 | class WideAndDeep(object): 28 | 29 | def __init__(self, linear_feature_columns, dnn_feature_columns, stage, action): 30 | """ 31 | :param linear_feature_columns: List of tensorflow feature_column 32 | :param dnn_feature_columns: List of tensorflow feature_column 33 | :param stage: String. Including "online_train"/"offline_train"/"evaluate"/"submit" 34 | :param action: String. Including "read_comment"/"like"/"click_avatar"/"favorite"/"forward"/"comment"/"follow" 35 | """ 36 | super(WideAndDeep, self).__init__() 37 | self.num_epochs_dict = {"read_comment": 1, "like": 1, "click_avatar": 1, "favorite": 1, "forward": 1, 38 | "comment": 1, "follow": 1} 39 | self.estimator = None 40 | self.linear_feature_columns = linear_feature_columns 41 | self.dnn_feature_columns = dnn_feature_columns 42 | self.stage = stage 43 | self.action = action 44 | tf.logging.set_verbosity(tf.logging.INFO) 45 | 46 | def build_estimator(self): 47 | if self.stage in ["evaluate", "offline_train"]: 48 | stage = "offline_train" 49 | else: 50 | stage = "online_train" 51 | model_checkpoint_stage_dir = os.path.join(FLAGS.model_checkpoint_dir, stage, self.action) 52 | if not os.path.exists(model_checkpoint_stage_dir): 53 | # 如果模型目录不存在,则创建该目录 54 | os.makedirs(model_checkpoint_stage_dir) 55 | elif self.stage in ["online_train", "offline_train"]: 56 | # 训练时如果模型目录已存在,则清空目录 57 | del_file(model_checkpoint_stage_dir) 58 | optimizer = tf.train.AdamOptimizer(learning_rate=FLAGS.learning_rate, beta1=0.9, beta2=0.999, 59 | epsilon=1) 60 | config = tf.estimator.RunConfig(model_dir=model_checkpoint_stage_dir, tf_random_seed=SEED) 61 | self.estimator = tf.estimator.DNNLinearCombinedClassifier( 62 | model_dir=model_checkpoint_stage_dir, 63 | linear_feature_columns=self.linear_feature_columns, 64 | dnn_feature_columns=self.dnn_feature_columns, 65 | dnn_hidden_units=[32, 8], 66 | dnn_optimizer=optimizer, 67 | config=config) 68 | 69 | def df_to_dataset(self, df, stage, action, shuffle=True, batch_size=128, num_epochs=1): 70 | ''' 71 | 把DataFrame转为tensorflow dataset 72 | :param df: pandas dataframe. 73 | :param stage: String. Including "online_train"/"offline_train"/"evaluate"/"submit" 74 | :param action: String. Including "read_comment"/"like"/"click_avatar"/"favorite"/"forward"/"comment"/"follow" 75 | :param shuffle: Boolean. 76 | :param batch_size: Int. Size of each batch 77 | :param num_epochs: Int. Epochs num 78 | :return: tf.data.Dataset object. 79 | ''' 80 | print(df.shape) 81 | print(df.columns) 82 | print("batch_size: ", batch_size) 83 | print("num_epochs: ", num_epochs) 84 | if stage != "submit": 85 | label = df[action] 86 | ds = tf.data.Dataset.from_tensor_slices((dict(df), label)) 87 | else: 88 | ds = tf.data.Dataset.from_tensor_slices((dict(df))) 89 | if shuffle: 90 | ds = ds.shuffle(buffer_size=len(df), seed=SEED) 91 | ds = ds.batch(batch_size) 92 | if stage in ["online_train", "offline_train"]: 93 | ds = ds.repeat(num_epochs) 94 | return ds 95 | 96 | def input_fn_train(self, df, stage, action, num_epochs): 97 | return self.df_to_dataset(df, stage, action, shuffle=True, batch_size=FLAGS.batch_size, 98 | num_epochs=num_epochs) 99 | 100 | def input_fn_predict(self, df, stage, action): 101 | return self.df_to_dataset(df, stage, action, shuffle=False, batch_size=len(df), num_epochs=1) 102 | 103 | def train(self): 104 | """ 105 | 训练单个行为的模型 106 | """ 107 | file_name = "{stage}_{action}_{day}_concate_sample.csv".format(stage=self.stage, action=self.action, 108 | day=STAGE_END_DAY[self.stage]) 109 | stage_dir = os.path.join(FLAGS.root_path, self.stage, file_name) 110 | df = pd.read_csv(stage_dir) 111 | self.estimator.train( 112 | input_fn=lambda: self.input_fn_train(df, self.stage, self.action, self.num_epochs_dict[self.action]) 113 | ) 114 | 115 | def evaluate(self): 116 | """ 117 | 评估单个行为的uAUC值 118 | """ 119 | if self.stage in ["online_train", "offline_train"]: 120 | # 训练集,每个action一个文件 121 | action = self.action 122 | else: 123 | # 测试集,所有action在同一个文件 124 | action = "all" 125 | file_name = "{stage}_{action}_{day}_concate_sample.csv".format(stage=self.stage, action=action, 126 | day=STAGE_END_DAY[self.stage]) 127 | evaluate_dir = os.path.join(FLAGS.root_path, self.stage, file_name) 128 | df = pd.read_csv(evaluate_dir) 129 | userid_list = df['userid'].astype(str).tolist() 130 | predicts = self.estimator.predict( 131 | input_fn=lambda: self.input_fn_predict(df, self.stage, self.action) 132 | ) 133 | predicts_df = pd.DataFrame.from_dict(predicts) 134 | logits = predicts_df["logistic"].map(lambda x: x[0]) 135 | labels = df[self.action].values 136 | uauc = uAUC(labels, logits, userid_list) 137 | return df[["userid", "feedid"]], logits, uauc 138 | 139 | 140 | def predict(self): 141 | ''' 142 | 预测单个行为的发生概率 143 | ''' 144 | file_name = "{stage}_{action}_{day}_concate_sample.csv".format(stage=self.stage, action="all", 145 | day=STAGE_END_DAY[self.stage]) 146 | submit_dir = os.path.join(FLAGS.root_path, self.stage, file_name) 147 | df = pd.read_csv(submit_dir) 148 | t = time.time() 149 | predicts = self.estimator.predict( 150 | input_fn=lambda: self.input_fn_predict(df, self.stage, self.action) 151 | ) 152 | predicts_df = pd.DataFrame.from_dict(predicts) 153 | logits = predicts_df["logistic"].map(lambda x: x[0]) 154 | # 计算2000条样本平均预测耗时(毫秒) 155 | ts = (time.time()-t)*1000.0/len(df)*2000.0 156 | return df[["userid", "feedid"]], logits, ts 157 | 158 | 159 | 160 | def del_file(path): 161 | ''' 162 | 删除path目录下的所有内容 163 | ''' 164 | ls = os.listdir(path) 165 | for i in ls: 166 | c_path = os.path.join(path, i) 167 | if os.path.isdir(c_path): 168 | del_file(c_path) 169 | else: 170 | print("del: ", c_path) 171 | os.remove(c_path) 172 | 173 | 174 | def get_feature_columns(): 175 | ''' 176 | 获取特征列 177 | ''' 178 | dnn_feature_columns = list() 179 | linear_feature_columns = list() 180 | # DNN features 181 | user_cate = fc.categorical_column_with_hash_bucket("userid", 40000, tf.int64) 182 | feed_cate = fc.categorical_column_with_hash_bucket("feedid", 240000, tf.int64) 183 | author_cate = fc.categorical_column_with_hash_bucket("authorid", 40000, tf.int64) 184 | bgm_singer_cate = fc.categorical_column_with_hash_bucket("bgm_singer_id", 40000, tf.int64) 185 | bgm_song_cate = fc.categorical_column_with_hash_bucket("bgm_song_id", 60000, tf.int64) 186 | user_embedding = fc.embedding_column(user_cate, FLAGS.embed_dim, max_norm=FLAGS.embed_l2) 187 | feed_embedding = fc.embedding_column(feed_cate, FLAGS.embed_dim, max_norm=FLAGS.embed_l2) 188 | author_embedding = fc.embedding_column(author_cate, FLAGS.embed_dim, max_norm=FLAGS.embed_l2) 189 | bgm_singer_embedding = fc.embedding_column(bgm_singer_cate, FLAGS.embed_dim) 190 | bgm_song_embedding = fc.embedding_column(bgm_song_cate, FLAGS.embed_dim) 191 | dnn_feature_columns.append(user_embedding) 192 | dnn_feature_columns.append(feed_embedding) 193 | dnn_feature_columns.append(author_embedding) 194 | dnn_feature_columns.append(bgm_singer_embedding) 195 | dnn_feature_columns.append(bgm_song_embedding) 196 | # Linear features 197 | video_seconds = fc.numeric_column("videoplayseconds", default_value=0.0) 198 | device = fc.numeric_column("device", default_value=0.0) 199 | linear_feature_columns.append(video_seconds) 200 | linear_feature_columns.append(device) 201 | # 行为统计特征 202 | for b in FEA_COLUMN_LIST: 203 | feed_b = fc.numeric_column(b+"sum", default_value=0.0) 204 | linear_feature_columns.append(feed_b) 205 | user_b = fc.numeric_column(b+"sum_user", default_value=0.0) 206 | linear_feature_columns.append(user_b) 207 | return dnn_feature_columns, linear_feature_columns 208 | 209 | 210 | def main(argv): 211 | t = time.time() 212 | dnn_feature_columns, linear_feature_columns = get_feature_columns() 213 | stage = argv[1] 214 | print('Stage: %s'%stage) 215 | eval_dict = {} 216 | predict_dict = {} 217 | predict_time_cost = {} 218 | ids = None 219 | for action in ACTION_LIST: 220 | print("Action:", action) 221 | model = WideAndDeep(linear_feature_columns, dnn_feature_columns, stage, action) 222 | model.build_estimator() 223 | 224 | if stage in ["online_train", "offline_train"]: 225 | # 训练 并评估 226 | model.train() 227 | ids, logits, action_uauc = model.evaluate() 228 | eval_dict[action] = action_uauc 229 | 230 | if stage == "evaluate": 231 | # 评估线下测试集结果,计算单个行为的uAUC值,并保存预测结果 232 | ids, logits, action_uauc = model.evaluate() 233 | eval_dict[action] = action_uauc 234 | predict_dict[action] = logits 235 | 236 | if stage == "submit": 237 | # 预测线上测试集结果,保存预测结果 238 | ids, logits, ts = model.predict() 239 | predict_time_cost[action] = ts 240 | predict_dict[action] = logits 241 | 242 | if stage in ["evaluate", "offline_train", "online_train"]: 243 | # 计算所有行为的加权uAUC 244 | print(eval_dict) 245 | weight_dict = {"read_comment": 4, "like": 3, "click_avatar": 2, "favorite": 1, "forward": 1, 246 | "comment": 1, "follow": 1} 247 | weight_auc = compute_weighted_score(eval_dict, weight_dict) 248 | print("Weighted uAUC: ", weight_auc) 249 | 250 | 251 | if stage in ["evaluate", "submit"]: 252 | # 保存所有行为的预测结果,生成submit文件 253 | actions = pd.DataFrame.from_dict(predict_dict) 254 | print("Actions:", actions) 255 | ids[["userid", "feedid"]] = ids[["userid", "feedid"]].astype(int) 256 | res = pd.concat([ids, actions], sort=False, axis=1) 257 | # 写文件 258 | file_name = "submit_" + str(int(time.time())) + ".csv" 259 | submit_file = os.path.join(FLAGS.root_path, stage, file_name) 260 | print('Save to: %s'%submit_file) 261 | res.to_csv(submit_file, index=False) 262 | 263 | if stage == "submit": 264 | print('不同目标行为2000条样本平均预测耗时(毫秒):') 265 | print(predict_time_cost) 266 | print('单个目标行为2000条样本平均预测耗时(毫秒):') 267 | print(np.mean([v for v in predict_time_cost.values()])) 268 | print('Time cost: %.2f s'%(time.time()-t)) 269 | 270 | 271 | if __name__ == "__main__": 272 | tf.app.run(main) 273 | -------------------------------------------------------------------------------- /comm.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import os 3 | import time 4 | import logging 5 | LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s" 6 | logging.basicConfig(level=logging.INFO, format=LOG_FORMAT) 7 | logger = logging.getLogger(__file__) 8 | import numpy as np 9 | import pandas as pd 10 | 11 | # 存储数据的根目录 12 | ROOT_PATH = "./data" 13 | # 比赛数据集路径 14 | DATASET_PATH = os.path.join(ROOT_PATH, "wechat_algo_data1") 15 | # 训练集 16 | USER_ACTION = os.path.join(DATASET_PATH, "user_action.csv") 17 | FEED_INFO = os.path.join(DATASET_PATH, "feed_info.csv") 18 | FEED_EMBEDDINGS = os.path.join(DATASET_PATH, "feed_embeddings.csv") 19 | # 测试集 20 | TEST_FILE = os.path.join(DATASET_PATH, "test_a.csv") 21 | END_DAY = 15 22 | SEED = 2021 23 | 24 | # 初赛待预测行为列表 25 | ACTION_LIST = ["read_comment", "like", "click_avatar", "forward"] 26 | # 复赛待预测行为列表 27 | # ACTION_LIST = ["read_comment", "like", "click_avatar", "forward", "comment", "follow", "favorite"] 28 | # 用于构造特征的字段列表 29 | FEA_COLUMN_LIST = ["read_comment", "like", "click_avatar", "forward", "comment", "follow", "favorite"] 30 | # 每个行为的负样本下采样比例(下采样后负样本数/原负样本数) 31 | ACTION_SAMPLE_RATE = {"read_comment": 0.2, "like": 0.2, "click_avatar": 0.2, "forward": 0.1, "comment": 0.1, "follow": 0.1, "favorite": 0.1} 32 | 33 | # 各个阶段数据集的设置的最后一天 34 | STAGE_END_DAY = {"online_train": 14, "offline_train": 12, "evaluate": 13, "submit": 15} 35 | # 各个行为构造训练数据的天数 36 | ACTION_DAY_NUM = {"read_comment": 5, "like": 5, "click_avatar": 5, "forward": 5, "comment": 5, "follow": 5, "favorite": 5} 37 | 38 | 39 | def create_dir(): 40 | """ 41 | 创建所需要的目录 42 | """ 43 | # 创建data目录 44 | if not os.path.exists(ROOT_PATH): 45 | print('Create dir: %s'%ROOT_PATH) 46 | os.mkdir(ROOT_PATH) 47 | # data目录下需要创建的子目录 48 | need_dirs = ["offline_train", "online_train", "evaluate", "submit", 49 | "feature", "model", "model/online_train", "model/offline_train"] 50 | for need_dir in need_dirs: 51 | need_dir = os.path.join(ROOT_PATH, need_dir) 52 | if not os.path.exists(need_dir): 53 | print('Create dir: %s'%need_dir) 54 | os.mkdir(need_dir) 55 | 56 | 57 | def check_file(): 58 | ''' 59 | 检查数据文件是否存在 60 | ''' 61 | paths = [USER_ACTION, FEED_INFO, TEST_FILE] 62 | flag = True 63 | not_exist_file = [] 64 | for f in paths: 65 | if not os.path.exists(f): 66 | not_exist_file.append(f) 67 | flag = False 68 | return flag, not_exist_file 69 | 70 | 71 | def statis_data(): 72 | """ 73 | 统计特征最大,最小,均值 74 | """ 75 | paths = [USER_ACTION, FEED_INFO, TEST_FILE] 76 | pd.set_option('display.max_columns', None) 77 | for path in paths: 78 | df = pd.read_csv(path) 79 | print(path + " statis: ") 80 | print(df.describe()) 81 | print('Distinct count:') 82 | print(df.nunique()) 83 | 84 | 85 | def statis_feature(start_day=1, before_day=7, agg='sum'): 86 | """ 87 | 统计用户/feed 过去n天各类行为的次数 88 | :param start_day: Int. 起始日期 89 | :param before_day: Int. 时间范围(天数) 90 | :param agg: String. 统计方法 91 | """ 92 | history_data = pd.read_csv(USER_ACTION)[["userid", "date_", "feedid"] + FEA_COLUMN_LIST] 93 | feature_dir = os.path.join(ROOT_PATH, "feature") 94 | for dim in ["userid", "feedid"]: 95 | print(dim) 96 | user_data = history_data[[dim, "date_"] + FEA_COLUMN_LIST] 97 | res_arr = [] 98 | for start in range(start_day, END_DAY-before_day+1): 99 | temp = user_data[((user_data["date_"]) >= start) & (user_data["date_"] < (start + before_day))] 100 | temp = temp.drop(columns=['date_']) 101 | temp = temp.groupby([dim]).agg([agg]).reset_index() 102 | temp.columns = list(map(''.join, temp.columns.values)) 103 | temp["date_"] = start + before_day 104 | res_arr.append(temp) 105 | dim_feature = pd.concat(res_arr) 106 | feature_path = os.path.join(feature_dir, dim+"_feature.csv") 107 | print('Save to: %s'%feature_path) 108 | dim_feature.to_csv(feature_path, index=False) 109 | 110 | 111 | def generate_sample(stage="offline_train"): 112 | """ 113 | 对负样本进行下采样,生成各个阶段所需样本 114 | :param stage: String. Including "online_train"/"offline_train"/"evaluate"/"submit" 115 | :return: List of sample df 116 | """ 117 | day = STAGE_END_DAY[stage] 118 | if stage == "submit": 119 | sample_path = TEST_FILE 120 | else: 121 | sample_path = USER_ACTION 122 | stage_dir = os.path.join(ROOT_PATH, stage) 123 | df = pd.read_csv(sample_path) 124 | df_arr = [] 125 | if stage == "evaluate": 126 | # 线下评估 127 | col = ["userid", "feedid", "date_", "device"] + ACTION_LIST 128 | df = df[df["date_"] == day][col] 129 | file_name = os.path.join(stage_dir, stage + "_" + "all" + "_" + str(day) + "_generate_sample.csv") 130 | print('Save to: %s'%file_name) 131 | df.to_csv(file_name, index=False) 132 | df_arr.append(df) 133 | elif stage == "submit": 134 | # 线上提交 135 | file_name = os.path.join(stage_dir, stage + "_" + "all" + "_" + str(day) + "_generate_sample.csv") 136 | df["date_"] = 15 137 | print('Save to: %s'%file_name) 138 | df.to_csv(file_name, index=False) 139 | df_arr.append(df) 140 | else: 141 | # 线下/线上训练 142 | # 同行为取按时间最近的样本 143 | for action in ACTION_LIST: 144 | df = df.drop_duplicates(subset=['userid', 'feedid', action], keep='last') 145 | # 负样本下采样 146 | for action in ACTION_LIST: 147 | action_df = df[(df["date_"] <= day) & (df["date_"] >= day - ACTION_DAY_NUM[action] + 1)] 148 | df_neg = action_df[action_df[action] == 0] 149 | df_pos = action_df[action_df[action] == 1] 150 | df_neg = df_neg.sample(frac=ACTION_SAMPLE_RATE[action], random_state=SEED, replace=False) 151 | df_all = pd.concat([df_neg, df_pos]) 152 | col = ["userid", "feedid", "date_", "device"] + [action] 153 | file_name = os.path.join(stage_dir, stage + "_" + action + "_" + str(day) + "_generate_sample.csv") 154 | print('Save to: %s'%file_name) 155 | df_all[col].to_csv(file_name, index=False) 156 | df_arr.append(df_all[col]) 157 | return df_arr 158 | 159 | 160 | def concat_sample(sample_arr, stage="offline_train"): 161 | """ 162 | 基于样本数据和特征,生成特征数据 163 | :param sample_arr: List of sample df 164 | :param stage: String. Including "online_train"/"offline_train"/"evaluate"/"submit" 165 | """ 166 | day = STAGE_END_DAY[stage] 167 | # feed信息表 168 | feed_info = pd.read_csv(FEED_INFO) 169 | feed_info = feed_info.set_index('feedid') 170 | # 基于userid统计的历史行为的次数 171 | user_date_feature_path = os.path.join(ROOT_PATH, "feature", "userid_feature.csv") 172 | user_date_feature = pd.read_csv(user_date_feature_path) 173 | user_date_feature = user_date_feature.set_index(["userid", "date_"]) 174 | # 基于feedid统计的历史行为的次数 175 | feed_date_feature_path = os.path.join(ROOT_PATH, "feature", "feedid_feature.csv") 176 | feed_date_feature = pd.read_csv(feed_date_feature_path) 177 | feed_date_feature = feed_date_feature.set_index(["feedid", "date_"]) 178 | 179 | for index, sample in enumerate(sample_arr): 180 | features = ["userid", "feedid", "device", "authorid", "bgm_song_id", "bgm_singer_id", 181 | "videoplayseconds"] 182 | if stage == "evaluate": 183 | action = "all" 184 | features += ACTION_LIST 185 | elif stage == "submit": 186 | action = "all" 187 | else: 188 | action = ACTION_LIST[index] 189 | features += [action] 190 | print("action: ", action) 191 | sample = sample.join(feed_info, on="feedid", how="left", rsuffix="_feed") 192 | sample = sample.join(feed_date_feature, on=["feedid", "date_"], how="left", rsuffix="_feed") 193 | sample = sample.join(user_date_feature, on=["userid", "date_"], how="left", rsuffix="_user") 194 | feed_feature_col = [b+"sum" for b in FEA_COLUMN_LIST] 195 | user_feature_col = [b+"sum_user" for b in FEA_COLUMN_LIST] 196 | sample[feed_feature_col] = sample[feed_feature_col].fillna(0.0) 197 | sample[user_feature_col] = sample[user_feature_col].fillna(0.0) 198 | sample[feed_feature_col] = np.log(sample[feed_feature_col] + 1.0) 199 | sample[user_feature_col] = np.log(sample[user_feature_col] + 1.0) 200 | features += feed_feature_col 201 | features += user_feature_col 202 | 203 | sample[["authorid", "bgm_song_id", "bgm_singer_id"]] += 1 # 0 用于填未知 204 | sample[["authorid", "bgm_song_id", "bgm_singer_id", "videoplayseconds"]] = \ 205 | sample[["authorid", "bgm_song_id", "bgm_singer_id", "videoplayseconds"]].fillna(0) 206 | sample["videoplayseconds"] = np.log(sample["videoplayseconds"] + 1.0) 207 | 208 | sample[["authorid", "bgm_song_id", "bgm_singer_id"]] = \ 209 | sample[["authorid", "bgm_song_id", "bgm_singer_id"]].astype(int) 210 | file_name = os.path.join(ROOT_PATH, stage, stage + "_" + action + "_" + str(day) + "_concate_sample.csv") 211 | print('Save to: %s'%file_name) 212 | sample[features].to_csv(file_name, index=False) 213 | 214 | 215 | def main(): 216 | t = time.time() 217 | statis_data() 218 | logger.info('Create dir and check file') 219 | create_dir() 220 | flag, not_exists_file = check_file() 221 | if not flag: 222 | print("请检查目录中是否存在下列文件: ", ",".join(not_exists_file)) 223 | return 224 | logger.info('Generate statistic feature') 225 | statis_feature() 226 | for stage in STAGE_END_DAY: 227 | logger.info("Stage: %s"%stage) 228 | logger.info('Generate sample') 229 | sample_arr = generate_sample(stage) 230 | logger.info('Concat sample with feature') 231 | concat_sample(sample_arr, stage) 232 | print('Time cost: %.2f s'%(time.time()-t)) 233 | 234 | 235 | if __name__ == "__main__": 236 | main() 237 | -------------------------------------------------------------------------------- /evaluation.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import time 3 | import traceback 4 | from collections import defaultdict 5 | import logging 6 | LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s" 7 | logging.basicConfig(level=logging.INFO, format=LOG_FORMAT) 8 | logger = logging.getLogger(__file__) 9 | import numpy as np 10 | import pandas as pd 11 | from numba import njit 12 | from scipy.stats import rankdata 13 | 14 | 15 | @njit 16 | def _auc(actual, pred_ranks): 17 | n_pos = np.sum(actual) 18 | n_neg = len(actual) - n_pos 19 | return (np.sum(pred_ranks[actual == 1]) - n_pos*(n_pos+1)/2) / (n_pos*n_neg) 20 | 21 | 22 | def fast_auc(actual, predicted): 23 | # https://www.kaggle.com/c/riiid-test-answer-prediction/discussion/208031 24 | pred_ranks = rankdata(predicted) 25 | return _auc(actual, pred_ranks) 26 | 27 | 28 | def uAUC(labels, preds, user_id_list): 29 | """Calculate user AUC""" 30 | user_pred = defaultdict(lambda: []) 31 | user_truth = defaultdict(lambda: []) 32 | for idx, truth in enumerate(labels): 33 | user_id = user_id_list[idx] 34 | pred = preds[idx] 35 | truth = labels[idx] 36 | user_pred[user_id].append(pred) 37 | user_truth[user_id].append(truth) 38 | 39 | user_flag = defaultdict(lambda: False) 40 | for user_id in set(user_id_list): 41 | truths = user_truth[user_id] 42 | flag = False 43 | # 若全是正样本或全是负样本,则flag为False 44 | for i in range(len(truths) - 1): 45 | if truths[i] != truths[i + 1]: 46 | flag = True 47 | break 48 | user_flag[user_id] = flag 49 | 50 | total_auc = 0.0 51 | size = 0.0 52 | for user_id in user_flag: 53 | if user_flag[user_id]: 54 | auc = fast_auc(np.asarray(user_truth[user_id]), np.asarray(user_pred[user_id])) 55 | total_auc += auc 56 | size += 1.0 57 | user_auc = float(total_auc)/size 58 | return user_auc 59 | 60 | 61 | def compute_weighted_score(score_dict, weight_dict): 62 | '''基于多个行为的uAUC值,计算加权uAUC 63 | Input: 64 | scores_dict: 多个行为的uAUC值映射字典, dict 65 | weights_dict: 多个行为的权重映射字典, dict 66 | Output: 67 | score: 加权uAUC值, float 68 | ''' 69 | score = 0.0 70 | weight_sum = 0.0 71 | for action in score_dict: 72 | weight = float(weight_dict[action]) 73 | score += weight*score_dict[action] 74 | weight_sum += weight 75 | score /= float(weight_sum) 76 | score = round(score, 6) 77 | return score 78 | 79 | 80 | def score(result_data, label_data, mode="初赛"): 81 | '''评测结果: 多个行为的加权uAUC分数 82 | Input: 83 | result_data: 提交的结果文件,二进制格式 84 | label_data: 对应的label文件,二进制格式 85 | mode: 比赛阶段,String. "初赛"/"复赛" 86 | Output: 87 | result: 评测结果,dict 88 | ''' 89 | try: 90 | # 读取数据 91 | logger.info('Read data') 92 | result_df = pd.read_csv(result_data, sep=',') 93 | label_df = pd.read_csv(label_data, sep=',') 94 | if mode == "初赛": 95 | # 初赛只评估四个互动行为 96 | actions = ['read_comment', 'like', 'click_avatar', 'forward'] 97 | else: 98 | # 复赛评估七个互动行为 99 | actions = ['read_comment', 'like', 'click_avatar', 'forward', 'favorite', 'comment', 'follow'] 100 | # 互动行为权重映射表 101 | weights_map = { 102 | "read_comment": 4.0, # 是否查看评论 103 | "like": 3.0, # 是否点赞 104 | "click_avatar": 2.0, # 是否点击头像 105 | "forward": 1.0, # 是否转发 106 | "favorite": 1.0, # 是否收藏 107 | "comment": 1.0, # 是否发表评论 108 | "follow": 1.0 # 是否关注 109 | } 110 | target_cols = ['userid', 'feedid'] + actions 111 | label_df = label_df[target_cols] 112 | # 规范检查 113 | logger.info('Check result file') 114 | if result_df.shape[0] != label_df.shape[0]: 115 | err_msg = "结果文件的行数(%i行)与测试集(%i行)不一致"%(result_df.shape[0], label_df.shape[0]) 116 | res = { 117 | "ret": 1, 118 | "err_msg": err_msg, 119 | } 120 | logger.error(res) 121 | return res 122 | err_cols = [] 123 | result_cols = set(result_df.columns) 124 | for col in target_cols: 125 | if col not in result_cols: 126 | err_cols.append(col) 127 | if len(err_cols) > 0: 128 | err_msg = "结果文件缺少字段/列:%s"%(', '.join(err_cols)) 129 | res = { 130 | "ret": 2, 131 | "err_msg": err_msg, 132 | } 133 | logger.error(res) 134 | return res 135 | result_actions_map = {} 136 | label_actions_map = {} 137 | result_actions = [] 138 | label_actions = [] 139 | for action in actions: 140 | result_actions_map[action] = "result_"+action 141 | result_actions.append("result_"+action) 142 | label_actions_map[action] = "label_"+action 143 | label_actions.append("label_"+action) 144 | result_df = result_df.rename(columns=result_actions_map) 145 | label_df = label_df.rename(columns=label_actions_map) 146 | df = label_df.merge(result_df, on=['userid', 'feedid']) 147 | if len(df) != len(label_df): 148 | err_msg = "结果文件中userid-feedid与测试集不一致" 149 | res = { 150 | "ret": 3, 151 | "err_msg": err_msg, 152 | } 153 | logger.error(res) 154 | return res 155 | # 计算分数 156 | logger.info('Compute score') 157 | y_true = df[label_actions].astype(int).values 158 | y_pred = df[result_actions].astype(float).values.round(decimals=6) 159 | userid_list = df['userid'].astype(str).tolist() 160 | del df, result_df, label_df 161 | score = 0.0 162 | weights_sum = 0.0 163 | score_detail = {} 164 | for i, action in enumerate(actions): 165 | print(action) 166 | y_true_bev = y_true[:, i] 167 | y_pred_bev = y_pred[:, i] 168 | weight = weights_map[action] 169 | # user AUC 170 | uauc = uAUC(y_true_bev, y_pred_bev, userid_list) 171 | print(uauc) 172 | score_detail[action] = round(uauc, 6) 173 | score += weight*uauc 174 | weights_sum += weight 175 | score /= weights_sum 176 | score = round(score, 6) 177 | res = { 178 | "ret": 0, 179 | "data": { 180 | "score": score, 181 | "score_detail": score_detail 182 | } 183 | } 184 | logger.info(res) 185 | except Exception as e: 186 | traceback.print_exc() 187 | res = { 188 | "ret": 4, 189 | "err_msg": str(e) 190 | } 191 | logger.error(res) 192 | return res 193 | 194 | 195 | if __name__ == '__main__': 196 | t = time.time() 197 | label_data = open('data/evaluate/evaluate_all_13_generate_sample.csv', 'r') 198 | result_data = open('data/evaluate/submit_1619332123.csv', 'r') 199 | res = score(result_data, label_data, mode='初赛') 200 | print('Time cost: %.2f s'%(time.time()-t)) 201 | --------------------------------------------------------------------------------