├── .gitignore ├── README.md ├── code │   ├── data.py │   ├── rank_feature.py │   ├── rank_lgb.py │   ├── recall.py │   ├── recall_binetwork.py │   ├── recall_itemcf.py │   ├── recall_w2v.py │   ├── test.sh │   └── utils.py └── requirements.txt /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode 2 | */__pycache__ 3 | tcdata 4 | user_data 5 | prediction_result -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 零基础入门推荐系统 - 新闻推荐 Top2 2 | 3 | 比赛地址: https://tianchi.aliyun.com/competition/entrance/531842/introduction 4 | 5 | # 解决方案 6 | 采用 3 种召回方式:itemcf 召回、binetwork 召回和基于 word2vec 的 i2i 召回。三路召回结果合并去重,并删除召回结果中不包含真实点击文章的训练用户后,利用特征工程 + LGB 二分类模型进行排序。 7 | 8 | # 复现步骤 9 | 操作系统:Ubuntu 16.04 10 | ``` 11 | pip install -r requirements.txt 12 | cd code 13 | bash test.sh 14 | ``` 15 | 注:test.sh 以 valid 模式运行线下验证;线上提交模式的运行步骤见文末附注。 -------------------------------------------------------------------------------- /code/data.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import random 4 | from random import sample 5 | 6 | import pandas as pd 7 | from tqdm import tqdm 8 | 9 | from utils import Logger 10 | 11 | random.seed(2020) 12 | 13 | # 命令行参数 14 | parser = argparse.ArgumentParser(description='数据处理') 15 | parser.add_argument('--mode', default='valid') 16 | parser.add_argument('--logfile', default='test.log') 17 | 18 | args = parser.parse_args() 19 | 20 | mode = args.mode 21 | logfile = args.logfile 22 | 23 | # 初始化日志 24 | os.makedirs('../user_data/log', exist_ok=True) 25 | log = Logger(f'../user_data/log/{logfile}').logger 26 | log.info(f'数据处理,mode: {mode}') 27 | 28 | 29 | def data_offline(df_train_click, df_test_click): 30 | train_users = df_train_click['user_id'].values.tolist() 31 | # 从点击日志的 user_id 列表(含重复)中随机抽取 50000 条,去重后的验证用户数会略少于 50000 32 | val_users = sample(train_users, 50000) 33 | log.debug(f'val_users num: {len(set(val_users))}') 34 | 35 | # 对采样到的训练集用户,抽出其行为数据的最后一条点击作为线下验证集 36 | click_list = [] 37 | valid_query_list = [] 38 | 39 | groups = df_train_click.groupby(['user_id']) 40 | for user_id, g in tqdm(groups): 41 | if user_id in val_users: 42 | valid_query = g.tail(1) 43 | valid_query_list.append( 44 | valid_query[['user_id', 'click_article_id']]) 45 | 46 | train_click = g.head(g.shape[0] - 1) 47 | click_list.append(train_click) 48 | else: 49 | click_list.append(g) 50 | 51 | df_train_click = pd.concat(click_list, sort=False) 52 | df_valid_query = pd.concat(valid_query_list, sort=False) 53 | 54 | test_users = df_test_click['user_id'].unique() 55 | test_query_list = [] 56 | 57 | for user in tqdm(test_users): 58 | test_query_list.append([user, -1]) 59 | 60 | df_test_query = pd.DataFrame(test_query_list, 61 | columns=['user_id', 'click_article_id']) 62 | 63 | df_query = pd.concat([df_valid_query, df_test_query], 64 | sort=False).reset_index(drop=True) 65 | df_click = pd.concat([df_train_click, df_test_click], 66 | sort=False).reset_index(drop=True) 67 | df_click = df_click.sort_values(['user_id', 68 | 'click_timestamp']).reset_index(drop=True) 69 | 70 | log.debug( 71 | f'df_query shape: {df_query.shape}, df_click shape: {df_click.shape}') 72 | log.debug(f'{df_query.head()}') 73 | log.debug(f'{df_click.head()}') 74 | 75 | # 保存文件 76 | os.makedirs('../user_data/data/offline', exist_ok=True) 77 | 78 | df_click.to_pickle('../user_data/data/offline/click.pkl') 79 | df_query.to_pickle('../user_data/data/offline/query.pkl') 80 | 81 | 82 | def data_online(df_train_click, df_test_click): 83 | test_users = 
df_test_click['user_id'].unique() 84 | test_query_list = [] 85 | 86 | for user in tqdm(test_users): 87 | test_query_list.append([user, -1]) 88 | 89 | df_test_query = pd.DataFrame(test_query_list, 90 | columns=['user_id', 'click_article_id']) 91 | 92 | df_query = df_test_query 93 | df_click = pd.concat([df_train_click, df_test_click], 94 | sort=False).reset_index(drop=True) 95 | df_click = df_click.sort_values(['user_id', 96 | 'click_timestamp']).reset_index(drop=True) 97 | 98 | log.debug( 99 | f'df_query shape: {df_query.shape}, df_click shape: {df_click.shape}') 100 | log.debug(f'{df_query.head()}') 101 | log.debug(f'{df_click.head()}') 102 | 103 | # 保存文件 104 | os.makedirs('../user_data/data/online', exist_ok=True) 105 | 106 | df_click.to_pickle('../user_data/data/online/click.pkl') 107 | df_query.to_pickle('../user_data/data/online/query.pkl') 108 | 109 | 110 | if __name__ == '__main__': 111 | df_train_click = pd.read_csv('../tcdata/train_click_log.csv') 112 | df_test_click = pd.read_csv('../tcdata/testB_click_log_Test_B.csv') 113 | 114 | log.debug( 115 | f'df_train_click shape: {df_train_click.shape}, df_test_click shape: {df_test_click.shape}' 116 | ) 117 | 118 | if mode == 'valid': 119 | data_offline(df_train_click, df_test_click) 120 | else: 121 | data_online(df_train_click, df_test_click) 122 | -------------------------------------------------------------------------------- /code/rank_feature.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import pickle 4 | import warnings 5 | 6 | import numpy as np 7 | import pandas as pd 8 | from pandarallel import pandarallel 9 | 10 | from utils import Logger 11 | 12 | pd.set_option('display.max_columns', None) 13 | pd.set_option('display.max_rows', None) 14 | 15 | pandarallel.initialize() 16 | 17 | warnings.filterwarnings('ignore') 18 | 19 | seed = 2020 20 | 21 | # 命令行参数 22 | parser = argparse.ArgumentParser(description='排序特征') 23 | parser.add_argument('--mode', default='valid') 24 | parser.add_argument('--logfile', default='test.log') 25 | 26 | args = parser.parse_args() 27 | 28 | mode = args.mode 29 | logfile = args.logfile 30 | 31 | # 初始化日志 32 | os.makedirs('../user_data/log', exist_ok=True) 33 | log = Logger(f'../user_data/log/{logfile}').logger 34 | log.info(f'排序特征,mode: {mode}') 35 | 36 | 37 | def func_if_sum(x): 38 | user_id = x['user_id'] 39 | article_id = x['article_id'] 40 | 41 | interacted_items = user_item_dict[user_id] 42 | interacted_items = interacted_items[::-1] 43 | 44 | sim_sum = 0 45 | for loc, i in enumerate(interacted_items): 46 | try: 47 | sim_sum += item_sim[i][article_id] * (0.7**loc) 48 | except Exception as e: 49 | pass 50 | return sim_sum 51 | 52 | 53 | def func_if_last(x): 54 | user_id = x['user_id'] 55 | article_id = x['article_id'] 56 | 57 | last_item = user_item_dict[user_id][-1] 58 | 59 | sim = 0 60 | try: 61 | sim = item_sim[last_item][article_id] 62 | except Exception as e: 63 | pass 64 | return sim 65 | 66 | 67 | def func_binetwork_sim_last(x): 68 | user_id = x['user_id'] 69 | article_id = x['article_id'] 70 | 71 | last_item = user_item_dict[user_id][-1] 72 | 73 | sim = 0 74 | try: 75 | sim = binetwork_sim[last_item][article_id] 76 | except Exception as e: 77 | pass 78 | return sim 79 | 80 | 81 | def consine_distance(vector1, vector2): 82 | if type(vector1) != np.ndarray or type(vector2) != np.ndarray: 83 | return -1 84 | distance = np.dot(vector1, vector2) / \ 85 | (np.linalg.norm(vector1)*(np.linalg.norm(vector2))) 86 | return distance 87 | 88 | 
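# 说明:本文件中的特征函数依赖模块级的 user_item_dict、item_sim、binetwork_sim、article_vec_map,
# 它们在下方 __main__ 中先加载,之后才被 parallel_apply 调用。以 func_if_sum 的衰减相似度求和为例
# (以下数值为假设的示例):若某用户倒序点击历史为 [a, b],候选文章为 c,
# 且 item_sim[a][c] = 0.4、item_sim[b][c] = 0.2,则
#     sim_sum = 0.4 * 0.7**0 + 0.2 * 0.7**1 = 0.54
# 相似度字典中不存在的条目不计入求和(KeyError 被 except 吞掉)。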
89 | def func_w2w_sum(x, num): 90 | user_id = x['user_id'] 91 | article_id = x['article_id'] 92 | 93 | interacted_items = user_item_dict[user_id] 94 | interacted_items = interacted_items[::-1][:num] 95 | 96 | sim_sum = 0 97 | for loc, i in enumerate(interacted_items): 98 | try: 99 | sim_sum += consine_distance(article_vec_map[article_id], 100 | article_vec_map[i]) 101 | except Exception as e: 102 | pass 103 | return sim_sum 104 | 105 | 106 | def func_w2w_last_sim(x): 107 | user_id = x['user_id'] 108 | article_id = x['article_id'] 109 | 110 | last_item = user_item_dict[user_id][-1] 111 | 112 | sim = 0 113 | try: 114 | sim = consine_distance(article_vec_map[article_id], 115 | article_vec_map[last_item]) 116 | except Exception as e: 117 | pass 118 | return sim 119 | 120 | 121 | if __name__ == '__main__': 122 | if mode == 'valid': 123 | df_feature = pd.read_pickle('../user_data/data/offline/recall.pkl') 124 | df_click = pd.read_pickle('../user_data/data/offline/click.pkl') 125 | 126 | else: 127 | df_feature = pd.read_pickle('../user_data/data/online/recall.pkl') 128 | df_click = pd.read_pickle('../user_data/data/online/click.pkl') 129 | 130 | # 文章特征 131 | log.debug(f'df_feature.shape: {df_feature.shape}') 132 | 133 | df_article = pd.read_csv('../tcdata/articles.csv') 134 | df_article['created_at_ts'] = df_article['created_at_ts'] / 1000 135 | df_article['created_at_ts'] = df_article['created_at_ts'].astype('int') 136 | df_feature = df_feature.merge(df_article, how='left') 137 | df_feature['created_at_datetime'] = pd.to_datetime( 138 | df_feature['created_at_ts'], unit='s') 139 | 140 | log.debug(f'df_article.head(): {df_article.head()}') 141 | log.debug(f'df_feature.shape: {df_feature.shape}') 142 | log.debug(f'df_feature.columns: {df_feature.columns.tolist()}') 143 | 144 | # 历史记录相关特征 145 | df_click.sort_values(['user_id', 'click_timestamp'], inplace=True) 146 | df_click.rename(columns={'click_article_id': 'article_id'}, inplace=True) 147 | df_click = df_click.merge(df_article, how='left') 148 | 149 | df_click['click_timestamp'] = df_click['click_timestamp'] / 1000 150 | df_click['click_datetime'] = pd.to_datetime(df_click['click_timestamp'], 151 | unit='s', 152 | errors='coerce') 153 | df_click['click_datetime_hour'] = df_click['click_datetime'].dt.hour 154 | 155 | # 用户点击文章的创建时间差的平均值 156 | df_click['user_id_click_article_created_at_ts_diff'] = df_click.groupby( 157 | ['user_id'])['created_at_ts'].diff() 158 | df_temp = df_click.groupby([ 159 | 'user_id' 160 | ])['user_id_click_article_created_at_ts_diff'].mean().reset_index() 161 | df_temp.columns = [ 162 | 'user_id', 'user_id_click_article_created_at_ts_diff_mean' 163 | ] 164 | df_feature = df_feature.merge(df_temp, how='left') 165 | 166 | log.debug(f'df_feature.shape: {df_feature.shape}') 167 | log.debug(f'df_feature.columns: {df_feature.columns.tolist()}') 168 | 169 | # 用户点击文章的时间差的平均值 170 | df_click['user_id_click_diff'] = df_click.groupby( 171 | ['user_id'])['click_timestamp'].diff() 172 | df_temp = df_click.groupby(['user_id' 173 | ])['user_id_click_diff'].mean().reset_index() 174 | df_temp.columns = ['user_id', 'user_id_click_diff_mean'] 175 | df_feature = df_feature.merge(df_temp, how='left') 176 | 177 | log.debug(f'df_feature.shape: {df_feature.shape}') 178 | log.debug(f'df_feature.columns: {df_feature.columns.tolist()}') 179 | 180 | df_click['click_timestamp_created_at_ts_diff'] = df_click[ 181 | 'click_timestamp'] - df_click['created_at_ts'] 182 | 183 | # 点击文章的创建时间差的统计值 184 | df_temp = df_click.groupby( 185 | 
['user_id'])['click_timestamp_created_at_ts_diff'].agg({ 186 | 'user_click_timestamp_created_at_ts_diff_mean': 187 | 'mean', 188 | 'user_click_timestamp_created_at_ts_diff_std': 189 | 'std' 190 | }).reset_index() 191 | df_feature = df_feature.merge(df_temp, how='left') 192 | 193 | log.debug(f'df_feature.shape: {df_feature.shape}') 194 | log.debug(f'df_feature.columns: {df_feature.columns.tolist()}') 195 | 196 | # 点击的新闻的 click_datetime_hour 统计值 197 | df_temp = df_click.groupby(['user_id'])['click_datetime_hour'].agg({ 198 | 'user_click_datetime_hour_std': 199 | 'std' 200 | }).reset_index() 201 | df_feature = df_feature.merge(df_temp, how='left') 202 | 203 | log.debug(f'df_feature.shape: {df_feature.shape}') 204 | log.debug(f'df_feature.columns: {df_feature.columns.tolist()}') 205 | 206 | # 点击的新闻的 words_count 统计值 207 | df_temp = df_click.groupby(['user_id'])['words_count'].agg({ 208 | 'user_clicked_article_words_count_mean': 209 | 'mean', 210 | 'user_click_last_article_words_count': 211 | lambda x: x.iloc[-1] 212 | }).reset_index() 213 | df_feature = df_feature.merge(df_temp, how='left') 214 | 215 | log.debug(f'df_feature.shape: {df_feature.shape}') 216 | log.debug(f'df_feature.columns: {df_feature.columns.tolist()}') 217 | 218 | # 点击的新闻的 created_at_ts 统计值 219 | df_temp = df_click.groupby('user_id')['created_at_ts'].agg({ 220 | 'user_click_last_article_created_time': 221 | lambda x: x.iloc[-1], 222 | 'user_clicked_article_created_time_max': 223 | 'max', 224 | }).reset_index() 225 | df_feature = df_feature.merge(df_temp, how='left') 226 | 227 | log.debug(f'df_feature.shape: {df_feature.shape}') 228 | log.debug(f'df_feature.columns: {df_feature.columns.tolist()}') 229 | 230 | # 点击的新闻的 click_timestamp 统计值 231 | df_temp = df_click.groupby('user_id')['click_timestamp'].agg({ 232 | 'user_click_last_article_click_time': 233 | lambda x: x.iloc[-1], 234 | 'user_clicked_article_click_time_mean': 235 | 'mean', 236 | }).reset_index() 237 | df_feature = df_feature.merge(df_temp, how='left') 238 | 239 | log.debug(f'df_feature.shape: {df_feature.shape}') 240 | log.debug(f'df_feature.columns: {df_feature.columns.tolist()}') 241 | 242 | df_feature['user_last_click_created_at_ts_diff'] = df_feature[ 243 | 'created_at_ts'] - df_feature['user_click_last_article_created_time'] 244 | df_feature['user_last_click_timestamp_diff'] = df_feature[ 245 | 'created_at_ts'] - df_feature['user_click_last_article_click_time'] 246 | df_feature['user_last_click_words_count_diff'] = df_feature[ 247 | 'words_count'] - df_feature['user_click_last_article_words_count'] 248 | 249 | log.debug(f'df_feature.shape: {df_feature.shape}') 250 | log.debug(f'df_feature.columns: {df_feature.columns.tolist()}') 251 | 252 | # 计数统计 253 | for f in [['user_id'], ['article_id'], ['user_id', 'category_id']]: 254 | df_temp = df_click.groupby(f).size().reset_index() 255 | df_temp.columns = f + ['{}_cnt'.format('_'.join(f))] 256 | 257 | df_feature = df_feature.merge(df_temp, how='left') 258 | 259 | log.debug(f'df_feature.shape: {df_feature.shape}') 260 | log.debug(f'df_feature.columns: {df_feature.columns.tolist()}') 261 | 262 | # 召回相关特征 263 | ## itemcf 相关 264 | user_item_ = df_click.groupby('user_id')['article_id'].agg( 265 | list).reset_index() 266 | user_item_dict = dict(zip(user_item_['user_id'], user_item_['article_id'])) 267 | 268 | if mode == 'valid': 269 | f = open('../user_data/sim/offline/itemcf_sim.pkl', 'rb') 270 | item_sim = pickle.load(f) 271 | f.close() 272 | else: 273 | f = open('../user_data/sim/online/itemcf_sim.pkl', 'rb') 274 | 
item_sim = pickle.load(f) 275 | f.close() 276 | 277 | # 用户历史点击物品与待预测物品相似度 278 | df_feature['user_clicked_article_itemcf_sim_sum'] = df_feature[[ 279 | 'user_id', 'article_id' 280 | ]].parallel_apply(func_if_sum, axis=1) 281 | df_feature['user_last_click_article_itemcf_sim'] = df_feature[[ 282 | 'user_id', 'article_id' 283 | ]].parallel_apply(func_if_last, axis=1) 284 | 285 | log.debug(f'df_feature.shape: {df_feature.shape}') 286 | log.debug(f'df_feature.columns: {df_feature.columns.tolist()}') 287 | 288 | ## binetwork 相关 289 | if mode == 'valid': 290 | f = open('../user_data/sim/offline/binetwork_sim.pkl', 'rb') 291 | binetwork_sim = pickle.load(f) 292 | f.close() 293 | else: 294 | f = open('../user_data/sim/online/binetwork_sim.pkl', 'rb') 295 | binetwork_sim = pickle.load(f) 296 | f.close() 297 | 298 | df_feature['user_last_click_article_binetwork_sim'] = df_feature[[ 299 | 'user_id', 'article_id' 300 | ]].parallel_apply(func_binetwork_sim_last, axis=1) 301 | 302 | log.debug(f'df_feature.shape: {df_feature.shape}') 303 | log.debug(f'df_feature.columns: {df_feature.columns.tolist()}') 304 | 305 | ## w2v 相关 306 | if mode == 'valid': 307 | f = open('../user_data/data/offline/article_w2v.pkl', 'rb') 308 | article_vec_map = pickle.load(f) 309 | f.close() 310 | else: 311 | f = open('../user_data/data/online/article_w2v.pkl', 'rb') 312 | article_vec_map = pickle.load(f) 313 | f.close() 314 | 315 | df_feature['user_last_click_article_w2v_sim'] = df_feature[[ 316 | 'user_id', 'article_id' 317 | ]].parallel_apply(func_w2w_last_sim, axis=1) 318 | df_feature['user_click_article_w2w_sim_sum_2'] = df_feature[[ 319 | 'user_id', 'article_id' 320 | ]].parallel_apply(lambda x: func_w2w_sum(x, 2), axis=1) 321 | 322 | log.debug(f'df_feature.shape: {df_feature.shape}') 323 | log.debug(f'df_feature.columns: {df_feature.columns.tolist()}') 324 | 325 | # 保存特征文件 326 | if mode == 'valid': 327 | df_feature.to_pickle('../user_data/data/offline/feature.pkl') 328 | 329 | else: 330 | df_feature.to_pickle('../user_data/data/online/feature.pkl') 331 | -------------------------------------------------------------------------------- /code/rank_lgb.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import gc 3 | import os 4 | import random 5 | import warnings 6 | 7 | import joblib 8 | import lightgbm as lgb 9 | import pandas as pd 10 | from sklearn.model_selection import GroupKFold 11 | from sklearn.preprocessing import LabelEncoder 12 | from tqdm import tqdm 13 | 14 | from utils import Logger, evaluate, gen_sub 15 | 16 | warnings.filterwarnings('ignore') 17 | 18 | seed = 2020 19 | random.seed(seed) 20 | 21 | # 命令行参数 22 | parser = argparse.ArgumentParser(description='lightgbm 排序') 23 | parser.add_argument('--mode', default='valid') 24 | parser.add_argument('--logfile', default='test.log') 25 | 26 | args = parser.parse_args() 27 | 28 | mode = args.mode 29 | logfile = args.logfile 30 | 31 | # 初始化日志 32 | os.makedirs('../user_data/log', exist_ok=True) 33 | log = Logger(f'../user_data/log/{logfile}').logger 34 | log.info(f'lightgbm 排序,mode: {mode}') 35 | 36 | 37 | def train_model(df_feature, df_query): 38 | df_train = df_feature[df_feature['label'].notnull()] 39 | df_test = df_feature[df_feature['label'].isnull()] 40 | 41 | del df_feature 42 | gc.collect() 43 | 44 | ycol = 'label' 45 | feature_names = list( 46 | filter( 47 | lambda x: x not in [ycol, 'created_at_datetime', 'click_datetime'], 48 | df_train.columns)) 49 | feature_names.sort() 50 | 51 | model = 
lgb.LGBMClassifier(num_leaves=64, 52 | max_depth=10, 53 | learning_rate=0.05, 54 | n_estimators=10000, 55 | subsample=0.8, 56 | feature_fraction=0.8, 57 | reg_alpha=0.5, 58 | reg_lambda=0.5, 59 | random_state=seed, 60 | importance_type='gain', 61 | metric=None) 62 | 63 | oof = [] 64 | prediction = df_test[['user_id', 'article_id']] 65 | prediction['pred'] = 0 66 | df_importance_list = [] 67 | 68 | # 训练模型 69 | kfold = GroupKFold(n_splits=5) 70 | for fold_id, (trn_idx, val_idx) in enumerate( 71 | kfold.split(df_train[feature_names], df_train[ycol], 72 | df_train['user_id'])): 73 | X_train = df_train.iloc[trn_idx][feature_names] 74 | Y_train = df_train.iloc[trn_idx][ycol] 75 | 76 | X_val = df_train.iloc[val_idx][feature_names] 77 | Y_val = df_train.iloc[val_idx][ycol] 78 | 79 | log.debug( 80 | f'\nFold_{fold_id + 1} Training ================================\n' 81 | ) 82 | 83 | lgb_model = model.fit(X_train, 84 | Y_train, 85 | eval_names=['train', 'valid'], 86 | eval_set=[(X_train, Y_train), (X_val, Y_val)], 87 | verbose=100, 88 | eval_metric='auc', 89 | early_stopping_rounds=100) 90 | 91 | pred_val = lgb_model.predict_proba( 92 | X_val, num_iteration=lgb_model.best_iteration_)[:, 1] 93 | df_oof = df_train.iloc[val_idx][['user_id', 'article_id', ycol]].copy() 94 | df_oof['pred'] = pred_val 95 | oof.append(df_oof) 96 | 97 | pred_test = lgb_model.predict_proba( 98 | df_test[feature_names], num_iteration=lgb_model.best_iteration_)[:, 99 | 1] 100 | prediction['pred'] += pred_test / 5 101 | 102 | df_importance = pd.DataFrame({ 103 | 'feature_name': 104 | feature_names, 105 | 'importance': 106 | lgb_model.feature_importances_, 107 | }) 108 | df_importance_list.append(df_importance) 109 | 110 | joblib.dump(model, f'../user_data/model/lgb{fold_id}.pkl') 111 | 112 | # 特征重要性 113 | df_importance = pd.concat(df_importance_list) 114 | df_importance = df_importance.groupby([ 115 | 'feature_name' 116 | ])['importance'].agg('mean').sort_values(ascending=False).reset_index() 117 | log.debug(f'importance: {df_importance}') 118 | 119 | # 生成线下 120 | df_oof = pd.concat(oof) 121 | df_oof.sort_values(['user_id', 'pred'], 122 | inplace=True, 123 | ascending=[True, False]) 124 | log.debug(f'df_oof.head: {df_oof.head()}') 125 | 126 | # 计算相关指标 127 | total = df_query[df_query['click_article_id'] != -1].user_id.nunique() 128 | hitrate_5, mrr_5, hitrate_10, mrr_10, hitrate_20, mrr_20, hitrate_40, mrr_40, hitrate_50, mrr_50 = evaluate( 129 | df_oof, total) 130 | log.debug( 131 | f'{hitrate_5}, {mrr_5}, {hitrate_10}, {mrr_10}, {hitrate_20}, {mrr_20}, {hitrate_40}, {mrr_40}, {hitrate_50}, {mrr_50}' 132 | ) 133 | 134 | # 生成提交文件 135 | df_sub = gen_sub(prediction) 136 | df_sub.sort_values(['user_id'], inplace=True) 137 | os.makedirs('../prediction_result', exist_ok=True) 138 | df_sub.to_csv(f'../prediction_result/result.csv', index=False) 139 | 140 | 141 | def online_predict(df_test): 142 | ycol = 'label' 143 | feature_names = list( 144 | filter( 145 | lambda x: x not in [ycol, 'created_at_datetime', 'click_datetime'], 146 | df_test.columns)) 147 | feature_names.sort() 148 | 149 | prediction = df_test[['user_id', 'article_id']] 150 | prediction['pred'] = 0 151 | 152 | for fold_id in tqdm(range(5)): 153 | model = joblib.load(f'../user_data/model/lgb{fold_id}.pkl') 154 | pred_test = model.predict_proba(df_test[feature_names])[:, 1] 155 | prediction['pred'] += pred_test / 5 156 | 157 | # 生成提交文件 158 | df_sub = gen_sub(prediction) 159 | df_sub.sort_values(['user_id'], inplace=True) 160 | os.makedirs('../prediction_result', 
exist_ok=True) 161 | df_sub.to_csv(f'../prediction_result/result.csv', index=False) 162 | 163 | 164 | if __name__ == '__main__': 165 | if mode == 'valid': 166 | df_feature = pd.read_pickle('../user_data/data/offline/feature.pkl') 167 | df_query = pd.read_pickle('../user_data/data/offline/query.pkl') 168 | 169 | for f in df_feature.select_dtypes('object').columns: 170 | lbl = LabelEncoder() 171 | df_feature[f] = lbl.fit_transform(df_feature[f].astype(str)) 172 | 173 | train_model(df_feature, df_query) 174 | else: 175 | df_feature = pd.read_pickle('../user_data/data/online/feature.pkl') 176 | online_predict(df_feature) 177 | -------------------------------------------------------------------------------- /code/recall.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import pickle 4 | import random 5 | import signal 6 | import warnings 7 | from collections import defaultdict 8 | from itertools import permutations 9 | from random import shuffle 10 | 11 | import multitasking 12 | import numpy as np 13 | import pandas as pd 14 | from tqdm import tqdm 15 | 16 | from utils import Logger, evaluate 17 | 18 | warnings.filterwarnings('ignore') 19 | 20 | max_threads = multitasking.config['CPU_CORES'] 21 | multitasking.set_max_threads(max_threads) 22 | multitasking.set_engine('process') 23 | signal.signal(signal.SIGINT, multitasking.killall) 24 | 25 | random.seed(2020) 26 | 27 | # 命令行参数 28 | parser = argparse.ArgumentParser(description='召回合并') 29 | parser.add_argument('--mode', default='valid') 30 | parser.add_argument('--logfile', default='test.log') 31 | 32 | args = parser.parse_args() 33 | 34 | mode = args.mode 35 | logfile = args.logfile 36 | 37 | # 初始化日志 38 | os.makedirs('../user_data/log', exist_ok=True) 39 | log = Logger(f'../user_data/log/{logfile}').logger 40 | log.info(f'召回合并: {mode}') 41 | 42 | 43 | def mms(df): 44 | user_score_max = {} 45 | user_score_min = {} 46 | 47 | # 获取用户下的相似度的最大值和最小值 48 | for user_id, g in df[['user_id', 'sim_score']].groupby('user_id'): 49 | scores = g['sim_score'].values.tolist() 50 | user_score_max[user_id] = scores[0] 51 | user_score_min[user_id] = scores[-1] 52 | 53 | ans = [] 54 | for user_id, sim_score in tqdm(df[['user_id', 'sim_score']].values): 55 | ans.append((sim_score - user_score_min[user_id]) / 56 | (user_score_max[user_id] - user_score_min[user_id]) + 57 | 10**-3) 58 | return ans 59 | 60 | 61 | def recall_result_sim(df1_, df2_): 62 | df1 = df1_.copy() 63 | df2 = df2_.copy() 64 | 65 | user_item_ = df1.groupby('user_id')['article_id'].agg(set).reset_index() 66 | user_item_dict1 = dict(zip(user_item_['user_id'], 67 | user_item_['article_id'])) 68 | 69 | user_item_ = df2.groupby('user_id')['article_id'].agg(set).reset_index() 70 | user_item_dict2 = dict(zip(user_item_['user_id'], 71 | user_item_['article_id'])) 72 | 73 | cnt = 0 74 | hit_cnt = 0 75 | 76 | for user in user_item_dict1.keys(): 77 | item_set1 = user_item_dict1[user] 78 | 79 | cnt += len(item_set1) 80 | 81 | if user in user_item_dict2: 82 | item_set2 = user_item_dict2[user] 83 | 84 | inters = item_set1 & item_set2 85 | hit_cnt += len(inters) 86 | 87 | return hit_cnt / cnt 88 | 89 | 90 | if __name__ == '__main__': 91 | if mode == 'valid': 92 | df_click = pd.read_pickle('../user_data/data/offline/click.pkl') 93 | df_query = pd.read_pickle('../user_data/data/offline/query.pkl') 94 | 95 | recall_path = '../user_data/data/offline' 96 | else: 97 | df_click = pd.read_pickle('../user_data/data/online/click.pkl') 98 | df_query = 
pd.read_pickle('../user_data/data/online/query.pkl') 99 | 100 | recall_path = '../user_data/data/online' 101 | 102 | log.debug(f'max_threads {max_threads}') 103 | 104 | recall_methods = ['itemcf', 'w2v', 'binetwork'] 105 | 106 | weights = {'itemcf': 1, 'binetwork': 1, 'w2v': 0.1} 107 | recall_list = [] 108 | recall_dict = {} 109 | for recall_method in recall_methods: 110 | recall_result = pd.read_pickle( 111 | f'{recall_path}/recall_{recall_method}.pkl') 112 | weight = weights[recall_method] 113 | 114 | recall_result['sim_score'] = mms(recall_result) 115 | recall_result['sim_score'] = recall_result['sim_score'] * weight 116 | 117 | recall_list.append(recall_result) 118 | recall_dict[recall_method] = recall_result 119 | 120 | # 求相似度 121 | for recall_method1, recall_method2 in permutations(recall_methods, 2): 122 | score = recall_result_sim(recall_dict[recall_method1], 123 | recall_dict[recall_method2]) 124 | log.debug(f'召回相似度 {recall_method1}-{recall_method2}: {score}') 125 | 126 | # 合并召回结果 127 | recall_final = pd.concat(recall_list, sort=False) 128 | recall_score = recall_final[['user_id', 'article_id', 129 | 'sim_score']].groupby([ 130 | 'user_id', 'article_id' 131 | ])['sim_score'].sum().reset_index() 132 | 133 | recall_final = recall_final[['user_id', 'article_id', 'label' 134 | ]].drop_duplicates(['user_id', 'article_id']) 135 | recall_final = recall_final.merge(recall_score, how='left') 136 | 137 | recall_final.sort_values(['user_id', 'sim_score'], 138 | inplace=True, 139 | ascending=[True, False]) 140 | 141 | log.debug(f'recall_final.shape: {recall_final.shape}') 142 | log.debug(f'recall_final: {recall_final.head()}') 143 | 144 | # 删除无正样本的训练集用户 145 | gg = recall_final.groupby(['user_id']) 146 | useful_recall = [] 147 | 148 | for user_id, g in tqdm(gg): 149 | if g['label'].isnull().sum() > 0: 150 | useful_recall.append(g) 151 | else: 152 | label_sum = g['label'].sum() 153 | if label_sum > 1: 154 | print('error', user_id) 155 | elif label_sum == 1: 156 | useful_recall.append(g) 157 | 158 | df_useful_recall = pd.concat(useful_recall, sort=False) 159 | log.debug(f'df_useful_recall: {df_useful_recall.head()}') 160 | 161 | df_useful_recall = df_useful_recall.sort_values( 162 | ['user_id', 'sim_score'], ascending=[True, 163 | False]).reset_index(drop=True) 164 | 165 | # 计算相关指标 166 | if mode == 'valid': 167 | total = df_query[df_query['click_article_id'] != -1].user_id.nunique() 168 | hitrate_5, mrr_5, hitrate_10, mrr_10, hitrate_20, mrr_20, hitrate_40, mrr_40, hitrate_50, mrr_50 = evaluate( 169 | df_useful_recall[df_useful_recall['label'].notnull()], total) 170 | hitrate_5, mrr_5, hitrate_10, mrr_10, hitrate_20, mrr_20, hitrate_40, mrr_40, hitrate_50, mrr_50 171 | 172 | log.debug( 173 | f'召回合并后指标: {hitrate_5}, {mrr_5}, {hitrate_10}, {mrr_10}, {hitrate_20}, {mrr_20}, {hitrate_40}, {mrr_40}, {hitrate_50}, {mrr_50}' 174 | ) 175 | 176 | df = df_useful_recall['user_id'].value_counts().reset_index() 177 | df.columns = ['user_id', 'cnt'] 178 | log.debug(f"平均每个用户召回数量:{df['cnt'].mean()}") 179 | 180 | log.debug( 181 | f"标签分布: {df_useful_recall[df_useful_recall['label'].notnull()]['label'].value_counts()}" 182 | ) 183 | 184 | # 保存到本地 185 | if mode == 'valid': 186 | df_useful_recall.to_pickle('../user_data/data/offline/recall.pkl') 187 | else: 188 | df_useful_recall.to_pickle('../user_data/data/online/recall.pkl') 189 | -------------------------------------------------------------------------------- /code/recall_binetwork.py: 
-------------------------------------------------------------------------------- 1 | import argparse 2 | import math 3 | import os 4 | import pickle 5 | import random 6 | import signal 7 | from collections import defaultdict 8 | from random import shuffle 9 | 10 | import multitasking 11 | import numpy as np 12 | import pandas as pd 13 | from tqdm import tqdm 14 | 15 | from utils import Logger, evaluate 16 | 17 | max_threads = multitasking.config['CPU_CORES'] 18 | multitasking.set_max_threads(max_threads) 19 | multitasking.set_engine('process') 20 | signal.signal(signal.SIGINT, multitasking.killall) 21 | 22 | random.seed(2020) 23 | 24 | # 命令行参数 25 | parser = argparse.ArgumentParser(description='binetwork 召回') 26 | parser.add_argument('--mode', default='valid') 27 | parser.add_argument('--logfile', default='test.log') 28 | 29 | args = parser.parse_args() 30 | 31 | mode = args.mode 32 | logfile = args.logfile 33 | 34 | # 初始化日志 35 | os.makedirs('../user_data/log', exist_ok=True) 36 | log = Logger(f'../user_data/log/{logfile}').logger 37 | log.info(f'binetwork 召回,mode: {mode}') 38 | 39 | 40 | def cal_sim(df): 41 | user_item_ = df.groupby('user_id')['click_article_id'].agg( 42 | list).reset_index() 43 | user_item_dict = dict( 44 | zip(user_item_['user_id'], user_item_['click_article_id'])) 45 | 46 | item_user_ = df.groupby('click_article_id')['user_id'].agg( 47 | list).reset_index() 48 | item_user_dict = dict( 49 | zip(item_user_['click_article_id'], item_user_['user_id'])) 50 | 51 | sim_dict = {} 52 | 53 | for item, users in tqdm(item_user_dict.items()): 54 | sim_dict.setdefault(item, {}) 55 | 56 | for user in users: 57 | tmp_len = len(user_item_dict[user]) 58 | for relate_item in user_item_dict[user]: 59 | sim_dict[item].setdefault(relate_item, 0) 60 | sim_dict[item][relate_item] += 1 / \ 61 | (math.log(len(users)+1) * math.log(tmp_len+1)) 62 | 63 | return sim_dict, user_item_dict 64 | 65 | 66 | @multitasking.task 67 | def recall(df_query, item_sim, user_item_dict, worker_id): 68 | data_list = [] 69 | 70 | for user_id, item_id in tqdm(df_query.values): 71 | rank = {} 72 | 73 | if user_id not in user_item_dict: 74 | continue 75 | 76 | interacted_items = user_item_dict[user_id] 77 | interacted_items = interacted_items[::-1][:1] 78 | 79 | for _, item in enumerate(interacted_items): 80 | for relate_item, wij in sorted(item_sim[item].items(), 81 | key=lambda d: d[1], 82 | reverse=True)[0:100]: 83 | if relate_item not in interacted_items: 84 | rank.setdefault(relate_item, 0) 85 | rank[relate_item] += wij 86 | 87 | sim_items = sorted(rank.items(), key=lambda d: d[1], reverse=True)[:50] 88 | item_ids = [item[0] for item in sim_items] 89 | item_sim_scores = [item[1] for item in sim_items] 90 | 91 | df_temp = pd.DataFrame() 92 | df_temp['article_id'] = item_ids 93 | df_temp['sim_score'] = item_sim_scores 94 | df_temp['user_id'] = user_id 95 | 96 | if item_id == -1: 97 | df_temp['label'] = np.nan 98 | else: 99 | df_temp['label'] = 0 100 | df_temp.loc[df_temp['article_id'] == item_id, 'label'] = 1 101 | 102 | df_temp = df_temp[['user_id', 'article_id', 'sim_score', 'label']] 103 | df_temp['user_id'] = df_temp['user_id'].astype('int') 104 | df_temp['article_id'] = df_temp['article_id'].astype('int') 105 | 106 | data_list.append(df_temp) 107 | 108 | df_data = pd.concat(data_list, sort=False) 109 | 110 | os.makedirs('../user_data/tmp/binetwork', exist_ok=True) 111 | df_data.to_pickle(f'../user_data/tmp/binetwork/{worker_id}.pkl') 112 | 113 | 114 | if __name__ == '__main__': 115 | if mode == 'valid': 116 | 
df_click = pd.read_pickle('../user_data/data/offline/click.pkl') 117 | df_query = pd.read_pickle('../user_data/data/offline/query.pkl') 118 | 119 | os.makedirs('../user_data/sim/offline', exist_ok=True) 120 | sim_pkl_file = '../user_data/sim/offline/binetwork_sim.pkl' 121 | else: 122 | df_click = pd.read_pickle('../user_data/data/online/click.pkl') 123 | df_query = pd.read_pickle('../user_data/data/online/query.pkl') 124 | 125 | os.makedirs('../user_data/sim/online', exist_ok=True) 126 | sim_pkl_file = '../user_data/sim/online/binetwork_sim.pkl' 127 | 128 | log.debug(f'df_click shape: {df_click.shape}') 129 | log.debug(f'{df_click.head()}') 130 | 131 | item_sim, user_item_dict = cal_sim(df_click) 132 | f = open(sim_pkl_file, 'wb') 133 | pickle.dump(item_sim, f) 134 | f.close() 135 | 136 | # 召回 137 | n_split = max_threads 138 | all_users = df_query['user_id'].unique() 139 | shuffle(all_users) 140 | total = len(all_users) 141 | n_len = total // n_split 142 | 143 | # 清空临时文件夹 144 | for path, _, file_list in os.walk('../user_data/tmp/binetwork'): 145 | for file_name in file_list: 146 | os.remove(os.path.join(path, file_name)) 147 | 148 | for i in range(0, total, n_len): 149 | part_users = all_users[i:i + n_len] 150 | df_temp = df_query[df_query['user_id'].isin(part_users)] 151 | recall(df_temp, item_sim, user_item_dict, i) 152 | 153 | multitasking.wait_for_tasks() 154 | log.info('合并任务') 155 | 156 | df_data = pd.DataFrame() 157 | for path, _, file_list in os.walk('../user_data/tmp/binetwork'): 158 | for file_name in file_list: 159 | df_temp = pd.read_pickle(os.path.join(path, file_name)) 160 | df_data = df_data.append(df_temp) 161 | 162 | # 必须加,对其进行排序 163 | df_data = df_data.sort_values(['user_id', 'sim_score'], 164 | ascending=[True, 165 | False]).reset_index(drop=True) 166 | log.debug(f'df_data.head: {df_data.head()}') 167 | 168 | # 计算召回指标 169 | if mode == 'valid': 170 | log.info(f'计算召回指标') 171 | 172 | total = df_query[df_query['click_article_id'] != -1].user_id.nunique() 173 | 174 | hitrate_5, mrr_5, hitrate_10, mrr_10, hitrate_20, mrr_20, hitrate_40, mrr_40, hitrate_50, mrr_50 = evaluate( 175 | df_data[df_data['label'].notnull()], total) 176 | 177 | log.debug( 178 | f'binetwork: {hitrate_5}, {mrr_5}, {hitrate_10}, {mrr_10}, {hitrate_20}, {mrr_20}, {hitrate_40}, {mrr_40}, {hitrate_50}, {mrr_50}' 179 | ) 180 | 181 | # 保存召回结果 182 | if mode == 'valid': 183 | df_data.to_pickle('../user_data/data/offline/recall_binetwork.pkl') 184 | else: 185 | df_data.to_pickle('../user_data/data/online/recall_binetwork.pkl') 186 | -------------------------------------------------------------------------------- /code/recall_itemcf.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import math 3 | import os 4 | import pickle 5 | import random 6 | import signal 7 | from collections import defaultdict 8 | from random import shuffle 9 | 10 | import multitasking 11 | import numpy as np 12 | import pandas as pd 13 | from tqdm import tqdm 14 | 15 | from utils import Logger, evaluate 16 | 17 | max_threads = multitasking.config['CPU_CORES'] 18 | multitasking.set_max_threads(max_threads) 19 | multitasking.set_engine('process') 20 | signal.signal(signal.SIGINT, multitasking.killall) 21 | 22 | random.seed(2020) 23 | 24 | # 命令行参数 25 | parser = argparse.ArgumentParser(description='itemcf 召回') 26 | parser.add_argument('--mode', default='valid') 27 | parser.add_argument('--logfile', default='test.log') 28 | 29 | args = parser.parse_args() 30 | 31 | mode = args.mode 32 | 
logfile = args.logfile 33 | 34 | # 初始化日志 35 | os.makedirs('../user_data/log', exist_ok=True) 36 | log = Logger(f'../user_data/log/{logfile}').logger 37 | log.info(f'itemcf 召回,mode: {mode}') 38 | 39 | 40 | def cal_sim(df): 41 | user_item_ = df.groupby('user_id')['click_article_id'].agg( 42 | lambda x: list(x)).reset_index() 43 | user_item_dict = dict( 44 | zip(user_item_['user_id'], user_item_['click_article_id'])) 45 | 46 | item_cnt = defaultdict(int) 47 | sim_dict = {} 48 | 49 | for _, items in tqdm(user_item_dict.items()): 50 | for loc1, item in enumerate(items): 51 | item_cnt[item] += 1 52 | sim_dict.setdefault(item, {}) 53 | 54 | for loc2, relate_item in enumerate(items): 55 | if item == relate_item: 56 | continue 57 | 58 | sim_dict[item].setdefault(relate_item, 0) 59 | 60 | # 位置信息权重 61 | # 考虑文章的正向顺序点击和反向顺序点击 62 | loc_alpha = 1.0 if loc2 > loc1 else 0.7 63 | loc_weight = loc_alpha * (0.9**(np.abs(loc2 - loc1) - 1)) 64 | 65 | sim_dict[item][relate_item] += loc_weight / \ 66 | math.log(1 + len(items)) 67 | 68 | for item, relate_items in tqdm(sim_dict.items()): 69 | for relate_item, cij in relate_items.items(): 70 | sim_dict[item][relate_item] = cij / \ 71 | math.sqrt(item_cnt[item] * item_cnt[relate_item]) 72 | 73 | return sim_dict, user_item_dict 74 | 75 | 76 | @multitasking.task 77 | def recall(df_query, item_sim, user_item_dict, worker_id): 78 | data_list = [] 79 | 80 | for user_id, item_id in tqdm(df_query.values): 81 | rank = {} 82 | 83 | if user_id not in user_item_dict: 84 | continue 85 | 86 | interacted_items = user_item_dict[user_id] 87 | interacted_items = interacted_items[::-1][:2] 88 | 89 | for loc, item in enumerate(interacted_items): 90 | for relate_item, wij in sorted(item_sim[item].items(), 91 | key=lambda d: d[1], 92 | reverse=True)[0:200]: 93 | if relate_item not in interacted_items: 94 | rank.setdefault(relate_item, 0) 95 | rank[relate_item] += wij * (0.7**loc) 96 | 97 | sim_items = sorted(rank.items(), key=lambda d: d[1], 98 | reverse=True)[:100] 99 | item_ids = [item[0] for item in sim_items] 100 | item_sim_scores = [item[1] for item in sim_items] 101 | 102 | df_temp = pd.DataFrame() 103 | df_temp['article_id'] = item_ids 104 | df_temp['sim_score'] = item_sim_scores 105 | df_temp['user_id'] = user_id 106 | 107 | if item_id == -1: 108 | df_temp['label'] = np.nan 109 | else: 110 | df_temp['label'] = 0 111 | df_temp.loc[df_temp['article_id'] == item_id, 'label'] = 1 112 | 113 | df_temp = df_temp[['user_id', 'article_id', 'sim_score', 'label']] 114 | df_temp['user_id'] = df_temp['user_id'].astype('int') 115 | df_temp['article_id'] = df_temp['article_id'].astype('int') 116 | 117 | data_list.append(df_temp) 118 | 119 | df_data = pd.concat(data_list, sort=False) 120 | 121 | os.makedirs('../user_data/tmp/itemcf', exist_ok=True) 122 | df_data.to_pickle(f'../user_data/tmp/itemcf/{worker_id}.pkl') 123 | 124 | 125 | if __name__ == '__main__': 126 | if mode == 'valid': 127 | df_click = pd.read_pickle('../user_data/data/offline/click.pkl') 128 | df_query = pd.read_pickle('../user_data/data/offline/query.pkl') 129 | 130 | os.makedirs('../user_data/sim/offline', exist_ok=True) 131 | sim_pkl_file = '../user_data/sim/offline/itemcf_sim.pkl' 132 | else: 133 | df_click = pd.read_pickle('../user_data/data/online/click.pkl') 134 | df_query = pd.read_pickle('../user_data/data/online/query.pkl') 135 | 136 | os.makedirs('../user_data/sim/online', exist_ok=True) 137 | sim_pkl_file = '../user_data/sim/online/itemcf_sim.pkl' 138 | 139 | log.debug(f'df_click shape: {df_click.shape}') 140 | 
log.debug(f'{df_click.head()}') 141 | 142 | item_sim, user_item_dict = cal_sim(df_click) 143 | f = open(sim_pkl_file, 'wb') 144 | pickle.dump(item_sim, f) 145 | f.close() 146 | 147 | # 召回 148 | n_split = max_threads 149 | all_users = df_query['user_id'].unique() 150 | shuffle(all_users) 151 | total = len(all_users) 152 | n_len = total // n_split 153 | 154 | # 清空临时文件夹 155 | for path, _, file_list in os.walk('../user_data/tmp/itemcf'): 156 | for file_name in file_list: 157 | os.remove(os.path.join(path, file_name)) 158 | 159 | for i in range(0, total, n_len): 160 | part_users = all_users[i:i + n_len] 161 | df_temp = df_query[df_query['user_id'].isin(part_users)] 162 | recall(df_temp, item_sim, user_item_dict, i) 163 | 164 | multitasking.wait_for_tasks() 165 | log.info('合并任务') 166 | 167 | df_data = pd.DataFrame() 168 | for path, _, file_list in os.walk('../user_data/tmp/itemcf'): 169 | for file_name in file_list: 170 | df_temp = pd.read_pickle(os.path.join(path, file_name)) 171 | df_data = df_data.append(df_temp) 172 | 173 | # 必须加,对其进行排序 174 | df_data = df_data.sort_values(['user_id', 'sim_score'], 175 | ascending=[True, 176 | False]).reset_index(drop=True) 177 | log.debug(f'df_data.head: {df_data.head()}') 178 | 179 | # 计算召回指标 180 | if mode == 'valid': 181 | log.info(f'计算召回指标') 182 | 183 | total = df_query[df_query['click_article_id'] != -1].user_id.nunique() 184 | 185 | hitrate_5, mrr_5, hitrate_10, mrr_10, hitrate_20, mrr_20, hitrate_40, mrr_40, hitrate_50, mrr_50 = evaluate( 186 | df_data[df_data['label'].notnull()], total) 187 | 188 | log.debug( 189 | f'itemcf: {hitrate_5}, {mrr_5}, {hitrate_10}, {mrr_10}, {hitrate_20}, {mrr_20}, {hitrate_40}, {mrr_40}, {hitrate_50}, {mrr_50}' 190 | ) 191 | # 保存召回结果 192 | if mode == 'valid': 193 | df_data.to_pickle('../user_data/data/offline/recall_itemcf.pkl') 194 | else: 195 | df_data.to_pickle('../user_data/data/online/recall_itemcf.pkl') 196 | -------------------------------------------------------------------------------- /code/recall_w2v.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import math 3 | import os 4 | import pickle 5 | import random 6 | import signal 7 | import warnings 8 | from collections import defaultdict 9 | from random import shuffle 10 | 11 | import multitasking 12 | import numpy as np 13 | import pandas as pd 14 | from annoy import AnnoyIndex 15 | from gensim.models import Word2Vec 16 | from tqdm import tqdm 17 | 18 | from utils import Logger, evaluate 19 | 20 | warnings.filterwarnings('ignore') 21 | 22 | max_threads = multitasking.config['CPU_CORES'] 23 | multitasking.set_max_threads(max_threads) 24 | multitasking.set_engine('process') 25 | signal.signal(signal.SIGINT, multitasking.killall) 26 | 27 | seed = 2020 28 | random.seed(seed) 29 | 30 | # 命令行参数 31 | parser = argparse.ArgumentParser(description='w2v 召回') 32 | parser.add_argument('--mode', default='valid') 33 | parser.add_argument('--logfile', default='test.log') 34 | 35 | args = parser.parse_args() 36 | 37 | mode = args.mode 38 | logfile = args.logfile 39 | 40 | # 初始化日志 41 | os.makedirs('../user_data/log', exist_ok=True) 42 | log = Logger(f'../user_data/log/{logfile}').logger 43 | log.info(f'w2v 召回,mode: {mode}') 44 | 45 | 46 | def word2vec(df_, f1, f2, model_path): 47 | df = df_.copy() 48 | tmp = df.groupby(f1, as_index=False)[f2].agg( 49 | {'{}_{}_list'.format(f1, f2): list}) 50 | 51 | sentences = tmp['{}_{}_list'.format(f1, f2)].values.tolist() 52 | del tmp['{}_{}_list'.format(f1, f2)] 53 | 54 | words = [] 
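# gensim 的 Word2Vec 需要字符串 token,下面的循环把每个点击序列中的 article_id 转成 str;
# words 汇总全部 token,便于训练结束后把对应向量取出存入 article_vec_map。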
55 | for i in range(len(sentences)): 56 | x = [str(x) for x in sentences[i]] 57 | sentences[i] = x 58 | words += x 59 | 60 | if os.path.exists(f'{model_path}/w2v.m'): 61 | model = Word2Vec.load(f'{model_path}/w2v.m') 62 | else: 63 | model = Word2Vec(sentences=sentences, 64 | size=256, 65 | window=3, 66 | min_count=1, 67 | sg=1, 68 | hs=0, 69 | seed=seed, 70 | negative=5, 71 | workers=10, 72 | iter=1) 73 | model.save(f'{model_path}/w2v.m') 74 | 75 | article_vec_map = {} 76 | for word in set(words): 77 | if word in model: 78 | article_vec_map[int(word)] = model[word] 79 | 80 | return article_vec_map 81 | 82 | 83 | @multitasking.task 84 | def recall(df_query, article_vec_map, article_index, user_item_dict, 85 | worker_id): 86 | data_list = [] 87 | 88 | for user_id, item_id in tqdm(df_query.values): 89 | rank = defaultdict(int) 90 | 91 | interacted_items = user_item_dict[user_id] 92 | interacted_items = interacted_items[-1:] 93 | 94 | for item in interacted_items: 95 | article_vec = article_vec_map[item] 96 | 97 | item_ids, distances = article_index.get_nns_by_vector( 98 | article_vec, 100, include_distances=True) 99 | sim_scores = [2 - distance for distance in distances] 100 | 101 | for relate_item, wij in zip(item_ids, sim_scores): 102 | if relate_item not in interacted_items: 103 | rank.setdefault(relate_item, 0) 104 | rank[relate_item] += wij 105 | 106 | sim_items = sorted(rank.items(), key=lambda d: d[1], reverse=True)[:50] 107 | item_ids = [item[0] for item in sim_items] 108 | item_sim_scores = [item[1] for item in sim_items] 109 | 110 | df_temp = pd.DataFrame() 111 | df_temp['article_id'] = item_ids 112 | df_temp['sim_score'] = item_sim_scores 113 | df_temp['user_id'] = user_id 114 | 115 | if item_id == -1: 116 | df_temp['label'] = np.nan 117 | else: 118 | df_temp['label'] = 0 119 | df_temp.loc[df_temp['article_id'] == item_id, 'label'] = 1 120 | 121 | df_temp = df_temp[['user_id', 'article_id', 'sim_score', 'label']] 122 | df_temp['user_id'] = df_temp['user_id'].astype('int') 123 | df_temp['article_id'] = df_temp['article_id'].astype('int') 124 | 125 | data_list.append(df_temp) 126 | 127 | df_data = pd.concat(data_list, sort=False) 128 | 129 | os.makedirs('../user_data/tmp/w2v', exist_ok=True) 130 | df_data.to_pickle('../user_data/tmp/w2v/{}.pkl'.format(worker_id)) 131 | 132 | 133 | if __name__ == '__main__': 134 | if mode == 'valid': 135 | df_click = pd.read_pickle('../user_data/data/offline/click.pkl') 136 | df_query = pd.read_pickle('../user_data/data/offline/query.pkl') 137 | 138 | os.makedirs('../user_data/data/offline', exist_ok=True) 139 | os.makedirs('../user_data/model/offline', exist_ok=True) 140 | 141 | w2v_file = '../user_data/data/offline/article_w2v.pkl' 142 | model_path = '../user_data/model/offline' 143 | else: 144 | df_click = pd.read_pickle('../user_data/data/online/click.pkl') 145 | df_query = pd.read_pickle('../user_data/data/online/query.pkl') 146 | 147 | os.makedirs('../user_data/data/online', exist_ok=True) 148 | os.makedirs('../user_data/model/online', exist_ok=True) 149 | 150 | w2v_file = '../user_data/data/online/article_w2v.pkl' 151 | model_path = '../user_data/model/online' 152 | 153 | log.debug(f'df_click shape: {df_click.shape}') 154 | log.debug(f'{df_click.head()}') 155 | 156 | article_vec_map = word2vec(df_click, 'user_id', 'click_article_id', 157 | model_path) 158 | f = open(w2v_file, 'wb') 159 | pickle.dump(article_vec_map, f) 160 | f.close() 161 | 162 | # 将 embedding 建立索引 163 | article_index = AnnoyIndex(256, 'angular') 164 | 
article_index.set_seed(2020) 165 | 166 | for article_id, emb in tqdm(article_vec_map.items()): 167 | article_index.add_item(article_id, emb) 168 | 169 | article_index.build(100) 170 | 171 | user_item_ = df_click.groupby('user_id')['click_article_id'].agg( 172 | lambda x: list(x)).reset_index() 173 | user_item_dict = dict( 174 | zip(user_item_['user_id'], user_item_['click_article_id'])) 175 | 176 | # 召回 177 | n_split = max_threads 178 | all_users = df_query['user_id'].unique() 179 | shuffle(all_users) 180 | total = len(all_users) 181 | n_len = total // n_split 182 | 183 | # 清空临时文件夹 184 | for path, _, file_list in os.walk('../user_data/tmp/w2v'): 185 | for file_name in file_list: 186 | os.remove(os.path.join(path, file_name)) 187 | 188 | for i in range(0, total, n_len): 189 | part_users = all_users[i:i + n_len] 190 | df_temp = df_query[df_query['user_id'].isin(part_users)] 191 | recall(df_temp, article_vec_map, article_index, user_item_dict, i) 192 | 193 | multitasking.wait_for_tasks() 194 | log.info('合并任务') 195 | 196 | df_data = pd.DataFrame() 197 | for path, _, file_list in os.walk('../user_data/tmp/w2v'): 198 | for file_name in file_list: 199 | df_temp = pd.read_pickle(os.path.join(path, file_name)) 200 | df_data = df_data.append(df_temp) 201 | 202 | # 必须加,对其进行排序 203 | df_data = df_data.sort_values(['user_id', 'sim_score'], 204 | ascending=[True, 205 | False]).reset_index(drop=True) 206 | log.debug(f'df_data.head: {df_data.head()}') 207 | 208 | # 计算召回指标 209 | if mode == 'valid': 210 | log.info(f'计算召回指标') 211 | 212 | total = df_query[df_query['click_article_id'] != -1].user_id.nunique() 213 | 214 | hitrate_5, mrr_5, hitrate_10, mrr_10, hitrate_20, mrr_20, hitrate_40, mrr_40, hitrate_50, mrr_50 = evaluate( 215 | df_data[df_data['label'].notnull()], total) 216 | 217 | log.debug( 218 | f'w2v: {hitrate_5}, {mrr_5}, {hitrate_10}, {mrr_10}, {hitrate_20}, {mrr_20}, {hitrate_40}, {mrr_40}, {hitrate_50}, {mrr_50}' 219 | ) 220 | # 保存召回结果 221 | if mode == 'valid': 222 | df_data.to_pickle('../user_data/data/offline/recall_w2v.pkl') 223 | else: 224 | df_data.to_pickle('../user_data/data/online/recall_w2v.pkl') 225 | -------------------------------------------------------------------------------- /code/test.sh: -------------------------------------------------------------------------------- 1 | time=$(date "+%Y-%m-%d-%H:%M:%S") 2 | # 处理数据 3 | python data.py --mode valid --logfile "${time}.log" 4 | 5 | # itemcf 召回 6 | python recall_itemcf.py --mode valid --logfile "${time}.log" 7 | 8 | # binetwork 召回 9 | python recall_binetwork.py --mode valid --logfile "${time}.log" 10 | 11 | # w2v 召回 12 | python recall_w2v.py --mode valid --logfile "${time}.log" 13 | 14 | # 召回合并 15 | python recall.py --mode valid --logfile "${time}.log" 16 | 17 | # 排序特征 18 | python rank_feature.py --mode valid --logfile "${time}.log" 19 | 20 | # lgb 模型训练 21 | python rank_lgb.py --mode valid --logfile "${time}.log" 22 | -------------------------------------------------------------------------------- /code/utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import pickle 4 | import signal 5 | from random import sample 6 | 7 | import multitasking 8 | import numpy as np 9 | import pandas as pd 10 | from tqdm import tqdm 11 | 12 | max_threads = multitasking.config['CPU_CORES'] 13 | multitasking.set_max_threads(max_threads) 14 | multitasking.set_engine('process') 15 | signal.signal(signal.SIGINT, multitasking.killall) 16 | 17 | 18 | class Logger(object): 19 | level_relations = 
{ 20 | 'debug': logging.DEBUG, 21 | 'info': logging.INFO, 22 | 'warning': logging.WARNING, 23 | 'error': logging.ERROR, 24 | 'crit': logging.CRITICAL 25 | } 26 | 27 | def __init__( 28 | self, 29 | filename, 30 | level='debug', 31 | fmt='%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s' 32 | ): 33 | self.logger = logging.getLogger(filename) 34 | format_str = logging.Formatter(fmt) 35 | self.logger.setLevel(self.level_relations.get(level)) 36 | 37 | sh = logging.StreamHandler() 38 | sh.setFormatter(format_str) 39 | 40 | th = logging.FileHandler(filename=filename, encoding='utf-8', mode='a') 41 | th.setFormatter(format_str) 42 | self.logger.addHandler(sh) 43 | self.logger.addHandler(th) 44 | 45 | 46 | def reduce_mem_usage(df, verbose=True): 47 | numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64'] 48 | start_mem = df.memory_usage().sum() / 1024**2 49 | for col in df.columns: 50 | col_type = df[col].dtypes 51 | if col_type in numerics: 52 | c_min = df[col].min() 53 | c_max = df[col].max() 54 | if str(col_type)[:3] == 'int': 55 | if c_min > np.iinfo(np.int8).min and c_max < np.iinfo( 56 | np.int8).max: 57 | df[col] = df[col].astype(np.int8) 58 | elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo( 59 | np.int16).max: 60 | df[col] = df[col].astype(np.int16) 61 | elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo( 62 | np.int32).max: 63 | df[col] = df[col].astype(np.int32) 64 | elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo( 65 | np.int64).max: 66 | df[col] = df[col].astype(np.int64) 67 | else: 68 | if c_min > np.finfo(np.float16).min and c_max < np.finfo( 69 | np.float16).max: 70 | df[col] = df[col].astype(np.float16) 71 | elif c_min > np.finfo(np.float32).min and c_max < np.finfo( 72 | np.float32).max: 73 | df[col] = df[col].astype(np.float32) 74 | else: 75 | df[col] = df[col].astype(np.float64) 76 | end_mem = df.memory_usage().sum() / 1024**2 77 | if verbose: 78 | print('Mem. 
usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format( 79 | end_mem, 100 * (start_mem - end_mem) / start_mem)) 80 | return df 81 | 82 | 83 | def evaluate(df, total): 84 | hitrate_5 = 0 85 | mrr_5 = 0 86 | 87 | hitrate_10 = 0 88 | mrr_10 = 0 89 | 90 | hitrate_20 = 0 91 | mrr_20 = 0 92 | 93 | hitrate_40 = 0 94 | mrr_40 = 0 95 | 96 | hitrate_50 = 0 97 | mrr_50 = 0 98 | 99 | gg = df.groupby(['user_id']) 100 | 101 | for _, g in tqdm(gg): 102 | try: 103 | item_id = g[g['label'] == 1]['article_id'].values[0] 104 | except Exception as e: 105 | continue 106 | 107 | predictions = g['article_id'].values.tolist() 108 | 109 | rank = 0 110 | while predictions[rank] != item_id: 111 | rank += 1 112 | 113 | if rank < 5: 114 | mrr_5 += 1.0 / (rank + 1) 115 | hitrate_5 += 1 116 | 117 | if rank < 10: 118 | mrr_10 += 1.0 / (rank + 1) 119 | hitrate_10 += 1 120 | 121 | if rank < 20: 122 | mrr_20 += 1.0 / (rank + 1) 123 | hitrate_20 += 1 124 | 125 | if rank < 40: 126 | mrr_40 += 1.0 / (rank + 1) 127 | hitrate_40 += 1 128 | 129 | if rank < 50: 130 | mrr_50 += 1.0 / (rank + 1) 131 | hitrate_50 += 1 132 | 133 | hitrate_5 /= total 134 | mrr_5 /= total 135 | 136 | hitrate_10 /= total 137 | mrr_10 /= total 138 | 139 | hitrate_20 /= total 140 | mrr_20 /= total 141 | 142 | hitrate_40 /= total 143 | mrr_40 /= total 144 | 145 | hitrate_50 /= total 146 | mrr_50 /= total 147 | 148 | return hitrate_5, mrr_5, hitrate_10, mrr_10, hitrate_20, mrr_20, hitrate_40, mrr_40, hitrate_50, mrr_50 149 | 150 | 151 | @multitasking.task 152 | def gen_sub_multitasking(test_users, prediction, all_articles, worker_id): 153 | lines = [] 154 | 155 | for test_user in tqdm(test_users): 156 | g = prediction[prediction['user_id'] == test_user] 157 | g = g.head(5) 158 | items = g['article_id'].values.tolist() 159 | 160 | if len(set(items)) < 5: 161 | buchong = all_articles - set(items) 162 | buchong = sample(buchong, 5 - len(set(items))) 163 | items += buchong 164 | 165 | assert len(set(items)) == 5 166 | 167 | lines.append([test_user] + items) 168 | 169 | os.makedirs('../user_data/tmp/sub', exist_ok=True) 170 | 171 | with open(f'../user_data/tmp/sub/{worker_id}.pkl', 'wb') as f: 172 | pickle.dump(lines, f) 173 | 174 | 175 | def gen_sub(prediction): 176 | prediction.sort_values(['user_id', 'pred'], 177 | inplace=True, 178 | ascending=[True, False]) 179 | 180 | all_articles = set(prediction['article_id'].values) 181 | 182 | sub_sample = pd.read_csv('../tcdata/testB_click_log_Test_B.csv') 183 | test_users = sub_sample.user_id.unique() 184 | 185 | n_split = max_threads 186 | total = len(test_users) 187 | n_len = total // n_split 188 | 189 | # 清空临时文件夹 190 | for path, _, file_list in os.walk('../user_data/tmp/sub'): 191 | for file_name in file_list: 192 | os.remove(os.path.join(path, file_name)) 193 | 194 | for i in range(0, total, n_len): 195 | part_users = test_users[i:i + n_len] 196 | gen_sub_multitasking(part_users, prediction, all_articles, i) 197 | 198 | multitasking.wait_for_tasks() 199 | 200 | lines = [] 201 | for path, _, file_list in os.walk('../user_data/tmp/sub'): 202 | for file_name in file_list: 203 | with open(os.path.join(path, file_name), 'rb') as f: 204 | line = pickle.load(f) 205 | lines += line 206 | 207 | df_sub = pd.DataFrame(lines) 208 | df_sub.columns = [ 209 | 'user_id', 'article_1', 'article_2', 'article_3', 'article_4', 210 | 'article_5' 211 | ] 212 | return df_sub 213 | -------------------------------------------------------------------------------- /requirements.txt: 
-------------------------------------------------------------------------------- 1 | joblib==0.17.0 2 | annoy==1.17.0 3 | tqdm==4.50.2 4 | pandas==0.25.3 5 | pandarallel==1.5.1 6 | numpy==1.19.2 7 | multitasking==0.0.9 8 | gensim==3.8.3 9 | lightgbm==3.0.0 10 | scikit_learn==0.24.0 11 | --------------------------------------------------------------------------------
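附注:test.sh 只运行 valid(线下验证)模式。若要生成线上提交结果,大致是按同样顺序以 --mode online 重跑一遍各脚本(脚本中 mode 不为 'valid' 时走 online 分支),其中 rank_lgb.py 的 online 模式会直接加载 valid 模式训练保存的 5 折模型,因此需要先完成一次 valid 流程。以下是一个假设性的运行示例,仓库中并未包含此脚本:
```
time=$(date "+%Y-%m-%d-%H:%M:%S")

python data.py --mode online --logfile "${time}.log"
python recall_itemcf.py --mode online --logfile "${time}.log"
python recall_binetwork.py --mode online --logfile "${time}.log"
python recall_w2v.py --mode online --logfile "${time}.log"
python recall.py --mode online --logfile "${time}.log"
python rank_feature.py --mode online --logfile "${time}.log"
python rank_lgb.py --mode online --logfile "${time}.log"
```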