├── .gitignore
├── README.md
├── code
│   ├── calc_i2i_30k_sim.py
│   ├── const.py
│   ├── csv_handler.py
│   ├── data_holder.py
│   ├── main.py
│   ├── recaller.py
│   ├── test.bat
│   └── test.sh
└── requirements.txt

/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__/
2 | prediction_result/
3 | tcdata/
4 | user_data/
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## Introduction
2 | 
3 | This repository is the public version of the 3rd-place solution (score 0.2592) to the official round of the Tianchi news recommendation competition 零基础入门推荐系统 ("Zero-Based Introduction to Recommender Systems").
4 | 
5 | Solution write-up: https://tianchi.aliyun.com/notebook-ai/detail?spm=5176.12282027.0.0.4e4d379c8uyfeC&postId=169915
6 | 
--------------------------------------------------------------------------------
/code/calc_i2i_30k_sim.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | import time
4 | import multiprocessing as mp
5 | from tqdm import tqdm
6 | import pickle
7 | import math
8 | from os.path import isfile
9 | from const import CACHE_FOLDER
10 | 
11 | def _i2i_30k_sim_core(job_id, user_id_list, dataset):
12 |     _item_counts_dic = {}
13 |     _i2i_30k_sim = {}
14 | 
15 |     start_time = time.time()
16 |     for user_id in user_id_list:
17 |         item_dt = dataset[user_id]
18 |         ts_list = pd.Series([ts for _, ts in item_dt])
19 |         idx_list = [idx for idx, val in dict(ts_list - ts_list.shift(1) == 30000).items() if val]
20 | 
21 |         for idx in idx_list:
22 |             i_art_id, _ = item_dt[idx]
23 |             j_art_id, _ = item_dt[idx - 1]
24 | 
25 |             _i2i_30k_sim.setdefault(i_art_id, {})
26 |             _i2i_30k_sim[i_art_id].setdefault(j_art_id, 0)
27 |             _i2i_30k_sim[i_art_id][j_art_id] += 1
28 | 
29 |             _i2i_30k_sim.setdefault(j_art_id, {})
30 |             _i2i_30k_sim[j_art_id].setdefault(i_art_id, 0)
31 |             _i2i_30k_sim[j_art_id][i_art_id] += 1
32 | 
33 |     print('子任务[{}]: 完成i2i_30k相似度的计算。({}秒)'.format(job_id, '%.2f' % (time.time() - start_time)))
34 | 
35 |     return _i2i_30k_sim
36 | 
37 | def i2i_30k_sim(dataset, n_cpu, offline, max_related=50):
38 |     filename = 'i2i_30k_sim_{}.pkl'.format('offline' if offline else 'online')
39 |     if isfile(CACHE_FOLDER + filename):
40 |         print('直接从文件{}中读取计算好的i2i_30k相似度'.format(filename))
41 |         return pickle.load(open(CACHE_FOLDER + filename, 'rb'))
42 | 
43 |     # 计算相似度
44 |     start_time = time.time()
45 |     print('开始计算i2i_30k相似度')
46 |     i2i_sim_3k = {}
47 |     n_block = (len(dataset.keys()) - 1) // n_cpu + 1
48 |     keys = list(dataset.keys())
49 |     pool = mp.Pool(processes=n_cpu)
50 |     results = [pool.apply_async(_i2i_30k_sim_core, args=(i, keys[i * n_block:(i + 1) * n_block], dataset)) for i in range(0, n_cpu)]
51 |     pool.close()
52 |     pool.join()
53 | 
54 |     for result in results:
55 |         _i2i_sim_3k = result.get()
56 | 
57 |         for art_id, related_art_id_dic in _i2i_sim_3k.items():
58 |             i2i_sim_3k.setdefault(art_id, {})
59 |             for related_art_id, value in related_art_id_dic.items():
60 |                 i2i_sim_3k[art_id].setdefault(related_art_id, 0)
61 |                 i2i_sim_3k[art_id][related_art_id] += value
62 | 
63 |     print('逆序排序')
64 |     for art_id, related_arts in tqdm(i2i_sim_3k.items()):
65 |         sorted_and_topK = sorted(related_arts.items(), key=lambda x: x[1], reverse=True)
66 |         i2i_sim_3k[art_id] = {
67 |             'sorted_keys': [art_id for art_id, _ in sorted_and_topK],
68 |             'related_arts': dict(sorted_and_topK)
69 |         }
70 | 
71 |     print('i2i_30k相似度计算完毕({}秒)'.format('%.2f' % (time.time() - start_time)))
72 |     print('保存i2i_30k相似度数据至文件{}中'.format(filename))
73 |     pickle.dump(i2i_sim_3k, open(CACHE_FOLDER + filename, 'wb'))
74 |     return i2i_sim_3k
75 | 
76
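# A minimal usage sketch for i2i_30k_sim (illustration only; in the pipeline the call is
# driven by recaller.calc_and_recall). It assumes CACHE_FOLDER ('../user_data/') exists and
# does not yet contain i2i_30k_sim_offline.pkl, otherwise the cached result is returned.
# The toy click log below stands in for DataHolder's user_id -> [(article_id, click_timestamp), ...] dict.
if __name__ == '__main__':
    toy_dataset = {
        1: [(100, 0), (200, 30000), (300, 90000)],  # 100 -> 200 clicked exactly 30,000 ms apart
        2: [(200, 10000), (100, 40000)],            # 200 -> 100 clicked exactly 30,000 ms apart
    }
    sim = i2i_30k_sim(toy_dataset, n_cpu=2, offline=True)
    # Both toy sessions contain a 30,000 ms pair of articles 100 and 200, so:
    # sim[100] == {'sorted_keys': [200], 'related_arts': {200: 2}}
    print(sim[100])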
| -------------------------------------------------------------------------------- /code/const.py: -------------------------------------------------------------------------------- 1 | RAW_DATA_FOLDER = '../tcdata/' 2 | OUTPUT_FOLDER = '../prediction_result/' 3 | CACHE_FOLDER = '../user_data/' 4 | -------------------------------------------------------------------------------- /code/csv_handler.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import time 4 | from tqdm import tqdm 5 | from const import CACHE_FOLDER 6 | 7 | def neg_sampling(ds, min=1, max=5): 8 | start_time = time.time() 9 | pos_ds = ds.loc[ds['answer'] == 1] 10 | neg_ds = ds.loc[ds['answer'] == 0] 11 | 12 | def _neg_sampling_func(x): 13 | n_sampling = len(x) 14 | n_sampling = min if n_sampling < min else (max if n_sampling > max else n_sampling) 15 | return x.sample(n=n_sampling, replace=False) 16 | 17 | neg_ds = pd.concat([ 18 | neg_ds.groupby(['user_id', 'last_clicked_timestamp']).apply(_neg_sampling_func), 19 | neg_ds.groupby('article_id').apply(_neg_sampling_func), 20 | ]).drop_duplicates() 21 | 22 | ret = pd.concat([pos_ds, neg_ds]).reset_index(drop=True) 23 | print('负采样处理完毕({}秒, {}->{}件)'.format('%.2f' % (time.time() - start_time), len(ds), len(ret))) 24 | return ret 25 | 26 | def get_user_features(raw_data, train_dataset, test_users, articles_dic): 27 | def calc_avg_words_count(items): 28 | return np.average([articles_dic[item[0]]['words_count'] for item in items]) 29 | 30 | def calc_min_words_count(items): 31 | return np.min([articles_dic[item[0]]['words_count'] for item in items]) 32 | 33 | def calc_max_words_count(items): 34 | return np.max([articles_dic[item[0]]['words_count'] for item in items]) 35 | 36 | def calc_lag_between_created_at_ts_and_clicked_ts(items, articles_dic): 37 | item = items[-1] 38 | return (item[1] - articles_dic[item[0]]['created_at_ts']) / (1000 * 60 * 60 * 24) 39 | 40 | def calc_lag_between_two_click(items): 41 | if len(items) > 1: 42 | return (items[-1][1] - items[-2][1]) / (1000 * 60 * 60 * 24) 43 | else: 44 | return np.nan 45 | 46 | def calc_lag_between_two_articles(items, articles_dic): 47 | if len(items) > 1: 48 | return (articles_dic[items[-1][0]]['created_at_ts'] - articles_dic[items[-2][0]]['created_at_ts']) / (1000 * 60 * 60 * 24) 49 | else: 50 | return np.nan 51 | 52 | df_users = pd.DataFrame(list(test_users.keys()), columns=['user_id']) 53 | 54 | # 计算 55 | # 1. 
用户看新闻的平均字数 56 | _data = [] 57 | for user_id, ts_set in tqdm(test_users.items()): 58 | for last_clicked_timestamp in ts_set: 59 | _data.append(( 60 | user_id, 61 | last_clicked_timestamp, 62 | calc_avg_words_count(train_dataset[user_id][last_clicked_timestamp]), 63 | calc_min_words_count(train_dataset[user_id][last_clicked_timestamp]), 64 | calc_max_words_count(train_dataset[user_id][last_clicked_timestamp]), 65 | calc_lag_between_created_at_ts_and_clicked_ts(train_dataset[user_id][last_clicked_timestamp], articles_dic), 66 | calc_lag_between_two_click(train_dataset[user_id][last_clicked_timestamp]), 67 | calc_lag_between_two_articles(train_dataset[user_id][last_clicked_timestamp], articles_dic), 68 | )) 69 | 70 | df1 = pd.DataFrame(_data, columns=['user_id', 'last_clicked_timestamp', 'avg_words_count', 'min_words_count', 'max_words_count', 'lag_between_created_at_ts_and_clicked_ts', 'lag_between_two_click', 'lag_between_two_articles']) 71 | 72 | # 计算用户使用设备,环境等的众数 73 | columns = ['user_id','click_environment','click_deviceGroup','click_os','click_country','click_region','click_referrer_type'] 74 | df2 = df_users.merge(raw_data.get_all_click_log())[columns].groupby('user_id').agg(lambda x: x.value_counts().index[0]).reset_index() 75 | 76 | return df1.merge(df2) 77 | 78 | def create_train_data(raw_data, train_dataset, test_users, articles_dic, recall_results, offline, y_answer): 79 | start_time = time.time() 80 | keys_ds = [] 81 | 82 | for user_id, ts_set in test_users.items(): 83 | for last_clicked_timestamp in ts_set: 84 | items = np.concatenate([result[user_id][last_clicked_timestamp] for _, result in recall_results.items()]) 85 | keys_ds.append(list(zip(np.repeat(user_id, len(items)), np.repeat(last_clicked_timestamp, len(items)), items))) 86 | 87 | ds = pd.DataFrame(np.concatenate(keys_ds), columns=['user_id', 'last_clicked_timestamp', 'article_id'], dtype=np.int64).drop_duplicates() 88 | 89 | if offline: 90 | answer_keys_ds = [] 91 | # 拼接正确答案标签 92 | for user_id, ts_list in y_answer.items(): 93 | for last_clicked_timestamp, art_id in ts_list.items(): 94 | answer_keys_ds.append((user_id, last_clicked_timestamp, art_id)) 95 | 96 | answers = pd.DataFrame(answer_keys_ds, columns=['user_id', 'last_clicked_timestamp', 'article_id'], dtype=np.int64) 97 | # 将正确答案融合进数据集 98 | answers['answer'] = 1 99 | ds = ds.merge(answers, how='left').fillna({'answer': 0}) 100 | ds['answer'] = ds['answer'].astype(np.int8) 101 | 102 | # 负采样 103 | ds = neg_sampling(ds) 104 | 105 | ds = ds.merge(raw_data.get_articles()).merge(get_user_features(raw_data, train_dataset, test_users, articles_dic)) 106 | 107 | # 新特征 108 | ds['lag_period_last_article'] = ds['last_clicked_timestamp'] - ds['created_at_ts'] 109 | ds['diff_words_last_article'] = ds['avg_words_count'] - ds['words_count'] 110 | ds.to_csv(CACHE_FOLDER + '{}.csv'.format('train' if offline else 'test'), index=False) 111 | print('{}用的csv文件生成完毕({}秒, {}件)'.format('训练' if offline else '测试', '%.2f' % (time.time() - start_time), len(ds))) -------------------------------------------------------------------------------- /code/data_holder.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from tqdm import tqdm 4 | import pickle 5 | import time 6 | from os.path import isfile 7 | from const import CACHE_FOLDER 8 | 9 | class DataHolder: 10 | def __init__(self, articles, train_click_log, test_click_log, trainB_click_log=None): 11 | self.articles = articles 12 | self.train_click_log = 
train_click_log 13 | self.test_click_log = test_click_log 14 | self.trainB_click_log = trainB_click_log 15 | self.all_click_log = self.train_click_log.append(self.trainB_click_log).append(self.test_click_log) if self.trainB_click_log is not None else self.train_click_log.append(self.test_click_log) 16 | self.all_click_log.drop_duplicates(['user_id', 'click_article_id', 'click_timestamp'], inplace=True) 17 | 18 | print('从train_click_log读取{}件(UserId=[{},{}])'.format(len(self.train_click_log), self.train_click_log['user_id'].min(), self.train_click_log['user_id'].max())) 19 | print('从test_click_log读取{}件(UserId=[{},{}])'.format(len(self.test_click_log), self.test_click_log['user_id'].min(), self.test_click_log['user_id'].max())) 20 | 21 | if self.trainB_click_log is not None: 22 | print('从trainB_click_log读取{}件(UserId=[{},{}])'.format(len(self.trainB_click_log), self.trainB_click_log['user_id'].min(), self.trainB_click_log['user_id'].max())) 23 | 24 | print('使用训练集all_click_log共{}件(UserId=[{},{}])'.format(len(self.all_click_log), self.all_click_log['user_id'].min(), self.all_click_log['user_id'].max())) 25 | 26 | # DataFrame对象转换成字典 27 | filename = 'dataset.pkl' 28 | if isfile(CACHE_FOLDER + filename): 29 | print('直接从文件{}中读取dataset'.format(filename)) 30 | self.dataset = pickle.load(open(CACHE_FOLDER + filename, 'rb')) 31 | else: 32 | start_time = time.time() 33 | _t = self.all_click_log.sort_values('click_timestamp').groupby('user_id')\ 34 | .apply(lambda x: list(zip(x['click_article_id'], x['click_timestamp'])))\ 35 | .reset_index()\ 36 | .rename(columns={0: 'item_dt_list'}) 37 | 38 | self.dataset = dict(zip(_t['user_id'], _t['item_dt_list'])) 39 | print('dataset对象完毕({}秒)'.format('%.2f' % (time.time() - start_time))) 40 | 41 | print('保存dataset至文件{}中'.format(filename)) 42 | pickle.dump(self.dataset, open(CACHE_FOLDER + filename, 'wb')) 43 | 44 | # 生成可供训练用的(user_id, timestamp)字典 45 | filename = 'train_users_dic.pkl' 46 | if isfile(CACHE_FOLDER + filename): 47 | print('直接从文件{}中读取train_users_dic'.format(filename)) 48 | self.train_users_dic = pickle.load(open(CACHE_FOLDER + filename, 'rb')) 49 | else: 50 | start_time = time.time() 51 | self.train_users_dic = {} 52 | for user_id, items in tqdm(self.dataset.items()): 53 | ts_list = pd.Series([item[1] for item in items]) 54 | self.train_users_dic[user_id] = list(ts_list.loc[ts_list.shift(-1) - ts_list == 30000]) 55 | 56 | print('train_users_dic对象完毕({}秒)'.format('%.2f' % (time.time() - start_time))) 57 | 58 | print('保存train_users_dic至文件{}中'.format(filename)) 59 | pickle.dump(self.train_users_dic, open(CACHE_FOLDER + filename, 'wb')) 60 | 61 | def get_articles(self): 62 | return self.articles 63 | 64 | def get_train_click_log(self): 65 | return self.train_click_log 66 | 67 | def get_test_click_log(self): 68 | return self.test_click_log 69 | 70 | def get_all_click_log(self): 71 | return self.all_click_log 72 | 73 | def get_user_list(self): 74 | return self.train_click_log['user_id'].unique() 75 | 76 | def get_item_dt_groupby_user(self): 77 | return self.dataset 78 | 79 | def users_df2dic(self, df_users): 80 | _t = df_users.sort_values('click_timestamp').groupby('user_id')\ 81 | .apply(lambda x: set(x['click_timestamp']))\ 82 | .reset_index()\ 83 | .rename(columns={0: 'ts_set'}) 84 | 85 | return dict(zip(_t['user_id'], _t['ts_set'])) 86 | 87 | def get_test_users(self, offline, samples=100000): 88 | if offline: 89 | # 一维数组化 90 | users = [] 91 | for user_id, ts_list in self.train_users_dic.items(): 92 | # for ts in ts_list: 93 | # users.append((user_id, ts)) 
94 | if len(ts_list) > 0: 95 | users.append((user_id, ts_list[-1])) 96 | 97 | np.random.seed(42) 98 | idx_list = np.random.choice(len(users), samples, replace=False) 99 | selected_users = [users[idx] for idx in idx_list] 100 | 101 | # 字典化 102 | return self.users_df2dic(pd.DataFrame(selected_users, columns=['user_id', 'click_timestamp'])) 103 | else: 104 | return self.users_df2dic(self.test_click_log.groupby('user_id').max('click_timestamp').reset_index()[['user_id', 'click_timestamp']]) 105 | 106 | def take_last(self, items, last=1): 107 | if len(items) <= last: 108 | return items.copy(), items[0] 109 | else: 110 | return items[:-last], items[-last] 111 | 112 | def get_train_dataset_and_answers(self, test_users): 113 | start_time = time.time() 114 | train_dataset = {} 115 | y_answer = {} 116 | 117 | for user_id, ts_set in tqdm(test_users.items()): 118 | items = self.dataset[user_id] 119 | for last_clicked_timestamp in ts_set: 120 | idx = [item[1] for item in items].index(last_clicked_timestamp) 121 | train_dataset.setdefault(user_id, {}) 122 | train_dataset[user_id][last_clicked_timestamp] = items[0:idx+1] 123 | y_answer.setdefault(user_id, {}) 124 | y_answer[user_id][last_clicked_timestamp] = items[idx+1][0] 125 | 126 | print('训练集和答案分割完毕({}秒)'.format('%.2f' % (time.time() - start_time))) 127 | return train_dataset, y_answer 128 | 129 | def get_train_dataset_for_online(self, test_users): 130 | start_time = time.time() 131 | train_dataset = {} 132 | 133 | for user_id, ts_set in tqdm(test_users.items()): 134 | items = self.dataset[user_id] 135 | for last_clicked_timestamp in ts_set: 136 | train_dataset.setdefault(user_id, {}) 137 | train_dataset[user_id][last_clicked_timestamp] = items 138 | 139 | print('测试集制作完毕({}秒)'.format('%.2f' % (time.time() - start_time))) 140 | return train_dataset 141 | -------------------------------------------------------------------------------- /code/main.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import sys 4 | from data_holder import DataHolder 5 | from tqdm import tqdm 6 | import pickle 7 | import math 8 | import time 9 | import multiprocessing as mp 10 | import pickle 11 | import lightgbm as lgb 12 | from const import RAW_DATA_FOLDER, OUTPUT_FOLDER, CACHE_FOLDER 13 | from recaller import calc_and_recall 14 | from csv_handler import create_train_data 15 | from sklearn.model_selection import train_test_split 16 | 17 | def read_raw_data(filename, cb=None): 18 | data = pd.read_csv(RAW_DATA_FOLDER + filename) 19 | return cb(data) if cb is not None else data 20 | 21 | def read_all_raw_data(filenames=['articles.csv', 'train_click_log.csv', 'testB_click_log_Test_B.csv', 'testA_click_log.csv']): 22 | return DataHolder(*[read_raw_data(filename) for filename in filenames]) 23 | 24 | def calc_mrr_and_hit(recommend_dict, y, k=5): 25 | #assert len(recommend_dict) == len(y) 26 | sum_mrr = 0.0 27 | sum_hit = 0.0 28 | sum_hit_detail = np.repeat(0.0, 5) 29 | user_cnt = len(recommend_dict.keys()) 30 | 31 | for user_id, recommend_items in recommend_dict.items(): 32 | answer = y[user_id] if user_id in y else -1 33 | if (answer in recommend_items) and (recommend_items.index(answer) < k): 34 | sum_hit += 1 35 | sum_mrr += 1 / (recommend_items.index(answer) + 1) 36 | sum_hit_detail[recommend_items.index(answer)] += 1 37 | 38 | return (sum_mrr / user_cnt), (sum_hit / user_cnt), (sum_hit_detail / user_cnt) 39 | 40 | def create_submission(recommend_dict): 41 | _data = [{'user_id': user_id, 42 
| 'article_1': art_id_list[0], 43 | 'article_2': art_id_list[1], 44 | 'article_3': art_id_list[2], 45 | 'article_4': art_id_list[3], 46 | 'article_5': art_id_list[4]} for user_id, art_id_list in tqdm(recommend_dict.items())] 47 | _t = pd.DataFrame(_data) 48 | _t.sort_values('user_id', inplace=True) 49 | _t.to_csv(OUTPUT_FOLDER + 'result.csv', index=False) 50 | 51 | def handler(offline=True): 52 | cpu_cores = mp.cpu_count() 53 | print('使用CPU核心数: {}'.format(cpu_cores)) 54 | print('开始{}数据验证处理'.format('线下' if offline else '线上')) 55 | raw_data = read_all_raw_data() 56 | test_users = raw_data.get_test_users(offline) 57 | 58 | _user_id_list = list(test_users.keys()) 59 | user_id_min = np.min(_user_id_list) 60 | user_id_max = np.max(_user_id_list) 61 | print('获得{}用户集合{}件 [{} ~ {}]'.format('验证' if offline else '测试', len(test_users), user_id_min, user_id_max)) 62 | 63 | dataset = raw_data.get_item_dt_groupby_user() 64 | 65 | if offline: 66 | train_dataset, y_answer = raw_data.get_train_dataset_and_answers(test_users) 67 | else: 68 | train_dataset = raw_data.get_train_dataset_for_online(test_users) 69 | y_answer = None 70 | 71 | print('训练数据({}件)'.format(np.sum([len(ts_list) for user_id, ts_list in train_dataset.items()]))) 72 | 73 | articles_dic = dict(list(raw_data.get_articles().apply(lambda x: (x['article_id'], dict(x)), axis=1))) 74 | print('获得文章字典({}件)'.format(len(articles_dic.keys()))) 75 | 76 | recall_results = calc_and_recall(dataset, train_dataset, test_users, articles_dic, cpu_cores, offline, y_answer) 77 | create_train_data(raw_data, train_dataset, test_users, articles_dic, recall_results, offline, y_answer) 78 | 79 | def make_train_data(): 80 | handler() 81 | 82 | def make_test_data(): 83 | handler(False) 84 | 85 | def prepare_dataset(df): 86 | agg_column = [column for column in df.columns if column != 'user_id'][0] 87 | df.sort_values('user_id', inplace=True) 88 | grp_info = df.groupby('user_id', as_index=False).count()[agg_column].values 89 | y = df['answer'] if 'answer' in df.columns else None 90 | return df.drop(columns=['answer']) if 'answer' in df.columns else df, grp_info, y 91 | 92 | def make_recommend_dict(X_val, y_pred): 93 | X_val['pred'] = y_pred 94 | _t = X_val.groupby('user_id')\ 95 | .apply(lambda x: list(x.sort_values('pred', ascending=False)['article_id'].head(5)))\ 96 | .reset_index()\ 97 | .rename(columns={0: 'item_list'}) 98 | 99 | recommend_dict = dict(zip(_t['user_id'], _t['item_list'])) 100 | return recommend_dict 101 | 102 | def test(): 103 | df_train = pd.read_csv(CACHE_FOLDER + 'train.csv') 104 | 105 | clf = lgb.LGBMRanker(random_state=777, n_estimators=1000) 106 | 107 | users = df_train['user_id'].unique() 108 | train_users, _test_users = train_test_split(users, test_size=0.2, random_state=98) 109 | test_users, val_users = train_test_split(_test_users, test_size=0.5, random_state=38) 110 | df_new_train = df_train.merge(pd.DataFrame(train_users, columns=['user_id'])) 111 | df_test = df_train.merge(pd.DataFrame(test_users, columns=['user_id'])) 112 | df_val = df_train.merge(pd.DataFrame(val_users, columns=['user_id'])) 113 | 114 | X_train, X_grp_train, y_train = prepare_dataset(df_new_train) 115 | X_test, X_grp_test, y_test = prepare_dataset(df_test) 116 | X_val, X_grp_val, _ = prepare_dataset(df_val) 117 | 118 | def handle_columns(X): 119 | return X.drop(columns=['user_id', 'article_id']) 120 | 121 | _X_train = handle_columns(X_train) 122 | 123 | clf.fit(_X_train, y_train, group=X_grp_train, eval_set=[(handle_columns(X_test), y_test)], eval_group=[X_grp_test], 
eval_at=[1, 2, 3, 4, 5], eval_metric=['ndcg', ], early_stopping_rounds=50, verbose=False) 124 | print('Best iteration: {}'.format(clf.best_iteration_)) 125 | 126 | 127 | for X, X_grp, df, title in [(X_test, X_grp_test, df_test, 'Test Set'), (X_val, X_grp_val, df_val, 'Validation Set')]: 128 | print('[{}]'.format(title)) 129 | y_pred = clf.predict(handle_columns(X), group=X_grp, num_iteration=clf.best_iteration_) 130 | recommend_dict = make_recommend_dict(X, y_pred) 131 | answers = dict(df.loc[df['answer'] == 1, ['user_id', 'article_id']].values) 132 | mrr, hit, details = calc_mrr_and_hit(recommend_dict, answers) 133 | print('MRR: {} / HIT: {}'.format(mrr, hit)) 134 | print(' / '.join(['%.2f' % detail for detail in details])) 135 | 136 | for column, score in sorted(zip(_X_train.columns, clf.feature_importances_), key=lambda x: x[1], reverse=True): 137 | print('{}: {}'.format(column, score)) 138 | 139 | def run(): 140 | df_train = pd.read_csv(CACHE_FOLDER + 'train.csv') 141 | df_test = pd.read_csv(CACHE_FOLDER + 'test.csv') 142 | 143 | clf = lgb.LGBMRanker(random_state=777, n_estimators=1000) 144 | 145 | users = df_train['user_id'].unique() 146 | train_users, eval_users = train_test_split(users, test_size=0.2, random_state=77) 147 | df_new_train = df_train.merge(pd.DataFrame(train_users, columns=['user_id'])) 148 | df_eval = df_train.merge(pd.DataFrame(eval_users, columns=['user_id'])) 149 | 150 | X_train, X_grp_train, y_train = prepare_dataset(df_new_train) 151 | X_eval, X_grp_eval, y_eval = prepare_dataset(df_eval) 152 | X_test, X_grp_test, _ = prepare_dataset(df_test) 153 | 154 | def handle_columns(X): 155 | return X.drop(columns=['user_id', 'article_id']) 156 | 157 | _X_train = handle_columns(X_train) 158 | 159 | clf.fit(_X_train, y_train, group=X_grp_train, eval_set=[(handle_columns(X_eval), y_eval)], eval_group=[X_grp_eval], eval_at=[1, 2, 3, 4, 5], eval_metric=['ndcg', ], early_stopping_rounds=50, verbose=False) 160 | print('Best iteration: {}'.format(clf.best_iteration_)) 161 | y_pred = clf.predict(handle_columns(X_test), group=X_grp_test, num_iteration=clf.best_iteration_) 162 | 163 | for column, score in sorted(zip(_X_train.columns, clf.feature_importances_), key=lambda x: x[1], reverse=True): 164 | print('{}: {}'.format(column, score)) 165 | 166 | recommend_dict = make_recommend_dict(X_test, y_pred) 167 | 168 | create_submission(recommend_dict) 169 | 170 | if __name__ == "__main__": 171 | make_train_data() 172 | # test() 173 | make_test_data() 174 | run() 175 | -------------------------------------------------------------------------------- /code/recaller.py: -------------------------------------------------------------------------------- 1 | from calc_i2i_30k_sim import i2i_30k_sim 2 | from tqdm import tqdm 3 | import multiprocessing as mp 4 | import time 5 | import math 6 | import pandas as pd 7 | import numpy as np 8 | 9 | def get_clicked_items(items): 10 | return { art_id for art_id, _ in items } 11 | 12 | def _calc_sim(dataset, articles_dic, cpu_cores, offline): 13 | # 计算各种相似度 14 | num = len([i2i_30k_sim]) 15 | 16 | start_time = time.time() 17 | print('召回前的计算处理开始({}件)'.format(num)) 18 | 19 | sims = {} 20 | sims['i2i_30k_sim'] = i2i_30k_sim(dataset, cpu_cores, offline) 21 | 22 | print('召回前的计算处理结束({}秒)'.format('%.2f' % (time.time() - start_time))) 23 | 24 | return sims 25 | 26 | 27 | def _is_recall_target(last_clicked_timestamp, art_id, articles_dic, lag_hour_max=27, lag_hour_min=3): 28 | # 热度文章在用户最后一次点击时刻起,前3小时~27小时内的文章 29 | lag_max = lag_hour_max * 60 * 60 * 1000 30 | lag_min 
= lag_hour_min * 60 * 60 * 1000 31 | if articles_dic[art_id]['created_at_ts'] < (last_clicked_timestamp - lag_max): 32 | return False 33 | 34 | if articles_dic[art_id]['created_at_ts'] > (last_clicked_timestamp - lag_min): 35 | return False 36 | 37 | return True 38 | 39 | def _recall_hot_items(dataset, train_dataset, test_users, articles_dic, topK=10): 40 | result = {} 41 | start_time = time.time() 42 | lag_hour_min = 3 43 | lag_hour_max = 27 44 | 45 | hot_items = {} 46 | for _, items in tqdm(dataset.items()): 47 | for art_id, _ in items: 48 | hot_items.setdefault(art_id, 0) 49 | hot_items[art_id] += 1 50 | 51 | sorted_hot_items = sorted(hot_items.items(), key=lambda x: x[1], reverse=True) 52 | 53 | for user_id, ts_set in tqdm(test_users.items()): 54 | for last_clicked_timestamp in ts_set: 55 | items = train_dataset[user_id][last_clicked_timestamp] 56 | clicked_items = get_clicked_items(items) 57 | recommend_items = [] 58 | 59 | for art_id, _ in sorted_hot_items: 60 | if art_id in clicked_items: 61 | continue 62 | 63 | if not _is_recall_target(last_clicked_timestamp, art_id, articles_dic, lag_hour_min=lag_hour_min, lag_hour_max=lag_hour_max): 64 | continue 65 | 66 | recommend_items.append(art_id) 67 | 68 | if len(recommend_items) >= topK: 69 | break 70 | 71 | result.setdefault(user_id, {}) 72 | result[user_id][last_clicked_timestamp] = recommend_items 73 | 74 | print('hot召回处理完毕({}秒) 限制:[{}-{}]'.format('%.2f' % (time.time() - start_time), lag_hour_min, lag_hour_max)) 75 | return result 76 | 77 | def _recall_i2i_30k_sim_items(dataset, test_users, articles_dic, i2i_30k_sim, topK=25): 78 | result = {} 79 | start_time = time.time() 80 | lag_hour_min = 0 81 | lag_hour_max = 27 82 | 83 | for user_id, ts_set in tqdm(test_users.items()): 84 | for last_clicked_timestamp in ts_set: 85 | items = dataset[user_id][last_clicked_timestamp] 86 | clicked_items = get_clicked_items(items) 87 | recommend_items = {} 88 | 89 | for art_id, _ in items: 90 | if art_id not in i2i_30k_sim: 91 | break 92 | 93 | recommand_art_id_list = i2i_30k_sim[art_id]['sorted_keys'] 94 | for recommend_art_id in recommand_art_id_list: 95 | if recommend_art_id in clicked_items: 96 | continue 97 | 98 | if not _is_recall_target(last_clicked_timestamp, art_id, articles_dic, lag_hour_min=lag_hour_min, lag_hour_max=lag_hour_max): 99 | continue 100 | 101 | if i2i_30k_sim[art_id]['related_arts'][recommend_art_id] < 2: 102 | break 103 | 104 | recommend_items.setdefault(recommend_art_id, 0) 105 | recommend_items[recommend_art_id] += (i2i_30k_sim[art_id]['related_arts'][recommend_art_id]) 106 | 107 | result.setdefault(user_id, {}) 108 | result[user_id][last_clicked_timestamp] = [art_id for art_id, _ in sorted(recommend_items.items(), key=lambda x: x[1], reverse=True)[:topK]] 109 | 110 | print('i2i_30k_sim召回处理完毕({}秒) 限制:[{}-{}]'.format('%.2f' % (time.time() - start_time), lag_hour_min, lag_hour_max)) 111 | return result 112 | 113 | def calc_and_recall(dataset, train_dataset, test_users, articles_dic, cpu_cores, offline, answers=None): 114 | sims = _calc_sim(dataset, articles_dic, cpu_cores, offline) 115 | num = len([_recall_hot_items, _recall_i2i_30k_sim_items]) 116 | 117 | start_time = time.time() 118 | print('召回处理开始({}件)'.format(num)) 119 | 120 | recalls = {} 121 | recalls['hot'] = _recall_hot_items(dataset, train_dataset, test_users, articles_dic) 122 | recalls['i2i_30k_sim'] = _recall_i2i_30k_sim_items(train_dataset, test_users, articles_dic, sims['i2i_30k_sim']) 123 | 124 | if offline and answers is not None: 125 | test_users_count = 
np.sum([len(ts_list) for _, ts_list in test_users.items()]) 126 | for recall_name, result in recalls.items(): 127 | accuracy = 0 128 | recall_counts = np.repeat(0, np.max([len(items) for _, ts_list in result.items() for _, items in ts_list.items()])) 129 | for user_id, ts_list in result.items(): 130 | for last_clicked_timestamp, items in ts_list.items(): 131 | if answers[user_id][last_clicked_timestamp] in items: 132 | accuracy += 1 133 | recall_counts[items.index(answers[user_id][last_clicked_timestamp])] += 1 134 | 135 | print('召回处理[{}]的召回率为{}%'.format(recall_name, '%.2f' % (accuracy * 100 / test_users_count))) 136 | print('召回处理[{}]的详细召回命中计数: {}'.format(recall_name, recall_counts)) 137 | 138 | total_accuracy = 0 139 | for user_id, ts_list in test_users.items(): 140 | for last_clicked_timestamp in ts_list: 141 | for _, result in recalls.items(): 142 | if answers[user_id][last_clicked_timestamp] in result[user_id][last_clicked_timestamp]: 143 | total_accuracy += 1 144 | break 145 | 146 | print('所有召回处理的总召回率为{}%'.format('%.2f' % (total_accuracy * 100 / test_users_count))) 147 | 148 | print('召回处理结束({}秒)'.format('%.2f' % (time.time() - start_time))) 149 | 150 | return recalls 151 | -------------------------------------------------------------------------------- /code/test.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | python main.py -------------------------------------------------------------------------------- /code/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python main.py -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | joblib==1.0.0 2 | lightgbm==3.1.1 3 | numpy==1.20.0 4 | pandas==1.2.1 5 | python-dateutil==2.8.1 6 | pytz==2021.1 7 | scikit-learn==0.24.1 8 | scipy==1.6.0 9 | six==1.15.0 10 | threadpoolctl==2.1.0 11 | tqdm==4.56.0 12 | --------------------------------------------------------------------------------
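The ranking stage in main.py feeds LightGBM's LGBMRanker one group per user: prepare_dataset sorts the candidate rows by user_id and passes the per-user row counts as `group`, and make_recommend_dict reads the top-5 scored articles back out per user. The sketch below illustrates that group bookkeeping on hypothetical toy candidates using just two of the generated features; the column names follow train.csv, everything else is made up.

```python
import numpy as np
import pandas as pd
import lightgbm as lgb

# Toy candidate table: 8 hypothetical candidate articles per user, two numeric features.
rng = np.random.default_rng(0)
n_users, n_cands = 50, 8
df = pd.DataFrame({
    'user_id': np.repeat(np.arange(n_users), n_cands),
    'article_id': rng.integers(0, 1000, n_users * n_cands),
    'words_count': rng.integers(100, 400, n_users * n_cands),
    'lag_period_last_article': rng.integers(0, 10**8, n_users * n_cands),
    'answer': 0,
})
df.loc[df.groupby('user_id').head(1).index, 'answer'] = 1  # one "clicked" answer per user

# Group sizes = candidates per user; rows must be sorted so each group is contiguous.
df = df.sort_values('user_id')
groups = df.groupby('user_id').size().values
features = ['words_count', 'lag_period_last_article']

ranker = lgb.LGBMRanker(n_estimators=50, random_state=777)
ranker.fit(df[features], df['answer'], group=groups)

# Score every candidate and keep the 5 best per user, as make_recommend_dict does.
df['pred'] = ranker.predict(df[features])
top5 = (df.sort_values('pred', ascending=False)
          .groupby('user_id')['article_id']
          .apply(lambda s: list(s.head(5))))
print(top5.head())
```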
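For offline scoring, calc_mrr_and_hit in main.py compares each user's top-5 recommendation list against the single held-out answer: MRR@5 averages the reciprocal rank of the answer (0 when it is missed) over all users, and HIT@5 is the share of users whose answer appears at all. A small hand-checkable example, assuming it is run from the code/ directory:

```python
from main import calc_mrr_and_hit  # safe to import: the pipeline only runs under if __name__ == "__main__"

recommend_dict = {1: [10, 30, 20, 40, 50],   # answer 30 sits at rank 2 -> reciprocal rank 1/2
                  2: [11, 12, 13, 14, 15]}   # answer 99 is missed      -> contributes 0
answers = {1: 30, 2: 99}

mrr, hit, detail = calc_mrr_and_hit(recommend_dict, answers)
print(mrr, hit)  # 0.25 0.5, i.e. MRR = (1/2 + 0) / 2 and HIT = 1 / 2
print(detail)    # per-rank hit rates; here only rank 2 scores: [0. 0.5 0. 0. 0.]
```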