# Repository layout:
#
# ├── Code
#     ├── ItemCF.py
#     ├── ItemCF_main.py
#     ├── UserCF.py
#     ├── UserCF_main.py
#     ├── __init__.py
#     ├── evaluate.py
#     ├── sim_matrix
#     │   ├── i2i_sim_10000_10.pkl
#     │   └── u2u_sim_10000_10.pkl
#     └── utils.py
# └── README.md
#
# /Code/ItemCF.py:
# --------------------------------------------------------------------------------
"""
Created on Mar 19, 2020
Updated on Jan 25, 2021

Item-based collaborative filtering

@author: Gengziyao(zggzy1996@163.com)
"""
import math
from collections import defaultdict

try:
    from tqdm import tqdm
except ImportError:
    def tqdm(iterable, *args, **kwargs):
        """Fallback when tqdm is not installed: iterate without a progress bar."""
        return iterable


class ItemCF:
    def __init__(self, user_item_dict, item_hot_list, sim_item_topK, topN, i2i_sim=None):
        """
        Item-based collaborative filtering

        :param user_item_dict: A dict. {user1: [(item1, score),...], user2: ...}
        :param item_hot_list: A list. The popular movies list, most popular first.
        :param sim_item_topK: A scalar. Number of most similar items used per history item.
        :param topN: A scalar. The length of the recommendation list.
        :param i2i_sim: dict. {item_i: {item_j: weight, ...}, ...}.
            If None, the similarity matrix is calculated from user_item_dict.
        """
        self.user_item_dict = user_item_dict
        self.item_hot_list = item_hot_list
        self.sim_item_topK = sim_item_topK
        self.topN = topN
        self.i2i_sim = self.__get_item_sim() if i2i_sim is None else i2i_sim

    def __get_item_sim(self):
        """
        calculate item similarity weight matrix

        Every pair of distinct items in one user's history adds 1 / log(1 + len(history))
        to the pair's weight; the sum is then divided by sqrt(cnt_i * cnt_j), where cnt_x
        is the number of histories containing x.

        :return: i2i_sim, a dict {item_i: {item_j: weight, ...}, ...}
        """
        i2i_sim = dict()
        item_cnt = defaultdict(int)  # Count the number of visits to the item
        for items in tqdm(self.user_item_dict.values()):
            if not items:
                continue  # nothing to pair up; also avoids 1 / log(1) below
            # The weight only depends on the history length, so compute it once per user.
            # Long histories contribute less: this punishes very active users.
            weight = 1 / math.log(len(items) + 1)
            for i, _ in items:
                item_cnt[i] += 1
                related = i2i_sim.setdefault(i, {})
                for j, _ in items:
                    if i == j:
                        continue
                    related[j] = related.get(j, 0) + weight
        for i, related_items in i2i_sim.items():
            for j, wij in related_items.items():
                # Cosine-style normalisation (values are replaced, keys are untouched).
                related_items[j] = wij / math.sqrt(item_cnt[i] * item_cnt[j])
        return i2i_sim

    def recommend(self, user_id):
        """
        recommend one user

        :param user_id: user's ID. Users without training history only receive
            popular items.
        :return: A list of at most topN item ids, best first. Items already in the
            user's history are never returned.
        """
        item_rank = dict()
        user_hist_items = self.user_item_dict.get(user_id, [])
        # The history holds (item, score) tuples, so membership has to be tested
        # against the item ids, not against the tuples themselves.
        seen_items = {item for item, _ in user_hist_items}
        for i, _ in user_hist_items:
            neighbours = self.i2i_sim.get(i, {})
            top_k = sorted(neighbours.items(), key=lambda x: x[1], reverse=True)[:self.sim_item_topK]
            for j, wij in top_k:
                if j in seen_items:
                    continue
                item_rank[j] = item_rank.get(j, 0) + wij

        if len(item_rank) < self.topN:
            # Not enough candidates: pad with popular items the user has not seen.
            for pos, item in enumerate(self.item_hot_list):
                if item in item_rank or item in seen_items:
                    continue
                item_rank[item] = - pos - 1  # rank score < 0, keeps the popularity order
                if len(item_rank) == self.topN:
                    break
        ranked = sorted(item_rank.items(), key=lambda x: x[1], reverse=True)[:self.topN]

        return [item for item, _ in ranked]

    def recommend_all(self, test):
        """
        recommend all user of test raw_data

        :param test: A dict (or any iterable of user ids) whose keys are the users.
        :return: A defaultdict {user: [item1, item2, ...]}
        """
        user_recall_items = defaultdict(list)
        for user in tqdm(test):
            user_recall_items[user] = self.recommend(user)
        return user_recall_items


# --------------------------------------------------------------------------------
# /Code/ItemCF_main.py:
# --------------------------------------------------------------------------------
"""
Created on Jan 25, 2021

A simple start of ItemCF

@author: Gengziyao(zggzy1996@163.com)
"""
import os
import pickle
from time import time


def main():
    """Build (or load) the item similarity matrix and evaluate ItemCF on ml-1m."""
    # Explicit project imports instead of ``from X import *``; they are local to the
    # script entry point so importing this module does not need the sibling modules.
    from ItemCF import ItemCF
    from evaluate import evaluate_model
    from utils import load_ml_1m

    # ========================= Hyper Parameters =======================
    file = '../dataset/ml-1m/ratings.dat'
    trans_score = 1
    sample_num = 10000  # sample_num = -1 can load all data

    sim_item_topK = 10
    topN = 100
    path = 'sim_matrix/i2i_sim_' + str(sample_num) + '_' + str(sim_item_topK) + '.pkl'
    # ========================== load dataset ===========================
    user_item_dict, _, item_hot_list, test_data = load_ml_1m(
        file, topN, trans_score=trans_score, sample_num=sample_num)
    # =========================== i2i_sim ===============================
    # if you store the similarity matrix, you can load it to model
    i2i_sim = None
    if os.path.exists(path):
        # NOTE: unpickling can execute arbitrary code; only load caches written by this script.
        with open(path, 'rb') as cache:
            i2i_sim = pickle.load(cache)
    print("================ Build Model =================")
    t1 = time()
    model = ItemCF(user_item_dict, item_hot_list, sim_item_topK, topN, i2i_sim)
    if i2i_sim is None:
        # Create the cache directory on first run instead of failing in open().
        os.makedirs(os.path.dirname(path), exist_ok=True)
        with open(path, 'wb') as cache:
            pickle.dump(model.i2i_sim, cache)
    t2 = time()
    # =========================== recommend ===============================
    # e.g. item_rank = model.recommend(1)
    print("================== Evaluate ==================")
    hr, ndcg = evaluate_model(model, test_data)
    t3 = time()
    print('Calculate similarity matrix [%d s], Evaluate[%d s]: HR = %f, NDCG = %f'
          % (t2 - t1, t3 - t2, hr, ndcg))


if __name__ == '__main__':
    main()
# --------------------------------------------------------------------------------
# /Code/UserCF.py:
# --------------------------------------------------------------------------------
"""
Created on Mar 18, 2020
Updated on Jan 25, 2021

User-based collaborative filtering

@author: Gengziyao(zggzy1996@163.com)
"""
import math
from collections import defaultdict

try:
    from tqdm import tqdm
except ImportError:
    def tqdm(iterable, *args, **kwargs):
        """Fallback when tqdm is not installed: iterate without a progress bar."""
        return iterable


class UserCF:
    def __init__(self, user_item_dict, item_user_dict, item_hot_list, sim_user_topK, topN, u2u_sim=None):
        """
        User-based collaborative filtering

        :param user_item_dict: A dict. {user1: [(item1, score),...], user2: ...}
        :param item_user_dict: A dict. {item1: [(user1, score),...], item2: ...}
        :param item_hot_list: A list. The popular movies list, most popular first.
        :param sim_user_topK: A scalar. Number of most similar users used for a recommendation.
        :param topN: A scalar. The length of the recommendation list.
        :param u2u_sim: dict. {user_i: {user_j: weight, ...}, ...}.
            If None, the similarity matrix is calculated from item_user_dict.
        """
        self.user_item_dict = user_item_dict
        self.item_user_dict = item_user_dict
        self.item_hot_list = item_hot_list
        self.sim_user_topK = sim_user_topK
        self.topN = topN
        self.u2u_sim = self.__get_user_sim() if u2u_sim is None else u2u_sim

    def __get_user_sim(self):
        """
        calculate user similarity weight matrix

        Every pair of distinct users who share an item adds 1 / log(1 + len(users of
        that item)) to the pair's weight; the sum is then divided by
        sqrt(cnt_i * cnt_j), where cnt_x is the number of items user x appears under.

        :return: u2u_sim, a dict {user_i: {user_j: weight, ...}, ...}
        """
        u2u_sim = dict()
        user_cnt = defaultdict(int)  # Count the number of items of the user
        for users in tqdm(self.item_user_dict.values()):
            if not users:
                continue  # nothing to pair up; also avoids 1 / log(1) below
            # The weight only depends on the audience size, so compute it once per item.
            # Large audiences contribute less: this punishes very popular items.
            weight = 1 / math.log(len(users) + 1)
            for i, _ in users:
                user_cnt[i] += 1
                related = u2u_sim.setdefault(i, {})
                for j, _ in users:
                    if i == j:
                        continue
                    related[j] = related.get(j, 0) + weight
        for i, related_users in u2u_sim.items():
            for j, wij in related_users.items():
                # Cosine-style normalisation (values are replaced, keys are untouched).
                related_users[j] = wij / math.sqrt(user_cnt[i] * user_cnt[j])
        return u2u_sim

    def recommend(self, user_id):
        """
        recommend one user

        :param user_id: user's ID. Users without training history or without similar
            users only receive popular items.
        :return: A list of at most topN item ids, best first. Items already in the
            user's history are never returned.
        """
        item_rank = dict()
        user_hist_items = self.user_item_dict.get(user_id, [])
        # The history holds (item, score) tuples, so membership has to be tested
        # against the item ids, not against the tuples themselves.
        seen_items = {item for item, _ in user_hist_items}
        neighbours = self.u2u_sim.get(user_id, {})
        top_k = sorted(neighbours.items(), key=lambda x: x[1], reverse=True)[:self.sim_user_topK]
        for i, wij in top_k:
            for j, _ in self.user_item_dict.get(i, []):
                if j in seen_items:
                    continue
                item_rank[j] = item_rank.get(j, 0) + wij

        if len(item_rank) < self.topN:
            # Not enough candidates: pad with popular items the user has not seen.
            for pos, item in enumerate(self.item_hot_list):
                if item in item_rank or item in seen_items:
                    continue
                item_rank[item] = - pos - 1  # rank score < 0, keeps the popularity order
                if len(item_rank) == self.topN:
                    break
        ranked = sorted(item_rank.items(), key=lambda x: x[1], reverse=True)[:self.topN]

        return [item for item, _ in ranked]

    def recommend_all(self, test):
        """
        recommend all user of test raw_data

        :param test: A dict (or any iterable of user ids) whose keys are the users.
        :return: A defaultdict {user: [item1, item2, ...]}
        """
        user_recall_items = defaultdict(list)
        for user in tqdm(test):
            user_recall_items[user] = self.recommend(user)
        return user_recall_items


# --------------------------------------------------------------------------------
# /Code/UserCF_main.py:
# --------------------------------------------------------------------------------
"""
Created on Jan 25, 2021

A simple start of UserCF

@author: Gengziyao(zggzy1996@163.com)
"""
import os
import pickle
from time import time


def main():
    """Build (or load) the user similarity matrix and evaluate UserCF on ml-1m."""
    # Explicit project imports instead of ``from X import *``; they are local to the
    # script entry point so importing this module does not need the sibling modules.
    from UserCF import UserCF
    from evaluate import evaluate_model
    from utils import load_ml_1m

    # ========================= Hyper Parameters =======================
    file = '../dataset/ml-1m/ratings.dat'
    trans_score = 1
    sample_num = 10000  # sample_num = -1 can load all data

    sim_user_topK = 10
    topN = 100
    path = 'sim_matrix/u2u_sim_' + str(sample_num) + '_' + str(sim_user_topK) + '.pkl'
    # ========================== load dataset ===========================
    user_item_dict, item_user_dict, item_hot_list, test_data = load_ml_1m(
        file, topN, trans_score=trans_score, sample_num=sample_num)
    # =========================== u2u_sim ===============================
    # if you store the similarity matrix, you can load it to model
    u2u_sim = None
    if os.path.exists(path):
        # NOTE: unpickling can execute arbitrary code; only load caches written by this script.
        with open(path, 'rb') as cache:
            u2u_sim = pickle.load(cache)
    print("================ Build Model =================")
    t1 = time()
    model = UserCF(user_item_dict, item_user_dict, item_hot_list, sim_user_topK, topN, u2u_sim)
    if u2u_sim is None:
        # Create the cache directory on first run instead of failing in open().
        os.makedirs(os.path.dirname(path), exist_ok=True)
        with open(path, 'wb') as cache:
            pickle.dump(model.u2u_sim, cache)
    t2 = time()
    # =========================== recommend ===============================
    # e.g. item_rank = model.recommend(1)
    print("================== Evaluate ==================")
    hr, ndcg = evaluate_model(model, test_data)
    t3 = time()
    print('Calculate similarity matrix [%d s], Evaluate[%d s]: HR = %f, NDCG = %f'
          % (t2 - t1, t3 - t2, hr, ndcg))


if __name__ == '__main__':
    main()


# --------------------------------------------------------------------------------
# /Code/__init__.py:
# --------------------------------------------------------------------------------
# https://raw.githubusercontent.com/ZiyaoGeng/SimpleCF/4d05a9ad427ff77df93f71239e6c796c9070c119/Code/__init__.py
# --------------------------------------------------------------------------------
# /Code/evaluate.py:
# --------------------------------------------------------------------------------
"""
Created on Mar 18, 2020
Updated on Jan 25, 2021

Evaluation indicators

@author: Gengziyao(zggzy1996@163.com)
"""
import numpy as np

try:
    from tqdm import tqdm
except ImportError:
    def tqdm(iterable, *args, **kwargs):
        """Fallback when tqdm is not installed: iterate without a progress bar."""
        return iterable


def evaluate_model(model, test):
    """
    evaluate model

    :param model: model of CF. Must provide recommend(user_id) returning a list of items.
    :param test: dict. {user_id: held-out item_id}
    :return: hit rate, ndcg (both averaged over len(test); (0.0, 0.0) for an empty test)
    """
    if not test:
        return 0.0, 0.0  # avoid dividing by zero when there is nothing to evaluate
    hit, ndcg = 0, 0
    for user_id, item_id in tqdm(test.items()):
        item_rank = model.recommend(user_id)
        if item_id in item_rank:
            hit += 1
            # One relevant item per user: DCG is 1 / log2(rank + 1) with 1-based rank.
            ndcg += 1 / np.log2(item_rank.index(item_id) + 2)
    return hit / len(test), ndcg / len(test)


# --------------------------------------------------------------------------------
# /Code/sim_matrix/i2i_sim_10000_10.pkl:
# --------------------------------------------------------------------------------
# https://raw.githubusercontent.com/ZiyaoGeng/SimpleCF/4d05a9ad427ff77df93f71239e6c796c9070c119/Code/sim_matrix/i2i_sim_10000_10.pkl
# --------------------------------------------------------------------------------
# /Code/sim_matrix/u2u_sim_10000_10.pkl:
# --------------------------------------------------------------------------------
# https://raw.githubusercontent.com/ZiyaoGeng/SimpleCF/4d05a9ad427ff77df93f71239e6c796c9070c119/Code/sim_matrix/u2u_sim_10000_10.pkl
# --------------------------------------------------------------------------------
# /Code/utils.py:
# --------------------------------------------------------------------------------
"""
Created on Mar 18, 2020
Updated on Jan 25, 2021

load ml-1m

@author: Gengziyao(zggzy1996@163.com)
"""
from collections import defaultdict

import numpy as np
import pandas as pd

try:
    from tqdm import tqdm
except ImportError:
    def tqdm(iterable, *args, **kwargs):
        """Fallback when tqdm is not installed: iterate without a progress bar."""
        return iterable


def _group_interactions(df, key_col, member_col):
    """Return {key: [(member, score), ...]} in row order, with keys sorted ascending."""
    grouped = defaultdict(list)
    rows = zip(df[key_col].tolist(), df[member_col].tolist(), df['score'].tolist())
    for key, member, score in rows:
        grouped[key].append((member, score))
    return dict(sorted(grouped.items(), key=lambda kv: kv[0]))


def load_ml_1m(file, topN, trans_score=2, sample_num=-1):
    """
    Load ratings and split them leave-one-out: each user's last interaction
    (by Timestamp) becomes test data, everything else is training data.

    :param file: A string. dataset path ('::'-separated user_id, item_id, score, Timestamp).
    :param topN: A scalar. The number of recommender list; also the length of item_hot_list.
    :param trans_score: A scalar. Rows with score < trans_score are dropped
        (the remaining scores are kept as they are).
    :param sample_num: A scalar. The number of leading rows to read; values <= 0 read all data.
    :return: user_item_dict, item_user_dict, item_hot_list, test_data
        user_item_dict: {user: [(item, score), ...]} built from the training rows
        item_user_dict: {item: [(user, score), ...]} built from the training rows
        item_hot_list: the topN most frequent training items, most frequent first
        test_data: {user: held-out item}
    """
    print('========== Data Preprocess Start =============')
    names = ['user_id', 'item_id', 'score', 'Timestamp']
    # nrows reads the same leading rows as iterator + get_chunk, without leaving a
    # reader open.
    nrows = sample_num if sample_num > 0 else None
    data_df = pd.read_csv(file, sep="::", engine='python', header=None, names=names, nrows=nrows)
    data_df = data_df[data_df.score >= trans_score]  # trans score
    data_df = data_df.sort_values(by=['user_id', 'Timestamp'])  # sort

    # use last interacted movie for each user
    # (one vectorised call instead of growing a frame with pd.concat per user;
    # it also keeps the columns when no row survives the score filter)
    test_df = data_df.groupby('user_id').tail(1)
    train_df = data_df.drop(index=test_df.index)
    # user_item_dict
    user_item_dict = _group_interactions(train_df, 'user_id', 'item_id')
    # item_user_dict
    item_user_dict = _group_interactions(train_df, 'item_id', 'user_id')
    # hot_item_list
    item_hot_list = train_df['item_id'].value_counts().index[:topN].tolist()
    test_data = dict(zip(test_df['user_id'].tolist(), test_df['item_id'].tolist()))
    return user_item_dict, item_user_dict, item_hot_list, test_data


# --------------------------------------------------------------------------------
# /README.md:
# --------------------------------------------------------------------------------
# # SimpleCF
# Collaborative filtering based on user behavior is one of the most well-known,
# influential and widely used models in the recommendation world. Therefore, I share
# my code (ItemCF and UserCF) for a simple start.
#
#
#
# ## Code catalogue
#
# ```csharp
# .
# ├── dataset
# │   └── ml-1m
# │       └── ratings.dat
# ├── sim_matrix
# │   ├── i2i***.pkl
# │   └── u2u***.pkl
# ├── ItemCF.py
# ├── ItemCF_main.py
# ├── UserCF.py
# ├── UserCF_main.py
# ├── __init__.py
# ├── evaluate.py
# └── utils.py
# ```
#
#
#
# ## Details
#
# **Dataset:** [ml-1m dataset](https://grouplens.org/datasets/movielens/); `utils.py`
# contains the details of the dataset processing.
#
# --------------------------------------------------------------------------------