├── .gitignore ├── LICENSE ├── README.md ├── chapter2-Algorithms-user behavior based ├── CF │ ├── Dataset.py │ ├── __init__.py │ ├── chapter2_itemCF_backup.py │ ├── chapter2_userCF_backup.py │ ├── item-IUF.py │ ├── item_CF.py │ ├── item_CF_norm.py │ ├── metrics.py │ ├── readme.txt │ ├── recom_popular.py │ ├── recom_random.py │ ├── train_itemCFs.py │ ├── train_userCFs.py │ ├── user_CF.py │ └── user_IIF.py ├── LFM │ ├── Dataset.py │ ├── LFM-backup.py │ ├── __init__.py │ ├── metrics.py │ ├── readme.txt │ └── train-LFM.py └── PersonalRank │ ├── Dataset.py │ ├── __init__.py │ ├── metrics.py │ ├── personalrank_example.py │ └── train_PersonalRank.py ├── chapter3_cold_start ├── Dataset.py ├── Metrics.py ├── __init__.py ├── train_item_inf_coldstart.py └── train_reg_inf_coldstart.py ├── chapter4_tags_based └── train_user_tags_based.py ├── chapter5_context_inf ├── Dataset.py ├── Metric.py ├── __init__.py └── train_context_inf.py ├── chapter6_socail_network └── train_social_network.py ├── chapter8_score_pre ├── __init__.py ├── train_cascade_model.py └── train_score_pre.py └── images ├── TopN推荐.png ├── 推荐系统.png ├── 推荐系统架构.png └── 评分预测推荐.png /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 wyw 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Recommended-system-practice 2 | 项亮等《推荐系统实践》练习 3 | 4 | 5 | ![Image text](https://github.com/wangyuyunmu/Recommended-system-practice/blob/master/images/%E6%8E%A8%E8%8D%90%E7%B3%BB%E7%BB%9F.png) 6 | 7 | ![Image text](https://github.com/wangyuyunmu/Recommended-system-practice/blob/master/images/TopN%E6%8E%A8%E8%8D%90.png) 8 | 9 | ![Image text](https://github.com/wangyuyunmu/Recommended-system-practice/blob/master/images/%E8%AF%84%E5%88%86%E9%A2%84%E6%B5%8B%E6%8E%A8%E8%8D%90.png) 10 | 11 | ![Image text](https://github.com/wangyuyunmu/Recommended-system-practice/blob/master/images/%E6%8E%A8%E8%8D%90%E7%B3%BB%E7%BB%9F%E6%9E%B6%E6%9E%84.png) 12 | 13 | 14 | 其中所涉及到的数据集: 15 | 16 | MovieLens: http://www.grouplens.org/node/73 17 | 18 | delicious-2k: http://files.grouplens.org/datasets/hetrec2011/hetrec2011-delicious-2k.zip 19 | 20 | lastfm-dataset-360K: http://mtg.upf.edu/static/datasets/last.fm/lastfm-dataset-360K.tar.gz 21 | 22 | slashdot: http://snap.stanford.edu/data/soc-Slashdot0902.txt.gz 23 | 24 | epinions: http://snap.stanford.edu/data/soc-Epinions1.txt.gz 25 | 26 | lastfm-dataset-360K大概500M,其他均比较小 27 | 28 | slashdot,epinions是关于社交关系的,内容比较简单 29 | -------------------------------------------------------------------------------- /chapter2-Algorithms-user behavior based/CF/Dataset.py: -------------------------------------------------------------------------------- 1 | import random 2 | import time 3 | 4 | # 定义装饰器,监控运行时间 5 | def timmer(func): 6 | def wrapper(*args, **kwargs): 7 | start_time = time.time() 8 | res = func(*args, **kwargs) 9 | stop_time = time.time() 10 | print('Func %s, run time: %s' % (func.__name__, stop_time - start_time)) 11 | return res 12 | return wrapper 13 | 14 | # shju 处理相关 load data/split data 15 | class Dataset(): 16 | 17 | def __init__(self, fp): 18 | # fp: data file 
path 19 | self.data = self.loadData(fp) 20 | 21 | @timmer 22 | def loadData(self, fp): 23 | data = [] 24 | for l in open(fp): 25 | data.append(tuple(map(int, l.strip().split('::')[:2]))) 26 | return data 27 | 28 | @timmer 29 | def splitData(self, M, k, seed=1): 30 | ''' 31 | :params: data, 加载的所有(user, item)数据条目 32 | :params: M, 划分的数目,最后需要取M折的平均 33 | :params: k, 本次是第几次划分,k~[0, M) 34 | :params: seed, random的种子数,对于不同的k应设置成一样的 35 | :return: train, test 36 | ''' 37 | train, test = [], [] 38 | random.seed(seed) 39 | for user, item in self.data: 40 | if random.randint(0, M - 1) == k: 41 | test.append((user, item)) 42 | else: 43 | train.append((user, item)) 44 | 45 | # 处理成字典的形式,user->set(items) 46 | def convert_dict(data): 47 | data_dict = {} 48 | for user, item in data: 49 | if user not in data_dict: 50 | data_dict[user] = set() 51 | data_dict[user].add(item) 52 | data_dict = {k: list(data_dict[k]) for k in data_dict} 53 | return data_dict 54 | 55 | return convert_dict(train), convert_dict(test) -------------------------------------------------------------------------------- /chapter2-Algorithms-user behavior based/CF/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangyuyunmu/Recommended-system-practice/c436b881f6fc7ae8ddc1f6927bd9dd07bf8d66e3/chapter2-Algorithms-user behavior based/CF/__init__.py -------------------------------------------------------------------------------- /chapter2-Algorithms-user behavior based/CF/chapter2_itemCF_backup.py: -------------------------------------------------------------------------------- 1 | # 导入包 2 | import random 3 | import math 4 | import time 5 | from tqdm import tqdm 6 | 7 | 8 | # 定义装饰器,监控运行时间 9 | def timmer(func): 10 | def wrapper(*args, **kwargs): 11 | start_time = time.time() 12 | res = func(*args, **kwargs) 13 | stop_time = time.time() 14 | print('Func %s, run time: %s' % (func.__name__, stop_time - start_time)) 15 | return res 16 | return 
# Timing decorator applied to the Dataset methods below.
def timmer(func):
    def wrapper(*args, **kwargs):
        start_time = time.time()
        res = func(*args, **kwargs)
        stop_time = time.time()
        print('Func %s, run time: %s' % (func.__name__, stop_time - start_time))
        return res
    return wrapper


class Dataset():
    '''Load (user, item) pairs from a '::'-separated file and split them into train/test.'''

    def __init__(self, fp):
        # fp: data file path
        self.data = self.loadData(fp)

    @timmer
    def loadData(self, fp):
        '''
        Read the data file and keep the first two '::'-separated integer fields.

        :params: fp, data file path
        :return: list of (user, item) tuples, one per non-blank line
        '''
        data = []
        # The context manager closes the file even if a line fails to parse.
        with open(fp) as f:
            for line in f:
                line = line.strip()
                if not line:
                    # A blank line would otherwise crash int('').
                    continue
                data.append(tuple(map(int, line.split('::')[:2])))
        return data

    @timmer
    def splitData(self, M, k, seed=1):
        '''
        :params: M, number of folds; results should be averaged over the M folds
        :params: k, index of the fold used as test set, k in [0, M)
        :params: seed, seed for random; must be the same for every k
        :return: train, test, each a dict of user -> list of distinct items
        '''
        train, test = [], []
        # Seeds the module-level generator on purpose: with the same seed every
        # k sees the same draw sequence, so the M test sets partition the data.
        random.seed(seed)
        for user, item in self.data:
            # Differs from the book: randint includes both end points, so the
            # upper bound has to be M - 1.
            if random.randint(0, M - 1) == k:
                test.append((user, item))
            else:
                train.append((user, item))

        # Convert the pair list to a dict, user -> list of distinct items.
        def convert_dict(data):
            data_dict = {}
            for user, item in data:
                data_dict.setdefault(user, set()).add(item)
            return {user: list(items) for user, items in data_dict.items()}

        return convert_dict(train), convert_dict(test)


class Metric():
    '''Evaluate a recommender with precision, recall, coverage and popularity.'''

    def __init__(self, train, test, GetRecommendation):
        '''
        :params: train, training data, dict of user -> list of items
        :params: test, test data, dict of user -> list of items
        :params: GetRecommendation, function returning a list of (item, score)
                 recommendations for one user
        '''
        self.train = train
        self.test = test
        self.GetRecommendation = GetRecommendation
        self.recs = self.getRec()

    # Compute the recommendations once for every user in the test set.
    def getRec(self):
        return {user: self.GetRecommendation(user) for user in self.test}

    # Number of recommended items that appear in the user's test items.
    def _hits(self):
        hit = 0
        for user, items in self.test.items():
            test_items = set(items)
            hit += sum(1 for item, _ in self.recs[user] if item in test_items)
        return hit

    # Precision in percent; 0.0 when nothing was recommended.
    def precision(self):
        total = sum(len(self.recs[user]) for user in self.test)
        if total == 0:
            return 0.0
        return round(self._hits() / total * 100, 2)

    # Recall in percent; 0.0 when the test set holds no items.
    def recall(self):
        total = sum(len(set(items)) for items in self.test.values())
        if total == 0:
            return 0.0
        return round(self._hits() / total * 100, 2)

    # Coverage in percent: distinct recommended items over the distinct
    # training items of the test users.
    def coverage(self):
        all_item, recom_item = set(), set()
        for user in self.test:
            # A test user may have no training data at all (cold start).
            all_item.update(self.train.get(user, ()))
            recom_item.update(item for item, _ in self.recs[user])
        if not all_item:
            return 0.0
        return round(len(recom_item) / len(all_item) * 100, 2)

    # Average log-popularity of the recommended items (novelty indicator).
    def popularity(self):
        # Popularity of an item = number of training users holding it.
        item_pop = {}
        for items in self.train.values():
            for item in items:
                item_pop[item] = item_pop.get(item, 0) + 1

        num, pop = 0, 0
        for user in self.test:
            for item, _ in self.recs[user]:
                # The logarithm keeps very popular (long-tail head) items from
                # dominating; items unseen in training count as popularity 0.
                pop += math.log(1 + item_pop.get(item, 0))
                num += 1
        if num == 0:
            return 0.0
        return round(pop / num, 6)

    def eval(self):
        '''Compute all four metrics, print them and return them as a dict.'''
        metric = {'Precision': self.precision(),
                  'Recall': self.recall(),
                  'Coverage': self.coverage(),
                  'Popularity': self.popularity()}
        print('Metric:', metric)
        return metric


# 1.
# Item-based recommendation using cosine similarity
def ItemCF(train, K, N):
    '''
    Build an item-based collaborative-filtering recommender.

    The similarity of two items is their co-occurrence count divided by
    sqrt(num[u] * num[v]), where num counts the users holding each item.

    :params: train, training data, dict of user -> list of items
    :params: K, hyper-parameter, number of most similar items used per item
    :params: N, hyper-parameter, maximum number of recommended items
    :return: GetRecommendation, function user -> list of (item, score) sorted
             by descending score; a user missing from train gets []
    '''
    # Count item occurrences and item-item co-occurrences.
    sim = {}
    num = {}
    for items in train.values():
        for i, u in enumerate(items):
            num[u] = num.get(u, 0) + 1
            row = sim.setdefault(u, {})
            for j, v in enumerate(items):
                if j == i:
                    continue
                row[v] = row.get(v, 0) + 1

    # Turn the co-occurrence counts into cosine similarities.
    for u, row in sim.items():
        for v in row:
            row[v] /= math.sqrt(num[u] * num[v])

    # Keep each item's K nearest neighbours, most similar first. Slicing once
    # here gives the same result as slicing on every query.
    top_k_sim = {u: sorted(row.items(), key=lambda x: x[1], reverse=True)[:K]
                 for u, row in sim.items()}

    # Recommendation interface
    def GetRecommendation(user):
        # Cold-start users (absent from train) simply get no recommendation
        # instead of raising KeyError.
        user_items = train.get(user, ())
        seen_items = set(user_items)
        scores = {}
        for item in user_items:
            for u, weight in top_k_sim.get(item, ()):
                if u not in seen_items:
                    scores[u] = scores.get(u, 0) + weight
        return sorted(scores.items(), key=lambda x: x[1], reverse=True)[:N]

    return GetRecommendation


# 2.
# Item-based recommendation using the improved (IUF) cosine similarity
def ItemIUF(train, K, N):
    '''
    Build an item-based recommender with inverse-user-frequency weighting.

    Compared with ItemCF, every co-occurrence contributed by a user is
    weighted by 1 / log(1 + number of items of that user), so very active
    users contribute less to item similarity.

    :params: train, training data, dict of user -> list of items
    :params: K, hyper-parameter, number of most similar items used per item
    :params: N, hyper-parameter, maximum number of recommended items
    :return: GetRecommendation, function user -> list of (item, score) sorted
             by descending score; a user missing from train gets []
    '''
    # Accumulate item counts and weighted item-item co-occurrences.
    sim = {}
    num = {}
    for items in train.values():
        if not items:
            # Nothing to count, and log(1 + 0) == 0 must not be divided by.
            continue
        # The IUF weight depends only on the user, so compute it once.
        weight = 1 / math.log(1 + len(items))
        for i, u in enumerate(items):
            num[u] = num.get(u, 0) + 1
            row = sim.setdefault(u, {})
            for j, v in enumerate(items):
                if j == i:
                    continue
                # This weighting is the only difference from ItemCF.
                row[v] = row.get(v, 0) + weight

    # Normalise by the item popularities, as in cosine similarity.
    for u, row in sim.items():
        for v in row:
            row[v] /= math.sqrt(num[u] * num[v])

    # Keep each item's K nearest neighbours, most similar first.
    top_k_sim = {u: sorted(row.items(), key=lambda x: x[1], reverse=True)[:K]
                 for u, row in sim.items()}

    # Recommendation interface
    def GetRecommendation(user):
        # Cold-start users (absent from train) get an empty list instead of
        # raising KeyError.
        user_items = train.get(user, ())
        seen_items = set(user_items)
        scores = {}
        for item in user_items:
            for u, w in top_k_sim.get(item, ()):
                # Items the user already has are never recommended.
                if u not in seen_items:
                    scores[u] = scores.get(u, 0) + w
        return sorted(scores.items(), key=lambda x: x[1], reverse=True)[:N]

    return GetRecommendation


# 3.
# Item-based recommendation using row-normalised cosine similarity
def ItemCF_Norm(train, K, N):
    '''
    Build an item-based recommender whose similarity rows are normalised.

    Similarities are computed as in ItemCF, then every item's row is divided
    by the sum of that row.

    :params: train, training data, dict of user -> list of items
    :params: K, hyper-parameter, number of most similar items used per item
    :params: N, hyper-parameter, maximum number of recommended items
    :return: GetRecommendation, function user -> list of (item, score) sorted
             by descending score; a user missing from train gets []
    '''
    # Count item occurrences and item-item co-occurrences.
    sim = {}
    num = {}
    for items in train.values():
        for i, u in enumerate(items):
            num[u] = num.get(u, 0) + 1
            row = sim.setdefault(u, {})
            for j, v in enumerate(items):
                if j == i:
                    continue
                row[v] = row.get(v, 0) + 1

    # Turn the co-occurrence counts into cosine similarities.
    for u, row in sim.items():
        for v in row:
            row[v] /= math.sqrt(num[u] * num[v])

    # Normalise the similarity matrix row by row (each row sums to 1).
    for row in sim.values():
        s = sum(row.values())
        if s > 0:
            for v in row:
                row[v] /= s

    # Keep each item's K nearest neighbours, most similar first.
    top_k_sim = {u: sorted(row.items(), key=lambda x: x[1], reverse=True)[:K]
                 for u, row in sim.items()}

    # Recommendation interface
    def GetRecommendation(user):
        # Cold-start users (absent from train) get an empty list instead of
        # raising KeyError.
        user_items = train.get(user, ())
        seen_items = set(user_items)
        scores = {}
        for item in user_items:
            for u, w in top_k_sim.get(item, ()):
                if u not in seen_items:
                    scores[u] = scores.get(u, 0) + w
        return sorted(scores.items(), key=lambda x: x[1], reverse=True)[:N]

    return GetRecommendation


class Experiment():
    '''Run one item-based algorithm over M train/test splits and average the metrics.'''

    # The default path is a raw string: the previous literal relied on invalid
    # escape sequences (\P, \p, \d, \m). The resulting value is unchanged.
    def __init__(self, M, K, N, fp=r'E:\PythonWorkSpace\pycharm\data\movies_data\ratings.dat', rt='ItemCF'):
        '''
        :params: M, number of experiments (folds) to run
        :params: K, number of most similar items (TopK)
        :params: N, number of recommended items (TopN)
        :params: fp, path of the data file
        :params: rt, recommendation algorithm, a key of self.alg
        '''
        self.M = M
        self.K = K
        self.N = N
        self.fp = fp
        self.rt = rt
        self.alg = {'ItemCF': ItemCF, 'ItemIUF': ItemIUF, 'ItemCF-Norm': ItemCF_Norm}

    # A single experiment
    @timmer
    def worker(self, train, test):
        '''
        :params: train, training data set
        :params: test, test data set
        :return: dict with the value of every metric
        '''
        getRecommendation = self.alg[self.rt](train, self.K, self.N)
        metric = Metric(train, test, getRecommendation)
        return metric.eval()

    # Run M experiments and average the results
    @timmer
    def run(self):
        '''
        Evaluate the selected algorithm on each of the M splits.

        :return: dict with the metrics averaged over the M experiments
        '''
        metrics = {'Precision': 0, 'Recall': 0,
                   'Coverage': 0, 'Popularity': 0}
        dataset = Dataset(self.fp)
        for ii in range(self.M):
            train, test = dataset.splitData(self.M, ii)
            print('Experiment {}:'.format(ii))
            metric = self.worker(train, test)
            metrics = {k: metrics[k] + metric[k] for k in metrics}
        metrics = {k: metrics[k] / self.M for k in metrics}
        print('Average Result (M={}, K={}, N={}): {}'.format(self.M, self.K, self.N, metrics))
        # Returned as well as printed so that callers can use the averages.
        return metrics


# 1. ItemCF experiment
# M, N = 8, 10
# # for K in [5, 10, 20, 40, 80, 160]:
# #     cf_exp = Experiment(M, K, N, rt='ItemCF')
# #     cf_exp.run()
# cf_exp = Experiment(M, 10, N, rt='ItemCF')
# cf_exp.run()


# 2. ItemIUF experiment
M, N = 8, 10
K = 10  # same setting as in the book
iuf_exp = Experiment(M, K, N, rt='ItemIUF')
iuf_exp.run()

# 3.
ItemCF-Norm实验 367 | M, N = 8, 10 368 | K = 10 # 与书中保持一致 369 | norm_exp = Experiment(M, K, N, rt='ItemCF-Norm') 370 | norm_exp.run() -------------------------------------------------------------------------------- /chapter2-Algorithms-user behavior based/CF/chapter2_userCF_backup.py: -------------------------------------------------------------------------------- 1 | import random 2 | import math 3 | import time 4 | from tqdm import tqdm 5 | 6 | # 定义装饰器,监控运行时间 7 | def timmer(func): 8 | def wrapper(*args, **kwargs): 9 | start_time = time.time() 10 | res = func(*args, **kwargs) 11 | stop_time = time.time() 12 | print('Func %s, run time: %s' % (func.__name__, stop_time - start_time)) 13 | return res 14 | return wrapper 15 | 16 | # shju 处理相关 load data/split data 17 | class Dataset(): 18 | 19 | def __init__(self, fp): 20 | # fp: data file path 21 | self.data = self.loadData(fp) 22 | 23 | @timmer 24 | def loadData(self, fp): 25 | data = [] 26 | for l in open(fp): 27 | data.append(tuple(map(int, l.strip().split('::')[:2]))) 28 | return data 29 | 30 | @timmer 31 | def splitData(self, M, k, seed=1): 32 | ''' 33 | :params: data, 加载的所有(user, item)数据条目 34 | :params: M, 划分的数目,最后需要取M折的平均 35 | :params: k, 本次是第几次划分,k~[0, M) 36 | :params: seed, random的种子数,对于不同的k应设置成一样的 37 | :return: train, test 38 | ''' 39 | train, test = [], [] 40 | random.seed(seed) 41 | for user, item in self.data: 42 | if random.randint(0, M - 1) == k: 43 | test.append((user, item)) 44 | else: 45 | train.append((user, item)) 46 | 47 | # 处理成字典的形式,user->set(items) 48 | def convert_dict(data): 49 | data_dict = {} 50 | for user, item in data: 51 | if user not in data_dict: 52 | data_dict[user] = set() 53 | data_dict[user].add(item) 54 | data_dict = {k: list(data_dict[k]) for k in data_dict} 55 | return data_dict 56 | 57 | return convert_dict(train), convert_dict(test) 58 | 59 | # 评价指标 precision、recall、coverage、popularity 60 | class Metric(): 61 | 62 | def __init__(self, train, test, GetRecommendation): 63 | ''' 64 | 
:params: train, 训练数据 65 | :params: test, 测试数据 66 | :params: GetRecommendation, 为某个用户获取推荐物品的接口函数 67 | ''' 68 | self.train = train 69 | self.test = test 70 | self.GetRecommendation = GetRecommendation 71 | self.recs = self.getRec() 72 | 73 | # 为test中的每个用户进行推荐 74 | def getRec(self): 75 | recs = {} 76 | for user in self.test: 77 | rank = self.GetRecommendation(user) 78 | recs[user] = rank 79 | return recs 80 | 81 | # 定义精确率指标计算方式 82 | def precision(self): 83 | all, hit = 0, 0 84 | for user in self.test: 85 | test_items = set(self.test[user]) 86 | rank = self.recs[user] 87 | for item, score in rank: 88 | if item in test_items: 89 | hit += 1 90 | all += len(rank) 91 | return round(hit / all * 100, 2) 92 | 93 | # 定义召回率指标计算方式 94 | def recall(self): 95 | all, hit = 0, 0 96 | for user in self.test: 97 | test_items = set(self.test[user]) 98 | rank = self.recs[user] 99 | for item, score in rank: 100 | if item in test_items: 101 | hit += 1 102 | all += len(test_items) 103 | return round(hit / all * 100, 2) 104 | 105 | # 定义覆盖率指标计算方式 106 | def coverage(self): 107 | all_item, recom_item = set(), set() 108 | for user in self.test: 109 | for item in self.train[user]: 110 | all_item.add(item) 111 | rank = self.recs[user] 112 | for item, score in rank: 113 | recom_item.add(item) 114 | return round(len(recom_item) / len(all_item) * 100, 2) 115 | 116 | # 定义新颖度度指标计算方式 117 | def popularity(self): 118 | # 计算物品的流行度 119 | item_pop = {} 120 | for user in self.train: 121 | for item in self.train[user]: 122 | if item not in item_pop: 123 | item_pop[item] = 0 124 | item_pop[item] += 1 125 | 126 | num, pop = 0, 0 127 | for user in self.test: 128 | rank = self.recs[user] 129 | for item, score in rank: 130 | # 取对数,防止因长尾问题带来的被流行物品所主导 131 | pop += math.log(1 + item_pop[item]) 132 | num += 1 133 | return round(pop / num, 6) 134 | 135 | def eval(self): 136 | metric = {'Precision': self.precision(), 137 | 'Recall': self.recall(), 138 | 'Coverage': self.coverage(), 139 | 'Popularity': self.popularity()} 
140 | print('Metric:', metric) 141 | return metric 142 | 143 | # 算法实现 random、mostpopular、userCF、userIIF 144 | # 1. 随机推荐 145 | def Random(train, K, N): 146 | ''' 147 | :params: train, 训练数据集 148 | :params: K, 可忽略 149 | :params: N, 超参数,设置取TopN推荐物品数目 150 | :return: GetRecommendation,推荐接口函数 151 | ''' 152 | items = {} 153 | for user in train: 154 | for item in train[user]: 155 | items[item] = 1 156 | 157 | def GetRecommendation(user): 158 | # 随机推荐N个未见过的 159 | user_items = set(train[user]) 160 | rec_items = {k: items[k] for k in items if k not in user_items} 161 | rec_items = list(rec_items.items()) 162 | random.shuffle(rec_items) 163 | return rec_items[:N] 164 | 165 | return GetRecommendation 166 | 167 | 168 | # 2. 热门推荐 169 | def MostPopular(train, K, N): 170 | ''' 171 | :params: train, 训练数据集 172 | :params: K, 可忽略 173 | :params: N, 超参数,设置取TopN推荐物品数目 174 | :return: GetRecommendation, 推荐接口函数 175 | ''' 176 | items = {} 177 | for user in train: 178 | for item in train[user]: 179 | if item not in items: 180 | items[item] = 0 181 | items[item] += 1 182 | 183 | def GetRecommendation(user): 184 | # 随机推荐N个没见过的最热门的 185 | user_items = set(train[user]) 186 | rec_items = {k: items[k] for k in items if k not in user_items} 187 | rec_items = list(sorted(rec_items.items(), key=lambda x: x[1], reverse=True)) 188 | return rec_items[:N] 189 | 190 | return GetRecommendation 191 | 192 | 193 | # 3. 
# User-based recommendation using cosine similarity
def UserCF(train, K, N):
    '''
    Build a user-based collaborative-filtering recommender.

    The similarity of two users is the number of items they share divided by
    sqrt(num[u] * num[v]), where num counts the items of each user.

    :params: train, training data, dict of user -> list of items
    :params: K, hyper-parameter, number of most similar users consulted
    :params: N, hyper-parameter, maximum number of recommended items
    :return: GetRecommendation, function user -> list of (item, score) sorted
             by descending score; a user missing from train gets []
    '''
    # Inverted index item -> users, so only users sharing an item are paired.
    item_users = {}
    for user, items in train.items():
        for item in items:
            item_users.setdefault(item, []).append(user)

    # Count user occurrences and user-user co-occurrences.
    sim = {}
    num = {}
    for users in item_users.values():
        for i, u in enumerate(users):
            num[u] = num.get(u, 0) + 1
            row = sim.setdefault(u, {})
            for j, v in enumerate(users):
                if j == i:
                    continue
                row[v] = row.get(v, 0) + 1

    # Turn the co-occurrence counts into cosine similarities.
    for u, row in sim.items():
        for v in row:
            row[v] /= math.sqrt(num[u] * num[v])

    # Keep each user's K nearest neighbours, most similar first.
    top_k_sim = {u: sorted(row.items(), key=lambda x: x[1], reverse=True)[:K]
                 for u, row in sim.items()}

    # Recommendation interface
    def GetRecommendation(user):
        # Users absent from train, or without any item, have no neighbours
        # and get an empty list instead of raising KeyError.
        seen_items = set(train.get(user, ()))
        scores = {}
        for u, w in top_k_sim.get(user, ()):
            for item in train[u]:
                # Items the user already has are never recommended.
                if item not in seen_items:
                    scores[item] = scores.get(item, 0) + w
        return sorted(scores.items(), key=lambda x: x[1], reverse=True)[:N]

    return GetRecommendation


# 4.
# User-based recommendation using the improved (IIF) cosine similarity
def UserIIF(train, K, N):
    '''
    Build a user-based recommender with inverse-item-frequency weighting.

    Compared with UserCF, every shared item contributes
    1 / log(1 + number of users of that item) instead of 1, so popular items
    say less about the similarity of two users than rare ones.

    :params: train, training data, dict of user -> list of items
    :params: K, hyper-parameter, number of most similar users consulted
    :params: N, hyper-parameter, maximum number of recommended items
    :return: GetRecommendation, function user -> list of (item, score) sorted
             by descending score; a user missing from train gets []
    '''
    # Inverted index item -> users.
    item_users = {}
    for user, items in train.items():
        for item in items:
            item_users.setdefault(item, []).append(user)

    # Accumulate user counts and weighted user-user co-occurrences.
    sim = {}
    num = {}
    for users in item_users.values():
        # The IIF weight depends only on the item, so compute it once; every
        # list in the index holds at least one user, so the log is non-zero.
        weight = 1 / math.log(1 + len(users))
        for i, u in enumerate(users):
            num[u] = num.get(u, 0) + 1
            row = sim.setdefault(u, {})
            for j, v in enumerate(users):
                if j == i:
                    continue
                # This weighting is the only difference from UserCF.
                row[v] = row.get(v, 0) + weight

    # Normalise by the users' item counts, as in cosine similarity.
    for u, row in sim.items():
        for v in row:
            row[v] /= math.sqrt(num[u] * num[v])

    # Keep each user's K nearest neighbours, most similar first.
    top_k_sim = {u: sorted(row.items(), key=lambda x: x[1], reverse=True)[:K]
                 for u, row in sim.items()}

    # Recommendation interface
    def GetRecommendation(user):
        # Users absent from train, or without any item, have no neighbours
        # and get an empty list instead of raising KeyError.
        seen_items = set(train.get(user, ()))
        scores = {}
        for u, w in top_k_sim.get(user, ()):
            for item in train[u]:
                # Items the user already has are never recommended.
                if item not in seen_items:
                    scores[item] = scores.get(item, 0) + w
        return sorted(scores.items(), key=lambda x: x[1], reverse=True)[:N]

    return GetRecommendation


# Experiment driver
class Experiment():

    # The default path is a raw string: the previous literal relied on invalid
    # escape sequences (\P, \p, \d, \m). The resulting value is unchanged.
    def __init__(self, M, K, N, fp=r'E:\PythonWorkSpace\pycharm\data\movies_data\ratings.dat', rt='UserCF'):
        '''
        :params: M, number of experiments (folds) to run
        :params: K, number of most similar users (TopK)
        :params: N, number of recommended items (TopN)
        :params: fp, path of the data file
        :params: rt, recommendation algorithm type
        '''
        self.M = M
        self.K = K
        self.N = N
        self.fp = fp
self.rt = rt 327 | self.alg = {'Random': Random, 'MostPopular': MostPopular, 328 | 'UserCF': UserCF, 'UserIIF': UserIIF} 329 | 330 | # 定义单次实验 331 | @timmer 332 | def worker(self, train, test): 333 | ''' 334 | :params: train, 训练数据集 335 | :params: test, 测试数据集 336 | :return: 各指标的值 337 | ''' 338 | getRecommendation = self.alg[self.rt](train, self.K, self.N) 339 | metric = Metric(train, test, getRecommendation) 340 | return metric.eval() 341 | 342 | # 多次实验取平均 343 | @timmer 344 | def run(self): 345 | metrics = {'Precision': 0, 'Recall': 0, 346 | 'Coverage': 0, 'Popularity': 0} 347 | dataset = Dataset(self.fp) 348 | for ii in range(self.M): 349 | train, test = dataset.splitData(self.M, ii) 350 | print('Experiment {}:'.format(ii)) 351 | metric = self.worker(train, test) 352 | metrics = {k: metrics[k] + metric[k] for k in metrics} 353 | metrics = {k: metrics[k] / self.M for k in metrics} 354 | print('Average Result (M={}, K={}, N={}): {}'.format(self.M, self.K, self.N, metrics)) 355 | 356 | -------------------------------------------------------------------------------- /chapter2-Algorithms-user behavior based/CF/item-IUF.py: -------------------------------------------------------------------------------- 1 | from code_w.recommand.chapter2.train_itemCFs import Experiment 2 | 3 | 4 | # 2. ItemIUF实验 5 | M, N = 8, 10 6 | K = 10 # 与书中保持一致 7 | iuf_exp = Experiment(M, K, N, rt='ItemIUF') 8 | iuf_exp.run() -------------------------------------------------------------------------------- /chapter2-Algorithms-user behavior based/CF/item_CF.py: -------------------------------------------------------------------------------- 1 | from code_w.recommand.chapter2.train_itemCFs import Experiment 2 | 3 | 4 | # 1. 
ItemCF实验 5 | M, N = 8, 10 6 | for K in [5, 10, 20, 40, 80, 160]: 7 | cf_exp = Experiment(M, K, N, rt='ItemCF') 8 | cf_exp.run() -------------------------------------------------------------------------------- /chapter2-Algorithms-user behavior based/CF/item_CF_norm.py: -------------------------------------------------------------------------------- 1 | from code_w.recommand.chapter2.train_itemCFs import Experiment 2 | 3 | # 3. ItemCF-Norm实验 4 | M, N = 8, 10 5 | K = 10 # 与书中保持一致 6 | norm_exp = Experiment(M, K, N, rt='ItemCF-Norm') 7 | norm_exp.run() -------------------------------------------------------------------------------- /chapter2-Algorithms-user behavior based/CF/metrics.py: -------------------------------------------------------------------------------- 1 | 2 | import math 3 | 4 | # 评价指标 precision、recall、coverage、popularity 5 | class Metric(): 6 | 7 | def __init__(self, train, test, GetRecommendation): 8 | ''' 9 | :params: train, 训练数据 10 | :params: test, 测试数据 11 | :params: GetRecommendation, 为某个用户获取推荐物品的接口函数 12 | ''' 13 | self.train = train 14 | self.test = test 15 | self.GetRecommendation = GetRecommendation 16 | self.recs = self.getRec() 17 | 18 | # 为test中的每个用户进行推荐 19 | def getRec(self): 20 | recs = {} 21 | for user in self.test: 22 | rank = self.GetRecommendation(user) 23 | recs[user] = rank 24 | return recs 25 | 26 | # 定义精确率指标计算方式 27 | def precision(self): 28 | all, hit = 0, 0 29 | for user in self.test: 30 | test_items = set(self.test[user]) 31 | rank = self.recs[user] 32 | for item, score in rank: 33 | if item in test_items: 34 | hit += 1 35 | all += len(rank) 36 | return round(hit / all * 100, 2) 37 | 38 | # 定义召回率指标计算方式 39 | def recall(self): 40 | all, hit = 0, 0 41 | for user in self.test: 42 | test_items = set(self.test[user]) 43 | rank = self.recs[user] 44 | for item, score in rank: 45 | if item in test_items: 46 | hit += 1 47 | all += len(test_items) 48 | return round(hit / all * 100, 2) 49 | 50 | # 定义覆盖率指标计算方式 51 | def coverage(self): 52 | 
all_item, recom_item = set(), set() 53 | for user in self.test: 54 | for item in self.train[user]: 55 | all_item.add(item) 56 | rank = self.recs[user] 57 | for item, score in rank: 58 | recom_item.add(item) 59 | return round(len(recom_item) / len(all_item) * 100, 2) 60 | 61 | # 定义新颖度度指标计算方式 62 | def popularity(self): 63 | # 计算物品的流行度 64 | item_pop = {} 65 | for user in self.train: 66 | for item in self.train[user]: 67 | if item not in item_pop: 68 | item_pop[item] = 0 69 | item_pop[item] += 1 70 | 71 | num, pop = 0, 0 72 | for user in self.test: 73 | rank = self.recs[user] 74 | for item, score in rank: 75 | # 取对数,防止因长尾问题带来的被流行物品所主导 76 | pop += math.log(1 + item_pop[item]) 77 | num += 1 78 | return round(pop / num, 6) 79 | 80 | def eval(self): 81 | metric = {'Precision': self.precision(), 82 | 'Recall': self.recall(), 83 | 'Coverage': self.coverage(), 84 | 'Popularity': self.popularity()} 85 | print('Metric:', metric) 86 | return metric -------------------------------------------------------------------------------- /chapter2-Algorithms-user behavior based/CF/readme.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangyuyunmu/Recommended-system-practice/c436b881f6fc7ae8ddc1f6927bd9dd07bf8d66e3/chapter2-Algorithms-user behavior based/CF/readme.txt -------------------------------------------------------------------------------- /chapter2-Algorithms-user behavior based/CF/recom_popular.py: -------------------------------------------------------------------------------- 1 | 2 | from code_w.recommand.chapter2.train_userCFs import Experiment 3 | 4 | # 2. 
MostPopular实验 5 | M, N = 8, 10 6 | K = 0 # 为保持一致而设置,随便填一个值 7 | mp_exp = Experiment(M, K, N, rt='MostPopular') 8 | mp_exp.run() -------------------------------------------------------------------------------- /chapter2-Algorithms-user behavior based/CF/recom_random.py: -------------------------------------------------------------------------------- 1 | 2 | from code_w.recommand.chapter2.train_userCFs import Experiment 3 | 4 | # 1. random实验 5 | M, N = 8, 10 6 | K = 0 # 为保持一致而设置,随便填一个值 7 | random_exp = Experiment(M, K, N, rt='Random') 8 | random_exp.run() -------------------------------------------------------------------------------- /chapter2-Algorithms-user behavior based/CF/train_itemCFs.py: -------------------------------------------------------------------------------- 1 | # 导入包 2 | import random 3 | import math 4 | import time 5 | from tqdm import tqdm 6 | from code_w.recommand.chapter2.metrics import Metric 7 | from code_w.recommand.chapter2.Dataset import Dataset 8 | 9 | 10 | # 定义装饰器,监控运行时间 11 | def timmer(func): 12 | def wrapper(*args, **kwargs): 13 | start_time = time.time() 14 | res = func(*args, **kwargs) 15 | stop_time = time.time() 16 | print('Func %s, run time: %s' % (func.__name__, stop_time - start_time)) 17 | return res 18 | return wrapper 19 | 20 | 21 | # 1. 
# Item-based cosine-similarity recommendation (ItemCF)
def ItemCF(train, K, N):
    '''
    Build an item-based collaborative-filtering recommender.

    The similarity of two items is the number of users whose history holds
    both, divided by sqrt(count(i) * count(j)), where count(x) is the number
    of histories containing x.

    :params: train, training data, a mapping user -> list of items
    :params: K, hyper-parameter, number of most-similar items consulted for
             every item in the user's history
    :params: N, hyper-parameter, number of items to recommend
    :return: GetRecommendation, a function user -> list of (item, score)
             pairs sorted by descending score (at most N entries)
    '''
    # Co-occurrence counts (sim) and per-item occurrence counts (num).
    sim = {}
    num = {}
    for items in train.values():
        for i, u in enumerate(items):
            num[u] = num.get(u, 0) + 1
            row = sim.setdefault(u, {})
            for j, v in enumerate(items):
                if j != i:
                    row[v] = row.get(v, 0) + 1

    # Turn raw co-occurrence counts into cosine similarities.
    for u, row in sim.items():
        for v in row:
            row[v] /= math.sqrt(num[u] * num[v])

    # Keep only the K most similar neighbours of every item. sorted() is
    # stable, so ties keep the order in which pairs were first observed.
    top_neighbours = {
        item: sorted(row.items(), key=lambda x: x[1], reverse=True)[:K]
        for item, row in sim.items()
    }

    def GetRecommendation(user):
        # A user with no training history gets no recommendations
        # instead of raising KeyError.
        history = train.get(user, ())
        seen_items = set(history)
        scores = {}
        for item in history:
            for neighbour, weight in top_neighbours[item]:
                # Never recommend something the user already interacted with.
                if neighbour not in seen_items:
                    scores[neighbour] = scores.get(neighbour, 0) + weight
        return sorted(scores.items(), key=lambda x: x[1], reverse=True)[:N]

    return GetRecommendation


# 2.
# Improved item cosine-similarity recommendation (ItemIUF)
def ItemIUF(train, K, N):
    '''
    Build an item-based recommender with inverse-user-frequency weighting.

    Identical to plain item cosine similarity except that every
    co-occurrence contributed by a user is weighted by
    1 / log(1 + len(history)), so users with long histories contribute
    less to item-item similarity.

    :params: train, training data, a mapping user -> list of items
    :params: K, hyper-parameter, number of most-similar items consulted for
             every item in the user's history
    :params: N, hyper-parameter, number of items to recommend
    :return: GetRecommendation, a function user -> list of (item, score)
             pairs sorted by descending score (at most N entries)
    '''
    # Weighted co-occurrence (sim) and per-item occurrence counts (num).
    sim = {}
    num = {}
    for items in train.values():
        if not items:
            # Nothing to count; also avoids 1 / log(1) below.
            continue
        # The penalty depends only on the history length, so compute it
        # once per user rather than once per item pair.
        weight = 1 / math.log(1 + len(items))
        for i, u in enumerate(items):
            num[u] = num.get(u, 0) + 1
            row = sim.setdefault(u, {})
            for j, v in enumerate(items):
                if j != i:
                    row[v] = row.get(v, 0) + weight

    # Cosine-style normalisation by the popularity of both items.
    for u, row in sim.items():
        for v in row:
            row[v] /= math.sqrt(num[u] * num[v])

    # Keep only the K most similar neighbours of every item. sorted() is
    # stable, so ties keep the order in which pairs were first observed.
    top_neighbours = {
        item: sorted(row.items(), key=lambda x: x[1], reverse=True)[:K]
        for item, row in sim.items()
    }

    def GetRecommendation(user):
        # A user with no training history gets no recommendations
        # instead of raising KeyError.
        history = train.get(user, ())
        seen_items = set(history)
        scores = {}
        for item in history:
            for neighbour, sim_value in top_neighbours[item]:
                # Drop items the user has already seen.
                if neighbour not in seen_items:
                    scores[neighbour] = scores.get(neighbour, 0) + sim_value
        return sorted(scores.items(), key=lambda x: x[1], reverse=True)[:N]

    return GetRecommendation


# 3.
基于归一化的物品余弦相似度的推荐 124 | def ItemCF_Norm(train, K, N): 125 | ''' 126 | :params: train, 训练数据集 127 | :params: K, 超参数,设置取TopK相似物品数目 128 | :params: N, 超参数,设置取TopN推荐物品数目 129 | :return: GetRecommendation, 推荐接口函数 130 | ''' 131 | # 计算物品相似度矩阵 132 | sim = {} 133 | num = {} 134 | for user in train: 135 | items = train[user] 136 | for i in range(len(items)): 137 | u = items[i] 138 | if u not in num: 139 | num[u] = 0 140 | num[u] += 1 141 | if u not in sim: 142 | sim[u] = {} 143 | for j in range(len(items)): 144 | if j == i: continue 145 | v = items[j] 146 | if v not in sim[u]: 147 | sim[u][v] = 0 148 | sim[u][v] += 1 149 | for u in sim: 150 | for v in sim[u]: 151 | sim[u][v] /= math.sqrt(num[u] * num[v]) 152 | 153 | # 对相似度矩阵进行按行归一化 154 | for u in sim: 155 | s = 0 156 | for v in sim[u]: 157 | s += sim[u][v] 158 | if s > 0: 159 | for v in sim[u]: 160 | sim[u][v] /= s 161 | 162 | # 按照相似度排序 163 | sorted_item_sim = {k: list(sorted(v.items(),key=lambda x: x[1], reverse=True)) 164 | for k, v in sim.items()} 165 | 166 | # 获取接口函数 167 | def GetRecommendation(user): 168 | items = {} 169 | seen_items = set(train[user]) 170 | for item in train[user]: 171 | for u, _ in sorted_item_sim[item][:K]: 172 | if u not in seen_items: 173 | if u not in items: 174 | items[u] = 0 175 | items[u] += sim[item][u] 176 | recs = list(sorted(items.items(), key=lambda x: x[1], reverse=True))[:N] 177 | return recs 178 | 179 | return GetRecommendation 180 | 181 | 182 | class Experiment(): 183 | 184 | def __init__(self, M, K, N, fp='E:\PythonWorkSpace\pycharm\data\movies_data\\ratings.dat', rt='ItemCF'): 185 | ''' 186 | :params: M, 进行多少次实验 187 | :params: K, TopK相似物品的个数 188 | :params: N, TopN推荐物品的个数 189 | :params: fp, 数据文件路径 190 | :params: rt, 推荐算法类型 191 | ''' 192 | self.M = M 193 | self.K = K 194 | self.N = N 195 | self.fp = fp 196 | self.rt = rt 197 | self.alg = {'ItemCF': ItemCF, 'ItemIUF': ItemIUF, 'ItemCF-Norm': ItemCF_Norm} 198 | 199 | # 定义单次实验 200 | @timmer 201 | def worker(self, train, test): 202 | ''' 203 | 
:params: train, 训练数据集 204 | :params: test, 测试数据集 205 | :return: 各指标的值 206 | ''' 207 | getRecommendation = self.alg[self.rt](train, self.K, self.N) 208 | metric = Metric(train, test, getRecommendation) 209 | return metric.eval() 210 | 211 | # 多次实验取平均 212 | @timmer 213 | def run(self): 214 | metrics = {'Precision': 0, 'Recall': 0, 215 | 'Coverage': 0, 'Popularity': 0} 216 | dataset = Dataset(self.fp) 217 | for ii in range(self.M): 218 | train, test = dataset.splitData(self.M, ii) 219 | print('Experiment {}:'.format(ii)) 220 | metric = self.worker(train, test) 221 | metrics = {k: metrics[k] + metric[k] for k in metrics} 222 | metrics = {k: metrics[k] / self.M for k in metrics} 223 | print('Average Result (M={}, K={}, N={}): {}'.format(self.M, self.K, self.N, metrics)) 224 | 225 | -------------------------------------------------------------------------------- /chapter2-Algorithms-user behavior based/CF/train_userCFs.py: -------------------------------------------------------------------------------- 1 | import random 2 | import math 3 | import time 4 | from tqdm import tqdm 5 | from code_w.recommand.chapter2.Dataset import Dataset 6 | from code_w.recommand.chapter2.metrics import Metric 7 | 8 | # 定义装饰器,监控运行时间 9 | def timmer(func): 10 | def wrapper(*args, **kwargs): 11 | start_time = time.time() 12 | res = func(*args, **kwargs) 13 | stop_time = time.time() 14 | print('Func %s, run time: %s' % (func.__name__, stop_time - start_time)) 15 | return res 16 | return wrapper 17 | 18 | # 算法实现 random、mostpopular、userCF、userIIF 19 | # 1. 
# Random recommendation
def Random(train, K, N):
    '''
    Build a recommender that returns random unseen items.

    :params: train, training data, a mapping user -> list of items
    :params: K, ignored (kept so all algorithms share one signature)
    :params: N, hyper-parameter, number of items to recommend
    :return: GetRecommendation, a function user -> list of (item, 1) pairs
             drawn at random from the items the user has not seen
    '''
    # Every distinct training item, in first-seen order.
    catalogue = list(dict.fromkeys(
        item for items in train.values() for item in items))

    def GetRecommendation(user):
        # Users without training history have seen nothing, so the whole
        # catalogue is eligible (instead of raising KeyError).
        user_items = set(train.get(user, ()))
        candidates = [(item, 1) for item in catalogue
                      if item not in user_items]
        # Shuffle with the global `random` state so results follow any
        # seed the caller has set.
        random.shuffle(candidates)
        return candidates[:N]

    return GetRecommendation


# 2. Most-popular recommendation
def MostPopular(train, K, N):
    '''
    Build a recommender that returns the most popular unseen items.

    Popularity of an item is the number of training histories containing it.

    :params: train, training data, a mapping user -> list of items
    :params: K, ignored (kept so all algorithms share one signature)
    :params: N, hyper-parameter, number of items to recommend
    :return: GetRecommendation, a function user -> list of
             (item, popularity) pairs sorted by descending popularity
    '''
    popularity = {}
    for items in train.values():
        for item in items:
            popularity[item] = popularity.get(item, 0) + 1

    # The ranking does not depend on the user, so sort once here. sorted()
    # is stable, hence filtering this list per user yields exactly the
    # order a per-user sort of the unseen items would produce.
    ranked = sorted(popularity.items(), key=lambda x: x[1], reverse=True)

    def GetRecommendation(user):
        # Users without training history simply get the global top N
        # (instead of raising KeyError).
        user_items = set(train.get(user, ()))
        return [pair for pair in ranked if pair[0] not in user_items][:N]

    return GetRecommendation


# 3.
# User-based cosine-similarity recommendation (UserCF)
def UserCF(train, K, N):
    '''
    Build a user-based collaborative-filtering recommender.

    The similarity of two users is the number of items they share divided by
    sqrt(count(u) * count(v)), where count(x) is the number of items in the
    history of x.

    :params: train, training data, a mapping user -> list of items
    :params: K, hyper-parameter, number of most-similar users consulted
    :params: N, hyper-parameter, number of items to recommend
    :return: GetRecommendation, a function user -> list of (item, score)
             pairs sorted by descending score (at most N entries)
    '''
    # Inverted index item -> users, so only users sharing at least one
    # item are ever paired.
    item_users = {}
    for user, items in train.items():
        for item in items:
            item_users.setdefault(item, []).append(user)

    # Shared-item counts (sim) and per-user history sizes (num).
    sim = {}
    num = {}
    for users in item_users.values():
        for i, u in enumerate(users):
            num[u] = num.get(u, 0) + 1
            row = sim.setdefault(u, {})
            for j, v in enumerate(users):
                if j != i:
                    row[v] = row.get(v, 0) + 1

    # Turn shared-item counts into cosine similarities.
    for u, row in sim.items():
        for v in row:
            row[v] /= math.sqrt(num[u] * num[v])

    # Keep only the K most similar neighbours of every user. sorted() is
    # stable, so ties keep the order in which pairs were first observed.
    top_neighbours = {
        user: sorted(row.items(), key=lambda x: x[1], reverse=True)[:K]
        for user, row in sim.items()
    }

    def GetRecommendation(user):
        # A user with no training history has no neighbours; return an
        # empty list instead of raising KeyError.
        seen_items = set(train.get(user, ()))
        scores = {}
        for neighbour, weight in top_neighbours.get(user, ()):
            for item in train[neighbour]:
                # Drop items the user has already seen.
                if item not in seen_items:
                    scores[item] = scores.get(item, 0) + weight
        return sorted(scores.items(), key=lambda x: x[1], reverse=True)[:N]

    return GetRecommendation


# 4.
基于改进的用户余弦相似度的推荐 128 | def UserIIF(train, K, N): 129 | ''' 130 | :params: train, 训练数据集 131 | :params: K, 超参数,设置取TopK相似用户数目 132 | :params: N, 超参数,设置取TopN推荐物品数目 133 | :return: GetRecommendation, 推荐接口函数 134 | ''' 135 | # 计算item->user的倒排索引 136 | item_users = {} 137 | for user in train: 138 | for item in train[user]: 139 | if item not in item_users: 140 | item_users[item] = [] 141 | item_users[item].append(user) 142 | 143 | # 计算用户相似度矩阵 144 | sim = {} 145 | num = {} 146 | for item in item_users: 147 | users = item_users[item] 148 | for i in range(len(users)): 149 | u = users[i] 150 | if u not in num: 151 | num[u] = 0 152 | num[u] += 1 153 | if u not in sim: 154 | sim[u] = {} 155 | for j in range(len(users)): 156 | if j == i: continue 157 | v = users[j] 158 | if v not in sim[u]: 159 | sim[u][v] = 0 160 | # 相比UserCF,主要是改进了这里,同一个item,感兴趣的人越多,说明比较热门,相似度权重要小,越冷门的item的user相似度越高 161 | sim[u][v] += 1 / math.log(1 + len(users)) 162 | for u in sim: 163 | for v in sim[u]: 164 | sim[u][v] /= math.sqrt(num[u] * num[v]) 165 | 166 | # 按照相似度排序 167 | sorted_user_sim = {k: list(sorted(v.items(),key=lambda x: x[1], reverse=True)) 168 | for k, v in sim.items()} 169 | 170 | # 获取接口函数 171 | def GetRecommendation(user): 172 | items = {} 173 | seen_items = set(train[user]) 174 | for u, _ in sorted_user_sim[user][:K]: 175 | for item in train[u]: 176 | # 要去掉用户见过的 177 | if item not in seen_items: 178 | if item not in items: 179 | items[item] = 0 180 | items[item] += sim[user][u] 181 | recs = list(sorted(items.items(), key=lambda x: x[1], reverse=True))[:N] 182 | return recs 183 | 184 | return GetRecommendation 185 | 186 | # 测试 187 | class Experiment(): 188 | 189 | def __init__(self, M, K, N, fp='E:\PythonWorkSpace\pycharm\data\movies_data\\ratings.dat', rt='UserCF'): 190 | ''' 191 | :params: M, 进行多少次实验 192 | :params: K, TopK相似用户的个数 193 | :params: N, TopN推荐物品的个数 194 | :params: fp, 数据文件路径 195 | :params: rt, 推荐算法类型 196 | ''' 197 | self.M = M 198 | self.K = K 199 | self.N = N 200 | self.fp = fp 201 | 
self.rt = rt 202 | self.alg = {'Random': Random, 'MostPopular': MostPopular, 203 | 'UserCF': UserCF, 'UserIIF': UserIIF} 204 | 205 | # 定义单次实验 206 | @timmer 207 | def worker(self, train, test): 208 | ''' 209 | :params: train, 训练数据集 210 | :params: test, 测试数据集 211 | :return: 各指标的值 212 | ''' 213 | getRecommendation = self.alg[self.rt](train, self.K, self.N) 214 | metric = Metric(train, test, getRecommendation) 215 | return metric.eval() 216 | 217 | # 多次实验取平均 218 | @timmer 219 | def run(self): 220 | metrics = {'Precision': 0, 'Recall': 0, 221 | 'Coverage': 0, 'Popularity': 0} 222 | dataset = Dataset(self.fp) 223 | for ii in range(self.M): 224 | train, test = dataset.splitData(self.M, ii) 225 | print('Experiment {}:'.format(ii)) 226 | metric = self.worker(train, test) 227 | metrics = {k: metrics[k] + metric[k] for k in metrics} 228 | metrics = {k: metrics[k] / self.M for k in metrics} 229 | print('Average Result (M={}, K={}, N={}): {}'.format(self.M, self.K, self.N, metrics)) 230 | 231 | -------------------------------------------------------------------------------- /chapter2-Algorithms-user behavior based/CF/user_CF.py: -------------------------------------------------------------------------------- 1 | from code_w.recommand.chapter2.train_userCFs import Experiment 2 | 3 | # 3. UserCF实验 4 | M, N = 8, 10 5 | for K in [5, 10, 20, 40, 80, 160]: 6 | cf_exp = Experiment(M, K, N, rt='UserCF') 7 | cf_exp.run() -------------------------------------------------------------------------------- /chapter2-Algorithms-user behavior based/CF/user_IIF.py: -------------------------------------------------------------------------------- 1 | from code_w.recommand.chapter2.train_userCFs import Experiment 2 | 3 | # 4. 
UserIIF实验 4 | M, N = 8, 10 5 | K = 80 # 与书中保持一致 6 | iif_exp = Experiment(M, K, N, rt='UserIIF') 7 | iif_exp.run() -------------------------------------------------------------------------------- /chapter2-Algorithms-user behavior based/LFM/Dataset.py: -------------------------------------------------------------------------------- 1 | import random 2 | import time 3 | 4 | # 定义装饰器,监控运行时间 5 | def timmer(func): 6 | def wrapper(*args, **kwargs): 7 | start_time = time.time() 8 | res = func(*args, **kwargs) 9 | stop_time = time.time() 10 | print('Func %s, run time: %s' % (func.__name__, stop_time - start_time)) 11 | return res 12 | return wrapper 13 | 14 | # shju 处理相关 load data/split data 15 | class Dataset(): 16 | 17 | def __init__(self, fp): 18 | # fp: data file path 19 | self.data = self.loadData(fp) 20 | 21 | @timmer 22 | def loadData(self, fp): 23 | data = [] 24 | for l in open(fp): 25 | data.append(tuple(map(int, l.strip().split('::')[:2]))) 26 | return data 27 | 28 | @timmer 29 | def splitData(self, M, k, seed=1): 30 | ''' 31 | :params: data, 加载的所有(user, item)数据条目 32 | :params: M, 划分的数目,最后需要取M折的平均 33 | :params: k, 本次是第几次划分,k~[0, M) 34 | :params: seed, random的种子数,对于不同的k应设置成一样的 35 | :return: train, test 36 | ''' 37 | train, test = [], [] 38 | random.seed(seed) 39 | for user, item in self.data: 40 | if random.randint(0, M - 1) == k: 41 | test.append((user, item)) 42 | else: 43 | train.append((user, item)) 44 | 45 | # 处理成字典的形式,user->set(items) 46 | def convert_dict(data): 47 | data_dict = {} 48 | for user, item in data: 49 | if user not in data_dict: 50 | data_dict[user] = set() 51 | data_dict[user].add(item) 52 | data_dict = {k: list(data_dict[k]) for k in data_dict} 53 | return data_dict 54 | 55 | return convert_dict(train), convert_dict(test) -------------------------------------------------------------------------------- /chapter2-Algorithms-user behavior based/LFM/LFM-backup.py: -------------------------------------------------------------------------------- 1 | 
# 导入包 2 | import random 3 | import math 4 | import numpy as np 5 | import time 6 | from tqdm import tqdm, trange 7 | 8 | 9 | # 定义装饰器,监控运行时间 10 | def timmer(func): 11 | def wrapper(*args, **kwargs): 12 | start_time = time.time() 13 | res = func(*args, **kwargs) 14 | stop_time = time.time() 15 | print('Func %s, run time: %s' % (func.__name__, stop_time - start_time)) 16 | return res 17 | return wrapper 18 | 19 | 20 | class Dataset(): 21 | 22 | def __init__(self, fp): 23 | # fp: data file path 24 | self.data = self.loadData(fp) 25 | 26 | @timmer 27 | def loadData(self, fp): 28 | data = [] 29 | for l in open(fp): 30 | data.append(tuple(map(int, l.strip().split('::')[:2]))) 31 | return data 32 | 33 | @timmer 34 | def splitData(self, M, k, seed=1): 35 | ''' 36 | :params: data, 加载的所有(user, item)数据条目 37 | :params: M, 划分的数目,最后需要取M折的平均 38 | :params: k, 本次是第几次划分,k~[0, M) 39 | :params: seed, random的种子数,对于不同的k应设置成一样的 40 | :return: train, test 41 | ''' 42 | train, test = [], [] 43 | random.seed(seed) 44 | for user, item in self.data: 45 | # 这里与书中的不一致,本人认为取M-1较为合理,因randint是左右都覆盖的 46 | if random.randint(0, M - 1) == k: 47 | test.append((user, item)) 48 | else: 49 | train.append((user, item)) 50 | 51 | # 处理成字典的形式,user->set(items) 52 | def convert_dict(data): 53 | data_dict = {} 54 | for user, item in data: 55 | if user not in data_dict: 56 | data_dict[user] = set() 57 | data_dict[user].add(item) 58 | data_dict = {k: list(data_dict[k]) for k in data_dict} 59 | return data_dict 60 | 61 | return convert_dict(train), convert_dict(test) 62 | 63 | 64 | class Metric(): 65 | 66 | def __init__(self, train, test, GetRecommendation): 67 | ''' 68 | :params: train, 训练数据 69 | :params: test, 测试数据 70 | :params: GetRecommendation, 为某个用户获取推荐物品的接口函数 71 | ''' 72 | self.train = train 73 | self.test = test 74 | self.GetRecommendation = GetRecommendation 75 | self.recs = self.getRec() 76 | 77 | # 为test中的每个用户进行推荐 78 | def getRec(self): 79 | recs = {} 80 | for user in self.test: 81 | rank = 
self.GetRecommendation(user) 82 | recs[user] = rank 83 | return recs 84 | 85 | # 定义精确率指标计算方式 86 | def precision(self): 87 | all, hit = 0, 0 88 | for user in self.test: 89 | test_items = set(self.test[user]) 90 | rank = self.recs[user] 91 | for item, score in rank: 92 | if item in test_items: 93 | hit += 1 94 | all += len(rank) 95 | return round(hit / all * 100, 2) 96 | 97 | # 定义召回率指标计算方式 98 | def recall(self): 99 | all, hit = 0, 0 100 | for user in self.test: 101 | test_items = set(self.test[user]) 102 | rank = self.recs[user] 103 | for item, score in rank: 104 | if item in test_items: 105 | hit += 1 106 | all += len(test_items) 107 | return round(hit / all * 100, 2) 108 | 109 | # 定义覆盖率指标计算方式 110 | def coverage(self): 111 | all_item, recom_item = set(), set() 112 | for user in self.test: 113 | for item in self.train[user]: 114 | all_item.add(item) 115 | rank = self.recs[user] 116 | for item, score in rank: 117 | recom_item.add(item) 118 | return round(len(recom_item) / len(all_item) * 100, 2) 119 | 120 | # 定义新颖度指标计算方式 121 | def popularity(self): 122 | # 计算物品的流行度 123 | item_pop = {} 124 | for user in self.train: 125 | for item in self.train[user]: 126 | if item not in item_pop: 127 | item_pop[item] = 0 128 | item_pop[item] += 1 129 | 130 | num, pop = 0, 0 131 | for user in self.test: 132 | rank = self.recs[user] 133 | for item, score in rank: 134 | # 取对数,防止因长尾问题带来的被流行物品所主导 135 | pop += math.log(1 + item_pop[item]) 136 | num += 1 137 | return round(pop / num, 6) 138 | 139 | def eval(self): 140 | metric = {'Precision': self.precision(), 141 | 'Recall': self.recall(), 142 | 'Coverage': self.coverage(), 143 | 'Popularity': self.popularity()} 144 | print('Metric:', metric) 145 | return metric 146 | 147 | 148 | def LFM(train, ratio, K, lr, step, lmbda, N): 149 | ''' 150 | :params: train, 训练数据 151 | :params: ratio, 负采样的正负比例 152 | :params: K, 隐语义个数 153 | :params: lr, 初始学习率 154 | :params: step, 迭代次数 155 | :params: lmbda, 正则化系数 156 | :params: N, 推荐TopN物品的个数 157 | :return: 
GetRecommendation, 获取推荐结果的接口 158 | ''' 159 | 160 | all_items = {} 161 | for user in train: 162 | for item in train[user]: 163 | if item not in all_items: 164 | all_items[item] = 0 165 | all_items[item] += 1 166 | all_items = list(all_items.items()) 167 | items = [x[0] for x in all_items] 168 | pops = [x[1] for x in all_items] 169 | 170 | # 负采样函数(注意!!!要按照流行度进行采样) 171 | def nSample(data, ratio): 172 | new_data = {} 173 | # 正样本 174 | for user in data: 175 | if user not in new_data: 176 | new_data[user] = {} 177 | for item in data[user]: 178 | new_data[user][item] = 1 179 | # 负样本 180 | for user in new_data: 181 | seen = set(new_data[user]) 182 | pos_num = len(seen) 183 | item = np.random.choice(items, int(pos_num * ratio * 3), pops) 184 | item = [x for x in item if x not in seen][:int(pos_num * ratio)] 185 | new_data[user].update({x: 0 for x in item}) 186 | 187 | return new_data 188 | 189 | # 训练 190 | P, Q = {}, {} 191 | for user in train: 192 | P[user] = np.random.random(K) 193 | for item in items: 194 | Q[item] = np.random.random(K) 195 | 196 | for s in trange(step): 197 | data = nSample(train, ratio) 198 | for user in data: 199 | for item in data[user]: 200 | eui = data[user][item] - (P[user] * Q[item]).sum() 201 | P[user] += lr * (Q[item] * eui - lmbda * P[user]) 202 | Q[item] += lr * (P[user] * eui - lmbda * Q[item]) 203 | lr *= 0.9 # 调整学习率 204 | 205 | # 获取接口函数 206 | def GetRecommendation(user): 207 | seen_items = set(train[user]) 208 | recs = {} 209 | for item in items: 210 | if item not in seen_items: 211 | recs[item] = (P[user] * Q[item]).sum() 212 | recs = list(sorted(recs.items(), key=lambda x: x[1], reverse=True))[:N] 213 | return recs 214 | 215 | return GetRecommendation 216 | 217 | 218 | class Experiment(): 219 | 220 | def __init__(self, M, N, ratio=1, 221 | K=100, lr=0.02, step=100, lmbda=0.01, fp='../dataset/ml_1m/ratings.dat'): 222 | ''' 223 | :params: M, 进行多少次实验 224 | :params: N, TopN推荐物品的个数 225 | :params: ratio, 正负样本比例 226 | :params: K, 隐语义个数 227 | 
:params: lr, 学习率 228 | :params: step, 训练步数 229 | :params: lmbda, 正则化系数 230 | :params: fp, 数据文件路径 231 | ''' 232 | self.M = M 233 | self.K = K 234 | self.N = N 235 | self.ratio = ratio 236 | self.lr = lr 237 | self.step = step 238 | self.lmbda = lmbda 239 | self.fp = fp 240 | self.alg = LFM 241 | 242 | # 定义单次实验 243 | @timmer 244 | def worker(self, train, test): 245 | ''' 246 | :params: train, 训练数据集 247 | :params: test, 测试数据集 248 | :return: 各指标的值 249 | ''' 250 | getRecommendation = self.alg(train, self.ratio, self.K, 251 | self.lr, self.step, self.lmbda, self.N) 252 | metric = Metric(train, test, getRecommendation) 253 | return metric.eval() 254 | 255 | # 多次实验取平均 256 | @timmer 257 | def run(self): 258 | metrics = {'Precision': 0, 'Recall': 0, 259 | 'Coverage': 0, 'Popularity': 0} 260 | dataset = Dataset(self.fp) 261 | for ii in range(self.M): 262 | train, test = dataset.splitData(self.M, ii) 263 | print('Experiment {}:'.format(ii)) 264 | metric = self.worker(train, test) 265 | metrics = {k: metrics[k] + metric[k] for k in metrics} 266 | metrics = {k: metrics[k] / self.M for k in metrics} 267 | print('Average Result (M={}, N={}, ratio={}): {}'.format( \ 268 | self.M, self.N, self.ratio, metrics)) 269 | 270 | # LFM实验(运行时间较长,这里没贴实验结果) 271 | M, N = 8, 10 272 | for r in [1, 2, 3, 5, 10, 20]: 273 | exp = Experiment(M, N, ratio=r) 274 | exp.run() -------------------------------------------------------------------------------- /chapter2-Algorithms-user behavior based/LFM/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangyuyunmu/Recommended-system-practice/c436b881f6fc7ae8ddc1f6927bd9dd07bf8d66e3/chapter2-Algorithms-user behavior based/LFM/__init__.py -------------------------------------------------------------------------------- /chapter2-Algorithms-user behavior based/LFM/metrics.py: -------------------------------------------------------------------------------- 1 | 2 | import math 3 | 4 | # 
评价指标 precision、recall、coverage、popularity 5 | class Metric(): 6 | 7 | def __init__(self, train, test, GetRecommendation): 8 | ''' 9 | :params: train, 训练数据 10 | :params: test, 测试数据 11 | :params: GetRecommendation, 为某个用户获取推荐物品的接口函数 12 | ''' 13 | self.train = train 14 | self.test = test 15 | self.GetRecommendation = GetRecommendation 16 | self.recs = self.getRec() 17 | 18 | # 为test中的每个用户进行推荐 19 | def getRec(self): 20 | recs = {} 21 | for user in self.test: 22 | rank = self.GetRecommendation(user) 23 | recs[user] = rank 24 | return recs 25 | 26 | # 定义精确率指标计算方式 27 | def precision(self): 28 | all, hit = 0, 0 29 | for user in self.test: 30 | test_items = set(self.test[user]) 31 | rank = self.recs[user] 32 | for item, score in rank: 33 | if item in test_items: 34 | hit += 1 35 | all += len(rank) 36 | return round(hit / all * 100, 2) 37 | 38 | # 定义召回率指标计算方式 39 | def recall(self): 40 | all, hit = 0, 0 41 | for user in self.test: 42 | test_items = set(self.test[user]) 43 | rank = self.recs[user] 44 | for item, score in rank: 45 | if item in test_items: 46 | hit += 1 47 | all += len(test_items) 48 | return round(hit / all * 100, 2) 49 | 50 | # 定义覆盖率指标计算方式 51 | def coverage(self): 52 | all_item, recom_item = set(), set() 53 | for user in self.test: 54 | for item in self.train[user]: 55 | all_item.add(item) 56 | rank = self.recs[user] 57 | for item, score in rank: 58 | recom_item.add(item) 59 | return round(len(recom_item) / len(all_item) * 100, 2) 60 | 61 | # 定义新颖度度指标计算方式 62 | def popularity(self): 63 | # 计算物品的流行度 64 | item_pop = {} 65 | for user in self.train: 66 | for item in self.train[user]: 67 | if item not in item_pop: 68 | item_pop[item] = 0 69 | item_pop[item] += 1 70 | 71 | num, pop = 0, 0 72 | for user in self.test: 73 | rank = self.recs[user] 74 | for item, score in rank: 75 | # 取对数,防止因长尾问题带来的被流行物品所主导 76 | pop += math.log(1 + item_pop[item]) 77 | num += 1 78 | return round(pop / num, 6) 79 | 80 | def eval(self): 81 | metric = {'Precision': self.precision(), 82 | 
'Recall': self.recall(), 83 | 'Coverage': self.coverage(), 84 | 'Popularity': self.popularity()} 85 | print('Metric:', metric) 86 | return metric -------------------------------------------------------------------------------- /chapter2-Algorithms-user behavior based/LFM/readme.txt: -------------------------------------------------------------------------------- 1 | {\rtf1\ansi\ansicpg936\cocoartf1671\cocoasubrtf400 2 | {\fonttbl\f0\fswiss\fcharset0 Helvetica;} 3 | {\colortbl;\red255\green255\blue255;} 4 | {\*\expandedcolortbl;;} 5 | \paperw11900\paperh16840\margl1440\margr1440\vieww10800\viewh8400\viewkind0 6 | \pard\tx566\tx1133\tx1700\tx2267\tx2834\tx3401\tx3968\tx4535\tx5102\tx5669\tx6236\tx6803\pardirnatural\partightenfactor0 7 | 8 | \f0\fs24 \cf0 } -------------------------------------------------------------------------------- /chapter2-Algorithms-user behavior based/LFM/train-LFM.py: -------------------------------------------------------------------------------- 1 | # 导入包 2 | import random 3 | import math 4 | import numpy as np 5 | import time 6 | from tqdm import tqdm, trange 7 | 8 | from Recommend.ml_1m.chapter2.LFM.metrics import Metric 9 | from Recommend.ml_1m.chapter2.LFM.Dataset import Dataset 10 | 11 | # 定义装饰器,监控运行时间 12 | def timmer(func): 13 | def wrapper(*args, **kwargs): 14 | start_time = time.time() 15 | res = func(*args, **kwargs) 16 | stop_time = time.time() 17 | print('Func %s, run time: %s' % (func.__name__, stop_time - start_time)) 18 | return res 19 | return wrapper 20 | 21 | 22 | 23 | def LFM(train, ratio, K, lr, step, lmbda, N): 24 | ''' 25 | :params: train, 训练数据 26 | :params: ratio, 负采样的正负比例 27 | :params: K, 隐语义个数 28 | :params: lr, 初始学习率 29 | :params: step, 迭代次数 30 | :params: lmbda, 正则化系数 31 | :params: N, 推荐TopN物品的个数 32 | :return: GetRecommendation, 获取推荐结果的接口 33 | ''' 34 | 35 | all_items = {} 36 | for user in train: 37 | for item in train[user]: 38 | if item not in all_items: 39 | all_items[item] = 0 40 | all_items[item] += 1 41 | 
all_items = list(all_items.items()) 42 | items = [x[0] for x in all_items] 43 | pops = [x[1] for x in all_items] 44 | 45 | 46 | 47 | # 负采样函数(注意!!!要按照流行度进行采样) 48 | def nSample(data, ratio): 49 | new_data = {} 50 | # 正样本 51 | for user in data: 52 | if user not in new_data: 53 | new_data[user] = {} 54 | for item in data[user]: 55 | new_data[user][item] = 1 56 | 57 | # # 分步按照流行度采集负样本 58 | # for user in new_data: 59 | # seen = set(new_data[user]) 60 | # pos_num = len(seen) 61 | # n = 0 62 | # for i in range(pos_num*3): 63 | # # temp = items[np.random.randint(0,len(items)-1)] 64 | # temp = np.random.choice(items, 1, pops)[0] 65 | # if temp in new_data[user]: 66 | # continue 67 | # new_data[user][temp] = 0 68 | # n += 1 69 | # if n > pos_num*ratio: 70 | # break 71 | 72 | # 负样本 73 | for user in new_data: 74 | seen = set(new_data[user]) 75 | pos_num = len(seen) 76 | item = np.random.choice(items, int(pos_num * ratio * 3), pops) 77 | item = [x for x in item if x not in seen][:int(pos_num * ratio)] 78 | new_data[user].update({x: 0 for x in item}) 79 | 80 | 81 | return new_data 82 | 83 | # 训练 84 | P, Q = {}, {} 85 | for user in train: 86 | P[user] = np.random.random(K) 87 | for item in items: 88 | Q[item] = np.random.random(K) 89 | 90 | for s in trange(step): 91 | data = nSample(train, ratio) 92 | for user in data: 93 | for item in data[user]: 94 | eui = data[user][item] - (P[user] * Q[item]).sum() 95 | P[user] += lr * (Q[item] * eui - lmbda * P[user]) 96 | Q[item] += lr * (P[user] * eui - lmbda * Q[item]) 97 | lr *= 0.9 # 调整学习率 98 | 99 | # 获取接口函数 100 | def GetRecommendation(user): 101 | seen_items = set(train[user]) 102 | recs = {} 103 | for item in items: 104 | if item not in seen_items: 105 | recs[item] = (P[user] * Q[item]).sum() 106 | recs = list(sorted(recs.items(), key=lambda x: x[1], reverse=True))[:N] 107 | return recs 108 | 109 | return GetRecommendation 110 | 111 | 112 | class Experiment(): 113 | 114 | def __init__(self, M, N, ratio=1, 115 | K=100, lr=0.02, step=50, 
lmbda=0.01, fp='../../ratings.dat'): 116 | ''' 117 | :params: M, 进行多少次实验 118 | :params: N, TopN推荐物品的个数 119 | :params: ratio, 正负样本比例 120 | :params: K, 隐语义个数 121 | :params: lr, 学习率 122 | :params: step, 训练步数 123 | :params: lmbda, 正则化系数 124 | :params: fp, 数据文件路径 125 | ''' 126 | self.M = M 127 | self.K = K 128 | self.N = N 129 | self.ratio = ratio 130 | self.lr = lr 131 | self.step = step 132 | self.lmbda = lmbda 133 | self.fp = fp 134 | self.alg = LFM 135 | 136 | # 定义单次实验 137 | @timmer 138 | def worker(self, train, test): 139 | ''' 140 | :params: train, 训练数据集 141 | :params: test, 测试数据集 142 | :return: 各指标的值 143 | ''' 144 | getRecommendation = self.alg(train, self.ratio, self.K, 145 | self.lr, self.step, self.lmbda, self.N) 146 | metric = Metric(train, test, getRecommendation) 147 | return metric.eval() 148 | 149 | # 多次实验取平均 150 | @timmer 151 | def run(self): 152 | metrics = {'Precision': 0, 'Recall': 0, 153 | 'Coverage': 0, 'Popularity': 0} 154 | dataset = Dataset(self.fp) 155 | for ii in range(self.M): 156 | train, test = dataset.splitData(self.M, ii) 157 | print('Experiment {}:'.format(ii)) 158 | metric = self.worker(train, test) 159 | metrics = {k: metrics[k] + metric[k] for k in metrics} 160 | metrics = {k: metrics[k] / self.M for k in metrics} 161 | print('Average Result (M={}, N={}, ratio={}): {}'.format( \ 162 | self.M, self.N, self.ratio, metrics)) 163 | 164 | # LFM实验(运行时间较长,这里没贴实验结果) 165 | M, N = 8, 10 166 | for r in [5, 10, 20]: 167 | exp = Experiment(M, N, ratio=r) 168 | exp.run() -------------------------------------------------------------------------------- /chapter2-Algorithms-user behavior based/PersonalRank/Dataset.py: -------------------------------------------------------------------------------- 1 | import random 2 | import time 3 | 4 | # 定义装饰器,监控运行时间 5 | def timmer(func): 6 | def wrapper(*args, **kwargs): 7 | start_time = time.time() 8 | res = func(*args, **kwargs) 9 | stop_time = time.time() 10 | print('Func %s, run time: %s' % (func.__name__, 
stop_time - start_time)) 11 | return res 12 | return wrapper 13 | 14 | # shju 处理相关 load data/split data 15 | class Dataset(): 16 | 17 | def __init__(self, fp): 18 | # fp: data file path 19 | self.data = self.loadData(fp) 20 | 21 | @timmer 22 | def loadData(self, fp): 23 | data = [] 24 | for l in open(fp): 25 | data.append(tuple(map(int, l.strip().split('::')[:2]))) 26 | return data 27 | 28 | @timmer 29 | def splitData(self, M, k, seed=1): 30 | ''' 31 | :params: data, 加载的所有(user, item)数据条目 32 | :params: M, 划分的数目,最后需要取M折的平均 33 | :params: k, 本次是第几次划分,k~[0, M) 34 | :params: seed, random的种子数,对于不同的k应设置成一样的 35 | :return: train, test 36 | ''' 37 | train, test = [], [] 38 | random.seed(seed) 39 | for user, item in self.data: 40 | if random.randint(0, M - 1) == k: 41 | test.append((user, item)) 42 | else: 43 | train.append((user, item)) 44 | 45 | # 处理成字典的形式,user->set(items) 46 | def convert_dict(data): 47 | data_dict = {} 48 | for user, item in data: 49 | if user not in data_dict: 50 | data_dict[user] = set() 51 | data_dict[user].add(item) 52 | data_dict = {k: list(data_dict[k]) for k in data_dict} 53 | return data_dict 54 | 55 | return convert_dict(train), convert_dict(test) -------------------------------------------------------------------------------- /chapter2-Algorithms-user behavior based/PersonalRank/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangyuyunmu/Recommended-system-practice/c436b881f6fc7ae8ddc1f6927bd9dd07bf8d66e3/chapter2-Algorithms-user behavior based/PersonalRank/__init__.py -------------------------------------------------------------------------------- /chapter2-Algorithms-user behavior based/PersonalRank/metrics.py: -------------------------------------------------------------------------------- 1 | 2 | import math 3 | 4 | # 评价指标 precision、recall、coverage、popularity 5 | class Metric(): 6 | 7 | def __init__(self, train, test, GetRecommendation): 8 | ''' 9 | :params: 
train, 训练数据 10 | :params: test, 测试数据 11 | :params: GetRecommendation, 为某个用户获取推荐物品的接口函数 12 | ''' 13 | self.train = train 14 | self.test = test 15 | self.GetRecommendation = GetRecommendation 16 | self.recs = self.getRec() 17 | 18 | # 为test中的每个用户进行推荐 19 | def getRec(self): 20 | recs = {} 21 | for user in self.test: 22 | rank = self.GetRecommendation(user) 23 | recs[user] = rank 24 | return recs 25 | 26 | # 定义精确率指标计算方式 27 | def precision(self): 28 | all, hit = 0, 0 29 | for user in self.test: 30 | test_items = set(self.test[user]) 31 | rank = self.recs[user] 32 | for item, score in rank: 33 | if item in test_items: 34 | hit += 1 35 | all += len(rank) 36 | return round(hit / all * 100, 2) 37 | 38 | # 定义召回率指标计算方式 39 | def recall(self): 40 | all, hit = 0, 0 41 | for user in self.test: 42 | test_items = set(self.test[user]) 43 | rank = self.recs[user] 44 | for item, score in rank: 45 | if item in test_items: 46 | hit += 1 47 | all += len(test_items) 48 | return round(hit / all * 100, 2) 49 | 50 | # 定义覆盖率指标计算方式 51 | def coverage(self): 52 | all_item, recom_item = set(), set() 53 | for user in self.test: 54 | for item in self.train[user]: 55 | all_item.add(item) 56 | rank = self.recs[user] 57 | for item, score in rank: 58 | recom_item.add(item) 59 | return round(len(recom_item) / len(all_item) * 100, 2) 60 | 61 | # 定义新颖度度指标计算方式 62 | def popularity(self): 63 | # 计算物品的流行度 64 | item_pop = {} 65 | for user in self.train: 66 | for item in self.train[user]: 67 | if item not in item_pop: 68 | item_pop[item] = 0 69 | item_pop[item] += 1 70 | 71 | num, pop = 0, 0 72 | for user in self.test: 73 | rank = self.recs[user] 74 | for item, score in rank: 75 | # 取对数,防止因长尾问题带来的被流行物品所主导 76 | pop += math.log(1 + item_pop[item]) 77 | num += 1 78 | return round(pop / num, 6) 79 | 80 | def eval(self): 81 | metric = {'Precision': self.precision(), 82 | 'Recall': self.recall(), 83 | 'Coverage': self.coverage(), 84 | 'Popularity': self.popularity()} 85 | print('Metric:', metric) 86 | return metric 
-------------------------------------------------------------------------------- /chapter2-Algorithms-user behavior based/PersonalRank/personalrank_example.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | def PersonalRank(G, alpha, root, max_depth): 4 | rank = dict() 5 | rank = {x: 0 for x in G.keys()} 6 | rank[root] = 1 7 | for k in range(max_depth): 8 | tmp = {x: 0 for x in G.keys()} 9 | # 取出节点i和他的出边尾节点集合ri 10 | for i, ri in G.items(): 11 | # 取节点i的出边的尾节点j以及边E(i,j)的权重wij,边的权重都为1,归一化后就是1/len(ri) 12 | for j, wij in ri.items(): 13 | # 这里可以看出前一个step(k)生成的图每个节点以alpha概率向其他相关节点传递PR值, 14 | # 生成新的图,但是每个节点都有1-alpha概率保留PR,所以新图整体少了1-alpha 15 | tmp[j] += alpha * rank[i] / (1.0 * len(ri)) 16 | tmp[root] += (1 - alpha) 17 | rank = tmp 18 | lst = sorted(rank.items(), key=lambda x: x[1], reverse=True) 19 | for ele in lst: 20 | print("%s:%.3f, \t" % (ele[0], ele[1])) 21 | return rank 22 | 23 | 24 | if __name__ == '__main__': 25 | G = {'A': {'a': 1, 'c': 1}, 26 | 'B': {'a': 1, 'b': 1, 'c': 1, 'd': 1}, 27 | 'C': {'c': 1, 'd': 1}, 28 | 'a': {'A': 1, 'B': 1}, 29 | 'b': {'B': 1}, 30 | 'c': {'A': 1, 'B': 1, 'C': 1}, 31 | 'd': {'B': 1, 'C': 1}} 32 | PersonalRank(G, 0.85, 'A', 100) -------------------------------------------------------------------------------- /chapter2-Algorithms-user behavior based/PersonalRank/train_PersonalRank.py: -------------------------------------------------------------------------------- 1 | # 导入包 2 | import random 3 | import math 4 | import numpy as np 5 | import time 6 | from tqdm import tqdm 7 | from scipy.sparse import csc_matrix, linalg, eye 8 | from copy import deepcopy 9 | from code_w.recommand.chapter2.graph_based.Dataset import Dataset 10 | from code_w.recommand.chapter2.graph_based.metrics import Metric 11 | 12 | # 定义装饰器,监控运行时间 13 | def timmer(func): 14 | def wrapper(*args, **kwargs): 15 | start_time = time.time() 16 | res = func(*args, **kwargs) 17 | stop_time = time.time() 18 | 
print('Func %s, run time: %s' % (func.__name__, stop_time - start_time)) 19 | return res 20 | return wrapper 21 | 22 | 23 | def PersonalRank(train, alpha, N): 24 | ''' 25 | :params: train, 训练数据 26 | :params: alpha, 继续随机游走的概率 27 | :params: N, 推荐TopN物品的个数 28 | :return: GetRecommendation, 获取推荐结果的接口 29 | ''' 30 | 31 | # 构建索引 32 | items = [] 33 | for user in train: 34 | items.extend(train[user]) 35 | id2item = list(set(items)) #item集合 36 | users = {u: i for i, u in enumerate(train.keys())} #user编号 37 | items = {u: i + len(users) for i, u in enumerate(id2item)}#item编号,在user之后 38 | 39 | # 计算转移矩阵(注意!!!要按照出度进行归一化) 40 | item_user = {} 41 | for user in train: 42 | for item in train[user]: 43 | if item not in item_user: 44 | item_user[item] = [] 45 | item_user[item].append(user) #item-user倒排索引 46 | 47 | data, row, col = [], [], [] 48 | for u in train: 49 | for v in train[u]: 50 | data.append(1 / len(train[u]))# 保存所有节点出度 51 | row.append(users[u]) # 出度的节点 52 | col.append(items[v]) # 连接的节点 53 | for u in item_user: # user遍历完之后,再次遍历item 54 | for v in item_user[u]: 55 | data.append(1 / len(item_user[u])) 56 | row.append(items[u]) 57 | col.append(users[v]) 58 | # 行程稀疏矩阵,按照列排列row和col分别代表data位置的索引,shape不太赞同,我觉得应该是len(users)+len(items)而不是len(data) 59 | M = csc_matrix((data, (row, col)), shape=(len(users)+len(items),len(users)+len(items))) 60 | 61 | # 获取接口函数 62 | def GetRecommendation(user): 63 | seen_items = set(train[user]) 64 | # 解矩阵方程 r = (1-a)r0 + a(M.T)r 65 | # r0 = [0] * (len(users)+len(items)) 66 | r0 = [[0] for i in range(len(users)+len(items))] 67 | r0[users[user]][0] = 1 #测试那个user就将该user设置为1,表示从此开始随机游走 68 | r0 = np.array(r0) 69 | # r0 = csc_matrix(r0) #list转化成稀疏矩阵,按照列排列 70 | # r = (1 - alpha) * linalg.inv(eye(len(users)+len(items)) - alpha * M.T) * r0 #M是按照列排列的,转置 71 | # r = r.T.toarray()[0][len(users):]# user 之后的节点才是item 72 | 73 | r = linalg.gmres(eye(len(users) + len(items)) - alpha * M.T, (1 - alpha) * r0) # gmres(A,b),解决稀疏Ax=b的求解问题, 74 | r = r[0][len(users):] # user 
之后的节点才是item 75 | 76 | idx = np.argsort(-r)[:N] # 取反是为了从大到小排列 77 | recs = [(id2item[ii], r[ii]) for ii in idx] #返回topN的item与PR值的tuple 78 | return recs 79 | 80 | return GetRecommendation 81 | 82 | 83 | class Experiment(): 84 | 85 | def __init__(self, M, N, alpha, fp='E:\PythonWorkSpace\pycharm\data\movies_data\\ratings.dat'): 86 | ''' 87 | :params: M, 进行多少次实验 88 | :params: N, TopN推荐物品的个数 89 | :params: alpha, 继续随机游走的概率 90 | :params: fp, 数据文件路径 91 | ''' 92 | self.M = M 93 | self.N = N 94 | self.alpha = alpha 95 | self.fp = fp 96 | self.alg = PersonalRank 97 | 98 | # 定义单次实验 99 | @timmer 100 | def worker(self, train, test): 101 | ''' 102 | :params: train, 训练数据集 103 | :params: test, 测试数据集 104 | :return: 各指标的值 105 | ''' 106 | getRecommendation = self.alg(train, self.alpha, self.N) 107 | metric = Metric(train, test, getRecommendation) 108 | return metric.eval() 109 | 110 | # 多次实验取平均 111 | @timmer 112 | def run(self): 113 | metrics = {'Precision': 0, 'Recall': 0, 114 | 'Coverage': 0, 'Popularity': 0} 115 | dataset = Dataset(self.fp) 116 | for ii in range(self.M): 117 | train, test = dataset.splitData(self.M, ii) 118 | print('Experiment {}:'.format(ii)) 119 | metric = self.worker(train, test) 120 | metrics = {k: metrics[k] + metric[k] for k in metrics} 121 | metrics = {k: metrics[k] / self.M for k in metrics} 122 | print('Average Result (M={}, N={}, ratio={}): {}'.format( self.M, self.N, self.ratio, metrics)) 123 | 124 | # PersonalRank实验(笔记本跑的太慢,这里没贴实验结果) 125 | M, N, alpha = 8, 10, 0.8 126 | exp = Experiment(M, N, alpha) 127 | exp.run() 128 | 129 | -------------------------------------------------------------------------------- /chapter3_cold_start/Dataset.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | # 导入包 4 | import random 5 | import time 6 | 7 | 8 | # 定义装饰器,监控运行时间 9 | def timmer(func): 10 | def wrapper(*args, **kwargs): 11 | start_time = time.time() 12 | res = func(*args, **kwargs) 13 | stop_time = time.time() 14 | 
print('Func %s, run time: %s' % (func.__name__, stop_time - start_time)) 15 | return res 16 | return wrapper 17 | 18 | 19 | 20 | class Dataset(): 21 | 22 | def __init__(self, fp, up): 23 | # fp: data file path 24 | # up: user profile path 25 | self.data, self.profile = self.loadData(fp, up) 26 | 27 | @timmer 28 | def loadData(self, fp, up): 29 | data = [] 30 | for l in open(fp,encoding='utf-8'): 31 | data.append(tuple(l.strip().split('\t')[:2])) 32 | profile = {} 33 | for l in open(up,encoding='utf-8'): 34 | user, gender, age, country, _ = l.strip().split('\t') 35 | if age == '': 36 | age = -1 37 | profile[user] = {'gender': gender, 'age': int(age), 'country': country} 38 | # 按照用户进行采样 39 | users = list(profile.keys()) 40 | random.shuffle(users) 41 | users = set(users[:5000])# 共359347人 42 | data = [x for x in data if x[0] in users] 43 | profile = {k: profile[k] for k in users} 44 | return data, profile 45 | 46 | @timmer 47 | def splitData(self, M, k, seed=1): 48 | ''' 49 | :params: data, 加载的所有(user, item)数据条目 50 | :params: M, 划分的数目,最后需要取M折的平均 51 | :params: k, 本次是第几次划分,k~[0, M) 52 | :params: seed, random的种子数,对于不同的k应设置成一样的 53 | :return: train, test 54 | ''' 55 | train, test = [], [] 56 | random.seed(seed) 57 | for user, item in self.data: 58 | # 取M-1较为合理,因randint是左右都覆盖的 59 | if random.randint(0, M - 1) == k: 60 | test.append((user, item)) 61 | else: 62 | train.append((user, item)) 63 | 64 | # 处理成字典的形式,user->set(items) 65 | def convert_dict(data): 66 | data_dict = {} 67 | for user, item in data: 68 | if user not in data_dict: 69 | data_dict[user] = set() 70 | data_dict[user].add(item) 71 | data_dict = {k: list(data_dict[k]) for k in data_dict} 72 | return data_dict 73 | 74 | return convert_dict(train), convert_dict(test), self.profile -------------------------------------------------------------------------------- /chapter3_cold_start/Metrics.py: -------------------------------------------------------------------------------- 1 | 2 | import time 3 | # 定义装饰器,监控运行时间 4 | 
def timmer(func):
    """Decorator that prints how long each call to *func* takes.

    The wrapped function's return value is passed through unchanged.
    """
    def wrapper(*args, **kwargs):
        start_time = time.time()
        res = func(*args, **kwargs)
        stop_time = time.time()
        print('Func %s, run time: %s' % (func.__name__, stop_time - start_time))
        return res
    return wrapper


class Metric():
    """Precision / recall / coverage evaluation of a top-N recommender."""

    def __init__(self, train, test, GetRecommendation):
        '''
        :params: train, training data, user -> iterable of items
        :params: test, test data, user -> iterable of items
        :params: GetRecommendation, callable returning the recommendations
                 for one user as a sequence of (item, score) pairs
        '''
        self.train = train
        self.test = test
        self.GetRecommendation = GetRecommendation
        # Recommendations are computed once here and reused by every metric.
        self.recs = self.getRec()

    @staticmethod
    def _percent(numerator, denominator):
        # Ratio as a percentage rounded to two decimals. An empty
        # denominator yields 0.0 instead of raising ZeroDivisionError.
        if not denominator:
            return 0.0
        return round(numerator / denominator * 100, 2)

    # Recommend for every user in the test set
    def getRec(self):
        '''
        :return: dict, user -> recommendation list for every user in test
        '''
        return {user: self.GetRecommendation(user) for user in self.test}

    # Precision
    def precision(self):
        '''
        :return: hits / number of recommended items, in percent
        '''
        total, hit = 0, 0
        for user in self.test:
            test_items = set(self.test[user])
            rank = self.recs[user]
            hit += sum(1 for item, _ in rank if item in test_items)
            total += len(rank)
        return self._percent(hit, total)

    # Recall
    def recall(self):
        '''
        :return: hits / number of test items, in percent
        '''
        total, hit = 0, 0
        for user in self.test:
            test_items = set(self.test[user])
            rank = self.recs[user]
            hit += sum(1 for item, _ in rank if item in test_items)
            total += len(test_items)
        return self._percent(hit, total)

    # Coverage
    def coverage(self):
        '''
        :return: distinct recommended items / distinct train items of the
                 test users, in percent
        '''
        all_item, recom_item = set(), set()
        for user in self.test:
            # Cold-start users may be absent from train.
            if user in self.train:
                all_item.update(self.train[user])
            recom_item.update(item for item, _ in self.recs[user])
        return self._percent(len(recom_item), len(all_item))

    def eval(self):
        '''
        :return: dict with the keys 'Precision', 'Recall' and 'Coverage'
        '''
        metric = {'Precision': self.precision(),
                  'Recall': self.recall(),
                  'Coverage': self.coverage()}
        print('Metric:', metric)
        return metric
/chapter3_cold_start/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangyuyunmu/Recommended-system-practice/c436b881f6fc7ae8ddc1f6927bd9dd07bf8d66e3/chapter3_cold_start/__init__.py -------------------------------------------------------------------------------- /chapter3_cold_start/train_item_inf_coldstart.py: -------------------------------------------------------------------------------- 1 | 2 | # 导入包 3 | import random 4 | import math 5 | import numpy as np 6 | import time 7 | from tqdm import tqdm, trange 8 | 9 | 10 | # 定义装饰器,监控运行时间 11 | def timmer(func): 12 | def wrapper(*args, **kwargs): 13 | start_time = time.time() 14 | res = func(*args, **kwargs) 15 | stop_time = time.time() 16 | print('Func %s, run time: %s' % (func.__name__, stop_time - start_time)) 17 | return res 18 | return wrapper 19 | 20 | 21 | class Dataset(): 22 | 23 | def __init__(self, fp, ip): 24 | # fp: data file path 25 | self.data, self.content = self.loadData(fp, ip) 26 | 27 | @timmer 28 | def loadData(self, fp, ip): 29 | data = [] 30 | for l in open(fp): 31 | data.append(tuple(map(int, l.strip().split('::')[:2]))) 32 | contents = {} 33 | for l in open(ip, 'rb'): 34 | l = l.strip() 35 | l = str(l)[2:-1] 36 | contents[int(l.strip().split('::')[0])] = l.strip().split('::')[-1].split('|') 37 | return data, contents 38 | 39 | @timmer 40 | def splitData(self, M, k, seed=1): 41 | ''' 42 | :params: data, 加载的所有(user, item)数据条目 43 | :params: M, 划分的数目,最后需要取M折的平均 44 | :params: k, 本次是第几次划分,k~[0, M) 45 | :params: seed, random的种子数,对于不同的k应设置成一样的 46 | :return: train, test 47 | ''' 48 | train, test = [], [] 49 | random.seed(seed) 50 | for user, item in self.data: 51 | # 这里与书中的不一致,本人认为取M-1较为合理,因randint是左右都覆盖的 52 | if random.randint(0, M - 1) == k: 53 | test.append((user, item)) 54 | else: 55 | train.append((user, item)) 56 | 57 | # 处理成字典的形式,user->set(items) 58 | def convert_dict(data): 59 | data_dict = {} 60 | for user, item in 
data: 61 | if user not in data_dict: 62 | data_dict[user] = set() 63 | data_dict[user].add(item) 64 | data_dict = {k: list(data_dict[k]) for k in data_dict} 65 | return data_dict 66 | 67 | return convert_dict(train), convert_dict(test), self.content 68 | 69 | 70 | class Metric(): 71 | 72 | def __init__(self, train, test, GetRecommendation): 73 | ''' 74 | :params: train, 训练数据 75 | :params: test, 测试数据 76 | :params: GetRecommendation, 为某个用户获取推荐物品的接口函数 77 | ''' 78 | self.train = train 79 | self.test = test 80 | self.GetRecommendation = GetRecommendation 81 | self.recs = self.getRec() 82 | 83 | # 为test中的每个用户进行推荐 84 | def getRec(self): 85 | recs = {} 86 | for user in self.test: 87 | rank = self.GetRecommendation(user) 88 | recs[user] = rank 89 | return recs 90 | 91 | # 定义精确率指标计算方式 92 | def precision(self): 93 | all, hit = 0, 0 94 | for user in self.test: 95 | test_items = set(self.test[user]) 96 | rank = self.recs[user] 97 | for item, score in rank: 98 | if item in test_items: 99 | hit += 1 100 | all += len(rank) 101 | return round(hit / all * 100, 2) 102 | 103 | # 定义召回率指标计算方式 104 | def recall(self): 105 | all, hit = 0, 0 106 | for user in self.test: 107 | test_items = set(self.test[user]) 108 | rank = self.recs[user] 109 | for item, score in rank: 110 | if item in test_items: 111 | hit += 1 112 | all += len(test_items) 113 | return round(hit / all * 100, 2) 114 | 115 | # 定义覆盖率指标计算方式 116 | def coverage(self): 117 | all_item, recom_item = set(), set() 118 | for user in self.test: 119 | for item in self.train[user]: 120 | all_item.add(item) 121 | rank = self.recs[user] 122 | for item, score in rank: 123 | recom_item.add(item) 124 | return round(len(recom_item) / len(all_item) * 100, 2) 125 | 126 | # 定义新颖度指标计算方式 127 | def popularity(self): 128 | # 计算物品的流行度 129 | item_pop = {} 130 | for user in self.train: 131 | for item in self.train[user]: 132 | if item not in item_pop: 133 | item_pop[item] = 0 134 | item_pop[item] += 1 135 | 136 | num, pop = 0, 0 137 | for user in 
self.test: 138 | rank = self.recs[user] 139 | for item, score in rank: 140 | if item in item_pop: 141 | # 取对数,防止因长尾问题带来的被流行物品所主导 142 | pop += math.log(1 + item_pop[item]) 143 | num += 1 144 | return round(pop / num, 6) 145 | 146 | def eval(self): 147 | metric = {'Precision': self.precision(), 148 | 'Recall': self.recall(), 149 | 'Coverage': self.coverage(), 150 | 'Popularity': self.popularity()} 151 | print('Metric:', metric) 152 | return metric 153 | 154 | 155 | def ContentItemKNN(train, content, K, N): 156 | ''' 157 | :params: train, 训练数据 158 | :params: content, 物品内容信息 159 | :params: K, 取相似Top-K相似物品 160 | :params: N, 推荐TopN物品的个数 161 | :return: GetRecommendation, 获取推荐结果的接口 162 | ''' 163 | 164 | # 建立word-item倒排表 165 | word_item = {} 166 | for item in content: 167 | for word in content[item]: 168 | if word not in word_item: 169 | word_item[word] = {} 170 | word_item[word][item] = 1#物品item与关键词world的倒排索引world-item 171 | 172 | for word in word_item: 173 | for item in word_item[word]: 174 | word_item[word][item] /= math.log(1 + len(word_item[word])) 175 | 176 | # 计算相似度 177 | item_sim = {} 178 | mo = {} 179 | for word in word_item: 180 | for u in word_item[word]: 181 | if u not in item_sim: 182 | item_sim[u] = {} 183 | mo[u] = 0 184 | mo[u] += word_item[word][u] ** 2 185 | for v in word_item[word]: 186 | if u == v: continue 187 | if v not in item_sim[u]: 188 | item_sim[u][v] = 0 189 | item_sim[u][v] += word_item[word][u] * word_item[word][v] #每个物品与其他物品的相关性 190 | for u in item_sim: 191 | for v in item_sim[u]: 192 | item_sim[u][v] /= math.sqrt(mo[u] * mo[v]) 193 | 194 | # 按照相似度排序 195 | sorted_item_sim = {k: list(sorted(v.items(),key=lambda x: x[1], reverse=True)) \ 196 | for k, v in item_sim.items()}#对每个物品,与之相关的物品进行排序 197 | 198 | # 获取接口函数 199 | def GetRecommendation(user): 200 | items = {} 201 | seen_items = set(train[user]) 202 | for item in train[user]: 203 | for u, _ in sorted_item_sim[item][:K]: 204 | # 要去掉用户见过的 205 | if u not in seen_items: 206 | if u not in items: 
207 | items[u] = 0 208 | items[u] += item_sim[item][u] #权值 209 | recs = list(sorted(items.items(), key=lambda x: x[1], reverse=True))[:N] 210 | return recs 211 | 212 | return GetRecommendation 213 | 214 | 215 | class Experiment(): 216 | 217 | def __init__(self, M, N, K, fp='../../../data/movies_data/ratings.dat', ip='../../../data/movies_data/movies.dat'): 218 | ''' 219 | :params: M, 进行多少次实验 220 | :params: N, TopN推荐物品的个数 221 | :params: K, 取Top-K相似物品数目 222 | :params: fp, 数据文件路径 223 | :params: ip, 物品内容路径 224 | ''' 225 | self.M = M 226 | self.K = K 227 | self.N = N 228 | self.fp = fp 229 | self.ip = ip 230 | self.alg = ContentItemKNN 231 | 232 | # 定义单次实验 233 | @timmer 234 | def worker(self, train, test, content): 235 | ''' 236 | :params: train, 训练数据集 237 | :params: test, 测试数据集 238 | :return: 各指标的值 239 | ''' 240 | getRecommendation = self.alg(train, content, self.K, self.N) 241 | metric = Metric(train, test, getRecommendation) 242 | return metric.eval() 243 | 244 | # 多次实验取平均 245 | @timmer 246 | def run(self): 247 | metrics = {'Precision': 0, 'Recall': 0, 248 | 'Coverage': 0, 'Popularity': 0} 249 | dataset = Dataset(self.fp, self.ip) 250 | for ii in range(self.M): 251 | train, test, content = dataset.splitData(self.M, ii) 252 | print('Experiment {}:'.format(ii)) 253 | metric = self.worker(train, test, content) 254 | metrics = {k: metrics[k] + metric[k] for k in metrics} 255 | metrics = {k: metrics[k] / self.M for k in metrics} 256 | print('Average Result (M={}, N={}, K={}): {}'.format(self.M, self.N, self.K, metrics)) 257 | 258 | M, N, K = 8, 10, 10 259 | exp = Experiment(M, N, K) 260 | exp.run() -------------------------------------------------------------------------------- /chapter3_cold_start/train_reg_inf_coldstart.py: -------------------------------------------------------------------------------- 1 | # 导入包 2 | import time 3 | from code_w.recommand.chapter3_code_start.Dataset import Dataset 4 | from code_w.recommand.chapter3_code_start.Metrics import Metric 5 | 6 
# Decorator that reports how long a call takes
def timmer(func):
    """Decorator that prints how long each call to *func* takes.

    The wrapped function's return value is passed through unchanged.
    """
    def wrapper(*args, **kwargs):
        start_time = time.time()
        res = func(*args, **kwargs)
        stop_time = time.time()
        print('Func %s, run time: %s' % (func.__name__, stop_time - start_time))
        return res
    return wrapper


# 1. MostPopular algorithm
def MostPopular(train, profile, N):
    '''
    :params: train, training data, user -> items
    :params: profile, user registration info (not read here; kept so that
             all algorithms share one signature)
    :params: N, number of items in the top-N recommendation
    :return: GetRecommendation, the recommendation interface
    '''
    items = {}
    for user in train:
        for item in train[user]:
            items[item] = items.get(item, 0) + 1
    # Stable sort: items with equal counts keep first-seen order.
    items = sorted(items.items(), key=lambda x: x[1], reverse=True)

    # Recommendation interface
    def GetRecommendation(user):
        seen_items = set(train[user]) if user in train else set()
        return [x for x in items if x[0] not in seen_items][:N]

    return GetRecommendation


# 2. GenderMostPopular algorithm
def GenderMostPopular(train, profile, N):
    '''
    :params: train, training data, user -> items
    :params: profile, user registration info, user -> {'gender': ..., ...}
    :params: N, number of items in the top-N recommendation
    :return: GetRecommendation, the recommendation interface
    '''
    counts = {'m': {}, 'f': {}}  # male, female
    for user in train:
        bucket = counts.get(profile[user]['gender'])
        if bucket is None:
            # Unknown gender: count into neither bucket. Previously this
            # reused the last user's bucket, or crashed when the first
            # user had no gender.
            continue
        for item in train[user]:
            bucket[item] = bucket.get(item, 0) + 1
    ranked = {gender: sorted(c.items(), key=lambda x: x[1], reverse=True)
              for gender, c in counts.items()}

    mostPopular = MostPopular(train, profile, N)

    # Recommendation interface
    def GetRecommendation(user):
        seen_items = set(train[user]) if user in train else set()
        gender = profile[user]['gender']
        if gender in ranked:
            return [x for x in ranked[gender] if x[0] not in seen_items][:N]
        # No gender information: fall back to the global ranking.
        return mostPopular(user)

    return GetRecommendation


# 3. AgeMostPopular algorithm
def AgeMostPopular(train, profile, N):
    '''
    :params: train, training data, user -> items
    :params: profile, user registration info, user -> {'age': int, ...};
             a negative age means the age is missing
    :params: N, number of items in the top-N recommendation
    :return: GetRecommendation, the recommendation interface
    '''
    # Split ages into decades; one counter per decade up to the oldest user.
    ages = [p['age'] for p in profile.values() if p['age'] >= 0]
    # With no valid age at all there are no buckets, so every user falls
    # back to the global ranking instead of max() raising on an empty list.
    items = [{} for _ in range(int(max(ages) // 10 + 1))] if ages else []

    # Count interactions per decade
    for user in train:
        age = profile[user]['age']
        if age >= 0:
            bucket = items[age // 10]
            for item in train[user]:
                bucket[item] = bucket.get(item, 0) + 1
    items = [sorted(bucket.items(), key=lambda x: x[1], reverse=True)
             for bucket in items]

    mostPopular = MostPopular(train, profile, N)

    # Recommendation interface
    def GetRecommendation(user):
        seen_items = set(train[user]) if user in train else set()
        if profile[user]['age'] >= 0:
            age = profile[user]['age'] // 10
            # Out-of-range or empty decade: use the global ranking.
            if age >= len(items) or len(items[age]) == 0:
                recs = mostPopular(user)
            else:
                recs = [x for x in items[age] if x[0] not in seen_items][:N]
        else:  # no age information: use the global ranking
            recs = mostPopular(user)
        return recs

    return GetRecommendation


# 4.
# CountryMostPopular algorithm
def CountryMostPopular(train, profile, N):
    '''
    :params: train, training data, user -> items
    :params: profile, user registration info, user -> {'country': ..., ...}
    :params: N, number of items in the top-N recommendation
    :return: GetRecommendation, the recommendation interface
    '''

    # Interaction counts grouped by the user's country
    tally = {}
    for uid, history in train.items():
        per_country = tally.setdefault(profile[uid]['country'], {})
        for entry in history:
            per_country[entry] = per_country.get(entry, 0) + 1
    # Most popular first within each country
    items = {
        place: sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
        for place, counts in tally.items()
    }

    mostPopular = MostPopular(train, profile, N)

    # Recommendation interface
    def GetRecommendation(user):
        seen_items = set(train[user]) if user in train else set()
        ranking = items.get(profile[user]['country'])
        if ranking is None:
            # Country never seen in train: use the global ranking.
            return mostPopular(user)
        return [pair for pair in ranking if pair[0] not in seen_items][:N]

    return GetRecommendation


# 5.
# DemographicMostPopular algorithm
def DemographicMostPopular(train, profile, N):
    '''
    :params: train, training data, user -> items
    :params: profile, user registration info,
             user -> {'gender': ..., 'age': int, 'country': ...}
    :params: N, number of items in the top-N recommendation
    :return: GetRecommendation, the recommendation interface
    '''
    # Nested counters keyed gender -> age decade -> country -> item.
    # Missing values are not special-cased; they simply form their own
    # bucket (e.g. age -1 lands in decade -1).
    items = {}
    for user in train:
        info = profile[user]
        by_age = items.setdefault(info['gender'], {})
        by_country = by_age.setdefault(info['age'] // 10, {})
        bucket = by_country.setdefault(info['country'], {})
        for item in train[user]:
            bucket[item] = bucket.get(item, 0) + 1
    # Most popular first within each (gender, decade, country) group
    for by_age in items.values():
        for by_country in by_age.values():
            for country in by_country:
                by_country[country] = sorted(by_country[country].items(),
                                             key=lambda x: x[1], reverse=True)

    mostPopular = MostPopular(train, profile, N)

    # Recommendation interface
    def GetRecommendation(user):
        seen_items = set(train[user]) if user in train else set()
        gender = profile[user]['gender']
        # Must be the decade, not the raw age: the counters above are keyed
        # by age // 10, so a raw age would almost never match a bucket.
        age = profile[user]['age'] // 10
        country = profile[user]['country']
        if gender not in items or age not in items[gender] or country not in items[gender][age]:
            # Unseen demographic group: use the global ranking.
            recs = mostPopular(user)
        else:
            recs = [x for x in items[gender][age][country] if x[0] not in seen_items][:N]
        return recs

    return GetRecommendation


class Experiment():
    """Runs one registration-info recommender over M random splits."""

    def __init__(self, M, N, at='MostPopular',
                 fp='../../../data/lastfm-dataset-360K/usersha1-artmbid-artname-plays.tsv',
                 up='../../../data/lastfm-dataset-360K/usersha1-profile.tsv'):
        '''
        :params: M, number of experiments (splits) to run
        :params: N, number of items in the top-N recommendation
        :params: at, name of the algorithm to run; must be a key of self.alg
        :params: fp, path of the data file
        :params: up, path of the user registration info file
        '''
        self.M = M
        self.N = N
        self.fp = fp
        self.up = up
        self.at = at
        self.alg = {'MostPopular': MostPopular, 'GenderMostPopular': GenderMostPopular,
                    'AgeMostPopular': AgeMostPopular, 'CountryMostPopular': CountryMostPopular,
                    'DemographicMostPopular': DemographicMostPopular}

    # A single experiment
    @timmer
    def worker(self, train, test, profile):
        '''
        :params: train, training data set
        :params: test, test data set
        :params: profile, user registration info
        :return: value of each metric
        '''
        getRecommendation = self.alg[self.at](train, profile, self.N)
        metric = Metric(train, test, getRecommendation)
        return metric.eval()

    # Average over several experiments
    @timmer
    def run(self):
        '''
        Run M experiments, one per split index, and print the averaged metrics.
        '''
        metrics = {'Precision': 0, 'Recall': 0,
                   'Coverage': 0}
        dataset = Dataset(self.fp, self.up)
        for ii in range(self.M):
            train, test, profile = dataset.splitData(self.M, ii)
            print('Experiment {}:'.format(ii))
            metric = self.worker(train, test, profile)
            metrics = {k: metrics[k] + metric[k] for k in metrics}
        metrics = {k: metrics[k] / self.M for k in metrics}
        print('Average Result (M={}, N={}): {}'.format(
            self.M, self.N, metrics))


# # 1. MostPopular experiment
# M, N = 10, 10
# exp = Experiment(M, N, at='MostPopular')
# exp.run()
#
# # 2. GenderMostPopular experiment
# M, N = 10, 10
# exp = Experiment(M, N, at='GenderMostPopular')
# exp.run()
#
#
# # 3.
AgeMostPopular实验 280 | # M, N = 10, 10 281 | # exp = Experiment(M, N, at='AgeMostPopular') 282 | # exp.run() 283 | 284 | 285 | # # 4. CountryMostPopular实验 286 | # M, N = 10, 10 287 | # exp = Experiment(M, N, at='CountryMostPopular') 288 | # exp.run() 289 | 290 | 291 | # 5. DemographicMostPopular实验 292 | M, N = 10, 10 293 | exp = Experiment(M, N, at='DemographicMostPopular') 294 | exp.run() 295 | -------------------------------------------------------------------------------- /chapter4_tags_based/train_user_tags_based.py: -------------------------------------------------------------------------------- 1 | # 导入包 2 | import math 3 | import time 4 | from code_w.recommand.chapter4.database import Dataset 5 | from code_w.recommand.chapter4.metric import Metric 6 | 7 | # 定义装饰器,监控运行时间 8 | def timmer(func): 9 | def wrapper(*args, **kwargs): 10 | start_time = time.time() 11 | res = func(*args, **kwargs) 12 | stop_time = time.time() 13 | print('Func %s, run time: %s' % (func.__name__, stop_time - start_time)) 14 | return res 15 | return wrapper 16 | 17 | # 1. 
# Recommendation based on popular tags
def SimpleTagBased(train, N):
    '''
    :params: train, training data set, user -> item -> tags
    :params: N, hyper-parameter, number of items in the top-N recommendation
    :return: GetRecommendation, the recommendation interface
    '''
    # user_tags: how often each user applied each tag
    # tag_items: how often each tag was applied to each item
    user_tags, tag_items = {}, {}
    for user, tagged in train.items():
        own = user_tags[user] = {}
        for item, tags in tagged.items():
            for tag in tags:
                own[tag] = own.get(tag, 0) + 1
                per_item = tag_items.setdefault(tag, {})
                per_item[item] = per_item.get(item, 0) + 1

    def GetRecommendation(user):
        # Score unseen items and return the N best
        if user not in user_tags:
            return []
        seen = set(train[user])
        scores = {}
        for tag, weight in user_tags[user].items():
            for item, count in tag_items[tag].items():
                if item not in seen:
                    scores[item] = scores.get(item, 0) + weight * count
        ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
        return ranked[:N]

    return GetRecommendation


# 2.
# Improvement 1: add a penalty for popular tags
def TagBasedTFIDF(train, N):
    '''
    :params: train, training data set, user -> item -> tags
    :params: N, hyper-parameter, number of items in the top-N recommendation
    :return: GetRecommendation, the recommendation interface
    '''
    # user_tags: how often each user applied each tag
    # tag_items: how often each tag was applied to each item
    # tag_users: the distinct users that applied each tag
    user_tags, tag_items, tag_users = {}, {}, {}
    for user, tagged in train.items():
        own = user_tags[user] = {}
        for item, tags in tagged.items():
            for tag in tags:
                own[tag] = own.get(tag, 0) + 1
                per_item = tag_items.setdefault(tag, {})
                per_item[item] = per_item.get(item, 0) + 1
                tag_users.setdefault(tag, set()).add(user)
    # Tag popularity = number of distinct users that applied the tag
    tag_pop = {tag: len(who) for tag, who in tag_users.items()}
    # tag_pop = {k: math.log(1 + len(v))for k, v in tag_pop.items()}

    def GetRecommendation(user):
        # Score unseen items and return the N best
        if user not in user_tags:
            return []
        seen = set(train[user])
        scores = {}
        for tag, weight in user_tags[user].items():
            for item, count in tag_items[tag].items():
                if item not in seen:
                    scores[item] = scores.get(item, 0) + weight * count / tag_pop[tag]
        ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
        return ranked[:N]

    return GetRecommendation


# 3.
改进二:同时也为热门商品加入惩罚项 107 | def TagBasedTFIDF_Improved(train, N): 108 | ''' 109 | :params: train, 训练数据集 110 | :params: N, 超参数,设置取TopN推荐物品数目 111 | :return: GetRecommendation,推荐接口函数 112 | ''' 113 | # 统计user_tags和tag_items 114 | user_tags, tag_items = {}, {} 115 | # 统计标签和物品的热门程度,即打过此标签的不同用户数,和物品对应的不同用户数 116 | tag_pop, item_pop = {}, {} 117 | for user in train: 118 | user_tags[user] = {} 119 | for item in train[user]: 120 | if item not in item_pop: 121 | item_pop[item] = 0 122 | item_pop[item] += 1 123 | for tag in train[user][item]: 124 | if tag not in user_tags[user]: 125 | user_tags[user][tag] = 0 126 | user_tags[user][tag] += 1 127 | if tag not in tag_items: 128 | tag_items[tag] = {} 129 | if item not in tag_items[tag]: 130 | tag_items[tag][item] = 0 131 | tag_items[tag][item] += 1 132 | if tag not in tag_pop: 133 | tag_pop[tag] = set() 134 | tag_pop[tag].add(user) 135 | tag_pop = {k: len(v) for k, v in tag_pop.items()} 136 | # tag_pop = {k: math.log(1+len(v)) for k, v in tag_pop.items()} 137 | 138 | def GetRecommendation(user): 139 | # 按照打分推荐N个未见过的 140 | if user not in user_tags: 141 | return [] 142 | seen_items = set(train[user]) 143 | item_score = {} 144 | for tag in user_tags[user]: 145 | for item in tag_items[tag]: 146 | if item in seen_items: 147 | continue 148 | if item not in item_score: 149 | item_score[item] = 0 150 | # item_score[item] += user_tags[user][tag] * tag_items[tag][item] / tag_pop[tag] / math.log(item_pop[item]+1) 151 | item_score[item] += user_tags[user][tag] * tag_items[tag][item] / tag_pop[tag] / item_pop[item] 152 | item_score = list(sorted(item_score.items(), key=lambda x: x[1], reverse=True)) 153 | return item_score[:N] 154 | 155 | return GetRecommendation 156 | 157 | 158 | # 4. 基于标签改进的推荐 159 | def ExpandTagBased(train, N, M=20): 160 | ''' 161 | :params: train, 训练数据集 162 | :params: N, 超参数,设置取TopN推荐物品数目 163 | :params: M,超参数,设置取TopM的标签填补不满M个标签的用户 164 | :return: GetRecommendation,推荐接口函数 165 | ''' 166 | 167 | # 1. 
计算标签之间的相似度 168 | item_tag = {} 169 | for user in train: 170 | for item in train[user]: 171 | if item not in item_tag: 172 | item_tag[item] = set() 173 | for tag in train[user][item]: 174 | item_tag[item].add(tag) 175 | tag_sim, tag_cnt = {}, {} 176 | for item in item_tag: 177 | for u in item_tag[item]: 178 | if u not in tag_cnt: 179 | tag_cnt[u] = 0 180 | tag_cnt[u] += 1 181 | if u not in tag_sim: 182 | tag_sim[u] = {} 183 | for v in item_tag[item]: 184 | if u == v: 185 | continue 186 | if v not in tag_sim[u]: 187 | tag_sim[u][v] = 0 188 | tag_sim[u][v] += 1 189 | for u in tag_sim: 190 | for v in tag_sim[u]: 191 | tag_sim[u][v] /= math.sqrt(tag_cnt[u] * tag_cnt[v]) 192 | 193 | # 2. 为每个用户扩展标签 194 | user_tags = {} 195 | for user in train: 196 | if user not in user_tags: 197 | user_tags[user] = {} 198 | for item in train[user]: 199 | for tag in train[user][item]: 200 | if tag not in user_tags[user]: 201 | user_tags[user][tag] = 0 202 | user_tags[user][tag] += 1 203 | expand_tags = {} 204 | for user in user_tags: 205 | if len(user_tags[user]) >= M: 206 | expand_tags[user] = user_tags[user] 207 | continue 208 | # 不满M个的进行标签扩展 209 | expand_tags[user] = {} 210 | seen_tags = set(user_tags[user]) 211 | for tag in user_tags[user]: 212 | for t in tag_sim[tag]: 213 | if t in seen_tags: 214 | continue 215 | if t not in expand_tags[user]: 216 | expand_tags[user][t] = 0 217 | expand_tags[user][t] += user_tags[user][tag] * tag_sim[tag][t]#相关性加权,生成新的tag权值 218 | expand_tags[user].update(user_tags[user]) 219 | expand_tags[user] = dict(list(sorted(expand_tags[user].items(), key=lambda x: x[1], reverse=True))[:M]) 220 | 221 | # 3. 
SimpleTagBased算法 222 | tag_items = {} 223 | for user in train: 224 | for item in train[user]: 225 | for tag in train[user][item]: 226 | if tag not in tag_items: 227 | tag_items[tag] = {} 228 | if item not in tag_items[tag]: 229 | tag_items[tag][item] = 0 230 | tag_items[tag][item] += 1 231 | 232 | def GetRecommendation(user): 233 | # 按照打分推荐N个未见过的 234 | if user not in user_tags: 235 | return [] 236 | seen_items = set(train[user]) 237 | item_score = {} 238 | for tag in expand_tags[user]: 239 | for item in tag_items[tag]: 240 | if item in seen_items: 241 | continue 242 | if item not in item_score: 243 | item_score[item] = 0 244 | item_score[item] += expand_tags[user][tag] * tag_items[tag][item] 245 | item_score = list(sorted(item_score.items(), key=lambda x: x[1], reverse=True)) 246 | return item_score[:N] 247 | 248 | return GetRecommendation 249 | 250 | 251 | class Experiment(): 252 | 253 | def __init__(self, M, N, fp='../../../data/hetrec2011-delicious-2k/user_taggedbookmarks.dat', rt='SimpleTagBased'): 254 | ''' 255 | :params: M, 进行多少次实验 256 | :params: N, TopN推荐物品的个数 257 | :params: fp, 数据文件路径 258 | :params: rt, 推荐算法类型 259 | ''' 260 | self.M = M 261 | self.N = N 262 | self.fp = fp 263 | self.rt = rt 264 | self.alg = {'SimpleTagBased': SimpleTagBased, 'TagBasedTFIDF': TagBasedTFIDF, \ 265 | 'TagBasedTFIDF_Improved': TagBasedTFIDF_Improved, 'ExtendTagBased': ExpandTagBased} 266 | 267 | # 定义单次实验 268 | @timmer 269 | def worker(self, train, test): 270 | ''' 271 | :params: train, 训练数据集 272 | :params: test, 测试数据集 273 | :return: 各指标的值 274 | ''' 275 | getRecommendation = self.alg[self.rt](train, self.N) 276 | metric = Metric(train, test, getRecommendation) 277 | return metric.eval() 278 | 279 | # 多次实验取平均 280 | @timmer 281 | def run(self): 282 | metrics = {'Precision': 0, 'Recall': 0, 283 | 'Coverage': 0, 'Diversity': 0, 284 | 'Popularity': 0} 285 | dataset = Dataset(self.fp) 286 | for ii in range(self.M): 287 | train, test = dataset.splitData(self.M, ii) 288 | 
print('Experiment {}:'.format(ii)) 289 | metric = self.worker(train, test) 290 | metrics = {k: metrics[k] + metric[k] for k in metrics} 291 | metrics = {k: metrics[k] / self.M for k in metrics} 292 | print('Average Result (M={}, N={}): {}'.format(self.M, self.N, metrics)) 293 | 294 | 295 | # # 1. SimpleTagBased实验 296 | # M, N = 10, 10 297 | # exp = Experiment(M, N, rt='SimpleTagBased') 298 | # exp.run() 299 | 300 | # 2. TagBasedTFIDF实验 301 | # M, N = 10, 10 302 | # exp = Experiment(M, N, rt='TagBasedTFIDF') 303 | # exp.run() 304 | 305 | 306 | # 3. TagBasedTFIDF++实验 307 | M, N = 10, 10 308 | exp = Experiment(M, N, rt='TagBasedTFIDF_Improved') 309 | exp.run() 310 | 311 | # # 4. TagExtend实验 312 | # M, N = 10, 10 313 | # exp = Experiment(M, N, rt='ExtendTagBased') 314 | # exp.run() 315 | 316 | -------------------------------------------------------------------------------- /chapter5_context_inf/Dataset.py: -------------------------------------------------------------------------------- 1 | # 导入包 2 | import codecs 3 | 4 | class Dataset(): 5 | 6 | # 对每个用户按照时间进行从前到后的排序,取最后一个时间的item作为要预测的测试集 7 | 8 | def __init__(self, site=None): 9 | # site: which site to load 10 | self.bookmark_path = '../../../data/hetrec2011-delicious-2k/bookmarks.dat' 11 | self.user_bookmark_path = '../../../data/hetrec2011-delicious-2k/user_taggedbookmarks-timestamps.dat' 12 | self.site = site 13 | self.loadData() 14 | 15 | def loadData(self): 16 | bookmarks = [f.strip() for f in codecs.open(self.bookmark_path, 'r', encoding="ISO-8859-1").readlines()][1:] 17 | # 将不同网站对应的item与时间保存下来 18 | site_ids = {} 19 | for b in bookmarks: 20 | b = b.split('\t') 21 | if b[-1] not in site_ids: 22 | site_ids[b[-1]] = set() 23 | site_ids[b[-1]].add(b[0]) 24 | 25 | user_bookmarks = [f.strip() for f in 26 | codecs.open(self.user_bookmark_path, 'r', encoding="ISO-8859-1").readlines()][1:] 27 | data = {} 28 | cnt = 0 29 | # 选择和收集对应的url的user:item+time数据,并按照时间对每个user进行排序 30 | for ub in user_bookmarks: 31 | ub = 
ub.split('\t') 32 | if self.site is None or (self.site in site_ids and ub[1] in site_ids[self.site]): 33 | if ub[0] not in data: 34 | data[ub[0]] = set() 35 | data[ub[0]].add((ub[1], int(ub[3][:-3]))) 36 | cnt += 1 37 | self.data = {k: list(sorted(list(data[k]), key=lambda x: x[1], reverse=True)) for k in data} 38 | 39 | def splitData(self): 40 | ''' 41 | :params: data, 加载的所有(user, item)数据条目 42 | :return: train, test 43 | ''' 44 | train, test = {}, {} 45 | for user in self.data: 46 | if user not in train: 47 | train[user] = [] 48 | test[user] = [] 49 | data = self.data[user] 50 | train[user].extend(data[1:])# 51 | test[user].append(data[0])# 时间最大作为测试 52 | 53 | return train, test -------------------------------------------------------------------------------- /chapter5_context_inf/Metric.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | class Metric(): 4 | 5 | def __init__(self, train, test, GetRecommendation): 6 | ''' 7 | :params: train, 训练数据 8 | :params: test, 测试数据 9 | :params: GetRecommendation, 为某个用户获取推荐物品的接口函数 10 | ''' 11 | self.train = train 12 | self.test = test 13 | self.GetRecommendation = GetRecommendation 14 | self.recs = self.getRec() 15 | 16 | # 为test中的每个用户进行推荐 17 | def getRec(self): 18 | recs = {} 19 | for user in self.test: 20 | rank = self.GetRecommendation(user) 21 | recs[user] = rank 22 | return recs 23 | 24 | # 定义精确率指标计算方式 25 | def precision(self): 26 | all, hit = 0, 0 27 | for user in self.test: 28 | test_items = set([x[0] for x in self.test[user]]) 29 | rank = self.recs[user] 30 | for item, score in rank: 31 | if item in test_items: 32 | hit += 1 33 | all += len(rank) 34 | return round(hit / all * 100, 2) if all > 0 else 0.0 35 | 36 | # 定义召回率指标计算方式 37 | def recall(self): 38 | all, hit = 0, 0 39 | for user in self.test: 40 | test_items = set([x[0] for x in self.test[user]]) 41 | rank = self.recs[user] 42 | for item, score in rank: 43 | if item in test_items: 44 | hit += 1 45 | all += len(test_items) 
46 | return round(hit / all * 100, 2) if all > 0 else 0.0 47 | 48 | def eval(self): 49 | metric = {'Precision': self.precision(), 50 | 'Recall': self.recall()} 51 | return metric -------------------------------------------------------------------------------- /chapter5_context_inf/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangyuyunmu/Recommended-system-practice/c436b881f6fc7ae8ddc1f6927bd9dd07bf8d66e3/chapter5_context_inf/__init__.py -------------------------------------------------------------------------------- /chapter5_context_inf/train_context_inf.py: -------------------------------------------------------------------------------- 1 | # 导入包 2 | import math 3 | import time 4 | 5 | from code_w.recommand.chapter5.Dataset import Dataset 6 | from code_w.recommand.chapter5.Metric import Metric 7 | 8 | 9 | # 1. 给用户推荐近期最热门的物品 10 | def RecentPopular(train, K, N, alpha=1.0, t0=int(time.time())): 11 | ''' 12 | :params: train, 训练数据集 13 | :params: K, 可忽略 14 | :params: N, 超参数,设置取TopN推荐物品数目 15 | :params: alpha, 时间衰减因子 16 | :params: t0, 当前的时间戳 17 | :return: GetRecommendation,推荐接口函数 18 | ''' 19 | 20 | item_score = {} 21 | for user in train: 22 | for item, t in train[user]: 23 | if item not in item_score: 24 | item_score[item] = 0 25 | item_score[item] += 1.0 / (alpha * (t0 - t)) 26 | 27 | item_score = list(sorted(item_score.items(), key=lambda x: x[1], reverse=True)) 28 | 29 | def GetRecommendation(user): 30 | # 随机推荐N个未见过的 31 | user_items = set(train[user]) 32 | rec_items = [x for x in item_score if x[0] not in user_items] 33 | return rec_items[:N] 34 | 35 | return GetRecommendation 36 | 37 | 38 | # 2. 
# Time-context-aware ItemCF algorithm
def TItemCF(train, K, N, alpha=1.0, beta=1.0, t0=int(time.time())):
    '''
    Item-based collaborative filtering with time decay.

    :params: train, training data, train[user] -> list of (item, timestamp) pairs
    :params: K, hyper-parameter, number of most similar items considered (TopK)
    :params: N, hyper-parameter, number of items returned (TopN)
    :params: alpha, time-decay factor used when computing item similarity
    :params: beta, time-decay factor used when scoring recommendations
    :params: t0, the "current" timestamp (the default is evaluated once, at definition time)
    :return: GetRecommendation, the recommendation interface function
    '''
    # Item-item similarity: co-occurrences within one user's history,
    # damped by the time gap between the two interactions.
    sim = {}
    num = {}
    for user in train:
        items = train[user]
        for i, (u, t1) in enumerate(items):
            num[u] = num.get(u, 0) + 1
            row = sim.setdefault(u, {})
            for j, (v, t2) in enumerate(items):
                if j == i:
                    continue
                row[v] = row.get(v, 0) + 1.0 / (alpha * (abs(t1 - t2) + 1))
    for u in sim:
        for v in sim[u]:
            sim[u][v] /= math.sqrt(num[u] * num[v])

    # Neighbours of every item, most similar first
    sorted_item_sim = {k: sorted(v.items(), key=lambda x: x[1], reverse=True)
                       for k, v in sim.items()}

    def GetRecommendation(user):
        items = {}
        # train[user] holds (item, timestamp) tuples, so the seen-set must be
        # built from the item ids; a set of the raw tuples never matches an id
        # and would let already-seen items through.
        seen_items = {item for item, _ in train[user]}
        for item, t in train[user]:
            for u, _ in sorted_item_sim[item][:K]:
                if u not in seen_items:
                    # More recent interactions (t close to t0) weigh more
                    items[u] = items.get(u, 0) + sim[item][u] / (1 + beta * (t0 - t))
        return sorted(items.items(), key=lambda x: x[1], reverse=True)[:N]

    return GetRecommendation


# 3.
# Time-context-aware UserCF algorithm
def TUserCF(train, K, N, alpha=1.0, beta=1.0, t0=int(time.time())):
    '''
    User-based collaborative filtering with time decay.

    :params: train, training data, train[user] -> list of (item, timestamp) pairs
    :params: K, hyper-parameter, number of most similar users considered (TopK)
    :params: N, hyper-parameter, number of items returned (TopN)
    :params: alpha, time-decay factor used when computing user similarity
    :params: beta, time-decay factor used when scoring recommendations
    :params: t0, the "current" timestamp (the default is evaluated once, at definition time)
    :return: GetRecommendation, the recommendation interface function
    '''
    # Inverted index: item -> [(user, timestamp), ...]
    item_users = {}
    for user in train:
        for item, ts in train[user]:
            item_users.setdefault(item, []).append((user, ts))

    # User-user similarity: shared items, damped by how far apart in time
    # the two users interacted with the item.
    sim = {}
    num = {}
    for item in item_users:
        users = item_users[item]
        for i, (u, t1) in enumerate(users):
            num[u] = num.get(u, 0) + 1
            row = sim.setdefault(u, {})
            for j, (v, t2) in enumerate(users):
                if j == i:
                    continue
                row[v] = row.get(v, 0) + 1.0 / (alpha * (abs(t1 - t2) + 1))
    for u in sim:
        for v in sim[u]:
            sim[u][v] /= math.sqrt(num[u] * num[v])

    # Neighbours of every user, most similar first
    sorted_user_sim = {k: sorted(v.items(), key=lambda x: x[1], reverse=True)
                       for k, v in sim.items()}

    def GetRecommendation(user):
        items = {}
        # Seen-set must hold item ids, not the raw (item, timestamp) tuples,
        # otherwise the membership test below never filters anything.
        seen_items = {item for item, _ in train[user]}
        recs = []
        if user in sorted_user_sim:
            for u, _ in sorted_user_sim[user][:K]:
                # Bind the neighbour's own interaction time; previously the
                # decay used a stale `t` leaked from the index-building loop.
                for item, t in train[u]:
                    if item not in seen_items:
                        items[item] = items.get(item, 0) + sim[user][u] / (1 + beta * (t0 - t))
            recs = sorted(items.items(), key=lambda x: x[1], reverse=True)[:N]
        return recs

    return GetRecommendation


# 4.
# ItemCF algorithm
def ItemCF(train, K, N):
    '''
    Plain item-based collaborative filtering (timestamps are ignored).

    :params: train, training data, train[user] -> list of (item, timestamp) pairs
    :params: K, hyper-parameter, number of most similar items considered (TopK)
    :params: N, hyper-parameter, number of items returned (TopN)
    :return: GetRecommendation, the recommendation interface function
    '''
    # Item-item similarity from co-occurrence counts within a user's history
    sim = {}
    num = {}
    for user in train:
        items = train[user]
        for i, (u, _) in enumerate(items):
            num[u] = num.get(u, 0) + 1
            row = sim.setdefault(u, {})
            for j, (v, _) in enumerate(items):
                if j == i:
                    continue
                row[v] = row.get(v, 0) + 1
    for u in sim:
        for v in sim[u]:
            sim[u][v] /= math.sqrt(num[u] * num[v])

    # Neighbours of every item, most similar first
    sorted_item_sim = {k: sorted(v.items(), key=lambda x: x[1], reverse=True)
                       for k, v in sim.items()}

    def GetRecommendation(user):
        items = {}
        # train[user] holds (item, timestamp) tuples; compare on item ids so
        # that items the user already has are really excluded.
        seen_items = {item for item, _ in train[user]}
        for item, _ in train[user]:
            for u, _ in sorted_item_sim[item][:K]:
                if u not in seen_items:
                    items[u] = items.get(u, 0) + sim[item][u]
        return sorted(items.items(), key=lambda x: x[1], reverse=True)[:N]

    return GetRecommendation


# 5.
UserCF算法 205 | def UserCF(train, K, N): 206 | ''' 207 | :params: train, 训练数据集 208 | :params: K, 超参数,设置取TopK相似用户数目 209 | :params: N, 超参数,设置取TopN推荐物品数目 210 | :return: GetRecommendation, 推荐接口函数 211 | ''' 212 | # 计算item->user的倒排索引 213 | item_users = {} 214 | for user in train: 215 | for item, _ in train[user]: 216 | if item not in item_users: 217 | item_users[item] = [] 218 | item_users[item].append(user) 219 | 220 | # 计算用户相似度矩阵 221 | sim = {} 222 | num = {} 223 | for item in item_users: 224 | users = item_users[item] 225 | for i in range(len(users)): 226 | u = users[i] 227 | if u not in num: 228 | num[u] = 0 229 | num[u] += 1 230 | if u not in sim: 231 | sim[u] = {} 232 | for j in range(len(users)): 233 | if j == i: continue 234 | v = users[j] 235 | if v not in sim[u]: 236 | sim[u][v] = 0 237 | sim[u][v] += 1 238 | for u in sim: 239 | for v in sim[u]: 240 | sim[u][v] /= math.sqrt(num[u] * num[v]) 241 | 242 | # 按照相似度排序 243 | sorted_user_sim = {k: list(sorted(v.items(),key=lambda x: x[1], reverse=True)) \ 244 | for k, v in sim.items()} 245 | 246 | # 获取接口函数 247 | def GetRecommendation(user): 248 | items = {} 249 | seen_items = set(train[user]) 250 | recs = [] 251 | if user in sorted_user_sim: 252 | for u, _ in sorted_user_sim[user][:K]: 253 | for item, _ in train[u]: 254 | # 要去掉用户见过的 255 | if item not in seen_items: 256 | if item not in items: 257 | items[item] = 0 258 | items[item] += sim[user][u] 259 | recs = list(sorted(items.items(), key=lambda x: x[1], reverse=True))[:N] 260 | return recs 261 | 262 | return GetRecommendation 263 | 264 | 265 | class Experiment(): 266 | 267 | def __init__(self, K, N, site=None, rt='RecentPopular'): 268 | ''' 269 | :params: K, TopK相似的个数 270 | :params: N, TopN推荐物品的个数 271 | :params: site, 选择一个网站的记录进行推荐 272 | :params: rt, 推荐算法类型 273 | ''' 274 | self.K = K 275 | self.N = N 276 | self.site = site 277 | self.rt = rt 278 | self.alg = {'RecentPopular': RecentPopular, 'TItemCF': TItemCF, 279 | 'TUserCF': TUserCF, 'ItemCF': ItemCF, 'UserCF': 
UserCF} 280 | 281 | # 定义单次实验 282 | def worker(self, train, test): 283 | ''' 284 | :params: train, 训练数据集 285 | :params: test, 测试数据集 286 | :return: 各指标的值 287 | ''' 288 | getRecommendation = self.alg[self.rt](train, self.K, self.N) 289 | metric = Metric(train, test, getRecommendation) 290 | return metric.eval() 291 | 292 | # 运行实验 293 | def run(self): 294 | dataset = Dataset(self.site) 295 | train, test = dataset.splitData() 296 | metric = self.worker(train, test) 297 | print('Result (site={}, K={}, N={}): {}'.format( 298 | self.site, self.K, self.N, metric)) 299 | 300 | # 1. RecentPopular实验 301 | K = 0 # 为保持一致而设置,随便填一个值 302 | for site in ['www.nytimes.com', 'en.wikipedia.org']: 303 | for N in range(10, 110, 10): 304 | exp = Experiment(K, N, site=site, rt='RecentPopular') 305 | exp.run() 306 | 307 | # 2. TItemCF实验 308 | K = 10 309 | for site in ['www.nytimes.com', 'en.wikipedia.org']: 310 | for N in range(10, 110, 10): 311 | exp = Experiment(K, N, site=site, rt='TItemCF') 312 | exp.run() 313 | 314 | # 3. TUserCF实验 315 | K = 10 316 | for site in ['www.nytimes.com', 'en.wikipedia.org']: 317 | for N in range(10, 110, 10): 318 | exp = Experiment(K, N, site=site, rt='TUserCF') 319 | exp.run() 320 | 321 | # 4. ItemCF实验 322 | K = 10 323 | for site in ['www.nytimes.com', 'en.wikipedia.org']: 324 | for N in range(10, 110, 10): 325 | exp = Experiment(K, N, site=site, rt='ItemCF') 326 | exp.run() 327 | 328 | # 5. 
UserCF实验 329 | K = 10 330 | for site in ['www.nytimes.com', 'en.wikipedia.org']: 331 | for N in range(10, 110, 10): 332 | exp = Experiment(K, N, site=site, rt='UserCF') 333 | exp.run() 334 | 335 | -------------------------------------------------------------------------------- /chapter6_socail_network/train_social_network.py: -------------------------------------------------------------------------------- 1 | # 导入包 2 | import random 3 | import math 4 | import time 5 | from tqdm import tqdm 6 | 7 | # 定义装饰器,监控运行时间 8 | def timmer(func): 9 | def wrapper(*args, **kwargs): 10 | start_time = time.time() 11 | res = func(*args, **kwargs) 12 | stop_time = time.time() 13 | print('Func %s, run time: %s' % (func.__name__, stop_time - start_time)) 14 | return res 15 | return wrapper 16 | 17 | 18 | class Dataset(): 19 | 20 | def __init__(self, fp, sample=100000): 21 | # fp: data file path 22 | # sample: 只取部分数据集,-1则为全部 23 | self.data = self.loadData(fp, sample) 24 | 25 | def loadData(self, fp, sample): 26 | # 只取一个小数据集进行处理 27 | data = [f.strip().split('\t') for f in open(fp).readlines()[4:]] 28 | if sample == -1: 29 | return data 30 | else: 31 | random.shuffle(data) 32 | return data[:sample] 33 | 34 | def splitData(self, M, k, seed=1): 35 | ''' 36 | :params: data, 加载的所有(user, item)数据条目 37 | :params: M, 划分的数目,最后需要取M折的平均 38 | :params: k, 本次是第几次划分,k~[0, M) 39 | :params: seed, random的种子数,对于不同的k应设置成一样的 40 | :return: train, test 41 | ''' 42 | train, test = [], [] 43 | random.seed(seed) 44 | for u, v in self.data: 45 | # 这里与书中的不一致,本人认为取M-1较为合理,因randint是左右都覆盖的 46 | if random.randint(0, M - 1) == k: 47 | test.append((u, v)) 48 | else: 49 | train.append((u, v)) 50 | 51 | # 处理成字典的形式,user->set(items) 52 | def convert_dict(data): 53 | data_dict = {} # 当前用户指向的用户 54 | data_dict_t = {} # 指向当前用户的用户 55 | for u, v in data: 56 | if u not in data_dict: 57 | data_dict[u] = set() 58 | data_dict[u].add(v) 59 | if v not in data_dict_t: 60 | data_dict_t[v] = set() 61 | data_dict_t[v].add(u) 62 | data_dict = 
{k: list(data_dict[k]) for k in data_dict} 63 | data_dict_t = {k: list(data_dict_t[k]) for k in data_dict_t} 64 | return data_dict, data_dict_t 65 | 66 | return convert_dict(train), convert_dict(test)[0] 67 | 68 | 69 | class Metric(): 70 | 71 | def __init__(self, train, test, GetRecommendation): 72 | ''' 73 | :params: train, 训练数据 74 | :params: test, 测试数据 75 | :params: GetRecommendation, 为某个用户获取推荐物品的接口函数 76 | ''' 77 | self.train = train 78 | self.test = test 79 | self.GetRecommendation = GetRecommendation 80 | self.recs = self.getRec() 81 | 82 | # 为test中的每个用户进行推荐 83 | def getRec(self): 84 | recs = {} 85 | for user in self.test: 86 | rank = self.GetRecommendation(user) 87 | recs[user] = rank 88 | return recs 89 | 90 | # 定义精确率指标计算方式 91 | def precision(self): 92 | all, hit = 0, 0 93 | for user in self.test: 94 | test_users = set(self.test[user]) 95 | rank = self.recs[user] 96 | for v, score in rank: 97 | if v in test_users: 98 | hit += 1 99 | all += len(rank) 100 | return round(hit / all * 100, 2) if all > 0 else 0 101 | 102 | # 定义召回率指标计算方式 103 | def recall(self): 104 | all, hit = 0, 0 105 | for user in self.test: 106 | test_users = set(self.test[user]) 107 | rank = self.recs[user] 108 | for v, score in rank: 109 | if v in test_users: 110 | hit += 1 111 | all += len(test_users) 112 | return round(hit / all * 100, 2) if all > 0 else 0 113 | 114 | def eval(self): 115 | metric = {'Precision': self.precision(), 116 | 'Recall': self.recall()} 117 | print('Metric:', metric) 118 | return metric 119 | 120 | 121 | # 1. 
# Similarity computed from users' out-links
def OUT(train, N):
    '''
    Recommend users who point at the same targets as the given user.

    :params: train, training data as a pair (G, GT): out-link and in-link adjacency
    :params: N, hyper-parameter, number of users returned (TopN)
    :return: GetRecommendation, the recommendation interface function
    '''

    G, GT = train  # out-links and in-links respectively

    def GetRecommendation(user):
        if user not in G:
            return []
        # Count shared out-links with every not-yet-followed user
        user_sim = {}
        user_friends = set(G[user])
        for u in G[user]:
            if u not in GT:
                continue
            # Every v that also points at u shares the out-link u with `user`
            for v in GT[u]:
                if v != user and v not in user_friends:
                    user_sim[v] = user_sim.get(v, 0) + 1
        user_sim = {v: cnt / math.sqrt(len(G[user]) * len(G[v]))
                    for v, cnt in user_sim.items()}
        return sorted(user_sim.items(), key=lambda x: x[1], reverse=True)[:N]

    return GetRecommendation


# 2. Similarity computed from users' in-links
def IN(train, N):
    '''
    Recommend users who are pointed at by the same sources as the given user.

    :params: train, training data as a pair (G, GT): out-link and in-link adjacency
    :params: N, hyper-parameter, number of users returned (TopN)
    :return: GetRecommendation, the recommendation interface function
    '''

    G, GT = train

    def GetRecommendation(user):
        if user not in GT:
            return []
        # Count shared in-links with every not-yet-followed user
        user_sim = {}
        user_friends = set(G[user]) if user in G else set()
        for u in GT[user]:
            if u not in G:
                continue
            for v in G[u]:
                if v != user and v not in user_friends:
                    user_sim[v] = user_sim.get(v, 0) + 1
        # Normalise by the two in-degrees. The product is taken on the
        # lengths; multiplying the container itself by an int built a huge
        # temporary list and failed outright for sets.
        user_sim = {v: cnt / math.sqrt(len(GT[user]) * len(GT[v]))
                    for v, cnt in user_sim.items()}
        return sorted(user_sim.items(), key=lambda x: x[1], reverse=True)[:N]

    return GetRecommendation


# 3.
利用用户出度和入度进行计算,但没有考虑到热门入度用户的惩罚 178 | def OUT_IN(train, N): 179 | ''' 180 | :params: train, 训练数据集(包含出和入的) 181 | :params: N, 超参数,设置取TopN推荐用户数目 182 | :return: GetRecommendation,推荐接口函数 183 | ''' 184 | 185 | G, GT = train 186 | 187 | def GetRecommendation(user): 188 | if user not in G: return [] 189 | # 根据相似度推荐N个未见过的 190 | user_sim = {} 191 | user_friends = set(G[user]) 192 | for u in G[user]: 193 | if u not in G: continue 194 | for v in G[u]: 195 | if v != user and v not in user_friends: 196 | if v not in user_sim: 197 | user_sim[v] = 0 198 | user_sim[v] += 1 199 | user_sim = {v: user_sim[v] / len(G[user]) for v in user_sim} 200 | return list(sorted(user_sim.items(), key=lambda x: x[1], reverse=True))[:N] 201 | 202 | return GetRecommendation 203 | 204 | 205 | # 4. 利用用户出度和入度的余弦相似度进行计算 206 | def OUT_IN_Cosine(train, N): 207 | ''' 208 | :params: train, 训练数据集(包含出和入的) 209 | :params: N, 超参数,设置取TopN推荐用户数目 210 | :return: GetRecommendation,推荐接口函数 211 | ''' 212 | 213 | G, GT = train 214 | 215 | def GetRecommendation(user): 216 | if user not in G: return [] 217 | # 根据相似度推荐N个未见过的 218 | user_sim = {} 219 | user_friends = set(G[user]) 220 | for u in G[user]: 221 | if u not in G: continue 222 | for v in G[u]: 223 | if v != user and v not in user_friends: 224 | if v not in user_sim: 225 | user_sim[v] = 0 226 | user_sim[v] += 1 227 | user_sim = {v: user_sim[v] / math.sqrt(len(G[user]) * len(GT[v])) for v in user_sim} 228 | return list(sorted(user_sim.items(), key=lambda x: x[1], reverse=True))[:N] 229 | 230 | return GetRecommendation 231 | 232 | 233 | class Experiment(): 234 | 235 | def __init__(self, M, N, fp='../../../data/soc-Slashdot0902', rt='OUT'): 236 | ''' 237 | :params: M, 进行多少次实验 238 | :params: N, TopN推荐用户的个数 239 | :params: fp, 数据文件路径 240 | :params: rt, 推荐算法类型 241 | ''' 242 | self.M = M 243 | self.N = N 244 | self.fp = fp 245 | self.rt = rt 246 | self.alg = {'OUT': OUT, 'IN': IN, 247 | 'OUT_IN': OUT_IN, 'OUT_IN_Cosine': OUT_IN_Cosine} 248 | 249 | # 定义单次实验 250 | def 
worker(self, train, test): 251 | ''' 252 | :params: train, 训练数据集 253 | :params: test, 测试数据集 254 | :return: 各指标的值 255 | ''' 256 | getRecommendation = self.alg[self.rt](train, self.N) 257 | metric = Metric(train[0], test, getRecommendation) 258 | return metric.eval() 259 | 260 | # 多次实验取平均 261 | def run(self): 262 | metrics = {'Precision': 0, 'Recall': 0} 263 | dataset = Dataset(self.fp) 264 | for ii in range(self.M): 265 | train, test = dataset.splitData(self.M, ii) 266 | print('Experiment {}:'.format(ii)) 267 | metric = self.worker(train, test) 268 | metrics = {k: metrics[k] + metric[k] for k in metrics} 269 | metrics = {k: metrics[k] / self.M for k in metrics} 270 | print('Average Result (M={}, N={}, alg={}): {}'.format( 271 | self.M, self.N, self.rt, metrics)) 272 | 273 | # 1. Slashdot数据集实验 274 | M, N = 10, 10 275 | for alg in ['OUT', 'IN', 'OUT_IN', 'OUT_IN_Cosine']: 276 | exp = Experiment(M, N, fp='../../../data/soc-Slashdot0902/Slashdot0902.txt', rt=alg) 277 | exp.run() 278 | 279 | 280 | # 2. 
Epinions数据集实验 281 | M, N = 10, 10 282 | for alg in ['OUT', 'IN', 'OUT_IN', 'OUT_IN_Cosine']: 283 | exp = Experiment(M, N, fp='../../../data/soc-Epinions1/soc-Epinions1.txt', rt=alg) 284 | exp.run() 285 | 286 | -------------------------------------------------------------------------------- /chapter8_score_pre/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangyuyunmu/Recommended-system-practice/c436b881f6fc7ae8ddc1f6927bd9dd07bf8d66e3/chapter8_score_pre/__init__.py -------------------------------------------------------------------------------- /chapter8_score_pre/train_cascade_model.py: -------------------------------------------------------------------------------- 1 | # 导入包 2 | import random 3 | import math 4 | import time 5 | from tqdm import tqdm 6 | 7 | 8 | # 定义装饰器,监控运行时间 9 | def timmer(func): 10 | def wrapper(*args, **kwargs): 11 | start_time = time.time() 12 | res = func(*args, **kwargs) 13 | stop_time = time.time() 14 | print('Func %s, run time: %s' % (func.__name__, stop_time - start_time)) 15 | return res 16 | return wrapper 17 | 18 | 19 | class Data(): 20 | 21 | def __init__(self, user, item, rate, test=False, predict=0.0): 22 | self.user = user 23 | self.item = item 24 | self.rate = rate 25 | self.test = test 26 | self.predict = predict 27 | 28 | 29 | class Dataset(): 30 | 31 | def __init__(self, fp): 32 | # fp: data file path 33 | self.data = self.loadData(fp) 34 | 35 | def loadData(self, fp): 36 | data = [] 37 | for l in open(fp): 38 | data.append(tuple(map(int, l.strip().split('::')[:3]))) 39 | data = [Data(*d) for d in data] 40 | return data 41 | 42 | def splitData(self, M, k, seed=1): 43 | ''' 44 | :params: data, 加载的所有数据条目 45 | :params: M, 划分的数目,最后需要取M折的平均 46 | :params: k, 本次是第几次划分,k~[0, M) 47 | :params: seed, random的种子数,对于不同的k应设置成一样的 48 | :return: train, test 49 | ''' 50 | random.seed(seed) 51 | for i in range(len(self.data)): 52 | # 
这里与书中的不一致,本人认为取M-1较为合理,因randint是左右都覆盖的 53 | if random.randint(0, M - 1) == k: 54 | self.data[i].test = True 55 | 56 | def RMSE(records): 57 | rmse = {'train_rmse': [], 'test_rmse': []} 58 | for r in records: 59 | if r.test: 60 | rmse['test_rmse'].append((r.rate - r.predict) ** 2) 61 | else: 62 | rmse['train_rmse'].append((r.rate - r.predict) ** 2) 63 | rmse = {'train_rmse': math.sqrt(sum(rmse['train_rmse']) / len(rmse['train_rmse'])), 64 | 'test_rmse': math.sqrt(sum(rmse['test_rmse']) / len(rmse['test_rmse']))} 65 | return rmse 66 | 67 | 68 | # 1. Cluster 69 | class Cluster: 70 | 71 | def __init__(self, records): 72 | self.group = {} 73 | 74 | def GetGroup(self, i): 75 | return 0 76 | 77 | 78 | # 2. IdCluster 79 | class IdCluster(Cluster): 80 | 81 | def __init__(self, records): 82 | Cluster.__init__(self, records) 83 | 84 | def GetGroup(self, i): 85 | return i 86 | 87 | 88 | # 3. UserActivityCluster 89 | class UserActivityCluster(Cluster): 90 | 91 | def __init__(self, records): 92 | Cluster.__init__(self, records) 93 | activity = {} 94 | for r in records: 95 | if r.test: continue 96 | if r.user not in activity: 97 | activity[r.user] = 0 98 | activity[r.user] += 1 99 | # 按照用户活跃度进行分组 100 | k = 0 101 | for user, n in sorted(activity.items(), key=lambda x: x[-1], reverse=False): 102 | c = int((k * 5) / len(activity)) 103 | self.group[user] = c 104 | k += 1 105 | 106 | def GetGroup(self, uid): 107 | if uid not in self.group: 108 | return -1 109 | else: 110 | return self.group[uid] 111 | 112 | 113 | # 3. 
# ItemPopularityCluster
class ItemPopularityCluster(Cluster):
    '''Assign each item to one of five groups by its popularity rank.'''

    def __init__(self, records):
        super().__init__(records)
        # Popularity = number of non-test records that mention the item
        popularity = {}
        for r in records:
            if not r.test:
                popularity[r.item] = popularity.get(r.item, 0) + 1
        # Rank items from least to most popular and cut the ranking into
        # five equally sized groups (0..4)
        total = len(popularity)
        ranked = sorted(popularity.items(), key=lambda x: x[-1])
        for rank, (item, _) in enumerate(ranked):
            self.group[item] = int((rank * 5) / total)

    def GetGroup(self, iid):
        # Items without any non-test record fall into group -1
        return self.group.get(iid, -1)


# 4. UserVoteCluster
class UserVoteCluster(Cluster):
    '''Assign each user to a group derived from the user's average rating.'''

    def __init__(self, records):
        super().__init__(records)
        vote, cnt = {}, {}
        for r in records:
            if not r.test:
                vote[r.user] = vote.get(r.user, 0) + r.rate
                cnt[r.user] = cnt.get(r.user, 0) + 1
        # Group id = the user's average rating, doubled and truncated
        for user, total in vote.items():
            mean = total / float(cnt[user])
            self.group[user] = int(mean * 2)

    def GetGroup(self, uid):
        # Users without any non-test record fall into group -1
        return self.group.get(uid, -1)


# 5.
# ItemVoteCluster
class ItemVoteCluster(Cluster):
    """Group items by their mean training rating; the group id is int(mean * 2)."""

    def __init__(self, records):
        Cluster.__init__(self, records)
        vote, cnt = {}, {}
        for r in records:
            if r.test:
                continue
            vote[r.item] = vote.get(r.item, 0) + r.rate
            cnt[r.item] = cnt.get(r.item, 0) + 1
        # Derive the group from each item's average rating.
        for item, v in vote.items():
            c = v / (cnt[item] * 1.0)
            self.group[item] = int(c * 2)

    def GetGroup(self, iid):
        """Return the group of item ``iid``, or -1 when it has no training record."""
        return self.group.get(iid, -1)


# Prediction entry point
def PredictAll(records, UserGroup, ItemGroup):
    '''
    Add the smoothed mean residual of each (user group, item group) cell
    to the ``predict`` field of every record, in place.

    :params: records, data set; each record exposes user, item, rate, test, predict
    :params: UserGroup, user clustering class, instantiated as UserGroup(records)
    :params: ItemGroup, item clustering class, instantiated as ItemGroup(records)

    Residuals (rate - predict) are collected from training records only, so
    held-out ratings never influence the correction that is later scored on
    them. A cell that has no training record leaves ``predict`` unchanged.
    '''
    userGroup = UserGroup(records)
    itemGroup = ItemGroup(records)
    totals, counts = {}, {}
    for r in records:
        if r.test:
            continue
        cell = (userGroup.GetGroup(r.user), itemGroup.GetGroup(r.item))
        # Accumulate the residual of the current prediction.
        totals[cell] = totals.get(cell, 0.0) + (r.rate - r.predict)
        counts[cell] = counts.get(cell, 0) + 1
    # The extra 1.0 in the denominator shrinks the mean of sparsely filled cells.
    correction = {cell: totals[cell] / (1.0 * counts[cell] + 1.0) for cell in totals}
    # predict
    for r in records:
        cell = (userGroup.GetGroup(r.user), itemGroup.GetGroup(r.item))
        # NOTE: the correction is simply added to the earlier result; ideally the
        # weight of each stage would be learned with a regression.
        r.predict += correction.get(cell, 0.0)


class Experiment():

    def __init__(self, M, UserGroup, ItemGroup, fp='../../../data/movies_data/ratings.dat'):
        '''
        :params: M, number of partitions handed to Dataset.splitData; partition 0 is the test set
        :params: UserGroup, ItemGroup, clustering classes used by PredictAll
        :params: fp, path of the ratings file
        '''
        self.userGroup = UserGroup
        self.itemGroup = ItemGroup
        # The data set is loaded and split once, so predictions accumulate across run() calls.
        self.dataset = Dataset(fp)
        self.dataset.splitData(M, 0)

    # A single experiment
    def worker(self, records):
        '''
        :params: records, all records (training and test rows are told apart by their test flag)
        :return: dict with the train and test RMSE
        '''
        PredictAll(records, self.userGroup, self.itemGroup)
        metric = RMSE(records)
        return metric

    # Run one stage with the currently configured clusters and print its RMSE
    def run(self):
        metric = self.worker(self.dataset.data)
        print('Result (UserGroup={}, ItemGroup={}): {}'.format(
            self.userGroup.__name__,
            self.itemGroup.__name__, metric))

UserGroups = [Cluster, IdCluster, Cluster, UserActivityCluster, UserActivityCluster, Cluster, IdCluster,
              UserActivityCluster, UserVoteCluster, UserVoteCluster, Cluster, IdCluster, UserVoteCluster]
ItemGroups = [Cluster, Cluster, IdCluster, Cluster, IdCluster, ItemPopularityCluster, ItemPopularityCluster,
              ItemPopularityCluster, Cluster, IdCluster, ItemVoteCluster, ItemVoteCluster, ItemVoteCluster]
M = 10
exp = Experiment(M, None, None)
# Every stage reuses the same Experiment, so each one corrects the residual left by the previous stages.
for i in range(len(UserGroups)):
    exp.userGroup = UserGroups[i]
    exp.itemGroup = ItemGroups[i]
    exp.run()



# --------------------------------------------------------------------------------
# /chapter8_score_pre/train_score_pre.py:
# --------------------------------------------------------------------------------
# Imports
import functools
import random
import math
import time
from tqdm import tqdm

# Decorator that reports the run time of the wrapped function
def timmer(func):
    @functools.wraps(func)  # keep the wrapped function's __name__ and docstring
    def wrapper(*args, **kwargs):
        start_time = time.time()
        res = func(*args, **kwargs)
        stop_time = time.time()
        print('Func %s, run time: %s' % (func.__name__, stop_time - start_time))
        return res
    return wrapper


class Data():
    """One rating record plus its train/test flag and current prediction."""

    def __init__(self, user, item, rate, test=False, predict=0.0):
        self.user = user
        self.item = item
        self.rate = rate
        self.test = test
        self.predict = predict


class Dataset():

    def __init__(self, fp):
        # fp: data file path
        self.data = self.loadData(fp)

    def loadData(self, fp):
        '''
        Read a '::'-separated ratings file and return a list of Data records.

        Only the first three fields of every line are used and each is parsed
        as an int. Blank lines are skipped.
        '''
        data = []
        # The context manager closes the file even if a line fails to parse.
        with open(fp) as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                data.append(Data(*map(int, line.split('::')[:3])))
        return data

    def splitData(self, M, k, seed=1):
        '''
        Flag the records of partition k as test data, in place.

        :params: M, number of partitions
        :params: k, index of the partition used as test set, k in [0, M)
        :params: seed, seed for random; use the same value for every k
        :return: None; the test flag of every record in self.data is rewritten
        '''
        random.seed(seed)
        for record in self.data:
            # Differs from the book: M - 1 is used because random.randint
            # includes both endpoints. Assigning the comparison result also
            # clears flags left over from an earlier split.
            record.test = random.randint(0, M - 1) == k

def RMSE(records):
    '''
    Return {'train_rmse': ..., 'test_rmse': ...} for the given records.

    A partition without any record yields NaN instead of raising
    ZeroDivisionError.
    '''
    squared = {'train_rmse': [], 'test_rmse': []}
    for r in records:
        key = 'test_rmse' if r.test else 'train_rmse'
        squared[key].append((r.rate - r.predict) ** 2)
    rmse = {}
    for key in ('train_rmse', 'test_rmse'):
        errors = squared[key]
        rmse[key] = math.sqrt(sum(errors) / len(errors)) if errors else float('nan')
    return rmse


# 1. Cluster
class Cluster:
    # Base grouping: every id is placed in group 0
    def __init__(self, records):
        self.group = {}

    def GetGroup(self, i):
        return 0


# 2. IdCluster
class IdCluster(Cluster):
    # Every id forms its own group
    def __init__(self, records):
        Cluster.__init__(self, records)

    def GetGroup(self, i):
        return i


# 3. 
# UserActivityCluster
class UserActivityCluster(Cluster):
    # Buckets users into groups 0..4 by the number of training records they have.
    def __init__(self, records):
        Cluster.__init__(self, records)
        seen = {}
        for rec in records:
            if not rec.test:
                seen[rec.user] = seen.get(rec.user, 0) + 1
        # Rank users from least to most active and map the rank onto a bucket.
        n_users = len(seen)
        by_activity = sorted(seen.items(), key=lambda pair: pair[-1])
        for rank, (user, _) in enumerate(by_activity):
            self.group[user] = int((rank * 5) / n_users)

    def GetGroup(self, uid):
        # -1 marks a user that has no training record.
        return self.group.get(uid, -1)


# 3. ItemPopularityCluster
class ItemPopularityCluster(Cluster):
    # Buckets items into groups 0..4 by the number of training records they have.
    def __init__(self, records):
        Cluster.__init__(self, records)
        seen = {}
        for rec in records:
            if not rec.test:
                seen[rec.item] = seen.get(rec.item, 0) + 1
        # Rank items from least to most popular and map the rank onto a bucket.
        n_items = len(seen)
        by_popularity = sorted(seen.items(), key=lambda pair: pair[-1])
        for rank, (item, _) in enumerate(by_popularity):
            self.group[item] = int((rank * 5) / n_items)

    def GetGroup(self, iid):
        # -1 marks an item that has no training record.
        return self.group.get(iid, -1)


# 4. UserVoteCluster
class UserVoteCluster(Cluster):
    # Groups users by the average rating they gave (not by how active they are).
    def __init__(self, records):
        Cluster.__init__(self, records)
        rating_sum = {}
        rating_cnt = {}
        for rec in records:
            if rec.test:
                continue
            rating_sum[rec.user] = rating_sum.get(rec.user, 0) + rec.rate
            rating_cnt[rec.user] = rating_cnt.get(rec.user, 0) + 1
        # The group id is the user's mean rating doubled and truncated to an int.
        for user, total in rating_sum.items():
            mean = total / (rating_cnt[user] * 1.0)
            self.group[user] = int(mean * 2)

    def GetGroup(self, uid):
        # -1 marks a user that has no training record.
        return self.group.get(uid, -1)


# 5. 
# ItemVoteCluster
class ItemVoteCluster(Cluster):
    # Groups items by the average rating they received (not by how popular they are).
    def __init__(self, records):
        Cluster.__init__(self, records)
        vote, cnt = {}, {}
        for r in records:
            if r.test:
                continue
            vote[r.item] = vote.get(r.item, 0) + r.rate
            cnt[r.item] = cnt.get(r.item, 0) + 1
        # The group id is the item's mean rating doubled and truncated to an int.
        for item, v in vote.items():
            c = v / (cnt[item] * 1.0)
            self.group[item] = int(c * 2)

    def GetGroup(self, iid):
        # -1 marks an item that has no training record.
        return self.group.get(iid, -1)

# Prediction entry point
def PredictAll(records, UserGroup, ItemGroup):
    '''
    Set ``predict`` of every record to the smoothed mean training rating of
    its (user group, item group) cell, in place.

    :params: records, data set; each record exposes user, item, rate, test, predict
    :params: UserGroup, user clustering class, instantiated as UserGroup(records)
    :params: ItemGroup, item clustering class, instantiated as ItemGroup(records)

    Only training records contribute to the cell means, so held-out ratings
    are never used to predict themselves. A record whose cell has no training
    data receives the smoothed mean of all training ratings instead.
    '''
    userGroup = UserGroup(records)
    itemGroup = ItemGroup(records)
    totals, counts = {}, {}
    for r in records:
        if r.test:
            continue
        cell = (userGroup.GetGroup(r.user), itemGroup.GetGroup(r.item))
        totals[cell] = totals.get(cell, 0) + r.rate
        counts[cell] = counts.get(cell, 0) + 1
    # The extra 1.0 in the denominator shrinks the mean of sparsely filled cells.
    average = {cell: totals[cell] / (1.0 * counts[cell] + 1.0) for cell in totals}
    # Fallback for unseen cells; evaluates to 0.0 when there is no training record at all.
    fallback = sum(totals.values()) / (1.0 * sum(counts.values()) + 1.0)
    # predict
    for r in records:
        cell = (userGroup.GetGroup(r.user), itemGroup.GetGroup(r.item))
        r.predict = average.get(cell, fallback)


class Experiment():

    def __init__(self, M, UserGroup, ItemGroup, fp='../../../data/movies_data/ratings.dat'):
        '''
        :params: M, number of partitions handed to Dataset.splitData; partition 0 is the test set
        :params: UserGroup, ItemGroup, clustering classes used by PredictAll
        :params: fp, path of the ratings file
        '''
        self.M = M
        self.userGroup = UserGroup
        self.itemGroup = ItemGroup
        self.fp = fp

    # A single experiment
    def worker(self, records):
        '''
        :params: records, all records (training and test rows are told apart by their test flag)
        :return: dict with the train and test RMSE
        '''
        PredictAll(records, self.userGroup, self.itemGroup)
        metric = RMSE(records)
        return metric

    # Load and split the data, run one experiment and print its RMSE
    def run(self):
        dataset = Dataset(self.fp)
        dataset.splitData(self.M, 0)
        metric = self.worker(dataset.data)
        print('Result (UserGroup={}, ItemGroup={}): {}'.format(
            self.userGroup.__name__,
            self.itemGroup.__name__, metric))

UserGroups = [Cluster, IdCluster, Cluster, UserActivityCluster, UserActivityCluster, Cluster, IdCluster,
              UserActivityCluster, UserVoteCluster, UserVoteCluster, Cluster, IdCluster, UserVoteCluster]
ItemGroups = [Cluster, Cluster, IdCluster, Cluster, IdCluster, ItemPopularityCluster, ItemPopularityCluster,
              ItemPopularityCluster, Cluster, IdCluster, ItemVoteCluster, ItemVoteCluster, ItemVoteCluster]
M = 10
# for i in range(len(UserGroups)):
#     exp = Experiment(M, UserGroups[i], ItemGroups[i])
#     exp.run()

exp = Experiment(M, UserGroups[1], ItemGroups[1])
exp.run()

# --------------------------------------------------------------------------------
# /images/TopN推荐.png:
# --------------------------------------------------------------------------------
# https://raw.githubusercontent.com/wangyuyunmu/Recommended-system-practice/c436b881f6fc7ae8ddc1f6927bd9dd07bf8d66e3/images/TopN推荐.png
# --------------------------------------------------------------------------------
# /images/推荐系统.png:
# --------------------------------------------------------------------------------
# https://raw.githubusercontent.com/wangyuyunmu/Recommended-system-practice/c436b881f6fc7ae8ddc1f6927bd9dd07bf8d66e3/images/推荐系统.png
# --------------------------------------------------------------------------------
# /images/推荐系统架构.png:
# --------------------------------------------------------------------------------
# https://raw.githubusercontent.com/wangyuyunmu/Recommended-system-practice/c436b881f6fc7ae8ddc1f6927bd9dd07bf8d66e3/images/推荐系统架构.png
-------------------------------------------------------------------------------- /images/评分预测推荐.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangyuyunmu/Recommended-system-practice/c436b881f6fc7ae8ddc1f6927bd9dd07bf8d66e3/images/评分预测推荐.png --------------------------------------------------------------------------------