├── requirements.txt ├── Datas ├── ml-1m │ ├── 0.1 │ │ ├── IBCF ml-1M.png │ │ ├── UBCF ml-1M.png │ │ ├── Item-basedCF.txt │ │ └── User-basedCF.txt │ └── 0.2 │ │ ├── IBCF ml-1M.png │ │ ├── UBCF ml-1M.png │ │ ├── Item-basedCF.txt │ │ └── User-basedCF.txt ├── ml-10M │ └── 0.1 │ │ ├── IBCF ml-10M.png │ │ ├── UBCF ml-10M.png │ │ ├── Item-basedCF.txt │ │ └── User-basedCF.txt └── ml-100k │ ├── 0.1 │ ├── IBCF ml-100k.png │ ├── UBCF ml-100k.png │ ├── Item-basedCF.txt │ └── User-basedCF.txt │ ├── 0.2 │ ├── IBCF ml-100k.png │ ├── UBCF ml-100k.png │ ├── Item-basedCF.txt │ └── User-basedCF.txt │ ├── IBCF ml-100k 0.1.png │ ├── IBCF ml-100k 0.2.png │ ├── UBCF ml-100k 0.1.png │ ├── UBCF ml-100k 0.2.png │ ├── Item-basedCF ml-100k 0.1.txt │ ├── Item-basedCF ml-100k 0.2.txt │ ├── User-basedCF ml-100k 0.1.txt │ └── User-basedCF ml-100k 0.2.txt ├── Docs └── ml-100k │ └── UBCF ml-100k 0.2.png ├── test_mbcf.py ├── .travis.yml ├── EvaluationHelper.py ├── ThreadWithReturn.py ├── README.md ├── DistanceHelper.py ├── .gitignore ├── User_basedCF.py ├── Item_basedCF.py ├── bak ├── kNNUnitTest.py ├── getRating.py ├── Test1.py ├── Test.py └── Intro to Recommender Systems_Collaborative Filtering.py ├── RunExample.py └── DataHelper.py /requirements.txt: -------------------------------------------------------------------------------- 1 | pandas 2 | matplotlib 3 | sklearn 4 | numpy -------------------------------------------------------------------------------- /Datas/ml-1m/0.1/IBCF ml-1M.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fuhailin/Memory-based-collaborative-filtering/HEAD/Datas/ml-1m/0.1/IBCF ml-1M.png -------------------------------------------------------------------------------- /Datas/ml-1m/0.1/UBCF ml-1M.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fuhailin/Memory-based-collaborative-filtering/HEAD/Datas/ml-1m/0.1/UBCF ml-1M.png -------------------------------------------------------------------------------- /Datas/ml-1m/0.2/IBCF ml-1M.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fuhailin/Memory-based-collaborative-filtering/HEAD/Datas/ml-1m/0.2/IBCF ml-1M.png -------------------------------------------------------------------------------- /Datas/ml-1m/0.2/UBCF ml-1M.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fuhailin/Memory-based-collaborative-filtering/HEAD/Datas/ml-1m/0.2/UBCF ml-1M.png -------------------------------------------------------------------------------- /Datas/ml-10M/0.1/IBCF ml-10M.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fuhailin/Memory-based-collaborative-filtering/HEAD/Datas/ml-10M/0.1/IBCF ml-10M.png -------------------------------------------------------------------------------- /Datas/ml-10M/0.1/UBCF ml-10M.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fuhailin/Memory-based-collaborative-filtering/HEAD/Datas/ml-10M/0.1/UBCF ml-10M.png -------------------------------------------------------------------------------- /Docs/ml-100k/UBCF ml-100k 0.2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fuhailin/Memory-based-collaborative-filtering/HEAD/Docs/ml-100k/UBCF ml-100k 
0.2.png -------------------------------------------------------------------------------- /Datas/ml-100k/0.1/IBCF ml-100k.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fuhailin/Memory-based-collaborative-filtering/HEAD/Datas/ml-100k/0.1/IBCF ml-100k.png -------------------------------------------------------------------------------- /Datas/ml-100k/0.1/UBCF ml-100k.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fuhailin/Memory-based-collaborative-filtering/HEAD/Datas/ml-100k/0.1/UBCF ml-100k.png -------------------------------------------------------------------------------- /Datas/ml-100k/0.2/IBCF ml-100k.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fuhailin/Memory-based-collaborative-filtering/HEAD/Datas/ml-100k/0.2/IBCF ml-100k.png -------------------------------------------------------------------------------- /Datas/ml-100k/0.2/UBCF ml-100k.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fuhailin/Memory-based-collaborative-filtering/HEAD/Datas/ml-100k/0.2/UBCF ml-100k.png -------------------------------------------------------------------------------- /Datas/ml-100k/IBCF ml-100k 0.1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fuhailin/Memory-based-collaborative-filtering/HEAD/Datas/ml-100k/IBCF ml-100k 0.1.png -------------------------------------------------------------------------------- /Datas/ml-100k/IBCF ml-100k 0.2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fuhailin/Memory-based-collaborative-filtering/HEAD/Datas/ml-100k/IBCF ml-100k 0.2.png -------------------------------------------------------------------------------- /Datas/ml-100k/UBCF ml-100k 0.1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fuhailin/Memory-based-collaborative-filtering/HEAD/Datas/ml-100k/UBCF ml-100k 0.1.png -------------------------------------------------------------------------------- /Datas/ml-100k/UBCF ml-100k 0.2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fuhailin/Memory-based-collaborative-filtering/HEAD/Datas/ml-100k/UBCF ml-100k 0.2.png -------------------------------------------------------------------------------- /test_mbcf.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | 3 | class TestClass: 4 | 5 | def test_one(self): 6 | x = "this" 7 | 8 | def test_two(self): 9 | x = "hello" 10 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "3.6" 4 | # command to install dependencies 5 | install: 6 | - pip install -r requirements.txt 7 | # command to run tests 8 | script: pytest 9 | notifications: 10 | email: false 11 | -------------------------------------------------------------------------------- /Datas/ml-10M/0.1/Item-basedCF.txt: -------------------------------------------------------------------------------- 1 | IBCF K=25 RMSE:0.821169 MAE:0.674318 2 | IBCF K=50 RMSE:0.826170 MAE:0.682557 3 | 
IBCF K=75 RMSE:0.830468 MAE:0.689677 4 | IBCF K=100 RMSE:0.833642 MAE:0.694958 5 | IBCF K=125 RMSE:0.836080 MAE:0.699030 6 | IBCF K=150 RMSE:0.837998 MAE:0.702240 7 | -------------------------------------------------------------------------------- /Datas/ml-1m/0.1/Item-basedCF.txt: -------------------------------------------------------------------------------- 1 | IBCF K=25 RMSE:0.864500 MAE:0.747361 2 | IBCF K=50 RMSE:0.869455 MAE:0.755952 3 | IBCF K=75 RMSE:0.874602 MAE:0.764928 4 | IBCF K=100 RMSE:0.878605 MAE:0.771946 5 | IBCF K=125 RMSE:0.881555 MAE:0.777140 6 | IBCF K=150 RMSE:0.884046 MAE:0.781537 7 | -------------------------------------------------------------------------------- /Datas/ml-1m/0.2/Item-basedCF.txt: -------------------------------------------------------------------------------- 1 | IBCF K=25 RMSE:0.868866 MAE:0.754929 2 | IBCF K=50 RMSE:0.873996 MAE:0.763868 3 | IBCF K=75 RMSE:0.878743 MAE:0.772190 4 | IBCF K=100 RMSE:0.882366 MAE:0.778570 5 | IBCF K=125 RMSE:0.885053 MAE:0.783319 6 | IBCF K=150 RMSE:0.887112 MAE:0.786967 7 | -------------------------------------------------------------------------------- /Datas/ml-100k/0.1/Item-basedCF.txt: -------------------------------------------------------------------------------- 1 | IBCF K=25 RMSE:0.911620 MAE:0.831052 2 | IBCF K=50 RMSE:0.916596 MAE:0.840149 3 | IBCF K=75 RMSE:0.921024 MAE:0.848286 4 | IBCF K=100 RMSE:0.924499 MAE:0.854699 5 | IBCF K=125 RMSE:0.926818 MAE:0.858992 6 | IBCF K=150 RMSE:0.928498 MAE:0.862109 7 | -------------------------------------------------------------------------------- /Datas/ml-100k/0.2/Item-basedCF.txt: -------------------------------------------------------------------------------- 1 | IBCF K=25 RMSE:0.919330 MAE:0.845168 2 | IBCF K=50 RMSE:0.921909 MAE:0.849915 3 | IBCF K=75 RMSE:0.924977 MAE:0.855583 4 | IBCF K=100 RMSE:0.927738 MAE:0.860699 5 | IBCF K=125 RMSE:0.929850 MAE:0.864622 6 | IBCF K=150 RMSE:0.931432 MAE:0.867565 7 | -------------------------------------------------------------------------------- /Datas/ml-1m/0.1/User-basedCF.txt: -------------------------------------------------------------------------------- 1 | UBCF K=25 RMSE:0.907885 MAE:0.824255 2 | UBCF K=50 RMSE:0.905784 MAE:0.820444 3 | UBCF K=75 RMSE:0.906797 MAE:0.822281 4 | UBCF K=100 RMSE:0.908323 MAE:0.825051 5 | UBCF K=125 RMSE:0.909714 MAE:0.827580 6 | UBCF K=150 RMSE:0.910878 MAE:0.829700 7 | -------------------------------------------------------------------------------- /Datas/ml-1m/0.2/User-basedCF.txt: -------------------------------------------------------------------------------- 1 | UBCF K=25 RMSE:0.909597 MAE:0.827367 2 | UBCF K=50 RMSE:0.907115 MAE:0.822857 3 | UBCF K=75 RMSE:0.908010 MAE:0.824482 4 | UBCF K=100 RMSE:0.909183 MAE:0.826614 5 | UBCF K=125 RMSE:0.910374 MAE:0.828782 6 | UBCF K=150 RMSE:0.911454 MAE:0.830749 7 | -------------------------------------------------------------------------------- /Datas/ml-100k/0.1/User-basedCF.txt: -------------------------------------------------------------------------------- 1 | UBCF K=25 RMSE:0.935700 MAE:0.875534 2 | UBCF K=50 RMSE:0.936977 MAE:0.877926 3 | UBCF K=75 RMSE:0.939825 MAE:0.883271 4 | UBCF K=100 RMSE:0.941714 MAE:0.886825 5 | UBCF K=125 RMSE:0.943357 MAE:0.889922 6 | UBCF K=150 RMSE:0.944576 MAE:0.892225 7 | -------------------------------------------------------------------------------- /Datas/ml-100k/0.2/User-basedCF.txt: -------------------------------------------------------------------------------- 1 | UBCF K=25 RMSE:0.940682 
MAE:0.884883 2 | UBCF K=50 RMSE:0.941559 MAE:0.886533 3 | UBCF K=75 RMSE:0.943585 MAE:0.890353 4 | UBCF K=100 RMSE:0.945687 MAE:0.894324 5 | UBCF K=125 RMSE:0.947128 MAE:0.897052 6 | UBCF K=150 RMSE:0.948167 MAE:0.899021 7 | -------------------------------------------------------------------------------- /Datas/ml-10M/0.1/User-basedCF.txt: -------------------------------------------------------------------------------- 1 | UBCF K=25 RMSE:0.845004 MAE:0.714032 2 | UBCF K=50 RMSE:0.841574 MAE:0.708246 3 | UBCF K=75 RMSE:0.841605 MAE:0.708298 4 | UBCF K=100 RMSE:0.842181 MAE:0.709269 5 | UBCF K=125 RMSE:0.843042 MAE:0.710719 6 | UBCF K=150 RMSE:0.843740 MAE:0.711898 7 | -------------------------------------------------------------------------------- /Datas/ml-100k/Item-basedCF ml-100k 0.1.txt: -------------------------------------------------------------------------------- 1 | IBCF K=25 RMSE:0.919458 MAE:0.845403 2 | IBCF K=50 RMSE:0.924853 MAE:0.855352 3 | IBCF K=75 RMSE:0.928752 MAE:0.862580 4 | IBCF K=100 RMSE:0.931144 MAE:0.867029 5 | IBCF K=125 RMSE:0.933297 MAE:0.871044 6 | IBCF K=150 RMSE:0.934274 MAE:0.872867 7 | -------------------------------------------------------------------------------- /Datas/ml-100k/Item-basedCF ml-100k 0.2.txt: -------------------------------------------------------------------------------- 1 | IBCF K=25 RMSE:0.917474 MAE:0.841758 2 | IBCF K=50 RMSE:0.921177 MAE:0.848567 3 | IBCF K=75 RMSE:0.925177 MAE:0.855952 4 | IBCF K=100 RMSE:0.928474 MAE:0.862065 5 | IBCF K=125 RMSE:0.930407 MAE:0.865657 6 | IBCF K=150 RMSE:0.931977 MAE:0.868581 7 | -------------------------------------------------------------------------------- /Datas/ml-100k/User-basedCF ml-100k 0.1.txt: -------------------------------------------------------------------------------- 1 | UBCF K=25 RMSE:0.936204 MAE:0.876478 2 | UBCF K=50 RMSE:0.936623 MAE:0.877263 3 | UBCF K=75 RMSE:0.939302 MAE:0.882289 4 | UBCF K=100 RMSE:0.941543 MAE:0.886502 5 | UBCF K=125 RMSE:0.943234 MAE:0.889690 6 | UBCF K=150 RMSE:0.944467 MAE:0.892018 7 | -------------------------------------------------------------------------------- /Datas/ml-100k/User-basedCF ml-100k 0.2.txt: -------------------------------------------------------------------------------- 1 | UBCF K=25 RMSE:0.940611 MAE:0.884748 2 | UBCF K=50 RMSE:0.941572 MAE:0.886558 3 | UBCF K=75 RMSE:0.943732 MAE:0.890631 4 | UBCF K=100 RMSE:0.945455 MAE:0.893886 5 | UBCF K=125 RMSE:0.946944 MAE:0.896703 6 | UBCF K=150 RMSE:0.947725 MAE:0.898183 7 | -------------------------------------------------------------------------------- /EvaluationHelper.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.metrics import mean_squared_error, mean_absolute_error 3 | 4 | 5 | def RMSE(true, prediction): 6 | rmse = np.sqrt(mean_squared_error(true, prediction)) 7 | return rmse 8 | 9 | 10 | def MAE(true, prediction): 11 | mae = mean_absolute_error(true, prediction) 12 | return mae 13 | -------------------------------------------------------------------------------- /ThreadWithReturn.py: -------------------------------------------------------------------------------- 1 | from threading import Thread 2 | 3 | 4 | class ThreadWithReturnValue(Thread): 5 | def __init__(self, group=None, target=None, name=None, args=(), kwargs=None, *, daemon=None): 6 | Thread.__init__(self, group, target, name, args, kwargs, daemon=daemon) 7 | 8 | self._return = None 9 | 10 | def run(self): 11 | if self._target is not None: 12 | 
self._return = self._target(*self._args, **self._kwargs) 13 | 14 | def join(self): 15 | Thread.join(self) 16 | return self._return -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Memory-based-collaborative-filtering 2 | 3 | [![Build Status](https://travis-ci.org/fuhailin/Memory-based-collaborative-filtering.svg?branch=master)](https://travis-ci.org/fuhailin/Memory-based-collaborative-filtering) 4 | 5 | Contain User-based CF([UBCF](https://github.com/fuhailin/Memory-based-collaborative-filtering/blob/master/User_basedCF.py)),Item-based CF([IBCF](https://github.com/fuhailin/Memory-based-collaborative-filtering/blob/master/Item_basedCF.py)) 6 | A robust k-nearest neighbors Recommender System use MovieLens dataset in Python 7 | 8 | ## User-based collaborative filter 9 | > *K=25   RunTime:1s 10 | RMSE:0.940611 11 | MAE:0.884748.* 12 | 13 | 14 | ![image](https://github.com/fuhailin/Memory-based-collaborative-filtering/blob/master/Docs/ml-100k/UBCF%20ml-100k%200.2.png) 15 | ![image](https://github.com/fuhailin/Memory-based-collaborative-filtering/blob/master/Docs/ml-100k/IBCF%20ml-100k%200.2.png) 16 | 17 | Memory-based algorithms are easy to implement and produce reasonable prediction quality. 18 | The drawback of memory-based CF is that it doesn’t scale to real-world scenarios and doesn’t address the well-known cold-start problem, that is when new user or new item enters the system. 19 | -------------------------------------------------------------------------------- /DistanceHelper.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import math 3 | 4 | ''' 5 | # 1) 用scikit cosine_similarity计算余弦相似度 6 | from sklearn.metrics.pairwise import cosine_similarity 7 | user_similarity=cosine_similarity(user_item_matric) 8 | 9 | # 2) 用scikit pairwise_distances计算相似度,用pairwise_distances计算的Cosine distance是1-(cosine similarity)结果 10 | from sklearn.metrics.pairwise import pairwise_distances 11 | user_similarity = pairwise_distances(user_item_matric, metric='cosine') 12 | ''' 13 | 14 | 15 | class DistanceHelper(object): 16 | # 1) given two data points, calculate the euclidean distance between them 17 | def Euclidean_distance(self, vector1, vector2): 18 | points = zip(vector1, vector2) 19 | diffs_squared_distance = [pow(a - b, 2) for (a, b) in points] 20 | return math.sqrt(sum(diffs_squared_distance)) 21 | 22 | def Cosin_distance(self, vector1, vector2): 23 | dot_product = 0.0 24 | normA = 0.0 25 | normB = 0.0 26 | for a, b in zip(vector1, vector2): 27 | dot_product += a * b 28 | normA += a ** 2 29 | normB += b ** 2 30 | if normA == 0.0 or normB == 0.0: 31 | return None 32 | else: 33 | return dot_product / ((normA * normB) ** 0.5) 34 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject 
date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | venv/ 83 | ENV/ 84 | 85 | # Spyder project settings 86 | .spyderproject 87 | 88 | # Rope project settings 89 | .ropeproject 90 | .idea/ 91 | Datas/ml-1m/ 92 | Datas/ml-10M100K/ 93 | Datas/ml-20m/ 94 | *.data 95 | -------------------------------------------------------------------------------- /User_basedCF.py: -------------------------------------------------------------------------------- 1 | from DataHelper import * 2 | from EvaluationHelper import * 3 | 4 | 5 | class UBCollaborativeFilter(object): 6 | def __init__(self): 7 | self.SimilityMatrix = None 8 | self.truerating = [] 9 | self.predictions = [] 10 | self.train_data_matrix = None 11 | self.RMSE = dict() 12 | self.MAE = dict() 13 | self.UserMeanMatrix = None 14 | 15 | # 平均加权策略,预测userId对itemId的评分 16 | def getRating(self, Train_data_matrix, userId, simility_matrix, neighborset): 17 | # simSums为0,即该项目尚未被其他用户评分,这里的处理方法:返回用户平均分 18 | simSums = np.sum(simility_matrix[neighborset]) 19 | # 获取userId 的平均值 20 | averageOfUser = self.UserMeanMatrix[userId] 21 | # 计算每个用户的加权,预测得分 22 | jiaquanAverage = (Train_data_matrix[neighborset]).dot(simility_matrix[neighborset]) 23 | if simSums == 0: 24 | return 0 25 | else: 26 | return jiaquanAverage / simSums 27 | 28 | def doEvaluate(self, testDataMatrix, K): 29 | a, b = testDataMatrix.nonzero() 30 | for userIndex, itemIndex in zip(a, b): 31 | # 用户最相似的K个用户 32 | neighborset = get_K_Neighbors(self.train_data_matrix[:, itemIndex], self.SimilityMatrix[userIndex], K) 33 | # 基于训练集预测用户评分(用户数目<=K) 34 | prerating = self.getRating(self.train_data_matrix[:, itemIndex], userIndex, self.SimilityMatrix[userIndex], neighborset) 35 | self.truerating.append(testDataMatrix[userIndex][itemIndex]) 36 | self.predictions.append(prerating) 37 | # print(len(self.predictions)) 38 | self.RMSE[K] = RMSE(self.truerating, self.predictions) 39 | self.MAE[K] = MAE(self.truerating, self.predictions) 40 | print("UBCF K=%d,RMSE:%f,MAE:%f" % (K, self.RMSE[K], self.MAE[K])) 41 | -------------------------------------------------------------------------------- /Item_basedCF.py: -------------------------------------------------------------------------------- 1 | #! 
python3 2 | # -*- coding: utf-8 -*- 3 | from DataHelper import * 4 | from EvaluationHelper import * 5 | 6 | 7 | class IBCollaborativeFilter(object): 8 | def __init__(self): 9 | self.SimilityMatrix = None 10 | self.ItemMeanMatrix = None 11 | self.truerating = [] 12 | self.predictions = [] 13 | self.train_data_matrix = None 14 | self.RMSE = dict() 15 | self.MAE = dict() 16 | 17 | ### 平均加权策略,预测userId对itemId的评分 18 | def getRating(self, Train_data_matrix, itemId, simility_matrix, knumber=20): 19 | neighborset = get_K_Neighbors(Train_data_matrix, simility_matrix, knumber) # 最相似的K个Item 20 | simSums = numpy.sum(simility_matrix[neighborset]) # simSums为0,即该项目尚未被其他用户评分,这里的处理方法:返回用户平均分 21 | averageOfUser = self.ItemMeanMatrix[itemId] # 获取userId 的平均值 22 | jiaquanAverage = (Train_data_matrix[neighborset] - self.ItemMeanMatrix[neighborset]).dot(simility_matrix[neighborset]) # 计算每个用户的加权,预测 23 | if simSums == 0: 24 | return averageOfUser 25 | else: 26 | return averageOfUser + jiaquanAverage / simSums 27 | 28 | def doEvaluate(self, testDataMatrix, K): 29 | a, b = testDataMatrix.nonzero() 30 | for userIndex, itemIndex in zip(a, b): 31 | prerating = self.getRating(self.train_data_matrix[userIndex], itemIndex, self.SimilityMatrix[itemIndex],K) # 基于训练集预测用户评分(用户数目<=K) 32 | self.truerating.append(testDataMatrix[userIndex][itemIndex]) 33 | self.predictions.append(prerating) 34 | # print(len(self.predictions)) 35 | self.RMSE[K] = RMSE(self.truerating, self.predictions) 36 | self.MAE[K] = MAE(self.truerating, self.predictions) 37 | print("IBCF K=%d,RMSE:%f,MAE:%f" % (K, self.RMSE[K], self.MAE[K])) 38 | 39 | 40 | -------------------------------------------------------------------------------- /bak/kNNUnitTest.py: -------------------------------------------------------------------------------- 1 | #! 
python3 2 | # -*- coding: utf-8 -*- 3 | import datetime 4 | from numpy import * 5 | from threading import Thread 6 | from ThreadWithReturn import * 7 | from sklearn.metrics.pairwise import cosine_similarity 8 | import matplotlib.pyplot as plt 9 | from Item_basedCF import * 10 | from User_basedCF import * 11 | if __name__ == '__main__': 12 | startTime = datetime.datetime.now() 13 | # MyData = LoadMovieLens1M() 14 | MyData = LoadMovieLens100k('G:\\PycharmProjects\\Memory-based-collaborative-filtering\\Datas\\ml-100k\\data\\u.data') 15 | # MyData = LoadMovieLens10M() 16 | MyUBCF = UBCollaborativeFilter() 17 | train_data, test_data = train_test_split(MyData, test_size=0.1) 18 | print(type(train_data)) 19 | print(MyData.head()) 20 | n_users = MyData.user_id.max() 21 | n_items = MyData.item_id.max() 22 | print('Number of users = ' + str(n_users) + ' | Number of movies = ' + str(n_items)) 23 | 24 | test1 = ThreadWithReturnValue(target=DataFrame2Matrix, args=(n_users, n_items, train_data)) 25 | test2 = ThreadWithReturnValue(target=DataFrame2Matrix, args=(n_users, n_items, test_data)) 26 | test1.start() 27 | test2.start() 28 | train_data_matrix = test1.join() 29 | test_data_matrix = test2.join() 30 | MyUBCF.train_data_matrix = train_data_matrix 31 | MyUBCF.test_data_matrix = test_data_matrix 32 | 33 | MyUBCF.SimilityMatrix = cosine_similarity(train_data_matrix) 34 | MyUBCF.UserMeanMatrix = numpy.true_divide(MyUBCF.train_data_matrix.sum(1), 35 | (MyUBCF.train_data_matrix != 0).sum(1)) # 按X轴方向获取非0元素均值,如果某行所有元素为0返回nan 36 | KList = [25, 50, 75, 100, 125, 150] 37 | for i in range(len(KList)): 38 | MyUBCF.Clear() 39 | 40 | medTime = datetime.datetime.now() 41 | print((medTime - startTime).seconds) 42 | t1 = Thread(target=MyUBCF.doEvaluate, args=(test_data_matrix, KList[i])) 43 | t1.start() 44 | t1.join() 45 | 46 | endTime = datetime.datetime.now() 47 | print((endTime - startTime).seconds) -------------------------------------------------------------------------------- /bak/getRating.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Name: getRating.py 3 | # Purpose: 基于已知的训练集,"测试集"中的user的item进行评分预测. 4 | # 5 | # Language: Python 3.2 6 | # Author: Python大菜鸟 7 | # E-mail: zhenboye@gmail.com 8 | # Created: 14-06-2014 9 | # -------------------------------------------------------- 10 | from math import sqrt 11 | from DataHelper import LoadMovieLensData 12 | # from loadMovieLens import loadMovieLensTrain 13 | # from loadMovieLens import loadMovieLensTest 14 | 15 | 16 | ### 计算pearson相关度 17 | def sim_pearson(prefer, person1, person2): 18 | sim = {} 19 | # 查找双方都评价过的项 20 | for item in prefer[person1]: 21 | if item in prefer[person2]: 22 | sim[item] = 1 # 将相同项添加到字典sim中 23 | # 元素个数 24 | n = len(sim) 25 | if len(sim) == 0: 26 | return -1 27 | 28 | # 所有偏好之和 29 | sum1 = sum([prefer[person1][item] for item in sim]) 30 | sum2 = sum([prefer[person2][item] for item in sim]) 31 | 32 | # 求平方和 33 | sum1Sq = sum([pow(prefer[person1][item], 2) for item in sim]) 34 | sum2Sq = sum([pow(prefer[person2][item], 2) for item in sim]) 35 | 36 | # 求乘积之和 ∑XiYi 37 | sumMulti = sum([prefer[person1][item] * prefer[person2][item] for item in sim]) 38 | 39 | num1 = sumMulti - (sum1 * sum2 / n) 40 | num2 = sqrt((sum1Sq - pow(sum1, 2) / n) * (sum2Sq - pow(sum2, 2) / n)) 41 | if num2 == 0: ### 如果分母为0,本处将返回0. 
42 | return 0 43 | 44 | result = num1 / num2 45 | return result 46 | 47 | 48 | ### 获取对item评分的K个最相似用户(K默认20) 49 | def topKMatches(prefer, person, itemId, k=20, sim=sim_pearson): 50 | userSet = [] 51 | scores = [] 52 | users = [] 53 | # 找出所有prefer中评价过Item的用户,存入userSet 54 | for user in prefer: 55 | if itemId in prefer[user]: 56 | userSet.append(user) 57 | # 计算相似性 58 | scores = [(sim(prefer, person, other), other) for other in userSet if other != person] 59 | 60 | # 按相似度排序 61 | scores.sort() 62 | scores.reverse() 63 | 64 | if len(scores) <= k: # 如果小于k,只选择这些做推荐。 65 | for item in scores: 66 | users.append(item[1]) # 提取每项的userId 67 | return users 68 | else: # 如果>k,截取k个用户 69 | kscore = scores[0:k] 70 | for item in kscore: 71 | users.append(item[1]) # 提取每项的userId 72 | return users # 返回K个最相似用户的ID 73 | 74 | 75 | ### 计算用户的平均评分 76 | def getAverage(prefer, userId): 77 | count = 0 78 | sum = 0 79 | for item in prefer[userId]: 80 | sum = sum + prefer[userId][item] 81 | count = count + 1 82 | return sum / count 83 | 84 | 85 | ### 平均加权策略,预测userId对itemId的评分 86 | def getRating(prefer1, userId, itemId, knumber=20, similarity=sim_pearson): 87 | sim = 0.0 88 | averageOther = 0.0 89 | jiaquanAverage = 0.0 90 | simSums = 0.0 91 | # 获取K近邻用户(评过分的用户集) 92 | users = topKMatches(prefer1, userId, itemId, k=knumber, sim=sim_pearson) 93 | 94 | # 获取userId 的平均值 95 | averageOfUser = getAverage(prefer1, userId) 96 | 97 | # 计算每个用户的加权,预测 98 | for other in users: 99 | sim = similarity(prefer1, userId, other) # 计算比较其他用户的相似度 100 | averageOther = getAverage(prefer1, other) # 该用户的平均分 101 | # 累加 102 | simSums += abs(sim) # 取绝对值 103 | jiaquanAverage += (prefer1[other][itemId] - averageOther) * sim # 累加,一些值为负 104 | 105 | # simSums为0,即该项目尚未被其他用户评分,这里的处理方法:返回用户平均分 106 | if simSums == 0: 107 | return averageOfUser 108 | else: 109 | return (averageOfUser + jiaquanAverage / simSums) 110 | 111 | 112 | ##================================================================== 113 | 114 | 115 | ## getAllUserRating(): 获取所有用户的预测评分,存放到fileResult中 116 | ## 117 | ## 参数:fileTrain,fileTest 是训练文件和对应的测试文件,fileResult为结果文件 118 | ## similarity是相似度度量方法,默认是皮尔森。 119 | ##================================================================== 120 | def getAllUserRating(fileTrain='u1.base', fileTest='u1.test', fileResult='result.txt', similarity=sim_pearson): 121 | prefer1 = loadMovieLensTrain(fileTrain) # 加载训练集 122 | prefer2 = loadMovieLensTest(fileTest) # 加载测试集 123 | inAllnum = 0 124 | 125 | file = open(fileResult, 'a') 126 | file.write("%s\n" % ("------------------------------------------------------")) 127 | 128 | for userid in prefer2: # test集中每个用户 129 | for item in prefer2[userid]: # 对于test集合中每一个项目用base数据集,CF预测评分 130 | rating = getRating(prefer1, userid, item, 20) # 基于训练集预测用户评分(用户数目<=K) 131 | file.write('%s\t%s\t%s\n' % (userid, item, rating)) 132 | inAllnum = inAllnum + 1 133 | file.close() 134 | print("-------------Completed!!-----------", inAllnum) 135 | 136 | 137 | ############ 主程序 ############## 138 | if __name__ == "__main__": 139 | print("\n--------------基于MovieLens的推荐系统 运行中... -----------\n") 140 | getAllUserRating('u1.base', 'u1.test', 'result.txt') 141 | -------------------------------------------------------------------------------- /RunExample.py: -------------------------------------------------------------------------------- 1 | #! 
python3 2 | # -*- coding: utf-8 -*- 3 | import argparse 4 | import datetime 5 | 6 | import matplotlib.pyplot as plt 7 | from sklearn.metrics.pairwise import cosine_similarity 8 | 9 | from Item_basedCF import * 10 | from ThreadWithReturn import * 11 | from User_basedCF import * 12 | 13 | MovieLensData = { 14 | 1: 'Datas/ml-100k/u.data', 15 | 2: 'Datas/ml-1M/ratings.dat', 16 | 3: 'Datas/ml-10M100K/ratings.dat', 17 | 4: 'Datas/ml-20m/ratings.csv' 18 | } 19 | 20 | 21 | def parseargs(): 22 | parser = argparse.ArgumentParser() 23 | 24 | parser.add_argument("--ratings", 25 | type=str, 26 | default='ml-100k', 27 | help="Ratings file") 28 | 29 | parser.add_argument("--testsize", 30 | type=float, 31 | default=0.2, 32 | help="Percentage of test data") 33 | 34 | return parser.parse_args() 35 | 36 | 37 | if __name__ == '__main__': 38 | myparser = parseargs() 39 | startTime = datetime.datetime.now() 40 | MyData = LoadMovieLensData(myparser.ratings) 41 | MyUBCF = UBCollaborativeFilter() 42 | MyIBCF = IBCollaborativeFilter() 43 | train_data, test_data = train_test_split(MyData, test_size=myparser.testsize) 44 | print(type(train_data)) 45 | print(MyData.head()) 46 | n_users = MyData.user_id.max() 47 | n_items = MyData.item_id.max() 48 | print('Number of users = ' + str(n_users) + ' | Number of movies = ' + str(n_items)) 49 | 50 | test1 = ThreadWithReturnValue(target=DataFrame2Matrix, args=(n_users, n_items, train_data)) 51 | test2 = ThreadWithReturnValue(target=DataFrame2Matrix, args=(n_users, n_items, test_data)) 52 | test1.start() 53 | test2.start() 54 | train_data_matrix = test1.join() 55 | test_data_matrix = test2.join() 56 | MyUBCF.train_data_matrix = train_data_matrix 57 | MyIBCF.train_data_matrix = train_data_matrix 58 | MyUBCF.test_data_matrix = test_data_matrix 59 | MyIBCF.test_data_matrix = test_data_matrix 60 | 61 | # 皮尔逊相关系数 62 | # MyUBCF.SimilityMatrix = np.corrcoef(train_data_matrix) 63 | # MyIBCF.SimilityMatrix = np.corrcoef(train_data_matrix.T) 64 | 65 | # # 余弦相似度 66 | MyUBCF.SimilityMatrix = cosine_similarity(train_data_matrix) 67 | # MyIBCF.SimilityMatrix = cosine_similarity(train_data_matrix.T) 68 | 69 | # 按X轴方向获取非0元素均值,如果某行所有元素为0返回nan,横着,求对应用户所有电影得平均分 70 | MyUBCF.UserMeanMatrix = np.true_divide(MyUBCF.train_data_matrix.sum(1), (MyUBCF.train_data_matrix != 0).sum(1)) 71 | # 按Y轴方向获取非0元素均值,如果某行所有元素为0返回nan,竖着,求该物品得所有用户平均分 72 | # MyIBCF.ItemMeanMatrix = numpy.true_divide(MyUBCF.train_data_matrix.sum(0), (MyUBCF.train_data_matrix != 0).sum(0)) 73 | # MyIBCF.ItemMeanMatrix[np.isnan(MyIBCF.ItemMeanMatrix)] = 0 74 | KList = [10, 20, 30, 40, 50, 60] 75 | for i in range(len(KList)): 76 | MyUBCF.truerating = [] 77 | MyUBCF.predictions = [] 78 | # MyIBCF.truerating = [] 79 | # MyIBCF.predictions = [] 80 | 81 | medTime = datetime.datetime.now() 82 | print((medTime - startTime).seconds) 83 | t1 = Thread(target=MyUBCF.doEvaluate, args=(test_data_matrix, KList[i])) 84 | # t2 = Thread(target=MyIBCF.doEvaluate, args=(test_data_matrix, KList[i])) 85 | t1.start() 86 | # t2.start() 87 | t1.join() 88 | # t2.join() 89 | 90 | endTime = datetime.datetime.now() 91 | print("Cost time:%d seconds" % (endTime - startTime).seconds) 92 | Savetxt("Docs/%s/UBCF %s %1.1f.txt" % (myparser.ratings, myparser.ratings, myparser.testsize), 93 | "UBCF K=%d\tRMSE:%f\tMAE:%f\t" % (KList[i], MyUBCF.RMSE[KList[i]], MyUBCF.MAE[KList[i]])) 94 | # Savetxt('Docs/%s/IBCF %s %1.1f.txt' % (myparser.ratings, myparser.ratings, myparser.testsize),"IBCF K=%d\tRMSE:%f\tMAE:%f\t" % (KList[i], MyIBCF.RMSE[KList[i]], MyIBCF.MAE[KList[i]])) 95 | # 
Check performance by plotting train and test errors 96 | plt.plot(KList, list(MyUBCF.RMSE.values()), marker='o', label='RMSE') 97 | plt.plot(KList, list(MyUBCF.MAE.values()), marker='v', label='MAE') 98 | plt.title('The Error of UBCF in MovieLens ' + myparser.ratings) 99 | plt.xlabel('K') 100 | plt.ylabel('value') 101 | plt.legend() 102 | plt.grid() 103 | plt.savefig('Docs/%s/UBCF %s %1.1f.png' % (myparser.ratings, myparser.ratings, myparser.testsize)) 104 | plt.show() 105 | plt.gcf().clear() 106 | # Check performance by plotting train and test errors 107 | # plt.plot(KList, list(MyIBCF.RMSE.values()), marker='o', label='RMSE') 108 | # plt.plot(KList, list(MyIBCF.MAE.values()), marker='v', label='MAE') 109 | # plt.title('The Error of IBCF in MovieLens ' + myparser.ratings) 110 | # plt.xlabel('K') 111 | # plt.ylabel('value') 112 | # plt.legend() 113 | # plt.grid() 114 | # plt.savefig('Docs/%s/IBCF %s %1.1f.png' % (myparser.ratings, myparser.ratings, myparser.testsize)) 115 | # plt.show() 116 | -------------------------------------------------------------------------------- /DataHelper.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import pandas as pd 3 | import pickle 4 | import heapq 5 | import numpy as np 6 | from sklearn.model_selection import train_test_split 7 | 8 | 9 | def SaveData2pkl(DictData, FilePath='Datas/Mydata.pkl', mode='wb'): 10 | pkl_file = open(FilePath, mode) 11 | try: 12 | pickle.dump(DictData, pkl_file, protocol=2) 13 | return True 14 | except: 15 | return False 16 | finally: 17 | pkl_file.close() 18 | 19 | 20 | def SaveData2cvs(MatrixData, FilePath='Datas/Mydata.pkl', Thisdelimiter=','): 21 | try: 22 | np.savetxt(FilePath, MatrixData, delimiter=Thisdelimiter) 23 | return True 24 | except Exception as e: 25 | print(repr(e)) 26 | return False 27 | 28 | 29 | def LoadData4pkl(FilePath='Datas/Mydata.pkl', mode='rb'): 30 | pkl_file = open(FilePath, mode) 31 | try: 32 | DataDict = pickle.load(pkl_file) 33 | return DataDict 34 | except: 35 | return None 36 | finally: 37 | pkl_file.close() 38 | 39 | 40 | def LoadData4cvs(FilePath='Datas/Mydata.pkl', Thisdelimiter=',', mode='rb'): 41 | try: 42 | my_matrix = np.loadtxt(open(FilePath, mode), delimiter=Thisdelimiter, skiprows=0) 43 | return my_matrix 44 | except: 45 | return None 46 | 47 | 48 | def LoadDoubanData(FilePath='Datas/Mydata.pkl'): 49 | LineNum = 1 50 | UserRating = dict() 51 | UserIndex = LoadData4pkl('Datas/UserIndex.pkl') 52 | ItemIndex = LoadData4pkl('Datas/ItemIndex.pkl') 53 | for line in open(FilePath, 'r', encoding='UTF-8'): 54 | LineNum += 1 55 | if len(line.rstrip('\n')) == 0: 56 | continue 57 | linelist = line.split(',') 58 | UserID = int(linelist[0]) 59 | MovieID = int(linelist[1]) 60 | Rating = float(linelist[2]) 61 | tags = str(linelist[4].rstrip('\n')).lower() 62 | UserRating.setdefault(UserIndex[UserID], {}) 63 | UserRating[UserIndex[UserID]][ItemIndex[MovieID]] = Rating 64 | print("第%d行数据:" % LineNum) 65 | # if z>30000: break 66 | return UserRating 67 | 68 | 69 | def LoadMovieLensData(FileType='ml-100k'): 70 | """ 71 | :param FileType: 72 | :return: DataFrame 73 | """ 74 | if FileType == 'ml-100k': 75 | header = ['user_id', 'item_id', 'rating', 'timestamp'] 76 | data = pd.read_table('G:\\PycharmProjects\\Memory-based-collaborative-filtering\\Datas\\ml-100k\\data\\u.data', header=None, names=header) 77 | elif FileType == 'ml-1M': 78 | header = ['user_id', 'item_id', 'rating', 'timestamp'] 79 | data = 
pd.read_table('Datas/ml-1M/ratings.dat', header=None, names=header) 80 | elif FileType == 'ml-10M': 81 | header = ['user_id', 'item_id', 'rating', 'timestamp'] 82 | data = pd.read_table('Datas/ml-10M100K/ratings.dat', sep="::", header=None, names=header, engine='python') 83 | elif FileType == 'ml-20M': 84 | data = pd.read_csv('Datas/ml-20m/ratings.csv') 85 | else: 86 | data = None 87 | return data 88 | 89 | 90 | def SpiltData(DataSet, SpiltRate=0.25): 91 | TrainData, TestData = train_test_split(DataSet, test_size=SpiltRate) 92 | return TrainData, TestData 93 | 94 | 95 | # 给定用户实例编号,和相似度矩阵,得到最相似的K个用户,对用户共同评价过的物品中找到最相似的K个对象 96 | def get_K_Neighbors(Train_data_matrix, simility_matrix, knumber=10): 97 | SIM = simility_matrix.copy() 98 | zeroset = np.where(Train_data_matrix == 0) 99 | SIM[zeroset] = 0 100 | myresult = sparse_argsort(-SIM)[0:knumber] 101 | return myresult 102 | 103 | 104 | def sparse_argsort(arr): 105 | indices = np.nonzero(arr)[0] 106 | return indices[np.argsort(arr[indices])] 107 | 108 | 109 | # write in txt Appending mode 110 | def Savetxt(FilePath, message='', mode='a'): 111 | file_object = open(FilePath, mode) 112 | file_object.write(message + '\n') 113 | file_object.close() 114 | 115 | 116 | def DataFrame2Matrix(n_users, n_items, dataframe): 117 | train_data_matrix = np.zeros((n_users, n_items)) 118 | for line in dataframe.itertuples(): 119 | train_data_matrix[line[1] - 1, line[2] - 1] = line[3] 120 | return train_data_matrix 121 | 122 | 123 | def get_N_Recommends(neighborset, userIndex, Train_data_matrix, simility_matrix, Nnumber=10): 124 | myTrain_data_matrix = Train_data_matrix.copy() 125 | if len(neighborset) != 0: 126 | # for i in neighborset: 127 | # myTrain_data_matrix[i] = myTrain_data_matrix[i] * simility_matrix[userIndex][i] 128 | myTrain_data_matrix[neighborset] = np.multiply(myTrain_data_matrix[neighborset].T, simility_matrix[userIndex][neighborset]).T 129 | watched = myTrain_data_matrix[userIndex].nonzero() 130 | myTrain_data_matrix[:, watched] = 0 131 | recommendset = myTrain_data_matrix[neighborset] 132 | teat1 = np.where(recommendset >= heapq.nlargest(Nnumber, recommendset.flatten())[-1]) 133 | return teat1[1] 134 | else: # 冷启动处理 135 | watched = myTrain_data_matrix[userIndex].nonzero() 136 | myTrain_data_matrix[:, watched] = 0 137 | teat1 = np.vstack(np.unravel_index(np.argpartition(myTrain_data_matrix.flatten(), -2)[-Nnumber:], myTrain_data_matrix.shape)).T 138 | return teat1[:, 1] 139 | -------------------------------------------------------------------------------- /bak/Test1.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2016 Ronald J. Nowling 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | http://www.apache.org/licenses/LICENSE-2.0 7 | Unless required by applicable law or agreed to in writing, software 8 | distributed under the License is distributed on an "AS IS" BASIS, 9 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | See the License for the specific language governing permissions and 11 | limitations under the License. 
12 | """ 13 | 14 | import argparse 15 | from collections import defaultdict 16 | import random 17 | from DataHelper import * 18 | from sklearn.neighbors import NearestNeighbors 19 | from sklearn.metrics import roc_auc_score 20 | 21 | import numpy as np 22 | import scipy.sparse as sp 23 | 24 | 25 | 26 | def create_training_sets(ratings, n_training, n_testing): 27 | print("Creating user movie-interaction lists") 28 | 29 | user_interactions = defaultdict(set) 30 | max_movie_id = 0 31 | for r in ratings: 32 | user_interactions[r.user_id].add(r.movie_id) 33 | max_movie_id = max(max_movie_id, r.movie_id) 34 | 35 | user_interactions = list(user_interactions.values()) 36 | sampled_indices = random.sample(range(len(user_interactions)), n_training + n_testing) 37 | 38 | users = [] 39 | movies = [] 40 | interactions = [] 41 | for new_user_id, idx in enumerate(sampled_indices[:n_training]): 42 | users.extend([new_user_id] * len(user_interactions[idx])) 43 | movies.extend(user_interactions[idx]) 44 | interactions.extend([1.] * len(user_interactions[idx])) 45 | 46 | n_movies = max_movie_id + 1 47 | training_matrix = sp.coo_matrix((interactions, (users, movies)), 48 | shape=(n_training, n_movies)).tocsr() 49 | 50 | users = [] 51 | movies = [] 52 | interactions = [] 53 | for new_user_id, idx in enumerate(sampled_indices[n_training:]): 54 | users.extend([new_user_id] * len(user_interactions[idx])) 55 | movies.extend(user_interactions[idx]) 56 | interactions.extend([1.] * len(user_interactions[idx])) 57 | 58 | n_movies = max_movie_id + 1 59 | testing_matrix = sp.coo_matrix((interactions, (users, movies)), 60 | shape=(n_testing, n_movies)).tocsr() 61 | 62 | print(training_matrix.shape, testing_matrix.shape) 63 | 64 | return training_matrix, testing_matrix 65 | 66 | 67 | def train_and_score(metric, training, testing, ks): 68 | print("Training and scoring") 69 | scores = [] 70 | knn = NearestNeighbors(metric=metric, algorithm="brute") 71 | knn.fit(training) 72 | for k in ks: 73 | print("Evaluating for", k, "neighbors") 74 | neighbor_indices = knn.kneighbors(testing, 75 | n_neighbors=k, 76 | return_distance=False) 77 | 78 | all_predicted_scores = [] 79 | all_labels = [] 80 | for user_id in range(testing.shape[0]): 81 | user_row = testing[user_id, :] 82 | 83 | interaction_indices = user_row.nonzero() 84 | interacted = set(interaction_indices) 85 | non_interacted = set(range(testing.shape[1])) - interacted 86 | 87 | n_samples = min(len(non_interacted), len(interacted)) 88 | sampled_interacted = random.sample(interacted, n_samples) 89 | sampled_non_interacted = random.sample(non_interacted, n_samples) 90 | 91 | indices = list(sampled_interacted) 92 | indices.extend(sampled_non_interacted) 93 | labels = [1] * n_samples 94 | labels.extend([0] * n_samples) 95 | 96 | neighbors = training[neighbor_indices[user_id, :], :] 97 | predicted_scores = neighbors.mean(axis=0) 98 | for idx in indices: 99 | all_predicted_scores.append(predicted_scores[0, idx]) 100 | all_labels.extend(labels) 101 | 102 | print(len(all_labels), len(all_predicted_scores)) 103 | 104 | auc = roc_auc_score(all_labels, all_predicted_scores) 105 | 106 | print("k", k, "AUC", auc) 107 | 108 | 109 | def parseargs(): 110 | parser = argparse.ArgumentParser() 111 | 112 | parser.add_argument("--ratings-fl", 113 | type=str, 114 | required=True, 115 | help="Ratings file") 116 | 117 | parser.add_argument("--training", 118 | type=int, 119 | default=10000, 120 | help="Number of training samples") 121 | 122 | parser.add_argument("--testing", 123 | type=int, 124 | 
default=1000, 125 | help="Number of testing samples") 126 | 127 | parser.add_argument("--metric", 128 | type=str, 129 | choices=["euclidean", "cosine"], 130 | default="euclidean", 131 | help="Distance metric") 132 | 133 | parser.add_argument("--ks", 134 | type=int, 135 | nargs="+", 136 | required=True, 137 | help="Number of neigbhors") 138 | 139 | return parser.parse_args() 140 | 141 | 142 | if __name__ == "__main__": 143 | parseargs() 144 | MyData = LoadMovieLens100k('G:\\PycharmProjects\\Memory-based-collaborative-filtering\\Datas\\ml-100k\\data\\u.data') 145 | print(MyData.head()) 146 | n_users = MyData.user_id.unique().shape[0] 147 | n_items = MyData.item_id.unique().shape[0] 148 | print('Number of users = ' + str(n_users) + ' | Number of movies = ' + str(n_items)) 149 | train_data, test_data = SpiltData(MyData, SpiltRate=0.25) 150 | train_data = pd.DataFrame(train_data) 151 | test_data = pd.DataFrame(test_data) 152 | 153 | # Create two user-item matrices, one for training and another for testing 154 | train_data_matrix = np.zeros((n_users, n_items)) 155 | for line in train_data.itertuples(): 156 | train_data_matrix[line[1] - 1, line[2] - 1] = line[3] 157 | 158 | test_data_matrix = np.zeros((n_users, n_items)) 159 | for line in test_data.itertuples(): 160 | test_data_matrix[line[1] - 1, line[2] - 1] = line[3] 161 | 162 | train_and_score('cosine', 163 | train_data_matrix, 164 | test_data_matrix, 165 | [25]) -------------------------------------------------------------------------------- /bak/Test.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | ''' 3 | Created on 2015-06-22 4 | @author: Lockvictor 5 | ''' 6 | import sys, random, math 7 | from operator import itemgetter 8 | 9 | random.seed(0) 10 | 11 | 12 | class UserBasedCF(): 13 | ''' TopN recommendation - UserBasedCF ''' 14 | 15 | def __init__(self): 16 | self.trainset = {} 17 | self.testset = {} 18 | 19 | self.n_sim_user = 20 20 | self.n_rec_movie = 10 21 | 22 | self.user_sim_mat = {} 23 | self.movie_popular = {} 24 | self.movie_count = 0 25 | 26 | print(sys.stderr, 'Similar user number = %d' % self.n_sim_user) 27 | print(sys.stderr, 'recommended movie number = %d' % self.n_rec_movie) 28 | 29 | @staticmethod 30 | def loadfile(filename): 31 | ''' load a file, return a generator. 
''' 32 | fp = open(filename, 'r') 33 | for i, line in enumerate(fp): 34 | yield line.strip('\t') 35 | if i % 100000 == 0: 36 | print(sys.stderr, 'loading %s(%s)' % (filename, i)) 37 | fp.close() 38 | print(sys.stderr, 'load %s succ' % filename) 39 | 40 | def generate_dataset(self, filename, pivot=0.7): 41 | ''' load rating data and split it to training set and test set ''' 42 | trainset_len = 0 43 | testset_len = 0 44 | 45 | for line in self.loadfile(filename): 46 | user, movie, rating, timestamp = line.split('\t') 47 | # split the data by pivot 48 | if (random.random() < pivot): 49 | self.trainset.setdefault(user, {}) 50 | self.trainset[user][movie] = int(rating) 51 | trainset_len += 1 52 | else: 53 | self.testset.setdefault(user, {}) 54 | self.testset[user][movie] = int(rating) 55 | testset_len += 1 56 | 57 | print(sys.stderr, 'split training set and test set succ') 58 | print(sys.stderr, 'train set = %s' % trainset_len) 59 | print(sys.stderr, 'test set = %s' % testset_len) 60 | 61 | def calc_user_sim(self): 62 | ''' calculate user similarity matrix ''' 63 | # build inverse table for item-users 64 | # key=movieID, value=list of userIDs who have seen this movie 65 | print(sys.stderr, 'building movie-users inverse table...') 66 | movie2users = dict() 67 | 68 | for user, movies in self.trainset.items(): 69 | for movie in movies: 70 | # inverse table for item-users 71 | if movie not in movie2users: 72 | movie2users[movie] = set() 73 | movie2users[movie].add(user) 74 | # count item popularity at the same time 75 | if movie not in self.movie_popular: 76 | self.movie_popular[movie] = 0 77 | self.movie_popular[movie] += 1 78 | print(sys.stderr, 'build movie-users inverse table succ') 79 | 80 | # save the total movie number, which will be used in evaluation 81 | self.movie_count = len(movie2users) 82 | print(sys.stderr, 'total movie number = %d' % self.movie_count) 83 | 84 | # count co-rated items between users 85 | usersim_mat = self.user_sim_mat 86 | print(sys.stderr, 'building user co-rated movies matrix...') 87 | 88 | for movie, users in movie2users.items(): 89 | for u in users: 90 | for v in users: 91 | if u == v: continue 92 | usersim_mat.setdefault(u, {}) 93 | usersim_mat[u].setdefault(v, 0) 94 | usersim_mat[u][v] += 1 95 | print(sys.stderr, 'build user co-rated movies matrix succ') 96 | 97 | # calculate similarity matrix 98 | print(sys.stderr, 'calculating user similarity matrix...') 99 | simfactor_count = 0 100 | PRINT_STEP = 2000000 101 | for u, related_users in usersim_mat.items(): 102 | for v, count in related_users.items(): 103 | usersim_mat[u][v] = count / math.sqrt( 104 | len(self.trainset[u]) * len(self.trainset[v])) 105 | simfactor_count += 1 106 | if simfactor_count % PRINT_STEP == 0: 107 | print(sys.stderr, 'calculating user similarity factor(%d)' % simfactor_count) 108 | 109 | print(sys.stderr, 'calculate user similarity matrix(similarity factor) succ') 110 | print(sys.stderr, 'Total similarity factor number = %d' % simfactor_count) 111 | 112 | def recommend(self, user): 113 | """ Find K similar users and recommend N movies. 
""" 114 | K = self.n_sim_user 115 | N = self.n_rec_movie 116 | rank = dict() 117 | watched_movies = self.trainset[user] 118 | 119 | # v=similar user, wuv=similarity factor 120 | for v, wuv in sorted(self.user_sim_mat[user].items(), 121 | key=itemgetter(1), reverse=True)[0:K]: 122 | for movie in self.trainset[v]: 123 | if movie in watched_movies: 124 | continue 125 | # predict the user's "interest" for each movie 126 | rank.setdefault(movie, 0) 127 | rank[movie] += wuv 128 | # return the N best movies 129 | return sorted(rank.items(), key=itemgetter(1), reverse=True)[0:N] 130 | 131 | def evaluate(self): 132 | ''' return precision, recall, coverage and popularity ''' 133 | print(sys.stderr, 'Evaluation start...') 134 | 135 | N = self.n_rec_movie 136 | # varables for precision and recall 137 | hit = 0 138 | rec_count = 0 139 | test_count = 0 140 | # varables for coverage 141 | all_rec_movies = set() 142 | # varables for popularity 143 | popular_sum = 0 144 | 145 | for i, user in enumerate(self.trainset): # enumerate 函数用于遍历序列中的元素以及它们的下标: 146 | if i % 500 == 0: 147 | print(sys.stderr, 'recommended for %d users' % i) 148 | test_movies = self.testset.get(user, {}) 149 | rec_movies = self.recommend(user) 150 | for movie, w in rec_movies: 151 | if movie in test_movies: 152 | hit += 1 153 | all_rec_movies.add(movie) 154 | popular_sum += math.log(1 + self.movie_popular[movie]) 155 | rec_count += N 156 | test_count += len(test_movies) 157 | 158 | precision = hit / (1.0 * rec_count) 159 | recall = hit / (1.0 * test_count) 160 | coverage = len(all_rec_movies) / (1.0 * self.movie_count) 161 | popularity = popular_sum / (1.0 * rec_count) 162 | 163 | print(sys.stderr, 'precision=%.4f\trecall=%.4f\tcoverage=%.4f\tpopularity=%.4f' % \ 164 | (precision, recall, coverage, popularity)) 165 | 166 | 167 | if __name__ == '__main__': 168 | ratingfile = 'G:\\PycharmProjects\\Memory-based-collaborative-filtering\\Datas\\ml-100k\\data\\u.data' 169 | usercf = UserBasedCF() 170 | usercf.generate_dataset(ratingfile) 171 | usercf.calc_user_sim() 172 | usercf.evaluate() 173 | -------------------------------------------------------------------------------- /bak/Intro to Recommender Systems_Collaborative Filtering.py: -------------------------------------------------------------------------------- 1 | from DataHelper import * 2 | import pandas as pd 3 | import numpy as np 4 | from sklearn.metrics.pairwise import cosine_similarity 5 | from sklearn.metrics import mean_squared_error 6 | from EvaluationHelper import * 7 | 8 | def get_mse(pred, actual): 9 | # Ignore nonzero terms. 10 | pred = pred[actual.nonzero()].flatten() 11 | actual = actual[actual.nonzero()].flatten() 12 | return mean_squared_error(pred, actual) 13 | def slow_similarity(ratings, kind='user'): 14 | if kind == 'user': 15 | axmax = 0 16 | axmin = 1 17 | elif kind == 'item': 18 | axmax = 1 19 | axmin = 0 20 | sim = np.zeros((ratings.shape[axmax], ratings.shape[axmax])) 21 | for u in range(ratings.shape[axmax]): 22 | for uprime in range(ratings.shape[axmax]): 23 | rui_sqrd = 0. 24 | ruprimei_sqrd = 0. 
25 | for i in range(ratings.shape[axmin]): 26 | sim[u, uprime] = ratings[u, i] * ratings[uprime, i] 27 | rui_sqrd += ratings[u, i] ** 2 28 | ruprimei_sqrd += ratings[uprime, i] ** 2 29 | sim[u, uprime] /= rui_sqrd * ruprimei_sqrd 30 | return sim 31 | 32 | def fast_similarity(ratings, kind='user', epsilon=1e-9): 33 | # epsilon -> small number for handling dived-by-zero errors 34 | if kind == 'user': 35 | sim = ratings.dot(ratings.T) + epsilon 36 | elif kind == 'item': 37 | sim = ratings.T.dot(ratings) + epsilon 38 | norms = np.array([np.sqrt(np.diagonal(sim))]) 39 | return (sim / norms / norms.T) 40 | 41 | def predict_slow_simple(ratings, similarity, kind='user'): 42 | pred = np.zeros(ratings.shape) 43 | if kind == 'user': 44 | for i in range(ratings.shape[0]): 45 | for j in range(ratings.shape[1]): 46 | pred[i, j] = similarity[i, :].dot(ratings[:, j])\ 47 | /np.sum(np.abs(similarity[i, :])) 48 | return pred 49 | elif kind == 'item': 50 | for i in range(ratings.shape[0]): 51 | for j in range(ratings.shape[1]): 52 | pred[i, j] = similarity[j, :].dot(ratings[i, :].T)\ 53 | /np.sum(np.abs(similarity[j, :])) 54 | 55 | return pred 56 | 57 | def predict_fast_simple(ratings, similarity, kind='user'): 58 | if kind == 'user': 59 | return similarity.dot(ratings) / np.array([np.abs(similarity).sum(axis=1)]).T 60 | elif kind == 'item': 61 | return ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)]) 62 | 63 | 64 | def predict_topk(ratings, similarity, kind='user', k=40): 65 | pred = np.zeros(ratings.shape) 66 | if kind == 'user': 67 | for i in range(ratings.shape[0]): 68 | top_k_users = [np.argsort(similarity[:, i])[:-k - 1:-1]] 69 | for j in range(ratings.shape[1]): 70 | pred[i, j] = similarity[i, :][top_k_users].dot(ratings[:, j][top_k_users]) 71 | pred[i, j] /= np.sum(np.abs(similarity[i, :][top_k_users])) 72 | if kind == 'item': 73 | for j in range(ratings.shape[1]): 74 | top_k_items = [np.argsort(similarity[:, j])[:-k - 1:-1]] 75 | for i in range(ratings.shape[0]): 76 | pred[i, j] = similarity[j, :][top_k_items].dot(ratings[i, :][top_k_items].T) 77 | pred[i, j] /= np.sum(np.abs(similarity[j, :][top_k_items])) 78 | 79 | return pred 80 | 81 | 82 | def predict_nobias(ratings, similarity, kind='user'): 83 | if kind == 'user': 84 | user_bias = ratings.mean(axis=1) 85 | ratings = (ratings - user_bias[:, np.newaxis]).copy() 86 | pred = similarity.dot(ratings) / np.array([np.abs(similarity).sum(axis=1)]).T 87 | pred += user_bias[:, np.newaxis] 88 | elif kind == 'item': 89 | item_bias = ratings.mean(axis=0) 90 | ratings = (ratings - item_bias[np.newaxis, :]).copy() 91 | pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)]) 92 | pred += item_bias[np.newaxis, :] 93 | 94 | return pred 95 | 96 | 97 | def predict_topk_nobias(ratings, similarity, kind='user', k=20): 98 | pred = np.zeros(ratings.shape) 99 | if kind == 'user': 100 | user_bias = ratings.mean(axis=1) 101 | ratings = (ratings - user_bias[:, np.newaxis]).copy() 102 | for i in range(ratings.shape[0]): 103 | top_k_users = [np.argsort(similarity[:, i])[:-k - 1:-1]] 104 | for j in range(ratings.shape[1]): 105 | pred[i, j] = similarity[i, :][top_k_users].dot(ratings[:, j][top_k_users]) 106 | pred[i, j] /= np.sum(np.abs(similarity[i, :][top_k_users])) 107 | pred += user_bias[:, np.newaxis] 108 | if kind == 'item': 109 | item_bias = ratings.mean(axis=0) 110 | ratings = (ratings - item_bias[np.newaxis, :]).copy() 111 | for j in range(ratings.shape[1]): 112 | top_k_items = [np.argsort(similarity[:, j])[:-k - 1:-1]] 113 | 
for i in range(ratings.shape[0]): 114 | pred[i, j] = similarity[j, :][top_k_items].dot(ratings[i, :][top_k_items].T) 115 | pred[i, j] /= np.sum(np.abs(similarity[j, :][top_k_items])) 116 | pred += item_bias[np.newaxis, :] 117 | 118 | return pred 119 | 120 | if __name__ == '__main__': 121 | MyData = LoadMovieLens100k('G:\\PycharmProjects\\Memory-based-collaborative-filtering\\Datas\\ml-100k\\data\\u.data') 122 | print(MyData.head()) 123 | n_users = MyData.user_id.unique().shape[0] 124 | n_items = MyData.item_id.unique().shape[0] 125 | print('Number of users = ' + str(n_users) + ' | Number of movies = ' + str(n_items)) 126 | train_data, test_data = SpiltData(MyData, SpiltRate=0.25) 127 | train_data = pd.DataFrame(train_data) 128 | test_data = pd.DataFrame(test_data) 129 | 130 | # Create two user-item matrices, one for training and another for testing 131 | train_data_matrix = np.zeros((n_users, n_items)) 132 | for line in train_data.itertuples(): 133 | train_data_matrix[line[1] - 1, line[2] - 1] = line[3] 134 | 135 | test_data_matrix = np.zeros((n_users, n_items)) 136 | for line in test_data.itertuples(): 137 | test_data_matrix[line[1] - 1, line[2] - 1] = line[3] 138 | 139 | ratings = np.zeros((n_users, n_items)) 140 | for row in MyData.itertuples(): 141 | ratings[row[1] - 1, row[2] - 1] = row[3] 142 | 143 | sparsity = float(len(ratings.nonzero()[0])) 144 | sparsity /= (ratings.shape[0] * ratings.shape[1]) 145 | sparsity *= 100 146 | print( 'Sparsity: {:4.2f}%'.format(sparsity)) 147 | train, test = train_test_split(ratings) 148 | 149 | user_similarity = fast_similarity(train, kind='user') 150 | item_similarity = fast_similarity(train, kind='item') 151 | print(item_similarity[:4, :4]) 152 | 153 | item_prediction = predict_fast_simple(train, item_similarity, kind='item') 154 | user_prediction = predict_fast_simple(train, user_similarity, kind='user') 155 | ''' 156 | print('User-based CF MSE: ' + str(get_mse(user_prediction, test))) 157 | print('Item-based CF RMSE: ' + str(RMSE(item_prediction, test))) 158 | pred = predict_topk(train, user_similarity, kind='user', k=40) 159 | print('Top-k User-based CF RMSE: ' + str(RMSE(pred, test))) 160 | 161 | pred = predict_topk(train, item_similarity, kind='item', k=40) 162 | print('Top-k Item-based CF RMSE: ' + str(RMSE(pred, test))) 163 | ''' 164 | user_pred = predict_topk_nobias(train, user_similarity, kind='user') 165 | print('Bias-subtracted User-based CF RMSE: ' + str(RMSE(user_pred, test))) 166 | 167 | user_pred = predict_nobias(train, user_similarity, kind='user') 168 | print('Bias-subtracted User-based CF RMSE: ' + str(RMSE(user_pred, test))) 169 | item_pred = predict_nobias(train, item_similarity, kind='item') 170 | print('Bias-subtracted Item-based CF RMSE: ' + str(RMSE(item_pred, test))) --------------------------------------------------------------------------------
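
Note on the core prediction step: User_basedCF.py and Item_basedCF.py share the same kNN idea — score an unseen (user, item) pair as a similarity-weighted average over the K nearest neighbours that actually rated it (get_K_Neighbors in DataHelper.py picks those neighbours, cosine_similarity in RunExample.py builds the similarity matrix). Below is a minimal, self-contained sketch of the user-based version; the helper name predict_ubcf, the toy 4x5 rating matrix and k=2 are illustrative assumptions for this sketch and are not part of the repository.

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def predict_ubcf(train, user, item, similarity, k=25):
    # Similarity-weighted average over the k most similar users that rated `item`,
    # mirroring getRating()/get_K_Neighbors() in User_basedCF.py and DataHelper.py.
    sims = similarity[user].copy()
    sims[train[:, item] == 0] = 0.0          # drop users who never rated this item
    raters = np.nonzero(sims)[0]             # candidates with non-zero similarity
    neighbours = raters[np.argsort(-sims[raters])][:k]
    sim_sum = sims[neighbours].sum()
    if sim_sum == 0:                         # no neighbour rated the item
        return 0.0
    return float(train[neighbours, item].dot(sims[neighbours]) / sim_sum)

# Toy 4-user x 5-item rating matrix (0 = unrated); stands in for the MovieLens data.
train = np.array([[5., 3., 0., 1., 4.],
                  [4., 0., 0., 1., 4.],
                  [1., 1., 0., 5., 4.],
                  [0., 1., 5., 4., 0.]])
sim = cosine_similarity(train)               # user-user similarity, as in RunExample.py
print(predict_ubcf(train, user=1, item=1, similarity=sim, k=2))   # ~2.15

Item_basedCF.py applies the same scheme to the transposed (item-user) view, but first subtracts each neighbour item's mean rating and adds the target item's mean back in; that is why it falls back to the item average, rather than 0, when no rated neighbour is found.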