├── requirements.txt ├── Datas ├── ml-1m │ ├── 0.1 │ │ ├── IBCF ml-1M.png │ │ ├── UBCF ml-1M.png │ │ ├── Item-basedCF.txt │ │ └── User-basedCF.txt │ └── 0.2 │ │ ├── IBCF ml-1M.png │ │ ├── UBCF ml-1M.png │ │ ├── Item-basedCF.txt │ │ └── User-basedCF.txt ├── ml-10M │ └── 0.1 │ │ ├── IBCF ml-10M.png │ │ ├── UBCF ml-10M.png │ │ ├── Item-basedCF.txt │ │ └── User-basedCF.txt └── ml-100k │ ├── 0.1 │ ├── IBCF ml-100k.png │ ├── UBCF ml-100k.png │ ├── Item-basedCF.txt │ └── User-basedCF.txt │ ├── 0.2 │ ├── IBCF ml-100k.png │ ├── UBCF ml-100k.png │ ├── Item-basedCF.txt │ └── User-basedCF.txt │ ├── IBCF ml-100k 0.1.png │ ├── IBCF ml-100k 0.2.png │ ├── UBCF ml-100k 0.1.png │ ├── UBCF ml-100k 0.2.png │ ├── Item-basedCF ml-100k 0.1.txt │ ├── Item-basedCF ml-100k 0.2.txt │ ├── User-basedCF ml-100k 0.1.txt │ └── User-basedCF ml-100k 0.2.txt ├── Docs └── ml-100k │ └── UBCF ml-100k 0.2.png ├── test_mbcf.py ├── .travis.yml ├── EvaluationHelper.py ├── ThreadWithReturn.py ├── README.md ├── DistanceHelper.py ├── .gitignore ├── User_basedCF.py ├── Item_basedCF.py ├── bak ├── kNNUnitTest.py ├── getRating.py ├── Test1.py ├── Test.py └── Intro to Recommender Systems_Collaborative Filtering.py ├── RunExample.py └── DataHelper.py /requirements.txt: -------------------------------------------------------------------------------- 1 | pandas 2 | matplotlib 3 | sklearn 4 | numpy -------------------------------------------------------------------------------- /Datas/ml-1m/0.1/IBCF ml-1M.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fuhailin/Memory-based-collaborative-filtering/HEAD/Datas/ml-1m/0.1/IBCF ml-1M.png -------------------------------------------------------------------------------- /Datas/ml-1m/0.1/UBCF ml-1M.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fuhailin/Memory-based-collaborative-filtering/HEAD/Datas/ml-1m/0.1/UBCF ml-1M.png -------------------------------------------------------------------------------- /Datas/ml-1m/0.2/IBCF ml-1M.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fuhailin/Memory-based-collaborative-filtering/HEAD/Datas/ml-1m/0.2/IBCF ml-1M.png -------------------------------------------------------------------------------- /Datas/ml-1m/0.2/UBCF ml-1M.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fuhailin/Memory-based-collaborative-filtering/HEAD/Datas/ml-1m/0.2/UBCF ml-1M.png -------------------------------------------------------------------------------- /Datas/ml-10M/0.1/IBCF ml-10M.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fuhailin/Memory-based-collaborative-filtering/HEAD/Datas/ml-10M/0.1/IBCF ml-10M.png -------------------------------------------------------------------------------- /Datas/ml-10M/0.1/UBCF ml-10M.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fuhailin/Memory-based-collaborative-filtering/HEAD/Datas/ml-10M/0.1/UBCF ml-10M.png -------------------------------------------------------------------------------- /Docs/ml-100k/UBCF ml-100k 0.2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fuhailin/Memory-based-collaborative-filtering/HEAD/Docs/ml-100k/UBCF ml-100k 
0.2.png -------------------------------------------------------------------------------- /Datas/ml-100k/0.1/IBCF ml-100k.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fuhailin/Memory-based-collaborative-filtering/HEAD/Datas/ml-100k/0.1/IBCF ml-100k.png -------------------------------------------------------------------------------- /Datas/ml-100k/0.1/UBCF ml-100k.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fuhailin/Memory-based-collaborative-filtering/HEAD/Datas/ml-100k/0.1/UBCF ml-100k.png -------------------------------------------------------------------------------- /Datas/ml-100k/0.2/IBCF ml-100k.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fuhailin/Memory-based-collaborative-filtering/HEAD/Datas/ml-100k/0.2/IBCF ml-100k.png -------------------------------------------------------------------------------- /Datas/ml-100k/0.2/UBCF ml-100k.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fuhailin/Memory-based-collaborative-filtering/HEAD/Datas/ml-100k/0.2/UBCF ml-100k.png -------------------------------------------------------------------------------- /Datas/ml-100k/IBCF ml-100k 0.1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fuhailin/Memory-based-collaborative-filtering/HEAD/Datas/ml-100k/IBCF ml-100k 0.1.png -------------------------------------------------------------------------------- /Datas/ml-100k/IBCF ml-100k 0.2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fuhailin/Memory-based-collaborative-filtering/HEAD/Datas/ml-100k/IBCF ml-100k 0.2.png -------------------------------------------------------------------------------- /Datas/ml-100k/UBCF ml-100k 0.1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fuhailin/Memory-based-collaborative-filtering/HEAD/Datas/ml-100k/UBCF ml-100k 0.1.png -------------------------------------------------------------------------------- /Datas/ml-100k/UBCF ml-100k 0.2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fuhailin/Memory-based-collaborative-filtering/HEAD/Datas/ml-100k/UBCF ml-100k 0.2.png -------------------------------------------------------------------------------- /test_mbcf.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | 3 | class TestClass: 4 | 5 | def test_one(self): 6 | x = "this" 7 | 8 | def test_two(self): 9 | x = "hello" 10 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "3.6" 4 | # command to install dependencies 5 | install: 6 | - pip install -r requirements.txt 7 | # command to run tests 8 | script: pytest 9 | notifications: 10 | email: false 11 | -------------------------------------------------------------------------------- /Datas/ml-10M/0.1/Item-basedCF.txt: -------------------------------------------------------------------------------- 1 | IBCF K=25 RMSE:0.821169 MAE:0.674318 2 | IBCF K=50 RMSE:0.826170 MAE:0.682557 3 | 
IBCF K=75 RMSE:0.830468 MAE:0.689677 4 | IBCF K=100 RMSE:0.833642 MAE:0.694958 5 | IBCF K=125 RMSE:0.836080 MAE:0.699030 6 | IBCF K=150 RMSE:0.837998 MAE:0.702240 7 | -------------------------------------------------------------------------------- /Datas/ml-1m/0.1/Item-basedCF.txt: -------------------------------------------------------------------------------- 1 | IBCF K=25 RMSE:0.864500 MAE:0.747361 2 | IBCF K=50 RMSE:0.869455 MAE:0.755952 3 | IBCF K=75 RMSE:0.874602 MAE:0.764928 4 | IBCF K=100 RMSE:0.878605 MAE:0.771946 5 | IBCF K=125 RMSE:0.881555 MAE:0.777140 6 | IBCF K=150 RMSE:0.884046 MAE:0.781537 7 | -------------------------------------------------------------------------------- /Datas/ml-1m/0.2/Item-basedCF.txt: -------------------------------------------------------------------------------- 1 | IBCF K=25 RMSE:0.868866 MAE:0.754929 2 | IBCF K=50 RMSE:0.873996 MAE:0.763868 3 | IBCF K=75 RMSE:0.878743 MAE:0.772190 4 | IBCF K=100 RMSE:0.882366 MAE:0.778570 5 | IBCF K=125 RMSE:0.885053 MAE:0.783319 6 | IBCF K=150 RMSE:0.887112 MAE:0.786967 7 | -------------------------------------------------------------------------------- /Datas/ml-100k/0.1/Item-basedCF.txt: -------------------------------------------------------------------------------- 1 | IBCF K=25 RMSE:0.911620 MAE:0.831052 2 | IBCF K=50 RMSE:0.916596 MAE:0.840149 3 | IBCF K=75 RMSE:0.921024 MAE:0.848286 4 | IBCF K=100 RMSE:0.924499 MAE:0.854699 5 | IBCF K=125 RMSE:0.926818 MAE:0.858992 6 | IBCF K=150 RMSE:0.928498 MAE:0.862109 7 | -------------------------------------------------------------------------------- /Datas/ml-100k/0.2/Item-basedCF.txt: -------------------------------------------------------------------------------- 1 | IBCF K=25 RMSE:0.919330 MAE:0.845168 2 | IBCF K=50 RMSE:0.921909 MAE:0.849915 3 | IBCF K=75 RMSE:0.924977 MAE:0.855583 4 | IBCF K=100 RMSE:0.927738 MAE:0.860699 5 | IBCF K=125 RMSE:0.929850 MAE:0.864622 6 | IBCF K=150 RMSE:0.931432 MAE:0.867565 7 | -------------------------------------------------------------------------------- /Datas/ml-1m/0.1/User-basedCF.txt: -------------------------------------------------------------------------------- 1 | UBCF K=25 RMSE:0.907885 MAE:0.824255 2 | UBCF K=50 RMSE:0.905784 MAE:0.820444 3 | UBCF K=75 RMSE:0.906797 MAE:0.822281 4 | UBCF K=100 RMSE:0.908323 MAE:0.825051 5 | UBCF K=125 RMSE:0.909714 MAE:0.827580 6 | UBCF K=150 RMSE:0.910878 MAE:0.829700 7 | -------------------------------------------------------------------------------- /Datas/ml-1m/0.2/User-basedCF.txt: -------------------------------------------------------------------------------- 1 | UBCF K=25 RMSE:0.909597 MAE:0.827367 2 | UBCF K=50 RMSE:0.907115 MAE:0.822857 3 | UBCF K=75 RMSE:0.908010 MAE:0.824482 4 | UBCF K=100 RMSE:0.909183 MAE:0.826614 5 | UBCF K=125 RMSE:0.910374 MAE:0.828782 6 | UBCF K=150 RMSE:0.911454 MAE:0.830749 7 | -------------------------------------------------------------------------------- /Datas/ml-100k/0.1/User-basedCF.txt: -------------------------------------------------------------------------------- 1 | UBCF K=25 RMSE:0.935700 MAE:0.875534 2 | UBCF K=50 RMSE:0.936977 MAE:0.877926 3 | UBCF K=75 RMSE:0.939825 MAE:0.883271 4 | UBCF K=100 RMSE:0.941714 MAE:0.886825 5 | UBCF K=125 RMSE:0.943357 MAE:0.889922 6 | UBCF K=150 RMSE:0.944576 MAE:0.892225 7 | -------------------------------------------------------------------------------- /Datas/ml-100k/0.2/User-basedCF.txt: -------------------------------------------------------------------------------- 1 | UBCF K=25 RMSE:0.940682 
MAE:0.884883 2 | UBCF K=50 RMSE:0.941559 MAE:0.886533 3 | UBCF K=75 RMSE:0.943585 MAE:0.890353 4 | UBCF K=100 RMSE:0.945687 MAE:0.894324 5 | UBCF K=125 RMSE:0.947128 MAE:0.897052 6 | UBCF K=150 RMSE:0.948167 MAE:0.899021 7 | -------------------------------------------------------------------------------- /Datas/ml-10M/0.1/User-basedCF.txt: -------------------------------------------------------------------------------- 1 | UBCF K=25 RMSE:0.845004 MAE:0.714032 2 | UBCF K=50 RMSE:0.841574 MAE:0.708246 3 | UBCF K=75 RMSE:0.841605 MAE:0.708298 4 | UBCF K=100 RMSE:0.842181 MAE:0.709269 5 | UBCF K=125 RMSE:0.843042 MAE:0.710719 6 | UBCF K=150 RMSE:0.843740 MAE:0.711898 7 | -------------------------------------------------------------------------------- /Datas/ml-100k/Item-basedCF ml-100k 0.1.txt: -------------------------------------------------------------------------------- 1 | IBCF K=25 RMSE:0.919458 MAE:0.845403 2 | IBCF K=50 RMSE:0.924853 MAE:0.855352 3 | IBCF K=75 RMSE:0.928752 MAE:0.862580 4 | IBCF K=100 RMSE:0.931144 MAE:0.867029 5 | IBCF K=125 RMSE:0.933297 MAE:0.871044 6 | IBCF K=150 RMSE:0.934274 MAE:0.872867 7 | -------------------------------------------------------------------------------- /Datas/ml-100k/Item-basedCF ml-100k 0.2.txt: -------------------------------------------------------------------------------- 1 | IBCF K=25 RMSE:0.917474 MAE:0.841758 2 | IBCF K=50 RMSE:0.921177 MAE:0.848567 3 | IBCF K=75 RMSE:0.925177 MAE:0.855952 4 | IBCF K=100 RMSE:0.928474 MAE:0.862065 5 | IBCF K=125 RMSE:0.930407 MAE:0.865657 6 | IBCF K=150 RMSE:0.931977 MAE:0.868581 7 | -------------------------------------------------------------------------------- /Datas/ml-100k/User-basedCF ml-100k 0.1.txt: -------------------------------------------------------------------------------- 1 | UBCF K=25 RMSE:0.936204 MAE:0.876478 2 | UBCF K=50 RMSE:0.936623 MAE:0.877263 3 | UBCF K=75 RMSE:0.939302 MAE:0.882289 4 | UBCF K=100 RMSE:0.941543 MAE:0.886502 5 | UBCF K=125 RMSE:0.943234 MAE:0.889690 6 | UBCF K=150 RMSE:0.944467 MAE:0.892018 7 | -------------------------------------------------------------------------------- /Datas/ml-100k/User-basedCF ml-100k 0.2.txt: -------------------------------------------------------------------------------- 1 | UBCF K=25 RMSE:0.940611 MAE:0.884748 2 | UBCF K=50 RMSE:0.941572 MAE:0.886558 3 | UBCF K=75 RMSE:0.943732 MAE:0.890631 4 | UBCF K=100 RMSE:0.945455 MAE:0.893886 5 | UBCF K=125 RMSE:0.946944 MAE:0.896703 6 | UBCF K=150 RMSE:0.947725 MAE:0.898183 7 | -------------------------------------------------------------------------------- /EvaluationHelper.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.metrics import mean_squared_error, mean_absolute_error 3 | 4 | 5 | def RMSE(true, prediction): 6 | rmse = np.sqrt(mean_squared_error(true, prediction)) 7 | return rmse 8 | 9 | 10 | def MAE(true, prediction): 11 | mae = mean_absolute_error(true, prediction) 12 | return mae 13 | -------------------------------------------------------------------------------- /ThreadWithReturn.py: -------------------------------------------------------------------------------- 1 | from threading import Thread 2 | 3 | 4 | class ThreadWithReturnValue(Thread): 5 | def __init__(self, group=None, target=None, name=None, args=(), kwargs=None, *, daemon=None): 6 | Thread.__init__(self, group, target, name, args, kwargs, daemon=daemon) 7 | 8 | self._return = None 9 | 10 | def run(self): 11 | if self._target is not None: 12 | 
self._return = self._target(*self._args, **self._kwargs) 13 | 14 | def join(self): 15 | Thread.join(self) 16 | return self._return -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Memory-based-collaborative-filtering 2 | 3 | [![Build Status](https://travis-ci.org/fuhailin/Memory-based-collaborative-filtering.svg?branch=master)](https://travis-ci.org/fuhailin/Memory-based-collaborative-filtering) 4 | 5 | Contain User-based CF([UBCF](https://github.com/fuhailin/Memory-based-collaborative-filtering/blob/master/User_basedCF.py)),Item-based CF([IBCF](https://github.com/fuhailin/Memory-based-collaborative-filtering/blob/master/Item_basedCF.py)) 6 | A robust k-nearest neighbors Recommender System use MovieLens dataset in Python 7 | 8 | ## User-based collaborative filter 9 | > *K=25   RunTime:1s 10 | RMSE:0.940611 11 | MAE:0.884748.* 12 | 13 | 14 | ![image](https://github.com/fuhailin/Memory-based-collaborative-filtering/blob/master/Docs/ml-100k/UBCF%20ml-100k%200.2.png) 15 | ![image](https://github.com/fuhailin/Memory-based-collaborative-filtering/blob/master/Docs/ml-100k/IBCF%20ml-100k%200.2.png) 16 | 17 | Memory-based algorithms are easy to implement and produce reasonable prediction quality. 18 | The drawback of memory-based CF is that it doesn’t scale to real-world scenarios and doesn’t address the well-known cold-start problem, that is when new user or new item enters the system. 19 | -------------------------------------------------------------------------------- /DistanceHelper.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import math 3 | 4 | ''' 5 | # 1) 用scikit cosine_similarity计算余弦相似度 6 | from sklearn.metrics.pairwise import cosine_similarity 7 | user_similarity=cosine_similarity(user_item_matric) 8 | 9 | # 2) 用scikit pairwise_distances计算相似度,用pairwise_distances计算的Cosine distance是1-(cosine similarity)结果 10 | from sklearn.metrics.pairwise import pairwise_distances 11 | user_similarity = pairwise_distances(user_item_matric, metric='cosine') 12 | ''' 13 | 14 | 15 | class DistanceHelper(object): 16 | # 1) given two data points, calculate the euclidean distance between them 17 | def Euclidean_distance(self, vector1, vector2): 18 | points = zip(vector1, vector2) 19 | diffs_squared_distance = [pow(a - b, 2) for (a, b) in points] 20 | return math.sqrt(sum(diffs_squared_distance)) 21 | 22 | def Cosin_distance(self, vector1, vector2): 23 | dot_product = 0.0 24 | normA = 0.0 25 | normB = 0.0 26 | for a, b in zip(vector1, vector2): 27 | dot_product += a * b 28 | normA += a ** 2 29 | normB += b ** 2 30 | if normA == 0.0 or normB == 0.0: 31 | return None 32 | else: 33 | return dot_product / ((normA * normB) ** 0.5) 34 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject 
date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | venv/ 83 | ENV/ 84 | 85 | # Spyder project settings 86 | .spyderproject 87 | 88 | # Rope project settings 89 | .ropeproject 90 | .idea/ 91 | Datas/ml-1m/ 92 | Datas/ml-10M100K/ 93 | Datas/ml-20m/ 94 | *.data 95 | -------------------------------------------------------------------------------- /User_basedCF.py: -------------------------------------------------------------------------------- 1 | from DataHelper import * 2 | from EvaluationHelper import * 3 | 4 | 5 | class UBCollaborativeFilter(object): 6 | def __init__(self): 7 | self.SimilityMatrix = None 8 | self.truerating = [] 9 | self.predictions = [] 10 | self.train_data_matrix = None 11 | self.RMSE = dict() 12 | self.MAE = dict() 13 | self.UserMeanMatrix = None 14 | 15 | # 平均加权策略,预测userId对itemId的评分 16 | def getRating(self, Train_data_matrix, userId, simility_matrix, neighborset): 17 | # simSums为0,即该项目尚未被其他用户评分,这里的处理方法:返回用户平均分 18 | simSums = np.sum(simility_matrix[neighborset]) 19 | # 获取userId 的平均值 20 | averageOfUser = self.UserMeanMatrix[userId] 21 | # 计算每个用户的加权,预测得分 22 | jiaquanAverage = (Train_data_matrix[neighborset]).dot(simility_matrix[neighborset]) 23 | if simSums == 0: 24 | return 0 25 | else: 26 | return jiaquanAverage / simSums 27 | 28 | def doEvaluate(self, testDataMatrix, K): 29 | a, b = testDataMatrix.nonzero() 30 | for userIndex, itemIndex in zip(a, b): 31 | # 用户最相似的K个用户 32 | neighborset = get_K_Neighbors(self.train_data_matrix[:, itemIndex], self.SimilityMatrix[userIndex], K) 33 | # 基于训练集预测用户评分(用户数目<=K) 34 | prerating = self.getRating(self.train_data_matrix[:, itemIndex], userIndex, self.SimilityMatrix[userIndex], neighborset) 35 | self.truerating.append(testDataMatrix[userIndex][itemIndex]) 36 | self.predictions.append(prerating) 37 | # print(len(self.predictions)) 38 | self.RMSE[K] = RMSE(self.truerating, self.predictions) 39 | self.MAE[K] = MAE(self.truerating, self.predictions) 40 | print("UBCF K=%d,RMSE:%f,MAE:%f" % (K, self.RMSE[K], self.MAE[K])) 41 | -------------------------------------------------------------------------------- /Item_basedCF.py: -------------------------------------------------------------------------------- 1 | #! 
python3 2 | # -*- coding: utf-8 -*- 3 | from DataHelper import * 4 | from EvaluationHelper import * 5 | 6 | 7 | class IBCollaborativeFilter(object): 8 | def __init__(self): 9 | self.SimilityMatrix = None 10 | self.ItemMeanMatrix = None 11 | self.truerating = [] 12 | self.predictions = [] 13 | self.train_data_matrix = None 14 | self.RMSE = dict() 15 | self.MAE = dict() 16 | 17 | ### 平均加权策略,预测userId对itemId的评分 18 | def getRating(self, Train_data_matrix, itemId, simility_matrix, knumber=20): 19 | neighborset = get_K_Neighbors(Train_data_matrix, simility_matrix, knumber) # 最相似的K个Item 20 | simSums = numpy.sum(simility_matrix[neighborset]) # simSums为0,即该项目尚未被其他用户评分,这里的处理方法:返回用户平均分 21 | averageOfUser = self.ItemMeanMatrix[itemId] # 获取userId 的平均值 22 | jiaquanAverage = (Train_data_matrix[neighborset] - self.ItemMeanMatrix[neighborset]).dot(simility_matrix[neighborset]) # 计算每个用户的加权,预测 23 | if simSums == 0: 24 | return averageOfUser 25 | else: 26 | return averageOfUser + jiaquanAverage / simSums 27 | 28 | def doEvaluate(self, testDataMatrix, K): 29 | a, b = testDataMatrix.nonzero() 30 | for userIndex, itemIndex in zip(a, b): 31 | prerating = self.getRating(self.train_data_matrix[userIndex], itemIndex, self.SimilityMatrix[itemIndex],K) # 基于训练集预测用户评分(用户数目<=K) 32 | self.truerating.append(testDataMatrix[userIndex][itemIndex]) 33 | self.predictions.append(prerating) 34 | # print(len(self.predictions)) 35 | self.RMSE[K] = RMSE(self.truerating, self.predictions) 36 | self.MAE[K] = MAE(self.truerating, self.predictions) 37 | print("IBCF K=%d,RMSE:%f,MAE:%f" % (K, self.RMSE[K], self.MAE[K])) 38 | 39 | 40 | -------------------------------------------------------------------------------- /bak/kNNUnitTest.py: -------------------------------------------------------------------------------- 1 | #! 
python3 2 | # -*- coding: utf-8 -*- 3 | import datetime 4 | from numpy import * 5 | from threading import Thread 6 | from ThreadWithReturn import * 7 | from sklearn.metrics.pairwise import cosine_similarity 8 | import matplotlib.pyplot as plt 9 | from Item_basedCF import * 10 | from User_basedCF import * 11 | if __name__ == '__main__': 12 | startTime = datetime.datetime.now() 13 | # MyData = LoadMovieLens1M() 14 | MyData = LoadMovieLens100k('G:\\PycharmProjects\\Memory-based-collaborative-filtering\\Datas\\ml-100k\\data\\u.data') 15 | # MyData = LoadMovieLens10M() 16 | MyUBCF = UBCollaborativeFilter() 17 | train_data, test_data = train_test_split(MyData, test_size=0.1) 18 | print(type(train_data)) 19 | print(MyData.head()) 20 | n_users = MyData.user_id.max() 21 | n_items = MyData.item_id.max() 22 | print('Number of users = ' + str(n_users) + ' | Number of movies = ' + str(n_items)) 23 | 24 | test1 = ThreadWithReturnValue(target=DataFrame2Matrix, args=(n_users, n_items, train_data)) 25 | test2 = ThreadWithReturnValue(target=DataFrame2Matrix, args=(n_users, n_items, test_data)) 26 | test1.start() 27 | test2.start() 28 | train_data_matrix = test1.join() 29 | test_data_matrix = test2.join() 30 | MyUBCF.train_data_matrix = train_data_matrix 31 | MyUBCF.test_data_matrix = test_data_matrix 32 | 33 | MyUBCF.SimilityMatrix = cosine_similarity(train_data_matrix) 34 | MyUBCF.UserMeanMatrix = numpy.true_divide(MyUBCF.train_data_matrix.sum(1), 35 | (MyUBCF.train_data_matrix != 0).sum(1)) # 按X轴方向获取非0元素均值,如果某行所有元素为0返回nan 36 | KList = [25, 50, 75, 100, 125, 150] 37 | for i in range(len(KList)): 38 | MyUBCF.Clear() 39 | 40 | medTime = datetime.datetime.now() 41 | print((medTime - startTime).seconds) 42 | t1 = Thread(target=MyUBCF.doEvaluate, args=(test_data_matrix, KList[i])) 43 | t1.start() 44 | t1.join() 45 | 46 | endTime = datetime.datetime.now() 47 | print((endTime - startTime).seconds) -------------------------------------------------------------------------------- /bak/getRating.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Name: getRating.py 3 | # Purpose: 基于已知的训练集,"测试集"中的user的item进行评分预测. 4 | # 5 | # Language: Python 3.2 6 | # Author: Python大菜鸟 7 | # E-mail: zhenboye@gmail.com 8 | # Created: 14-06-2014 9 | # -------------------------------------------------------- 10 | from math import sqrt 11 | from DataHelper import LoadMovieLensData 12 | # from loadMovieLens import loadMovieLensTrain 13 | # from loadMovieLens import loadMovieLensTest 14 | 15 | 16 | ### 计算pearson相关度 17 | def sim_pearson(prefer, person1, person2): 18 | sim = {} 19 | # 查找双方都评价过的项 20 | for item in prefer[person1]: 21 | if item in prefer[person2]: 22 | sim[item] = 1 # 将相同项添加到字典sim中 23 | # 元素个数 24 | n = len(sim) 25 | if len(sim) == 0: 26 | return -1 27 | 28 | # 所有偏好之和 29 | sum1 = sum([prefer[person1][item] for item in sim]) 30 | sum2 = sum([prefer[person2][item] for item in sim]) 31 | 32 | # 求平方和 33 | sum1Sq = sum([pow(prefer[person1][item], 2) for item in sim]) 34 | sum2Sq = sum([pow(prefer[person2][item], 2) for item in sim]) 35 | 36 | # 求乘积之和 ∑XiYi 37 | sumMulti = sum([prefer[person1][item] * prefer[person2][item] for item in sim]) 38 | 39 | num1 = sumMulti - (sum1 * sum2 / n) 40 | num2 = sqrt((sum1Sq - pow(sum1, 2) / n) * (sum2Sq - pow(sum2, 2) / n)) 41 | if num2 == 0: ### 如果分母为0,本处将返回0. 
42 | return 0 43 | 44 | result = num1 / num2 45 | return result 46 | 47 | 48 | ### 获取对item评分的K个最相似用户(K默认20) 49 | def topKMatches(prefer, person, itemId, k=20, sim=sim_pearson): 50 | userSet = [] 51 | scores = [] 52 | users = [] 53 | # 找出所有prefer中评价过Item的用户,存入userSet 54 | for user in prefer: 55 | if itemId in prefer[user]: 56 | userSet.append(user) 57 | # 计算相似性 58 | scores = [(sim(prefer, person, other), other) for other in userSet if other != person] 59 | 60 | # 按相似度排序 61 | scores.sort() 62 | scores.reverse() 63 | 64 | if len(scores) <= k: # 如果小于k,只选择这些做推荐。 65 | for item in scores: 66 | users.append(item[1]) # 提取每项的userId 67 | return users 68 | else: # 如果>k,截取k个用户 69 | kscore = scores[0:k] 70 | for item in kscore: 71 | users.append(item[1]) # 提取每项的userId 72 | return users # 返回K个最相似用户的ID 73 | 74 | 75 | ### 计算用户的平均评分 76 | def getAverage(prefer, userId): 77 | count = 0 78 | sum = 0 79 | for item in prefer[userId]: 80 | sum = sum + prefer[userId][item] 81 | count = count + 1 82 | return sum / count 83 | 84 | 85 | ### 平均加权策略,预测userId对itemId的评分 86 | def getRating(prefer1, userId, itemId, knumber=20, similarity=sim_pearson): 87 | sim = 0.0 88 | averageOther = 0.0 89 | jiaquanAverage = 0.0 90 | simSums = 0.0 91 | # 获取K近邻用户(评过分的用户集) 92 | users = topKMatches(prefer1, userId, itemId, k=knumber, sim=sim_pearson) 93 | 94 | # 获取userId 的平均值 95 | averageOfUser = getAverage(prefer1, userId) 96 | 97 | # 计算每个用户的加权,预测 98 | for other in users: 99 | sim = similarity(prefer1, userId, other) # 计算比较其他用户的相似度 100 | averageOther = getAverage(prefer1, other) # 该用户的平均分 101 | # 累加 102 | simSums += abs(sim) # 取绝对值 103 | jiaquanAverage += (prefer1[other][itemId] - averageOther) * sim # 累加,一些值为负 104 | 105 | # simSums为0,即该项目尚未被其他用户评分,这里的处理方法:返回用户平均分 106 | if simSums == 0: 107 | return averageOfUser 108 | else: 109 | return (averageOfUser + jiaquanAverage / simSums) 110 | 111 | 112 | ##================================================================== 113 | 114 | 115 | ## getAllUserRating(): 获取所有用户的预测评分,存放到fileResult中 116 | ## 117 | ## 参数:fileTrain,fileTest 是训练文件和对应的测试文件,fileResult为结果文件 118 | ## similarity是相似度度量方法,默认是皮尔森。 119 | ##================================================================== 120 | def getAllUserRating(fileTrain='u1.base', fileTest='u1.test', fileResult='result.txt', similarity=sim_pearson): 121 | prefer1 = loadMovieLensTrain(fileTrain) # 加载训练集 122 | prefer2 = loadMovieLensTest(fileTest) # 加载测试集 123 | inAllnum = 0 124 | 125 | file = open(fileResult, 'a') 126 | file.write("%s\n" % ("------------------------------------------------------")) 127 | 128 | for userid in prefer2: # test集中每个用户 129 | for item in prefer2[userid]: # 对于test集合中每一个项目用base数据集,CF预测评分 130 | rating = getRating(prefer1, userid, item, 20) # 基于训练集预测用户评分(用户数目<=K) 131 | file.write('%s\t%s\t%s\n' % (userid, item, rating)) 132 | inAllnum = inAllnum + 1 133 | file.close() 134 | print("-------------Completed!!-----------", inAllnum) 135 | 136 | 137 | ############ 主程序 ############## 138 | if __name__ == "__main__": 139 | print("\n--------------基于MovieLens的推荐系统 运行中... -----------\n") 140 | getAllUserRating('u1.base', 'u1.test', 'result.txt') 141 | -------------------------------------------------------------------------------- /RunExample.py: -------------------------------------------------------------------------------- 1 | #! 
python3 2 | # -*- coding: utf-8 -*- 3 | import argparse 4 | import datetime 5 | 6 | import matplotlib.pyplot as plt 7 | from sklearn.metrics.pairwise import cosine_similarity 8 | 9 | from Item_basedCF import * 10 | from ThreadWithReturn import * 11 | from User_basedCF import * 12 | 13 | MovieLensData = { 14 | 1: 'Datas/ml-100k/u.data', 15 | 2: 'Datas/ml-1M/ratings.dat', 16 | 3: 'Datas/ml-10M100K/ratings.dat', 17 | 4: 'Datas/ml-20m/ratings.csv' 18 | } 19 | 20 | 21 | def parseargs(): 22 | parser = argparse.ArgumentParser() 23 | 24 | parser.add_argument("--ratings", 25 | type=str, 26 | default='ml-100k', 27 | help="Ratings file") 28 | 29 | parser.add_argument("--testsize", 30 | type=float, 31 | default=0.2, 32 | help="Percentage of test data") 33 | 34 | return parser.parse_args() 35 | 36 | 37 | if __name__ == '__main__': 38 | myparser = parseargs() 39 | startTime = datetime.datetime.now() 40 | MyData = LoadMovieLensData(myparser.ratings) 41 | MyUBCF = UBCollaborativeFilter() 42 | MyIBCF = IBCollaborativeFilter() 43 | train_data, test_data = train_test_split(MyData, test_size=myparser.testsize) 44 | print(type(train_data)) 45 | print(MyData.head()) 46 | n_users = MyData.user_id.max() 47 | n_items = MyData.item_id.max() 48 | print('Number of users = ' + str(n_users) + ' | Number of movies = ' + str(n_items)) 49 | 50 | test1 = ThreadWithReturnValue(target=DataFrame2Matrix, args=(n_users, n_items, train_data)) 51 | test2 = ThreadWithReturnValue(target=DataFrame2Matrix, args=(n_users, n_items, test_data)) 52 | test1.start() 53 | test2.start() 54 | train_data_matrix = test1.join() 55 | test_data_matrix = test2.join() 56 | MyUBCF.train_data_matrix = train_data_matrix 57 | MyIBCF.train_data_matrix = train_data_matrix 58 | MyUBCF.test_data_matrix = test_data_matrix 59 | MyIBCF.test_data_matrix = test_data_matrix 60 | 61 | # 皮尔逊相关系数 62 | # MyUBCF.SimilityMatrix = np.corrcoef(train_data_matrix) 63 | # MyIBCF.SimilityMatrix = np.corrcoef(train_data_matrix.T) 64 | 65 | # # 余弦相似度 66 | MyUBCF.SimilityMatrix = cosine_similarity(train_data_matrix) 67 | # MyIBCF.SimilityMatrix = cosine_similarity(train_data_matrix.T) 68 | 69 | # 按X轴方向获取非0元素均值,如果某行所有元素为0返回nan,横着,求对应用户所有电影得平均分 70 | MyUBCF.UserMeanMatrix = np.true_divide(MyUBCF.train_data_matrix.sum(1), (MyUBCF.train_data_matrix != 0).sum(1)) 71 | # 按Y轴方向获取非0元素均值,如果某行所有元素为0返回nan,竖着,求该物品得所有用户平均分 72 | # MyIBCF.ItemMeanMatrix = numpy.true_divide(MyUBCF.train_data_matrix.sum(0), (MyUBCF.train_data_matrix != 0).sum(0)) 73 | # MyIBCF.ItemMeanMatrix[np.isnan(MyIBCF.ItemMeanMatrix)] = 0 74 | KList = [10, 20, 30, 40, 50, 60] 75 | for i in range(len(KList)): 76 | MyUBCF.truerating = [] 77 | MyUBCF.predictions = [] 78 | # MyIBCF.truerating = [] 79 | # MyIBCF.predictions = [] 80 | 81 | medTime = datetime.datetime.now() 82 | print((medTime - startTime).seconds) 83 | t1 = Thread(target=MyUBCF.doEvaluate, args=(test_data_matrix, KList[i])) 84 | # t2 = Thread(target=MyIBCF.doEvaluate, args=(test_data_matrix, KList[i])) 85 | t1.start() 86 | # t2.start() 87 | t1.join() 88 | # t2.join() 89 | 90 | endTime = datetime.datetime.now() 91 | print("Cost time:%d seconds" % (endTime - startTime).seconds) 92 | Savetxt("Docs/%s/UBCF %s %1.1f.txt" % (myparser.ratings, myparser.ratings, myparser.testsize), 93 | "UBCF K=%d\tRMSE:%f\tMAE:%f\t" % (KList[i], MyUBCF.RMSE[KList[i]], MyUBCF.MAE[KList[i]])) 94 | # Savetxt('Docs/%s/IBCF %s %1.1f.txt' % (myparser.ratings, myparser.ratings, myparser.testsize),"IBCF K=%d\tRMSE:%f\tMAE:%f\t" % (KList[i], MyIBCF.RMSE[KList[i]], MyIBCF.MAE[KList[i]])) 95 | # 
Check performance by plotting train and test errors 96 | plt.plot(KList, list(MyUBCF.RMSE.values()), marker='o', label='RMSE') 97 | plt.plot(KList, list(MyUBCF.MAE.values()), marker='v', label='MAE') 98 | plt.title('The Error of UBCF in MovieLens ' + myparser.ratings) 99 | plt.xlabel('K') 100 | plt.ylabel('value') 101 | plt.legend() 102 | plt.grid() 103 | plt.savefig('Docs/%s/UBCF %s %1.1f.png' % (myparser.ratings, myparser.ratings, myparser.testsize)) 104 | plt.show() 105 | plt.gcf().clear() 106 | # Check performance by plotting train and test errors 107 | # plt.plot(KList, list(MyIBCF.RMSE.values()), marker='o', label='RMSE') 108 | # plt.plot(KList, list(MyIBCF.MAE.values()), marker='v', label='MAE') 109 | # plt.title('The Error of IBCF in MovieLens ' + myparser.ratings) 110 | # plt.xlabel('K') 111 | # plt.ylabel('value') 112 | # plt.legend() 113 | # plt.grid() 114 | # plt.savefig('Docs/%s/IBCF %s %1.1f.png' % (myparser.ratings, myparser.ratings, myparser.testsize)) 115 | # plt.show() 116 | -------------------------------------------------------------------------------- /DataHelper.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import pandas as pd 3 | import pickle 4 | import heapq 5 | import numpy as np 6 | from sklearn.model_selection import train_test_split 7 | 8 | 9 | def SaveData2pkl(DictData, FilePath='Datas/Mydata.pkl', mode='wb'): 10 | pkl_file = open(FilePath, mode) 11 | try: 12 | pickle.dump(DictData, pkl_file, protocol=2) 13 | return True 14 | except: 15 | return False 16 | finally: 17 | pkl_file.close() 18 | 19 | 20 | def SaveData2cvs(MatrixData, FilePath='Datas/Mydata.pkl', Thisdelimiter=','): 21 | try: 22 | np.savetxt(FilePath, MatrixData, delimiter=Thisdelimiter) 23 | return True 24 | except Exception as e: 25 | print(repr(e)) 26 | return False 27 | 28 | 29 | def LoadData4pkl(FilePath='Datas/Mydata.pkl', mode='rb'): 30 | pkl_file = open(FilePath, mode) 31 | try: 32 | DataDict = pickle.load(pkl_file) 33 | return DataDict 34 | except: 35 | return None 36 | finally: 37 | pkl_file.close() 38 | 39 | 40 | def LoadData4cvs(FilePath='Datas/Mydata.pkl', Thisdelimiter=',', mode='rb'): 41 | try: 42 | my_matrix = np.loadtxt(open(FilePath, mode), delimiter=Thisdelimiter, skiprows=0) 43 | return my_matrix 44 | except: 45 | return None 46 | 47 | 48 | def LoadDoubanData(FilePath='Datas/Mydata.pkl'): 49 | LineNum = 1 50 | UserRating = dict() 51 | UserIndex = LoadData4pkl('Datas/UserIndex.pkl') 52 | ItemIndex = LoadData4pkl('Datas/ItemIndex.pkl') 53 | for line in open(FilePath, 'r', encoding='UTF-8'): 54 | LineNum += 1 55 | if len(line.rstrip('\n')) == 0: 56 | continue 57 | linelist = line.split(',') 58 | UserID = int(linelist[0]) 59 | MovieID = int(linelist[1]) 60 | Rating = float(linelist[2]) 61 | tags = str(linelist[4].rstrip('\n')).lower() 62 | UserRating.setdefault(UserIndex[UserID], {}) 63 | UserRating[UserIndex[UserID]][ItemIndex[MovieID]] = Rating 64 | print("第%d行数据:" % LineNum) 65 | # if z>30000: break 66 | return UserRating 67 | 68 | 69 | def LoadMovieLensData(FileType='ml-100k'): 70 | """ 71 | :param FileType: 72 | :return: DataFrame 73 | """ 74 | if FileType == 'ml-100k': 75 | header = ['user_id', 'item_id', 'rating', 'timestamp'] 76 | data = pd.read_table('G:\\PycharmProjects\\Memory-based-collaborative-filtering\\Datas\\ml-100k\\data\\u.data', header=None, names=header) 77 | elif FileType == 'ml-1M': 78 | header = ['user_id', 'item_id', 'rating', 'timestamp'] 79 | data = 
pd.read_table('Datas/ml-1M/ratings.dat', header=None, names=header) 80 | elif FileType == 'ml-10M': 81 | header = ['user_id', 'item_id', 'rating', 'timestamp'] 82 | data = pd.read_table('Datas/ml-10M100K/ratings.dat', sep="::", header=None, names=header, engine='python') 83 | elif FileType == 'ml-20M': 84 | data = pd.read_csv('Datas/ml-20m/ratings.csv') 85 | else: 86 | data = None 87 | return data 88 | 89 | 90 | def SpiltData(DataSet, SpiltRate=0.25): 91 | TrainData, TestData = train_test_split(DataSet, test_size=SpiltRate) 92 | return TrainData, TestData 93 | 94 | 95 | # 给定用户实例编号,和相似度矩阵,得到最相似的K个用户,对用户共同评价过的物品中找到最相似的K个对象 96 | def get_K_Neighbors(Train_data_matrix, simility_matrix, knumber=10): 97 | SIM = simility_matrix.copy() 98 | zeroset = np.where(Train_data_matrix == 0) 99 | SIM[zeroset] = 0 100 | myresult = sparse_argsort(-SIM)[0:knumber] 101 | return myresult 102 | 103 | 104 | def sparse_argsort(arr): 105 | indices = np.nonzero(arr)[0] 106 | return indices[np.argsort(arr[indices])] 107 | 108 | 109 | # write in txt Appending mode 110 | def Savetxt(FilePath, message='', mode='a'): 111 | file_object = open(FilePath, mode) 112 | file_object.write(message + '\n') 113 | file_object.close() 114 | 115 | 116 | def DataFrame2Matrix(n_users, n_items, dataframe): 117 | train_data_matrix = np.zeros((n_users, n_items)) 118 | for line in dataframe.itertuples(): 119 | train_data_matrix[line[1] - 1, line[2] - 1] = line[3] 120 | return train_data_matrix 121 | 122 | 123 | def get_N_Recommends(neighborset, userIndex, Train_data_matrix, simility_matrix, Nnumber=10): 124 | myTrain_data_matrix = Train_data_matrix.copy() 125 | if len(neighborset) != 0: 126 | # for i in neighborset: 127 | # myTrain_data_matrix[i] = myTrain_data_matrix[i] * simility_matrix[userIndex][i] 128 | myTrain_data_matrix[neighborset] = np.multiply(myTrain_data_matrix[neighborset].T, simility_matrix[userIndex][neighborset]).T 129 | watched = myTrain_data_matrix[userIndex].nonzero() 130 | myTrain_data_matrix[:, watched] = 0 131 | recommendset = myTrain_data_matrix[neighborset] 132 | teat1 = np.where(recommendset >= heapq.nlargest(Nnumber, recommendset.flatten())[-1]) 133 | return teat1[1] 134 | else: # 冷启动处理 135 | watched = myTrain_data_matrix[userIndex].nonzero() 136 | myTrain_data_matrix[:, watched] = 0 137 | teat1 = np.vstack(np.unravel_index(np.argpartition(myTrain_data_matrix.flatten(), -2)[-Nnumber:], myTrain_data_matrix.shape)).T 138 | return teat1[:, 1] 139 | -------------------------------------------------------------------------------- /bak/Test1.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2016 Ronald J. Nowling 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | http://www.apache.org/licenses/LICENSE-2.0 7 | Unless required by applicable law or agreed to in writing, software 8 | distributed under the License is distributed on an "AS IS" BASIS, 9 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | See the License for the specific language governing permissions and 11 | limitations under the License. 
12 | """ 13 | 14 | import argparse 15 | from collections import defaultdict 16 | import random 17 | from DataHelper import * 18 | from sklearn.neighbors import NearestNeighbors 19 | from sklearn.metrics import roc_auc_score 20 | 21 | import numpy as np 22 | import scipy.sparse as sp 23 | 24 | 25 | 26 | def create_training_sets(ratings, n_training, n_testing): 27 | print("Creating user movie-interaction lists") 28 | 29 | user_interactions = defaultdict(set) 30 | max_movie_id = 0 31 | for r in ratings: 32 | user_interactions[r.user_id].add(r.movie_id) 33 | max_movie_id = max(max_movie_id, r.movie_id) 34 | 35 | user_interactions = list(user_interactions.values()) 36 | sampled_indices = random.sample(range(len(user_interactions)), n_training + n_testing) 37 | 38 | users = [] 39 | movies = [] 40 | interactions = [] 41 | for new_user_id, idx in enumerate(sampled_indices[:n_training]): 42 | users.extend([new_user_id] * len(user_interactions[idx])) 43 | movies.extend(user_interactions[idx]) 44 | interactions.extend([1.] * len(user_interactions[idx])) 45 | 46 | n_movies = max_movie_id + 1 47 | training_matrix = sp.coo_matrix((interactions, (users, movies)), 48 | shape=(n_training, n_movies)).tocsr() 49 | 50 | users = [] 51 | movies = [] 52 | interactions = [] 53 | for new_user_id, idx in enumerate(sampled_indices[n_training:]): 54 | users.extend([new_user_id] * len(user_interactions[idx])) 55 | movies.extend(user_interactions[idx]) 56 | interactions.extend([1.] * len(user_interactions[idx])) 57 | 58 | n_movies = max_movie_id + 1 59 | testing_matrix = sp.coo_matrix((interactions, (users, movies)), 60 | shape=(n_testing, n_movies)).tocsr() 61 | 62 | print(training_matrix.shape, testing_matrix.shape) 63 | 64 | return training_matrix, testing_matrix 65 | 66 | 67 | def train_and_score(metric, training, testing, ks): 68 | print("Training and scoring") 69 | scores = [] 70 | knn = NearestNeighbors(metric=metric, algorithm="brute") 71 | knn.fit(training) 72 | for k in ks: 73 | print("Evaluating for", k, "neighbors") 74 | neighbor_indices = knn.kneighbors(testing, 75 | n_neighbors=k, 76 | return_distance=False) 77 | 78 | all_predicted_scores = [] 79 | all_labels = [] 80 | for user_id in range(testing.shape[0]): 81 | user_row = testing[user_id, :] 82 | 83 | interaction_indices = user_row.nonzero() 84 | interacted = set(interaction_indices) 85 | non_interacted = set(range(testing.shape[1])) - interacted 86 | 87 | n_samples = min(len(non_interacted), len(interacted)) 88 | sampled_interacted = random.sample(interacted, n_samples) 89 | sampled_non_interacted = random.sample(non_interacted, n_samples) 90 | 91 | indices = list(sampled_interacted) 92 | indices.extend(sampled_non_interacted) 93 | labels = [1] * n_samples 94 | labels.extend([0] * n_samples) 95 | 96 | neighbors = training[neighbor_indices[user_id, :], :] 97 | predicted_scores = neighbors.mean(axis=0) 98 | for idx in indices: 99 | all_predicted_scores.append(predicted_scores[0, idx]) 100 | all_labels.extend(labels) 101 | 102 | print(len(all_labels), len(all_predicted_scores)) 103 | 104 | auc = roc_auc_score(all_labels, all_predicted_scores) 105 | 106 | print("k", k, "AUC", auc) 107 | 108 | 109 | def parseargs(): 110 | parser = argparse.ArgumentParser() 111 | 112 | parser.add_argument("--ratings-fl", 113 | type=str, 114 | required=True, 115 | help="Ratings file") 116 | 117 | parser.add_argument("--training", 118 | type=int, 119 | default=10000, 120 | help="Number of training samples") 121 | 122 | parser.add_argument("--testing", 123 | type=int, 124 | 
default=1000, 125 | help="Number of testing samples") 126 | 127 | parser.add_argument("--metric", 128 | type=str, 129 | choices=["euclidean", "cosine"], 130 | default="euclidean", 131 | help="Distance metric") 132 | 133 | parser.add_argument("--ks", 134 | type=int, 135 | nargs="+", 136 | required=True, 137 | help="Number of neigbhors") 138 | 139 | return parser.parse_args() 140 | 141 | 142 | if __name__ == "__main__": 143 | parseargs() 144 | MyData = LoadMovieLens100k('G:\\PycharmProjects\\Memory-based-collaborative-filtering\\Datas\\ml-100k\\data\\u.data') 145 | print(MyData.head()) 146 | n_users = MyData.user_id.unique().shape[0] 147 | n_items = MyData.item_id.unique().shape[0] 148 | print('Number of users = ' + str(n_users) + ' | Number of movies = ' + str(n_items)) 149 | train_data, test_data = SpiltData(MyData, SpiltRate=0.25) 150 | train_data = pd.DataFrame(train_data) 151 | test_data = pd.DataFrame(test_data) 152 | 153 | # Create two user-item matrices, one for training and another for testing 154 | train_data_matrix = np.zeros((n_users, n_items)) 155 | for line in train_data.itertuples(): 156 | train_data_matrix[line[1] - 1, line[2] - 1] = line[3] 157 | 158 | test_data_matrix = np.zeros((n_users, n_items)) 159 | for line in test_data.itertuples(): 160 | test_data_matrix[line[1] - 1, line[2] - 1] = line[3] 161 | 162 | train_and_score('cosine', 163 | train_data_matrix, 164 | test_data_matrix, 165 | [25]) -------------------------------------------------------------------------------- /bak/Test.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | ''' 3 | Created on 2015-06-22 4 | @author: Lockvictor 5 | ''' 6 | import sys, random, math 7 | from operator import itemgetter 8 | 9 | random.seed(0) 10 | 11 | 12 | class UserBasedCF(): 13 | ''' TopN recommendation - UserBasedCF ''' 14 | 15 | def __init__(self): 16 | self.trainset = {} 17 | self.testset = {} 18 | 19 | self.n_sim_user = 20 20 | self.n_rec_movie = 10 21 | 22 | self.user_sim_mat = {} 23 | self.movie_popular = {} 24 | self.movie_count = 0 25 | 26 | print(sys.stderr, 'Similar user number = %d' % self.n_sim_user) 27 | print(sys.stderr, 'recommended movie number = %d' % self.n_rec_movie) 28 | 29 | @staticmethod 30 | def loadfile(filename): 31 | ''' load a file, return a generator. 
''' 32 | fp = open(filename, 'r') 33 | for i, line in enumerate(fp): 34 | yield line.strip('\t') 35 | if i % 100000 == 0: 36 | print(sys.stderr, 'loading %s(%s)' % (filename, i)) 37 | fp.close() 38 | print(sys.stderr, 'load %s succ' % filename) 39 | 40 | def generate_dataset(self, filename, pivot=0.7): 41 | ''' load rating data and split it to training set and test set ''' 42 | trainset_len = 0 43 | testset_len = 0 44 | 45 | for line in self.loadfile(filename): 46 | user, movie, rating, timestamp = line.split('\t') 47 | # split the data by pivot 48 | if (random.random() < pivot): 49 | self.trainset.setdefault(user, {}) 50 | self.trainset[user][movie] = int(rating) 51 | trainset_len += 1 52 | else: 53 | self.testset.setdefault(user, {}) 54 | self.testset[user][movie] = int(rating) 55 | testset_len += 1 56 | 57 | print(sys.stderr, 'split training set and test set succ') 58 | print(sys.stderr, 'train set = %s' % trainset_len) 59 | print(sys.stderr, 'test set = %s' % testset_len) 60 | 61 | def calc_user_sim(self): 62 | ''' calculate user similarity matrix ''' 63 | # build inverse table for item-users 64 | # key=movieID, value=list of userIDs who have seen this movie 65 | print(sys.stderr, 'building movie-users inverse table...') 66 | movie2users = dict() 67 | 68 | for user, movies in self.trainset.items(): 69 | for movie in movies: 70 | # inverse table for item-users 71 | if movie not in movie2users: 72 | movie2users[movie] = set() 73 | movie2users[movie].add(user) 74 | # count item popularity at the same time 75 | if movie not in self.movie_popular: 76 | self.movie_popular[movie] = 0 77 | self.movie_popular[movie] += 1 78 | print(sys.stderr, 'build movie-users inverse table succ') 79 | 80 | # save the total movie number, which will be used in evaluation 81 | self.movie_count = len(movie2users) 82 | print(sys.stderr, 'total movie number = %d' % self.movie_count) 83 | 84 | # count co-rated items between users 85 | usersim_mat = self.user_sim_mat 86 | print(sys.stderr, 'building user co-rated movies matrix...') 87 | 88 | for movie, users in movie2users.items(): 89 | for u in users: 90 | for v in users: 91 | if u == v: continue 92 | usersim_mat.setdefault(u, {}) 93 | usersim_mat[u].setdefault(v, 0) 94 | usersim_mat[u][v] += 1 95 | print(sys.stderr, 'build user co-rated movies matrix succ') 96 | 97 | # calculate similarity matrix 98 | print(sys.stderr, 'calculating user similarity matrix...') 99 | simfactor_count = 0 100 | PRINT_STEP = 2000000 101 | for u, related_users in usersim_mat.items(): 102 | for v, count in related_users.items(): 103 | usersim_mat[u][v] = count / math.sqrt( 104 | len(self.trainset[u]) * len(self.trainset[v])) 105 | simfactor_count += 1 106 | if simfactor_count % PRINT_STEP == 0: 107 | print(sys.stderr, 'calculating user similarity factor(%d)' % simfactor_count) 108 | 109 | print(sys.stderr, 'calculate user similarity matrix(similarity factor) succ') 110 | print(sys.stderr, 'Total similarity factor number = %d' % simfactor_count) 111 | 112 | def recommend(self, user): 113 | """ Find K similar users and recommend N movies. 
""" 114 | K = self.n_sim_user 115 | N = self.n_rec_movie 116 | rank = dict() 117 | watched_movies = self.trainset[user] 118 | 119 | # v=similar user, wuv=similarity factor 120 | for v, wuv in sorted(self.user_sim_mat[user].items(), 121 | key=itemgetter(1), reverse=True)[0:K]: 122 | for movie in self.trainset[v]: 123 | if movie in watched_movies: 124 | continue 125 | # predict the user's "interest" for each movie 126 | rank.setdefault(movie, 0) 127 | rank[movie] += wuv 128 | # return the N best movies 129 | return sorted(rank.items(), key=itemgetter(1), reverse=True)[0:N] 130 | 131 | def evaluate(self): 132 | ''' return precision, recall, coverage and popularity ''' 133 | print(sys.stderr, 'Evaluation start...') 134 | 135 | N = self.n_rec_movie 136 | # varables for precision and recall 137 | hit = 0 138 | rec_count = 0 139 | test_count = 0 140 | # varables for coverage 141 | all_rec_movies = set() 142 | # varables for popularity 143 | popular_sum = 0 144 | 145 | for i, user in enumerate(self.trainset): # enumerate 函数用于遍历序列中的元素以及它们的下标: 146 | if i % 500 == 0: 147 | print(sys.stderr, 'recommended for %d users' % i) 148 | test_movies = self.testset.get(user, {}) 149 | rec_movies = self.recommend(user) 150 | for movie, w in rec_movies: 151 | if movie in test_movies: 152 | hit += 1 153 | all_rec_movies.add(movie) 154 | popular_sum += math.log(1 + self.movie_popular[movie]) 155 | rec_count += N 156 | test_count += len(test_movies) 157 | 158 | precision = hit / (1.0 * rec_count) 159 | recall = hit / (1.0 * test_count) 160 | coverage = len(all_rec_movies) / (1.0 * self.movie_count) 161 | popularity = popular_sum / (1.0 * rec_count) 162 | 163 | print(sys.stderr, 'precision=%.4f\trecall=%.4f\tcoverage=%.4f\tpopularity=%.4f' % \ 164 | (precision, recall, coverage, popularity)) 165 | 166 | 167 | if __name__ == '__main__': 168 | ratingfile = 'G:\\PycharmProjects\\Memory-based-collaborative-filtering\\Datas\\ml-100k\\data\\u.data' 169 | usercf = UserBasedCF() 170 | usercf.generate_dataset(ratingfile) 171 | usercf.calc_user_sim() 172 | usercf.evaluate() 173 | -------------------------------------------------------------------------------- /bak/Intro to Recommender Systems_Collaborative Filtering.py: -------------------------------------------------------------------------------- 1 | from DataHelper import * 2 | import pandas as pd 3 | import numpy as np 4 | from sklearn.metrics.pairwise import cosine_similarity 5 | from sklearn.metrics import mean_squared_error 6 | from EvaluationHelper import * 7 | 8 | def get_mse(pred, actual): 9 | # Ignore nonzero terms. 10 | pred = pred[actual.nonzero()].flatten() 11 | actual = actual[actual.nonzero()].flatten() 12 | return mean_squared_error(pred, actual) 13 | def slow_similarity(ratings, kind='user'): 14 | if kind == 'user': 15 | axmax = 0 16 | axmin = 1 17 | elif kind == 'item': 18 | axmax = 1 19 | axmin = 0 20 | sim = np.zeros((ratings.shape[axmax], ratings.shape[axmax])) 21 | for u in range(ratings.shape[axmax]): 22 | for uprime in range(ratings.shape[axmax]): 23 | rui_sqrd = 0. 24 | ruprimei_sqrd = 0. 
25 | for i in range(ratings.shape[axmin]): 26 | sim[u, uprime] = ratings[u, i] * ratings[uprime, i] 27 | rui_sqrd += ratings[u, i] ** 2 28 | ruprimei_sqrd += ratings[uprime, i] ** 2 29 | sim[u, uprime] /= rui_sqrd * ruprimei_sqrd 30 | return sim 31 | 32 | def fast_similarity(ratings, kind='user', epsilon=1e-9): 33 | # epsilon -> small number for handling dived-by-zero errors 34 | if kind == 'user': 35 | sim = ratings.dot(ratings.T) + epsilon 36 | elif kind == 'item': 37 | sim = ratings.T.dot(ratings) + epsilon 38 | norms = np.array([np.sqrt(np.diagonal(sim))]) 39 | return (sim / norms / norms.T) 40 | 41 | def predict_slow_simple(ratings, similarity, kind='user'): 42 | pred = np.zeros(ratings.shape) 43 | if kind == 'user': 44 | for i in range(ratings.shape[0]): 45 | for j in range(ratings.shape[1]): 46 | pred[i, j] = similarity[i, :].dot(ratings[:, j])\ 47 | /np.sum(np.abs(similarity[i, :])) 48 | return pred 49 | elif kind == 'item': 50 | for i in range(ratings.shape[0]): 51 | for j in range(ratings.shape[1]): 52 | pred[i, j] = similarity[j, :].dot(ratings[i, :].T)\ 53 | /np.sum(np.abs(similarity[j, :])) 54 | 55 | return pred 56 | 57 | def predict_fast_simple(ratings, similarity, kind='user'): 58 | if kind == 'user': 59 | return similarity.dot(ratings) / np.array([np.abs(similarity).sum(axis=1)]).T 60 | elif kind == 'item': 61 | return ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)]) 62 | 63 | 64 | def predict_topk(ratings, similarity, kind='user', k=40): 65 | pred = np.zeros(ratings.shape) 66 | if kind == 'user': 67 | for i in range(ratings.shape[0]): 68 | top_k_users = [np.argsort(similarity[:, i])[:-k - 1:-1]] 69 | for j in range(ratings.shape[1]): 70 | pred[i, j] = similarity[i, :][top_k_users].dot(ratings[:, j][top_k_users]) 71 | pred[i, j] /= np.sum(np.abs(similarity[i, :][top_k_users])) 72 | if kind == 'item': 73 | for j in range(ratings.shape[1]): 74 | top_k_items = [np.argsort(similarity[:, j])[:-k - 1:-1]] 75 | for i in range(ratings.shape[0]): 76 | pred[i, j] = similarity[j, :][top_k_items].dot(ratings[i, :][top_k_items].T) 77 | pred[i, j] /= np.sum(np.abs(similarity[j, :][top_k_items])) 78 | 79 | return pred 80 | 81 | 82 | def predict_nobias(ratings, similarity, kind='user'): 83 | if kind == 'user': 84 | user_bias = ratings.mean(axis=1) 85 | ratings = (ratings - user_bias[:, np.newaxis]).copy() 86 | pred = similarity.dot(ratings) / np.array([np.abs(similarity).sum(axis=1)]).T 87 | pred += user_bias[:, np.newaxis] 88 | elif kind == 'item': 89 | item_bias = ratings.mean(axis=0) 90 | ratings = (ratings - item_bias[np.newaxis, :]).copy() 91 | pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)]) 92 | pred += item_bias[np.newaxis, :] 93 | 94 | return pred 95 | 96 | 97 | def predict_topk_nobias(ratings, similarity, kind='user', k=20): 98 | pred = np.zeros(ratings.shape) 99 | if kind == 'user': 100 | user_bias = ratings.mean(axis=1) 101 | ratings = (ratings - user_bias[:, np.newaxis]).copy() 102 | for i in range(ratings.shape[0]): 103 | top_k_users = [np.argsort(similarity[:, i])[:-k - 1:-1]] 104 | for j in range(ratings.shape[1]): 105 | pred[i, j] = similarity[i, :][top_k_users].dot(ratings[:, j][top_k_users]) 106 | pred[i, j] /= np.sum(np.abs(similarity[i, :][top_k_users])) 107 | pred += user_bias[:, np.newaxis] 108 | if kind == 'item': 109 | item_bias = ratings.mean(axis=0) 110 | ratings = (ratings - item_bias[np.newaxis, :]).copy() 111 | for j in range(ratings.shape[1]): 112 | top_k_items = [np.argsort(similarity[:, j])[:-k - 1:-1]] 113 | 
for i in range(ratings.shape[0]): 114 | pred[i, j] = similarity[j, :][top_k_items].dot(ratings[i, :][top_k_items].T) 115 | pred[i, j] /= np.sum(np.abs(similarity[j, :][top_k_items])) 116 | pred += item_bias[np.newaxis, :] 117 | 118 | return pred 119 | 120 | if __name__ == '__main__': 121 | MyData = LoadMovieLens100k('G:\\PycharmProjects\\Memory-based-collaborative-filtering\\Datas\\ml-100k\\data\\u.data') 122 | print(MyData.head()) 123 | n_users = MyData.user_id.unique().shape[0] 124 | n_items = MyData.item_id.unique().shape[0] 125 | print('Number of users = ' + str(n_users) + ' | Number of movies = ' + str(n_items)) 126 | train_data, test_data = SpiltData(MyData, SpiltRate=0.25) 127 | train_data = pd.DataFrame(train_data) 128 | test_data = pd.DataFrame(test_data) 129 | 130 | # Create two user-item matrices, one for training and another for testing 131 | train_data_matrix = np.zeros((n_users, n_items)) 132 | for line in train_data.itertuples(): 133 | train_data_matrix[line[1] - 1, line[2] - 1] = line[3] 134 | 135 | test_data_matrix = np.zeros((n_users, n_items)) 136 | for line in test_data.itertuples(): 137 | test_data_matrix[line[1] - 1, line[2] - 1] = line[3] 138 | 139 | ratings = np.zeros((n_users, n_items)) 140 | for row in MyData.itertuples(): 141 | ratings[row[1] - 1, row[2] - 1] = row[3] 142 | 143 | sparsity = float(len(ratings.nonzero()[0])) 144 | sparsity /= (ratings.shape[0] * ratings.shape[1]) 145 | sparsity *= 100 146 | print( 'Sparsity: {:4.2f}%'.format(sparsity)) 147 | train, test = train_test_split(ratings) 148 | 149 | user_similarity = fast_similarity(train, kind='user') 150 | item_similarity = fast_similarity(train, kind='item') 151 | print(item_similarity[:4, :4]) 152 | 153 | item_prediction = predict_fast_simple(train, item_similarity, kind='item') 154 | user_prediction = predict_fast_simple(train, user_similarity, kind='user') 155 | ''' 156 | print('User-based CF MSE: ' + str(get_mse(user_prediction, test))) 157 | print('Item-based CF RMSE: ' + str(RMSE(item_prediction, test))) 158 | pred = predict_topk(train, user_similarity, kind='user', k=40) 159 | print('Top-k User-based CF RMSE: ' + str(RMSE(pred, test))) 160 | 161 | pred = predict_topk(train, item_similarity, kind='item', k=40) 162 | print('Top-k Item-based CF RMSE: ' + str(RMSE(pred, test))) 163 | ''' 164 | user_pred = predict_topk_nobias(train, user_similarity, kind='user') 165 | print('Bias-subtracted User-based CF RMSE: ' + str(RMSE(user_pred, test))) 166 | 167 | user_pred = predict_nobias(train, user_similarity, kind='user') 168 | print('Bias-subtracted User-based CF RMSE: ' + str(RMSE(user_pred, test))) 169 | item_pred = predict_nobias(train, item_similarity, kind='item') 170 | print('Bias-subtracted Item-based CF RMSE: ' + str(RMSE(item_pred, test))) --------------------------------------------------------------------------------
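
Note on the core prediction step: User_basedCF.py and Item_basedCF.py share the same kNN idea — score an unseen (user, item) pair as a similarity-weighted average over the K nearest neighbours that actually rated it (get_K_Neighbors in DataHelper.py picks those neighbours, cosine_similarity in RunExample.py builds the similarity matrix). Below is a minimal, self-contained sketch of the user-based version; the helper name predict_ubcf, the toy 4x5 rating matrix and k=2 are illustrative assumptions for this sketch and are not part of the repository.

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def predict_ubcf(train, user, item, similarity, k=25):
    # Similarity-weighted average over the k most similar users that rated `item`,
    # mirroring getRating()/get_K_Neighbors() in User_basedCF.py and DataHelper.py.
    sims = similarity[user].copy()
    sims[train[:, item] == 0] = 0.0          # drop users who never rated this item
    raters = np.nonzero(sims)[0]             # candidates with non-zero similarity
    neighbours = raters[np.argsort(-sims[raters])][:k]
    sim_sum = sims[neighbours].sum()
    if sim_sum == 0:                         # no neighbour rated the item
        return 0.0
    return float(train[neighbours, item].dot(sims[neighbours]) / sim_sum)

# Toy 4-user x 5-item rating matrix (0 = unrated); stands in for the MovieLens data.
train = np.array([[5., 3., 0., 1., 4.],
                  [4., 0., 0., 1., 4.],
                  [1., 1., 0., 5., 4.],
                  [0., 1., 5., 4., 0.]])
sim = cosine_similarity(train)               # user-user similarity, as in RunExample.py
print(predict_ubcf(train, user=1, item=1, similarity=sim, k=2))   # ~2.15

Item_basedCF.py applies the same scheme to the transposed (item-user) view, but first subtracts each neighbour item's mean rating and adds the target item's mean back in; that is why it falls back to the item average, rather than 0, when no rated neighbour is found.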