├── .gitignore
├── .project
├── .pydevproject
├── README.md
└── src
    ├── recommender.py
    ├── similarity.py
    ├── tool.py
    └── validation.py
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*,cover

# Translations
*.mo
*.pot

# Django stuff:
*.log

# Sphinx documentation
docs/_build/

# PyBuilder
target/

--------------------------------------------------------------------------------
/.project:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
    <name>pyCollaborativeFiltering</name>
    <comment></comment>
    <projects>
    </projects>
    <buildSpec>
        <buildCommand>
            <name>org.python.pydev.PyDevBuilder</name>
            <arguments>
            </arguments>
        </buildCommand>
    </buildSpec>
    <natures>
        <nature>org.python.pydev.pythonNature</nature>
    </natures>
</projectDescription>
--------------------------------------------------------------------------------
/.pydevproject:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<?eclipse-pydev version="1.0"?><pydev_project>
<pydev_property name="org.python.pydev.PYTHON_PROJECT_INTERPRETER">python3</pydev_property>
<pydev_property name="org.python.pydev.PYTHON_PROJECT_VERSION">python 3.0</pydev_property>
<pydev_pathproperty name="org.python.pydev.PROJECT_SOURCE_PATH">
<path>/${PROJECT_DIR_NAME}/src</path>
</pydev_pathproperty>
</pydev_project>
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# pyCollaborativeFiltering
User-based and Item-based Collaborative Filtering algorithms written in Python

## Development environment
* Language: Python3
* IDE: Eclipse PyDev
* Prerequisite libraries: [Numpy](http://numpy.org)

## Specification of the user-based method
* If a prebuilt model is given, the recommender considers only the nearest neighbors stored in the model. Otherwise, it searches for the K nearest neighbors of each target user on the fly, using the given similarity measure and neighborhood size (K).
* For unary data, the predicted score of an item is the average similarity of the nearest neighbors who rated the item (see the sketch below).
* Neighbors whose similarity is zero or negative are excluded from the neighborhood.
* The default cosine similarity considers only co-rated items. (Other measures, such as the basic cosine similarity and the Pearson correlation coefficient, are also applicable.)

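A minimal sketch of the unary case (toy data, not from the repository):
```python
>>> from recommender import UserBased, DataType
>>> ubcf = UserBased(dataType=DataType.Unary)
>>> ubcf.loadData({"u1": {"i1": 1.0}, "u2": {"i1": 1.0, "i2": 1.0}, "u3": {"i2": 1.0}})
>>> # Only the neighbor "u2" has rated "i2", so the predicted score of "i2" for "u1"
>>> # is the mean of the single similarity s(u1, u2).
>>> ubcf.Recommendation("u1", nNeighbors=2)
['i2']
```
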
## Input data format
`UserID \t ItemID \t Rating \n`

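For example (illustrative values in the MovieLens style):
```
196	242	3
186	302	3
22	377	1
```
If the rating column is missing, `tool.loadData` treats the line as unary data and stores a rating of 1.
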
## Usage example
### User-based Recommendation
```python
>>> import tool
>>> data = tool.loadData("/home/changuk/data/MovieLens/movielens.dat")
>>> from recommender import UserBased
>>> ubcf = UserBased()
>>> ubcf.loadData(data)
>>> import similarity
>>> simMeasure = similarity.cosine_intersection
>>> for user in data.keys():
...     recommendation = ubcf.Recommendation(user, simMeasure=simMeasure, nNeighbors=30)
```
### Item-based Recommendation
```python
>>> import tool
>>> data = tool.loadData("/home/changuk/data/MovieLens/movielens.dat")
>>> from recommender import ItemBased
>>> ibcf = ItemBased()
>>> ibcf.loadData(data)
>>> model = ibcf.buildModel(nNeighbors=20)
>>> for user in data.keys():
...     recommendation = ibcf.Recommendation(user, model=model)
```
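`buildModel` can also persist the learned model with `pickle` and reload it on later runs; pass any dump path you like (the path here is only an example):
```python
>>> model = ibcf.buildModel(nNeighbors=20, pathDump="/tmp/ibcf.model")
```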
### Validation
```python
>>> import tool
>>> trainSet = tool.loadData("/home/changuk/data/MovieLens/u1.base")
>>> testSet = tool.loadData("/home/changuk/data/MovieLens/u1.test")
>>> from recommender import UserBased
>>> ubcf = UserBased()
>>> ubcf.loadData(trainSet)
>>> model = ubcf.buildModel(nNeighbors=30)
>>> import validation
>>> result = validation.evaluateRecommender(testSet, ubcf, model=model, topN=10)
>>> print(result)
{'Precision': 0.050980392156862, 'Recall': 0.009698538130460, 'Hit-rate': 0.5098039215686}
```

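### Cross-validation
`validation.py` also provides k-fold cross-validation and leave-one-out evaluation. A minimal sketch (reusing the data path from the examples above):
```python
>>> import tool
>>> import similarity
>>> data = tool.loadData("/home/changuk/data/MovieLens/movielens.dat")
>>> from recommender import ItemBased
>>> from validation import CrossValidation
>>> cv = CrossValidation()
>>> result = cv.KFold(data, ItemBased(), simMeasure=similarity.cosine, nNeighbors=20, topN=10, nFolds=5)
```
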
## TODO list
* Support binary data
* Implement similarity normalization in Item-based CF

## References
* [An Algorithmic Framework for Performing Collaborative Filtering - Herlocker, Konstan, Borchers, Riedl (SIGIR 1999)](http://files.grouplens.org/papers/algs.pdf)
* [Item-based Top-N Recommendation Algorithms - Deshpande, Karypis (TOIS 2004)](http://glaros.dtc.umn.edu/gkhome/fetch/papers/itemrsTOIS04.pdf)
* [https://en.wikipedia.org/wiki/Collaborative_filtering](https://en.wikipedia.org/wiki/Collaborative_filtering)

--------------------------------------------------------------------------------
/src/recommender.py:
--------------------------------------------------------------------------------
import abc
import pickle

import numpy as np

import similarity
import tool


class DataType:
    Unary = 1     # Like, purchase, etc.
    Binary = 2    # Like/dislike, thumb-up/thumb-down, true/false, etc.
    Explicit = 3  # User-Item-Score, etc.

class CollaborativeFiltering(abc.ABC):
    def __init__(self, dataType=DataType.Explicit):
        self.dataType = dataType
        self.prefs = None
        self.itemList = None

    @abc.abstractmethod
    def buildModel(self):
        raise NotImplementedError

    @abc.abstractmethod
    def Recommendation(self):
        raise NotImplementedError

    def getNearestNeighbors(self, target, simMeasure, nNeighbors=None):
        similarities = [(simMeasure(self.prefs[target], self.prefs[other]), other)
                        for other in self.prefs if target != other]
        similarities.sort(reverse=True)
        if nNeighbors is not None:
            similarities = similarities[0:nNeighbors]
        return similarities  # similarities = [(similarity, neighbor), ...]

    def loadExtModel(self, pathDump):
        if pathDump is None:
            return None
        print("Loading external model...")
        try:
            with open(pathDump, "rb") as file:
                model = pickle.load(file)
            print("\tDone!")
            return model
        except (OSError, pickle.UnpicklingError):
            print("\tFailed!")
            return None

    def dumpModel(self, model, pathDump):
        try:
            with open(pathDump, "wb") as file:
                pickle.dump(model, file)
        except IOError as e:
            print(e)

class UserBased(CollaborativeFiltering):
    '''
    For more details, refer to the following paper:
    An Algorithmic Framework for Performing Collaborative Filtering - Herlocker, Konstan, Borchers, Riedl (SIGIR 1999)
    '''
    def __init__(self, dataType=DataType.Explicit):
        super().__init__(dataType)
        print("User-based Collaborative Filtering")

    def loadData(self, data):
        if isinstance(data, dict):   # 'data' already holds the training preferences on users
            self.prefs = data
        elif isinstance(data, str):  # 'data' is a file path to the training data
            self.prefs = tool.loadData(data)
        self.itemList = {}
        for user in self.prefs:
            for item in self.prefs[user]:
                self.itemList[item] = None

    def buildModel(self, simMeasure=similarity.cosine_intersection, nNeighbors=None, pathDump=None):
        # The model stores the top-K similar users for each user, with their similarities.
        # Model format: {user: [(similarity, neighbor), ...], ...}
        model = self.loadExtModel(pathDump)
        if model is not None:
            return model

        print("Model builder is running...")
        model = {}
        for user in self.prefs:
            model[user] = self.getNearestNeighbors(user, simMeasure, nNeighbors)

        if pathDump is not None:
            self.dumpModel(model, pathDump)
        print("\tComplete!")
        return model

    def getPredictedRating(self, user, item, nearestNeighbors):
        # nearestNeighbors = {neighbor: similarity, ...}
        if self.dataType == DataType.Unary:
            if item in self.prefs[user]:
                return 1.0
            # The predicted score is the mean similarity of the neighbors who rated the item.
            similarities = [sim for neighbor, sim in nearestNeighbors.items() if item in self.prefs[neighbor]]
            if len(similarities) == 0:
                return 0.0
            return np.mean(similarities)
        elif self.dataType == DataType.Binary:
            # Not supported yet
            return 0.0
        elif self.dataType == DataType.Explicit:
            if item in self.prefs[user]:
                return self.prefs[user][item]
            # Mean-centered weighted average (Herlocker et al., SIGIR 1999):
            # p(u, i) = mean(u) + sum_v[sim(u, v) * (r(v, i) - mean(v))] / sum_v[|sim(u, v)|]
            meanRating = np.mean([score for score in self.prefs[user].values()])
            weightedSum = 0
            normalizingFactor = 0
            for neighbor, sim in nearestNeighbors.items():
                if item not in self.prefs[neighbor]:
                    continue
                meanRatingOfNeighbor = np.mean([r for r in self.prefs[neighbor].values()])
                weightedSum += sim * (self.prefs[neighbor][item] - meanRatingOfNeighbor)
                normalizingFactor += np.abs(sim)
            if normalizingFactor == 0:
                return 0
            return meanRating + (weightedSum / normalizingFactor)

    def Recommendation(self, user, simMeasure=similarity.cosine_intersection, nNeighbors=50, model=None, topN=None):
        if model is not None:
            '''
            If a user-user similarity model is given, the other parameters, such as the
            similarity measure and the number of nearest neighbors, are ignored:
            both were already fixed when the model was built.
            '''
            candidateItems = {}    # Items that can be recommended to the target user
            nearestNeighbors = {}  # Nearest neighbors of the target user
            for sim, neighbor in model[user]:
                if sim <= 0:  # Neighbors with zero or negative similarity are excluded
                    break
                nearestNeighbors[neighbor] = sim
                for item in self.prefs[neighbor]:
                    candidateItems[item] = None
            predictedScores = [(self.getPredictedRating(user, item, nearestNeighbors), item)
                               for item in candidateItems if item not in self.prefs[user]]
        else:
            '''
            If a model is not given, the recommendation task follows the original CF method:
            for each candidate item, find the K nearest neighbors who have rated the item,
            then predict its score from their similarities.
            '''
            predictedScores = []  # predictedScores = [(predicted_score, item), ...]
            similarities = self.getNearestNeighbors(user, simMeasure)  # [(similarity, neighbor), ...]
            for item in self.itemList:
                if item in self.prefs[user]:
                    continue
                itemRaters = {}  # Nearest neighbors who rated the item
                for sim, neighbor in similarities:
                    if sim <= 0 or len(itemRaters) == nNeighbors:
                        break
                    if item in self.prefs[neighbor]:
                        itemRaters[neighbor] = sim
                predictedScores.append((self.getPredictedRating(user, item, itemRaters), item))

        predictedScores.sort(reverse=True)
        recommendation = [item for score, item in predictedScores]
        if topN is not None:
            recommendation = recommendation[0:topN]
        return recommendation

class ItemBased(CollaborativeFiltering):
    '''
    For more details, refer to the following paper:
    Item-based Top-N Recommendation Algorithms - Deshpande, Karypis (TOIS 2004)
    '''
    def __init__(self, dataType=DataType.Explicit):
        super().__init__(dataType)
        print("Item-based Collaborative Filtering")

    def loadData(self, data):
        if isinstance(data, dict):   # 'data' already holds the training preferences on users
            self.prefsOnUser = data
        elif isinstance(data, str):  # 'data' is a file path to the training data
            self.prefsOnUser = tool.loadData(data)
        self.prefs = tool.transposePrefs(self.prefsOnUser)
        self.itemList = self.prefs.keys()

    def buildModel(self, simMeasure=similarity.cosine, nNeighbors=20, pathDump=None):
        '''
        In the original paper, the j-th column of the model matrix stores the k items
        most similar to item j. In this project, the model is a dictionary rather than a matrix.
        '''
        # The model stores the top-K similar items for each item, with their similarities.
        # Model format: {item: {neighbor: similarity, ...}, ...}
        model = self.loadExtModel(pathDump)
        if model is not None:
            return model

        print("Model builder is running...")
        model = {}
        for item in self.prefs:
            model.setdefault(item, {})
            correlations = self.getNearestNeighbors(item, simMeasure, nNeighbors)
            for correlation, neighbor in correlations:
                model[item][neighbor] = correlation

        # Normalize each item's neighbor weights so that they sum to 1
        # (the column normalization of Deshpande & Karypis).
        for item in model:
            total = sum([model[item][neighbor] for neighbor in model[item]])
            if total > 0:
                for neighbor in model[item]:
                    model[item][neighbor] /= total

        if pathDump is not None:
            self.dumpModel(model, pathDump)
        print("\tComplete!")
        return model

    def Recommendation(self, user, simMeasure=similarity.cosine, nNeighbors=20, model=None, topN=None):
        '''
        Pseudo code (ApplyModel from Deshpande & Karypis):
        ApplyModel(M, U, N):
            x <- M * U              # x_i: predicted score of item i
            for i <- 1 to m:        # Zero out items the user has already rated
                if U_i != 0:
                    x_i <- 0
            for i <- 1 to m:        # Keep only the N largest predicted scores
                if x_i not among the N largest values in x:
                    x_i <- 0
        '''
        predictedScores = []
        for candidate in self.itemList:
            if candidate in self.prefsOnUser[user]:
                continue

            if model is not None:
                correlations = model[candidate]  # {neighbor: similarity, ...}
            else:
                correlations = {neighbor: correlation for correlation, neighbor
                                in self.getNearestNeighbors(candidate, simMeasure, nNeighbors)}

            # Weighted sum over the user's rated items that appear among
            # the candidate's nearest neighbors.
            score = sum([correlations[item] * self.prefsOnUser[user][item]
                         for item in self.prefsOnUser[user] if item in correlations])
            predictedScores.append((score, candidate))
        predictedScores.sort(reverse=True)
        recommendation = [item for score, item in predictedScores]
        if topN is not None:
            recommendation = recommendation[0:topN]
        return recommendation

--------------------------------------------------------------------------------
/src/similarity.py:
--------------------------------------------------------------------------------
from math import sqrt

import numpy as np


def cosine(dataA, dataB):
    # The basic cosine similarity: the norms are computed over all rated items.
    if type(dataA) is list and type(dataB) is list:
        if len(dataA) != len(dataB):
            print("Error: the lengths of the two input lists are not the same.")
            return -1
        AB = sum([dataA[i] * dataB[i] for i in range(len(dataA))])
        normA = sqrt(sum([dataA[i] ** 2 for i in range(len(dataA))]))
        normB = sqrt(sum([dataB[i] ** 2 for i in range(len(dataB))]))
        denominator = normA * normB
        if denominator == 0:
            return 0
        return AB / denominator
    elif type(dataA) is dict and type(dataB) is dict:
        interSet = [obj for obj in dataA if obj in dataB]
        if len(interSet) == 0:
            return 0
        AB = sum([dataA[obj] * dataB[obj] for obj in interSet])
        normA = sqrt(sum([dataA[obj] ** 2 for obj in dataA]))
        normB = sqrt(sum([dataB[obj] ** 2 for obj in dataB]))
        denominator = normA * normB
        if denominator == 0:
            return 0
        return AB / denominator
    else:
        print("Error: input data type is invalid.")
        return -1

def cosine_intersection(dataA, dataB):
    # Cosine similarity computed over co-rated items only.
    if type(dataA) is list and type(dataB) is list:
        if len(dataA) != len(dataB):
            print("Error: the lengths of the two input lists are not the same.")
            return -1
        interSet = [i for i in range(len(dataA)) if dataA[i] * dataB[i] != 0]
        if len(interSet) == 0:
            return 0
        AB = sum([dataA[i] * dataB[i] for i in interSet])
        normA = sqrt(sum([dataA[i] ** 2 for i in interSet]))
        normB = sqrt(sum([dataB[i] ** 2 for i in interSet]))
        denominator = normA * normB
        if denominator == 0:
            return 0
        return AB / denominator
    elif type(dataA) is dict and type(dataB) is dict:
        interSet = [obj for obj in dataA if obj in dataB]
        if len(interSet) == 0:
            return 0
        AB = sum([dataA[obj] * dataB[obj] for obj in interSet])
        normA = sqrt(sum([dataA[obj] ** 2 for obj in interSet]))
        normB = sqrt(sum([dataB[obj] ** 2 for obj in interSet]))
        denominator = normA * normB
        if denominator == 0:
            return 0
        return AB / denominator
    else:
        print("Error: input data type is invalid.")
        return -1

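# A quick contrast between the two measures on hypothetical dictionary-type data:
# only the co-rated key "a" contributes to the dot product, but `cosine` computes the
# norms over all rated items while `cosine_intersection` uses the co-rated item alone.
#   cosine({"a": 4.0, "b": 5.0}, {"a": 5.0, "c": 3.0})               # 20 / sqrt(41 * 34) ~= 0.536
#   cosine_intersection({"a": 4.0, "b": 5.0}, {"a": 5.0, "c": 3.0})  # 20 / (4 * 5) == 1.0
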
def pearson(dataA, dataB, significanceWeighting=False):
    if type(dataA) is list and type(dataB) is list:
        if len(dataA) != len(dataB):
            print("Error: the lengths of the two input lists are not the same.")
            return -1
        length = len(dataA)
        intersection = [i for i in range(length) if dataA[i] != 0 and dataB[i] != 0]  # Indices of co-rated items
        if len(intersection) == 0:
            return 0
        meanA = np.mean([dataA[i] for i in range(length) if dataA[i] != 0])
        meanB = np.mean([dataB[i] for i in range(length) if dataB[i] != 0])
        numerator = sum([(dataA[i] - meanA) * (dataB[i] - meanB) for i in intersection])
        deviationA = sqrt(sum([(dataA[i] - meanA) ** 2 for i in intersection]))
        deviationB = sqrt(sum([(dataB[i] - meanB) ** 2 for i in intersection]))
        if (deviationA * deviationB) == 0:
            return 0
        correlation = numerator / (deviationA * deviationB)
    elif type(dataA) is dict and type(dataB) is dict:
        intersection = [obj for obj in dataA if obj in dataB]  # Co-rated items
        if len(intersection) == 0:
            return 0
        meanA = np.mean([dataA[obj] for obj in dataA.keys()])
        meanB = np.mean([dataB[obj] for obj in dataB.keys()])
        numerator = sum([(dataA[obj] - meanA) * (dataB[obj] - meanB) for obj in intersection])
        deviationA = sqrt(sum([(dataA[obj] - meanA) ** 2 for obj in intersection]))
        deviationB = sqrt(sum([(dataB[obj] - meanB) ** 2 for obj in intersection]))
        if (deviationA * deviationB) == 0:
            return 0
        correlation = numerator / (deviationA * deviationB)
    else:
        print("Error: input data type is invalid.")
        return -1

    # Correlation significance weighting: devalue correlations based on few co-rated items.
    # Reference: An Algorithmic Framework for Performing Collaborative Filtering (SIGIR 1999)
    if significanceWeighting:
        if len(intersection) < 50:
            correlation *= (len(intersection) / 50)

    return correlation

def jaccard(dataA, dataB):
    # Jaccard similarity is applicable to both list-type and dictionary-type data.
    nIntersection = sum([1 for obj in dataA if obj in dataB])
    nUnion = len(dataA) + len(dataB) - nIntersection
    if nUnion == 0:
        return -1
    return nIntersection / nUnion

--------------------------------------------------------------------------------
/src/tool.py:
--------------------------------------------------------------------------------
import os


def loadData(filePath, inv=False):
    '''
    Load data from an input file into memory as a dictionary.
    * Input file format: userID \t itemID \t rating \n
    * Output data format: {userID: {itemID: rating, ...}, ...}
    If a line has no rating column, the rating defaults to 1 (unary data).
    If `inv` is True, the roles of users and items are swapped in the output.
    '''
    data = {}
    try:
        with open(filePath) as file:
            for line in file:
                tokens = line.rstrip("\n").split("\t")

                if len(tokens) < 2:      # Skip malformed lines
                    continue
                elif len(tokens) == 2:   # UserID \t ItemID (unary data)
                    user, item = tokens[0], tokens[1]
                    rating = 1
                else:                    # UserID \t ItemID \t Rating [\t ...]
                    user, item = tokens[0], tokens[1]
                    rating = tokens[2]

                # Store data
                if not inv:
                    data.setdefault(user, {})
                    data[user][item] = float(rating)
                else:
                    data.setdefault(item, {})
                    data[item][user] = float(rating)
    except IOError as e:
        print(e)
    return data

def transposePrefs(prefs):
    '''
    Transpose the preference data by switching object and subject.
    For example, preference data on users can be transformed into preference data on items:
    {"u1": {"i1": 3.0}, "u2": {"i1": 4.0}} -> {"i1": {"u1": 3.0, "u2": 4.0}}
    '''
    transposed = {}
    for obj in prefs:
        for subj in prefs[obj]:
            transposed.setdefault(subj, {})
            transposed[subj][obj] = prefs[obj][subj]
    return transposed

def getCurrentDir(filePath):
    return os.path.dirname(os.path.abspath(filePath)) + "/"

def getFilename(filePath):
    return os.path.basename(filePath)

def getFilenameWithoutExtension(filePath):
    fullname = getFilename(filePath)
    return os.path.splitext(fullname)[0]

--------------------------------------------------------------------------------
/src/validation.py:
--------------------------------------------------------------------------------
from copy import deepcopy
from datetime import datetime


def evaluateRecommender(testSet, recommender, simMeasure=None, nNeighbors=None, model=None, topN=None):
    # Evaluation metrics
    totalPrecision = 0
    totalRecall = 0
    totalF1score = 0
    totalHit = 0

    for user in testSet:
        recommendation = recommender.Recommendation(user, simMeasure=simMeasure, nNeighbors=nNeighbors, model=model, topN=topN)
        hit = sum([1 for item in testSet[user] if item in recommendation])
        # Precision: fraction of recommended items that are hits.
        # Recall: fraction of the user's test items that are recommended.
        # When topN is None, the full ranked list is evaluated.
        nRecommendation = topN if topN is not None else len(recommendation)
        precision = hit / nRecommendation
        recall = hit / len(testSet[user])
        f1score = 0 if hit == 0 else 2 * precision * recall / (precision + recall)

        totalPrecision += precision
        totalRecall += recall
        totalF1score += f1score
        totalHit += hit

    # Average the metrics over all test users
    result = {}
    result["Precision"] = totalPrecision / len(testSet)
    result["Recall"] = totalRecall / len(testSet)
    result["F1-score"] = totalF1score / len(testSet)
    result["Hit-rate"] = totalHit / len(testSet)
    return result

class CrossValidation(object):
    def KFoldSplit(self, data, fold, nFolds):  # fold: 0~4 for 5-fold validation
        trainSet = deepcopy(data)  # data = {user: {item: rating, ...}, ...}
        testSet = {}
        for user in data:
            testSet.setdefault(user, {})
            unitLength = int(len(data[user]) / nFolds)  # data[user] = {item: rating, ...}
            lowerbound = unitLength * fold
            upperbound = unitLength * (fold + 1) if fold < nFolds - 1 else len(data[user])
            testItems = {}
            for i, item in enumerate(data[user]):
                if lowerbound <= i and i < upperbound:
                    testItems[item] = float(trainSet[user].pop(item))
            testSet[user] = testItems
        return trainSet, testSet

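    # KFoldSplit example (illustrative numbers): with 7 rated items per user and
    # nFolds == 5, unitLength == 1, so the folds test the items at indices
    # [0,1), [1,2), [2,3), [3,4), and [4,7); the last fold absorbs the remainder.
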
    def KFold(self, data, recommender, simMeasure=None, nNeighbors=None, model=None, topN=10, nFolds=5):
        start_time = datetime.now()

        # Evaluation metrics
        totalPrecision = 0
        totalRecall = 0
        totalF1score = 0
        totalHitrate = 0

        for fold in range(nFolds):
            trainSet, testSet = self.KFoldSplit(data, fold, nFolds)
            recommender.loadData(trainSet)
            evaluation = evaluateRecommender(testSet, recommender, simMeasure=simMeasure, nNeighbors=nNeighbors, model=model, topN=topN)

            totalPrecision += evaluation["Precision"]
            totalRecall += evaluation["Recall"]
            totalF1score += evaluation["F1-score"]
            totalHitrate += evaluation["Hit-rate"]

            del trainSet
            del testSet

        # Average the metrics over all folds
        result = {}
        result["Precision"] = totalPrecision / nFolds
        result["Recall"] = totalRecall / nFolds
        result["F1-score"] = totalF1score / nFolds
        result["Hit-rate"] = totalHitrate / nFolds

        print("Execution time: {}".format(datetime.now() - start_time))
        return result

    def LeaveKOutSplit(self, data, user, items):  # `user` must have rating scores on `items` in `data`
        trainSet = deepcopy(data)  # To prevent the original input data from being modified
        testSet = {}
        testSet.setdefault(user, {})
        for item in items:
            testSet[user][item] = float(trainSet[user].pop(item))
        return trainSet, testSet

    def LeaveOneOut(self, data, recommender, simMeasure=None, nNeighbors=None, model=None, topN=10):
        start_time = datetime.now()

        # Evaluation metrics
        totalPrecision = 0
        totalRecall = 0
        totalF1score = 0
        totalHitrate = 0

        nTrials = 0
        for user in data:
            for item in data[user]:
                trainSet, testSet = self.LeaveKOutSplit(data, user, [item])
                recommender.loadData(trainSet)
                evaluation = evaluateRecommender(testSet, recommender, simMeasure=simMeasure, nNeighbors=nNeighbors, model=model, topN=topN)

                totalPrecision += evaluation["Precision"]
                totalRecall += evaluation["Recall"]
                totalF1score += evaluation["F1-score"]
                totalHitrate += evaluation["Hit-rate"]
                nTrials += 1

                del trainSet
                del testSet

        # Average the metrics over all trials
        result = {}
        result["Precision"] = totalPrecision / nTrials
        result["Recall"] = totalRecall / nTrials
        result["F1-score"] = totalF1score / nTrials
        result["Hit-rate"] = totalHitrate / nTrials

        print("Execution time: {}".format(datetime.now() - start_time))
        return result

--------------------------------------------------------------------------------