├── .gitignore
├── BPR
└── BPR_Sampling.py
├── Base
├── Cython
│   ├── compileCython.py
│   ├── cosine_similarity.c
│   ├── cosine_similarity.cpython-36m-x86_64-linux-gnu.so
│   ├── cosine_similarity.html
│   └── cosine_similarity.pyx
├── Recommender.py
├── Recommender_utils.py
├── Recommender_utils_Test.py
├── Similarity_Matrix_Recommender.py
├── cosine_similarity.py
├── cosine_similarity_test.py
└── metrics.py
├── KNN
├── item_knn_CBF.py
├── item_knn_CF.py
├── item_knn_custom_Similarity.py
└── user_knn_CF.py
├── MatrixFactorization
├── Cython
│   ├── MF_BPR_Cython.py
│   ├── MF_BPR_Cython_Epoch.c
│   ├── MF_BPR_Cython_Epoch.cpython-36m-x86_64-linux-gnu.so
│   ├── MF_BPR_Cython_Epoch.html
│   ├── MF_BPR_Cython_Epoch.pyx
│   ├── MF_RMSE.c
│   ├── MF_RMSE.cpython-36m-x86_64-linux-gnu.so
│   ├── MF_RMSE.html
│   ├── MF_RMSE.pyx
│   ├── build
│   │   └── temp.linux-x86_64-3.6
│   │   │   └── MF_BPR_Cython_Epoch.o
│   └── compileCython.py
└── MatrixFactorization_RMSE.py
├── README.md
├── SLIM_BPR
├── Cython
│   ├── SLIM_BPR_Cython.py
│   ├── SLIM_BPR_Cython_Epoch.c
│   ├── SLIM_BPR_Cython_Epoch.cpython-36m-x86_64-linux-gnu.so
│   ├── SLIM_BPR_Cython_Epoch.html
│   ├── SLIM_BPR_Cython_Epoch.pyx
│   ├── build
│   │   ├── lib.linux-x86_64-3.6
│   │   │   └── SLIM_BPR_Cython_Epoch.cpython-36m-x86_64-linux-gnu.so
│   │   ├── temp.linux-x86_64-3.6
│   │   │   ├── SLIM_BPR_Cython_Epoch.o
│   │   │   ├── Sparse_Matrix_CSR.o
│   │   │   └── Sparse_Matrix_Tree_CSR.o
│   │   └── temp.win-amd64-3.6
│   │   │   └── Release
│   │   │   ├── SLIM_BPR_Cython_Epoch.cp36-win_amd64.def
│   │   │   ├── Sparse_Matrix_CSR.cp36-win_amd64.def
│   │   │   ├── Sparse_Matrix_Tree_CSR.cp36-win_amd64.def
│   │   │   ├── slim_bpr_cython_epoch.o
│   │   │   ├── sparse_matrix_csr.o
│   │   │   └── sparse_matrix_tree_csr.o
│   └── compileCython.py
└── SLIM_BPR_Python.py
├── SLIM_RMSE
└── SLIM_RMSE.py
├── all_algorithms.py
├── data
├── Movielens10MReader.py
├── URM_test.npz
├── URM_train.npz
├── URM_validation.npz
└── movielens_10m.zip
├── run_SLIM_BPR.py
└── slides
├── 20161219_BPR.pptx
├── 2017_MF.pdf
├── 2017_MF.pptx
├── Amazon AWS.pdf
├── FM.pdf
├── FunkSVD - 2006.pdf
├── hu_koren_volinsky.pdf
├── koren_sdv++.pdf
└── rendle_bpr.pdf
/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # Distribution / packaging 7 | .Python 8 | env/ 9 | build/ 10 | develop-eggs/ 11 | dist/ 12 | downloads/ 13 | eggs/ 14 | .eggs/ 15 | lib/ 16 | lib64/ 17 | parts/ 18 | sdist/ 19 | var/ 20 | wheels/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | 25 | # PyInstaller 26 | # Usually these files are written by a python script from a template 27 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
28 | *.manifest 29 | *.spec 30 | 31 | # Installer logs 32 | pip-log.txt 33 | pip-delete-this-directory.txt 34 | 35 | # Unit test / coverage reports 36 | htmlcov/ 37 | .tox/ 38 | .coverage 39 | .coverage.* 40 | .cache 41 | nosetests.xml 42 | coverage.xml 43 | *.cover 44 | .hypothesis/ 45 | 46 | # Translations 47 | *.mo 48 | *.pot 49 | 50 | # Django stuff: 51 | *.log 52 | local_settings.py 53 | 54 | # Flask stuff: 55 | instance/ 56 | .webassets-cache 57 | 58 | # Scrapy stuff: 59 | .scrapy 60 | 61 | # Sphinx documentation 62 | docs/_build/ 63 | 64 | # PyBuilder 65 | target/ 66 | 67 | # Jupyter Notebook 68 | .ipynb_checkpoints 69 | 70 | # pyenv 71 | .python-version 72 | 73 | # celery beat schedule file 74 | celerybeat-schedule 75 | 76 | # SageMath parsed files 77 | *.sage.py 78 | 79 | # dotenv 80 | .env 81 | 82 | # virtualenv 83 | .venv 84 | venv/ 85 | ENV/ 86 | 87 | # Spyder project settings 88 | .spyderproject 89 | .spyproject 90 | 91 | # Rope project settings 92 | .ropeproject 93 | 94 | # mkdocs documentation 95 | /site 96 | 97 | # mypy 98 | .mypy_cache/ 99 | -------------------------------------------------------------------------------- /BPR/BPR_Sampling.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on 5/09/2017 5 | 6 | @author: Maurizio Ferrari Dacrema 7 | """ 8 | 9 | import numpy as np 10 | 11 | 12 | class BPR_Sampling(object): 13 | 14 | def __init__(self): 15 | super(BPR_Sampling, self).__init__() 16 | 17 | 18 | def sampleUser(self): 19 | """ 20 | Sample a user that has viewed at least one and not all items 21 | :return: user_id 22 | """ 23 | while (True): 24 | 25 | user_id = np.random.randint(0, self.n_users) 26 | numSeenItems = self.URM_train[user_id].nnz 27 | 28 | if (numSeenItems > 0 and numSeenItems < self.n_items): 29 | return user_id 30 | 31 | 32 | def sampleItemPair(self, user_id): 33 | """ 34 | Returns for the given user a random seen item and a random not seen item 35 | :param user_id: 36 | :return: pos_item_id, neg_item_id 37 | """ 38 | 39 | userSeenItems = self.URM_train[user_id].indices 40 | 41 | pos_item_id = userSeenItems[np.random.randint(0, len(userSeenItems))] 42 | 43 | while (True): 44 | 45 | neg_item_id = np.random.randint(0, self.n_items) 46 | 47 | if (neg_item_id not in userSeenItems): 48 | return pos_item_id, neg_item_id 49 | 50 | 51 | def sampleTriple(self): 52 | """ 53 | Randomly samples a user and then samples randomly a seen and not seen item 54 | :return: user_id, pos_item_id, neg_item_id 55 | """ 56 | 57 | user_id = self.sampleUser() 58 | pos_item_id, neg_item_id = self.sampleItemPair(user_id) 59 | 60 | return user_id, pos_item_id, neg_item_id 61 | 62 | 63 | def initializeFastSampling(self, positive_threshold=3): 64 | print("Initializing fast sampling") 65 | 66 | self.eligibleUsers = [] 67 | self.userSeenItems = dict() 68 | 69 | # Select only positive interactions 70 | URM_train_positive = self.URM_train.multiply(self.URM_train>positive_threshold) 71 | 72 | for user_id in range(self.n_users): 73 | 74 | if (URM_train_positive[user_id].nnz > 0): 75 | self.eligibleUsers.append(user_id) 76 | self.userSeenItems[user_id] = URM_train_positive[user_id].indices 77 | 78 | self.eligibleUsers = np.array(self.eligibleUsers) 79 | 80 | 81 | def sampleBatch(self): 82 | user_id_list = np.random.choice(self.eligibleUsers, size=(self.batch_size)) 83 | pos_item_id_list = [None]*self.batch_size 84 | neg_item_id_list = [None]*self.batch_size 85 | 86 | for 
sample_index in range(self.batch_size): 87 | user_id = user_id_list[sample_index] 88 | 89 | pos_item_id_list[sample_index] = np.random.choice(self.userSeenItems[user_id]) 90 | 91 | negItemSelected = False 92 | 93 | # It's faster to just try again than to build a mapping of the non-seen items 94 | # for every user 95 | while (not negItemSelected): 96 | neg_item_id = np.random.randint(0, self.n_items) 97 | 98 | if (neg_item_id not in self.userSeenItems[user_id]): 99 | negItemSelected = True 100 | neg_item_id_list[sample_index] = neg_item_id 101 | 102 | return user_id_list, pos_item_id_list, neg_item_id_list 103 | 104 | -------------------------------------------------------------------------------- /Base/Cython/compileCython.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on 16/07/2017 5 | 6 | @author: Maurizio Ferrari Dacrema 7 | """ 8 | 9 | 10 | try: 11 | from setuptools import setup 12 | from setuptools import Extension 13 | except ImportError: 14 | from distutils.core import setup 15 | from distutils.extension import Extension 16 | 17 | 18 | from Cython.Distutils import build_ext 19 | import numpy 20 | import sys 21 | import re 22 | 23 | 24 | if len(sys.argv) != 4: 25 | raise ValueError("Wrong number of parameters received. Expected 4, got {}".format(sys.argv)) 26 | 27 | 28 | #fileToCompile = 'FW_SIMILARITY_RMSE_Cython_Epoch.pyx' 29 | 30 | # Get the name of the file to compile 31 | fileToCompile = sys.argv[1] 32 | # Remove the argument from sys argv in order for it to contain only what setup needs 33 | del sys.argv[1] 34 | 35 | extensionName = re.sub(r"\.pyx", "", fileToCompile) 36 | 37 | 38 | ext_modules = Extension(extensionName, 39 | [fileToCompile], 40 | extra_compile_args=['-O3'], 41 | include_dirs=[numpy.get_include(),], 42 | ) 43 | 44 | setup( 45 | cmdclass={'build_ext': build_ext}, 46 | ext_modules=[ext_modules] 47 | ) 48 | -------------------------------------------------------------------------------- /Base/Cython/cosine_similarity.cpython-36m-x86_64-linux-gnu.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MaurizioFD/RecSys_Course_2017/dfef3655df8ce32428458aafabfc94ed688231f6/Base/Cython/cosine_similarity.cpython-36m-x86_64-linux-gnu.so -------------------------------------------------------------------------------- /Base/Cython/cosine_similarity.pyx: -------------------------------------------------------------------------------- 1 | """ 2 | Created on 23/10/17 3 | 4 | @author: Maurizio Ferrari Dacrema 5 | """ 6 | 7 | #cython: boundscheck=False 8 | #cython: wraparound=True 9 | #cython: initializedcheck=False 10 | #cython: language_level=3 11 | #cython: nonecheck=False 12 | #cython: cdivision=True 13 | #cython: unpack_method_calls=True 14 | #cython: overflowcheck=False 15 | 16 | 17 | import time, sys 18 | 19 | import numpy as np 20 | cimport numpy as np 21 | from cpython.array cimport array, clone 22 | 23 | 24 | 25 | import scipy.sparse as sps 26 | from Base.Recommender_utils import check_matrix 27 | 28 | 29 | cdef class Cosine_Similarity: 30 | 31 | cdef int TopK 32 | cdef long n_items, n_users 33 | 34 | cdef int[:] user_to_item_row_ptr, user_to_item_cols 35 | cdef int[:] item_to_user_rows, item_to_user_col_ptr 36 | cdef double[:] user_to_item_data, item_to_user_data 37 | cdef double[:] sumOfSquared 38 | cdef int shrink, normalize, adjusted_cosine, pearson_correlation, tanimoto_coefficient 39 | 40 
| cdef double[:,:] W_dense 41 | 42 | def __init__(self, URM, topK = 100, shrink=0, normalize = True, 43 | mode = "cosine"): 44 | """ 45 | Computes the cosine similarity on the columns of dataMatrix 46 | If it is computed on URM=|users|x|items|, pass the URM as is. 47 | If it is computed on ICM=|items|x|features|, pass the ICM transposed. 48 | :param dataMatrix: 49 | :param topK: 50 | :param shrink: 51 | :param normalize: 52 | :param mode: "cosine" computes Cosine similarity 53 | "adjusted" computes Adjusted Cosine, removing the average of the users 54 | "pearson" computes Pearson Correlation, removing the average of the items 55 | "jaccard" computes Jaccard similarity for binary interactions using Tanimoto 56 | "tanimoto" computes Tanimoto coefficient for binary interactions 57 | 58 | """ 59 | 60 | super(Cosine_Similarity, self).__init__() 61 | 62 | self.n_items = URM.shape[1] 63 | self.n_users = URM.shape[0] 64 | self.shrink = shrink 65 | self.normalize = normalize 66 | 67 | self.adjusted_cosine = False 68 | self.pearson_correlation = False 69 | self.tanimoto_coefficient = False 70 | 71 | if mode == "adjusted": 72 | self.adjusted_cosine = True 73 | elif mode == "pearson": 74 | self.pearson_correlation = True 75 | elif mode == "jaccard" or mode == "tanimoto": 76 | self.tanimoto_coefficient = True 77 | # Tanimoto has a specific kind of normalization 78 | self.normalize = False 79 | 80 | elif mode == "cosine": 81 | pass 82 | else: 83 | raise ValueError("Cosine_Similarity: value for paramether 'mode' not recognized." 84 | " Allowed values are: 'cosine', 'pearson', 'adjusted', 'jaccard', 'tanimoto'." 85 | " Passed value was '{}'".format(mode)) 86 | 87 | 88 | self.TopK = min(topK, self.n_items) 89 | 90 | # Copy data to avoid altering the original object 91 | URM = URM.copy() 92 | 93 | if self.adjusted_cosine: 94 | URM = self.applyAdjustedCosine(URM) 95 | elif self.pearson_correlation: 96 | URM = self.applyPearsonCorrelation(URM) 97 | elif self.tanimoto_coefficient: 98 | URM = self.useOnlyBooleanInteractions(URM) 99 | 100 | 101 | URM = check_matrix(URM, 'csr') 102 | 103 | self.user_to_item_row_ptr = URM.indptr 104 | self.user_to_item_cols = URM.indices 105 | self.user_to_item_data = np.array(URM.data, dtype=np.float64) 106 | 107 | URM = check_matrix(URM, 'csc') 108 | self.item_to_user_rows = URM.indices 109 | self.item_to_user_col_ptr = URM.indptr 110 | self.item_to_user_data = np.array(URM.data, dtype=np.float64) 111 | 112 | # Compute sum of squared values to be used in normalization 113 | self.sumOfSquared = np.array(URM.power(2).sum(axis=0), dtype=np.float64).ravel() 114 | 115 | # Tanimoto does not require the square root to be applied 116 | if not self.tanimoto_coefficient: 117 | self.sumOfSquared = np.sqrt(self.sumOfSquared) 118 | 119 | 120 | if self.TopK == 0: 121 | self.W_dense = np.zeros((self.n_items,self.n_items)) 122 | 123 | 124 | cdef useOnlyBooleanInteractions(self, URM): 125 | """ 126 | Set to 1 all data points 127 | :return: 128 | """ 129 | 130 | cdef long index 131 | 132 | for index in range(len(URM.data)): 133 | URM.data[index] = 1 134 | 135 | return URM 136 | 137 | 138 | 139 | cdef applyPearsonCorrelation(self, URM): 140 | """ 141 | Remove from every data point the average for the corresponding column 142 | :return: 143 | """ 144 | 145 | cdef double[:] sumPerCol 146 | cdef int[:] interactionsPerCol 147 | cdef long colIndex, innerIndex, start_pos, end_pos 148 | cdef double colAverage 149 | 150 | 151 | URM = check_matrix(URM, 'csc') 152 | 153 | 154 | sumPerCol = 
np.array(URM.sum(axis=0), dtype=np.float64).ravel() 155 | interactionsPerCol = np.diff(URM.indptr) 156 | 157 | 158 | #Remove for every row the corresponding average 159 | for colIndex in range(self.n_items): 160 | 161 | if interactionsPerCol[colIndex]>0: 162 | 163 | colAverage = sumPerCol[colIndex] / interactionsPerCol[colIndex] 164 | 165 | start_pos = URM.indptr[colIndex] 166 | end_pos = URM.indptr[colIndex+1] 167 | 168 | innerIndex = start_pos 169 | 170 | while innerIndex < end_pos: 171 | 172 | URM.data[innerIndex] -= colAverage 173 | innerIndex+=1 174 | 175 | 176 | return URM 177 | 178 | 179 | 180 | cdef applyAdjustedCosine(self, URM): 181 | """ 182 | Remove from every data point the average for the corresponding row 183 | :return: 184 | """ 185 | 186 | cdef double[:] sumPerRow 187 | cdef int[:] interactionsPerRow 188 | cdef long rowIndex, innerIndex, start_pos, end_pos 189 | cdef double rowAverage 190 | 191 | URM = check_matrix(URM, 'csr') 192 | 193 | sumPerRow = np.array(URM.sum(axis=1), dtype=np.float64).ravel() 194 | interactionsPerRow = np.diff(URM.indptr) 195 | 196 | 197 | #Remove for every row the corresponding average 198 | for rowIndex in range(self.n_users): 199 | 200 | if interactionsPerRow[rowIndex]>0: 201 | 202 | rowAverage = sumPerRow[rowIndex] / interactionsPerRow[rowIndex] 203 | 204 | start_pos = URM.indptr[rowIndex] 205 | end_pos = URM.indptr[rowIndex+1] 206 | 207 | innerIndex = start_pos 208 | 209 | while innerIndex < end_pos: 210 | 211 | URM.data[innerIndex] -= rowAverage 212 | innerIndex+=1 213 | 214 | 215 | return URM 216 | 217 | 218 | 219 | 220 | 221 | cdef int[:] getUsersThatRatedItem(self, long item_id): 222 | return self.item_to_user_rows[self.item_to_user_col_ptr[item_id]:self.item_to_user_col_ptr[item_id+1]] 223 | 224 | cdef int[:] getItemsRatedByUser(self, long user_id): 225 | return self.user_to_item_cols[self.user_to_item_row_ptr[user_id]:self.user_to_item_row_ptr[user_id+1]] 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | cdef double[:] computeItemSimilarities(self, long item_id_input): 234 | """ 235 | For every item the cosine similarity against other items depends on whether they have users in common. The more 236 | common users the higher the similarity. 237 | 238 | The basic implementation is: 239 | - Select the first item 240 | - Loop through all other items 241 | -- Given the two items, get the users they have in common 242 | -- Update the similarity for all common users 243 | 244 | That is VERY slow due to the common user part, in which a long data structure is looped multiple times. 245 | 246 | A better way is to use the data structure in a different way skipping the search part, getting directly the 247 | information we need. 
248 | 249 | The implementation here used is: 250 | - Select the first item 251 | - Initialize a zero valued array for the similarities 252 | - Get the users who rated the first item 253 | - Loop through the users 254 | -- Given a user, get the items he rated (second item) 255 | -- Update the similarity of the items he rated 256 | 257 | 258 | """ 259 | 260 | # Create template used to initialize an array with zeros 261 | # Much faster than np.zeros(self.n_items) 262 | cdef array[double] template_zero = array('d') 263 | cdef array[double] result = clone(template_zero, self.n_items, zero=True) 264 | 265 | 266 | cdef long user_index, user_id, item_index, item_id_second 267 | 268 | cdef int[:] users_that_rated_item = self.getUsersThatRatedItem(item_id_input) 269 | cdef int[:] items_rated_by_user 270 | 271 | cdef double rating_item_input, rating_item_second 272 | 273 | # Get users that rated the items 274 | for user_index in range(len(users_that_rated_item)): 275 | 276 | user_id = users_that_rated_item[user_index] 277 | rating_item_input = self.item_to_user_data[self.item_to_user_col_ptr[item_id_input]+user_index] 278 | 279 | # Get all items rated by that user 280 | items_rated_by_user = self.getItemsRatedByUser(user_id) 281 | 282 | for item_index in range(len(items_rated_by_user)): 283 | 284 | item_id_second = items_rated_by_user[item_index] 285 | 286 | # Do not compute the similarity on the diagonal 287 | if item_id_second != item_id_input: 288 | # Increment similairty 289 | rating_item_second = self.user_to_item_data[self.user_to_item_row_ptr[user_id]+item_index] 290 | 291 | result[item_id_second] += rating_item_input*rating_item_second 292 | 293 | return result 294 | 295 | 296 | 297 | 298 | def compute_similarity(self): 299 | 300 | cdef int itemIndex, innerItemIndex 301 | cdef long long topKItemIndex 302 | 303 | cdef long long[:] top_k_idx 304 | 305 | # Declare numpy data type to use vetor indexing and simplify the topK selection code 306 | cdef np.ndarray[long, ndim=1] top_k_partition, top_k_partition_sorting 307 | cdef np.ndarray[np.float64_t, ndim=1] this_item_weights_np 308 | 309 | cdef double[:] this_item_weights 310 | 311 | cdef long processedItems = 0 312 | 313 | # Data structure to incrementally build sparse matrix 314 | # Preinitialize max possible length 315 | cdef double[:] values = np.zeros((self.n_items*self.TopK)) 316 | cdef int[:] rows = np.zeros((self.n_items*self.TopK,), dtype=np.int32) 317 | cdef int[:] cols = np.zeros((self.n_items*self.TopK,), dtype=np.int32) 318 | cdef long sparse_data_pointer = 0 319 | 320 | 321 | 322 | start_time = time.time() 323 | 324 | # Compute all similarities for each item 325 | for itemIndex in range(self.n_items): 326 | 327 | processedItems += 1 328 | 329 | if processedItems % 10000==0 or processedItems==self.n_items: 330 | 331 | itemPerSec = processedItems/(time.time()-start_time) 332 | 333 | print("Similarity column {} ( {:2.0f} % ), {:.2f} column/sec, elapsed time {:.2f} min".format( 334 | processedItems, processedItems*1.0/self.n_items*100, itemPerSec, (time.time()-start_time) / 60)) 335 | 336 | sys.stdout.flush() 337 | sys.stderr.flush() 338 | 339 | 340 | this_item_weights = self.computeItemSimilarities(itemIndex) 341 | 342 | 343 | # Apply normalization and shrinkage, ensure denominator != 0 344 | if self.normalize: 345 | for innerItemIndex in range(self.n_items): 346 | this_item_weights[innerItemIndex] /= self.sumOfSquared[itemIndex] * self.sumOfSquared[innerItemIndex]\ 347 | + self.shrink + 1e-6 348 | 349 | # Apply the specific 
denominator for Tanimoto 350 | elif self.tanimoto_coefficient: 351 | for innerItemIndex in range(self.n_items): 352 | this_item_weights[innerItemIndex] /= self.sumOfSquared[itemIndex] + self.sumOfSquared[innerItemIndex] -\ 353 | this_item_weights[innerItemIndex] + self.shrink + 1e-6 354 | 355 | elif self.shrink != 0: 356 | for innerItemIndex in range(self.n_items): 357 | this_item_weights[innerItemIndex] /= self.shrink 358 | 359 | 360 | if self.TopK == 0: 361 | 362 | for innerItemIndex in range(self.n_items): 363 | self.W_dense[innerItemIndex,itemIndex] = this_item_weights[innerItemIndex] 364 | 365 | else: 366 | 367 | # Sort indices and select TopK 368 | # Using numpy implies some overhead, unfortunately the plain C qsort function is even slower 369 | #top_k_idx = np.argsort(this_item_weights) [-self.TopK:] 370 | 371 | # Sorting is done in three steps. Faster then plain np.argsort for higher number of items 372 | # because we avoid sorting elements we already know we don't care about 373 | # - Partition the data to extract the set of TopK items, this set is unsorted 374 | # - Sort only the TopK items, discarding the rest 375 | # - Get the original item index 376 | 377 | this_item_weights_np = - np.array(this_item_weights) 378 | # 379 | # Get the unordered set of topK items 380 | top_k_partition = np.argpartition(this_item_weights_np, self.TopK-1)[0:self.TopK] 381 | # Sort only the elements in the partition 382 | top_k_partition_sorting = np.argsort(this_item_weights_np[top_k_partition]) 383 | # Get original index 384 | top_k_idx = top_k_partition[top_k_partition_sorting] 385 | 386 | 387 | 388 | # Incrementally build sparse matrix 389 | for innerItemIndex in range(len(top_k_idx)): 390 | 391 | topKItemIndex = top_k_idx[innerItemIndex] 392 | 393 | values[sparse_data_pointer] = this_item_weights[topKItemIndex] 394 | rows[sparse_data_pointer] = topKItemIndex 395 | cols[sparse_data_pointer] = itemIndex 396 | 397 | sparse_data_pointer += 1 398 | 399 | 400 | if self.TopK == 0: 401 | 402 | return np.array(self.W_dense) 403 | 404 | else: 405 | 406 | values = np.array(values[0:sparse_data_pointer]) 407 | rows = np.array(rows[0:sparse_data_pointer]) 408 | cols = np.array(cols[0:sparse_data_pointer]) 409 | 410 | W_sparse = sps.csr_matrix((values, (rows, cols)), 411 | shape=(self.n_items, self.n_items), 412 | dtype=np.float32) 413 | 414 | return W_sparse 415 | 416 | -------------------------------------------------------------------------------- /Base/Recommender.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | 5 | @author: Maurizio Ferrari Dacrema 6 | """ 7 | 8 | import multiprocessing 9 | import time 10 | 11 | import numpy as np 12 | 13 | from Base.metrics import roc_auc, precision, recall, map, ndcg, rr 14 | #from Base.Cython.metrics import roc_auc, precision, recall, map, ndcg, rr 15 | from Base.Recommender_utils import check_matrix, areURMequals, removeTopPop 16 | 17 | 18 | class Recommender(object): 19 | """Abstract Recommender""" 20 | 21 | def __init__(self): 22 | super(Recommender, self).__init__() 23 | self.URM_train = None 24 | self.sparse_weights = True 25 | self.normalize = False 26 | 27 | self.filterTopPop = False 28 | self.filterTopPop_ItemsID = np.array([], dtype=np.int) 29 | 30 | self.filterCustomItems = False 31 | self.filterCustomItems_ItemsID = np.array([], dtype=np.int) 32 | 33 | 34 | def fit(self): 35 | pass 36 | 37 | def _filter_TopPop_on_scores(self, scores): 38 | 
scores[self.filterTopPop_ItemsID] = -np.inf 39 | return scores 40 | 41 | 42 | def _filterCustomItems_on_scores(self, scores): 43 | scores[self.filterCustomItems_ItemsID] = -np.inf 44 | return scores 45 | 46 | 47 | def _filter_seen_on_scores(self, user_id, scores): 48 | 49 | seen = self.URM_train.indices[self.URM_train.indptr[user_id]:self.URM_train.indptr[user_id + 1]] 50 | 51 | scores[seen] = -np.inf 52 | return scores 53 | 54 | 55 | 56 | 57 | def evaluateRecommendations(self, URM_test_new, at=5, minRatingsPerUser=1, exclude_seen=True, 58 | mode='sequential', filterTopPop = False, 59 | filterCustomItems = np.array([], dtype=np.int), 60 | filterCustomUsers = np.array([], dtype=np.int)): 61 | """ 62 | Speed info: 63 | - Sparse weights: batch mode is 2x faster than sequential 64 | - Dense weights: batch and sequential speed are equivalent 65 | 66 | 67 | :param URM_test_new: URM to be used for testing 68 | :param at: 5 Length of the recommended items 69 | :param minRatingsPerUser: 1 Users with less than this number of interactions will not be evaluated 70 | :param exclude_seen: True Whether to remove already seen items from the recommended items 71 | 72 | :param mode: 'sequential', 'parallel', 'batch' 73 | :param filterTopPop: False or decimal number Percentage of items to be removed from recommended list and testing interactions 74 | :param filterCustomItems: Array, default empty Item IDs to NOT take into account when recommending 75 | :param filterCustomUsers: Array, default empty User IDs to NOT take into account when recommending 76 | :return: 77 | """ 78 | 79 | if len(filterCustomItems) == 0: 80 | self.filterCustomItems = False 81 | else: 82 | self.filterCustomItems = True 83 | self.filterCustomItems_ItemsID = np.array(filterCustomItems) 84 | 85 | 86 | if filterTopPop != False: 87 | 88 | self.filterTopPop = True 89 | 90 | _,_, self.filterTopPop_ItemsID = removeTopPop(self.URM_train, URM_2 = URM_test_new, percentageToRemove=filterTopPop) 91 | 92 | print("Filtering {}% TopPop items, count is: {}".format(filterTopPop*100, len(self.filterTopPop_ItemsID))) 93 | 94 | # Zero-out the items so that they are considered irrelevant 95 | URM_test_new = check_matrix(URM_test_new, format='lil') 96 | URM_test_new[:,self.filterTopPop_ItemsID] = 0 97 | URM_test_new = check_matrix(URM_test_new, format='csr') 98 | 99 | 100 | # During testing CSR is faster 101 | self.URM_test = check_matrix(URM_test_new, format='csr') 102 | self.URM_train = check_matrix(self.URM_train, format='csr') 103 | self.at = at 104 | self.minRatingsPerUser = minRatingsPerUser 105 | self.exclude_seen = exclude_seen 106 | 107 | 108 | nusers = self.URM_test.shape[0] 109 | 110 | # Prune users with an insufficient number of ratings 111 | rows = self.URM_test.indptr 112 | numRatings = np.ediff1d(rows) 113 | mask = numRatings >= minRatingsPerUser 114 | usersToEvaluate = np.arange(nusers)[mask] 115 | 116 | if len(filterCustomUsers) != 0: 117 | print("Filtering {} Users".format(len(filterCustomUsers))) 118 | usersToEvaluate = set(usersToEvaluate) - set(filterCustomUsers) 119 | 120 | usersToEvaluate = list(usersToEvaluate) 121 | 122 | 123 | 124 | if mode=='sequential': 125 | return self.evaluateRecommendationsSequential(usersToEvaluate) 126 | elif mode=='parallel': 127 | return self.evaluateRecommendationsParallel(usersToEvaluate) 128 | elif mode=='batch': 129 | return self.evaluateRecommendationsBatch(usersToEvaluate) 130 | # elif mode=='cython': 131 | # return self.evaluateRecommendationsCython(usersToEvaluate) 132 | # elif 
mode=='random-equivalent': 133 | # return self.evaluateRecommendationsRandomEquivalent(usersToEvaluate) 134 | else: 135 | raise ValueError("Mode '{}' not available".format(mode)) 136 | 137 | 138 | def get_user_relevant_items(self, user_id): 139 | 140 | return self.URM_test.indices[self.URM_test.indptr[user_id]:self.URM_test.indptr[user_id+1]] 141 | 142 | def get_user_test_ratings(self, user_id): 143 | 144 | return self.URM_test.data[self.URM_test.indptr[user_id]:self.URM_test.indptr[user_id+1]] 145 | 146 | 147 | 148 | 149 | def evaluateRecommendationsSequential(self, usersToEvaluate): 150 | 151 | start_time = time.time() 152 | 153 | roc_auc_, precision_, recall_, map_, mrr_, ndcg_ = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 154 | n_eval = 0 155 | 156 | for test_user in usersToEvaluate: 157 | 158 | # Calling the 'evaluateOneUser' function instead of copying its code would be cleaner, but is 20% slower 159 | 160 | # Being the URM CSR, the indices are the non-zero column indexes 161 | relevant_items = self.get_user_relevant_items(test_user) 162 | 163 | n_eval += 1 164 | 165 | recommended_items = self.recommend(user_id=test_user, exclude_seen=self.exclude_seen, 166 | n=self.at, filterTopPop=self.filterTopPop, filterCustomItems=self.filterCustomItems) 167 | 168 | is_relevant = np.in1d(recommended_items, relevant_items, assume_unique=True) 169 | 170 | # evaluate the recommendation list with ranking metrics ONLY 171 | roc_auc_ += roc_auc(is_relevant) 172 | precision_ += precision(is_relevant) 173 | recall_ += recall(is_relevant, relevant_items) 174 | map_ += map(is_relevant, relevant_items) 175 | mrr_ += rr(is_relevant) 176 | ndcg_ += ndcg(recommended_items, relevant_items, relevance=self.get_user_test_ratings(test_user), at=self.at) 177 | 178 | 179 | 180 | if n_eval % 10000 == 0 or n_eval==len(usersToEvaluate)-1: 181 | print("Processed {} ( {:.2f}% ) in {:.2f} seconds. 
Users per second: {:.0f}".format( 182 | n_eval, 183 | 100.0* float(n_eval+1)/len(usersToEvaluate), 184 | time.time()-start_time, 185 | float(n_eval)/(time.time()-start_time))) 186 | 187 | 188 | 189 | 190 | if (n_eval > 0): 191 | roc_auc_ /= n_eval 192 | precision_ /= n_eval 193 | recall_ /= n_eval 194 | map_ /= n_eval 195 | mrr_ /= n_eval 196 | ndcg_ /= n_eval 197 | 198 | else: 199 | print("WARNING: No users had a sufficient number of relevant items") 200 | 201 | results_run = {} 202 | 203 | results_run["AUC"] = roc_auc_ 204 | results_run["precision"] = precision_ 205 | results_run["recall"] = recall_ 206 | results_run["map"] = map_ 207 | results_run["NDCG"] = ndcg_ 208 | results_run["MRR"] = mrr_ 209 | 210 | return (results_run) 211 | 212 | 213 | 214 | 215 | def evaluateRecommendationsBatch(self, usersToEvaluate, batch_size = 1000): 216 | 217 | roc_auc_, precision_, recall_, map_, mrr_, ndcg_ = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 218 | n_eval = 0 219 | 220 | start_time = time.time() 221 | start_time_batch = time.time() 222 | 223 | #Number of blocks is rounded to the next integer 224 | totalNumberOfBatch = int(len(usersToEvaluate) / batch_size) + 1 225 | 226 | for current_batch in range(totalNumberOfBatch): 227 | 228 | user_first_id = current_batch*batch_size 229 | user_last_id = min((current_batch+1)*batch_size-1, len(usersToEvaluate)-1) 230 | 231 | users_in_batch = usersToEvaluate[user_first_id:user_last_id] 232 | 233 | relevant_items_batch = self.URM_test[users_in_batch] 234 | 235 | recommended_items_batch = self.recommendBatch(users_in_batch, 236 | exclude_seen=self.exclude_seen, 237 | n=self.at, filterTopPop=self.filterTopPop, 238 | filterCustomItems=self.filterCustomItems) 239 | 240 | 241 | for test_user in range(recommended_items_batch.shape[0]): 242 | 243 | n_eval += 1 244 | 245 | current_user = relevant_items_batch[test_user,:] 246 | 247 | relevant_items = current_user.indices 248 | recommended_items = recommended_items_batch[test_user,:] 249 | 250 | is_relevant = np.in1d(recommended_items, relevant_items, assume_unique=True) 251 | 252 | # evaluate the recommendation list with ranking metrics ONLY 253 | roc_auc_ += roc_auc(is_relevant) 254 | precision_ += precision(is_relevant) 255 | recall_ += recall(is_relevant, relevant_items) 256 | map_ += map(is_relevant, relevant_items) 257 | mrr_ += rr(is_relevant) 258 | ndcg_ += ndcg(recommended_items, relevant_items, relevance=current_user.data, at=self.at) 259 | 260 | 261 | 262 | if(time.time() - start_time_batch >= 20 or current_batch == totalNumberOfBatch-1): 263 | print("Processed {} ( {:.2f}% ) in {:.2f} seconds. 
Users per second: {:.0f}".format( 264 | n_eval, 265 | 100.0* float(n_eval)/len(usersToEvaluate), 266 | time.time()-start_time, 267 | float(n_eval)/(time.time()-start_time))) 268 | 269 | start_time_batch = time.time() 270 | 271 | 272 | if (n_eval > 0): 273 | roc_auc_ /= n_eval 274 | precision_ /= n_eval 275 | recall_ /= n_eval 276 | map_ /= n_eval 277 | mrr_ /= n_eval 278 | ndcg_ /= n_eval 279 | 280 | else: 281 | print("WARNING: No users had a sufficient number of relevant items") 282 | 283 | results_run = {} 284 | 285 | results_run["AUC"] = roc_auc_ 286 | results_run["precision"] = precision_ 287 | results_run["recall"] = recall_ 288 | results_run["map"] = map_ 289 | results_run["NDCG"] = ndcg_ 290 | results_run["MRR"] = mrr_ 291 | 292 | return (results_run) 293 | 294 | 295 | 296 | def evaluateOneUser(self, test_user): 297 | 298 | # Being the URM CSR, the indices are the non-zero column indexes 299 | #relevant_items = self.URM_test_relevantItems[test_user] 300 | relevant_items = self.URM_test[test_user].indices 301 | 302 | # this will rank top n items 303 | recommended_items = self.recommend(user_id=test_user, exclude_seen=self.exclude_seen, 304 | n=self.at, filterTopPop=self.filterTopPop, 305 | filterCustomItems=self.filterCustomItems) 306 | 307 | is_relevant = np.in1d(recommended_items, relevant_items, assume_unique=True) 308 | 309 | # evaluate the recommendation list with ranking metrics ONLY 310 | roc_auc_ = roc_auc(is_relevant) 311 | precision_ = precision(is_relevant) 312 | recall_ = recall(is_relevant, relevant_items) 313 | map_ = map(is_relevant, relevant_items) 314 | mrr_ = rr(is_relevant) 315 | ndcg_ = ndcg(recommended_items, relevant_items, relevance=self.get_user_test_ratings(test_user), at=self.at) 316 | 317 | return roc_auc_, precision_, recall_, map_, mrr_, ndcg_ 318 | 319 | 320 | 321 | def evaluateRecommendationsParallel(self, usersToEvaluate): 322 | 323 | print("Evaluation of {} users begins".format(len(usersToEvaluate))) 324 | 325 | pool = multiprocessing.Pool(processes=multiprocessing.cpu_count(), maxtasksperchild=1) 326 | resultList = pool.map(self.evaluateOneUser, usersToEvaluate) 327 | 328 | # Close the pool to avoid memory leaks 329 | pool.close() 330 | 331 | n_eval = len(usersToEvaluate) 332 | roc_auc_, precision_, recall_, map_, mrr_, ndcg_ = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 333 | 334 | # Looping is slightly faster then using the numpy vectorized approach, less data transformation 335 | for result in resultList: 336 | roc_auc_ += result[0] 337 | precision_ += result[1] 338 | recall_ += result[2] 339 | map_ += result[3] 340 | mrr_ += result[4] 341 | ndcg_ += result[5] 342 | 343 | 344 | if (n_eval > 0): 345 | roc_auc_ = roc_auc_/n_eval 346 | precision_ = precision_/n_eval 347 | recall_ = recall_/n_eval 348 | map_ = map_/n_eval 349 | mrr_ = mrr_/n_eval 350 | ndcg_ = ndcg_/n_eval 351 | 352 | else: 353 | print("WARNING: No users had a sufficient number of relevant items") 354 | 355 | 356 | print("Evaluated {} users".format(n_eval)) 357 | 358 | results = {} 359 | 360 | results["AUC"] = roc_auc_ 361 | results["precision"] = precision_ 362 | results["recall"] = recall_ 363 | results["map"] = map_ 364 | results["NDCG"] = ndcg_ 365 | results["MRR"] = mrr_ 366 | 367 | return (results) 368 | 369 | 370 | 371 | -------------------------------------------------------------------------------- /Base/Recommender_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | 5 | @author: 
Maurizio Ferrari Dacrema 6 | """ 7 | 8 | import numpy as np 9 | import scipy.sparse as sps 10 | import time 11 | import os 12 | 13 | def check_matrix(X, format='csc', dtype=np.float32): 14 | if format == 'csc' and not isinstance(X, sps.csc_matrix): 15 | return X.tocsc().astype(dtype) 16 | elif format == 'csr' and not isinstance(X, sps.csr_matrix): 17 | return X.tocsr().astype(dtype) 18 | elif format == 'coo' and not isinstance(X, sps.coo_matrix): 19 | return X.tocoo().astype(dtype) 20 | elif format == 'dok' and not isinstance(X, sps.dok_matrix): 21 | return X.todok().astype(dtype) 22 | elif format == 'bsr' and not isinstance(X, sps.bsr_matrix): 23 | return X.tobsr().astype(dtype) 24 | elif format == 'dia' and not isinstance(X, sps.dia_matrix): 25 | return X.todia().astype(dtype) 26 | elif format == 'lil' and not isinstance(X, sps.lil_matrix): 27 | return X.tolil().astype(dtype) 28 | else: 29 | return X.astype(dtype) 30 | 31 | 32 | def similarityMatrixTopK(item_weights, forceSparseOutput = True, k=100, verbose = False, inplace=True): 33 | """ 34 | The function selects the TopK most similar elements, column-wise 35 | 36 | :param item_weights: 37 | :param forceSparseOutput: 38 | :param k: 39 | :param verbose: 40 | :param inplace: Default True, WARNING matrix will be modified 41 | :return: 42 | """ 43 | 44 | assert (item_weights.shape[0] == item_weights.shape[1]), "selectTopK: ItemWeights is not a square matrix" 45 | 46 | start_time = time.time() 47 | 48 | if verbose: 49 | print("Generating topK matrix") 50 | 51 | nitems = item_weights.shape[1] 52 | k = min(k, nitems) 53 | 54 | # for each column, keep only the top-k scored items 55 | sparse_weights = not isinstance(item_weights, np.ndarray) 56 | 57 | if not sparse_weights: 58 | 59 | idx_sorted = np.argsort(item_weights, axis=0) # sort data inside each column 60 | 61 | if inplace: 62 | W = item_weights 63 | else: 64 | W = item_weights.copy() 65 | 66 | # index of the items that don't belong to the top-k similar items of each column 67 | not_top_k = idx_sorted[:-k, :] 68 | # use numpy fancy indexing to zero-out the values in sim without using a for loop 69 | W[not_top_k, np.arange(nitems)] = 0.0 70 | 71 | if forceSparseOutput: 72 | W_sparse = sps.csr_matrix(W, shape=(nitems, nitems)) 73 | 74 | if verbose: 75 | print("Sparse TopK matrix generated in {:.2f} seconds".format(time.time() - start_time)) 76 | 77 | return W_sparse 78 | 79 | if verbose: 80 | print("Dense TopK matrix generated in {:.2f} seconds".format(time.time()-start_time)) 81 | 82 | return W 83 | 84 | else: 85 | # iterate over each column and keep only the top-k similar items 86 | data, rows_indices, cols_indptr = [], [], [] 87 | 88 | item_weights = check_matrix(item_weights, format='csc', dtype=np.float32) 89 | 90 | for item_idx in range(nitems): 91 | 92 | cols_indptr.append(len(data)) 93 | 94 | start_position = item_weights.indptr[item_idx] 95 | end_position = item_weights.indptr[item_idx+1] 96 | 97 | column_data = item_weights.data[start_position:end_position] 98 | column_row_index = item_weights.indices[start_position:end_position] 99 | 100 | idx_sorted = np.argsort(column_data) # sort by column 101 | top_k_idx = idx_sorted[-k:] 102 | 103 | data.extend(column_data[top_k_idx]) 104 | rows_indices.extend(column_row_index[top_k_idx]) 105 | 106 | 107 | cols_indptr.append(len(data)) 108 | 109 | # During testing CSR is faster 110 | W_sparse = sps.csc_matrix((data, rows_indices, cols_indptr), shape=(nitems, nitems), dtype=np.float32) 111 | W_sparse = W_sparse.tocsr() 112 | 113 | if 
verbose: 114 | print("Sparse TopK matrix generated in {:.2f} seconds".format(time.time() - start_time)) 115 | 116 | return W_sparse 117 | 118 | 119 | 120 | 121 | def removeZeroRatingRowAndCol(URM): 122 | 123 | rows = URM.indptr 124 | numRatings = np.ediff1d(rows) 125 | mask = numRatings >= 1 126 | 127 | URM = URM[mask,:] 128 | 129 | cols = URM.tocsc().indptr 130 | numRatings = np.ediff1d(cols) 131 | mask = numRatings >= 1 132 | 133 | URM = URM[:,mask] 134 | 135 | return URM.tocsr() 136 | 137 | 138 | def areURMequals(URM1, URM2): 139 | 140 | if(URM1.shape != URM2.shape): 141 | return False 142 | 143 | return (URM1-URM2).nnz ==0 144 | 145 | 146 | def removeTopPop(URM_1, URM_2=None, percentageToRemove=0.2): 147 | """ 148 | Remove the top popular items from the matrix 149 | :param URM_1: user X items 150 | :param URM_2: user X items 151 | :param percentageToRemove: value 1 corresponds to 100% 152 | :return: URM: user X selectedItems, obtained from URM_1 153 | Array: itemMappings[selectedItemIndex] = originalItemIndex 154 | Array: removedItems 155 | """ 156 | 157 | 158 | item_pop = URM_1.sum(axis=0) # this command returns a numpy.matrix of size (1, nitems) 159 | 160 | if URM_2 != None: 161 | 162 | assert URM_2.shape[1] == URM_1.shape[1], \ 163 | "The two URM do not contain the same number of columns, URM_1 has {}, URM_2 has {}".format(URM_1.shape[1], URM_2.shape[1]) 164 | 165 | item_pop += URM_2.sum(axis=0) 166 | 167 | 168 | item_pop = np.asarray(item_pop).squeeze() # necessary to convert it into a numpy.array of size (nitems,) 169 | popularItemsSorted = np.argsort(item_pop)[::-1] 170 | 171 | numItemsToRemove = int(len(popularItemsSorted)*percentageToRemove) 172 | 173 | # Choose which columns to keep 174 | itemMask = np.in1d(np.arange(len(popularItemsSorted)), popularItemsSorted[:numItemsToRemove], invert=True) 175 | 176 | # Map the column index of the new URM to the original ItemID 177 | itemMappings = np.arange(len(popularItemsSorted))[itemMask] 178 | 179 | removedItems = np.arange(len(popularItemsSorted))[np.logical_not(itemMask)] 180 | 181 | return URM_1[:,itemMask], itemMappings, removedItems 182 | 183 | 184 | 185 | def loadCSVintoSparse (filePath, header = False): 186 | 187 | values, rows, cols = [], [], [] 188 | 189 | fileHandle = open(filePath, "r") 190 | numCells = 0 191 | 192 | if header: 193 | fileHandle.readline() 194 | 195 | for line in fileHandle: 196 | numCells += 1 197 | if (numCells % 1000000 == 0): 198 | print("Processed {} cells".format(numCells)) 199 | 200 | if (len(line)) > 1: 201 | line = line.split(",") 202 | 203 | value = line[2].replace("\n", "") 204 | 205 | if not value == "0" and not value == "NaN": 206 | rows.append(int(line[0])) 207 | cols.append(int(line[1])) 208 | values.append(float(value)) 209 | 210 | return sps.csr_matrix((values, (rows, cols)), dtype=np.float32) 211 | 212 | 213 | 214 | -------------------------------------------------------------------------------- /Base/Recommender_utils_Test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on 30/09/17 5 | 6 | @author: Maurizio Ferrari Dacrema 7 | """ 8 | 9 | from Base.Recommender_utils import similarityMatrixTopK 10 | 11 | import numpy as np 12 | import scipy.sparse as sps 13 | import unittest 14 | 15 | 16 | class MyTestCase(unittest.TestCase): 17 | 18 | def test_similarityMatrixTopK_denseToDense(self): 19 | 20 | numRows = 100 21 | 22 | TopK = 20 23 | 24 | dense_input = np.random.random((numRows, 
numRows)) 25 | dense_output = similarityMatrixTopK(dense_input, k=TopK, forceSparseOutput=False) 26 | 27 | numExpectedNonZeroCells = TopK*numRows 28 | 29 | numNonZeroCells = np.sum(dense_output!=0) 30 | 31 | self.assertEqual(numExpectedNonZeroCells, numNonZeroCells, "DenseToDense incorrect") 32 | 33 | 34 | def test_similarityMatrixTopK_denseToSparse(self): 35 | 36 | numRows = 100 37 | 38 | TopK = 20 39 | 40 | dense = np.random.random((numRows, numRows)) 41 | 42 | sparse = similarityMatrixTopK(dense, k=TopK, forceSparseOutput=True) 43 | dense = similarityMatrixTopK(dense, k=TopK, forceSparseOutput=False) 44 | 45 | 46 | self.assertTrue(np.equal(dense, sparse.todense()).all(), "denseToSparse incorrect") 47 | 48 | 49 | def test_similarityMatrixTopK_sparseToSparse(self): 50 | 51 | numRows = 20 52 | 53 | TopK = 5 54 | 55 | dense_input = np.random.random((numRows, numRows)) 56 | sparse_input = sps.csr_matrix(dense_input) 57 | 58 | dense_output = similarityMatrixTopK(dense_input, k=TopK, forceSparseOutput=False, inplace=False) 59 | sparse_output = similarityMatrixTopK(sparse_input, k=TopK, forceSparseOutput=True) 60 | 61 | self.assertTrue(np.all((dense_output - sparse_output.todense())<1e-6), "sparseToSparse CSR incorrect") 62 | 63 | sparse_input = sps.csc_matrix(dense_input) 64 | sparse_output = similarityMatrixTopK(sparse_input, k=TopK, forceSparseOutput=True) 65 | self.assertTrue(np.all((dense_output - sparse_output.todense())<1e-6), "sparseToSparse CSC incorrect") 66 | 67 | if __name__ == '__main__': 68 | 69 | unittest.main() 70 | 71 | -------------------------------------------------------------------------------- /Base/Similarity_Matrix_Recommender.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on 16/09/2017 5 | 6 | @author: Maurizio Ferrari Dacrema 7 | """ 8 | 9 | import numpy as np 10 | 11 | 12 | class Similarity_Matrix_Recommender(object): 13 | 14 | def __init__(self): 15 | super(Similarity_Matrix_Recommender, self).__init__() 16 | 17 | 18 | 19 | def recommend(self, user_id, n=None, exclude_seen=True, filterTopPop = False, filterCustomItems = False): 20 | 21 | if n==None: 22 | n=self.URM_train.shape[1]-1 23 | 24 | # compute the scores using the dot product 25 | if self.sparse_weights: 26 | user_profile = self.URM_train[user_id] 27 | 28 | scores = user_profile.dot(self.W_sparse).toarray().ravel() 29 | 30 | else: 31 | 32 | user_profile = self.URM_train.indices[self.URM_train.indptr[user_id]:self.URM_train.indptr[user_id + 1]] 33 | user_ratings = self.URM_train.data[self.URM_train.indptr[user_id]:self.URM_train.indptr[user_id + 1]] 34 | 35 | relevant_weights = self.W[user_profile] 36 | scores = relevant_weights.T.dot(user_ratings) 37 | 38 | if self.normalize: 39 | # normalization will keep the scores in the same range 40 | # of value of the ratings in dataset 41 | rated = user_profile.copy() 42 | rated.data = np.ones_like(rated.data) 43 | if self.sparse_weights: 44 | den = rated.dot(self.W_sparse).toarray().ravel() 45 | else: 46 | den = rated.dot(self.W).ravel() 47 | den[np.abs(den) < 1e-6] = 1.0 # to avoid NaNs 48 | scores /= den 49 | 50 | if exclude_seen: 51 | scores = self._filter_seen_on_scores(user_id, scores) 52 | 53 | if filterTopPop: 54 | scores = self._filter_TopPop_on_scores(scores) 55 | 56 | if filterCustomItems: 57 | scores = self._filterCustomItems_on_scores(scores) 58 | 59 | 60 | # rank items and mirror column to obtain a ranking in descending score 61 | #ranking = 
scores.argsort() 62 | #ranking = np.flip(ranking, axis=0) 63 | 64 | # Sorting is done in three steps. Faster then plain np.argsort for higher number of items 65 | # - Partition the data to extract the set of relevant items 66 | # - Sort only the relevant items 67 | # - Get the original item index 68 | relevant_items_partition = (-scores).argpartition(n)[0:n] 69 | relevant_items_partition_sorting = np.argsort(-scores[relevant_items_partition]) 70 | ranking = relevant_items_partition[relevant_items_partition_sorting] 71 | 72 | 73 | return ranking 74 | 75 | 76 | 77 | 78 | def recommendBatch(self, users_in_batch, n=None, exclude_seen=True, filterTopPop = False, filterCustomItems = False): 79 | 80 | # compute the scores using the dot product 81 | user_profile_batch = self.URM_train[users_in_batch] 82 | 83 | if self.sparse_weights: 84 | scores_array = np.array(user_profile_batch.dot(self.W_sparse)) 85 | 86 | else: 87 | scores_array = user_profile_batch.dot(self.W) 88 | 89 | if self.normalize: 90 | raise ValueError("Not implemented") 91 | 92 | # To exclude seen items perform a boolean indexing and replace their score with -inf 93 | # Seen items will be at the bottom of the list but there is no guarantee they'll NOT be 94 | # recommended 95 | if exclude_seen: 96 | scores_array[user_profile_batch.nonzero()] = -np.inf 97 | 98 | if filterTopPop: 99 | scores_array[:,self.filterTopPop_ItemsID] = -np.inf 100 | 101 | if filterCustomItems: 102 | scores_array[:, self.filterCustomItems_ItemsID] = -np.inf 103 | 104 | 105 | # rank items and mirror column to obtain a ranking in descending score 106 | #ranking = (-scores_array).argsort(axis=1) 107 | #ranking = np.fliplr(ranking) 108 | #ranking = ranking[:,0:n] 109 | 110 | ranking = np.zeros((scores_array.shape[0],n), dtype=np.int) 111 | 112 | for row_index in range(scores_array.shape[0]): 113 | scores = scores_array[row_index] 114 | 115 | relevant_items_partition = (-scores).argpartition(n)[0:n] 116 | relevant_items_partition_sorting = np.argsort(-scores[relevant_items_partition]) 117 | ranking[row_index] = relevant_items_partition[relevant_items_partition_sorting] 118 | 119 | 120 | return ranking 121 | 122 | 123 | 124 | def recommend_new_user(self, user_profile, n=None, exclude_seen=True): 125 | # compute the scores using the dot product 126 | if self.sparse_weights: 127 | assert user_profile.shape[1] == self.W_sparse.shape[0], 'The number of items does not match!' 128 | scores = user_profile.dot(self.W_sparse).toarray().ravel() 129 | else: 130 | assert user_profile.shape[1] == self.W.shape[0], 'The number of items does not match!' 
131 | scores = user_profile.dot(self.W).ravel() 132 | if self.normalize: 133 | # normalization will keep the scores in the same range 134 | # of value of the ratings in dataset 135 | rated = user_profile.copy() 136 | rated.data = np.ones_like(rated.data) 137 | if self.sparse_weights: 138 | den = rated.dot(self.W_sparse).toarray().ravel() 139 | else: 140 | den = rated.dot(self.W).ravel() 141 | den[np.abs(den) < 1e-6] = 1.0 # to avoid NaNs 142 | scores /= den 143 | # rank items 144 | ranking = scores.argsort()[::-1] 145 | 146 | if exclude_seen: 147 | seen = user_profile.indices 148 | unseen_mask = np.in1d(ranking, seen, assume_unique=True, invert=True) 149 | ranking = ranking[unseen_mask] 150 | return ranking[:n] 151 | 152 | 153 | -------------------------------------------------------------------------------- /Base/cosine_similarity.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on 23/10/17 5 | 6 | @author: Maurizio Ferrari Dacrema 7 | """ 8 | 9 | import numpy as np 10 | import time, sys 11 | import scipy.sparse as sps 12 | from Base.Recommender_utils import check_matrix 13 | 14 | 15 | 16 | class Cosine_Similarity: 17 | 18 | 19 | def __init__(self, dataMatrix, topK=100, shrink = 0, normalize = True, 20 | mode = "cosine"): 21 | """ 22 | Computes the cosine similarity on the columns of dataMatrix 23 | If it is computed on URM=|users|x|items|, pass the URM as is. 24 | If it is computed on ICM=|items|x|features|, pass the ICM transposed. 25 | :param dataMatrix: 26 | :param topK: 27 | :param shrink: 28 | :param normalize: 29 | :param mode: "cosine" computes Cosine similarity 30 | "adjusted" computes Adjusted Cosine, removing the average of the users 31 | "pearson" computes Pearson Correlation, removing the average of the items 32 | "jaccard" computes Jaccard similarity for binary interactions using Tanimoto 33 | "tanimoto" computes Tanimoto coefficient for binary interactions 34 | 35 | """ 36 | 37 | super(Cosine_Similarity, self).__init__() 38 | 39 | self.TopK = topK 40 | self.shrink = shrink 41 | self.normalize = normalize 42 | self.n_columns = dataMatrix.shape[1] 43 | self.n_rows = dataMatrix.shape[0] 44 | 45 | self.dataMatrix = dataMatrix.copy() 46 | 47 | self.adjusted_cosine = False 48 | self.pearson_correlation = False 49 | self.tanimoto_coefficient = False 50 | 51 | if mode == "adjusted": 52 | self.adjusted_cosine = True 53 | elif mode == "pearson": 54 | self.pearson_correlation = True 55 | elif mode == "jaccard" or mode == "tanimoto": 56 | self.tanimoto_coefficient = True 57 | # Tanimoto has a specific kind of normalization 58 | self.normalize = False 59 | 60 | elif mode == "cosine": 61 | pass 62 | else: 63 | raise ValueError("Cosine_Similarity: value for paramether 'mode' not recognized." 64 | " Allowed values are: 'cosine', 'pearson', 'adjusted', 'jaccard', 'tanimoto'." 
65 | " Passed value was '{}'".format(mode)) 66 | 67 | 68 | 69 | if self.TopK == 0: 70 | self.W_dense = np.zeros((self.n_columns, self.n_columns)) 71 | 72 | 73 | 74 | 75 | def applyAdjustedCosine(self): 76 | """ 77 | Remove from every data point the average for the corresponding row 78 | :return: 79 | """ 80 | 81 | self.dataMatrix = check_matrix(self.dataMatrix, 'csr') 82 | 83 | 84 | interactionsPerRow = np.diff(self.dataMatrix.indptr) 85 | 86 | nonzeroRows = interactionsPerRow > 0 87 | sumPerRow = np.asarray(self.dataMatrix.sum(axis=1)).ravel() 88 | 89 | rowAverage = np.zeros_like(sumPerRow) 90 | rowAverage[nonzeroRows] = sumPerRow[nonzeroRows] / interactionsPerRow[nonzeroRows] 91 | 92 | 93 | # Split in blocks to avoid duplicating the whole data structure 94 | start_row = 0 95 | end_row= 0 96 | 97 | blockSize = 1000 98 | 99 | 100 | while end_row < self.n_rows: 101 | 102 | end_row = min(self.n_rows, end_row + blockSize) 103 | 104 | self.dataMatrix.data[self.dataMatrix.indptr[start_row]:self.dataMatrix.indptr[end_row]] -= \ 105 | np.repeat(rowAverage[start_row:end_row], interactionsPerRow[start_row:end_row]) 106 | 107 | start_row += blockSize 108 | 109 | 110 | 111 | 112 | def applyPearsonCorrelation(self): 113 | """ 114 | Remove from every data point the average for the corresponding column 115 | :return: 116 | """ 117 | 118 | self.dataMatrix = check_matrix(self.dataMatrix, 'csc') 119 | 120 | 121 | interactionsPerCol = np.diff(self.dataMatrix.indptr) 122 | 123 | nonzeroCols = interactionsPerCol > 0 124 | sumPerCol = np.asarray(self.dataMatrix.sum(axis=0)).ravel() 125 | 126 | colAverage = np.zeros_like(sumPerCol) 127 | colAverage[nonzeroCols] = sumPerCol[nonzeroCols] / interactionsPerCol[nonzeroCols] 128 | 129 | 130 | # Split in blocks to avoid duplicating the whole data structure 131 | start_col = 0 132 | end_col= 0 133 | 134 | blockSize = 1000 135 | 136 | 137 | while end_col < self.n_columns: 138 | 139 | end_col = min(self.n_columns, end_col + blockSize) 140 | 141 | self.dataMatrix.data[self.dataMatrix.indptr[start_col]:self.dataMatrix.indptr[end_col]] -= \ 142 | np.repeat(colAverage[start_col:end_col], interactionsPerCol[start_col:end_col]) 143 | 144 | start_col += blockSize 145 | 146 | 147 | def useOnlyBooleanInteractions(self): 148 | 149 | # Split in blocks to avoid duplicating the whole data structure 150 | start_pos = 0 151 | end_pos= 0 152 | 153 | blockSize = 1000 154 | 155 | 156 | while end_pos < len(self.dataMatrix.data): 157 | 158 | end_pos = min(len(self.dataMatrix.data), end_pos + blockSize) 159 | 160 | self.dataMatrix.data[start_pos:end_pos] = np.ones(end_pos-start_pos) 161 | 162 | start_pos += blockSize 163 | 164 | 165 | 166 | 167 | def compute_similarity(self): 168 | 169 | values = [] 170 | rows = [] 171 | cols = [] 172 | 173 | start_time = time.time() 174 | start_time_print_batch = start_time 175 | processedItems = 0 176 | 177 | if self.adjusted_cosine: 178 | self.applyAdjustedCosine() 179 | 180 | elif self.pearson_correlation: 181 | self.applyPearsonCorrelation() 182 | 183 | elif self.tanimoto_coefficient: 184 | self.useOnlyBooleanInteractions() 185 | 186 | 187 | # We explore the matrix column-wise 188 | self.dataMatrix = check_matrix(self.dataMatrix, 'csc') 189 | 190 | 191 | # Compute sum of squared values to be used in normalization 192 | sumOfSquared = np.array(self.dataMatrix.power(2).sum(axis=0)).ravel() 193 | 194 | # Tanimoto does not require the square root to be applied 195 | if not self.tanimoto_coefficient: 196 | sumOfSquared = np.sqrt(sumOfSquared) 197 | 198 | 199 
| # Compute all similarities for each item using vectorization 200 | for columnIndex in range(self.n_columns): 201 | 202 | processedItems += 1 203 | 204 | if time.time() - start_time_print_batch >= 30 or processedItems==self.n_columns: 205 | columnPerSec = processedItems / (time.time() - start_time) 206 | 207 | print("Similarity column {} ( {:2.0f} % ), {:.2f} column/sec, elapsed time {:.2f} min".format( 208 | processedItems, processedItems / self.n_columns * 100, columnPerSec, (time.time() - start_time)/ 60)) 209 | 210 | sys.stdout.flush() 211 | sys.stderr.flush() 212 | 213 | start_time_print_batch = time.time() 214 | 215 | 216 | # All data points for a given item 217 | item_data = self.dataMatrix[:, columnIndex] 218 | item_data = item_data.toarray().squeeze() 219 | 220 | # Compute item similarities 221 | this_column_weights = self.dataMatrix.T.dot(item_data) 222 | this_column_weights[columnIndex] = 0.0 223 | 224 | # Apply normalization and shrinkage, ensure denominator != 0 225 | if self.normalize: 226 | denominator = sumOfSquared[columnIndex] * sumOfSquared + self.shrink + 1e-6 227 | this_column_weights = np.multiply(this_column_weights, 1 / denominator) 228 | 229 | # Apply the specific denominator for Tanimoto 230 | elif self.tanimoto_coefficient: 231 | denominator = sumOfSquared[columnIndex] + sumOfSquared - this_column_weights + self.shrink + 1e-6 232 | this_column_weights = np.multiply(this_column_weights, 1 / denominator) 233 | 234 | # If no normalization or tanimoto is selected, apply only shrink 235 | elif self.shrink != 0: 236 | this_column_weights = this_column_weights/self.shrink 237 | 238 | 239 | if self.TopK == 0: 240 | self.W_dense[:, columnIndex] = this_column_weights 241 | 242 | else: 243 | # Sort indices and select TopK 244 | # Sorting is done in three steps. 
Faster then plain np.argsort for higher number of items 245 | # - Partition the data to extract the set of relevant items 246 | # - Sort only the relevant items 247 | # - Get the original item index 248 | relevant_items_partition = (-this_column_weights).argpartition(self.TopK-1)[0:self.TopK] 249 | relevant_items_partition_sorting = np.argsort(-this_column_weights[relevant_items_partition]) 250 | top_k_idx = relevant_items_partition[relevant_items_partition_sorting] 251 | 252 | # Incrementally build sparse matrix 253 | values.extend(this_column_weights[top_k_idx]) 254 | rows.extend(top_k_idx) 255 | cols.extend(np.ones(self.TopK) * columnIndex) 256 | 257 | if self.TopK == 0: 258 | return self.W_dense 259 | 260 | else: 261 | 262 | W_sparse = sps.csr_matrix((values, (rows, cols)), 263 | shape=(self.n_columns, self.n_columns), 264 | dtype=np.float32) 265 | 266 | 267 | return W_sparse -------------------------------------------------------------------------------- /Base/cosine_similarity_test.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on 23/10/17 3 | 4 | @author: Maurizio Ferrari Dacrema 5 | """ 6 | 7 | import unittest 8 | 9 | from Base.Recommender_utils import similarityMatrixTopK 10 | import subprocess, os 11 | import numpy as np 12 | import time 13 | import scipy.sparse as sps 14 | 15 | 16 | 17 | def areSparseEquals(Sparse1, Sparse2): 18 | 19 | if(Sparse1.shape != Sparse2.shape): 20 | return False 21 | 22 | return (Sparse1 - Sparse2).nnz ==0 23 | 24 | 25 | 26 | 27 | class MyTestCase(unittest.TestCase): 28 | 29 | def test_cosine_similarity_dense(self): 30 | 31 | from Base.Cython.cosine_similarity import Cosine_Similarity as Cosine_Similarity_Cython 32 | from Base.cosine_similarity import Cosine_Similarity as Cosine_Similarity_Python 33 | 34 | TopK = 0 35 | 36 | data_matrix = np.array([[1,1,0,1],[0,1,1,1],[1,0,1,0]]) 37 | data_matrix = sps.csr_matrix(data_matrix) 38 | 39 | cosine_similarity = Cosine_Similarity_Cython(data_matrix, topK=TopK, normalize = False) 40 | W_dense_Cython = cosine_similarity.compute_similarity() 41 | 42 | cosine_similarity = Cosine_Similarity_Python(data_matrix, topK=TopK, normalize = False) 43 | W_dense_Python = cosine_similarity.compute_similarity() 44 | 45 | 46 | W_dense_mul = data_matrix.T.dot(data_matrix) 47 | W_dense_mul[np.arange(W_dense_mul.shape[0]),np.arange(W_dense_mul.shape[0])] = 0.0 48 | 49 | assert np.all(W_dense_Cython == W_dense_mul), "W_dense_Cython not matching control" 50 | assert np.all(W_dense_Python == W_dense_mul), "W_dense_Python not matching control" 51 | 52 | 53 | 54 | 55 | def test_cosine_similarity_dense_external_cfr(self): 56 | 57 | from Base.Cython.cosine_similarity import Cosine_Similarity as Cosine_Similarity_Cython 58 | from Base.cosine_similarity import Cosine_Similarity as Cosine_Similarity_Python 59 | from sklearn.metrics.pairwise import cosine_similarity as Cosine_Similarity_Sklearn 60 | 61 | from scipy.spatial.distance import jaccard as Jaccard_Distance_Scipy 62 | 63 | 64 | TopK = 0 65 | shrink = 0 66 | 67 | data_matrix = np.array([[1,2,0,1],[0,1,4,1],[1,3,1,0]]) 68 | data_matrix = sps.csr_matrix(data_matrix) 69 | 70 | cosine_similarity = Cosine_Similarity_Cython(data_matrix, topK=TopK, normalize = True, shrink=shrink) 71 | W_dense_Cython = cosine_similarity.compute_similarity() 72 | 73 | cosine_similarity = Cosine_Similarity_Python(data_matrix, topK=TopK, normalize = True, shrink=shrink) 74 | W_dense_Python = cosine_similarity.compute_similarity() 75 | 76 | 
W_dense_sklearn = Cosine_Similarity_Sklearn(data_matrix.copy().T) 77 | W_dense_sklearn[np.arange(W_dense_sklearn.shape[0]),np.arange(W_dense_sklearn.shape[0])] = 0.0 78 | 79 | 80 | assert np.allclose(W_dense_Cython, W_dense_sklearn, atol=1e-4), "W_dense_Cython Cosine not matching Sklearn control" 81 | assert np.allclose(W_dense_Python, W_dense_sklearn, atol=1e-4), "W_dense_Python Cosine not matching Sklearn control" 82 | 83 | 84 | data_matrix = np.array([[1,2,0,1],[0,1,4,1],[1,3,1,0]]) 85 | data_matrix = sps.csr_matrix(data_matrix) 86 | 87 | 88 | cosine_similarity = Cosine_Similarity_Cython(data_matrix, topK=TopK, normalize = True, shrink=shrink, 89 | mode='jaccard') 90 | W_dense_Cython = cosine_similarity.compute_similarity() 91 | 92 | cosine_similarity = Cosine_Similarity_Python(data_matrix, topK=TopK, normalize = True, shrink=shrink, 93 | mode='jaccard') 94 | W_dense_Python = cosine_similarity.compute_similarity() 95 | 96 | 97 | W_dense_Scipy = np.zeros_like(W_dense_Python) 98 | data_matrix.data = np.ones_like(data_matrix.data) 99 | data_matrix = data_matrix.toarray() 100 | 101 | for row in range(W_dense_Scipy.shape[0]): 102 | for col in range(W_dense_Scipy.shape[1]): 103 | 104 | if row != col: 105 | W_dense_Scipy[row, col] = 1-Jaccard_Distance_Scipy(data_matrix[:,row], data_matrix[:,col]) 106 | 107 | 108 | assert np.allclose(W_dense_Cython, W_dense_Scipy, atol=1e-4), "W_dense_Cython Jaccard not matching Scipy control" 109 | assert np.allclose(W_dense_Python, W_dense_Scipy, atol=1e-4), "W_dense_Python Jaccard not matching Scipy control" 110 | 111 | 112 | 113 | 114 | 115 | def test_cosine_similarity_dense_normalize(self): 116 | 117 | from Base.Cython.cosine_similarity import Cosine_Similarity as Cosine_Similarity_Cython 118 | from Base.cosine_similarity import Cosine_Similarity as Cosine_Similarity_Python 119 | 120 | import numpy.matlib 121 | 122 | TopK = 0 123 | shrink = 5 124 | 125 | data_matrix = np.array([[1,1,0,1],[0,1,1,1],[1,0,1,0]]) 126 | data_matrix = sps.csr_matrix(data_matrix) 127 | 128 | cosine_similarity = Cosine_Similarity_Cython(data_matrix, topK=TopK, normalize = True, shrink=shrink) 129 | W_dense_Cython = cosine_similarity.compute_similarity() 130 | 131 | cosine_similarity = Cosine_Similarity_Python(data_matrix, topK=TopK, normalize = True, shrink=shrink) 132 | W_dense_Python = cosine_similarity.compute_similarity() 133 | 134 | 135 | W_dense_denominator = np.matlib.repmat(data_matrix.power(2).sum(axis=0), data_matrix.shape[1], 1) 136 | W_dense_denominator = np.sqrt(W_dense_denominator) 137 | W_dense_denominator = np.multiply(W_dense_denominator, W_dense_denominator.T) + shrink 138 | 139 | W_dense_mul = data_matrix.T.dot(data_matrix) 140 | W_dense_mul /= W_dense_denominator 141 | 142 | W_dense_mul[np.arange(W_dense_mul.shape[0]),np.arange(W_dense_mul.shape[0])] = 0.0 143 | 144 | 145 | assert np.allclose(W_dense_Cython, W_dense_mul, atol=1e-4), "W_dense_Cython not matching control" 146 | assert np.allclose(W_dense_Python, W_dense_mul, atol=1e-4), "W_dense_Python not matching control" 147 | 148 | 149 | 150 | 151 | def test_cosine_similarity_dense_adjusted(self): 152 | 153 | from Base.Cython.cosine_similarity import Cosine_Similarity as Cosine_Similarity_Cython 154 | from Base.cosine_similarity import Cosine_Similarity as Cosine_Similarity_Python 155 | 156 | import numpy.matlib 157 | 158 | TopK = 0 159 | shrink = 0 160 | 161 | data_matrix = np.array([[1,2,0,1],[0,1,4,1],[1,3,1,0]]) 162 | data_matrix = sps.csr_matrix(data_matrix) 163 | 164 | cosine_similarity = 
Cosine_Similarity_Cython(data_matrix, topK=TopK, normalize = True, 165 | shrink=shrink, mode='adjusted') 166 | W_dense_Cython = cosine_similarity.compute_similarity() 167 | 168 | cosine_similarity = Cosine_Similarity_Python(data_matrix, topK=TopK, normalize = True, 169 | shrink=shrink, mode='adjusted') 170 | W_dense_Python = cosine_similarity.compute_similarity() 171 | 172 | 173 | data_matrix = data_matrix.toarray().astype(np.float64) 174 | for row in range(data_matrix.shape[0]): 175 | 176 | nonzeroMask = data_matrix[row,:]>0 177 | data_matrix[row,:][nonzeroMask] -= np.mean(data_matrix[row,:][nonzeroMask]) 178 | 179 | 180 | W_dense_denominator = np.matlib.repmat((data_matrix**2).sum(axis=0), data_matrix.shape[1], 1) 181 | W_dense_denominator = np.sqrt(W_dense_denominator) 182 | W_dense_denominator = np.multiply(W_dense_denominator, W_dense_denominator.T) + shrink 183 | 184 | W_dense_mul = data_matrix.T.dot(data_matrix) 185 | W_dense_mul[W_dense_denominator>0] /= W_dense_denominator[W_dense_denominator>0] 186 | 187 | W_dense_mul[np.arange(W_dense_mul.shape[0]),np.arange(W_dense_mul.shape[0])] = 0.0 188 | 189 | assert np.allclose(W_dense_Cython, W_dense_mul, atol=1e-4), "W_dense_Cython not matching control" 190 | assert np.allclose(W_dense_Python, W_dense_mul, atol=1e-4), "W_dense_Python not matching control" 191 | 192 | 193 | 194 | def test_cosine_similarity_dense_pearson(self): 195 | 196 | from Base.Cython.cosine_similarity import Cosine_Similarity as Cosine_Similarity_Cython 197 | from Base.cosine_similarity import Cosine_Similarity as Cosine_Similarity_Python 198 | 199 | import numpy.matlib 200 | 201 | TopK = 0 202 | shrink = 0 203 | 204 | data_matrix = np.array([[1,2,0,1],[0,1,4,1],[1,3,1,0]]) 205 | data_matrix = sps.csr_matrix(data_matrix) 206 | 207 | cosine_similarity = Cosine_Similarity_Cython(data_matrix, topK=TopK, normalize = True, 208 | shrink=shrink, mode='pearson') 209 | W_dense_Cython = cosine_similarity.compute_similarity() 210 | 211 | cosine_similarity = Cosine_Similarity_Python(data_matrix, topK=TopK, normalize = True, 212 | shrink=shrink, mode='pearson') 213 | W_dense_Python = cosine_similarity.compute_similarity() 214 | 215 | 216 | data_matrix = data_matrix.toarray().astype(np.float64) 217 | for col in range(data_matrix.shape[1]): 218 | 219 | nonzeroMask = data_matrix[:,col]>0 220 | data_matrix[:,col][nonzeroMask] -= np.mean(data_matrix[:,col][nonzeroMask]) 221 | 222 | 223 | W_dense_denominator = np.matlib.repmat((data_matrix**2).sum(axis=0), data_matrix.shape[1], 1) 224 | W_dense_denominator = np.sqrt(W_dense_denominator) 225 | W_dense_denominator = np.multiply(W_dense_denominator, W_dense_denominator.T) + shrink 226 | 227 | W_dense_mul = data_matrix.T.dot(data_matrix) 228 | W_dense_mul[W_dense_denominator>0] /= W_dense_denominator[W_dense_denominator>0] 229 | 230 | W_dense_mul[np.arange(W_dense_mul.shape[0]),np.arange(W_dense_mul.shape[0])] = 0.0 231 | 232 | assert np.allclose(W_dense_Cython, W_dense_mul, atol=1e-4), "W_dense_Cython not matching control" 233 | assert np.allclose(W_dense_Python, W_dense_mul, atol=1e-4), "W_dense_Python not matching control" 234 | 235 | 236 | 237 | def test_cosine_similarity_dense_jaccard(self): 238 | 239 | from Base.Cython.cosine_similarity import Cosine_Similarity as Cosine_Similarity_Cython 240 | from Base.cosine_similarity import Cosine_Similarity as Cosine_Similarity_Python 241 | 242 | import numpy.matlib 243 | 244 | TopK = 0 245 | shrink = 0 246 | 247 | data_matrix = np.array([[1,2,0,1],[0,1,4,1],[1,3,1,0]]) 248 | data_matrix = 
sps.csr_matrix(data_matrix) 249 | 250 | cosine_similarity = Cosine_Similarity_Cython(data_matrix, topK=TopK, normalize = True, 251 | shrink=shrink, mode='jaccard') 252 | W_dense_Cython = cosine_similarity.compute_similarity() 253 | 254 | cosine_similarity = Cosine_Similarity_Python(data_matrix, topK=TopK, normalize = True, 255 | shrink=shrink, mode='jaccard') 256 | W_dense_Python = cosine_similarity.compute_similarity() 257 | 258 | 259 | data_matrix.data = np.ones_like(data_matrix.data) 260 | data_matrix = data_matrix.toarray().astype(np.float64) 261 | 262 | W_dense_mul = data_matrix.T.dot(data_matrix) 263 | 264 | 265 | W_dense_denominator = np.matlib.repmat((data_matrix**2).sum(axis=0), data_matrix.shape[1], 1) 266 | W_dense_denominator = W_dense_denominator + W_dense_denominator.T - W_dense_mul + shrink 267 | 268 | W_dense_mul[W_dense_denominator>0] /= W_dense_denominator[W_dense_denominator>0] 269 | 270 | W_dense_mul[np.arange(W_dense_mul.shape[0]),np.arange(W_dense_mul.shape[0])] = 0.0 271 | 272 | assert np.allclose(W_dense_Cython, W_dense_mul, atol=1e-4), "W_dense_Cython not matching control" 273 | assert np.allclose(W_dense_Python, W_dense_mul, atol=1e-4), "W_dense_Python not matching control" 274 | 275 | 276 | 277 | def test_cosine_similarity_dense_big(self): 278 | 279 | from Base.Cython.cosine_similarity import Cosine_Similarity as Cosine_Similarity_Cython 280 | from Base.cosine_similarity import Cosine_Similarity as Cosine_Similarity_Python 281 | 282 | TopK = 0 283 | n_items = 500 284 | n_users = 1000 285 | 286 | data_matrix = sps.random(n_users, n_items, density=0.1) 287 | 288 | cosine_similarity = Cosine_Similarity_Cython(data_matrix, topK=TopK, normalize = False) 289 | W_dense_Cython = cosine_similarity.compute_similarity() 290 | 291 | cosine_similarity = Cosine_Similarity_Python(data_matrix, topK=TopK, normalize = False) 292 | W_dense_Python = cosine_similarity.compute_similarity() 293 | 294 | 295 | W_dense_mul = data_matrix.T.dot(data_matrix).toarray() 296 | W_dense_mul[np.arange(W_dense_mul.shape[0]),np.arange(W_dense_mul.shape[0])] = 0.0 297 | 298 | assert np.allclose(W_dense_Cython, W_dense_mul, atol=1e-4), "W_dense_Cython not matching control" 299 | assert np.allclose(W_dense_Python, W_dense_mul, atol=1e-4), "W_dense_Python not matching control" 300 | 301 | 302 | def test_cosine_similarity_TopK(self): 303 | 304 | from Base.Cython.cosine_similarity import Cosine_Similarity as Cosine_Similarity_Cython 305 | from Base.cosine_similarity import Cosine_Similarity as Cosine_Similarity_Python 306 | 307 | TopK=4 308 | 309 | data_matrix = np.array([[1,1,0,1],[0,1,1,1],[1,0,1,0]]) 310 | data_matrix = sps.csr_matrix(data_matrix) 311 | 312 | cosine_similarity = Cosine_Similarity_Cython(data_matrix, topK=TopK, normalize = False) 313 | W_dense_Cython = cosine_similarity.compute_similarity().toarray() 314 | 315 | cosine_similarity = Cosine_Similarity_Python(data_matrix, topK=TopK, normalize = False) 316 | W_dense_Python = cosine_similarity.compute_similarity().toarray() 317 | 318 | 319 | W_dense_mul = data_matrix.T.dot(data_matrix) 320 | W_dense_mul[np.arange(W_dense_mul.shape[0]),np.arange(W_dense_mul.shape[0])] = 0.0 321 | 322 | W_dense_mul = similarityMatrixTopK(W_dense_mul, k=TopK).toarray() 323 | 324 | assert np.allclose(W_dense_Cython, W_dense_mul, atol=1e-4), "W_sparse_Cython not matching control" 325 | assert np.allclose(W_dense_Python, W_dense_mul, atol=1e-4), "W_dense_Python not matching control" 326 | 327 | 328 | 329 | def test_cosine_similarity_TopK_big(self): 330 | 331 | 
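# End-to-end check of the TopK path, here with TopK equal to the number of items.
# The column-wise selection inside compute_similarity() relies on np.argpartition;
# a minimal sketch of the idea, with illustrative names not taken from the repository:
#
#   top_k_part = (-scores).argpartition(k - 1)[:k]              # unordered top-k candidates
#   top_k_idx  = top_k_part[np.argsort(-scores[top_k_part])]    # order only those k
#
# argpartition only partially orders the array, which is why it is preferred here
# over a full np.argsort when just the k largest weights per column are needed.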
from Base.Cython.cosine_similarity import Cosine_Similarity as Cosine_Similarity_Cython 332 | from Base.cosine_similarity import Cosine_Similarity as Cosine_Similarity_Python 333 | 334 | 335 | n_items = 500 336 | n_users = 1000 337 | TopK = n_items 338 | 339 | 340 | data_matrix = sps.random(n_users, n_items, density=0.1) 341 | 342 | cosine_similarity = Cosine_Similarity_Cython(data_matrix, topK=TopK, normalize = False) 343 | W_dense_Cython = cosine_similarity.compute_similarity().toarray() 344 | 345 | cosine_similarity = Cosine_Similarity_Python(data_matrix, topK=TopK, normalize = False) 346 | W_dense_Python = cosine_similarity.compute_similarity().toarray() 347 | 348 | W_dense_mul = data_matrix.T.dot(data_matrix) 349 | W_dense_mul[np.arange(W_dense_mul.shape[0]),np.arange(W_dense_mul.shape[0])] = 0.0 350 | 351 | W_dense_mul = similarityMatrixTopK(W_dense_mul, k=TopK).toarray() 352 | 353 | assert np.allclose(W_dense_Cython, W_dense_mul, atol=1e-4), "W_sparse_Cython not matching control" 354 | assert np.allclose(W_dense_Python, W_dense_mul, atol=1e-4), "W_dense_Python not matching control" 355 | 356 | 357 | 358 | 359 | def runCompilationScript(): 360 | 361 | # Run compile script setting the working directory to ensure the compiled file are contained in the 362 | # appropriate subfolder and not the project root 363 | 364 | compiledModuleSubfolder = "/Cython" 365 | fileToCompile = 'cosine_similarity.pyx' 366 | 367 | command = ['python', 368 | 'compileCython.py', 369 | fileToCompile, 370 | 'build_ext', 371 | '--inplace' 372 | ] 373 | 374 | output = subprocess.check_output(' '.join(command), shell=True, cwd=os.getcwd() + compiledModuleSubfolder) 375 | 376 | 377 | try: 378 | 379 | command = ['cython', 380 | fileToCompile, 381 | '-a' 382 | ] 383 | 384 | output = subprocess.check_output(' '.join(command), shell=True, cwd=os.getcwd() + compiledModuleSubfolder) 385 | 386 | except: 387 | pass 388 | 389 | print("Compiled module saved in subfolder: {}".format(compiledModuleSubfolder)) 390 | 391 | # Command to run compilation script 392 | #python compileCython.py cosine_similarity.pyx build_ext --inplace 393 | 394 | # Command to generate html report 395 | #subprocess.call(["cython", "-a", "cosine_similarity.pyx"]) 396 | 397 | if __name__ == '__main__': 398 | 399 | from data.Movielens10MReader import Movielens10MReader 400 | 401 | runCompilationScript() 402 | 403 | unittest.main() 404 | # 405 | # from data.NetflixEnhanced.NetflixEnhancedReader import NetflixEnhancedReader 406 | # 407 | # from Base.Cython.cosine_similarity import Cosine_Similarity as Cosine_Similarity_Cython 408 | # from Base.Cython.cosine_similarity import cosine_common 409 | # 410 | # from Base.cosine_similarity import Cosine_Similarity as Cosine_Similarity_Python 411 | # 412 | # from Base.Recommender_utils import similarityMatrixTopK 413 | # 414 | # TopK = 100 415 | # 416 | # dataReader = Movielens10MReader() 417 | # #dataReader = NetflixEnhancedReader() 418 | # URM_train = dataReader.get_URM_train() 419 | # 420 | # start_time = time.time() 421 | # cosine_similarity = Cosine_Similarity_Cython(URM_train, TopK=TopK) 422 | # W_sparse_Cython = cosine_similarity.compute_similarity() 423 | # print("Cosine_Similarity_Cython {:.2f} sec, {:.2f} item/sec".format(time.time() - start_time, 424 | # URM_train.shape[1] / (time.time() - start_time))) 425 | # 426 | # start_time = time.time() 427 | # W_cosine_common = cosine_common(URM_train) 428 | # print("Cosine common {:.2f} sec, {:.2f} item/sec".format(time.time()-start_time, URM_train.shape[1] / 
(time.time() - start_time))) 429 | # 430 | # start_time = time.time() 431 | # cosine_similarity = Cosine_Similarity_Python(URM_train, TopK=TopK) 432 | # W_sparse_Python = cosine_similarity.compute_similarity() 433 | # print("Cosine_Similarity_Python {:.2f} sec, {:.2f} item/sec".format(time.time() - start_time, 434 | # URM_train.shape[1] / (time.time() - start_time))) 435 | # 436 | 437 | # start_time = time.time() 438 | # product = URM_train.T.dot(URM_train) 439 | # product[np.arange(product.shape[0]),np.arange(product.shape[0])] = 0.0 440 | # 441 | # W_sparse_Control = similarityMatrixTopK(product, k=TopK).toarray() 442 | # print("similarityMatrixTopK {:.2f} sec, {:.2f} item/sec".format(time.time() - start_time, 443 | # URM_train.shape[1] / (time.time() - start_time))) 444 | 445 | 446 | 447 | -------------------------------------------------------------------------------- /Base/metrics.py: -------------------------------------------------------------------------------- 1 | 2 | """ 3 | 4 | @author: Massimo Quadrana 5 | """ 6 | 7 | import numpy as np 8 | import unittest 9 | 10 | 11 | 12 | def roc_auc(is_relevant): 13 | #is_relevant = np.in1d(ranked_list, pos_items, assume_unique=True) 14 | ranks = np.arange(len(is_relevant)) 15 | pos_ranks = ranks[is_relevant] 16 | neg_ranks = ranks[~is_relevant] 17 | auc_score = 0.0 18 | if len(neg_ranks) == 0: 19 | return 1.0 20 | if len(pos_ranks) > 0: 21 | for pos_pred in pos_ranks: 22 | auc_score += np.sum(pos_pred < neg_ranks, dtype=np.float32) 23 | auc_score /= (pos_ranks.shape[0] * neg_ranks.shape[0]) 24 | assert 0 <= auc_score <= 1, auc_score 25 | return auc_score 26 | 27 | 28 | def precision(is_relevant): 29 | #ranked_list = ranked_list[:at] 30 | #is_relevant = np.in1d(is_relevant, pos_items, assume_unique=True) 31 | precision_score = np.sum(is_relevant, dtype=np.float32) / len(is_relevant) 32 | assert 0 <= precision_score <= 1, precision_score 33 | return precision_score 34 | 35 | 36 | def recall(is_relevant, pos_items): 37 | #ranked_list = ranked_list[:at] 38 | #is_relevant = np.in1d(ranked_list, pos_items, assume_unique=True) 39 | recall_score = np.sum(is_relevant, dtype=np.float32) / pos_items.shape[0] 40 | assert 0 <= recall_score <= 1, recall_score 41 | return recall_score 42 | 43 | 44 | def rr(is_relevant): 45 | # reciprocal rank of the FIRST relevant item in the ranked list (0 if none) 46 | #ranked_list = ranked_list[:at] 47 | #is_relevant = np.in1d(ranked_list, pos_items, assume_unique=True) 48 | ranks = np.arange(1, len(is_relevant) + 1)[is_relevant] 49 | if len(ranks) > 0: 50 | return 1. 
/ ranks[0] 51 | else: 52 | return 0.0 53 | 54 | 55 | def map(is_relevant, pos_items): 56 | #ranked_list = ranked_list[:at] 57 | #is_relevant = np.in1d(ranked_list, pos_items, assume_unique=True) 58 | p_at_k = is_relevant * np.cumsum(is_relevant, dtype=np.float32) / (1 + np.arange(is_relevant.shape[0])) 59 | map_score = np.sum(p_at_k) / np.min([pos_items.shape[0], is_relevant.shape[0]]) 60 | assert 0 <= map_score <= 1, map_score 61 | return map_score 62 | 63 | 64 | def ndcg(ranked_list, pos_items, relevance=None, at=None): 65 | if relevance is None: 66 | relevance = np.ones_like(pos_items) 67 | assert len(relevance) == pos_items.shape[0] 68 | it2rel = {it: r for it, r in zip(pos_items, relevance)} 69 | rank_scores = np.asarray([it2rel.get(it, 0.0) for it in ranked_list[:at]], dtype=np.float32) 70 | ideal_dcg = dcg(np.sort(relevance)[::-1]) 71 | rank_dcg = dcg(rank_scores) 72 | ndcg_ = rank_dcg / ideal_dcg 73 | # assert 0 <= ndcg_ <= 1, (rank_dcg, ideal_dcg, ndcg_) 74 | return ndcg_ 75 | 76 | 77 | def dcg(scores): 78 | return np.sum(np.divide(np.power(2, scores) - 1, np.log(np.arange(scores.shape[0], dtype=np.float32) + 2)), 79 | dtype=np.float32) 80 | 81 | 82 | metrics = ['AUC', 'Precision' 'Recall', 'MAP', 'NDCG'] 83 | 84 | 85 | def pp_metrics(metric_names, metric_values, metric_at): 86 | """ 87 | Pretty-prints metric values 88 | :param metrics_arr: 89 | :return: 90 | """ 91 | assert len(metric_names) == len(metric_values) 92 | if isinstance(metric_at, int): 93 | metric_at = [metric_at] * len(metric_values) 94 | return ' '.join(['{}: {:.4f}'.format(mname, mvalue) if mcutoff is None or mcutoff == 0 else 95 | '{}@{}: {:.4f}'.format(mname, mcutoff, mvalue) 96 | for mname, mcutoff, mvalue in zip(metric_names, metric_at, metric_values)]) 97 | 98 | 99 | class TestAUC(unittest.TestCase): 100 | def runTest(self): 101 | pos_items = np.asarray([2, 4]) 102 | ranked_list = np.asarray([1, 2, 3, 4, 5]) 103 | self.assertTrue(np.allclose(roc_auc(ranked_list, pos_items), 104 | (2. / 3 + 1. / 3) / 2)) 105 | 106 | 107 | class TestRecall(unittest.TestCase): 108 | def runTest(self): 109 | pos_items = np.asarray([2, 4, 5, 10]) 110 | ranked_list_1 = np.asarray([1, 2, 3, 4, 5]) 111 | ranked_list_2 = np.asarray([10, 5, 2, 4, 3]) 112 | ranked_list_3 = np.asarray([1, 3, 6, 7, 8]) 113 | self.assertTrue(np.allclose(recall(ranked_list_1, pos_items), 3. / 4)) 114 | self.assertTrue(np.allclose(recall(ranked_list_2, pos_items), 1.0)) 115 | self.assertTrue(np.allclose(recall(ranked_list_3, pos_items), 0.0)) 116 | 117 | thresholds = [1, 2, 3, 4, 5] 118 | values = [0.0, 1. / 4, 1. / 4, 2. / 4, 3. / 4] 119 | for at, val in zip(thresholds, values): 120 | self.assertTrue(np.allclose(np.asarray(recall(ranked_list_1, pos_items, at=at)), val)) 121 | 122 | 123 | class TestPrecision(unittest.TestCase): 124 | def runTest(self): 125 | pos_items = np.asarray([2, 4, 5, 10]) 126 | ranked_list_1 = np.asarray([1, 2, 3, 4, 5]) 127 | ranked_list_2 = np.asarray([10, 5, 2, 4, 3]) 128 | ranked_list_3 = np.asarray([1, 3, 6, 7, 8]) 129 | self.assertTrue(np.allclose(precision(ranked_list_1, pos_items), 3. / 5)) 130 | self.assertTrue(np.allclose(precision(ranked_list_2, pos_items), 4. / 5)) 131 | self.assertTrue(np.allclose(precision(ranked_list_3, pos_items), 0.0)) 132 | 133 | thresholds = [1, 2, 3, 4, 5] 134 | values = [0.0, 1. / 2, 1. / 3, 2. / 4, 3. 
/ 5] 135 | for at, val in zip(thresholds, values): 136 | self.assertTrue(np.allclose(np.asarray(precision(ranked_list_1, pos_items, at=at)), val)) 137 | 138 | 139 | class TestRR(unittest.TestCase): 140 | def runTest(self): 141 | pos_items = np.asarray([2, 4, 5, 10]) 142 | ranked_list_1 = np.asarray([1, 2, 3, 4, 5]) 143 | ranked_list_2 = np.asarray([10, 5, 2, 4, 3]) 144 | ranked_list_3 = np.asarray([1, 3, 6, 7, 8]) 145 | self.assertTrue(np.allclose(rr(ranked_list_1, pos_items), 1. / 2)) 146 | self.assertTrue(np.allclose(rr(ranked_list_2, pos_items), 1.)) 147 | self.assertTrue(np.allclose(rr(ranked_list_3, pos_items), 0.0)) 148 | 149 | thresholds = [1, 2, 3, 4, 5] 150 | values = [0.0, 1. / 2, 1. / 2, 1. / 2, 1. / 2] 151 | for at, val in zip(thresholds, values): 152 | self.assertTrue(np.allclose(np.asarray(rr(ranked_list_1, pos_items, at=at)), val)) 153 | 154 | 155 | class TestMAP(unittest.TestCase): 156 | def runTest(self): 157 | pos_items = np.asarray([2, 4, 5, 10]) 158 | ranked_list_1 = np.asarray([1, 2, 3, 4, 5]) 159 | ranked_list_2 = np.asarray([10, 5, 2, 4, 3]) 160 | ranked_list_3 = np.asarray([1, 3, 6, 7, 8]) 161 | ranked_list_4 = np.asarray([11, 12, 13, 14, 15, 16, 2, 4, 5, 10]) 162 | ranked_list_5 = np.asarray([2, 11, 12, 13, 14, 15, 4, 5, 10, 16]) 163 | self.assertTrue(np.allclose(map(ranked_list_1, pos_items), (1. / 2 + 2. / 4 + 3. / 5) / 4)) 164 | self.assertTrue(np.allclose(map(ranked_list_2, pos_items), 1.0)) 165 | self.assertTrue(np.allclose(map(ranked_list_3, pos_items), 0.0)) 166 | self.assertTrue(np.allclose(map(ranked_list_4, pos_items), (1. / 7 + 2. / 8 + 3. / 9 + 4. / 10) / 4)) 167 | self.assertTrue(np.allclose(map(ranked_list_5, pos_items), (1. + 2. / 7 + 3. / 8 + 4. / 9) / 4)) 168 | 169 | thresholds = [1, 2, 3, 4, 5] 170 | values = [ 171 | 0.0, 172 | 1. / 2 / 2, 173 | 1. / 2 / 3, 174 | (1. / 2 + 2. / 4) / 4, 175 | (1. / 2 + 2. / 4 + 3. 
/ 5) / 4 176 | ] 177 | for at, val in zip(thresholds, values): 178 | self.assertTrue(np.allclose(np.asarray(map(ranked_list_1, pos_items, at)), val)) 179 | 180 | 181 | class TestNDCG(unittest.TestCase): 182 | def runTest(self): 183 | pos_items = np.asarray([2, 4, 5, 10]) 184 | pos_relevances = np.asarray([5, 4, 3, 2]) 185 | ranked_list_1 = np.asarray([1, 2, 3, 4, 5]) # rel = 0, 5, 0, 4, 3 186 | ranked_list_2 = np.asarray([10, 5, 2, 4, 3]) # rel = 2, 3, 5, 4, 0 187 | ranked_list_3 = np.asarray([1, 3, 6, 7, 8]) # rel = 0, 0, 0, 0, 0 188 | idcg = ((2 ** 5 - 1) / np.log(2) + 189 | (2 ** 4 - 1) / np.log(3) + 190 | (2 ** 3 - 1) / np.log(4) + 191 | (2 ** 2 - 1) / np.log(5)) 192 | self.assertTrue(np.allclose(dcg(np.sort(pos_relevances)[::-1]), idcg)) 193 | self.assertTrue(np.allclose(ndcg(ranked_list_1, pos_items, pos_relevances), 194 | ((2 ** 5 - 1) / np.log(3) + 195 | (2 ** 4 - 1) / np.log(5) + 196 | (2 ** 3 - 1) / np.log(6)) / idcg)) 197 | self.assertTrue(np.allclose(ndcg(ranked_list_2, pos_items, pos_relevances), 198 | ((2 ** 2 - 1) / np.log(2) + 199 | (2 ** 3 - 1) / np.log(3) + 200 | (2 ** 5 - 1) / np.log(4) + 201 | (2 ** 4 - 1) / np.log(5)) / idcg)) 202 | self.assertTrue(np.allclose(ndcg(ranked_list_3, pos_items, pos_relevances), 0.0)) 203 | 204 | 205 | if __name__ == '__main__': 206 | unittest.main() 207 | -------------------------------------------------------------------------------- /KNN/item_knn_CBF.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on 23/10/17 5 | 6 | @author: Maurizio Ferrari Dacrema 7 | """ 8 | 9 | from Base.Recommender import Recommender 10 | from Base.Recommender_utils import check_matrix 11 | from Base.Similarity_Matrix_Recommender import Similarity_Matrix_Recommender 12 | 13 | try: 14 | from Base.Cython.cosine_similarity import Cosine_Similarity 15 | except ImportError: 16 | print("Unable to load Cython Cosine_Similarity, reverting to Python") 17 | from Base.cosine_similarity import Cosine_Similarity 18 | 19 | 20 | class ItemKNNCBFRecommender(Recommender, Similarity_Matrix_Recommender): 21 | """ ItemKNN recommender""" 22 | 23 | def __init__(self, ICM, URM_train, sparse_weights=True): 24 | super(ItemKNNCBFRecommender, self).__init__() 25 | 26 | self.ICM = ICM 27 | 28 | # CSR is faster during evaluation 29 | self.URM_train = check_matrix(URM_train, 'csr') 30 | 31 | self.sparse_weights = sparse_weights 32 | 33 | 34 | def fit(self, k=50, shrink=100, similarity='cosine', normalize=True): 35 | 36 | self.k = k 37 | self.shrink = shrink 38 | 39 | self.similarity = Cosine_Similarity(self.ICM.T, shrink=shrink, topK=k, normalize=normalize, mode = similarity) 40 | 41 | 42 | if self.sparse_weights: 43 | self.W_sparse = self.similarity.compute_similarity() 44 | else: 45 | self.W = self.similarity.compute_similarity() 46 | self.W = self.W.toarray() 47 | 48 | -------------------------------------------------------------------------------- /KNN/item_knn_CF.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on 23/10/17 5 | 6 | @author: Maurizio Ferrari Dacrema 7 | """ 8 | 9 | from Base.Recommender import Recommender 10 | from Base.Recommender_utils import check_matrix 11 | from Base.Similarity_Matrix_Recommender import Similarity_Matrix_Recommender 12 | 13 | try: 14 | from Base.Cython.cosine_similarity import Cosine_Similarity 15 | except ImportError: 16 | 
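# The compiled Cython extension is optional: when it is missing (compileCython.py
# was never run on this machine) the pure-Python Cosine_Similarity is imported
# instead. Both expose the same constructor and compute_similarity() interface,
# so fit() below works unchanged with either one; only the speed differs.
# Typical use, as in fit() further down (sketch):
#   similarity = Cosine_Similarity(URM_train, shrink=100, topK=50, normalize=True, mode='cosine')
#   W_sparse = similarity.compute_similarity()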
print("Unable to load Cython Cosine_Similarity, reverting to Python") 17 | from Base.cosine_similarity import Cosine_Similarity 18 | 19 | 20 | class ItemKNNCFRecommender(Recommender, Similarity_Matrix_Recommender): 21 | """ ItemKNN recommender""" 22 | 23 | def __init__(self, URM_train, sparse_weights=True): 24 | super(ItemKNNCFRecommender, self).__init__() 25 | 26 | # CSR is faster during evaluation 27 | self.URM_train = check_matrix(URM_train, 'csr') 28 | 29 | self.dataset = None 30 | 31 | self.sparse_weights = sparse_weights 32 | 33 | def fit(self, k=50, shrink=100, similarity='cosine', normalize=True): 34 | 35 | self.k = k 36 | self.shrink = shrink 37 | 38 | self.similarity = Cosine_Similarity(self.URM_train, shrink=shrink, topK=k, normalize=normalize, mode = similarity) 39 | 40 | if self.sparse_weights: 41 | self.W_sparse = self.similarity.compute_similarity() 42 | else: 43 | self.W = self.similarity.compute_similarity() 44 | self.W = self.W.toarray() 45 | -------------------------------------------------------------------------------- /KNN/item_knn_custom_Similarity.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on 23/10/17 5 | 6 | @author: Maurizio Ferrari Dacrema 7 | """ 8 | 9 | 10 | import numpy as np 11 | import scipy.sparse as sps 12 | from Base.Recommender_utils import check_matrix 13 | from Base.Similarity_Matrix_Recommender import Similarity_Matrix_Recommender 14 | from Base.Recommender import Recommender 15 | 16 | 17 | class ItemKNNCustomSimilarityRecommender(Recommender, Similarity_Matrix_Recommender): 18 | """ ItemKNN recommender""" 19 | 20 | def __init__(self, k=50, shrinkage=100, normalize=False, sparse_weights=True): 21 | super(ItemKNNCustomSimilarityRecommender, self).__init__() 22 | self.k = k 23 | self.shrinkage = shrinkage 24 | self.normalize = normalize 25 | self.dataset = None 26 | self.similarity_name = None 27 | self.sparse_weights = sparse_weights 28 | 29 | 30 | def __str__(self): 31 | return "ItemKNNCBF(similarity={},k={},shrinkage={},normalize={},sparse_weights={})".format( 32 | self.similarity_name, self.k, self.shrinkage, self.normalize, self.sparse_weights) 33 | 34 | def fit(self, item_weights, URM_train, selectTopK = False): 35 | 36 | self.URM_train = check_matrix(URM_train, format='csc') 37 | 38 | # If no topK selection is required, just save the similarity 39 | if (not selectTopK): 40 | if isinstance(item_weights, np.ndarray): 41 | #self.W = item_weights 42 | #self.sparse_weights = False 43 | self.W_sparse = sps.csr_matrix(item_weights) 44 | self.sparse_weights = True 45 | else: 46 | self.W_sparse = check_matrix(item_weights, format='csr') 47 | self.sparse_weights = True 48 | 49 | return 50 | 51 | 52 | # If matrix is not dense, make it dense to select top K 53 | if not isinstance(item_weights, np.ndarray): 54 | item_weights = item_weights.toarray() 55 | 56 | 57 | idx_sorted = np.argsort(item_weights, axis=0) # sort by column 58 | 59 | # for each column, keep only the top-k scored items 60 | 61 | if not self.sparse_weights: 62 | self.W = item_weights.copy() 63 | # index of the items that don't belong to the top-k similar items of each column 64 | not_top_k = idx_sorted[:-self.k, :] 65 | # use numpy fancy indexing to zero-out the values in sim without using a for loop 66 | self.W[not_top_k, np.arange(item_weights.shape[1])] = 0.0 67 | else: 68 | # iterate over each column and keep only the top-k similar items 69 | values, rows, cols = [], [], [] 
70 | nitems = self.URM_train.shape[1] 71 | for i in range(nitems): 72 | 73 | top_k_idx = idx_sorted[-self.k:, i] 74 | 75 | values.extend(item_weights[top_k_idx, i]) 76 | rows.extend(np.arange(nitems)[top_k_idx]) 77 | cols.extend(np.ones(self.k) * i) 78 | 79 | # During testing CSR is faster 80 | self.W_sparse = sps.csr_matrix((values, (rows, cols)), shape=(nitems, nitems), dtype=np.float32) 81 | 82 | #self.scoresAll = URM_train.dot(self.W_sparse) 83 | -------------------------------------------------------------------------------- /KNN/user_knn_CF.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on 23/10/17 5 | 6 | @author: Maurizio Ferrari Dacrema 7 | """ 8 | 9 | import numpy as np 10 | 11 | from Base.Recommender import Recommender 12 | from Base.Recommender_utils import check_matrix 13 | from Base.Similarity_Matrix_Recommender import Similarity_Matrix_Recommender 14 | 15 | try: 16 | from Base.Cython.cosine_similarity import Cosine_Similarity 17 | except ImportError: 18 | print("Unable to load Cython Cosine_Similarity, reverting to Python") 19 | from Base.cosine_similarity import Cosine_Similarity 20 | 21 | 22 | class UserKNNCFRecommender(Recommender, Similarity_Matrix_Recommender): 23 | """ UserKNN recommender""" 24 | 25 | def __init__(self, URM_train, sparse_weights=True): 26 | super(UserKNNCFRecommender, self).__init__() 27 | 28 | # Not sure if CSR here is faster 29 | self.URM_train = check_matrix(URM_train, 'csr') 30 | 31 | self.dataset = None 32 | 33 | self.sparse_weights = sparse_weights 34 | 35 | def fit(self, k=50, shrink=100, similarity='cosine', normalize=True): 36 | 37 | self.k = k 38 | self.shrink = shrink 39 | 40 | self.similarity = Cosine_Similarity(self.URM_train.T, shrink=shrink, topK=k, normalize=normalize, mode = similarity) 41 | 42 | if self.sparse_weights: 43 | self.W_sparse = self.similarity.compute_similarity() 44 | else: 45 | self.W = self.similarity.compute_similarity() 46 | self.W = self.W.toarray() 47 | 48 | 49 | 50 | 51 | 52 | def recommend(self, user_id, n=None, exclude_seen=True, filterTopPop = False, filterCustomItems = False): 53 | 54 | if n==None: 55 | n=self.URM_train.shape[1]-1 56 | 57 | # compute the scores using the dot product 58 | if self.sparse_weights: 59 | 60 | scores = self.W_sparse[user_id].dot(self.URM_train).toarray().ravel() 61 | 62 | else: 63 | # Numpy dot does not recognize sparse matrices, so we must 64 | # invoke the dot function on the sparse one 65 | scores = self.URM_train.T.dot(self.W[user_id]) 66 | 67 | if self.normalize: 68 | # normalization will keep the scores in the same range 69 | # of value of the ratings in dataset 70 | user_profile = self.URM_train[user_id] 71 | 72 | rated = user_profile.copy() 73 | rated.data = np.ones_like(rated.data) 74 | if self.sparse_weights: 75 | den = rated.dot(self.W_sparse).toarray().ravel() 76 | else: 77 | den = rated.dot(self.W).ravel() 78 | den[np.abs(den) < 1e-6] = 1.0 # to avoid NaNs 79 | scores /= den 80 | 81 | if exclude_seen: 82 | scores = self._filter_seen_on_scores(user_id, scores) 83 | 84 | if filterTopPop: 85 | scores = self._filter_TopPop_on_scores(scores) 86 | 87 | if filterCustomItems: 88 | scores = self._filterCustomItems_on_scores(scores) 89 | 90 | 91 | # rank items and mirror column to obtain a ranking in descending score 92 | #ranking = scores.argsort() 93 | #ranking = np.flip(ranking, axis=0) 94 | 95 | # Sorting is done in three steps. 
Faster then plain np.argsort for higher number of items 96 | # - Partition the data to extract the set of relevant items 97 | # - Sort only the relevant items 98 | # - Get the original item index 99 | relevant_items_partition = (-scores).argpartition(n)[0:n] 100 | relevant_items_partition_sorting = np.argsort(-scores[relevant_items_partition]) 101 | ranking = relevant_items_partition[relevant_items_partition_sorting] 102 | 103 | 104 | return ranking 105 | 106 | 107 | 108 | 109 | def recommendBatch(self, users_in_batch, n=None, exclude_seen=True, filterTopPop = False, filterCustomItems = False): 110 | 111 | # compute the scores using the dot product 112 | 113 | if self.sparse_weights: 114 | 115 | scores_array = self.W_sparse[users_in_batch].dot(self.URM_train) 116 | scores_array = scores_array.toarray() 117 | 118 | else: 119 | # Numpy dot does not recognize sparse matrices, so we must 120 | # invoke the dot function on the sparse one 121 | scores_array = self.URM_train.T.dot(self.W[users_in_batch].T) 122 | 123 | if self.normalize: 124 | raise ValueError("Not implemented") 125 | 126 | # To exclude seen items perform a boolean indexing and replace their score with -inf 127 | # Seen items will be at the bottom of the list but there is no guarantee they'll NOT be 128 | # recommended 129 | if exclude_seen: 130 | user_profile_batch = self.URM_train[users_in_batch] 131 | scores_array[user_profile_batch.nonzero()] = -np.inf 132 | 133 | if filterTopPop: 134 | scores_array[:,self.filterTopPop_ItemsID] = -np.inf 135 | 136 | if filterCustomItems: 137 | scores_array[:, self.filterCustomItems_ItemsID] = -np.inf 138 | 139 | 140 | # rank items and mirror column to obtain a ranking in descending score 141 | #ranking = (-scores_array).argsort(axis=1) 142 | #ranking = np.fliplr(ranking) 143 | #ranking = ranking[:,0:n] 144 | 145 | ranking = np.zeros((scores_array.shape[0],n), dtype=np.int) 146 | 147 | for row_index in range(scores_array.shape[0]): 148 | scores = scores_array[row_index] 149 | 150 | relevant_items_partition = (-scores).argpartition(n)[0:n] 151 | relevant_items_partition_sorting = np.argsort(-scores[relevant_items_partition]) 152 | ranking[row_index] = relevant_items_partition[relevant_items_partition_sorting] 153 | 154 | 155 | return ranking 156 | 157 | -------------------------------------------------------------------------------- /MatrixFactorization/Cython/MF_BPR_Cython.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on 07/09/17 5 | 6 | @author: Maurizio Ferrari Dacrema 7 | """ 8 | 9 | from Base.Recommender_utils import similarityMatrixTopK 10 | from Base.Recommender import Recommender 11 | import subprocess 12 | import os, sys 13 | import time 14 | import numpy as np 15 | 16 | 17 | class MF_BPR_Cython(Recommender): 18 | 19 | 20 | def __init__(self, URM_train, recompile_cython = False): 21 | 22 | 23 | super(MF_BPR_Cython, self).__init__() 24 | 25 | 26 | self.URM_train = URM_train 27 | self.n_users = URM_train.shape[0] 28 | self.n_items = URM_train.shape[1] 29 | self.normalize = False 30 | 31 | if recompile_cython: 32 | print("Compiling in Cython") 33 | self.runCompilationScript() 34 | print("Compilation Complete") 35 | 36 | 37 | 38 | def fit(self, epochs=30, logFile=None, URM_test=None, filterTopPop = False, filterCustomItems = np.array([], dtype=np.int), minRatingsPerUser=1, 39 | batch_size = 1000, validate_every_N_epochs = 1, start_validation_after_N_epochs = 0, num_factors=10, 
positive_threshold=4, 40 | learning_rate = 0.05, sgd_mode='sgd', user_reg = 0.0, positive_reg = 0.0, negative_reg = 0.0): 41 | 42 | 43 | self.num_factors = num_factors 44 | self.positive_threshold = positive_threshold 45 | 46 | # Select only positive interactions 47 | URM_train_positive = self.URM_train.copy() 48 | 49 | URM_train_positive.data = URM_train_positive.data >= self.positive_threshold 50 | URM_train_positive.eliminate_zeros() 51 | 52 | self.sgd_mode = sgd_mode 53 | 54 | 55 | # Import compiled module 56 | from MatrixFactorization.Cython.MF_BPR_Cython_Epoch import MF_BPR_Cython_Epoch 57 | 58 | 59 | self.cythonEpoch = MF_BPR_Cython_Epoch(URM_train_positive, 60 | n_factors = self.num_factors, 61 | learning_rate=learning_rate, 62 | batch_size=1, 63 | sgd_mode = sgd_mode, 64 | user_reg=user_reg, 65 | positive_reg=positive_reg, 66 | negative_reg=negative_reg) 67 | 68 | 69 | self.batch_size = batch_size 70 | self.learning_rate = learning_rate 71 | 72 | 73 | start_time_train = time.time() 74 | 75 | for currentEpoch in range(epochs): 76 | 77 | start_time_epoch = time.time() 78 | 79 | if self.batch_size>0: 80 | self.epochIteration() 81 | else: 82 | print("No batch not available") 83 | 84 | 85 | if (URM_test is not None) and (currentEpoch % validate_every_N_epochs == 0) and \ 86 | currentEpoch >= start_validation_after_N_epochs: 87 | 88 | print("Evaluation begins") 89 | 90 | self.W = self.cythonEpoch.get_W() 91 | self.H = self.cythonEpoch.get_H() 92 | 93 | results_run = self.evaluateRecommendations(URM_test, filterTopPop=filterTopPop, 94 | minRatingsPerUser=minRatingsPerUser, filterCustomItems=filterCustomItems) 95 | 96 | self.writeCurrentConfig(currentEpoch, results_run, logFile) 97 | 98 | print("Epoch {} of {} complete in {:.2f} minutes".format(currentEpoch+1, epochs, 99 | float(time.time() - start_time_epoch) / 60)) 100 | 101 | 102 | # Fit with no validation 103 | else: 104 | print("Epoch {} of {} complete in {:.2f} minutes".format(currentEpoch+1, epochs, 105 | float(time.time() - start_time_epoch) / 60)) 106 | 107 | # Ensure W and H are up to date 108 | self.W = self.cythonEpoch.get_W() 109 | self.H = self.cythonEpoch.get_H() 110 | 111 | print("Fit completed in {:.2f} minutes".format(float(time.time() - start_time_train) / 60)) 112 | 113 | sys.stdout.flush() 114 | 115 | 116 | 117 | 118 | def runCompilationScript(self): 119 | 120 | # Run compile script setting the working directory to ensure the compiled file are contained in the 121 | # appropriate subfolder and not the project root 122 | 123 | compiledModuleSubfolder = "/MatrixFactorization/Cython" 124 | fileToCompile_list = ['MF_BPR_Cython_Epoch.pyx'] 125 | 126 | for fileToCompile in fileToCompile_list: 127 | 128 | command = ['python', 129 | 'compileCython.py', 130 | fileToCompile, 131 | 'build_ext', 132 | '--inplace' 133 | ] 134 | 135 | 136 | output = subprocess.check_output(' '.join(command), shell=True, cwd=os.getcwd() + compiledModuleSubfolder) 137 | 138 | try: 139 | 140 | command = ['cython', 141 | fileToCompile, 142 | '-a' 143 | ] 144 | 145 | output = subprocess.check_output(' '.join(command), shell=True, cwd=os.getcwd() + compiledModuleSubfolder) 146 | 147 | except: 148 | pass 149 | 150 | 151 | print("Compiled module saved in subfolder: {}".format(compiledModuleSubfolder)) 152 | 153 | # Command to run compilation script 154 | #python compileCython.py MF_BPR_Cython_Epoch.pyx build_ext --inplace 155 | 156 | # Command to generate html report 157 | #subprocess.call(["cython", "-a", "MF_BPR_Cython_Epoch.pyx"]) 158 | 159 | 160 | def 
epochIteration(self): 161 | 162 | self.cythonEpoch.epochIteration_Cython() 163 | 164 | 165 | 166 | 167 | def writeCurrentConfig(self, currentEpoch, results_run, logFile): 168 | 169 | current_config = {'learn_rate': self.learning_rate, 170 | 'num_factors': self.num_factors, 171 | 'batch_size': 1, 172 | 'epoch': currentEpoch} 173 | 174 | print("Test case: {}\nResults {}\n".format(current_config, results_run)) 175 | 176 | sys.stdout.flush() 177 | 178 | if (logFile != None): 179 | logFile.write("Test case: {}, Results {}\n".format(current_config, results_run)) 180 | logFile.flush() 181 | 182 | 183 | 184 | 185 | def recommendBatch(self, users_in_batch, n=None, exclude_seen=True, filterTopPop = False, filterCustomItems = False): 186 | 187 | # compute the scores using the dot product 188 | user_profile_batch = self.URM_train[users_in_batch] 189 | 190 | scores_array = np.dot(self.W[users_in_batch], self.H.T) 191 | 192 | if self.normalize: 193 | raise ValueError("Not implemented") 194 | 195 | # To exclude seen items perform a boolean indexing and replace their score with -inf 196 | # Seen items will be at the bottom of the list but there is no guarantee they'll NOT be 197 | # recommended 198 | if exclude_seen: 199 | scores_array[user_profile_batch.nonzero()] = -np.inf 200 | 201 | if filterTopPop: 202 | scores_array[:,self.filterTopPop_ItemsID] = -np.inf 203 | 204 | if filterCustomItems: 205 | scores_array[:, self.filterCustomItems_ItemsID] = -np.inf 206 | 207 | 208 | # rank items and mirror column to obtain a ranking in descending score 209 | #ranking = (-scores_array).argsort(axis=1) 210 | #ranking = np.fliplr(ranking) 211 | #ranking = ranking[:,0:n] 212 | 213 | ranking = np.zeros((scores_array.shape[0],n), dtype=np.int) 214 | 215 | for row_index in range(scores_array.shape[0]): 216 | scores = scores_array[row_index] 217 | 218 | relevant_items_partition = (-scores).argpartition(n)[0:n] 219 | relevant_items_partition_sorting = np.argsort(-scores[relevant_items_partition]) 220 | ranking[row_index] = relevant_items_partition[relevant_items_partition_sorting] 221 | 222 | 223 | return ranking 224 | 225 | 226 | 227 | def recommend(self, user_id, n=None, exclude_seen=True, filterTopPop = False, filterCustomItems = False): 228 | 229 | 230 | if n==None: 231 | n=self.URM_train.shape[1]-1 232 | 233 | scores_array = np.dot(self.W[user_id], self.H.T) 234 | 235 | if self.normalize: 236 | raise ValueError("Not implemented") 237 | 238 | 239 | if exclude_seen: 240 | scores = self._filter_seen_on_scores(user_id, scores_array) 241 | 242 | if filterTopPop: 243 | scores = self._filter_TopPop_on_scores(scores_array) 244 | 245 | if filterCustomItems: 246 | scores = self._filterCustomItems_on_scores(scores_array) 247 | 248 | 249 | # rank items and mirror column to obtain a ranking in descending score 250 | #ranking = scores.argsort() 251 | #ranking = np.flip(ranking, axis=0) 252 | 253 | # Sorting is done in three steps. 
Faster then plain np.argsort for higher number of items 254 | # - Partition the data to extract the set of relevant items 255 | # - Sort only the relevant items 256 | # - Get the original item index 257 | relevant_items_partition = (-scores_array).argpartition(n)[0:n] 258 | relevant_items_partition_sorting = np.argsort(-scores_array[relevant_items_partition]) 259 | ranking = relevant_items_partition[relevant_items_partition_sorting] 260 | 261 | 262 | return ranking 263 | -------------------------------------------------------------------------------- /MatrixFactorization/Cython/MF_BPR_Cython_Epoch.cpython-36m-x86_64-linux-gnu.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MaurizioFD/RecSys_Course_2017/dfef3655df8ce32428458aafabfc94ed688231f6/MatrixFactorization/Cython/MF_BPR_Cython_Epoch.cpython-36m-x86_64-linux-gnu.so -------------------------------------------------------------------------------- /MatrixFactorization/Cython/MF_BPR_Cython_Epoch.pyx: -------------------------------------------------------------------------------- 1 | """ 2 | Created on 07/09/17 3 | 4 | @author: Maurizio Ferrari Dacrema 5 | """ 6 | 7 | #cython: boundscheck=False 8 | #cython: wraparound=False 9 | #cython: initializedcheck=False 10 | #cython: language_level=3 11 | #cython: nonecheck=False 12 | #cython: cdivision=True 13 | #cython: unpack_method_calls=True 14 | #cython: overflowcheck=False 15 | 16 | #defining NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION 17 | 18 | from Base.Recommender_utils import check_matrix 19 | import numpy as np 20 | cimport numpy as np 21 | import time 22 | import sys 23 | 24 | from libc.math cimport exp, sqrt 25 | from libc.stdlib cimport rand, RAND_MAX 26 | 27 | 28 | cdef struct BPR_sample: 29 | long user 30 | long pos_item 31 | long neg_item 32 | 33 | 34 | cdef class MF_BPR_Cython_Epoch: 35 | 36 | cdef int n_users, n_items, n_factors 37 | cdef int numPositiveIteractions 38 | 39 | cdef int useAdaGrad, rmsprop 40 | 41 | cdef float learning_rate, user_reg, positive_reg, negative_reg 42 | 43 | cdef int batch_size 44 | 45 | cdef int[:] URM_mask_indices, URM_mask_indptr 46 | 47 | cdef double[:,:] W, H 48 | 49 | 50 | def __init__(self, URM_mask, n_factors = 10, 51 | learning_rate = 0.01, user_reg = 0.0, positive_reg = 0.0, negative_reg = 0.0, 52 | batch_size = 1, sgd_mode='sgd'): 53 | 54 | super(MF_BPR_Cython_Epoch, self).__init__() 55 | 56 | 57 | URM_mask = check_matrix(URM_mask, 'csr') 58 | 59 | self.numPositiveIteractions = int(URM_mask.nnz * 1) 60 | self.n_users = URM_mask.shape[0] 61 | self.n_items = URM_mask.shape[1] 62 | self.n_factors = n_factors 63 | 64 | self.URM_mask_indices = URM_mask.indices 65 | self.URM_mask_indptr = URM_mask.indptr 66 | 67 | # W and H cannot be initialized as zero, otherwise the gradient will always be zero 68 | self.W = np.random.random((self.n_users, self.n_factors)) 69 | self.H = np.random.random((self.n_items, self.n_factors)) 70 | 71 | 72 | 73 | if sgd_mode=='adagrad': 74 | self.useAdaGrad = True 75 | elif sgd_mode=='rmsprop': 76 | self.rmsprop = True 77 | elif sgd_mode=='sgd': 78 | pass 79 | else: 80 | raise ValueError( 81 | "SGD_mode not valid. Acceptable values are: 'sgd', 'adagrad', 'rmsprop'. 
Provided value was '{}'".format( 82 | sgd_mode)) 83 | 84 | 85 | 86 | self.learning_rate = learning_rate 87 | self.user_reg = user_reg 88 | self.positive_reg = positive_reg 89 | self.negative_reg = negative_reg 90 | 91 | 92 | if batch_size!=1: 93 | print("MiniBatch not implemented, reverting to default value 1") 94 | self.batch_size = 1 95 | 96 | 97 | # Using memoryview instead of the sparse matrix itself allows for much faster access 98 | cdef int[:] getSeenItems(self, long index): 99 | return self.URM_mask_indices[self.URM_mask_indptr[index]:self.URM_mask_indptr[index + 1]] 100 | 101 | 102 | 103 | def epochIteration_Cython(self): 104 | 105 | # Get number of available interactions 106 | cdef long totalNumberOfBatch = int(self.numPositiveIteractions / self.batch_size) + 1 107 | 108 | 109 | cdef BPR_sample sample 110 | cdef long u, i, j 111 | cdef long index, numCurrentBatch 112 | cdef double x_uij, sigmoid_user, sigmoid_item 113 | 114 | cdef int numSeenItems 115 | 116 | # Variables for AdaGrad and RMSprop 117 | cdef double [:] sgd_cache_item_factors, sgd_cache_user_factors 118 | cdef double cacheUpdate 119 | cdef float gamma 120 | 121 | cdef double H_i, H_j, W_u 122 | 123 | 124 | if self.useAdaGrad: 125 | sgd_cache_item_factors = np.zeros((self.n_items), dtype=float) 126 | sgd_cache_user_factors = np.zeros((self.n_users), dtype=float) 127 | 128 | # elif self.rmsprop: 129 | # sgd_cache = np.zeros((self.n_items), dtype=float) 130 | # gamma = 0.001 131 | 132 | 133 | cdef long start_time_epoch = time.time() 134 | cdef long start_time_batch = time.time() 135 | 136 | for numCurrentBatch in range(totalNumberOfBatch): 137 | 138 | # Uniform user sampling with replacement 139 | sample = self.sampleBPR_Cython() 140 | 141 | u = sample.user 142 | i = sample.pos_item 143 | j = sample.neg_item 144 | 145 | x_uij = 0.0 146 | 147 | for index in range(self.n_factors): 148 | 149 | x_uij = self.W[u,index] * (self.H[i,index] - self.H[j,index]) 150 | 151 | # Use gradient of log(sigm(-x_uij)) 152 | sigmoid_item = 1 / (1 + exp(x_uij)) 153 | sigmoid_user = sigmoid_item 154 | 155 | 156 | 157 | 158 | if self.useAdaGrad: 159 | cacheUpdate = sigmoid_item ** 2 160 | 161 | sgd_cache_item_factors[i] += cacheUpdate 162 | sgd_cache_item_factors[j] += cacheUpdate 163 | sgd_cache_user_factors[u] += cacheUpdate 164 | 165 | sigmoid_item = sigmoid_item / (sqrt(sgd_cache_item_factors[i]) + 1e-8) 166 | sigmoid_user = sigmoid_user / (sqrt(sgd_cache_user_factors[u]) + 1e-8) 167 | 168 | # INCOMPATIBLE CODE 169 | # elif self.rmsprop: 170 | # cacheUpdate = sgd_cache[i] * gamma + (1 - gamma) * gradient ** 2 171 | # 172 | # sgd_cache[i] = cacheUpdate 173 | # sgd_cache[j] = cacheUpdate 174 | # 175 | # gradient = gradient / (sqrt(sgd_cache[i]) + 1e-8) 176 | 177 | 178 | for index in range(self.n_factors): 179 | 180 | # Copy original value to avoid messing up the updates 181 | H_i = self.H[i, index] 182 | H_j = self.H[j, index] 183 | W_u = self.W[u, index] 184 | 185 | self.W[u, index] += self.learning_rate * (sigmoid_user * ( H_i - H_j ) - self.user_reg * W_u) 186 | self.H[i, index] += self.learning_rate * (sigmoid_item * ( W_u ) - self.positive_reg * H_i) 187 | self.H[j, index] += self.learning_rate * (sigmoid_item * (-W_u ) - self.negative_reg * H_j) 188 | 189 | 190 | 191 | if((numCurrentBatch%5000000==0 and not numCurrentBatch==0) or numCurrentBatch==totalNumberOfBatch-1): 192 | print("Processed {} ( {:.2f}% ) in {:.2f} seconds. 
Sample per second: {:.0f}".format( 193 | numCurrentBatch*self.batch_size, 194 | 100.0* float(numCurrentBatch*self.batch_size)/self.numPositiveIteractions, 195 | time.time() - start_time_batch, 196 | float(numCurrentBatch*self.batch_size + 1) / (time.time() - start_time_epoch))) 197 | 198 | sys.stdout.flush() 199 | sys.stderr.flush() 200 | 201 | start_time_batch = time.time() 202 | 203 | 204 | def get_W(self): 205 | return np.array(self.W) 206 | 207 | 208 | def get_H(self): 209 | return np.array(self.H) 210 | 211 | 212 | 213 | cdef BPR_sample sampleBPR_Cython(self): 214 | 215 | cdef BPR_sample sample = BPR_sample(-1,-1,-1) 216 | cdef long index, start_pos_seen_items, end_pos_seen_items 217 | 218 | cdef int negItemSelected, numSeenItems = 0 219 | 220 | 221 | # Skip users with no interactions or with no negative items 222 | while numSeenItems == 0 or numSeenItems == self.n_items: 223 | 224 | sample.user = rand() % self.n_users 225 | 226 | start_pos_seen_items = self.URM_mask_indptr[sample.user] 227 | end_pos_seen_items = self.URM_mask_indptr[sample.user+1] 228 | 229 | numSeenItems = end_pos_seen_items - start_pos_seen_items 230 | 231 | 232 | index = rand() % numSeenItems 233 | 234 | sample.pos_item = self.URM_mask_indices[start_pos_seen_items + index] 235 | 236 | 237 | 238 | negItemSelected = False 239 | 240 | # It's faster to just try again then to build a mapping of the non-seen items 241 | # for every user 242 | while (not negItemSelected): 243 | 244 | sample.neg_item = rand() % self.n_items 245 | 246 | index = 0 247 | while index < numSeenItems and self.URM_mask_indices[start_pos_seen_items + index]!=sample.neg_item: 248 | index+=1 249 | 250 | if index == numSeenItems: 251 | negItemSelected = True 252 | 253 | 254 | return sample 255 | -------------------------------------------------------------------------------- /MatrixFactorization/Cython/MF_RMSE.cpython-36m-x86_64-linux-gnu.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MaurizioFD/RecSys_Course_2017/dfef3655df8ce32428458aafabfc94ed688231f6/MatrixFactorization/Cython/MF_RMSE.cpython-36m-x86_64-linux-gnu.so -------------------------------------------------------------------------------- /MatrixFactorization/Cython/MF_RMSE.pyx: -------------------------------------------------------------------------------- 1 | """ 2 | Created on 23/10/17 3 | 4 | @author: Massimo Quadrana 5 | """ 6 | 7 | #cython: boundscheck=False 8 | #cython: wraparound=False 9 | #cython: initializedcheck=False 10 | #cython: language_level=3 11 | #cython: nonecheck=False 12 | #cython: cdivision=True 13 | #cython: unpack_method_calls=True 14 | #cython: overflowcheck=False 15 | 16 | 17 | cimport cython 18 | cimport numpy as np 19 | import numpy as np 20 | import scipy.sparse as sps 21 | 22 | import time 23 | import sys 24 | 25 | 26 | @cython.boundscheck(False) 27 | def FunkSVD_sgd(R, int num_factors=50, double lrate=0.01, double reg=0.015, int n_iterations=10, init_mean=0.0, init_std=0.1, double lrate_decay=1.0, rnd_seed=42): 28 | if not isinstance(R, sps.csr_matrix): 29 | raise ValueError('R must be an instance of scipy.sparse.csr_matrix') 30 | 31 | # use Cython MemoryViews for fast access to the sparse structure of R 32 | cdef int [:] col_indices = R.indices, indptr = R.indptr 33 | cdef double [:] data = np.array(R.data, dtype=np.float) 34 | cdef int n_users = R.shape[0], n_items = R.shape[1] 35 | cdef int nnz = len(R.data) 36 | 37 | 38 | # in csr format, indices correspond to column indices 39 | # 
let's build the vector of row_indices 40 | cdef np.ndarray[np.int64_t, ndim=1] row_nnz = np.diff(indptr).astype(np.int64) 41 | cdef np.ndarray[np.int64_t, ndim=1] row_indices = np.repeat(np.arange(n_users), row_nnz).astype(np.int64) 42 | 43 | # set the seed of the random number generator 44 | np.random.seed(rnd_seed) 45 | 46 | # randomly initialize the user and item latent factors 47 | cdef double[:,:] U = np.random.normal(init_mean, init_std, (n_users, num_factors)).astype(np.float) 48 | cdef double[:,:] V = np.random.normal(init_mean, init_std, (n_items, num_factors)).astype(np.float) 49 | 50 | # build random index to iterate over the non-zero elements in R 51 | cdef np.ndarray[np.int64_t, ndim=1] shuffled_idx = np.random.permutation(nnz).astype(np.int64) 52 | 53 | # here we define some auxiliary variables 54 | cdef int i, j, f, idx, currentIteration, numSample 55 | cdef double rij, rij_pred, err, loss 56 | cdef double[:] U_i = np.zeros(num_factors, dtype=np.float) 57 | cdef double[:] V_j = np.zeros(num_factors, dtype=np.float) 58 | 59 | start_time_epoch = time.time() 60 | start_time_batch = time.time() 61 | 62 | # 63 | # Stochastic Gradient Descent starts here 64 | # 65 | for currentIteration in range(n_iterations): # for each iteration 66 | loss = 0.0 67 | 68 | for numSample in range(nnz): # iterate over non-zero values in R only 69 | idx = shuffled_idx[numSample] 70 | rij = data[idx] 71 | 72 | # get the row and col indices of x_ij 73 | i = row_indices[idx] 74 | j = col_indices[idx] 75 | 76 | rij_pred = 0 77 | 78 | # compute the predicted value of R 79 | for f in range(num_factors): 80 | U_i[f] = U[i,f] 81 | V_j[f] = V[j,f] 82 | rij_pred += U[i,f]*V[j,f] 83 | 84 | # compute the prediction error 85 | err = rij - rij_pred 86 | 87 | # update the loss 88 | loss += err**2 89 | 90 | # adjust the latent factors 91 | for f in range(num_factors): 92 | U[i, f] += lrate * (err * V_j[f] - reg * U_i[f]) 93 | V[j, f] += lrate * (err * U_i[f] - reg * V_j[f]) 94 | 95 | loss /= nnz 96 | 97 | # update the learning rate 98 | lrate *= lrate_decay 99 | 100 | print("Iteration {} of {} completed in {:.2f} minutes. Loss is {:.4f}. 
Sample per second: {:.0f}".format( 101 | currentIteration, n_iterations, 102 | (time.time() - start_time_batch)/60, 103 | loss, 104 | float(nnz) / (time.time() - start_time_batch))) 105 | 106 | sys.stdout.flush() 107 | sys.stderr.flush() 108 | 109 | start_time_batch = time.time() 110 | 111 | 112 | 113 | return U, V 114 | 115 | 116 | @cython.boundscheck(False) 117 | def AsySVD_sgd(R, num_factors=50, lrate=0.01, reg=0.015, iters=10, init_mean=0.0, init_std=0.1, lrate_decay=1.0, rnd_seed=42): 118 | if not isinstance(R, sps.csr_matrix): 119 | raise ValueError('R must be an instance of scipy.sparse.csr_matrix') 120 | 121 | # use Cython MemoryViews for fast access to the sparse structure of R 122 | cdef int [:] col_indices = R.indices, indptr = R.indptr 123 | cdef float [:] data = R.data 124 | cdef int M = R.shape[0], N = R.shape[1] 125 | cdef int nnz = len(R.data) 126 | 127 | # in csr format, indices correspond to column indices 128 | # let's build the vector of row_indices 129 | cdef np.ndarray[np.int64_t, ndim=1] row_nnz = np.diff(indptr).astype(np.int64) 130 | cdef np.ndarray[np.int64_t, ndim=1] row_indices = np.repeat(np.arange(M), row_nnz).astype(np.int64) 131 | 132 | # set the seed of the random number generator 133 | np.random.seed(rnd_seed) 134 | 135 | # randomly initialize the item latent factors 136 | cdef np.ndarray[np.float32_t, ndim=2] X = np.random.normal(init_mean, init_std, (N, num_factors)).astype(np.float32) 137 | cdef np.ndarray[np.float32_t, ndim=2] Y = np.random.normal(init_mean, init_std, (N, num_factors)).astype(np.float32) 138 | 139 | # build random index to iterate over the non-zero elements in R 140 | cdef np.ndarray[np.int64_t, ndim=1] shuffled_idx = np.random.permutation(nnz).astype(np.int64) 141 | 142 | # here we define some auxiliary variables 143 | cdef int i, j, it, n, idx, n_rated, start, end 144 | cdef float rij, rij_pred, err, loss 145 | cdef np.ndarray[np.float32_t, ndim=1] X_j = np.zeros(num_factors, dtype=np.float32) 146 | cdef np.ndarray[np.float32_t, ndim=1] Y_acc = np.zeros(num_factors, dtype=np.float32) 147 | cdef np.ndarray[np.float32_t, ndim=2] Y_copy = np.zeros_like(Y, dtype=np.float32) 148 | 149 | # 150 | # Stochastic Gradient Descent starts here 151 | # 152 | for it in range(iters): # for each iteration 153 | loss = 0.0 154 | for n in range(nnz): # iterate over non-zero values in R only 155 | idx = shuffled_idx[n] 156 | rij = data[idx] 157 | # get the row and col indices of x_ij 158 | i = row_indices[idx] 159 | j = col_indices[idx] 160 | # get the latent factor of item j 161 | X_j = X[j].copy() 162 | # accumulate the item latent factors over the other items rated by i 163 | Y_acc = np.zeros(num_factors, dtype=np.float32) 164 | n_rated = 0 165 | start, end = indptr[i], indptr[i+1] 166 | for l in col_indices[start:end]: 167 | x_il = data[start + n_rated] 168 | Y_acc += x_il * Y[l] 169 | n_rated += 1 170 | if n_rated > 0: 171 | Y_acc /= np.sqrt(n_rated) 172 | # compute the predicted rating 173 | rij_pred = np.dot(X_j, Y_acc) 174 | # compute the prediction error 175 | err = rij - rij_pred 176 | # update the loss 177 | loss += err**2 178 | # adjust the latent factors 179 | X[j] += lrate * (err * Y_acc - reg * X_j) 180 | # copy the current item preference factors 181 | Y_copy = Y.copy() 182 | for l in col_indices[indptr[i]:indptr[i+1]]: 183 | Y_l = Y_copy[l] 184 | Y[l] += lrate * (err * X_j - reg * Y_l) 185 | 186 | loss /= nnz 187 | print('Iter {} - loss: {:.4f}'.format(it+1, loss)) 188 | # update the learning rate 189 | lrate *= lrate_decay 190 | 191 | 
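# Unlike FunkSVD above, AsySVD returns two item-sized factor matrices: X acts as the
# prediction-side item factors, while Y holds the preference factors that are
# accumulated over the items rated by a user, Y_acc = (1 / sqrt(n_rated)) * sum_l r_il * Y[l],
# so that the rating is predicted as dot(X[j], Y_acc). AsySVD_compute_user_factors
# below rebuilds the same Y_acc for a single user profile.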
return X, Y 192 | 193 | @cython.boundscheck(False) 194 | def AsySVD_compute_user_factors(user_profile, Y): 195 | if not isinstance(user_profile, sps.csr_matrix): 196 | raise ValueError('user_profile must be an instance of scipy.sparse.csr_matrix') 197 | assert user_profile.shape[0] == 1, 'user_profile must be a 1-dimensional vector' 198 | 199 | # use Cython MemoryViews for fast access to the sparse structure of user_profile 200 | cdef int [:] col_indices = user_profile.indices 201 | cdef float [:] data = user_profile.data 202 | 203 | # intialize the accumulated user profile 204 | cdef int num_factors = Y.shape[1] 205 | cdef np.ndarray[np.float32_t, ndim=1] Y_acc = np.zeros(num_factors, dtype=np.float32) 206 | cdef int n_rated = len(col_indices) 207 | # aux variables 208 | cdef int n 209 | # accumulate the item vectors for the items rated by the user 210 | for n in range(n_rated): 211 | ril = data[n] 212 | Y_acc += ril * Y[col_indices[n]] 213 | if n_rated > 0: 214 | Y_acc /= np.sqrt(n_rated) 215 | return Y_acc 216 | 217 | 218 | from libc.math cimport exp, log 219 | 220 | @cython.boundscheck(False) 221 | def BPRMF_sgd(R, num_factors=50, lrate=0.01, user_reg=0.015, pos_reg=0.015, neg_reg=0.0015, iters=10, 222 | sampling_type='user_uniform_item_uniform',sample_with_replacement=True, use_resampling=False, sampling_pop_alpha=1.0, 223 | init_mean=0.0, init_std=0.1, lrate_decay=1.0, rnd_seed=42,verbose=False): 224 | if not isinstance(R, sps.csr_matrix): 225 | raise ValueError('R must be an instance of scipy.sparse.csr_matrix') 226 | 227 | # use Cython MemoryViews for fast access to the sparse structure of R 228 | cdef int [:] col_indices = R.indices, indptr = R.indptr 229 | cdef float [:] data = R.data 230 | cdef int M = R.shape[0], N = R.shape[1] 231 | cdef int nnz = len(R.data) 232 | 233 | # set the seed of the random number generator 234 | np.random.seed(rnd_seed) 235 | # randomly initialize the user and item latent factors 236 | cdef np.ndarray[np.float32_t, ndim=2] X = np.random.normal(init_mean, init_std, (M, num_factors)).astype(np.float32) 237 | cdef np.ndarray[np.float32_t, ndim=2] Y = np.random.normal(init_mean, init_std, (N, num_factors)).astype(np.float32) 238 | 239 | # sample the training triples 240 | cdef np.ndarray[np.int64_t, ndim=2] sample 241 | if sampling_type == 'user_uniform_item_uniform': 242 | sample = user_uniform_item_uniform_sampling(R, nnz, replace=sample_with_replacement, seed=rnd_seed, verbose=verbose) 243 | elif sampling_type == 'user_uniform_item_pop': 244 | sample = user_uniform_item_pop_sampling(R, nnz, alpha=sampling_pop_alpha, seed=rnd_seed, verbose=verbose) 245 | else: 246 | raise RuntimeError('Unknown sampling procedure "{}"'.format(sampling_type)) 247 | 248 | # here we define some auxiliary variables 249 | cdef int i, j, k, idx, it, n 250 | cdef float rij, rik, loss, deriv 251 | cdef np.ndarray[np.float32_t, ndim=1] X_i = np.zeros(num_factors, dtype=np.float32) 252 | cdef np.ndarray[np.float32_t, ndim=1] Y_j = np.zeros(num_factors, dtype=np.float32) 253 | cdef np.ndarray[np.float32_t, ndim=1] Y_k = np.zeros(num_factors, dtype=np.float32) 254 | 255 | # 256 | # Stochastic Gradient Descent starts here 257 | # 258 | for it in range(iters): # for each iteration 259 | loss = 0.0 260 | for n in range(nnz): 261 | i, j, k = sample[n] 262 | # get the user and item factors 263 | X_i = X[i].copy() 264 | Y_j = Y[j].copy() 265 | Y_k = Y[k].copy() 266 | # compute the difference of the predicted scores 267 | diff_yjk = Y_j - Y_k 268 | zijk = np.dot(X_i, diff_yjk) 269 | # 
compute the sigmoid 270 | sig = 1. / (1. + exp(-zijk)) 271 | # update the loss 272 | loss += log(sig) 273 | 274 | # adjust the latent factors 275 | deriv = 1. - sig 276 | X[i] += lrate * (deriv * diff_yjk - user_reg * X_i) 277 | Y[j] += lrate * (deriv * X_i - pos_reg * Y_j) 278 | Y[k] += lrate * (-deriv * X_i - neg_reg * Y_k) 279 | 280 | loss /= nnz 281 | if verbose: 282 | print('Iter {} - loss: {:.4f}'.format(it+1, loss)) 283 | # update the learning rate 284 | lrate *= lrate_decay 285 | if use_resampling: 286 | if sampling_type == 'user_uniform_item_uniform': 287 | sample = user_uniform_item_uniform_sampling(R, nnz, replace=sample_with_replacement, seed=rnd_seed, verbose=verbose) 288 | elif sampling_type == 'user_uniform_item_pop': 289 | sample = user_uniform_item_pop_sampling(R, nnz, alpha=sampling_pop_alpha, seed=rnd_seed, verbose=verbose) 290 | 291 | return X, Y 292 | 293 | def user_uniform_item_uniform_sampling(R, size, replace=True, seed=1234, verbose=True): 294 | # use Cython MemoryViews for fast access to the sparse structure of R 295 | cdef int [:] col_indices = R.indices, indptr = R.indptr 296 | cdef int M = R.shape[0], N = R.shape[1] 297 | cdef int nnz = len(R.data) 298 | 299 | cdef np.ndarray[np.int64_t, ndim=2] sample = np.zeros((size, 3), dtype=np.int64) 300 | cdef np.ndarray[np.int8_t, ndim=1] is_sampled # boolean arrays are not yet supported by Cython 301 | if not replace: 302 | is_sampled = np.zeros(nnz, dtype=np.int8) 303 | 304 | # set the seed of the random number generator 305 | np.random.seed(seed) 306 | 307 | cdef int i=0, start, end, iid, jid, kid, idx 308 | cdef np.ndarray[np.int64_t, ndim=1] aux, neg_candidates 309 | cdef int [:] pos_candidates 310 | while i < size: 311 | # 1) sample a user from a uniform distribution 312 | iid = np.random.choice(M) 313 | 314 | # 2) sample a positive item uniformly at random 315 | start = indptr[iid] 316 | end = indptr[iid+1] 317 | pos_candidates = col_indices[start:end] 318 | if start == end: 319 | # empty candidate set 320 | continue 321 | if replace: 322 | # sample positive items with replacement 323 | jid = np.random.choice(pos_candidates) 324 | else: 325 | # sample positive items without replacement 326 | # use a index vector between start and end 327 | aux = np.arange(start, end) 328 | if np.all(is_sampled[aux]): 329 | # all positive items have been already sampled 330 | continue 331 | idx = np.random.choice(aux) 332 | while is_sampled[idx]: 333 | # TODO: remove idx from aux to speed up the sampling 334 | idx = np.random.choice(aux) 335 | is_sampled[idx] = 1 336 | jid = col_indices[idx] 337 | 338 | # 3) sample a negative item uniformly at random 339 | # build the candidate set of negative items 340 | # TODO: precompute the negative candidate set for speed-up 341 | neg_candidates = np.delete(np.arange(N), pos_candidates) 342 | kid = np.random.choice(neg_candidates) 343 | sample[i, :] = [iid, jid, kid] 344 | i += 1 345 | if verbose and i % 10000 == 0: 346 | print('Sampling... 
{:.2f}% complete'.format(i/size*100)) 347 | return sample 348 | 349 | 350 | def user_uniform_item_pop_sampling(R, size, alpha=1., seed=1234, verbose=True): 351 | # use Cython MemoryViews for fast access to the sparse structure of R 352 | cdef int [:] col_indices = R.indices, indptr = R.indptr 353 | cdef int M = R.shape[0], N = R.shape[1] 354 | cdef int nnz = len(R.data) 355 | 356 | cdef np.ndarray[np.int64_t, ndim=2] sample = np.zeros((size, 3), dtype=np.int64) 357 | 358 | # compute the item popularity 359 | cdef np.ndarray[np.float32_t, ndim=1] item_pop = np.asarray(np.sum(R > 0, axis=0)).squeeze().astype(np.float32) 360 | # smooth popularity with an exponential factor alpha 361 | item_pop = np.power(item_pop, alpha) 362 | 363 | # set the seed of the random number generator 364 | np.random.seed(seed) 365 | 366 | cdef int i=0, start, end, iid, jid, kid, idx 367 | cdef np.ndarray[np.int64_t, ndim=1] aux, neg_candidates 368 | cdef int [:] pos_candidates 369 | cdef np.ndarray[np.float32_t, ndim=1] p 370 | while i < size: 371 | # 1) sample a user from a uniform distribution 372 | iid = np.random.choice(M) 373 | 374 | # 2) sample a positive item proportionally to its popularity 375 | start = indptr[iid] 376 | end = indptr[iid+1] 377 | pos_candidates = col_indices[start:end] 378 | if start == end: 379 | # empty candidate set 380 | continue 381 | # always sample with replacement 382 | p = item_pop[pos_candidates] 383 | p /= np.sum(p) 384 | jid = np.random.choice(pos_candidates, p=p) 385 | 386 | # 3) sample a negative item uniformly at random 387 | # build the candidate set of negative items 388 | # TODO: precompute the negative candidate set for speed-up 389 | neg_candidates = np.delete(np.arange(N), pos_candidates) 390 | kid = np.random.choice(neg_candidates) 391 | sample[i, :] = [iid, jid, kid] 392 | i += 1 393 | if verbose and i % 10000 == 0: 394 | print('Sampling... {:.2f}% complete'.format(i/size*100)) 395 | return sample -------------------------------------------------------------------------------- /MatrixFactorization/Cython/build/temp.linux-x86_64-3.6/MF_BPR_Cython_Epoch.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MaurizioFD/RecSys_Course_2017/dfef3655df8ce32428458aafabfc94ed688231f6/MatrixFactorization/Cython/build/temp.linux-x86_64-3.6/MF_BPR_Cython_Epoch.o -------------------------------------------------------------------------------- /MatrixFactorization/Cython/compileCython.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on 16/07/2017 5 | 6 | @author: Maurizio Ferrari Dacrema 7 | """ 8 | 9 | 10 | try: 11 | from setuptools import setup 12 | from setuptools import Extension 13 | except ImportError: 14 | from distutils.core import setup 15 | from distutils.extension import Extension 16 | 17 | 18 | from Cython.Distutils import build_ext 19 | 20 | 21 | import numpy 22 | 23 | import sys 24 | import re 25 | 26 | 27 | if len(sys.argv) != 4: 28 | raise ValueError("Wrong number of paramethers received. 
Expected 4, got {}".format(sys.argv)) 29 | 30 | 31 | #fileToCompile = 'MF_BPR_Cython_Epoch.pyx' 32 | 33 | # Get the name of the file to compile 34 | fileToCompile = sys.argv[1] 35 | # Remove the argument from sys argv in order for it to contain only what setup needs 36 | del sys.argv[1] 37 | 38 | extensionName = re.sub("\.pyx", "", fileToCompile) 39 | 40 | 41 | ext_modules = Extension(extensionName, 42 | [fileToCompile], 43 | extra_compile_args=['-O3'], 44 | include_dirs=[numpy.get_include(),], 45 | ) 46 | 47 | setup( 48 | cmdclass={'build_ext': build_ext}, 49 | ext_modules=[ext_modules] 50 | ) 51 | 52 | -------------------------------------------------------------------------------- /MatrixFactorization/MatrixFactorization_RMSE.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on 23/10/17 5 | 6 | @author: Massimo Quadrana 7 | """ 8 | 9 | import logging 10 | 11 | import numpy as np 12 | from Base.Recommender_utils import check_matrix 13 | 14 | from Base.Recommender import Recommender 15 | from MatrixFactorization.Cython.MF_RMSE import FunkSVD_sgd, AsySVD_sgd, AsySVD_compute_user_factors, BPRMF_sgd 16 | 17 | logger = logging.getLogger(__name__) 18 | logging.basicConfig( 19 | level=logging.INFO, 20 | format="%(asctime)s: %(name)s: %(levelname)s: %(message)s") 21 | 22 | 23 | 24 | 25 | class FunkSVD(Recommender): 26 | ''' 27 | FunkSVD model 28 | Reference: http://sifter.org/~simon/journal/20061211.html 29 | 30 | Factorizes the rating matrix R into the dot product of two matrices U and V of latent factors. 31 | U represent the user latent factors, V the item latent factors. 32 | The model is learned by solving the following regularized Least-squares objective function with Stochastic Gradient Descent 33 | \operatornamewithlimits{argmin} \limits_{U,V}\frac{1}{2}||R - UV^T||^2_2 + \frac{\lambda}{2}(||U||^2_F + ||V||^2_F) 34 | Latent factors are initialized from a Normal distribution with given mean and std. 
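    As a sketch, each SGD step on an observed rating r_ij applies the standard FunkSVD updates
    (the exact implementation is the Cython function FunkSVD_sgd imported above):
        e_ij = r_ij - U_i . V_j
        U_i <- U_i + learning_rate * (e_ij * V_j - reg * U_i)
        V_j <- V_j + learning_rate * (e_ij * U_i - reg * V_j)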
35 | ''' 36 | 37 | # TODO: add global effects 38 | def __init__(self, URM_train): 39 | 40 | super(FunkSVD, self).__init__() 41 | 42 | self.URM_train = check_matrix(URM_train, 'csr', dtype=np.float32) 43 | 44 | 45 | 46 | def __str__(self): 47 | return "FunkSVD(num_factors={}, lrate={}, reg={}, iters={}, init_mean={}, " \ 48 | "init_std={}, lrate_decay={}, rnd_seed={})".format( 49 | self.num_factors, self.learning_rate, self.reg, self.epochs, self.init_mean, self.init_std, self.lrate_decay, 50 | self.rnd_seed 51 | ) 52 | 53 | 54 | def fit(self, num_factors=50, 55 | learning_rate=0.01, 56 | reg=0.015, 57 | epochs=10, 58 | init_mean=0.0, 59 | init_std=0.1, 60 | lrate_decay=1.0, 61 | rnd_seed=42): 62 | """ 63 | 64 | Initialize the model 65 | :param num_factors: number of latent factors 66 | :param learning_rate: initial learning rate used in SGD 67 | :param reg: regularization term 68 | :param epochs: number of iterations in training the model with SGD 69 | :param init_mean: mean used to initialize the latent factors 70 | :param init_std: standard deviation used to initialize the latent factors 71 | :param lrate_decay: learning rate decay 72 | :param rnd_seed: random seed 73 | """ 74 | 75 | self.num_factors = num_factors 76 | self.learning_rate = learning_rate 77 | self.reg = reg 78 | self.epochs = epochs 79 | self.init_mean = init_mean 80 | self.init_std = init_std 81 | self.lrate_decay = lrate_decay 82 | self.rnd_seed = rnd_seed 83 | 84 | self.U, self.V = FunkSVD_sgd(self.URM_train, self.num_factors, self.learning_rate, self.reg, self.epochs, self.init_mean, 85 | self.init_std, 86 | self.lrate_decay, self.rnd_seed) 87 | 88 | # def recommend(self, user_id, n=None, exclude_seen=True): 89 | # scores = np.dot(self.U[user_id], self.V.T) 90 | # ranking = scores.argsort()[::-1] 91 | # # rank items 92 | # if exclude_seen: 93 | # ranking = self._filter_seen(user_id, ranking) 94 | # return ranking[:n] 95 | # 96 | # 97 | # def _get_user_ratings(self, user_id): 98 | # return self.dataset[user_id] 99 | # 100 | # def _get_item_ratings(self, item_id): 101 | # return self.dataset[:, item_id] 102 | # 103 | # 104 | # def _filter_seen(self, user_id, ranking): 105 | # user_profile = self._get_user_ratings(user_id) 106 | # seen = user_profile.indices 107 | # unseen_mask = np.in1d(ranking, seen, assume_unique=True, invert=True) 108 | # return ranking[unseen_mask] 109 | 110 | 111 | 112 | 113 | 114 | def recommendBatch(self, users_in_batch, n=None, exclude_seen=True, filterTopPop = False, filterCustomItems = False): 115 | 116 | # compute the scores using the dot product 117 | user_profile_batch = self.URM_train[users_in_batch] 118 | 119 | scores_array = np.dot(self.U[users_in_batch], self.V.T) 120 | 121 | if self.normalize: 122 | raise ValueError("Not implemented") 123 | 124 | # To exclude seen items perform a boolean indexing and replace their score with -inf 125 | # Seen items will be at the bottom of the list but there is no guarantee they'll NOT be 126 | # recommended 127 | if exclude_seen: 128 | scores_array[user_profile_batch.nonzero()] = -np.inf 129 | 130 | if filterTopPop: 131 | scores_array[:,self.filterTopPop_ItemsID] = -np.inf 132 | 133 | if filterCustomItems: 134 | scores_array[:, self.filterCustomItems_ItemsID] = -np.inf 135 | 136 | 137 | # rank items and mirror column to obtain a ranking in descending score 138 | #ranking = (-scores_array).argsort(axis=1) 139 | #ranking = np.fliplr(ranking) 140 | #ranking = ranking[:,0:n] 141 | 142 | ranking = np.zeros((scores_array.shape[0],n), dtype=np.int) 143 | 144 
| for row_index in range(scores_array.shape[0]): 145 | scores = scores_array[row_index] 146 | 147 | relevant_items_partition = (-scores).argpartition(n)[0:n] 148 | relevant_items_partition_sorting = np.argsort(-scores[relevant_items_partition]) 149 | ranking[row_index] = relevant_items_partition[relevant_items_partition_sorting] 150 | 151 | 152 | return ranking 153 | 154 | 155 | 156 | def recommend(self, user_id, n=None, exclude_seen=True, filterTopPop = False, filterCustomItems = False): 157 | 158 | 159 | if n==None: 160 | n=self.URM_train.shape[1]-1 161 | 162 | scores_array = np.dot(self.U[user_id], self.V.T) 163 | 164 | if self.normalize: 165 | raise ValueError("Not implemented") 166 | 167 | 168 | if exclude_seen: 169 | scores = self._filter_seen_on_scores(user_id, scores_array) 170 | 171 | if filterTopPop: 172 | scores = self._filter_TopPop_on_scores(scores_array) 173 | 174 | if filterCustomItems: 175 | scores = self._filterCustomItems_on_scores(scores_array) 176 | 177 | 178 | # rank items and mirror column to obtain a ranking in descending score 179 | #ranking = scores.argsort() 180 | #ranking = np.flip(ranking, axis=0) 181 | 182 | # Sorting is done in three steps. Faster then plain np.argsort for higher number of items 183 | # - Partition the data to extract the set of relevant items 184 | # - Sort only the relevant items 185 | # - Get the original item index 186 | relevant_items_partition = (-scores_array).argpartition(n)[0:n] 187 | relevant_items_partition_sorting = np.argsort(-scores_array[relevant_items_partition]) 188 | ranking = relevant_items_partition[relevant_items_partition_sorting] 189 | 190 | 191 | return ranking 192 | 193 | 194 | 195 | 196 | class AsySVD(Recommender): 197 | ''' 198 | AsymmetricSVD model 199 | Reference: Factorization Meets the Neighborhood: a Multifaceted Collaborative Filtering Model (Koren, 2008) 200 | 201 | Factorizes the rating matrix R into two matrices X and Y of latent factors, which both represent item latent features. 202 | Users are represented by aggregating the latent features in Y of items they have already rated. 203 | Rating prediction is performed by computing the dot product of this accumulated user profile with the target item's 204 | latent factor in X. 205 | 206 | The model is learned by solving the following regularized Least-squares objective function with Stochastic Gradient Descent 207 | \operatornamewithlimits{argmin}\limits_{x*,y*}\frac{1}{2}\sum_{i,j \in R}(r_{ij} - x_j^T \sum_{l \in R(i)} r_{il}y_l)^2 + \frac{\lambda}{2}(\sum_{i}{||x_i||^2} + \sum_{j}{||y_j||^2}) 208 | ''' 209 | 210 | # TODO: add global effects 211 | # TODO: recommendation for new-users. 
Update the precomputed profiles online
212 |     def __init__(self,
213 |                  num_factors=50,
214 |                  lrate=0.01,
215 |                  reg=0.015,
216 |                  iters=10,
217 |                  init_mean=0.0,
218 |                  init_std=0.1,
219 |                  lrate_decay=1.0,
220 |                  rnd_seed=42):
221 |         '''
222 |         Initialize the model
223 |         :param num_factors: number of latent factors
224 |         :param lrate: initial learning rate used in SGD
225 |         :param reg: regularization term
226 |         :param iters: number of iterations in training the model with SGD
227 |         :param init_mean: mean used to initialize the latent factors
228 |         :param init_std: standard deviation used to initialize the latent factors
229 |         :param lrate_decay: learning rate decay
230 |         :param rnd_seed: random seed
231 |         '''
232 |         super(AsySVD, self).__init__()
233 |         self.num_factors = num_factors
234 |         self.lrate = lrate
235 |         self.reg = reg
236 |         self.iters = iters
237 |         self.init_mean = init_mean
238 |         self.init_std = init_std
239 |         self.lrate_decay = lrate_decay
240 |         self.rnd_seed = rnd_seed
241 | 
242 |     def __str__(self):
243 |         return "AsySVD(num_factors={}, lrate={}, reg={}, iters={}, init_mean={}, " \
244 |                "init_std={}, lrate_decay={}, rnd_seed={})".format(
245 |             self.num_factors, self.lrate, self.reg, self.iters, self.init_mean, self.init_std, self.lrate_decay,
246 |             self.rnd_seed
247 |         )
248 | 
249 |     def fit(self, R):
250 |         self.dataset = R
251 |         R = check_matrix(R, 'csr', dtype=np.float32)
252 |         self.X, self.Y = AsySVD_sgd(R, self.num_factors, self.lrate, self.reg, self.iters, self.init_mean,
253 |                                     self.init_std,
254 |                                     self.lrate_decay, self.rnd_seed)
255 |         # precompute the user factors
256 |         M = R.shape[0]
257 |         self.U = np.vstack([AsySVD_compute_user_factors(R[i], self.Y) for i in range(M)])
258 | 
259 |     def recommend(self, user_id, n=None, exclude_seen=True):
260 |         scores = np.dot(self.X, self.U[user_id].T)
261 |         ranking = scores.argsort()[::-1]
262 |         # rank items
263 |         if exclude_seen:
264 |             ranking = self._filter_seen(user_id, ranking)
265 |         return ranking[:n]
266 | 
267 | 
268 |     def _get_user_ratings(self, user_id):
269 |         return self.dataset[user_id]
270 | 
271 |     def _get_item_ratings(self, item_id):
272 |         return self.dataset[:, item_id]
273 | 
274 | 
275 |     def _filter_seen(self, user_id, ranking):
276 |         user_profile = self._get_user_ratings(user_id)
277 |         seen = user_profile.indices
278 |         unseen_mask = np.in1d(ranking, seen, assume_unique=True, invert=True)
279 |         return ranking[unseen_mask]
280 | 
281 | 
282 | 
283 | class IALS_numpy(Recommender):
284 |     '''
285 |     Implicit Alternating Least Squares (IALS) model, also known as Weighted Regularized Matrix Factorization
286 |     Reference: Collaborative Filtering for Implicit Feedback Datasets (Hu et al., 2008)
287 | 
288 |     Factorization model for implicit feedback.
289 |     First, it splits the feedback matrix R element-wise into a Preference matrix P and a Confidence matrix C.
290 |     Then it factorizes them into the dot product of two matrices X and Y of latent factors.
291 |     X represents the user latent factors, Y the item latent factors.
292 | 
293 |     The model is learned by solving the following regularized Least-squares objective function with Alternating Least Squares
294 |     \operatornamewithlimits{argmin}\limits_{x*,y*}\frac{1}{2}\sum_{i,j}{c_{ij}(p_{ij}-x_i^T y_j)^2 + \lambda(\sum_{i}{||x_i||^2} + \sum_{j}{||y_j||^2})}
295 |     '''
296 | 
297 |     # TODO: Add support for multiple confidence scaling functions (e.g.
linear and log scaling) 298 | def __init__(self, 299 | num_factors=50, 300 | reg=0.015, 301 | iters=10, 302 | scaling='linear', 303 | alpha=40, 304 | epsilon=1.0, 305 | init_mean=0.0, 306 | init_std=0.1, 307 | rnd_seed=42): 308 | ''' 309 | Initialize the model 310 | :param num_factors: number of latent factors 311 | :param reg: regularization term 312 | :param iters: number of iterations in training the model with SGD 313 | :param scaling: supported scaling modes for the observed values: 'linear' or 'log' 314 | :param alpha: scaling factor to compute confidence scores 315 | :param epsilon: epsilon used in log scaling only 316 | :param init_mean: mean used to initialize the latent factors 317 | :param init_std: standard deviation used to initialize the latent factors 318 | :param rnd_seed: random seed 319 | ''' 320 | 321 | super(IALS_numpy, self).__init__() 322 | assert scaling in ['linear', 'log'], 'Unsupported scaling: {}'.format(scaling) 323 | 324 | self.num_factors = num_factors 325 | self.reg = reg 326 | self.iters = iters 327 | self.scaling = scaling 328 | self.alpha = alpha 329 | self.epsilon = epsilon 330 | self.init_mean = init_mean 331 | self.init_std = init_std 332 | self.rnd_seed = rnd_seed 333 | 334 | def __str__(self): 335 | return "WRMF-iALS(num_factors={}, reg={}, iters={}, scaling={}, alpha={}, episilon={}, init_mean={}, " \ 336 | "init_std={}, rnd_seed={})".format( 337 | self.num_factors, self.reg, self.iters, self.scaling, self.alpha, self.epsilon, self.init_mean, 338 | self.init_std, self.rnd_seed 339 | ) 340 | 341 | def _linear_scaling(self, R): 342 | C = R.copy().tocsr() 343 | C.data *= self.alpha 344 | C.data += 1.0 345 | return C 346 | 347 | def _log_scaling(self, R): 348 | C = R.copy().tocsr() 349 | C.data = 1.0 + self.alpha * np.log(1.0 + C.data / self.epsilon) 350 | return C 351 | 352 | def fit(self, R): 353 | self.dataset = R 354 | # compute the confidence matrix 355 | if self.scaling == 'linear': 356 | C = self._linear_scaling(R) 357 | else: 358 | C = self._log_scaling(R) 359 | 360 | Ct = C.T.tocsr() 361 | M, N = R.shape 362 | 363 | # set the seed 364 | np.random.seed(self.rnd_seed) 365 | 366 | # initialize the latent factors 367 | self.X = np.random.normal(self.init_mean, self.init_std, size=(M, self.num_factors)) 368 | self.Y = np.random.normal(self.init_mean, self.init_std, size=(N, self.num_factors)) 369 | 370 | for it in range(self.iters): 371 | self.X = self._lsq_solver_fast(C, self.X, self.Y, self.reg) 372 | self.Y = self._lsq_solver_fast(Ct, self.Y, self.X, self.reg) 373 | logger.debug('Finished iter {}'.format(it + 1)) 374 | 375 | def recommend(self, user_id, n=None, exclude_seen=True): 376 | scores = np.dot(self.X[user_id], self.Y.T) 377 | ranking = scores.argsort()[::-1] 378 | # rank items 379 | if exclude_seen: 380 | ranking = self._filter_seen(user_id, ranking) 381 | return ranking[:n] 382 | 383 | def _lsq_solver(self, C, X, Y, reg): 384 | # precompute YtY 385 | rows, factors = X.shape 386 | YtY = np.dot(Y.T, Y) 387 | 388 | for i in range(rows): 389 | # accumulate YtCiY + reg*I in A 390 | A = YtY + reg * np.eye(factors) 391 | 392 | # accumulate Yt*Ci*p(i) in b 393 | b = np.zeros(factors) 394 | 395 | for j, cij in self._nonzeros(C, i): 396 | vj = Y[j] 397 | A += (cij - 1.0) * np.outer(vj, vj) 398 | b += cij * vj 399 | 400 | X[i] = np.linalg.solve(A, b) 401 | return X 402 | 403 | def _lsq_solver_fast(self, C, X, Y, reg): 404 | # precompute YtY 405 | rows, factors = X.shape 406 | YtY = np.dot(Y.T, Y) 407 | 408 | for i in range(rows): 409 | # accumulate 
YtCiY + reg*I in A 410 | A = YtY + reg * np.eye(factors) 411 | 412 | start, end = C.indptr[i], C.indptr[i + 1] 413 | j = C.indices[start:end] # indices of the non-zeros in Ci 414 | ci = C.data[start:end] # non-zeros in Ci 415 | 416 | Yj = Y[j] # only the factors with non-zero confidence 417 | # compute Yt(Ci-I)Y 418 | aux = np.dot(Yj.T, np.diag(ci - 1.0)) 419 | A += np.dot(aux, Yj) 420 | # compute YtCi 421 | b = np.dot(Yj.T, ci) 422 | 423 | X[i] = np.linalg.solve(A, b) 424 | return X 425 | 426 | def _nonzeros(self, R, row): 427 | for i in range(R.indptr[row], R.indptr[row + 1]): 428 | yield (R.indices[i], R.data[i]) 429 | 430 | 431 | def _get_user_ratings(self, user_id): 432 | return self.dataset[user_id] 433 | 434 | def _get_item_ratings(self, item_id): 435 | return self.dataset[:, item_id] 436 | 437 | 438 | def _filter_seen(self, user_id, ranking): 439 | user_profile = self._get_user_ratings(user_id) 440 | seen = user_profile.indices 441 | unseen_mask = np.in1d(ranking, seen, assume_unique=True, invert=True) 442 | return ranking[unseen_mask] 443 | 444 | 445 | 446 | class BPRMF(Recommender): 447 | ''' 448 | BPRMF model 449 | ''' 450 | 451 | # TODO: add global effects 452 | def __init__(self, 453 | num_factors=50, 454 | lrate=0.01, 455 | user_reg=0.015, 456 | pos_reg=0.015, 457 | neg_reg=0.0015, 458 | iters=10, 459 | sampling_type='user_uniform_item_uniform', 460 | sample_with_replacement=True, 461 | use_resampling=True, 462 | sampling_pop_alpha=1.0, 463 | init_mean=0.0, 464 | init_std=0.1, 465 | lrate_decay=1.0, 466 | rnd_seed=42, 467 | verbose=True): 468 | ''' 469 | Initialize the model 470 | :param num_factors: number of latent factors 471 | :param lrate: initial learning rate used in SGD 472 | :param user_reg: regularization for the user factors 473 | :param pos_reg: regularization for the factors of the positive sampled items 474 | :param neg_reg: regularization for the factors of the negative sampled items 475 | :param iters: number of iterations in training the model with SGD 476 | :param sampling_type: type of sampling. Supported types are 'user_uniform_item_uniform' and 'user_uniform_item_pop' 477 | :param sample_with_replacement: `True` to sample positive items with replacement (doesn't work with 'user_uniform_item_pop') 478 | :param use_resampling: `True` to resample at each iteration during training 479 | :param sampling_pop_alpha: float smoothing factor for popularity based samplers (e.g., 'user_uniform_item_pop') 480 | :param init_mean: mean used to initialize the latent factors 481 | :param init_std: standard deviation used to initialize the latent factors 482 | :param lrate_decay: learning rate decay 483 | :param rnd_seed: random seed 484 | :param verbose: controls verbosity in output 485 | ''' 486 | super(BPRMF, self).__init__() 487 | self.num_factors = num_factors 488 | self.lrate = lrate 489 | self.user_reg = user_reg 490 | self.pos_reg = pos_reg 491 | self.neg_reg = neg_reg 492 | self.iters = iters 493 | self.sampling_type = sampling_type 494 | self.sample_with_replacement = sample_with_replacement 495 | self.use_resampling = use_resampling 496 | self.sampling_pop_alpha = sampling_pop_alpha 497 | self.init_mean = init_mean 498 | self.init_std = init_std 499 | self.lrate_decay = lrate_decay 500 | self.rnd_seed = rnd_seed 501 | self.verbose = verbose 502 | 503 | def __str__(self): 504 | return "BPRMF(num_factors={}, lrate={}, user_reg={}. 
pos_reg={}, neg_reg={}, iters={}, " \ 505 | "sampling_type={}, sample_with_replacement={}, use_resampling={}, sampling_pop_alpha={}, init_mean={}, " \ 506 | "init_std={}, lrate_decay={}, rnd_seed={}, verbose={})".format( 507 | self.num_factors, self.lrate, self.user_reg, self.pos_reg, self.neg_reg, self.iters, 508 | self.sampling_type, self.sample_with_replacement, self.use_resampling, self.sampling_pop_alpha, 509 | self.init_mean, 510 | self.init_std, 511 | self.lrate_decay, 512 | self.rnd_seed, 513 | self.verbose 514 | ) 515 | 516 | def fit(self, R): 517 | self.dataset = R 518 | R = check_matrix(R, 'csr', dtype=np.float32) 519 | self.X, self.Y = BPRMF_sgd(R, 520 | num_factors=self.num_factors, 521 | lrate=self.lrate, 522 | user_reg=self.user_reg, 523 | pos_reg=self.pos_reg, 524 | neg_reg=self.neg_reg, 525 | iters=self.iters, 526 | sampling_type=self.sampling_type, 527 | sample_with_replacement=self.sample_with_replacement, 528 | use_resampling=self.use_resampling, 529 | sampling_pop_alpha=self.sampling_pop_alpha, 530 | init_mean=self.init_mean, 531 | init_std=self.init_std, 532 | lrate_decay=self.lrate_decay, 533 | rnd_seed=self.rnd_seed, 534 | verbose=self.verbose) 535 | 536 | def recommend(self, user_id, n=None, exclude_seen=True): 537 | scores = np.dot(self.X[user_id], self.Y.T) 538 | ranking = scores.argsort()[::-1] 539 | # rank items 540 | if exclude_seen: 541 | ranking = self._filter_seen(user_id, ranking) 542 | return ranking[:n] 543 | 544 | 545 | 546 | def _get_user_ratings(self, user_id): 547 | return self.dataset[user_id] 548 | 549 | def _get_item_ratings(self, item_id): 550 | return self.dataset[:, item_id] 551 | 552 | 553 | def _filter_seen(self, user_id, ranking): 554 | user_profile = self._get_user_ratings(user_id) 555 | seen = user_profile.indices 556 | unseen_mask = np.in1d(ranking, seen, assume_unique=True, invert=True) 557 | return ranking[unseen_mask] 558 | 559 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # RecSys Course 2017 2 | This is the official repository for the 2017 Recommender Systems course at Polimi. 3 | 4 | ## This repo is obsolete, please refer to the updated version [HERE](https://github.com/MaurizioFD/RecSys_Course_2018) 5 | 6 | 7 | #### This repo contains a Cython implementation of: 8 | - SLIM BPR: Uses a Cython tree-based sparse matrix, suitable for datasets whose number of items is too big for the 9 | dense similarity matrix to fit in memory. 10 | Dense similarity is also supported. 11 | - MF BPR: Matrix factorization optimizing BPR 12 | - FunkSVD: Matrix factorization optimizing RMSE 13 | - AsymmetricSVD 14 | 15 | #### This repo contains a Python implementation of: 16 | - Item-based KNN collaborative 17 | - Item-based KNN content 18 | - User-based KNN 19 | - SLIM_RMSE: SLIM solver using ElasticNet. The solver fits every column in the similarity matrix independently 20 | 21 | #### This repo also provides an implementation of: 22 | 23 | - Cosine Similarity, Adjusted Cosine, Pearson Correlation, Jaccard Correlation, Tanimoto Coefficient: Implemented both in Python and Cython with the same interface, Base.cosine_similarity and Base.Cython.cosine_similarity 24 | - MAP, recall, precision, ROC-AUC, MRR, RR, NDCG to be used in testing 25 | - Movielens10MReader: reads movielens 10M rating file, splits it into three URMs for train, test and validation. 26 | 27 | 28 | Cython code is already compiled for Linux. 
To recompile the code just set the recompile_cython flag to True.
29 | For other OSs such as Windows the C-imported numpy interface might be different (e.g. return type long long instead of long), therefore the code could require modifications in order to compile.
30 | 
31 | 
32 | ##### In "all_algorithms.py" you can see how to use every model.
--------------------------------------------------------------------------------
/SLIM_BPR/Cython/SLIM_BPR_Cython.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | Created on 07/09/17
5 | 
6 | @author: Maurizio Ferrari Dacrema
7 | """
8 | 
9 | from Base.Recommender_utils import similarityMatrixTopK
10 | from SLIM_BPR.SLIM_BPR_Python import SLIM_BPR_Python
11 | import subprocess
12 | import os, sys
13 | import numpy as np
14 | 
15 | 
16 | class SLIM_BPR_Cython(SLIM_BPR_Python):
17 | 
18 | 
19 |     def __init__(self, URM_train, positive_threshold=4,
20 |                  recompile_cython = False, sparse_weights = False,
21 |                  symmetric = True, sgd_mode='adagrad'):
22 | 
23 | 
24 |         super(SLIM_BPR_Cython, self).__init__(URM_train,
25 |                                               positive_threshold=positive_threshold,
26 |                                               sparse_weights = sparse_weights)
27 | 
28 | 
29 |         self.sgd_mode = sgd_mode
30 |         self.symmetric = symmetric
31 | 
32 |         if not sparse_weights:
33 | 
34 |             n_items = URM_train.shape[1]
35 |             requiredGB = 8 * n_items**2 / 1e+06
36 | 
37 |             if symmetric:
38 |                 requiredGB /=2
39 | 
40 |             print("SLIM_BPR_Cython: Estimated memory required for similarity matrix of {} items is {:.2f} MB".format(n_items, requiredGB))
41 | 
42 | 
43 | 
44 | 
45 |         if recompile_cython:
46 |             print("Compiling in Cython")
47 |             self.runCompilationScript()
48 |             print("Compilation Complete")
49 | 
50 | 
51 | 
52 |     def fit(self, epochs=30, logFile=None, URM_test=None, filterTopPop = False, minRatingsPerUser=1,
53 |             batch_size = 1000, validate_every_N_epochs = 1, start_validation_after_N_epochs = 0,
54 |             lambda_i = 0.0, lambda_j = 0.0, learning_rate = 0.01, topK = 200, sgd_mode='adagrad'):
55 | 
56 | 
57 |         # Select only positive interactions
58 |         URM_train_positive = self.URM_train.copy()
59 | 
60 |         URM_train_positive.data = URM_train_positive.data >= self.positive_threshold
61 |         URM_train_positive.eliminate_zeros()
62 | 
63 | 
64 |         self.sgd_mode = sgd_mode
65 | 
66 | 
67 |         # Import compiled module
68 |         from SLIM_BPR.Cython.SLIM_BPR_Cython_Epoch import SLIM_BPR_Cython_Epoch
69 | 
70 | 
71 |         self.cythonEpoch = SLIM_BPR_Cython_Epoch(self.URM_mask,
72 |                                                  sparse_weights = self.sparse_weights,
73 |                                                  topK=topK,
74 |                                                  learning_rate=learning_rate,
75 |                                                  li_reg = lambda_i,
76 |                                                  lj_reg = lambda_j,
77 |                                                  batch_size=1,
78 |                                                  symmetric = self.symmetric,
79 |                                                  sgd_mode = sgd_mode)
80 | 
81 | 
82 |         # Call super.fit to start training
83 |         super(SLIM_BPR_Cython, self).fit_alreadyInitialized(epochs=epochs,
84 |                                     logFile=logFile,
85 |                                     URM_test=URM_test,
86 |                                     filterTopPop=filterTopPop,
87 |                                     minRatingsPerUser=minRatingsPerUser,
88 |                                     batch_size=batch_size,
89 |                                     validate_every_N_epochs=validate_every_N_epochs,
90 |                                     start_validation_after_N_epochs=start_validation_after_N_epochs,
91 |                                     lambda_i = lambda_i,
92 |                                     lambda_j = lambda_j,
93 |                                     learning_rate = learning_rate,
94 |                                     topK = topK)
95 | 
96 | 
97 | 
98 | 
99 |     def runCompilationScript(self):
100 | 
101 |         # Run compile script setting the working directory to ensure the compiled files are contained in the
102 |         # appropriate subfolder and not the project root
103 | 
104 |         compiledModuleSubfolder = "/SLIM_BPR/Cython"
105 |         #fileToCompile_list = ['Sparse_Matrix_CSR.pyx',
'SLIM_BPR_Cython_Epoch.pyx'] 106 | fileToCompile_list = ['SLIM_BPR_Cython_Epoch.pyx'] 107 | 108 | for fileToCompile in fileToCompile_list: 109 | 110 | command = ['python', 111 | 'compileCython.py', 112 | fileToCompile, 113 | 'build_ext', 114 | '--inplace' 115 | ] 116 | 117 | 118 | output = subprocess.check_output(' '.join(command), shell=True, cwd=os.getcwd() + compiledModuleSubfolder) 119 | 120 | try: 121 | 122 | command = ['cython', 123 | fileToCompile, 124 | '-a' 125 | ] 126 | 127 | output = subprocess.check_output(' '.join(command), shell=True, cwd=os.getcwd() + compiledModuleSubfolder) 128 | 129 | except: 130 | pass 131 | 132 | 133 | print("Compiled module saved in subfolder: {}".format(compiledModuleSubfolder)) 134 | 135 | # Command to run compilation script 136 | #python compileCython.py SLIM_BPR_Cython_Epoch.pyx build_ext --inplace 137 | 138 | # Command to generate html report 139 | # cython -a SLIM_BPR_Cython_Epoch.pyx 140 | 141 | 142 | def updateSimilarityMatrix(self): 143 | 144 | self.S = self.cythonEpoch.get_S() 145 | 146 | if self.sparse_weights: 147 | self.W_sparse = self.S 148 | else: 149 | self.W = self.S 150 | 151 | 152 | 153 | def epochIteration(self): 154 | 155 | self.cythonEpoch.epochIteration_Cython() 156 | 157 | 158 | 159 | 160 | def writeCurrentConfig(self, currentEpoch, results_run, logFile): 161 | 162 | current_config = {'learn_rate': self.learning_rate, 163 | 'topK_similarity': self.topK, 164 | 'epoch': currentEpoch, 165 | 'sgd_mode': self.sgd_mode} 166 | 167 | print("Test case: {}\nResults {}\n".format(current_config, results_run)) 168 | # print("Weights: {}\n".format(str(list(self.weights)))) 169 | 170 | sys.stdout.flush() 171 | 172 | if (logFile != None): 173 | logFile.write("Test case: {}, Results {}\n".format(current_config, results_run)) 174 | # logFile.write("Weights: {}\n".format(str(list(self.weights)))) 175 | logFile.flush() 176 | -------------------------------------------------------------------------------- /SLIM_BPR/Cython/SLIM_BPR_Cython_Epoch.cpython-36m-x86_64-linux-gnu.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MaurizioFD/RecSys_Course_2017/dfef3655df8ce32428458aafabfc94ed688231f6/SLIM_BPR/Cython/SLIM_BPR_Cython_Epoch.cpython-36m-x86_64-linux-gnu.so -------------------------------------------------------------------------------- /SLIM_BPR/Cython/build/lib.linux-x86_64-3.6/SLIM_BPR_Cython_Epoch.cpython-36m-x86_64-linux-gnu.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MaurizioFD/RecSys_Course_2017/dfef3655df8ce32428458aafabfc94ed688231f6/SLIM_BPR/Cython/build/lib.linux-x86_64-3.6/SLIM_BPR_Cython_Epoch.cpython-36m-x86_64-linux-gnu.so -------------------------------------------------------------------------------- /SLIM_BPR/Cython/build/temp.linux-x86_64-3.6/SLIM_BPR_Cython_Epoch.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MaurizioFD/RecSys_Course_2017/dfef3655df8ce32428458aafabfc94ed688231f6/SLIM_BPR/Cython/build/temp.linux-x86_64-3.6/SLIM_BPR_Cython_Epoch.o -------------------------------------------------------------------------------- /SLIM_BPR/Cython/build/temp.linux-x86_64-3.6/Sparse_Matrix_CSR.o: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/MaurizioFD/RecSys_Course_2017/dfef3655df8ce32428458aafabfc94ed688231f6/SLIM_BPR/Cython/build/temp.linux-x86_64-3.6/Sparse_Matrix_CSR.o -------------------------------------------------------------------------------- /SLIM_BPR/Cython/build/temp.linux-x86_64-3.6/Sparse_Matrix_Tree_CSR.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MaurizioFD/RecSys_Course_2017/dfef3655df8ce32428458aafabfc94ed688231f6/SLIM_BPR/Cython/build/temp.linux-x86_64-3.6/Sparse_Matrix_Tree_CSR.o -------------------------------------------------------------------------------- /SLIM_BPR/Cython/build/temp.win-amd64-3.6/Release/SLIM_BPR_Cython_Epoch.cp36-win_amd64.def: -------------------------------------------------------------------------------- 1 | LIBRARY SLIM_BPR_Cython_Epoch.cp36-win_amd64.pyd 2 | EXPORTS 3 | PyInit_SLIM_BPR_Cython_Epoch 4 | -------------------------------------------------------------------------------- /SLIM_BPR/Cython/build/temp.win-amd64-3.6/Release/Sparse_Matrix_CSR.cp36-win_amd64.def: -------------------------------------------------------------------------------- 1 | LIBRARY Sparse_Matrix_CSR.cp36-win_amd64.pyd 2 | EXPORTS 3 | PyInit_Sparse_Matrix_CSR 4 | -------------------------------------------------------------------------------- /SLIM_BPR/Cython/build/temp.win-amd64-3.6/Release/Sparse_Matrix_Tree_CSR.cp36-win_amd64.def: -------------------------------------------------------------------------------- 1 | LIBRARY Sparse_Matrix_Tree_CSR.cp36-win_amd64.pyd 2 | EXPORTS 3 | PyInit_Sparse_Matrix_Tree_CSR 4 | -------------------------------------------------------------------------------- /SLIM_BPR/Cython/build/temp.win-amd64-3.6/Release/slim_bpr_cython_epoch.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MaurizioFD/RecSys_Course_2017/dfef3655df8ce32428458aafabfc94ed688231f6/SLIM_BPR/Cython/build/temp.win-amd64-3.6/Release/slim_bpr_cython_epoch.o -------------------------------------------------------------------------------- /SLIM_BPR/Cython/build/temp.win-amd64-3.6/Release/sparse_matrix_csr.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MaurizioFD/RecSys_Course_2017/dfef3655df8ce32428458aafabfc94ed688231f6/SLIM_BPR/Cython/build/temp.win-amd64-3.6/Release/sparse_matrix_csr.o -------------------------------------------------------------------------------- /SLIM_BPR/Cython/build/temp.win-amd64-3.6/Release/sparse_matrix_tree_csr.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MaurizioFD/RecSys_Course_2017/dfef3655df8ce32428458aafabfc94ed688231f6/SLIM_BPR/Cython/build/temp.win-amd64-3.6/Release/sparse_matrix_tree_csr.o -------------------------------------------------------------------------------- /SLIM_BPR/Cython/compileCython.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on 16/07/2017 5 | 6 | @author: Maurizio Ferrari Dacrema 7 | """ 8 | 9 | 10 | try: 11 | from setuptools import setup 12 | from setuptools import Extension 13 | except ImportError: 14 | from distutils.core import setup 15 | from distutils.extension import Extension 16 | 17 | 18 | from Cython.Distutils import build_ext 19 | 20 | 21 | import numpy 22 | 23 | import sys 24 | import re 25 | 26 | 27 | if 
len(sys.argv) != 4: 28 | raise ValueError("Wrong number of paramethers received. Expected 4, got {}".format(sys.argv)) 29 | 30 | 31 | #fileToCompile = 'MF_BPR_Cython_Epoch.pyx' 32 | 33 | # Get the name of the file to compile 34 | fileToCompile = sys.argv[1] 35 | # Remove the argument from sys argv in order for it to contain only what setup needs 36 | del sys.argv[1] 37 | 38 | extensionName = re.sub("\.pyx", "", fileToCompile) 39 | 40 | 41 | ext_modules = Extension(extensionName, 42 | [fileToCompile], 43 | extra_compile_args=['-O3'], 44 | include_dirs=[numpy.get_include(),], 45 | ) 46 | 47 | setup( 48 | cmdclass={'build_ext': build_ext}, 49 | ext_modules=[ext_modules] 50 | ) 51 | 52 | -------------------------------------------------------------------------------- /SLIM_BPR/SLIM_BPR_Python.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on 28 June 2017 5 | 6 | @author: Maurizio Ferrari Dacrema 7 | """ 8 | 9 | import sys 10 | import time 11 | 12 | import numpy as np 13 | import scipy.sparse as sps 14 | from Base.Recommender_utils import similarityMatrixTopK 15 | from Base.Similarity_Matrix_Recommender import Similarity_Matrix_Recommender 16 | from scipy.special import expit 17 | 18 | from BPR.BPR_Sampling import BPR_Sampling 19 | from Base.Recommender import Recommender 20 | 21 | 22 | def sigmoidFunction(x): 23 | return 1 / (1 + np.exp(-x)) 24 | 25 | 26 | class SLIM_BPR_Python(BPR_Sampling, Similarity_Matrix_Recommender, Recommender): 27 | 28 | def __init__(self, URM_train, positive_threshold=4, sparse_weights = False): 29 | super(SLIM_BPR_Python, self).__init__() 30 | 31 | """ 32 | Creates a new object for training and testing a Bayesian 33 | Personalised Ranking (BPR) SLIM 34 | 35 | This object uses the Theano library for training the model, meaning 36 | it can run on a GPU through CUDA. To make sure your Theano 37 | install is using the GPU, see: 38 | 39 | http://deeplearning.net/software/theano/tutorial/using_gpu.html 40 | 41 | When running on CPU, we recommend using OpenBLAS. 42 | 43 | http://www.openblas.net/ 44 | """ 45 | """ 46 | if objective!='sigmoid' and objective != 'logsigmoid': 47 | raise ValueError("Objective not valid. Acceptable values are 'sigmoid' and 'logsigmoid'. 
Provided value was '{}'".format(objective)) 48 | self.objective = objective 49 | """ 50 | 51 | self.URM_train = URM_train 52 | self.n_users = URM_train.shape[0] 53 | self.n_items = URM_train.shape[1] 54 | self.normalize = False 55 | self.sparse_weights = sparse_weights 56 | self.positive_threshold = positive_threshold 57 | 58 | 59 | self.URM_mask = self.URM_train.copy() 60 | 61 | self.URM_mask.data = self.URM_mask.data >= self.positive_threshold 62 | self.URM_mask.eliminate_zeros() 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | def updateSimilarityMatrix(self): 71 | 72 | if self.topK != False: 73 | if self.sparse_weights: 74 | self.W_sparse = similarityMatrixTopK(self.S.T, k=self.topK, forceSparseOutput=True) 75 | else: 76 | self.W = similarityMatrixTopK(self.S.T, k=self.topK, forceSparseOutput=False) 77 | 78 | else: 79 | if self.sparse_weights: 80 | self.W_sparse = sps.csr_matrix(self.S.T) 81 | else: 82 | self.W = self.S.T 83 | 84 | 85 | 86 | def updateWeightsLoop(self, u, i, j): 87 | """ 88 | Define the update rules to be used in the train phase and compile the train function 89 | :return: 90 | """ 91 | 92 | x_ui = self.S[i] 93 | x_uj = self.S[j] 94 | 95 | # The difference is computed on the whole row not only on the user_seen items 96 | # The performance seems to be higher this way 97 | x_uij = x_ui - x_uj 98 | 99 | # Sigmoid whose argument is minus in order for the exponent of the exponential to be positive 100 | sigmoid = expit(-x_uij) 101 | 102 | delta_i = sigmoid-self.lambda_i*self.S[i] 103 | delta_j = -sigmoid-self.lambda_j*self.S[j] 104 | 105 | # Since a shared variable may be the target of only one update rule 106 | # All the required updates are chained inside a subtensor 107 | for sampleIndex in range(self.batch_size): 108 | 109 | user_id = u[sampleIndex] 110 | 111 | for item_id in self.userSeenItems[user_id]: 112 | # Do not update items i 113 | if item_id != i[sampleIndex]: 114 | self.S[i] += self.learning_rate * delta_i 115 | 116 | # Do not update j 117 | if item_id != j[sampleIndex]: 118 | self.S[j] += self.learning_rate * delta_j 119 | 120 | 121 | def updateWeightsBatch(self, u, i, j): 122 | """ 123 | Define the update rules to be used in the train phase and compile the train function 124 | :return: 125 | """ 126 | 127 | if self.batch_size==1: 128 | seenItems = self.userSeenItems[u[0]] 129 | 130 | x_ui = self.S[i, seenItems] 131 | x_uj = self.S[j, seenItems] 132 | 133 | # The difference is computed on the user_seen items 134 | x_uij = x_ui - x_uj 135 | 136 | #x_uij = x_uij[0,seenItems] 137 | x_uij = np.sum(x_uij) 138 | 139 | # log(sigm(+x_uij)) 140 | gradient = 1 / (1 + np.exp(x_uij)) 141 | 142 | # sigm(-x_uij) 143 | #exp = np.exp(x_uij) 144 | #gradient = exp/np.power(exp+1, 2) 145 | 146 | else: 147 | 148 | x_ui = self.S[i] 149 | x_uj = self.S[j] 150 | 151 | # The difference is computed on the user_seen items 152 | x_uij = x_ui - x_uj 153 | 154 | x_uij = self.URM_mask[u,:].dot(x_uij.T).diagonal() 155 | 156 | gradient = np.sum(1 / (1 + np.exp(x_uij))) / self.batch_size 157 | 158 | # Sigmoid whose argument is minus in order for the exponent of the exponential to be positive 159 | # Best performance with: gradient = np.sum(expit(-x_uij)) / self.batch_size 160 | #gradient = np.sum(x_uij) / self.batch_size 161 | #gradient = expit(-gradient) 162 | #gradient = np.sum(expit(-x_uij)) / self.batch_size 163 | #gradient = np.sum(np.log(expit(x_uij))) / self.batch_size 164 | #gradient = np.sum(1/(1+np.exp(x_uij))) / self.batch_size 165 | #gradient = min(10, max(-10, gradient))+10 166 | 167 | 
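        # For a single sample, `gradient` computed above equals sigmoid(-x_uij) for the sampled
        # triple (u, i, j); with a larger batch it is the average over the sampled triples.
        # The BPR update below increases S[i] and decreases S[j] (by learning_rate * gradient)
        # only on the items seen by the sampled user(s), and zeroes the diagonal entries
        # S[i, i] and S[j, j] so that an item is never used to recommend itself.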
168 | if self.batch_size==1: 169 | 170 | userSeenItems = self.userSeenItems[u[0]] 171 | 172 | self.S[i, userSeenItems] += self.learning_rate * gradient 173 | self.S[i, i] = 0 174 | 175 | self.S[j, userSeenItems] -= self.learning_rate * gradient 176 | self.S[j, j] = 0 177 | 178 | 179 | 180 | else: 181 | itemsToUpdate = np.array(self.URM_mask[u, :].sum(axis=0) > 0).ravel() 182 | 183 | # Do not update items i, set all user-posItem to false 184 | # itemsToUpdate[i] = False 185 | 186 | self.S[i] += self.learning_rate * gradient * itemsToUpdate 187 | self.S[i, i] = 0 188 | 189 | # Now update i, setting all user-posItem to true 190 | # Do not update j 191 | 192 | # itemsToUpdate[i] = True 193 | # itemsToUpdate[j] = False 194 | 195 | self.S[j] -= self.learning_rate * gradient * itemsToUpdate 196 | self.S[j, j] = 0 197 | 198 | def fit(self, epochs=30, logFile=None, URM_test=None, filterTopPop = False, minRatingsPerUser=1, 199 | batch_size = 1000, validate_every_N_epochs = 1, start_validation_after_N_epochs = 0, 200 | lambda_i = 0.0025, lambda_j = 0.00025, learning_rate = 0.05, topK = False): 201 | 202 | 203 | 204 | if self.sparse_weights: 205 | self.S = sps.csr_matrix((self.n_items, self.n_items), dtype=np.float32) 206 | else: 207 | self.S = np.zeros((self.n_items, self.n_items)).astype('float32') 208 | 209 | 210 | self.initializeFastSampling(positive_threshold=self.positive_threshold) 211 | 212 | 213 | self.fit_alreadyInitialized(epochs=epochs, 214 | logFile=logFile, 215 | URM_test=URM_test, 216 | filterTopPop = filterTopPop, 217 | minRatingsPerUser=minRatingsPerUser, 218 | batch_size = batch_size, 219 | validate_every_N_epochs = validate_every_N_epochs, 220 | start_validation_after_N_epochs = start_validation_after_N_epochs, 221 | lambda_i = lambda_i, 222 | lambda_j = lambda_j, 223 | learning_rate = learning_rate, 224 | topK = topK) 225 | 226 | 227 | 228 | def fit_alreadyInitialized(self, epochs=30, logFile=None, URM_test=None, filterTopPop = False, minRatingsPerUser=1, 229 | batch_size = 1000, validate_every_N_epochs = 1, start_validation_after_N_epochs = 0, 230 | lambda_i = 0.0025, lambda_j = 0.00025, learning_rate = 0.05, topK = False): 231 | """ 232 | Fits the model performing a round of testing at the end of each epoch 233 | :param epochs: 234 | :param batch_size: 235 | :param logFile: 236 | :param URM_test: 237 | :return: 238 | """ 239 | 240 | 241 | if(topK != False and topK<1): 242 | raise ValueError("TopK not valid. Acceptable values are either False or a positive integer value. 
Provided value was '{}'".format(topK)) 243 | self.topK = topK 244 | 245 | 246 | self.batch_size = batch_size 247 | self.lambda_i = lambda_i 248 | self.lambda_j = lambda_j 249 | self.learning_rate = learning_rate 250 | 251 | 252 | start_time_train = time.time() 253 | 254 | for currentEpoch in range(epochs): 255 | 256 | start_time_epoch = time.time() 257 | 258 | if self.batch_size>0: 259 | self.epochIteration() 260 | else: 261 | print("No batch not available") 262 | 263 | 264 | if (URM_test is not None) and ((currentEpoch +1 )% validate_every_N_epochs == 0) and \ 265 | currentEpoch >= start_validation_after_N_epochs: 266 | 267 | print("Evaluation begins") 268 | 269 | self.updateSimilarityMatrix() 270 | 271 | results_run = self.evaluateRecommendations(URM_test, filterTopPop=filterTopPop, 272 | minRatingsPerUser=minRatingsPerUser) 273 | 274 | self.writeCurrentConfig(currentEpoch, results_run, logFile) 275 | 276 | print("Epoch {} of {} complete in {:.2f} minutes".format(currentEpoch+1, epochs, 277 | float(time.time() - start_time_epoch) / 60)) 278 | 279 | 280 | # Fit with no validation 281 | else: 282 | print("Epoch {} of {} complete in {:.2f} minutes".format(currentEpoch+1, epochs, 283 | float(time.time() - start_time_epoch) / 60)) 284 | 285 | self.updateSimilarityMatrix() 286 | 287 | print("Fit completed in {:.2f} minutes".format(float(time.time() - start_time_train) / 60)) 288 | 289 | sys.stdout.flush() 290 | 291 | 292 | 293 | def writeCurrentConfig(self, currentEpoch, results_run, logFile): 294 | 295 | current_config = {'lambda_i': self.lambda_i, 296 | 'lambda_j': self.lambda_j, 297 | 'batch_size': self.batch_size, 298 | 'learn_rate': self.learning_rate, 299 | 'topK_similarity': self.topK, 300 | 'epoch': currentEpoch} 301 | 302 | print("Test case: {}\nResults {}\n".format(current_config, results_run)) 303 | # print("Weights: {}\n".format(str(list(self.weights)))) 304 | 305 | sys.stdout.flush() 306 | 307 | if (logFile != None): 308 | logFile.write("Test case: {}, Results {}\n".format(current_config, results_run)) 309 | # logFile.write("Weights: {}\n".format(str(list(self.weights)))) 310 | logFile.flush() 311 | 312 | 313 | 314 | def epochIteration(self): 315 | 316 | # Get number of available interactions 317 | numPositiveIteractions = int(self.URM_mask.nnz*1) 318 | 319 | start_time_epoch = time.time() 320 | start_time_batch = time.time() 321 | 322 | totalNumberOfBatch = int(numPositiveIteractions/self.batch_size)+1 323 | 324 | # Uniform user sampling without replacement 325 | for numCurrentBatch in range(totalNumberOfBatch): 326 | 327 | sgd_users, sgd_pos_items, sgd_neg_items = self.sampleBatch() 328 | 329 | self.updateWeightsBatch( 330 | sgd_users, 331 | sgd_pos_items, 332 | sgd_neg_items 333 | ) 334 | 335 | """ 336 | self.updateWeightsLoop( 337 | sgd_users, 338 | sgd_pos_items, 339 | sgd_neg_items 340 | ) 341 | """ 342 | 343 | if(time.time() - start_time_batch >= 30 or numCurrentBatch==totalNumberOfBatch-1): 344 | print("Processed {} ( {:.2f}% ) in {:.2f} seconds. 
Sample per second: {:.0f}".format( 345 | numCurrentBatch*self.batch_size, 346 | 100.0* float(numCurrentBatch*self.batch_size)/numPositiveIteractions, 347 | time.time() - start_time_batch, 348 | float(numCurrentBatch*self.batch_size + 1) / (time.time() - start_time_epoch))) 349 | 350 | sys.stdout.flush() 351 | sys.stderr.flush() 352 | 353 | start_time_batch = time.time() 354 | 355 | 356 | 357 | self.S[np.arange(0, self.n_items), np.arange(0, self.n_items)] = 0.0 358 | 359 | 360 | 361 | -------------------------------------------------------------------------------- /SLIM_RMSE/SLIM_RMSE.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | @author: Massimo Quadrana 5 | """ 6 | 7 | 8 | import numpy as np 9 | import scipy.sparse as sps 10 | from Base.Recommender import Recommender 11 | from Base.Recommender_utils import check_matrix 12 | from sklearn.linear_model import ElasticNet 13 | 14 | from Base.Similarity_Matrix_Recommender import Similarity_Matrix_Recommender 15 | import time, sys 16 | 17 | class SLIM_RMSE(Recommender, Similarity_Matrix_Recommender): 18 | """ 19 | Train a Sparse Linear Methods (SLIM) item similarity model. 20 | NOTE: ElasticNet solver is parallel, a single intance of SLIM_RMSE will 21 | make use of half the cores available 22 | 23 | See: 24 | Efficient Top-N Recommendation by Linear Regression, 25 | M. Levy and K. Jack, LSRS workshop at RecSys 2013. 26 | 27 | SLIM: Sparse linear methods for top-n recommender systems, 28 | X. Ning and G. Karypis, ICDM 2011. 29 | http://glaros.dtc.umn.edu/gkhome/fetch/papers/SLIM2011icdm.pdf 30 | """ 31 | 32 | def __init__(self, URM_train): 33 | 34 | super(SLIM_RMSE, self).__init__() 35 | 36 | self.URM_train = URM_train 37 | 38 | 39 | def __str__(self): 40 | return "SLIM (l1_penalty={},l2_penalty={},positive_only={})".format( 41 | self.l1_penalty, self.l2_penalty, self.positive_only 42 | ) 43 | 44 | def fit(self, l1_penalty=0.1, l2_penalty=0.1, positive_only=True, topK = 100): 45 | 46 | self.l1_penalty = l1_penalty 47 | self.l2_penalty = l2_penalty 48 | self.positive_only = positive_only 49 | self.l1_ratio = self.l1_penalty / (self.l1_penalty + self.l2_penalty) 50 | self.topK = topK 51 | 52 | X = check_matrix(self.URM_train, 'csc', dtype=np.float32) 53 | 54 | n_items = X.shape[1] 55 | 56 | # initialize the ElasticNet model 57 | self.model = ElasticNet(alpha=1.0, 58 | l1_ratio=self.l1_ratio, 59 | positive=self.positive_only, 60 | fit_intercept=False, 61 | copy_X=False, 62 | precompute=True, 63 | selection='random', 64 | max_iter=100, 65 | tol=1e-4) 66 | 67 | # we'll store the W matrix into a sparse csr_matrix 68 | # let's initialize the vectors used by the sparse.csc_matrix constructor 69 | values, rows, cols = [], [], [] 70 | start_time = time.time() 71 | start_time_printBatch = start_time 72 | 73 | # fit each item's factors sequentially (not in parallel) 74 | for currentItem in range(n_items): 75 | # get the target column 76 | y = X[:, currentItem].toarray() 77 | # set the j-th column of X to zero 78 | startptr = X.indptr[currentItem] 79 | endptr = X.indptr[currentItem + 1] 80 | bak = X.data[startptr: endptr].copy() 81 | X.data[startptr: endptr] = 0.0 82 | # fit one ElasticNet model per column 83 | self.model.fit(X, y) 84 | 85 | # self.model.coef_ contains the coefficient of the ElasticNet model 86 | # let's keep only the non-zero values 87 | #nnz_idx = self.model.coef_ > 0.0 88 | 89 | # Select topK values 90 | # Sorting is done in three 
steps. Faster than plain np.argsort for a large number of items 91 | # - Partition the data to extract the set of relevant items 92 | # - Sort only the relevant items 93 | # - Get the original item index 94 | relevant_items_partition = (-self.model.coef_).argpartition(self.topK)[0:self.topK] 95 | relevant_items_partition_sorting = np.argsort(-self.model.coef_[relevant_items_partition]) 96 | ranking = relevant_items_partition[relevant_items_partition_sorting] 97 | 98 | notZerosMask = self.model.coef_[ranking] > 0.0 99 | ranking = ranking[notZerosMask] 100 | 101 | values.extend(self.model.coef_[ranking]) 102 | rows.extend(ranking) 103 | cols.extend([currentItem]*len(ranking)) 104 | 105 | # finally, replace the original values of the j-th column 106 | X.data[startptr:endptr] = bak 107 | 108 | 109 | if time.time() - start_time_printBatch > 300: 110 | print("Processed {} ( {:.2f}% ) in {:.2f} minutes. Columns per second: {:.0f}".format( 111 | currentItem, 112 | 100.0* float(currentItem)/n_items, 113 | (time.time()-start_time)/60, 114 | float(currentItem)/(time.time()-start_time))) 115 | sys.stdout.flush() 116 | sys.stderr.flush() 117 | 118 | start_time_printBatch = time.time() 119 | 120 | 121 | # generate the sparse weight matrix 122 | self.W_sparse = sps.csc_matrix((values, (rows, cols)), shape=(n_items, n_items), dtype=np.float32) 123 | 124 | 125 | 126 | 127 | import multiprocessing 128 | from multiprocessing import Pool 129 | from functools import partial 130 | 131 | 132 | class MultiThreadSLIM_RMSE(SLIM_RMSE, Similarity_Matrix_Recommender): 133 | 134 | def __init__(self, URM_train): 135 | 136 | super(MultiThreadSLIM_RMSE, self).__init__(URM_train) 137 | 138 | def __str__(self): 139 | return "SLIM_mt (l1_penalty={},l2_penalty={},positive_only={},workers={})".format( 140 | self.l1_penalty, self.l2_penalty, self.positive_only, self.workers 141 | ) 142 | 143 | def _partial_fit(self, currentItem, X, topK): 144 | model = ElasticNet(alpha=1.0, 145 | l1_ratio=self.l1_ratio, 146 | positive=self.positive_only, 147 | fit_intercept=False, 148 | copy_X=False, 149 | precompute=True, 150 | selection='random', 151 | max_iter=100, 152 | tol=1e-4) 153 | 154 | # WARNING: make a copy of X to avoid race conditions on column j 155 | # TODO: We can probably come up with something better here. 
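# (A possible, untested refinement of the TODO above: instead of copying the
#  whole matrix on every call, each worker could reuse its own copy of X and,
#  like the sequential SLIM_RMSE.fit, zero out only the slice
#  X.indptr[currentItem]:X.indptr[currentItem + 1] before fitting and restore it
#  afterwards, avoiding one full-matrix copy per column.)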
156 | X_j = X.copy() 157 | # get the target column 158 | y = X_j[:, currentItem].toarray() 159 | # set the j-th column of X to zero 160 | X_j.data[X_j.indptr[currentItem]:X_j.indptr[currentItem + 1]] = 0.0 161 | # fit one ElasticNet model per column 162 | model.fit(X_j, y) 163 | # self.model.coef_ contains the coefficient of the ElasticNet model 164 | # let's keep only the non-zero values 165 | #nnz_idx = model.coef_ > 0.0 166 | 167 | relevant_items_partition = (-model.coef_).argpartition(topK)[0:topK] 168 | relevant_items_partition_sorting = np.argsort(-model.coef_[relevant_items_partition]) 169 | ranking = relevant_items_partition[relevant_items_partition_sorting] 170 | 171 | notZerosMask = model.coef_[ranking] > 0.0 172 | ranking = ranking[notZerosMask] 173 | 174 | values = model.coef_[ranking] 175 | rows = ranking 176 | cols = [currentItem] * len(ranking) 177 | 178 | return values, rows, cols 179 | 180 | def fit(self,l1_penalty=0.1, 181 | l2_penalty=0.1, 182 | positive_only=True, 183 | topK = 100, 184 | workers=multiprocessing.cpu_count()): 185 | 186 | 187 | self.l1_penalty = l1_penalty 188 | self.l2_penalty = l2_penalty 189 | self.positive_only = positive_only 190 | self.l1_ratio = self.l1_penalty / (self.l1_penalty + self.l2_penalty) 191 | self.topK = topK 192 | 193 | self.workers = workers 194 | 195 | 196 | 197 | 198 | self.URM_train = check_matrix(self.URM_train, 'csc', dtype=np.float32) 199 | n_items = self.URM_train.shape[1] 200 | # fit each item's factors in parallel 201 | 202 | # function object with part of the input (X and topK) pre-bound 203 | _pfit = partial(self._partial_fit, X=self.URM_train, topK=self.topK) 204 | 205 | # create a pool with the requested number of worker processes 206 | pool = Pool(processes=self.workers) 207 | 208 | # start the pool, passing the function (with its fixed inputs) 209 | # and the remaining, varying parameter (the item index) 210 | res = pool.map(_pfit, np.arange(n_items)) 211 | 212 | # res contains a vector of (values, rows, cols) tuples 213 | values, rows, cols = [], [], [] 214 | for values_, rows_, cols_ in res: 215 | values.extend(values_) 216 | rows.extend(rows_) 217 | cols.extend(cols_) 218 | 219 | # generate the sparse weight matrix 220 | self.W_sparse = sps.csc_matrix((values, (rows, cols)), shape=(n_items, n_items), dtype=np.float32) 221 | 222 | -------------------------------------------------------------------------------- /all_algorithms.py: -------------------------------------------------------------------------------- 1 | 2 | from SLIM_BPR.Cython.SLIM_BPR_Cython import SLIM_BPR_Cython 3 | from SLIM_RMSE.SLIM_RMSE import SLIM_RMSE 4 | 5 | from MatrixFactorization.Cython.MF_BPR_Cython import MF_BPR_Cython 6 | from MatrixFactorization.MatrixFactorization_RMSE import FunkSVD 7 | 8 | from KNN.user_knn_CF import UserKNNCFRecommender 9 | from KNN.item_knn_CF import ItemKNNCFRecommender 10 | from KNN.item_knn_CBF import ItemKNNCBFRecommender 11 | 12 | from data.Movielens10MReader import Movielens10MReader 13 | 14 | 15 | if __name__ == '__main__': 16 | 17 | 18 | dataReader = Movielens10MReader() 19 | 20 | URM_train = dataReader.get_URM_train() 21 | URM_validation = dataReader.get_URM_validation() 22 | URM_test = dataReader.get_URM_test() 23 | 24 | recommender_list = [] 25 | recommender_list.append(ItemKNNCFRecommender(URM_train)) 26 | recommender_list.append(UserKNNCFRecommender(URM_train)) 27 | recommender_list.append(MF_BPR_Cython(URM_train)) 28 | recommender_list.append(FunkSVD(URM_train)) 29 | recommender_list.append(SLIM_BPR_Cython(URM_train, 
sparse_weights=False)) 30 | recommender_list.append(SLIM_RMSE(URM_train)) 31 | 32 | 33 | 34 | for recommender in recommender_list: 35 | 36 | print("Algorithm: {}".format(recommender.__class__)) 37 | 38 | recommender.fit() 39 | 40 | results_run = recommender.evaluateRecommendations(URM_test, at=5, exclude_seen=True) 41 | print("Algorithm: {}, results: {}".format(recommender.__class__, results_run)) 42 | 43 | -------------------------------------------------------------------------------- /data/Movielens10MReader.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on 14/09/17 5 | 6 | @author: Maurizio Ferrari Dacrema 7 | """ 8 | 9 | 10 | import numpy as np 11 | import scipy.sparse as sps 12 | import zipfile 13 | 14 | from Base.Recommender_utils import removeZeroRatingRowAndCol 15 | 16 | 17 | def loadCSVintoSparse (filePath, header = False, separator="::"): 18 | 19 | values, rows, cols = [], [], [] 20 | 21 | fileHandle = open(filePath, "r") 22 | numCells = 0 23 | 24 | if header: 25 | fileHandle.readline() 26 | 27 | for line in fileHandle: 28 | numCells += 1 29 | if (numCells % 1000000 == 0): 30 | print("Processed {} cells".format(numCells)) 31 | 32 | if (len(line)) > 1: 33 | line = line.split(separator) 34 | 35 | line[-1] = line[-1].replace("\n", "") 36 | 37 | if not line[2] == "0" and not line[2] == "NaN": 38 | rows.append(int(line[0])) 39 | cols.append(int(line[1])) 40 | values.append(float(line[2])) 41 | 42 | fileHandle.close() 43 | 44 | return sps.csr_matrix((values, (rows, cols)), dtype=np.float32) 45 | 46 | 47 | 48 | def saveSparseIntoCSV (filePath, sparse_matrix, separator=","): 49 | 50 | sparse_matrix = sparse_matrix.tocoo() 51 | 52 | fileHandle = open(filePath, "w") 53 | 54 | for index in range(len(sparse_matrix.data)): 55 | fileHandle.write("{row}{separator}{col}{separator}{value}\n".format( 56 | row = sparse_matrix.row[index], col = sparse_matrix.col[index], value = sparse_matrix.data[index], 57 | separator = separator)) 58 | 59 | 60 | 61 | 62 | class Movielens10MReader(object): 63 | 64 | def __init__(self, splitTrainTest = False, splitTrainTestValidation =[0.6, 0.2, 0.2] , loadPredefinedTrainTest = True): 65 | 66 | super(Movielens10MReader, self).__init__() 67 | 68 | if sum(splitTrainTestValidation) != 1.0 or len(splitTrainTestValidation) != 3: 69 | raise ValueError("Movielens10MReader: splitTrainTestValidation must be a probability distribution over Train, Test and Validation") 70 | 71 | print("Movielens10MReader: loading data...") 72 | 73 | dataSubfolder = "./data/" 74 | 75 | dataFile = zipfile.ZipFile(dataSubfolder + "movielens_10m.zip") 76 | URM_path = dataFile.extract("ml-10M100K/ratings.dat", path=dataSubfolder) 77 | 78 | 79 | if not loadPredefinedTrainTest: 80 | self.URM_all = loadCSVintoSparse(URM_path, separator="::") 81 | self.URM_all = removeZeroRatingRowAndCol(self.URM_all) 82 | 83 | else: 84 | 85 | try: 86 | self.URM_train = sps.load_npz(dataSubfolder + "URM_train.npz") 87 | self.URM_test = sps.load_npz(dataSubfolder + "URM_test.npz") 88 | self.URM_validation = sps.load_npz(dataSubfolder + "URM_validation.npz") 89 | 90 | return 91 | 92 | except FileNotFoundError: 93 | # Rebuild split 94 | print("Movielens10MReader: URM_train or URM_test or URM_validation not found. 
Building new ones") 95 | 96 | splitTrainTest = True 97 | self.URM_all = loadCSVintoSparse(URM_path) 98 | self.URM_all = removeZeroRatingRowAndCol(self.URM_all) 99 | 100 | 101 | 102 | if splitTrainTest: 103 | 104 | self.URM_all = self.URM_all.tocoo() 105 | 106 | numInteractions= len(self.URM_all.data) 107 | 108 | split = np.random.choice([1, 2, 3], numInteractions, p=splitTrainTestValidation) 109 | 110 | 111 | trainMask = split == 1 112 | self.URM_train = sps.coo_matrix((self.URM_all.data[trainMask], (self.URM_all.row[trainMask], self.URM_all.col[trainMask]))) 113 | self.URM_train = self.URM_train.tocsr() 114 | 115 | testMask = split == 2 116 | 117 | self.URM_test = sps.coo_matrix((self.URM_all.data[testMask], (self.URM_all.row[testMask], self.URM_all.col[testMask]))) 118 | self.URM_test = self.URM_test.tocsr() 119 | 120 | validationMask = split == 3 121 | 122 | self.URM_validation = sps.coo_matrix((self.URM_all.data[validationMask], (self.URM_all.row[validationMask], self.URM_all.col[validationMask]))) 123 | self.URM_validation = self.URM_validation.tocsr() 124 | 125 | del self.URM_all 126 | 127 | print("Movielens10MReader: saving URM_train and URM_test") 128 | sps.save_npz(dataSubfolder + "URM_train.npz", self.URM_train) 129 | sps.save_npz(dataSubfolder + "URM_test.npz", self.URM_test) 130 | sps.save_npz(dataSubfolder + "URM_validation.npz", self.URM_validation) 131 | 132 | print("Movielens10MReader: loading complete") 133 | 134 | 135 | 136 | 137 | def get_URM_train(self): 138 | return self.URM_train 139 | 140 | def get_URM_test(self): 141 | return self.URM_test 142 | 143 | def get_URM_validation(self): 144 | return self.URM_validation 145 | -------------------------------------------------------------------------------- /data/URM_test.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MaurizioFD/RecSys_Course_2017/dfef3655df8ce32428458aafabfc94ed688231f6/data/URM_test.npz -------------------------------------------------------------------------------- /data/URM_train.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MaurizioFD/RecSys_Course_2017/dfef3655df8ce32428458aafabfc94ed688231f6/data/URM_train.npz -------------------------------------------------------------------------------- /data/URM_validation.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MaurizioFD/RecSys_Course_2017/dfef3655df8ce32428458aafabfc94ed688231f6/data/URM_validation.npz -------------------------------------------------------------------------------- /data/movielens_10m.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MaurizioFD/RecSys_Course_2017/dfef3655df8ce32428458aafabfc94ed688231f6/data/movielens_10m.zip -------------------------------------------------------------------------------- /run_SLIM_BPR.py: -------------------------------------------------------------------------------- 1 | 2 | from SLIM_BPR.Cython.SLIM_BPR_Cython import SLIM_BPR_Cython 3 | from MatrixFactorization.Cython.MF_BPR_Cython import MF_BPR_Cython 4 | from data.Movielens10MReader import Movielens10MReader 5 | 6 | 7 | def run_SLIM(): 8 | 9 | dataReader = Movielens10MReader() 10 | 11 | URM_train = dataReader.get_URM_train() 12 | URM_test = dataReader.get_URM_test() 13 | 14 | recommender = SLIM_BPR_Cython(URM_train, recompile_cython=False, positive_threshold=4, 
sparse_weights=True) 15 | #recommender = MF_BPR_Cython(URM_train, recompile_cython=False, positive_threshold=4) 16 | 17 | logFile = open("Result_log.txt", "a") 18 | 19 | 20 | recommender.fit(epochs=2, validate_every_N_epochs=1, URM_test=URM_test, 21 | logFile=logFile, batch_size=1, sgd_mode='rmsprop', learning_rate=1e-4) 22 | 23 | 24 | results_run = recommender.evaluateRecommendations(URM_test, at=5) 25 | print(results_run) 26 | 27 | 28 | run_SLIM() -------------------------------------------------------------------------------- /slides/20161219_BPR.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MaurizioFD/RecSys_Course_2017/dfef3655df8ce32428458aafabfc94ed688231f6/slides/20161219_BPR.pptx -------------------------------------------------------------------------------- /slides/2017_MF.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MaurizioFD/RecSys_Course_2017/dfef3655df8ce32428458aafabfc94ed688231f6/slides/2017_MF.pdf -------------------------------------------------------------------------------- /slides/2017_MF.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MaurizioFD/RecSys_Course_2017/dfef3655df8ce32428458aafabfc94ed688231f6/slides/2017_MF.pptx -------------------------------------------------------------------------------- /slides/Amazon AWS.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MaurizioFD/RecSys_Course_2017/dfef3655df8ce32428458aafabfc94ed688231f6/slides/Amazon AWS.pdf -------------------------------------------------------------------------------- /slides/FM.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MaurizioFD/RecSys_Course_2017/dfef3655df8ce32428458aafabfc94ed688231f6/slides/FM.pdf -------------------------------------------------------------------------------- /slides/FunkSVD - 2006.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MaurizioFD/RecSys_Course_2017/dfef3655df8ce32428458aafabfc94ed688231f6/slides/FunkSVD - 2006.pdf -------------------------------------------------------------------------------- /slides/hu_koren_volinsky.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MaurizioFD/RecSys_Course_2017/dfef3655df8ce32428458aafabfc94ed688231f6/slides/hu_koren_volinsky.pdf -------------------------------------------------------------------------------- /slides/koren_sdv++.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MaurizioFD/RecSys_Course_2017/dfef3655df8ce32428458aafabfc94ed688231f6/slides/koren_sdv++.pdf -------------------------------------------------------------------------------- /slides/rendle_bpr.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MaurizioFD/RecSys_Course_2017/dfef3655df8ce32428458aafabfc94ed688231f6/slides/rendle_bpr.pdf --------------------------------------------------------------------------------
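
The top-K selection used in SLIM_RMSE.fit and MultiThreadSLIM_RMSE._partial_fit compresses the three steps its comment describes (partition, sort only the selected entries, map back to the original item indices) into three dense lines. The snippet below is a minimal, self-contained sketch of the same selection on a random vector; the names coef and topK and the array size are illustrative stand-ins rather than values taken from the repository.

import numpy as np

np.random.seed(1234)
coef = np.random.randn(1000)   # stands in for model.coef_, one column of item-item weights
topK = 100

# Step 1: partition so that the indices of the topK largest coefficients
# occupy the first topK positions, without fully sorting the array
relevant_items_partition = (-coef).argpartition(topK)[0:topK]

# Step 2: sort only those topK candidates by descending coefficient
relevant_items_partition_sorting = np.argsort(-coef[relevant_items_partition])

# Step 3: map back to the original item indices
ranking = relevant_items_partition[relevant_items_partition_sorting]

# as in the repository code, keep only the strictly positive weights
ranking = ranking[coef[ranking] > 0.0]

# sanity check: the result matches a full argsort restricted to its positive entries
full_ranking = np.argsort(-coef)[0:topK]
full_ranking = full_ranking[coef[full_ranking] > 0.0]
assert np.array_equal(ranking, full_ranking)

The payoff is asymptotic: np.argpartition runs in roughly linear time, while a full np.argsort is O(n log n), so the three-step version helps when the number of items is large and topK is comparatively small.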