├── .gitignore
├── BPR
└── BPR_Sampling.py
├── Base
├── Cython
│   ├── compileCython.py
│   ├── cosine_similarity.c
│   ├── cosine_similarity.cpython-36m-x86_64-linux-gnu.so
│   ├── cosine_similarity.html
│   └── cosine_similarity.pyx
├── Recommender.py
├── Recommender_utils.py
├── Recommender_utils_Test.py
├── Similarity_Matrix_Recommender.py
├── cosine_similarity.py
├── cosine_similarity_test.py
└── metrics.py
├── KNN
├── item_knn_CBF.py
├── item_knn_CF.py
├── item_knn_custom_Similarity.py
└── user_knn_CF.py
├── MatrixFactorization
├── Cython
│   ├── MF_BPR_Cython.py
│   ├── MF_BPR_Cython_Epoch.c
│   ├── MF_BPR_Cython_Epoch.cpython-36m-x86_64-linux-gnu.so
│   ├── MF_BPR_Cython_Epoch.html
│   ├── MF_BPR_Cython_Epoch.pyx
│   ├── MF_RMSE.c
│   ├── MF_RMSE.cpython-36m-x86_64-linux-gnu.so
│   ├── MF_RMSE.html
│   ├── MF_RMSE.pyx
│   ├── build
│   │   └── temp.linux-x86_64-3.6
│   │   │   └── MF_BPR_Cython_Epoch.o
│   └── compileCython.py
└── MatrixFactorization_RMSE.py
├── README.md
├── SLIM_BPR
├── Cython
│   ├── SLIM_BPR_Cython.py
│   ├── SLIM_BPR_Cython_Epoch.c
│   ├── SLIM_BPR_Cython_Epoch.cpython-36m-x86_64-linux-gnu.so
│   ├── SLIM_BPR_Cython_Epoch.html
│   ├── SLIM_BPR_Cython_Epoch.pyx
│   ├── build
│   │   ├── lib.linux-x86_64-3.6
│   │   │   └── SLIM_BPR_Cython_Epoch.cpython-36m-x86_64-linux-gnu.so
│   │   ├── temp.linux-x86_64-3.6
│   │   │   ├── SLIM_BPR_Cython_Epoch.o
│   │   │   ├── Sparse_Matrix_CSR.o
│   │   │   └── Sparse_Matrix_Tree_CSR.o
│   │   └── temp.win-amd64-3.6
│   │   │   └── Release
│   │   │   ├── SLIM_BPR_Cython_Epoch.cp36-win_amd64.def
│   │   │   ├── Sparse_Matrix_CSR.cp36-win_amd64.def
│   │   │   ├── Sparse_Matrix_Tree_CSR.cp36-win_amd64.def
│   │   │   ├── slim_bpr_cython_epoch.o
│   │   │   ├── sparse_matrix_csr.o
│   │   │   └── sparse_matrix_tree_csr.o
│   └── compileCython.py
└── SLIM_BPR_Python.py
├── SLIM_RMSE
└── SLIM_RMSE.py
├── all_algorithms.py
├── data
├── Movielens10MReader.py
├── URM_test.npz
├── URM_train.npz
├── URM_validation.npz
└── movielens_10m.zip
├── run_SLIM_BPR.py
└── slides
├── 20161219_BPR.pptx
├── 2017_MF.pdf
├── 2017_MF.pptx
├── Amazon AWS.pdf
├── FM.pdf
├── FunkSVD - 2006.pdf
├── hu_koren_volinsky.pdf
├── koren_sdv++.pdf
└── rendle_bpr.pdf
/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # Distribution / packaging 7 | .Python 8 | env/ 9 | build/ 10 | develop-eggs/ 11 | dist/ 12 | downloads/ 13 | eggs/ 14 | .eggs/ 15 | lib/ 16 | lib64/ 17 | parts/ 18 | sdist/ 19 | var/ 20 | wheels/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | 25 | # PyInstaller 26 | # Usually these files are written by a python script from a template 27 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
28 | *.manifest 29 | *.spec 30 | 31 | # Installer logs 32 | pip-log.txt 33 | pip-delete-this-directory.txt 34 | 35 | # Unit test / coverage reports 36 | htmlcov/ 37 | .tox/ 38 | .coverage 39 | .coverage.* 40 | .cache 41 | nosetests.xml 42 | coverage.xml 43 | *.cover 44 | .hypothesis/ 45 | 46 | # Translations 47 | *.mo 48 | *.pot 49 | 50 | # Django stuff: 51 | *.log 52 | local_settings.py 53 | 54 | # Flask stuff: 55 | instance/ 56 | .webassets-cache 57 | 58 | # Scrapy stuff: 59 | .scrapy 60 | 61 | # Sphinx documentation 62 | docs/_build/ 63 | 64 | # PyBuilder 65 | target/ 66 | 67 | # Jupyter Notebook 68 | .ipynb_checkpoints 69 | 70 | # pyenv 71 | .python-version 72 | 73 | # celery beat schedule file 74 | celerybeat-schedule 75 | 76 | # SageMath parsed files 77 | *.sage.py 78 | 79 | # dotenv 80 | .env 81 | 82 | # virtualenv 83 | .venv 84 | venv/ 85 | ENV/ 86 | 87 | # Spyder project settings 88 | .spyderproject 89 | .spyproject 90 | 91 | # Rope project settings 92 | .ropeproject 93 | 94 | # mkdocs documentation 95 | /site 96 | 97 | # mypy 98 | .mypy_cache/ 99 | -------------------------------------------------------------------------------- /BPR/BPR_Sampling.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on 5/09/2017 5 | 6 | @author: Maurizio Ferrari Dacrema 7 | """ 8 | 9 | import numpy as np 10 | 11 | 12 | class BPR_Sampling(object): 13 | 14 | def __init__(self): 15 | super(BPR_Sampling, self).__init__() 16 | 17 | 18 | def sampleUser(self): 19 | """ 20 | Sample a user that has viewed at least one and not all items 21 | :return: user_id 22 | """ 23 | while (True): 24 | 25 | user_id = np.random.randint(0, self.n_users) 26 | numSeenItems = self.URM_train[user_id].nnz 27 | 28 | if (numSeenItems > 0 and numSeenItems < self.n_items): 29 | return user_id 30 | 31 | 32 | def sampleItemPair(self, user_id): 33 | """ 34 | Returns for the given user a random seen item and a random not seen item 35 | :param user_id: 36 | :return: pos_item_id, neg_item_id 37 | """ 38 | 39 | userSeenItems = self.URM_train[user_id].indices 40 | 41 | pos_item_id = userSeenItems[np.random.randint(0, len(userSeenItems))] 42 | 43 | while (True): 44 | 45 | neg_item_id = np.random.randint(0, self.n_items) 46 | 47 | if (neg_item_id not in userSeenItems): 48 | return pos_item_id, neg_item_id 49 | 50 | 51 | def sampleTriple(self): 52 | """ 53 | Randomly samples a user and then samples randomly a seen and not seen item 54 | :return: user_id, pos_item_id, neg_item_id 55 | """ 56 | 57 | user_id = self.sampleUser() 58 | pos_item_id, neg_item_id = self.sampleItemPair(user_id) 59 | 60 | return user_id, pos_item_id, neg_item_id 61 | 62 | 63 | def initializeFastSampling(self, positive_threshold=3): 64 | print("Initializing fast sampling") 65 | 66 | self.eligibleUsers = [] 67 | self.userSeenItems = dict() 68 | 69 | # Select only positive interactions 70 | URM_train_positive = self.URM_train.multiply(self.URM_train>positive_threshold) 71 | 72 | for user_id in range(self.n_users): 73 | 74 | if (URM_train_positive[user_id].nnz > 0): 75 | self.eligibleUsers.append(user_id) 76 | self.userSeenItems[user_id] = URM_train_positive[user_id].indices 77 | 78 | self.eligibleUsers = np.array(self.eligibleUsers) 79 | 80 | 81 | def sampleBatch(self): 82 | user_id_list = np.random.choice(self.eligibleUsers, size=(self.batch_size)) 83 | pos_item_id_list = [None]*self.batch_size 84 | neg_item_id_list = [None]*self.batch_size 85 | 86 | for 
sample_index in range(self.batch_size): 87 | user_id = user_id_list[sample_index] 88 | 89 | pos_item_id_list[sample_index] = np.random.choice(self.userSeenItems[user_id]) 90 | 91 | negItemSelected = False 92 | 93 | # It's faster to just try again than to build a mapping of the non-seen items 94 | # for every user 95 | while (not negItemSelected): 96 | neg_item_id = np.random.randint(0, self.n_items) 97 | 98 | if (neg_item_id not in self.userSeenItems[user_id]): 99 | negItemSelected = True 100 | neg_item_id_list[sample_index] = neg_item_id 101 | 102 | return user_id_list, pos_item_id_list, neg_item_id_list 103 | 104 | -------------------------------------------------------------------------------- /Base/Cython/compileCython.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on 16/07/2017 5 | 6 | @author: Maurizio Ferrari Dacrema 7 | """ 8 | 9 | 10 | try: 11 | from setuptools import setup 12 | from setuptools import Extension 13 | except ImportError: 14 | from distutils.core import setup 15 | from distutils.extension import Extension 16 | 17 | 18 | from Cython.Distutils import build_ext 19 | import numpy 20 | import sys 21 | import re 22 | 23 | 24 | if len(sys.argv) != 4: 25 | raise ValueError("Wrong number of parameters received. Expected 4, got {}".format(sys.argv)) 26 | 27 | 28 | #fileToCompile = 'FW_SIMILARITY_RMSE_Cython_Epoch.pyx' 29 | 30 | # Get the name of the file to compile 31 | fileToCompile = sys.argv[1] 32 | # Remove the argument from sys argv in order for it to contain only what setup needs 33 | del sys.argv[1] 34 | 35 | extensionName = re.sub(r"\.pyx", "", fileToCompile) 36 | 37 | 38 | ext_modules = Extension(extensionName, 39 | [fileToCompile], 40 | extra_compile_args=['-O3'], 41 | include_dirs=[numpy.get_include(),], 42 | ) 43 | 44 | setup( 45 | cmdclass={'build_ext': build_ext}, 46 | ext_modules=[ext_modules] 47 | ) 48 | -------------------------------------------------------------------------------- /Base/Cython/cosine_similarity.cpython-36m-x86_64-linux-gnu.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MaurizioFD/RecSys_Course_2017/dfef3655df8ce32428458aafabfc94ed688231f6/Base/Cython/cosine_similarity.cpython-36m-x86_64-linux-gnu.so -------------------------------------------------------------------------------- /Base/Cython/cosine_similarity.pyx: -------------------------------------------------------------------------------- 1 | """ 2 | Created on 23/10/17 3 | 4 | @author: Maurizio Ferrari Dacrema 5 | """ 6 | 7 | #cython: boundscheck=False 8 | #cython: wraparound=True 9 | #cython: initializedcheck=False 10 | #cython: language_level=3 11 | #cython: nonecheck=False 12 | #cython: cdivision=True 13 | #cython: unpack_method_calls=True 14 | #cython: overflowcheck=False 15 | 16 | 17 | import time, sys 18 | 19 | import numpy as np 20 | cimport numpy as np 21 | from cpython.array cimport array, clone 22 | 23 | 24 | 25 | import scipy.sparse as sps 26 | from Base.Recommender_utils import check_matrix 27 | 28 | 29 | cdef class Cosine_Similarity: 30 | 31 | cdef int TopK 32 | cdef long n_items, n_users 33 | 34 | cdef int[:] user_to_item_row_ptr, user_to_item_cols 35 | cdef int[:] item_to_user_rows, item_to_user_col_ptr 36 | cdef double[:] user_to_item_data, item_to_user_data 37 | cdef double[:] sumOfSquared 38 | cdef int shrink, normalize, adjusted_cosine, pearson_correlation, tanimoto_coefficient 39 | 40 
| cdef double[:,:] W_dense 41 | 42 | def __init__(self, URM, topK = 100, shrink=0, normalize = True, 43 | mode = "cosine"): 44 | """ 45 | Computes the cosine similarity on the columns of dataMatrix 46 | If it is computed on URM=|users|x|items|, pass the URM as is. 47 | If it is computed on ICM=|items|x|features|, pass the ICM transposed. 48 | :param dataMatrix: 49 | :param topK: 50 | :param shrink: 51 | :param normalize: 52 | :param mode: "cosine" computes Cosine similarity 53 | "adjusted" computes Adjusted Cosine, removing the average of the users 54 | "pearson" computes Pearson Correlation, removing the average of the items 55 | "jaccard" computes Jaccard similarity for binary interactions using Tanimoto 56 | "tanimoto" computes Tanimoto coefficient for binary interactions 57 | 58 | """ 59 | 60 | super(Cosine_Similarity, self).__init__() 61 | 62 | self.n_items = URM.shape[1] 63 | self.n_users = URM.shape[0] 64 | self.shrink = shrink 65 | self.normalize = normalize 66 | 67 | self.adjusted_cosine = False 68 | self.pearson_correlation = False 69 | self.tanimoto_coefficient = False 70 | 71 | if mode == "adjusted": 72 | self.adjusted_cosine = True 73 | elif mode == "pearson": 74 | self.pearson_correlation = True 75 | elif mode == "jaccard" or mode == "tanimoto": 76 | self.tanimoto_coefficient = True 77 | # Tanimoto has a specific kind of normalization 78 | self.normalize = False 79 | 80 | elif mode == "cosine": 81 | pass 82 | else: 83 | raise ValueError("Cosine_Similarity: value for paramether 'mode' not recognized." 84 | " Allowed values are: 'cosine', 'pearson', 'adjusted', 'jaccard', 'tanimoto'." 85 | " Passed value was '{}'".format(mode)) 86 | 87 | 88 | self.TopK = min(topK, self.n_items) 89 | 90 | # Copy data to avoid altering the original object 91 | URM = URM.copy() 92 | 93 | if self.adjusted_cosine: 94 | URM = self.applyAdjustedCosine(URM) 95 | elif self.pearson_correlation: 96 | URM = self.applyPearsonCorrelation(URM) 97 | elif self.tanimoto_coefficient: 98 | URM = self.useOnlyBooleanInteractions(URM) 99 | 100 | 101 | URM = check_matrix(URM, 'csr') 102 | 103 | self.user_to_item_row_ptr = URM.indptr 104 | self.user_to_item_cols = URM.indices 105 | self.user_to_item_data = np.array(URM.data, dtype=np.float64) 106 | 107 | URM = check_matrix(URM, 'csc') 108 | self.item_to_user_rows = URM.indices 109 | self.item_to_user_col_ptr = URM.indptr 110 | self.item_to_user_data = np.array(URM.data, dtype=np.float64) 111 | 112 | # Compute sum of squared values to be used in normalization 113 | self.sumOfSquared = np.array(URM.power(2).sum(axis=0), dtype=np.float64).ravel() 114 | 115 | # Tanimoto does not require the square root to be applied 116 | if not self.tanimoto_coefficient: 117 | self.sumOfSquared = np.sqrt(self.sumOfSquared) 118 | 119 | 120 | if self.TopK == 0: 121 | self.W_dense = np.zeros((self.n_items,self.n_items)) 122 | 123 | 124 | cdef useOnlyBooleanInteractions(self, URM): 125 | """ 126 | Set to 1 all data points 127 | :return: 128 | """ 129 | 130 | cdef long index 131 | 132 | for index in range(len(URM.data)): 133 | URM.data[index] = 1 134 | 135 | return URM 136 | 137 | 138 | 139 | cdef applyPearsonCorrelation(self, URM): 140 | """ 141 | Remove from every data point the average for the corresponding column 142 | :return: 143 | """ 144 | 145 | cdef double[:] sumPerCol 146 | cdef int[:] interactionsPerCol 147 | cdef long colIndex, innerIndex, start_pos, end_pos 148 | cdef double colAverage 149 | 150 | 151 | URM = check_matrix(URM, 'csc') 152 | 153 | 154 | sumPerCol = 
np.array(URM.sum(axis=0), dtype=np.float64).ravel() 155 | interactionsPerCol = np.diff(URM.indptr) 156 | 157 | 158 | #Remove for every row the corresponding average 159 | for colIndex in range(self.n_items): 160 | 161 | if interactionsPerCol[colIndex]>0: 162 | 163 | colAverage = sumPerCol[colIndex] / interactionsPerCol[colIndex] 164 | 165 | start_pos = URM.indptr[colIndex] 166 | end_pos = URM.indptr[colIndex+1] 167 | 168 | innerIndex = start_pos 169 | 170 | while innerIndex < end_pos: 171 | 172 | URM.data[innerIndex] -= colAverage 173 | innerIndex+=1 174 | 175 | 176 | return URM 177 | 178 | 179 | 180 | cdef applyAdjustedCosine(self, URM): 181 | """ 182 | Remove from every data point the average for the corresponding row 183 | :return: 184 | """ 185 | 186 | cdef double[:] sumPerRow 187 | cdef int[:] interactionsPerRow 188 | cdef long rowIndex, innerIndex, start_pos, end_pos 189 | cdef double rowAverage 190 | 191 | URM = check_matrix(URM, 'csr') 192 | 193 | sumPerRow = np.array(URM.sum(axis=1), dtype=np.float64).ravel() 194 | interactionsPerRow = np.diff(URM.indptr) 195 | 196 | 197 | #Remove for every row the corresponding average 198 | for rowIndex in range(self.n_users): 199 | 200 | if interactionsPerRow[rowIndex]>0: 201 | 202 | rowAverage = sumPerRow[rowIndex] / interactionsPerRow[rowIndex] 203 | 204 | start_pos = URM.indptr[rowIndex] 205 | end_pos = URM.indptr[rowIndex+1] 206 | 207 | innerIndex = start_pos 208 | 209 | while innerIndex < end_pos: 210 | 211 | URM.data[innerIndex] -= rowAverage 212 | innerIndex+=1 213 | 214 | 215 | return URM 216 | 217 | 218 | 219 | 220 | 221 | cdef int[:] getUsersThatRatedItem(self, long item_id): 222 | return self.item_to_user_rows[self.item_to_user_col_ptr[item_id]:self.item_to_user_col_ptr[item_id+1]] 223 | 224 | cdef int[:] getItemsRatedByUser(self, long user_id): 225 | return self.user_to_item_cols[self.user_to_item_row_ptr[user_id]:self.user_to_item_row_ptr[user_id+1]] 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | cdef double[:] computeItemSimilarities(self, long item_id_input): 234 | """ 235 | For every item the cosine similarity against other items depends on whether they have users in common. The more 236 | common users the higher the similarity. 237 | 238 | The basic implementation is: 239 | - Select the first item 240 | - Loop through all other items 241 | -- Given the two items, get the users they have in common 242 | -- Update the similarity for all common users 243 | 244 | That is VERY slow due to the common user part, in which a long data structure is looped multiple times. 245 | 246 | A better way is to use the data structure in a different way skipping the search part, getting directly the 247 | information we need. 
248 | 249 | The implementation here used is: 250 | - Select the first item 251 | - Initialize a zero valued array for the similarities 252 | - Get the users who rated the first item 253 | - Loop through the users 254 | -- Given a user, get the items he rated (second item) 255 | -- Update the similarity of the items he rated 256 | 257 | 258 | """ 259 | 260 | # Create template used to initialize an array with zeros 261 | # Much faster than np.zeros(self.n_items) 262 | cdef array[double] template_zero = array('d') 263 | cdef array[double] result = clone(template_zero, self.n_items, zero=True) 264 | 265 | 266 | cdef long user_index, user_id, item_index, item_id_second 267 | 268 | cdef int[:] users_that_rated_item = self.getUsersThatRatedItem(item_id_input) 269 | cdef int[:] items_rated_by_user 270 | 271 | cdef double rating_item_input, rating_item_second 272 | 273 | # Get users that rated the items 274 | for user_index in range(len(users_that_rated_item)): 275 | 276 | user_id = users_that_rated_item[user_index] 277 | rating_item_input = self.item_to_user_data[self.item_to_user_col_ptr[item_id_input]+user_index] 278 | 279 | # Get all items rated by that user 280 | items_rated_by_user = self.getItemsRatedByUser(user_id) 281 | 282 | for item_index in range(len(items_rated_by_user)): 283 | 284 | item_id_second = items_rated_by_user[item_index] 285 | 286 | # Do not compute the similarity on the diagonal 287 | if item_id_second != item_id_input: 288 | # Increment similairty 289 | rating_item_second = self.user_to_item_data[self.user_to_item_row_ptr[user_id]+item_index] 290 | 291 | result[item_id_second] += rating_item_input*rating_item_second 292 | 293 | return result 294 | 295 | 296 | 297 | 298 | def compute_similarity(self): 299 | 300 | cdef int itemIndex, innerItemIndex 301 | cdef long long topKItemIndex 302 | 303 | cdef long long[:] top_k_idx 304 | 305 | # Declare numpy data type to use vetor indexing and simplify the topK selection code 306 | cdef np.ndarray[long, ndim=1] top_k_partition, top_k_partition_sorting 307 | cdef np.ndarray[np.float64_t, ndim=1] this_item_weights_np 308 | 309 | cdef double[:] this_item_weights 310 | 311 | cdef long processedItems = 0 312 | 313 | # Data structure to incrementally build sparse matrix 314 | # Preinitialize max possible length 315 | cdef double[:] values = np.zeros((self.n_items*self.TopK)) 316 | cdef int[:] rows = np.zeros((self.n_items*self.TopK,), dtype=np.int32) 317 | cdef int[:] cols = np.zeros((self.n_items*self.TopK,), dtype=np.int32) 318 | cdef long sparse_data_pointer = 0 319 | 320 | 321 | 322 | start_time = time.time() 323 | 324 | # Compute all similarities for each item 325 | for itemIndex in range(self.n_items): 326 | 327 | processedItems += 1 328 | 329 | if processedItems % 10000==0 or processedItems==self.n_items: 330 | 331 | itemPerSec = processedItems/(time.time()-start_time) 332 | 333 | print("Similarity column {} ( {:2.0f} % ), {:.2f} column/sec, elapsed time {:.2f} min".format( 334 | processedItems, processedItems*1.0/self.n_items*100, itemPerSec, (time.time()-start_time) / 60)) 335 | 336 | sys.stdout.flush() 337 | sys.stderr.flush() 338 | 339 | 340 | this_item_weights = self.computeItemSimilarities(itemIndex) 341 | 342 | 343 | # Apply normalization and shrinkage, ensure denominator != 0 344 | if self.normalize: 345 | for innerItemIndex in range(self.n_items): 346 | this_item_weights[innerItemIndex] /= self.sumOfSquared[itemIndex] * self.sumOfSquared[innerItemIndex]\ 347 | + self.shrink + 1e-6 348 | 349 | # Apply the specific 
denominator for Tanimoto 350 | elif self.tanimoto_coefficient: 351 | for innerItemIndex in range(self.n_items): 352 | this_item_weights[innerItemIndex] /= self.sumOfSquared[itemIndex] + self.sumOfSquared[innerItemIndex] -\ 353 | this_item_weights[innerItemIndex] + self.shrink + 1e-6 354 | 355 | elif self.shrink != 0: 356 | for innerItemIndex in range(self.n_items): 357 | this_item_weights[innerItemIndex] /= self.shrink 358 | 359 | 360 | if self.TopK == 0: 361 | 362 | for innerItemIndex in range(self.n_items): 363 | self.W_dense[innerItemIndex,itemIndex] = this_item_weights[innerItemIndex] 364 | 365 | else: 366 | 367 | # Sort indices and select TopK 368 | # Using numpy implies some overhead, unfortunately the plain C qsort function is even slower 369 | #top_k_idx = np.argsort(this_item_weights) [-self.TopK:] 370 | 371 | # Sorting is done in three steps. Faster then plain np.argsort for higher number of items 372 | # because we avoid sorting elements we already know we don't care about 373 | # - Partition the data to extract the set of TopK items, this set is unsorted 374 | # - Sort only the TopK items, discarding the rest 375 | # - Get the original item index 376 | 377 | this_item_weights_np = - np.array(this_item_weights) 378 | # 379 | # Get the unordered set of topK items 380 | top_k_partition = np.argpartition(this_item_weights_np, self.TopK-1)[0:self.TopK] 381 | # Sort only the elements in the partition 382 | top_k_partition_sorting = np.argsort(this_item_weights_np[top_k_partition]) 383 | # Get original index 384 | top_k_idx = top_k_partition[top_k_partition_sorting] 385 | 386 | 387 | 388 | # Incrementally build sparse matrix 389 | for innerItemIndex in range(len(top_k_idx)): 390 | 391 | topKItemIndex = top_k_idx[innerItemIndex] 392 | 393 | values[sparse_data_pointer] = this_item_weights[topKItemIndex] 394 | rows[sparse_data_pointer] = topKItemIndex 395 | cols[sparse_data_pointer] = itemIndex 396 | 397 | sparse_data_pointer += 1 398 | 399 | 400 | if self.TopK == 0: 401 | 402 | return np.array(self.W_dense) 403 | 404 | else: 405 | 406 | values = np.array(values[0:sparse_data_pointer]) 407 | rows = np.array(rows[0:sparse_data_pointer]) 408 | cols = np.array(cols[0:sparse_data_pointer]) 409 | 410 | W_sparse = sps.csr_matrix((values, (rows, cols)), 411 | shape=(self.n_items, self.n_items), 412 | dtype=np.float32) 413 | 414 | return W_sparse 415 | 416 | -------------------------------------------------------------------------------- /Base/Recommender.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | 5 | @author: Maurizio Ferrari Dacrema 6 | """ 7 | 8 | import multiprocessing 9 | import time 10 | 11 | import numpy as np 12 | 13 | from Base.metrics import roc_auc, precision, recall, map, ndcg, rr 14 | #from Base.Cython.metrics import roc_auc, precision, recall, map, ndcg, rr 15 | from Base.Recommender_utils import check_matrix, areURMequals, removeTopPop 16 | 17 | 18 | class Recommender(object): 19 | """Abstract Recommender""" 20 | 21 | def __init__(self): 22 | super(Recommender, self).__init__() 23 | self.URM_train = None 24 | self.sparse_weights = True 25 | self.normalize = False 26 | 27 | self.filterTopPop = False 28 | self.filterTopPop_ItemsID = np.array([], dtype=np.int) 29 | 30 | self.filterCustomItems = False 31 | self.filterCustomItems_ItemsID = np.array([], dtype=np.int) 32 | 33 | 34 | def fit(self): 35 | pass 36 | 37 | def _filter_TopPop_on_scores(self, scores): 38 | 
scores[self.filterTopPop_ItemsID] = -np.inf 39 | return scores 40 | 41 | 42 | def _filterCustomItems_on_scores(self, scores): 43 | scores[self.filterCustomItems_ItemsID] = -np.inf 44 | return scores 45 | 46 | 47 | def _filter_seen_on_scores(self, user_id, scores): 48 | 49 | seen = self.URM_train.indices[self.URM_train.indptr[user_id]:self.URM_train.indptr[user_id + 1]] 50 | 51 | scores[seen] = -np.inf 52 | return scores 53 | 54 | 55 | 56 | 57 | def evaluateRecommendations(self, URM_test_new, at=5, minRatingsPerUser=1, exclude_seen=True, 58 | mode='sequential', filterTopPop = False, 59 | filterCustomItems = np.array([], dtype=np.int), 60 | filterCustomUsers = np.array([], dtype=np.int)): 61 | """ 62 | Speed info: 63 | - Sparse weights: batch mode is 2x faster than sequential 64 | - Dense weights: batch and sequential speed are equivalent 65 | 66 | 67 | :param URM_test_new: URM to be used for testing 68 | :param at: 5 Length of the recommended items 69 | :param minRatingsPerUser: 1 Users with less than this number of interactions will not be evaluated 70 | :param exclude_seen: True Whether to remove already seen items from the recommended items 71 | 72 | :param mode: 'sequential', 'parallel', 'batch' 73 | :param filterTopPop: False or decimal number Percentage of items to be removed from recommended list and testing interactions 74 | :param filterCustomItems: Array, default empty Item IDs to NOT take into account when recommending 75 | :param filterCustomUsers: Array, default empty User IDs to NOT take into account when recommending 76 | :return: 77 | """ 78 | 79 | if len(filterCustomItems) == 0: 80 | self.filterCustomItems = False 81 | else: 82 | self.filterCustomItems = True 83 | self.filterCustomItems_ItemsID = np.array(filterCustomItems) 84 | 85 | 86 | if filterTopPop != False: 87 | 88 | self.filterTopPop = True 89 | 90 | _,_, self.filterTopPop_ItemsID = removeTopPop(self.URM_train, URM_2 = URM_test_new, percentageToRemove=filterTopPop) 91 | 92 | print("Filtering {}% TopPop items, count is: {}".format(filterTopPop*100, len(self.filterTopPop_ItemsID))) 93 | 94 | # Zero-out the items so that they are considered irrelevant 95 | URM_test_new = check_matrix(URM_test_new, format='lil') 96 | URM_test_new[:,self.filterTopPop_ItemsID] = 0 97 | URM_test_new = check_matrix(URM_test_new, format='csr') 98 | 99 | 100 | # During testing CSR is faster 101 | self.URM_test = check_matrix(URM_test_new, format='csr') 102 | self.URM_train = check_matrix(self.URM_train, format='csr') 103 | self.at = at 104 | self.minRatingsPerUser = minRatingsPerUser 105 | self.exclude_seen = exclude_seen 106 | 107 | 108 | nusers = self.URM_test.shape[0] 109 | 110 | # Prune users with an insufficient number of ratings 111 | rows = self.URM_test.indptr 112 | numRatings = np.ediff1d(rows) 113 | mask = numRatings >= minRatingsPerUser 114 | usersToEvaluate = np.arange(nusers)[mask] 115 | 116 | if len(filterCustomUsers) != 0: 117 | print("Filtering {} Users".format(len(filterCustomUsers))) 118 | usersToEvaluate = set(usersToEvaluate) - set(filterCustomUsers) 119 | 120 | usersToEvaluate = list(usersToEvaluate) 121 | 122 | 123 | 124 | if mode=='sequential': 125 | return self.evaluateRecommendationsSequential(usersToEvaluate) 126 | elif mode=='parallel': 127 | return self.evaluateRecommendationsParallel(usersToEvaluate) 128 | elif mode=='batch': 129 | return self.evaluateRecommendationsBatch(usersToEvaluate) 130 | # elif mode=='cython': 131 | # return self.evaluateRecommendationsCython(usersToEvaluate) 132 | # elif 
mode=='random-equivalent': 133 | # return self.evaluateRecommendationsRandomEquivalent(usersToEvaluate) 134 | else: 135 | raise ValueError("Mode '{}' not available".format(mode)) 136 | 137 | 138 | def get_user_relevant_items(self, user_id): 139 | 140 | return self.URM_test.indices[self.URM_test.indptr[user_id]:self.URM_test.indptr[user_id+1]] 141 | 142 | def get_user_test_ratings(self, user_id): 143 | 144 | return self.URM_test.data[self.URM_test.indptr[user_id]:self.URM_test.indptr[user_id+1]] 145 | 146 | 147 | 148 | 149 | def evaluateRecommendationsSequential(self, usersToEvaluate): 150 | 151 | start_time = time.time() 152 | 153 | roc_auc_, precision_, recall_, map_, mrr_, ndcg_ = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 154 | n_eval = 0 155 | 156 | for test_user in usersToEvaluate: 157 | 158 | # Calling the 'evaluateOneUser' function instead of copying its code would be cleaner, but is 20% slower 159 | 160 | # Being the URM CSR, the indices are the non-zero column indexes 161 | relevant_items = self.get_user_relevant_items(test_user) 162 | 163 | n_eval += 1 164 | 165 | recommended_items = self.recommend(user_id=test_user, exclude_seen=self.exclude_seen, 166 | n=self.at, filterTopPop=self.filterTopPop, filterCustomItems=self.filterCustomItems) 167 | 168 | is_relevant = np.in1d(recommended_items, relevant_items, assume_unique=True) 169 | 170 | # evaluate the recommendation list with ranking metrics ONLY 171 | roc_auc_ += roc_auc(is_relevant) 172 | precision_ += precision(is_relevant) 173 | recall_ += recall(is_relevant, relevant_items) 174 | map_ += map(is_relevant, relevant_items) 175 | mrr_ += rr(is_relevant) 176 | ndcg_ += ndcg(recommended_items, relevant_items, relevance=self.get_user_test_ratings(test_user), at=self.at) 177 | 178 | 179 | 180 | if n_eval % 10000 == 0 or n_eval==len(usersToEvaluate)-1: 181 | print("Processed {} ( {:.2f}% ) in {:.2f} seconds. 
Users per second: {:.0f}".format( 182 | n_eval, 183 | 100.0* float(n_eval+1)/len(usersToEvaluate), 184 | time.time()-start_time, 185 | float(n_eval)/(time.time()-start_time))) 186 | 187 | 188 | 189 | 190 | if (n_eval > 0): 191 | roc_auc_ /= n_eval 192 | precision_ /= n_eval 193 | recall_ /= n_eval 194 | map_ /= n_eval 195 | mrr_ /= n_eval 196 | ndcg_ /= n_eval 197 | 198 | else: 199 | print("WARNING: No users had a sufficient number of relevant items") 200 | 201 | results_run = {} 202 | 203 | results_run["AUC"] = roc_auc_ 204 | results_run["precision"] = precision_ 205 | results_run["recall"] = recall_ 206 | results_run["map"] = map_ 207 | results_run["NDCG"] = ndcg_ 208 | results_run["MRR"] = mrr_ 209 | 210 | return (results_run) 211 | 212 | 213 | 214 | 215 | def evaluateRecommendationsBatch(self, usersToEvaluate, batch_size = 1000): 216 | 217 | roc_auc_, precision_, recall_, map_, mrr_, ndcg_ = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 218 | n_eval = 0 219 | 220 | start_time = time.time() 221 | start_time_batch = time.time() 222 | 223 | #Number of blocks is rounded to the next integer 224 | totalNumberOfBatch = int(len(usersToEvaluate) / batch_size) + 1 225 | 226 | for current_batch in range(totalNumberOfBatch): 227 | 228 | user_first_id = current_batch*batch_size 229 | user_last_id = min((current_batch+1)*batch_size-1, len(usersToEvaluate)-1) 230 | 231 | users_in_batch = usersToEvaluate[user_first_id:user_last_id] 232 | 233 | relevant_items_batch = self.URM_test[users_in_batch] 234 | 235 | recommended_items_batch = self.recommendBatch(users_in_batch, 236 | exclude_seen=self.exclude_seen, 237 | n=self.at, filterTopPop=self.filterTopPop, 238 | filterCustomItems=self.filterCustomItems) 239 | 240 | 241 | for test_user in range(recommended_items_batch.shape[0]): 242 | 243 | n_eval += 1 244 | 245 | current_user = relevant_items_batch[test_user,:] 246 | 247 | relevant_items = current_user.indices 248 | recommended_items = recommended_items_batch[test_user,:] 249 | 250 | is_relevant = np.in1d(recommended_items, relevant_items, assume_unique=True) 251 | 252 | # evaluate the recommendation list with ranking metrics ONLY 253 | roc_auc_ += roc_auc(is_relevant) 254 | precision_ += precision(is_relevant) 255 | recall_ += recall(is_relevant, relevant_items) 256 | map_ += map(is_relevant, relevant_items) 257 | mrr_ += rr(is_relevant) 258 | ndcg_ += ndcg(recommended_items, relevant_items, relevance=current_user.data, at=self.at) 259 | 260 | 261 | 262 | if(time.time() - start_time_batch >= 20 or current_batch == totalNumberOfBatch-1): 263 | print("Processed {} ( {:.2f}% ) in {:.2f} seconds. 
Users per second: {:.0f}".format( 264 | n_eval, 265 | 100.0* float(n_eval)/len(usersToEvaluate), 266 | time.time()-start_time, 267 | float(n_eval)/(time.time()-start_time))) 268 | 269 | start_time_batch = time.time() 270 | 271 | 272 | if (n_eval > 0): 273 | roc_auc_ /= n_eval 274 | precision_ /= n_eval 275 | recall_ /= n_eval 276 | map_ /= n_eval 277 | mrr_ /= n_eval 278 | ndcg_ /= n_eval 279 | 280 | else: 281 | print("WARNING: No users had a sufficient number of relevant items") 282 | 283 | results_run = {} 284 | 285 | results_run["AUC"] = roc_auc_ 286 | results_run["precision"] = precision_ 287 | results_run["recall"] = recall_ 288 | results_run["map"] = map_ 289 | results_run["NDCG"] = ndcg_ 290 | results_run["MRR"] = mrr_ 291 | 292 | return (results_run) 293 | 294 | 295 | 296 | def evaluateOneUser(self, test_user): 297 | 298 | # Being the URM CSR, the indices are the non-zero column indexes 299 | #relevant_items = self.URM_test_relevantItems[test_user] 300 | relevant_items = self.URM_test[test_user].indices 301 | 302 | # this will rank top n items 303 | recommended_items = self.recommend(user_id=test_user, exclude_seen=self.exclude_seen, 304 | n=self.at, filterTopPop=self.filterTopPop, 305 | filterCustomItems=self.filterCustomItems) 306 | 307 | is_relevant = np.in1d(recommended_items, relevant_items, assume_unique=True) 308 | 309 | # evaluate the recommendation list with ranking metrics ONLY 310 | roc_auc_ = roc_auc(is_relevant) 311 | precision_ = precision(is_relevant) 312 | recall_ = recall(is_relevant, relevant_items) 313 | map_ = map(is_relevant, relevant_items) 314 | mrr_ = rr(is_relevant) 315 | ndcg_ = ndcg(recommended_items, relevant_items, relevance=self.get_user_test_ratings(test_user), at=self.at) 316 | 317 | return roc_auc_, precision_, recall_, map_, mrr_, ndcg_ 318 | 319 | 320 | 321 | def evaluateRecommendationsParallel(self, usersToEvaluate): 322 | 323 | print("Evaluation of {} users begins".format(len(usersToEvaluate))) 324 | 325 | pool = multiprocessing.Pool(processes=multiprocessing.cpu_count(), maxtasksperchild=1) 326 | resultList = pool.map(self.evaluateOneUser, usersToEvaluate) 327 | 328 | # Close the pool to avoid memory leaks 329 | pool.close() 330 | 331 | n_eval = len(usersToEvaluate) 332 | roc_auc_, precision_, recall_, map_, mrr_, ndcg_ = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 333 | 334 | # Looping is slightly faster then using the numpy vectorized approach, less data transformation 335 | for result in resultList: 336 | roc_auc_ += result[0] 337 | precision_ += result[1] 338 | recall_ += result[2] 339 | map_ += result[3] 340 | mrr_ += result[4] 341 | ndcg_ += result[5] 342 | 343 | 344 | if (n_eval > 0): 345 | roc_auc_ = roc_auc_/n_eval 346 | precision_ = precision_/n_eval 347 | recall_ = recall_/n_eval 348 | map_ = map_/n_eval 349 | mrr_ = mrr_/n_eval 350 | ndcg_ = ndcg_/n_eval 351 | 352 | else: 353 | print("WARNING: No users had a sufficient number of relevant items") 354 | 355 | 356 | print("Evaluated {} users".format(n_eval)) 357 | 358 | results = {} 359 | 360 | results["AUC"] = roc_auc_ 361 | results["precision"] = precision_ 362 | results["recall"] = recall_ 363 | results["map"] = map_ 364 | results["NDCG"] = ndcg_ 365 | results["MRR"] = mrr_ 366 | 367 | return (results) 368 | 369 | 370 | 371 | -------------------------------------------------------------------------------- /Base/Recommender_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | 5 | @author: 
Maurizio Ferrari Dacrema 6 | """ 7 | 8 | import numpy as np 9 | import scipy.sparse as sps 10 | import time 11 | import os 12 | 13 | def check_matrix(X, format='csc', dtype=np.float32): 14 | if format == 'csc' and not isinstance(X, sps.csc_matrix): 15 | return X.tocsc().astype(dtype) 16 | elif format == 'csr' and not isinstance(X, sps.csr_matrix): 17 | return X.tocsr().astype(dtype) 18 | elif format == 'coo' and not isinstance(X, sps.coo_matrix): 19 | return X.tocoo().astype(dtype) 20 | elif format == 'dok' and not isinstance(X, sps.dok_matrix): 21 | return X.todok().astype(dtype) 22 | elif format == 'bsr' and not isinstance(X, sps.bsr_matrix): 23 | return X.tobsr().astype(dtype) 24 | elif format == 'dia' and not isinstance(X, sps.dia_matrix): 25 | return X.todia().astype(dtype) 26 | elif format == 'lil' and not isinstance(X, sps.lil_matrix): 27 | return X.tolil().astype(dtype) 28 | else: 29 | return X.astype(dtype) 30 | 31 | 32 | def similarityMatrixTopK(item_weights, forceSparseOutput = True, k=100, verbose = False, inplace=True): 33 | """ 34 | The function selects the TopK most similar elements, column-wise 35 | 36 | :param item_weights: 37 | :param forceSparseOutput: 38 | :param k: 39 | :param verbose: 40 | :param inplace: Default True, WARNING matrix will be modified 41 | :return: 42 | """ 43 | 44 | assert (item_weights.shape[0] == item_weights.shape[1]), "selectTopK: ItemWeights is not a square matrix" 45 | 46 | start_time = time.time() 47 | 48 | if verbose: 49 | print("Generating topK matrix") 50 | 51 | nitems = item_weights.shape[1] 52 | k = min(k, nitems) 53 | 54 | # for each column, keep only the top-k scored items 55 | sparse_weights = not isinstance(item_weights, np.ndarray) 56 | 57 | if not sparse_weights: 58 | 59 | idx_sorted = np.argsort(item_weights, axis=0) # sort data inside each column 60 | 61 | if inplace: 62 | W = item_weights 63 | else: 64 | W = item_weights.copy() 65 | 66 | # index of the items that don't belong to the top-k similar items of each column 67 | not_top_k = idx_sorted[:-k, :] 68 | # use numpy fancy indexing to zero-out the values in sim without using a for loop 69 | W[not_top_k, np.arange(nitems)] = 0.0 70 | 71 | if forceSparseOutput: 72 | W_sparse = sps.csr_matrix(W, shape=(nitems, nitems)) 73 | 74 | if verbose: 75 | print("Sparse TopK matrix generated in {:.2f} seconds".format(time.time() - start_time)) 76 | 77 | return W_sparse 78 | 79 | if verbose: 80 | print("Dense TopK matrix generated in {:.2f} seconds".format(time.time()-start_time)) 81 | 82 | return W 83 | 84 | else: 85 | # iterate over each column and keep only the top-k similar items 86 | data, rows_indices, cols_indptr = [], [], [] 87 | 88 | item_weights = check_matrix(item_weights, format='csc', dtype=np.float32) 89 | 90 | for item_idx in range(nitems): 91 | 92 | cols_indptr.append(len(data)) 93 | 94 | start_position = item_weights.indptr[item_idx] 95 | end_position = item_weights.indptr[item_idx+1] 96 | 97 | column_data = item_weights.data[start_position:end_position] 98 | column_row_index = item_weights.indices[start_position:end_position] 99 | 100 | idx_sorted = np.argsort(column_data) # sort by column 101 | top_k_idx = idx_sorted[-k:] 102 | 103 | data.extend(column_data[top_k_idx]) 104 | rows_indices.extend(column_row_index[top_k_idx]) 105 | 106 | 107 | cols_indptr.append(len(data)) 108 | 109 | # During testing CSR is faster 110 | W_sparse = sps.csc_matrix((data, rows_indices, cols_indptr), shape=(nitems, nitems), dtype=np.float32) 111 | W_sparse = W_sparse.tocsr() 112 | 113 | if 
verbose: 114 | print("Sparse TopK matrix generated in {:.2f} seconds".format(time.time() - start_time)) 115 | 116 | return W_sparse 117 | 118 | 119 | 120 | 121 | def removeZeroRatingRowAndCol(URM): 122 | 123 | rows = URM.indptr 124 | numRatings = np.ediff1d(rows) 125 | mask = numRatings >= 1 126 | 127 | URM = URM[mask,:] 128 | 129 | cols = URM.tocsc().indptr 130 | numRatings = np.ediff1d(cols) 131 | mask = numRatings >= 1 132 | 133 | URM = URM[:,mask] 134 | 135 | return URM.tocsr() 136 | 137 | 138 | def areURMequals(URM1, URM2): 139 | 140 | if(URM1.shape != URM2.shape): 141 | return False 142 | 143 | return (URM1-URM2).nnz ==0 144 | 145 | 146 | def removeTopPop(URM_1, URM_2=None, percentageToRemove=0.2): 147 | """ 148 | Remove the top popular items from the matrix 149 | :param URM_1: user X items 150 | :param URM_2: user X items 151 | :param percentageToRemove: value 1 corresponds to 100% 152 | :return: URM: user X selectedItems, obtained from URM_1 153 | Array: itemMappings[selectedItemIndex] = originalItemIndex 154 | Array: removedItems 155 | """ 156 | 157 | 158 | item_pop = URM_1.sum(axis=0) # this command returns a numpy.matrix of size (1, nitems) 159 | 160 | if URM_2 != None: 161 | 162 | assert URM_2.shape[1] == URM_1.shape[1], \ 163 | "The two URM do not contain the same number of columns, URM_1 has {}, URM_2 has {}".format(URM_1.shape[1], URM_2.shape[1]) 164 | 165 | item_pop += URM_2.sum(axis=0) 166 | 167 | 168 | item_pop = np.asarray(item_pop).squeeze() # necessary to convert it into a numpy.array of size (nitems,) 169 | popularItemsSorted = np.argsort(item_pop)[::-1] 170 | 171 | numItemsToRemove = int(len(popularItemsSorted)*percentageToRemove) 172 | 173 | # Choose which columns to keep 174 | itemMask = np.in1d(np.arange(len(popularItemsSorted)), popularItemsSorted[:numItemsToRemove], invert=True) 175 | 176 | # Map the column index of the new URM to the original ItemID 177 | itemMappings = np.arange(len(popularItemsSorted))[itemMask] 178 | 179 | removedItems = np.arange(len(popularItemsSorted))[np.logical_not(itemMask)] 180 | 181 | return URM_1[:,itemMask], itemMappings, removedItems 182 | 183 | 184 | 185 | def loadCSVintoSparse (filePath, header = False): 186 | 187 | values, rows, cols = [], [], [] 188 | 189 | fileHandle = open(filePath, "r") 190 | numCells = 0 191 | 192 | if header: 193 | fileHandle.readline() 194 | 195 | for line in fileHandle: 196 | numCells += 1 197 | if (numCells % 1000000 == 0): 198 | print("Processed {} cells".format(numCells)) 199 | 200 | if (len(line)) > 1: 201 | line = line.split(",") 202 | 203 | value = line[2].replace("\n", "") 204 | 205 | if not value == "0" and not value == "NaN": 206 | rows.append(int(line[0])) 207 | cols.append(int(line[1])) 208 | values.append(float(value)) 209 | 210 | return sps.csr_matrix((values, (rows, cols)), dtype=np.float32) 211 | 212 | 213 | 214 | -------------------------------------------------------------------------------- /Base/Recommender_utils_Test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on 30/09/17 5 | 6 | @author: Maurizio Ferrari Dacrema 7 | """ 8 | 9 | from Base.Recommender_utils import similarityMatrixTopK 10 | 11 | import numpy as np 12 | import scipy.sparse as sps 13 | import unittest 14 | 15 | 16 | class MyTestCase(unittest.TestCase): 17 | 18 | def test_similarityMatrixTopK_denseToDense(self): 19 | 20 | numRows = 100 21 | 22 | TopK = 20 23 | 24 | dense_input = np.random.random((numRows, 
numRows)) 25 | dense_output = similarityMatrixTopK(dense_input, k=TopK, forceSparseOutput=False) 26 | 27 | numExpectedNonZeroCells = TopK*numRows 28 | 29 | numNonZeroCells = np.sum(dense_output!=0) 30 | 31 | self.assertEqual(numExpectedNonZeroCells, numNonZeroCells, "DenseToDense incorrect") 32 | 33 | 34 | def test_similarityMatrixTopK_denseToSparse(self): 35 | 36 | numRows = 100 37 | 38 | TopK = 20 39 | 40 | dense = np.random.random((numRows, numRows)) 41 | 42 | sparse = similarityMatrixTopK(dense, k=TopK, forceSparseOutput=True) 43 | dense = similarityMatrixTopK(dense, k=TopK, forceSparseOutput=False) 44 | 45 | 46 | self.assertTrue(np.equal(dense, sparse.todense()).all(), "denseToSparse incorrect") 47 | 48 | 49 | def test_similarityMatrixTopK_sparseToSparse(self): 50 | 51 | numRows = 20 52 | 53 | TopK = 5 54 | 55 | dense_input = np.random.random((numRows, numRows)) 56 | sparse_input = sps.csr_matrix(dense_input) 57 | 58 | dense_output = similarityMatrixTopK(dense_input, k=TopK, forceSparseOutput=False, inplace=False) 59 | sparse_output = similarityMatrixTopK(sparse_input, k=TopK, forceSparseOutput=True) 60 | 61 | self.assertTrue(np.all((dense_output - sparse_output.todense())<1e-6), "sparseToSparse CSR incorrect") 62 | 63 | sparse_input = sps.csc_matrix(dense_input) 64 | sparse_output = similarityMatrixTopK(sparse_input, k=TopK, forceSparseOutput=True) 65 | self.assertTrue(np.all((dense_output - sparse_output.todense())<1e-6), "sparseToSparse CSC incorrect") 66 | 67 | if __name__ == '__main__': 68 | 69 | unittest.main() 70 | 71 | -------------------------------------------------------------------------------- /Base/Similarity_Matrix_Recommender.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on 16/09/2017 5 | 6 | @author: Maurizio Ferrari Dacrema 7 | """ 8 | 9 | import numpy as np 10 | 11 | 12 | class Similarity_Matrix_Recommender(object): 13 | 14 | def __init__(self): 15 | super(Similarity_Matrix_Recommender, self).__init__() 16 | 17 | 18 | 19 | def recommend(self, user_id, n=None, exclude_seen=True, filterTopPop = False, filterCustomItems = False): 20 | 21 | if n==None: 22 | n=self.URM_train.shape[1]-1 23 | 24 | # compute the scores using the dot product 25 | if self.sparse_weights: 26 | user_profile = self.URM_train[user_id] 27 | 28 | scores = user_profile.dot(self.W_sparse).toarray().ravel() 29 | 30 | else: 31 | 32 | user_profile = self.URM_train.indices[self.URM_train.indptr[user_id]:self.URM_train.indptr[user_id + 1]] 33 | user_ratings = self.URM_train.data[self.URM_train.indptr[user_id]:self.URM_train.indptr[user_id + 1]] 34 | 35 | relevant_weights = self.W[user_profile] 36 | scores = relevant_weights.T.dot(user_ratings) 37 | 38 | if self.normalize: 39 | # normalization will keep the scores in the same range 40 | # of value of the ratings in dataset 41 | rated = user_profile.copy() 42 | rated.data = np.ones_like(rated.data) 43 | if self.sparse_weights: 44 | den = rated.dot(self.W_sparse).toarray().ravel() 45 | else: 46 | den = rated.dot(self.W).ravel() 47 | den[np.abs(den) < 1e-6] = 1.0 # to avoid NaNs 48 | scores /= den 49 | 50 | if exclude_seen: 51 | scores = self._filter_seen_on_scores(user_id, scores) 52 | 53 | if filterTopPop: 54 | scores = self._filter_TopPop_on_scores(scores) 55 | 56 | if filterCustomItems: 57 | scores = self._filterCustomItems_on_scores(scores) 58 | 59 | 60 | # rank items and mirror column to obtain a ranking in descending score 61 | #ranking = 
scores.argsort() 62 | #ranking = np.flip(ranking, axis=0) 63 | 64 | # Sorting is done in three steps. Faster then plain np.argsort for higher number of items 65 | # - Partition the data to extract the set of relevant items 66 | # - Sort only the relevant items 67 | # - Get the original item index 68 | relevant_items_partition = (-scores).argpartition(n)[0:n] 69 | relevant_items_partition_sorting = np.argsort(-scores[relevant_items_partition]) 70 | ranking = relevant_items_partition[relevant_items_partition_sorting] 71 | 72 | 73 | return ranking 74 | 75 | 76 | 77 | 78 | def recommendBatch(self, users_in_batch, n=None, exclude_seen=True, filterTopPop = False, filterCustomItems = False): 79 | 80 | # compute the scores using the dot product 81 | user_profile_batch = self.URM_train[users_in_batch] 82 | 83 | if self.sparse_weights: 84 | scores_array = np.array(user_profile_batch.dot(self.W_sparse)) 85 | 86 | else: 87 | scores_array = user_profile_batch.dot(self.W) 88 | 89 | if self.normalize: 90 | raise ValueError("Not implemented") 91 | 92 | # To exclude seen items perform a boolean indexing and replace their score with -inf 93 | # Seen items will be at the bottom of the list but there is no guarantee they'll NOT be 94 | # recommended 95 | if exclude_seen: 96 | scores_array[user_profile_batch.nonzero()] = -np.inf 97 | 98 | if filterTopPop: 99 | scores_array[:,self.filterTopPop_ItemsID] = -np.inf 100 | 101 | if filterCustomItems: 102 | scores_array[:, self.filterCustomItems_ItemsID] = -np.inf 103 | 104 | 105 | # rank items and mirror column to obtain a ranking in descending score 106 | #ranking = (-scores_array).argsort(axis=1) 107 | #ranking = np.fliplr(ranking) 108 | #ranking = ranking[:,0:n] 109 | 110 | ranking = np.zeros((scores_array.shape[0],n), dtype=np.int) 111 | 112 | for row_index in range(scores_array.shape[0]): 113 | scores = scores_array[row_index] 114 | 115 | relevant_items_partition = (-scores).argpartition(n)[0:n] 116 | relevant_items_partition_sorting = np.argsort(-scores[relevant_items_partition]) 117 | ranking[row_index] = relevant_items_partition[relevant_items_partition_sorting] 118 | 119 | 120 | return ranking 121 | 122 | 123 | 124 | def recommend_new_user(self, user_profile, n=None, exclude_seen=True): 125 | # compute the scores using the dot product 126 | if self.sparse_weights: 127 | assert user_profile.shape[1] == self.W_sparse.shape[0], 'The number of items does not match!' 128 | scores = user_profile.dot(self.W_sparse).toarray().ravel() 129 | else: 130 | assert user_profile.shape[1] == self.W.shape[0], 'The number of items does not match!' 
131 | scores = user_profile.dot(self.W).ravel() 132 | if self.normalize: 133 | # normalization will keep the scores in the same range 134 | # of value of the ratings in dataset 135 | rated = user_profile.copy() 136 | rated.data = np.ones_like(rated.data) 137 | if self.sparse_weights: 138 | den = rated.dot(self.W_sparse).toarray().ravel() 139 | else: 140 | den = rated.dot(self.W).ravel() 141 | den[np.abs(den) < 1e-6] = 1.0 # to avoid NaNs 142 | scores /= den 143 | # rank items 144 | ranking = scores.argsort()[::-1] 145 | 146 | if exclude_seen: 147 | seen = user_profile.indices 148 | unseen_mask = np.in1d(ranking, seen, assume_unique=True, invert=True) 149 | ranking = ranking[unseen_mask] 150 | return ranking[:n] 151 | 152 | 153 | -------------------------------------------------------------------------------- /Base/cosine_similarity.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on 23/10/17 5 | 6 | @author: Maurizio Ferrari Dacrema 7 | """ 8 | 9 | import numpy as np 10 | import time, sys 11 | import scipy.sparse as sps 12 | from Base.Recommender_utils import check_matrix 13 | 14 | 15 | 16 | class Cosine_Similarity: 17 | 18 | 19 | def __init__(self, dataMatrix, topK=100, shrink = 0, normalize = True, 20 | mode = "cosine"): 21 | """ 22 | Computes the cosine similarity on the columns of dataMatrix 23 | If it is computed on URM=|users|x|items|, pass the URM as is. 24 | If it is computed on ICM=|items|x|features|, pass the ICM transposed. 25 | :param dataMatrix: 26 | :param topK: 27 | :param shrink: 28 | :param normalize: 29 | :param mode: "cosine" computes Cosine similarity 30 | "adjusted" computes Adjusted Cosine, removing the average of the users 31 | "pearson" computes Pearson Correlation, removing the average of the items 32 | "jaccard" computes Jaccard similarity for binary interactions using Tanimoto 33 | "tanimoto" computes Tanimoto coefficient for binary interactions 34 | 35 | """ 36 | 37 | super(Cosine_Similarity, self).__init__() 38 | 39 | self.TopK = topK 40 | self.shrink = shrink 41 | self.normalize = normalize 42 | self.n_columns = dataMatrix.shape[1] 43 | self.n_rows = dataMatrix.shape[0] 44 | 45 | self.dataMatrix = dataMatrix.copy() 46 | 47 | self.adjusted_cosine = False 48 | self.pearson_correlation = False 49 | self.tanimoto_coefficient = False 50 | 51 | if mode == "adjusted": 52 | self.adjusted_cosine = True 53 | elif mode == "pearson": 54 | self.pearson_correlation = True 55 | elif mode == "jaccard" or mode == "tanimoto": 56 | self.tanimoto_coefficient = True 57 | # Tanimoto has a specific kind of normalization 58 | self.normalize = False 59 | 60 | elif mode == "cosine": 61 | pass 62 | else: 63 | raise ValueError("Cosine_Similarity: value for paramether 'mode' not recognized." 64 | " Allowed values are: 'cosine', 'pearson', 'adjusted', 'jaccard', 'tanimoto'." 
65 | " Passed value was '{}'".format(mode)) 66 | 67 | 68 | 69 | if self.TopK == 0: 70 | self.W_dense = np.zeros((self.n_columns, self.n_columns)) 71 | 72 | 73 | 74 | 75 | def applyAdjustedCosine(self): 76 | """ 77 | Remove from every data point the average for the corresponding row 78 | :return: 79 | """ 80 | 81 | self.dataMatrix = check_matrix(self.dataMatrix, 'csr') 82 | 83 | 84 | interactionsPerRow = np.diff(self.dataMatrix.indptr) 85 | 86 | nonzeroRows = interactionsPerRow > 0 87 | sumPerRow = np.asarray(self.dataMatrix.sum(axis=1)).ravel() 88 | 89 | rowAverage = np.zeros_like(sumPerRow) 90 | rowAverage[nonzeroRows] = sumPerRow[nonzeroRows] / interactionsPerRow[nonzeroRows] 91 | 92 | 93 | # Split in blocks to avoid duplicating the whole data structure 94 | start_row = 0 95 | end_row= 0 96 | 97 | blockSize = 1000 98 | 99 | 100 | while end_row < self.n_rows: 101 | 102 | end_row = min(self.n_rows, end_row + blockSize) 103 | 104 | self.dataMatrix.data[self.dataMatrix.indptr[start_row]:self.dataMatrix.indptr[end_row]] -= \ 105 | np.repeat(rowAverage[start_row:end_row], interactionsPerRow[start_row:end_row]) 106 | 107 | start_row += blockSize 108 | 109 | 110 | 111 | 112 | def applyPearsonCorrelation(self): 113 | """ 114 | Remove from every data point the average for the corresponding column 115 | :return: 116 | """ 117 | 118 | self.dataMatrix = check_matrix(self.dataMatrix, 'csc') 119 | 120 | 121 | interactionsPerCol = np.diff(self.dataMatrix.indptr) 122 | 123 | nonzeroCols = interactionsPerCol > 0 124 | sumPerCol = np.asarray(self.dataMatrix.sum(axis=0)).ravel() 125 | 126 | colAverage = np.zeros_like(sumPerCol) 127 | colAverage[nonzeroCols] = sumPerCol[nonzeroCols] / interactionsPerCol[nonzeroCols] 128 | 129 | 130 | # Split in blocks to avoid duplicating the whole data structure 131 | start_col = 0 132 | end_col= 0 133 | 134 | blockSize = 1000 135 | 136 | 137 | while end_col < self.n_columns: 138 | 139 | end_col = min(self.n_columns, end_col + blockSize) 140 | 141 | self.dataMatrix.data[self.dataMatrix.indptr[start_col]:self.dataMatrix.indptr[end_col]] -= \ 142 | np.repeat(colAverage[start_col:end_col], interactionsPerCol[start_col:end_col]) 143 | 144 | start_col += blockSize 145 | 146 | 147 | def useOnlyBooleanInteractions(self): 148 | 149 | # Split in blocks to avoid duplicating the whole data structure 150 | start_pos = 0 151 | end_pos= 0 152 | 153 | blockSize = 1000 154 | 155 | 156 | while end_pos < len(self.dataMatrix.data): 157 | 158 | end_pos = min(len(self.dataMatrix.data), end_pos + blockSize) 159 | 160 | self.dataMatrix.data[start_pos:end_pos] = np.ones(end_pos-start_pos) 161 | 162 | start_pos += blockSize 163 | 164 | 165 | 166 | 167 | def compute_similarity(self): 168 | 169 | values = [] 170 | rows = [] 171 | cols = [] 172 | 173 | start_time = time.time() 174 | start_time_print_batch = start_time 175 | processedItems = 0 176 | 177 | if self.adjusted_cosine: 178 | self.applyAdjustedCosine() 179 | 180 | elif self.pearson_correlation: 181 | self.applyPearsonCorrelation() 182 | 183 | elif self.tanimoto_coefficient: 184 | self.useOnlyBooleanInteractions() 185 | 186 | 187 | # We explore the matrix column-wise 188 | self.dataMatrix = check_matrix(self.dataMatrix, 'csc') 189 | 190 | 191 | # Compute sum of squared values to be used in normalization 192 | sumOfSquared = np.array(self.dataMatrix.power(2).sum(axis=0)).ravel() 193 | 194 | # Tanimoto does not require the square root to be applied 195 | if not self.tanimoto_coefficient: 196 | sumOfSquared = np.sqrt(sumOfSquared) 197 | 198 | 199 
| # Compute all similarities for each item using vectorization 200 | for columnIndex in range(self.n_columns): 201 | 202 | processedItems += 1 203 | 204 | if time.time() - start_time_print_batch >= 30 or processedItems==self.n_columns: 205 | columnPerSec = processedItems / (time.time() - start_time) 206 | 207 | print("Similarity column {} ( {:2.0f} % ), {:.2f} column/sec, elapsed time {:.2f} min".format( 208 | processedItems, processedItems / self.n_columns * 100, columnPerSec, (time.time() - start_time)/ 60)) 209 | 210 | sys.stdout.flush() 211 | sys.stderr.flush() 212 | 213 | start_time_print_batch = time.time() 214 | 215 | 216 | # All data points for a given item 217 | item_data = self.dataMatrix[:, columnIndex] 218 | item_data = item_data.toarray().squeeze() 219 | 220 | # Compute item similarities 221 | this_column_weights = self.dataMatrix.T.dot(item_data) 222 | this_column_weights[columnIndex] = 0.0 223 | 224 | # Apply normalization and shrinkage, ensure denominator != 0 225 | if self.normalize: 226 | denominator = sumOfSquared[columnIndex] * sumOfSquared + self.shrink + 1e-6 227 | this_column_weights = np.multiply(this_column_weights, 1 / denominator) 228 | 229 | # Apply the specific denominator for Tanimoto 230 | elif self.tanimoto_coefficient: 231 | denominator = sumOfSquared[columnIndex] + sumOfSquared - this_column_weights + self.shrink + 1e-6 232 | this_column_weights = np.multiply(this_column_weights, 1 / denominator) 233 | 234 | # If no normalization or tanimoto is selected, apply only shrink 235 | elif self.shrink != 0: 236 | this_column_weights = this_column_weights/self.shrink 237 | 238 | 239 | if self.TopK == 0: 240 | self.W_dense[:, columnIndex] = this_column_weights 241 | 242 | else: 243 | # Sort indices and select TopK 244 | # Sorting is done in three steps. 
Faster then plain np.argsort for higher number of items 245 | # - Partition the data to extract the set of relevant items 246 | # - Sort only the relevant items 247 | # - Get the original item index 248 | relevant_items_partition = (-this_column_weights).argpartition(self.TopK-1)[0:self.TopK] 249 | relevant_items_partition_sorting = np.argsort(-this_column_weights[relevant_items_partition]) 250 | top_k_idx = relevant_items_partition[relevant_items_partition_sorting] 251 | 252 | # Incrementally build sparse matrix 253 | values.extend(this_column_weights[top_k_idx]) 254 | rows.extend(top_k_idx) 255 | cols.extend(np.ones(self.TopK) * columnIndex) 256 | 257 | if self.TopK == 0: 258 | return self.W_dense 259 | 260 | else: 261 | 262 | W_sparse = sps.csr_matrix((values, (rows, cols)), 263 | shape=(self.n_columns, self.n_columns), 264 | dtype=np.float32) 265 | 266 | 267 | return W_sparse -------------------------------------------------------------------------------- /Base/cosine_similarity_test.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on 23/10/17 3 | 4 | @author: Maurizio Ferrari Dacrema 5 | """ 6 | 7 | import unittest 8 | 9 | from Base.Recommender_utils import similarityMatrixTopK 10 | import subprocess, os 11 | import numpy as np 12 | import time 13 | import scipy.sparse as sps 14 | 15 | 16 | 17 | def areSparseEquals(Sparse1, Sparse2): 18 | 19 | if(Sparse1.shape != Sparse2.shape): 20 | return False 21 | 22 | return (Sparse1 - Sparse2).nnz ==0 23 | 24 | 25 | 26 | 27 | class MyTestCase(unittest.TestCase): 28 | 29 | def test_cosine_similarity_dense(self): 30 | 31 | from Base.Cython.cosine_similarity import Cosine_Similarity as Cosine_Similarity_Cython 32 | from Base.cosine_similarity import Cosine_Similarity as Cosine_Similarity_Python 33 | 34 | TopK = 0 35 | 36 | data_matrix = np.array([[1,1,0,1],[0,1,1,1],[1,0,1,0]]) 37 | data_matrix = sps.csr_matrix(data_matrix) 38 | 39 | cosine_similarity = Cosine_Similarity_Cython(data_matrix, topK=TopK, normalize = False) 40 | W_dense_Cython = cosine_similarity.compute_similarity() 41 | 42 | cosine_similarity = Cosine_Similarity_Python(data_matrix, topK=TopK, normalize = False) 43 | W_dense_Python = cosine_similarity.compute_similarity() 44 | 45 | 46 | W_dense_mul = data_matrix.T.dot(data_matrix) 47 | W_dense_mul[np.arange(W_dense_mul.shape[0]),np.arange(W_dense_mul.shape[0])] = 0.0 48 | 49 | assert np.all(W_dense_Cython == W_dense_mul), "W_dense_Cython not matching control" 50 | assert np.all(W_dense_Python == W_dense_mul), "W_dense_Python not matching control" 51 | 52 | 53 | 54 | 55 | def test_cosine_similarity_dense_external_cfr(self): 56 | 57 | from Base.Cython.cosine_similarity import Cosine_Similarity as Cosine_Similarity_Cython 58 | from Base.cosine_similarity import Cosine_Similarity as Cosine_Similarity_Python 59 | from sklearn.metrics.pairwise import cosine_similarity as Cosine_Similarity_Sklearn 60 | 61 | from scipy.spatial.distance import jaccard as Jaccard_Distance_Scipy 62 | 63 | 64 | TopK = 0 65 | shrink = 0 66 | 67 | data_matrix = np.array([[1,2,0,1],[0,1,4,1],[1,3,1,0]]) 68 | data_matrix = sps.csr_matrix(data_matrix) 69 | 70 | cosine_similarity = Cosine_Similarity_Cython(data_matrix, topK=TopK, normalize = True, shrink=shrink) 71 | W_dense_Cython = cosine_similarity.compute_similarity() 72 | 73 | cosine_similarity = Cosine_Similarity_Python(data_matrix, topK=TopK, normalize = True, shrink=shrink) 74 | W_dense_Python = cosine_similarity.compute_similarity() 75 | 76 | 
W_dense_sklearn = Cosine_Similarity_Sklearn(data_matrix.copy().T) 77 | W_dense_sklearn[np.arange(W_dense_sklearn.shape[0]),np.arange(W_dense_sklearn.shape[0])] = 0.0 78 | 79 | 80 | assert np.allclose(W_dense_Cython, W_dense_sklearn, atol=1e-4), "W_dense_Cython Cosine not matching Sklearn control" 81 | assert np.allclose(W_dense_Python, W_dense_sklearn, atol=1e-4), "W_dense_Python Cosine not matching Sklearn control" 82 | 83 | 84 | data_matrix = np.array([[1,2,0,1],[0,1,4,1],[1,3,1,0]]) 85 | data_matrix = sps.csr_matrix(data_matrix) 86 | 87 | 88 | cosine_similarity = Cosine_Similarity_Cython(data_matrix, topK=TopK, normalize = True, shrink=shrink, 89 | mode='jaccard') 90 | W_dense_Cython = cosine_similarity.compute_similarity() 91 | 92 | cosine_similarity = Cosine_Similarity_Python(data_matrix, topK=TopK, normalize = True, shrink=shrink, 93 | mode='jaccard') 94 | W_dense_Python = cosine_similarity.compute_similarity() 95 | 96 | 97 | W_dense_Scipy = np.zeros_like(W_dense_Python) 98 | data_matrix.data = np.ones_like(data_matrix.data) 99 | data_matrix = data_matrix.toarray() 100 | 101 | for row in range(W_dense_Scipy.shape[0]): 102 | for col in range(W_dense_Scipy.shape[1]): 103 | 104 | if row != col: 105 | W_dense_Scipy[row, col] = 1-Jaccard_Distance_Scipy(data_matrix[:,row], data_matrix[:,col]) 106 | 107 | 108 | assert np.allclose(W_dense_Cython, W_dense_Scipy, atol=1e-4), "W_dense_Cython Jaccard not matching Scipy control" 109 | assert np.allclose(W_dense_Python, W_dense_Scipy, atol=1e-4), "W_dense_Python Jaccard not matching Scipy control" 110 | 111 | 112 | 113 | 114 | 115 | def test_cosine_similarity_dense_normalize(self): 116 | 117 | from Base.Cython.cosine_similarity import Cosine_Similarity as Cosine_Similarity_Cython 118 | from Base.cosine_similarity import Cosine_Similarity as Cosine_Similarity_Python 119 | 120 | import numpy.matlib 121 | 122 | TopK = 0 123 | shrink = 5 124 | 125 | data_matrix = np.array([[1,1,0,1],[0,1,1,1],[1,0,1,0]]) 126 | data_matrix = sps.csr_matrix(data_matrix) 127 | 128 | cosine_similarity = Cosine_Similarity_Cython(data_matrix, topK=TopK, normalize = True, shrink=shrink) 129 | W_dense_Cython = cosine_similarity.compute_similarity() 130 | 131 | cosine_similarity = Cosine_Similarity_Python(data_matrix, topK=TopK, normalize = True, shrink=shrink) 132 | W_dense_Python = cosine_similarity.compute_similarity() 133 | 134 | 135 | W_dense_denominator = np.matlib.repmat(data_matrix.power(2).sum(axis=0), data_matrix.shape[1], 1) 136 | W_dense_denominator = np.sqrt(W_dense_denominator) 137 | W_dense_denominator = np.multiply(W_dense_denominator, W_dense_denominator.T) + shrink 138 | 139 | W_dense_mul = data_matrix.T.dot(data_matrix) 140 | W_dense_mul /= W_dense_denominator 141 | 142 | W_dense_mul[np.arange(W_dense_mul.shape[0]),np.arange(W_dense_mul.shape[0])] = 0.0 143 | 144 | 145 | assert np.allclose(W_dense_Cython, W_dense_mul, atol=1e-4), "W_dense_Cython not matching control" 146 | assert np.allclose(W_dense_Python, W_dense_mul, atol=1e-4), "W_dense_Python not matching control" 147 | 148 | 149 | 150 | 151 | def test_cosine_similarity_dense_adjusted(self): 152 | 153 | from Base.Cython.cosine_similarity import Cosine_Similarity as Cosine_Similarity_Cython 154 | from Base.cosine_similarity import Cosine_Similarity as Cosine_Similarity_Python 155 | 156 | import numpy.matlib 157 | 158 | TopK = 0 159 | shrink = 0 160 | 161 | data_matrix = np.array([[1,2,0,1],[0,1,4,1],[1,3,1,0]]) 162 | data_matrix = sps.csr_matrix(data_matrix) 163 | 164 | cosine_similarity = 
Cosine_Similarity_Cython(data_matrix, topK=TopK, normalize = True, 165 | shrink=shrink, mode='adjusted') 166 | W_dense_Cython = cosine_similarity.compute_similarity() 167 | 168 | cosine_similarity = Cosine_Similarity_Python(data_matrix, topK=TopK, normalize = True, 169 | shrink=shrink, mode='adjusted') 170 | W_dense_Python = cosine_similarity.compute_similarity() 171 | 172 | 173 | data_matrix = data_matrix.toarray().astype(np.float64) 174 | for row in range(data_matrix.shape[0]): 175 | 176 | nonzeroMask = data_matrix[row,:]>0 177 | data_matrix[row,:][nonzeroMask] -= np.mean(data_matrix[row,:][nonzeroMask]) 178 | 179 | 180 | W_dense_denominator = np.matlib.repmat((data_matrix**2).sum(axis=0), data_matrix.shape[1], 1) 181 | W_dense_denominator = np.sqrt(W_dense_denominator) 182 | W_dense_denominator = np.multiply(W_dense_denominator, W_dense_denominator.T) + shrink 183 | 184 | W_dense_mul = data_matrix.T.dot(data_matrix) 185 | W_dense_mul[W_dense_denominator>0] /= W_dense_denominator[W_dense_denominator>0] 186 | 187 | W_dense_mul[np.arange(W_dense_mul.shape[0]),np.arange(W_dense_mul.shape[0])] = 0.0 188 | 189 | assert np.allclose(W_dense_Cython, W_dense_mul, atol=1e-4), "W_dense_Cython not matching control" 190 | assert np.allclose(W_dense_Python, W_dense_mul, atol=1e-4), "W_dense_Python not matching control" 191 | 192 | 193 | 194 | def test_cosine_similarity_dense_pearson(self): 195 | 196 | from Base.Cython.cosine_similarity import Cosine_Similarity as Cosine_Similarity_Cython 197 | from Base.cosine_similarity import Cosine_Similarity as Cosine_Similarity_Python 198 | 199 | import numpy.matlib 200 | 201 | TopK = 0 202 | shrink = 0 203 | 204 | data_matrix = np.array([[1,2,0,1],[0,1,4,1],[1,3,1,0]]) 205 | data_matrix = sps.csr_matrix(data_matrix) 206 | 207 | cosine_similarity = Cosine_Similarity_Cython(data_matrix, topK=TopK, normalize = True, 208 | shrink=shrink, mode='pearson') 209 | W_dense_Cython = cosine_similarity.compute_similarity() 210 | 211 | cosine_similarity = Cosine_Similarity_Python(data_matrix, topK=TopK, normalize = True, 212 | shrink=shrink, mode='pearson') 213 | W_dense_Python = cosine_similarity.compute_similarity() 214 | 215 | 216 | data_matrix = data_matrix.toarray().astype(np.float64) 217 | for col in range(data_matrix.shape[1]): 218 | 219 | nonzeroMask = data_matrix[:,col]>0 220 | data_matrix[:,col][nonzeroMask] -= np.mean(data_matrix[:,col][nonzeroMask]) 221 | 222 | 223 | W_dense_denominator = np.matlib.repmat((data_matrix**2).sum(axis=0), data_matrix.shape[1], 1) 224 | W_dense_denominator = np.sqrt(W_dense_denominator) 225 | W_dense_denominator = np.multiply(W_dense_denominator, W_dense_denominator.T) + shrink 226 | 227 | W_dense_mul = data_matrix.T.dot(data_matrix) 228 | W_dense_mul[W_dense_denominator>0] /= W_dense_denominator[W_dense_denominator>0] 229 | 230 | W_dense_mul[np.arange(W_dense_mul.shape[0]),np.arange(W_dense_mul.shape[0])] = 0.0 231 | 232 | assert np.allclose(W_dense_Cython, W_dense_mul, atol=1e-4), "W_dense_Cython not matching control" 233 | assert np.allclose(W_dense_Python, W_dense_mul, atol=1e-4), "W_dense_Python not matching control" 234 | 235 | 236 | 237 | def test_cosine_similarity_dense_jaccard(self): 238 | 239 | from Base.Cython.cosine_similarity import Cosine_Similarity as Cosine_Similarity_Cython 240 | from Base.cosine_similarity import Cosine_Similarity as Cosine_Similarity_Python 241 | 242 | import numpy.matlib 243 | 244 | TopK = 0 245 | shrink = 0 246 | 247 | data_matrix = np.array([[1,2,0,1],[0,1,4,1],[1,3,1,0]]) 248 | data_matrix = 
sps.csr_matrix(data_matrix) 249 | 250 | cosine_similarity = Cosine_Similarity_Cython(data_matrix, topK=TopK, normalize = True, 251 | shrink=shrink, mode='jaccard') 252 | W_dense_Cython = cosine_similarity.compute_similarity() 253 | 254 | cosine_similarity = Cosine_Similarity_Python(data_matrix, topK=TopK, normalize = True, 255 | shrink=shrink, mode='jaccard') 256 | W_dense_Python = cosine_similarity.compute_similarity() 257 | 258 | 259 | data_matrix.data = np.ones_like(data_matrix.data) 260 | data_matrix = data_matrix.toarray().astype(np.float64) 261 | 262 | W_dense_mul = data_matrix.T.dot(data_matrix) 263 | 264 | 265 | W_dense_denominator = np.matlib.repmat((data_matrix**2).sum(axis=0), data_matrix.shape[1], 1) 266 | W_dense_denominator = W_dense_denominator + W_dense_denominator.T - W_dense_mul + shrink 267 | 268 | W_dense_mul[W_dense_denominator>0] /= W_dense_denominator[W_dense_denominator>0] 269 | 270 | W_dense_mul[np.arange(W_dense_mul.shape[0]),np.arange(W_dense_mul.shape[0])] = 0.0 271 | 272 | assert np.allclose(W_dense_Cython, W_dense_mul, atol=1e-4), "W_dense_Cython not matching control" 273 | assert np.allclose(W_dense_Python, W_dense_mul, atol=1e-4), "W_dense_Python not matching control" 274 | 275 | 276 | 277 | def test_cosine_similarity_dense_big(self): 278 | 279 | from Base.Cython.cosine_similarity import Cosine_Similarity as Cosine_Similarity_Cython 280 | from Base.cosine_similarity import Cosine_Similarity as Cosine_Similarity_Python 281 | 282 | TopK = 0 283 | n_items = 500 284 | n_users = 1000 285 | 286 | data_matrix = sps.random(n_users, n_items, density=0.1) 287 | 288 | cosine_similarity = Cosine_Similarity_Cython(data_matrix, topK=TopK, normalize = False) 289 | W_dense_Cython = cosine_similarity.compute_similarity() 290 | 291 | cosine_similarity = Cosine_Similarity_Python(data_matrix, topK=TopK, normalize = False) 292 | W_dense_Python = cosine_similarity.compute_similarity() 293 | 294 | 295 | W_dense_mul = data_matrix.T.dot(data_matrix).toarray() 296 | W_dense_mul[np.arange(W_dense_mul.shape[0]),np.arange(W_dense_mul.shape[0])] = 0.0 297 | 298 | assert np.allclose(W_dense_Cython, W_dense_mul, atol=1e-4), "W_dense_Cython not matching control" 299 | assert np.allclose(W_dense_Python, W_dense_mul, atol=1e-4), "W_dense_Python not matching control" 300 | 301 | 302 | def test_cosine_similarity_TopK(self): 303 | 304 | from Base.Cython.cosine_similarity import Cosine_Similarity as Cosine_Similarity_Cython 305 | from Base.cosine_similarity import Cosine_Similarity as Cosine_Similarity_Python 306 | 307 | TopK=4 308 | 309 | data_matrix = np.array([[1,1,0,1],[0,1,1,1],[1,0,1,0]]) 310 | data_matrix = sps.csr_matrix(data_matrix) 311 | 312 | cosine_similarity = Cosine_Similarity_Cython(data_matrix, topK=TopK, normalize = False) 313 | W_dense_Cython = cosine_similarity.compute_similarity().toarray() 314 | 315 | cosine_similarity = Cosine_Similarity_Python(data_matrix, topK=TopK, normalize = False) 316 | W_dense_Python = cosine_similarity.compute_similarity().toarray() 317 | 318 | 319 | W_dense_mul = data_matrix.T.dot(data_matrix) 320 | W_dense_mul[np.arange(W_dense_mul.shape[0]),np.arange(W_dense_mul.shape[0])] = 0.0 321 | 322 | W_dense_mul = similarityMatrixTopK(W_dense_mul, k=TopK).toarray() 323 | 324 | assert np.allclose(W_dense_Cython, W_dense_mul, atol=1e-4), "W_sparse_Cython not matching control" 325 | assert np.allclose(W_dense_Python, W_dense_mul, atol=1e-4), "W_dense_Python not matching control" 326 | 327 | 328 | 329 | def test_cosine_similarity_TopK_big(self): 330 | 331 | 
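# End-to-end check of the TopK path, here with TopK equal to the number of items.
# The column-wise selection inside compute_similarity() relies on np.argpartition;
# a minimal sketch of the idea, with illustrative names not taken from the repository:
#
#   top_k_part = (-scores).argpartition(k - 1)[:k]              # unordered top-k candidates
#   top_k_idx  = top_k_part[np.argsort(-scores[top_k_part])]    # order only those k
#
# argpartition only partially orders the array, which is why it is preferred here
# over a full np.argsort when just the k largest weights per column are needed.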
from Base.Cython.cosine_similarity import Cosine_Similarity as Cosine_Similarity_Cython 332 | from Base.cosine_similarity import Cosine_Similarity as Cosine_Similarity_Python 333 | 334 | 335 | n_items = 500 336 | n_users = 1000 337 | TopK = n_items 338 | 339 | 340 | data_matrix = sps.random(n_users, n_items, density=0.1) 341 | 342 | cosine_similarity = Cosine_Similarity_Cython(data_matrix, topK=TopK, normalize = False) 343 | W_dense_Cython = cosine_similarity.compute_similarity().toarray() 344 | 345 | cosine_similarity = Cosine_Similarity_Python(data_matrix, topK=TopK, normalize = False) 346 | W_dense_Python = cosine_similarity.compute_similarity().toarray() 347 | 348 | W_dense_mul = data_matrix.T.dot(data_matrix) 349 | W_dense_mul[np.arange(W_dense_mul.shape[0]),np.arange(W_dense_mul.shape[0])] = 0.0 350 | 351 | W_dense_mul = similarityMatrixTopK(W_dense_mul, k=TopK).toarray() 352 | 353 | assert np.allclose(W_dense_Cython, W_dense_mul, atol=1e-4), "W_sparse_Cython not matching control" 354 | assert np.allclose(W_dense_Python, W_dense_mul, atol=1e-4), "W_dense_Python not matching control" 355 | 356 | 357 | 358 | 359 | def runCompilationScript(): 360 | 361 | # Run compile script setting the working directory to ensure the compiled file are contained in the 362 | # appropriate subfolder and not the project root 363 | 364 | compiledModuleSubfolder = "/Cython" 365 | fileToCompile = 'cosine_similarity.pyx' 366 | 367 | command = ['python', 368 | 'compileCython.py', 369 | fileToCompile, 370 | 'build_ext', 371 | '--inplace' 372 | ] 373 | 374 | output = subprocess.check_output(' '.join(command), shell=True, cwd=os.getcwd() + compiledModuleSubfolder) 375 | 376 | 377 | try: 378 | 379 | command = ['cython', 380 | fileToCompile, 381 | '-a' 382 | ] 383 | 384 | output = subprocess.check_output(' '.join(command), shell=True, cwd=os.getcwd() + compiledModuleSubfolder) 385 | 386 | except: 387 | pass 388 | 389 | print("Compiled module saved in subfolder: {}".format(compiledModuleSubfolder)) 390 | 391 | # Command to run compilation script 392 | #python compileCython.py cosine_similarity.pyx build_ext --inplace 393 | 394 | # Command to generate html report 395 | #subprocess.call(["cython", "-a", "cosine_similarity.pyx"]) 396 | 397 | if __name__ == '__main__': 398 | 399 | from data.Movielens10MReader import Movielens10MReader 400 | 401 | runCompilationScript() 402 | 403 | unittest.main() 404 | # 405 | # from data.NetflixEnhanced.NetflixEnhancedReader import NetflixEnhancedReader 406 | # 407 | # from Base.Cython.cosine_similarity import Cosine_Similarity as Cosine_Similarity_Cython 408 | # from Base.Cython.cosine_similarity import cosine_common 409 | # 410 | # from Base.cosine_similarity import Cosine_Similarity as Cosine_Similarity_Python 411 | # 412 | # from Base.Recommender_utils import similarityMatrixTopK 413 | # 414 | # TopK = 100 415 | # 416 | # dataReader = Movielens10MReader() 417 | # #dataReader = NetflixEnhancedReader() 418 | # URM_train = dataReader.get_URM_train() 419 | # 420 | # start_time = time.time() 421 | # cosine_similarity = Cosine_Similarity_Cython(URM_train, TopK=TopK) 422 | # W_sparse_Cython = cosine_similarity.compute_similarity() 423 | # print("Cosine_Similarity_Cython {:.2f} sec, {:.2f} item/sec".format(time.time() - start_time, 424 | # URM_train.shape[1] / (time.time() - start_time))) 425 | # 426 | # start_time = time.time() 427 | # W_cosine_common = cosine_common(URM_train) 428 | # print("Cosine common {:.2f} sec, {:.2f} item/sec".format(time.time()-start_time, URM_train.shape[1] / 
(time.time() - start_time))) 429 | # 430 | # start_time = time.time() 431 | # cosine_similarity = Cosine_Similarity_Python(URM_train, TopK=TopK) 432 | # W_sparse_Python = cosine_similarity.compute_similarity() 433 | # print("Cosine_Similarity_Python {:.2f} sec, {:.2f} item/sec".format(time.time() - start_time, 434 | # URM_train.shape[1] / (time.time() - start_time))) 435 | # 436 | 437 | # start_time = time.time() 438 | # product = URM_train.T.dot(URM_train) 439 | # product[np.arange(product.shape[0]),np.arange(product.shape[0])] = 0.0 440 | # 441 | # W_sparse_Control = similarityMatrixTopK(product, k=TopK).toarray() 442 | # print("similarityMatrixTopK {:.2f} sec, {:.2f} item/sec".format(time.time() - start_time, 443 | # URM_train.shape[1] / (time.time() - start_time))) 444 | 445 | 446 | 447 | -------------------------------------------------------------------------------- /Base/metrics.py: -------------------------------------------------------------------------------- 1 | 2 | """ 3 | 4 | @author: Massimo Quadrana 5 | """ 6 | 7 | import numpy as np 8 | import unittest 9 | 10 | 11 | 12 | def roc_auc(is_relevant): 13 | #is_relevant = np.in1d(ranked_list, pos_items, assume_unique=True) 14 | ranks = np.arange(len(is_relevant)) 15 | pos_ranks = ranks[is_relevant] 16 | neg_ranks = ranks[~is_relevant] 17 | auc_score = 0.0 18 | if len(neg_ranks) == 0: 19 | return 1.0 20 | if len(pos_ranks) > 0: 21 | for pos_pred in pos_ranks: 22 | auc_score += np.sum(pos_pred < neg_ranks, dtype=np.float32) 23 | auc_score /= (pos_ranks.shape[0] * neg_ranks.shape[0]) 24 | assert 0 <= auc_score <= 1, auc_score 25 | return auc_score 26 | 27 | 28 | def precision(is_relevant): 29 | #ranked_list = ranked_list[:at] 30 | #is_relevant = np.in1d(is_relevant, pos_items, assume_unique=True) 31 | precision_score = np.sum(is_relevant, dtype=np.float32) / len(is_relevant) 32 | assert 0 <= precision_score <= 1, precision_score 33 | return precision_score 34 | 35 | 36 | def recall(is_relevant, pos_items): 37 | #ranked_list = ranked_list[:at] 38 | #is_relevant = np.in1d(ranked_list, pos_items, assume_unique=True) 39 | recall_score = np.sum(is_relevant, dtype=np.float32) / pos_items.shape[0] 40 | assert 0 <= recall_score <= 1, recall_score 41 | return recall_score 42 | 43 | 44 | def rr(is_relevant): 45 | # reciprocal rank of the FIRST relevant item in the ranked list (0 if none) 46 | #ranked_list = ranked_list[:at] 47 | #is_relevant = np.in1d(ranked_list, pos_items, assume_unique=True) 48 | ranks = np.arange(1, len(is_relevant) + 1)[is_relevant] 49 | if len(ranks) > 0: 50 | return 1. 
/ ranks[0] 51 | else: 52 | return 0.0 53 | 54 | 55 | def map(is_relevant, pos_items): 56 | #ranked_list = ranked_list[:at] 57 | #is_relevant = np.in1d(ranked_list, pos_items, assume_unique=True) 58 | p_at_k = is_relevant * np.cumsum(is_relevant, dtype=np.float32) / (1 + np.arange(is_relevant.shape[0])) 59 | map_score = np.sum(p_at_k) / np.min([pos_items.shape[0], is_relevant.shape[0]]) 60 | assert 0 <= map_score <= 1, map_score 61 | return map_score 62 | 63 | 64 | def ndcg(ranked_list, pos_items, relevance=None, at=None): 65 | if relevance is None: 66 | relevance = np.ones_like(pos_items) 67 | assert len(relevance) == pos_items.shape[0] 68 | it2rel = {it: r for it, r in zip(pos_items, relevance)} 69 | rank_scores = np.asarray([it2rel.get(it, 0.0) for it in ranked_list[:at]], dtype=np.float32) 70 | ideal_dcg = dcg(np.sort(relevance)[::-1]) 71 | rank_dcg = dcg(rank_scores) 72 | ndcg_ = rank_dcg / ideal_dcg 73 | # assert 0 <= ndcg_ <= 1, (rank_dcg, ideal_dcg, ndcg_) 74 | return ndcg_ 75 | 76 | 77 | def dcg(scores): 78 | return np.sum(np.divide(np.power(2, scores) - 1, np.log(np.arange(scores.shape[0], dtype=np.float32) + 2)), 79 | dtype=np.float32) 80 | 81 | 82 | metrics = ['AUC', 'Precision' 'Recall', 'MAP', 'NDCG'] 83 | 84 | 85 | def pp_metrics(metric_names, metric_values, metric_at): 86 | """ 87 | Pretty-prints metric values 88 | :param metrics_arr: 89 | :return: 90 | """ 91 | assert len(metric_names) == len(metric_values) 92 | if isinstance(metric_at, int): 93 | metric_at = [metric_at] * len(metric_values) 94 | return ' '.join(['{}: {:.4f}'.format(mname, mvalue) if mcutoff is None or mcutoff == 0 else 95 | '{}@{}: {:.4f}'.format(mname, mcutoff, mvalue) 96 | for mname, mcutoff, mvalue in zip(metric_names, metric_at, metric_values)]) 97 | 98 | 99 | class TestAUC(unittest.TestCase): 100 | def runTest(self): 101 | pos_items = np.asarray([2, 4]) 102 | ranked_list = np.asarray([1, 2, 3, 4, 5]) 103 | self.assertTrue(np.allclose(roc_auc(ranked_list, pos_items), 104 | (2. / 3 + 1. / 3) / 2)) 105 | 106 | 107 | class TestRecall(unittest.TestCase): 108 | def runTest(self): 109 | pos_items = np.asarray([2, 4, 5, 10]) 110 | ranked_list_1 = np.asarray([1, 2, 3, 4, 5]) 111 | ranked_list_2 = np.asarray([10, 5, 2, 4, 3]) 112 | ranked_list_3 = np.asarray([1, 3, 6, 7, 8]) 113 | self.assertTrue(np.allclose(recall(ranked_list_1, pos_items), 3. / 4)) 114 | self.assertTrue(np.allclose(recall(ranked_list_2, pos_items), 1.0)) 115 | self.assertTrue(np.allclose(recall(ranked_list_3, pos_items), 0.0)) 116 | 117 | thresholds = [1, 2, 3, 4, 5] 118 | values = [0.0, 1. / 4, 1. / 4, 2. / 4, 3. / 4] 119 | for at, val in zip(thresholds, values): 120 | self.assertTrue(np.allclose(np.asarray(recall(ranked_list_1, pos_items, at=at)), val)) 121 | 122 | 123 | class TestPrecision(unittest.TestCase): 124 | def runTest(self): 125 | pos_items = np.asarray([2, 4, 5, 10]) 126 | ranked_list_1 = np.asarray([1, 2, 3, 4, 5]) 127 | ranked_list_2 = np.asarray([10, 5, 2, 4, 3]) 128 | ranked_list_3 = np.asarray([1, 3, 6, 7, 8]) 129 | self.assertTrue(np.allclose(precision(ranked_list_1, pos_items), 3. / 5)) 130 | self.assertTrue(np.allclose(precision(ranked_list_2, pos_items), 4. / 5)) 131 | self.assertTrue(np.allclose(precision(ranked_list_3, pos_items), 0.0)) 132 | 133 | thresholds = [1, 2, 3, 4, 5] 134 | values = [0.0, 1. / 2, 1. / 3, 2. / 4, 3. 
/ 5] 135 | for at, val in zip(thresholds, values): 136 | self.assertTrue(np.allclose(np.asarray(precision(ranked_list_1, pos_items, at=at)), val)) 137 | 138 | 139 | class TestRR(unittest.TestCase): 140 | def runTest(self): 141 | pos_items = np.asarray([2, 4, 5, 10]) 142 | ranked_list_1 = np.asarray([1, 2, 3, 4, 5]) 143 | ranked_list_2 = np.asarray([10, 5, 2, 4, 3]) 144 | ranked_list_3 = np.asarray([1, 3, 6, 7, 8]) 145 | self.assertTrue(np.allclose(rr(ranked_list_1, pos_items), 1. / 2)) 146 | self.assertTrue(np.allclose(rr(ranked_list_2, pos_items), 1.)) 147 | self.assertTrue(np.allclose(rr(ranked_list_3, pos_items), 0.0)) 148 | 149 | thresholds = [1, 2, 3, 4, 5] 150 | values = [0.0, 1. / 2, 1. / 2, 1. / 2, 1. / 2] 151 | for at, val in zip(thresholds, values): 152 | self.assertTrue(np.allclose(np.asarray(rr(ranked_list_1, pos_items, at=at)), val)) 153 | 154 | 155 | class TestMAP(unittest.TestCase): 156 | def runTest(self): 157 | pos_items = np.asarray([2, 4, 5, 10]) 158 | ranked_list_1 = np.asarray([1, 2, 3, 4, 5]) 159 | ranked_list_2 = np.asarray([10, 5, 2, 4, 3]) 160 | ranked_list_3 = np.asarray([1, 3, 6, 7, 8]) 161 | ranked_list_4 = np.asarray([11, 12, 13, 14, 15, 16, 2, 4, 5, 10]) 162 | ranked_list_5 = np.asarray([2, 11, 12, 13, 14, 15, 4, 5, 10, 16]) 163 | self.assertTrue(np.allclose(map(ranked_list_1, pos_items), (1. / 2 + 2. / 4 + 3. / 5) / 4)) 164 | self.assertTrue(np.allclose(map(ranked_list_2, pos_items), 1.0)) 165 | self.assertTrue(np.allclose(map(ranked_list_3, pos_items), 0.0)) 166 | self.assertTrue(np.allclose(map(ranked_list_4, pos_items), (1. / 7 + 2. / 8 + 3. / 9 + 4. / 10) / 4)) 167 | self.assertTrue(np.allclose(map(ranked_list_5, pos_items), (1. + 2. / 7 + 3. / 8 + 4. / 9) / 4)) 168 | 169 | thresholds = [1, 2, 3, 4, 5] 170 | values = [ 171 | 0.0, 172 | 1. / 2 / 2, 173 | 1. / 2 / 3, 174 | (1. / 2 + 2. / 4) / 4, 175 | (1. / 2 + 2. / 4 + 3. 
/ 5) / 4 176 | ] 177 | for at, val in zip(thresholds, values): 178 | self.assertTrue(np.allclose(np.asarray(map(ranked_list_1, pos_items, at)), val)) 179 | 180 | 181 | class TestNDCG(unittest.TestCase): 182 | def runTest(self): 183 | pos_items = np.asarray([2, 4, 5, 10]) 184 | pos_relevances = np.asarray([5, 4, 3, 2]) 185 | ranked_list_1 = np.asarray([1, 2, 3, 4, 5]) # rel = 0, 5, 0, 4, 3 186 | ranked_list_2 = np.asarray([10, 5, 2, 4, 3]) # rel = 2, 3, 5, 4, 0 187 | ranked_list_3 = np.asarray([1, 3, 6, 7, 8]) # rel = 0, 0, 0, 0, 0 188 | idcg = ((2 ** 5 - 1) / np.log(2) + 189 | (2 ** 4 - 1) / np.log(3) + 190 | (2 ** 3 - 1) / np.log(4) + 191 | (2 ** 2 - 1) / np.log(5)) 192 | self.assertTrue(np.allclose(dcg(np.sort(pos_relevances)[::-1]), idcg)) 193 | self.assertTrue(np.allclose(ndcg(ranked_list_1, pos_items, pos_relevances), 194 | ((2 ** 5 - 1) / np.log(3) + 195 | (2 ** 4 - 1) / np.log(5) + 196 | (2 ** 3 - 1) / np.log(6)) / idcg)) 197 | self.assertTrue(np.allclose(ndcg(ranked_list_2, pos_items, pos_relevances), 198 | ((2 ** 2 - 1) / np.log(2) + 199 | (2 ** 3 - 1) / np.log(3) + 200 | (2 ** 5 - 1) / np.log(4) + 201 | (2 ** 4 - 1) / np.log(5)) / idcg)) 202 | self.assertTrue(np.allclose(ndcg(ranked_list_3, pos_items, pos_relevances), 0.0)) 203 | 204 | 205 | if __name__ == '__main__': 206 | unittest.main() 207 | -------------------------------------------------------------------------------- /KNN/item_knn_CBF.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on 23/10/17 5 | 6 | @author: Maurizio Ferrari Dacrema 7 | """ 8 | 9 | from Base.Recommender import Recommender 10 | from Base.Recommender_utils import check_matrix 11 | from Base.Similarity_Matrix_Recommender import Similarity_Matrix_Recommender 12 | 13 | try: 14 | from Base.Cython.cosine_similarity import Cosine_Similarity 15 | except ImportError: 16 | print("Unable to load Cython Cosine_Similarity, reverting to Python") 17 | from Base.cosine_similarity import Cosine_Similarity 18 | 19 | 20 | class ItemKNNCBFRecommender(Recommender, Similarity_Matrix_Recommender): 21 | """ ItemKNN recommender""" 22 | 23 | def __init__(self, ICM, URM_train, sparse_weights=True): 24 | super(ItemKNNCBFRecommender, self).__init__() 25 | 26 | self.ICM = ICM 27 | 28 | # CSR is faster during evaluation 29 | self.URM_train = check_matrix(URM_train, 'csr') 30 | 31 | self.sparse_weights = sparse_weights 32 | 33 | 34 | def fit(self, k=50, shrink=100, similarity='cosine', normalize=True): 35 | 36 | self.k = k 37 | self.shrink = shrink 38 | 39 | self.similarity = Cosine_Similarity(self.ICM.T, shrink=shrink, topK=k, normalize=normalize, mode = similarity) 40 | 41 | 42 | if self.sparse_weights: 43 | self.W_sparse = self.similarity.compute_similarity() 44 | else: 45 | self.W = self.similarity.compute_similarity() 46 | self.W = self.W.toarray() 47 | 48 | -------------------------------------------------------------------------------- /KNN/item_knn_CF.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on 23/10/17 5 | 6 | @author: Maurizio Ferrari Dacrema 7 | """ 8 | 9 | from Base.Recommender import Recommender 10 | from Base.Recommender_utils import check_matrix 11 | from Base.Similarity_Matrix_Recommender import Similarity_Matrix_Recommender 12 | 13 | try: 14 | from Base.Cython.cosine_similarity import Cosine_Similarity 15 | except ImportError: 16 | 
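# The compiled Cython extension is optional: when it is missing (compileCython.py
# was never run on this machine) the pure-Python Cosine_Similarity is imported
# instead. Both expose the same constructor and compute_similarity() interface,
# so fit() below works unchanged with either one; only the speed differs.
# Typical use, as in fit() further down (sketch):
#   similarity = Cosine_Similarity(URM_train, shrink=100, topK=50, normalize=True, mode='cosine')
#   W_sparse = similarity.compute_similarity()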
print("Unable to load Cython Cosine_Similarity, reverting to Python") 17 | from Base.cosine_similarity import Cosine_Similarity 18 | 19 | 20 | class ItemKNNCFRecommender(Recommender, Similarity_Matrix_Recommender): 21 | """ ItemKNN recommender""" 22 | 23 | def __init__(self, URM_train, sparse_weights=True): 24 | super(ItemKNNCFRecommender, self).__init__() 25 | 26 | # CSR is faster during evaluation 27 | self.URM_train = check_matrix(URM_train, 'csr') 28 | 29 | self.dataset = None 30 | 31 | self.sparse_weights = sparse_weights 32 | 33 | def fit(self, k=50, shrink=100, similarity='cosine', normalize=True): 34 | 35 | self.k = k 36 | self.shrink = shrink 37 | 38 | self.similarity = Cosine_Similarity(self.URM_train, shrink=shrink, topK=k, normalize=normalize, mode = similarity) 39 | 40 | if self.sparse_weights: 41 | self.W_sparse = self.similarity.compute_similarity() 42 | else: 43 | self.W = self.similarity.compute_similarity() 44 | self.W = self.W.toarray() 45 | -------------------------------------------------------------------------------- /KNN/item_knn_custom_Similarity.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on 23/10/17 5 | 6 | @author: Maurizio Ferrari Dacrema 7 | """ 8 | 9 | 10 | import numpy as np 11 | import scipy.sparse as sps 12 | from Base.Recommender_utils import check_matrix 13 | from Base.Similarity_Matrix_Recommender import Similarity_Matrix_Recommender 14 | from Base.Recommender import Recommender 15 | 16 | 17 | class ItemKNNCustomSimilarityRecommender(Recommender, Similarity_Matrix_Recommender): 18 | """ ItemKNN recommender""" 19 | 20 | def __init__(self, k=50, shrinkage=100, normalize=False, sparse_weights=True): 21 | super(ItemKNNCustomSimilarityRecommender, self).__init__() 22 | self.k = k 23 | self.shrinkage = shrinkage 24 | self.normalize = normalize 25 | self.dataset = None 26 | self.similarity_name = None 27 | self.sparse_weights = sparse_weights 28 | 29 | 30 | def __str__(self): 31 | return "ItemKNNCBF(similarity={},k={},shrinkage={},normalize={},sparse_weights={})".format( 32 | self.similarity_name, self.k, self.shrinkage, self.normalize, self.sparse_weights) 33 | 34 | def fit(self, item_weights, URM_train, selectTopK = False): 35 | 36 | self.URM_train = check_matrix(URM_train, format='csc') 37 | 38 | # If no topK selection is required, just save the similarity 39 | if (not selectTopK): 40 | if isinstance(item_weights, np.ndarray): 41 | #self.W = item_weights 42 | #self.sparse_weights = False 43 | self.W_sparse = sps.csr_matrix(item_weights) 44 | self.sparse_weights = True 45 | else: 46 | self.W_sparse = check_matrix(item_weights, format='csr') 47 | self.sparse_weights = True 48 | 49 | return 50 | 51 | 52 | # If matrix is not dense, make it dense to select top K 53 | if not isinstance(item_weights, np.ndarray): 54 | item_weights = item_weights.toarray() 55 | 56 | 57 | idx_sorted = np.argsort(item_weights, axis=0) # sort by column 58 | 59 | # for each column, keep only the top-k scored items 60 | 61 | if not self.sparse_weights: 62 | self.W = item_weights.copy() 63 | # index of the items that don't belong to the top-k similar items of each column 64 | not_top_k = idx_sorted[:-self.k, :] 65 | # use numpy fancy indexing to zero-out the values in sim without using a for loop 66 | self.W[not_top_k, np.arange(item_weights.shape[1])] = 0.0 67 | else: 68 | # iterate over each column and keep only the top-k similar items 69 | values, rows, cols = [], [], [] 
70 | nitems = self.URM_train.shape[1] 71 | for i in range(nitems): 72 | 73 | top_k_idx = idx_sorted[-self.k:, i] 74 | 75 | values.extend(item_weights[top_k_idx, i]) 76 | rows.extend(np.arange(nitems)[top_k_idx]) 77 | cols.extend(np.ones(self.k) * i) 78 | 79 | # During testing CSR is faster 80 | self.W_sparse = sps.csr_matrix((values, (rows, cols)), shape=(nitems, nitems), dtype=np.float32) 81 | 82 | #self.scoresAll = URM_train.dot(self.W_sparse) 83 | -------------------------------------------------------------------------------- /KNN/user_knn_CF.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on 23/10/17 5 | 6 | @author: Maurizio Ferrari Dacrema 7 | """ 8 | 9 | import numpy as np 10 | 11 | from Base.Recommender import Recommender 12 | from Base.Recommender_utils import check_matrix 13 | from Base.Similarity_Matrix_Recommender import Similarity_Matrix_Recommender 14 | 15 | try: 16 | from Base.Cython.cosine_similarity import Cosine_Similarity 17 | except ImportError: 18 | print("Unable to load Cython Cosine_Similarity, reverting to Python") 19 | from Base.cosine_similarity import Cosine_Similarity 20 | 21 | 22 | class UserKNNCFRecommender(Recommender, Similarity_Matrix_Recommender): 23 | """ UserKNN recommender""" 24 | 25 | def __init__(self, URM_train, sparse_weights=True): 26 | super(UserKNNCFRecommender, self).__init__() 27 | 28 | # Not sure if CSR here is faster 29 | self.URM_train = check_matrix(URM_train, 'csr') 30 | 31 | self.dataset = None 32 | 33 | self.sparse_weights = sparse_weights 34 | 35 | def fit(self, k=50, shrink=100, similarity='cosine', normalize=True): 36 | 37 | self.k = k 38 | self.shrink = shrink 39 | 40 | self.similarity = Cosine_Similarity(self.URM_train.T, shrink=shrink, topK=k, normalize=normalize, mode = similarity) 41 | 42 | if self.sparse_weights: 43 | self.W_sparse = self.similarity.compute_similarity() 44 | else: 45 | self.W = self.similarity.compute_similarity() 46 | self.W = self.W.toarray() 47 | 48 | 49 | 50 | 51 | 52 | def recommend(self, user_id, n=None, exclude_seen=True, filterTopPop = False, filterCustomItems = False): 53 | 54 | if n==None: 55 | n=self.URM_train.shape[1]-1 56 | 57 | # compute the scores using the dot product 58 | if self.sparse_weights: 59 | 60 | scores = self.W_sparse[user_id].dot(self.URM_train).toarray().ravel() 61 | 62 | else: 63 | # Numpy dot does not recognize sparse matrices, so we must 64 | # invoke the dot function on the sparse one 65 | scores = self.URM_train.T.dot(self.W[user_id]) 66 | 67 | if self.normalize: 68 | # normalization will keep the scores in the same range 69 | # of value of the ratings in dataset 70 | user_profile = self.URM_train[user_id] 71 | 72 | rated = user_profile.copy() 73 | rated.data = np.ones_like(rated.data) 74 | if self.sparse_weights: 75 | den = rated.dot(self.W_sparse).toarray().ravel() 76 | else: 77 | den = rated.dot(self.W).ravel() 78 | den[np.abs(den) < 1e-6] = 1.0 # to avoid NaNs 79 | scores /= den 80 | 81 | if exclude_seen: 82 | scores = self._filter_seen_on_scores(user_id, scores) 83 | 84 | if filterTopPop: 85 | scores = self._filter_TopPop_on_scores(scores) 86 | 87 | if filterCustomItems: 88 | scores = self._filterCustomItems_on_scores(scores) 89 | 90 | 91 | # rank items and mirror column to obtain a ranking in descending score 92 | #ranking = scores.argsort() 93 | #ranking = np.flip(ranking, axis=0) 94 | 95 | # Sorting is done in three steps. 
Faster then plain np.argsort for higher number of items 96 | # - Partition the data to extract the set of relevant items 97 | # - Sort only the relevant items 98 | # - Get the original item index 99 | relevant_items_partition = (-scores).argpartition(n)[0:n] 100 | relevant_items_partition_sorting = np.argsort(-scores[relevant_items_partition]) 101 | ranking = relevant_items_partition[relevant_items_partition_sorting] 102 | 103 | 104 | return ranking 105 | 106 | 107 | 108 | 109 | def recommendBatch(self, users_in_batch, n=None, exclude_seen=True, filterTopPop = False, filterCustomItems = False): 110 | 111 | # compute the scores using the dot product 112 | 113 | if self.sparse_weights: 114 | 115 | scores_array = self.W_sparse[users_in_batch].dot(self.URM_train) 116 | scores_array = scores_array.toarray() 117 | 118 | else: 119 | # Numpy dot does not recognize sparse matrices, so we must 120 | # invoke the dot function on the sparse one 121 | scores_array = self.URM_train.T.dot(self.W[users_in_batch].T) 122 | 123 | if self.normalize: 124 | raise ValueError("Not implemented") 125 | 126 | # To exclude seen items perform a boolean indexing and replace their score with -inf 127 | # Seen items will be at the bottom of the list but there is no guarantee they'll NOT be 128 | # recommended 129 | if exclude_seen: 130 | user_profile_batch = self.URM_train[users_in_batch] 131 | scores_array[user_profile_batch.nonzero()] = -np.inf 132 | 133 | if filterTopPop: 134 | scores_array[:,self.filterTopPop_ItemsID] = -np.inf 135 | 136 | if filterCustomItems: 137 | scores_array[:, self.filterCustomItems_ItemsID] = -np.inf 138 | 139 | 140 | # rank items and mirror column to obtain a ranking in descending score 141 | #ranking = (-scores_array).argsort(axis=1) 142 | #ranking = np.fliplr(ranking) 143 | #ranking = ranking[:,0:n] 144 | 145 | ranking = np.zeros((scores_array.shape[0],n), dtype=np.int) 146 | 147 | for row_index in range(scores_array.shape[0]): 148 | scores = scores_array[row_index] 149 | 150 | relevant_items_partition = (-scores).argpartition(n)[0:n] 151 | relevant_items_partition_sorting = np.argsort(-scores[relevant_items_partition]) 152 | ranking[row_index] = relevant_items_partition[relevant_items_partition_sorting] 153 | 154 | 155 | return ranking 156 | 157 | -------------------------------------------------------------------------------- /MatrixFactorization/Cython/MF_BPR_Cython.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on 07/09/17 5 | 6 | @author: Maurizio Ferrari Dacrema 7 | """ 8 | 9 | from Base.Recommender_utils import similarityMatrixTopK 10 | from Base.Recommender import Recommender 11 | import subprocess 12 | import os, sys 13 | import time 14 | import numpy as np 15 | 16 | 17 | class MF_BPR_Cython(Recommender): 18 | 19 | 20 | def __init__(self, URM_train, recompile_cython = False): 21 | 22 | 23 | super(MF_BPR_Cython, self).__init__() 24 | 25 | 26 | self.URM_train = URM_train 27 | self.n_users = URM_train.shape[0] 28 | self.n_items = URM_train.shape[1] 29 | self.normalize = False 30 | 31 | if recompile_cython: 32 | print("Compiling in Cython") 33 | self.runCompilationScript() 34 | print("Compilation Complete") 35 | 36 | 37 | 38 | def fit(self, epochs=30, logFile=None, URM_test=None, filterTopPop = False, filterCustomItems = np.array([], dtype=np.int), minRatingsPerUser=1, 39 | batch_size = 1000, validate_every_N_epochs = 1, start_validation_after_N_epochs = 0, num_factors=10, 
positive_threshold=4, 40 | learning_rate = 0.05, sgd_mode='sgd', user_reg = 0.0, positive_reg = 0.0, negative_reg = 0.0): 41 | 42 | 43 | self.num_factors = num_factors 44 | self.positive_threshold = positive_threshold 45 | 46 | # Select only positive interactions 47 | URM_train_positive = self.URM_train.copy() 48 | 49 | URM_train_positive.data = URM_train_positive.data >= self.positive_threshold 50 | URM_train_positive.eliminate_zeros() 51 | 52 | self.sgd_mode = sgd_mode 53 | 54 | 55 | # Import compiled module 56 | from MatrixFactorization.Cython.MF_BPR_Cython_Epoch import MF_BPR_Cython_Epoch 57 | 58 | 59 | self.cythonEpoch = MF_BPR_Cython_Epoch(URM_train_positive, 60 | n_factors = self.num_factors, 61 | learning_rate=learning_rate, 62 | batch_size=1, 63 | sgd_mode = sgd_mode, 64 | user_reg=user_reg, 65 | positive_reg=positive_reg, 66 | negative_reg=negative_reg) 67 | 68 | 69 | self.batch_size = batch_size 70 | self.learning_rate = learning_rate 71 | 72 | 73 | start_time_train = time.time() 74 | 75 | for currentEpoch in range(epochs): 76 | 77 | start_time_epoch = time.time() 78 | 79 | if self.batch_size>0: 80 | self.epochIteration() 81 | else: 82 | print("No batch not available") 83 | 84 | 85 | if (URM_test is not None) and (currentEpoch % validate_every_N_epochs == 0) and \ 86 | currentEpoch >= start_validation_after_N_epochs: 87 | 88 | print("Evaluation begins") 89 | 90 | self.W = self.cythonEpoch.get_W() 91 | self.H = self.cythonEpoch.get_H() 92 | 93 | results_run = self.evaluateRecommendations(URM_test, filterTopPop=filterTopPop, 94 | minRatingsPerUser=minRatingsPerUser, filterCustomItems=filterCustomItems) 95 | 96 | self.writeCurrentConfig(currentEpoch, results_run, logFile) 97 | 98 | print("Epoch {} of {} complete in {:.2f} minutes".format(currentEpoch+1, epochs, 99 | float(time.time() - start_time_epoch) / 60)) 100 | 101 | 102 | # Fit with no validation 103 | else: 104 | print("Epoch {} of {} complete in {:.2f} minutes".format(currentEpoch+1, epochs, 105 | float(time.time() - start_time_epoch) / 60)) 106 | 107 | # Ensure W and H are up to date 108 | self.W = self.cythonEpoch.get_W() 109 | self.H = self.cythonEpoch.get_H() 110 | 111 | print("Fit completed in {:.2f} minutes".format(float(time.time() - start_time_train) / 60)) 112 | 113 | sys.stdout.flush() 114 | 115 | 116 | 117 | 118 | def runCompilationScript(self): 119 | 120 | # Run compile script setting the working directory to ensure the compiled file are contained in the 121 | # appropriate subfolder and not the project root 122 | 123 | compiledModuleSubfolder = "/MatrixFactorization/Cython" 124 | fileToCompile_list = ['MF_BPR_Cython_Epoch.pyx'] 125 | 126 | for fileToCompile in fileToCompile_list: 127 | 128 | command = ['python', 129 | 'compileCython.py', 130 | fileToCompile, 131 | 'build_ext', 132 | '--inplace' 133 | ] 134 | 135 | 136 | output = subprocess.check_output(' '.join(command), shell=True, cwd=os.getcwd() + compiledModuleSubfolder) 137 | 138 | try: 139 | 140 | command = ['cython', 141 | fileToCompile, 142 | '-a' 143 | ] 144 | 145 | output = subprocess.check_output(' '.join(command), shell=True, cwd=os.getcwd() + compiledModuleSubfolder) 146 | 147 | except: 148 | pass 149 | 150 | 151 | print("Compiled module saved in subfolder: {}".format(compiledModuleSubfolder)) 152 | 153 | # Command to run compilation script 154 | #python compileCython.py MF_BPR_Cython_Epoch.pyx build_ext --inplace 155 | 156 | # Command to generate html report 157 | #subprocess.call(["cython", "-a", "MF_BPR_Cython_Epoch.pyx"]) 158 | 159 | 160 | def 
epochIteration(self): 161 | 162 | self.cythonEpoch.epochIteration_Cython() 163 | 164 | 165 | 166 | 167 | def writeCurrentConfig(self, currentEpoch, results_run, logFile): 168 | 169 | current_config = {'learn_rate': self.learning_rate, 170 | 'num_factors': self.num_factors, 171 | 'batch_size': 1, 172 | 'epoch': currentEpoch} 173 | 174 | print("Test case: {}\nResults {}\n".format(current_config, results_run)) 175 | 176 | sys.stdout.flush() 177 | 178 | if (logFile != None): 179 | logFile.write("Test case: {}, Results {}\n".format(current_config, results_run)) 180 | logFile.flush() 181 | 182 | 183 | 184 | 185 | def recommendBatch(self, users_in_batch, n=None, exclude_seen=True, filterTopPop = False, filterCustomItems = False): 186 | 187 | # compute the scores using the dot product 188 | user_profile_batch = self.URM_train[users_in_batch] 189 | 190 | scores_array = np.dot(self.W[users_in_batch], self.H.T) 191 | 192 | if self.normalize: 193 | raise ValueError("Not implemented") 194 | 195 | # To exclude seen items perform a boolean indexing and replace their score with -inf 196 | # Seen items will be at the bottom of the list but there is no guarantee they'll NOT be 197 | # recommended 198 | if exclude_seen: 199 | scores_array[user_profile_batch.nonzero()] = -np.inf 200 | 201 | if filterTopPop: 202 | scores_array[:,self.filterTopPop_ItemsID] = -np.inf 203 | 204 | if filterCustomItems: 205 | scores_array[:, self.filterCustomItems_ItemsID] = -np.inf 206 | 207 | 208 | # rank items and mirror column to obtain a ranking in descending score 209 | #ranking = (-scores_array).argsort(axis=1) 210 | #ranking = np.fliplr(ranking) 211 | #ranking = ranking[:,0:n] 212 | 213 | ranking = np.zeros((scores_array.shape[0],n), dtype=np.int) 214 | 215 | for row_index in range(scores_array.shape[0]): 216 | scores = scores_array[row_index] 217 | 218 | relevant_items_partition = (-scores).argpartition(n)[0:n] 219 | relevant_items_partition_sorting = np.argsort(-scores[relevant_items_partition]) 220 | ranking[row_index] = relevant_items_partition[relevant_items_partition_sorting] 221 | 222 | 223 | return ranking 224 | 225 | 226 | 227 | def recommend(self, user_id, n=None, exclude_seen=True, filterTopPop = False, filterCustomItems = False): 228 | 229 | 230 | if n==None: 231 | n=self.URM_train.shape[1]-1 232 | 233 | scores_array = np.dot(self.W[user_id], self.H.T) 234 | 235 | if self.normalize: 236 | raise ValueError("Not implemented") 237 | 238 | 239 | if exclude_seen: 240 | scores = self._filter_seen_on_scores(user_id, scores_array) 241 | 242 | if filterTopPop: 243 | scores = self._filter_TopPop_on_scores(scores_array) 244 | 245 | if filterCustomItems: 246 | scores = self._filterCustomItems_on_scores(scores_array) 247 | 248 | 249 | # rank items and mirror column to obtain a ranking in descending score 250 | #ranking = scores.argsort() 251 | #ranking = np.flip(ranking, axis=0) 252 | 253 | # Sorting is done in three steps. 
Faster then plain np.argsort for higher number of items 254 | # - Partition the data to extract the set of relevant items 255 | # - Sort only the relevant items 256 | # - Get the original item index 257 | relevant_items_partition = (-scores_array).argpartition(n)[0:n] 258 | relevant_items_partition_sorting = np.argsort(-scores_array[relevant_items_partition]) 259 | ranking = relevant_items_partition[relevant_items_partition_sorting] 260 | 261 | 262 | return ranking 263 | -------------------------------------------------------------------------------- /MatrixFactorization/Cython/MF_BPR_Cython_Epoch.cpython-36m-x86_64-linux-gnu.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MaurizioFD/RecSys_Course_2017/dfef3655df8ce32428458aafabfc94ed688231f6/MatrixFactorization/Cython/MF_BPR_Cython_Epoch.cpython-36m-x86_64-linux-gnu.so -------------------------------------------------------------------------------- /MatrixFactorization/Cython/MF_BPR_Cython_Epoch.pyx: -------------------------------------------------------------------------------- 1 | """ 2 | Created on 07/09/17 3 | 4 | @author: Maurizio Ferrari Dacrema 5 | """ 6 | 7 | #cython: boundscheck=False 8 | #cython: wraparound=False 9 | #cython: initializedcheck=False 10 | #cython: language_level=3 11 | #cython: nonecheck=False 12 | #cython: cdivision=True 13 | #cython: unpack_method_calls=True 14 | #cython: overflowcheck=False 15 | 16 | #defining NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION 17 | 18 | from Base.Recommender_utils import check_matrix 19 | import numpy as np 20 | cimport numpy as np 21 | import time 22 | import sys 23 | 24 | from libc.math cimport exp, sqrt 25 | from libc.stdlib cimport rand, RAND_MAX 26 | 27 | 28 | cdef struct BPR_sample: 29 | long user 30 | long pos_item 31 | long neg_item 32 | 33 | 34 | cdef class MF_BPR_Cython_Epoch: 35 | 36 | cdef int n_users, n_items, n_factors 37 | cdef int numPositiveIteractions 38 | 39 | cdef int useAdaGrad, rmsprop 40 | 41 | cdef float learning_rate, user_reg, positive_reg, negative_reg 42 | 43 | cdef int batch_size 44 | 45 | cdef int[:] URM_mask_indices, URM_mask_indptr 46 | 47 | cdef double[:,:] W, H 48 | 49 | 50 | def __init__(self, URM_mask, n_factors = 10, 51 | learning_rate = 0.01, user_reg = 0.0, positive_reg = 0.0, negative_reg = 0.0, 52 | batch_size = 1, sgd_mode='sgd'): 53 | 54 | super(MF_BPR_Cython_Epoch, self).__init__() 55 | 56 | 57 | URM_mask = check_matrix(URM_mask, 'csr') 58 | 59 | self.numPositiveIteractions = int(URM_mask.nnz * 1) 60 | self.n_users = URM_mask.shape[0] 61 | self.n_items = URM_mask.shape[1] 62 | self.n_factors = n_factors 63 | 64 | self.URM_mask_indices = URM_mask.indices 65 | self.URM_mask_indptr = URM_mask.indptr 66 | 67 | # W and H cannot be initialized as zero, otherwise the gradient will always be zero 68 | self.W = np.random.random((self.n_users, self.n_factors)) 69 | self.H = np.random.random((self.n_items, self.n_factors)) 70 | 71 | 72 | 73 | if sgd_mode=='adagrad': 74 | self.useAdaGrad = True 75 | elif sgd_mode=='rmsprop': 76 | self.rmsprop = True 77 | elif sgd_mode=='sgd': 78 | pass 79 | else: 80 | raise ValueError( 81 | "SGD_mode not valid. Acceptable values are: 'sgd', 'adagrad', 'rmsprop'. 
Provided value was '{}'".format( 82 | sgd_mode)) 83 | 84 | 85 | 86 | self.learning_rate = learning_rate 87 | self.user_reg = user_reg 88 | self.positive_reg = positive_reg 89 | self.negative_reg = negative_reg 90 | 91 | 92 | if batch_size!=1: 93 | print("MiniBatch not implemented, reverting to default value 1") 94 | self.batch_size = 1 95 | 96 | 97 | # Using memoryview instead of the sparse matrix itself allows for much faster access 98 | cdef int[:] getSeenItems(self, long index): 99 | return self.URM_mask_indices[self.URM_mask_indptr[index]:self.URM_mask_indptr[index + 1]] 100 | 101 | 102 | 103 | def epochIteration_Cython(self): 104 | 105 | # Get number of available interactions 106 | cdef long totalNumberOfBatch = int(self.numPositiveIteractions / self.batch_size) + 1 107 | 108 | 109 | cdef BPR_sample sample 110 | cdef long u, i, j 111 | cdef long index, numCurrentBatch 112 | cdef double x_uij, sigmoid_user, sigmoid_item 113 | 114 | cdef int numSeenItems 115 | 116 | # Variables for AdaGrad and RMSprop 117 | cdef double [:] sgd_cache_item_factors, sgd_cache_user_factors 118 | cdef double cacheUpdate 119 | cdef float gamma 120 | 121 | cdef double H_i, H_j, W_u 122 | 123 | 124 | if self.useAdaGrad: 125 | sgd_cache_item_factors = np.zeros((self.n_items), dtype=float) 126 | sgd_cache_user_factors = np.zeros((self.n_users), dtype=float) 127 | 128 | # elif self.rmsprop: 129 | # sgd_cache = np.zeros((self.n_items), dtype=float) 130 | # gamma = 0.001 131 | 132 | 133 | cdef long start_time_epoch = time.time() 134 | cdef long start_time_batch = time.time() 135 | 136 | for numCurrentBatch in range(totalNumberOfBatch): 137 | 138 | # Uniform user sampling with replacement 139 | sample = self.sampleBPR_Cython() 140 | 141 | u = sample.user 142 | i = sample.pos_item 143 | j = sample.neg_item 144 | 145 | x_uij = 0.0 146 | 147 | for index in range(self.n_factors): 148 | 149 | x_uij = self.W[u,index] * (self.H[i,index] - self.H[j,index]) 150 | 151 | # Use gradient of log(sigm(-x_uij)) 152 | sigmoid_item = 1 / (1 + exp(x_uij)) 153 | sigmoid_user = sigmoid_item 154 | 155 | 156 | 157 | 158 | if self.useAdaGrad: 159 | cacheUpdate = sigmoid_item ** 2 160 | 161 | sgd_cache_item_factors[i] += cacheUpdate 162 | sgd_cache_item_factors[j] += cacheUpdate 163 | sgd_cache_user_factors[u] += cacheUpdate 164 | 165 | sigmoid_item = sigmoid_item / (sqrt(sgd_cache_item_factors[i]) + 1e-8) 166 | sigmoid_user = sigmoid_user / (sqrt(sgd_cache_user_factors[u]) + 1e-8) 167 | 168 | # INCOMPATIBLE CODE 169 | # elif self.rmsprop: 170 | # cacheUpdate = sgd_cache[i] * gamma + (1 - gamma) * gradient ** 2 171 | # 172 | # sgd_cache[i] = cacheUpdate 173 | # sgd_cache[j] = cacheUpdate 174 | # 175 | # gradient = gradient / (sqrt(sgd_cache[i]) + 1e-8) 176 | 177 | 178 | for index in range(self.n_factors): 179 | 180 | # Copy original value to avoid messing up the updates 181 | H_i = self.H[i, index] 182 | H_j = self.H[j, index] 183 | W_u = self.W[u, index] 184 | 185 | self.W[u, index] += self.learning_rate * (sigmoid_user * ( H_i - H_j ) - self.user_reg * W_u) 186 | self.H[i, index] += self.learning_rate * (sigmoid_item * ( W_u ) - self.positive_reg * H_i) 187 | self.H[j, index] += self.learning_rate * (sigmoid_item * (-W_u ) - self.negative_reg * H_j) 188 | 189 | 190 | 191 | if((numCurrentBatch%5000000==0 and not numCurrentBatch==0) or numCurrentBatch==totalNumberOfBatch-1): 192 | print("Processed {} ( {:.2f}% ) in {:.2f} seconds. 
Sample per second: {:.0f}".format( 193 | numCurrentBatch*self.batch_size, 194 | 100.0* float(numCurrentBatch*self.batch_size)/self.numPositiveIteractions, 195 | time.time() - start_time_batch, 196 | float(numCurrentBatch*self.batch_size + 1) / (time.time() - start_time_epoch))) 197 | 198 | sys.stdout.flush() 199 | sys.stderr.flush() 200 | 201 | start_time_batch = time.time() 202 | 203 | 204 | def get_W(self): 205 | return np.array(self.W) 206 | 207 | 208 | def get_H(self): 209 | return np.array(self.H) 210 | 211 | 212 | 213 | cdef BPR_sample sampleBPR_Cython(self): 214 | 215 | cdef BPR_sample sample = BPR_sample(-1,-1,-1) 216 | cdef long index, start_pos_seen_items, end_pos_seen_items 217 | 218 | cdef int negItemSelected, numSeenItems = 0 219 | 220 | 221 | # Skip users with no interactions or with no negative items 222 | while numSeenItems == 0 or numSeenItems == self.n_items: 223 | 224 | sample.user = rand() % self.n_users 225 | 226 | start_pos_seen_items = self.URM_mask_indptr[sample.user] 227 | end_pos_seen_items = self.URM_mask_indptr[sample.user+1] 228 | 229 | numSeenItems = end_pos_seen_items - start_pos_seen_items 230 | 231 | 232 | index = rand() % numSeenItems 233 | 234 | sample.pos_item = self.URM_mask_indices[start_pos_seen_items + index] 235 | 236 | 237 | 238 | negItemSelected = False 239 | 240 | # It's faster to just try again then to build a mapping of the non-seen items 241 | # for every user 242 | while (not negItemSelected): 243 | 244 | sample.neg_item = rand() % self.n_items 245 | 246 | index = 0 247 | while index < numSeenItems and self.URM_mask_indices[start_pos_seen_items + index]!=sample.neg_item: 248 | index+=1 249 | 250 | if index == numSeenItems: 251 | negItemSelected = True 252 | 253 | 254 | return sample 255 | -------------------------------------------------------------------------------- /MatrixFactorization/Cython/MF_RMSE.cpython-36m-x86_64-linux-gnu.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MaurizioFD/RecSys_Course_2017/dfef3655df8ce32428458aafabfc94ed688231f6/MatrixFactorization/Cython/MF_RMSE.cpython-36m-x86_64-linux-gnu.so -------------------------------------------------------------------------------- /MatrixFactorization/Cython/MF_RMSE.pyx: -------------------------------------------------------------------------------- 1 | """ 2 | Created on 23/10/17 3 | 4 | @author: Massimo Quadrana 5 | """ 6 | 7 | #cython: boundscheck=False 8 | #cython: wraparound=False 9 | #cython: initializedcheck=False 10 | #cython: language_level=3 11 | #cython: nonecheck=False 12 | #cython: cdivision=True 13 | #cython: unpack_method_calls=True 14 | #cython: overflowcheck=False 15 | 16 | 17 | cimport cython 18 | cimport numpy as np 19 | import numpy as np 20 | import scipy.sparse as sps 21 | 22 | import time 23 | import sys 24 | 25 | 26 | @cython.boundscheck(False) 27 | def FunkSVD_sgd(R, int num_factors=50, double lrate=0.01, double reg=0.015, int n_iterations=10, init_mean=0.0, init_std=0.1, double lrate_decay=1.0, rnd_seed=42): 28 | if not isinstance(R, sps.csr_matrix): 29 | raise ValueError('R must be an instance of scipy.sparse.csr_matrix') 30 | 31 | # use Cython MemoryViews for fast access to the sparse structure of R 32 | cdef int [:] col_indices = R.indices, indptr = R.indptr 33 | cdef double [:] data = np.array(R.data, dtype=np.float) 34 | cdef int n_users = R.shape[0], n_items = R.shape[1] 35 | cdef int nnz = len(R.data) 36 | 37 | 38 | # in csr format, indices correspond to column indices 39 | # 
let's build the vector of row_indices 40 | cdef np.ndarray[np.int64_t, ndim=1] row_nnz = np.diff(indptr).astype(np.int64) 41 | cdef np.ndarray[np.int64_t, ndim=1] row_indices = np.repeat(np.arange(n_users), row_nnz).astype(np.int64) 42 | 43 | # set the seed of the random number generator 44 | np.random.seed(rnd_seed) 45 | 46 | # randomly initialize the user and item latent factors 47 | cdef double[:,:] U = np.random.normal(init_mean, init_std, (n_users, num_factors)).astype(np.float) 48 | cdef double[:,:] V = np.random.normal(init_mean, init_std, (n_items, num_factors)).astype(np.float) 49 | 50 | # build random index to iterate over the non-zero elements in R 51 | cdef np.ndarray[np.int64_t, ndim=1] shuffled_idx = np.random.permutation(nnz).astype(np.int64) 52 | 53 | # here we define some auxiliary variables 54 | cdef int i, j, f, idx, currentIteration, numSample 55 | cdef double rij, rij_pred, err, loss 56 | cdef double[:] U_i = np.zeros(num_factors, dtype=np.float) 57 | cdef double[:] V_j = np.zeros(num_factors, dtype=np.float) 58 | 59 | start_time_epoch = time.time() 60 | start_time_batch = time.time() 61 | 62 | # 63 | # Stochastic Gradient Descent starts here 64 | # 65 | for currentIteration in range(n_iterations): # for each iteration 66 | loss = 0.0 67 | 68 | for numSample in range(nnz): # iterate over non-zero values in R only 69 | idx = shuffled_idx[numSample] 70 | rij = data[idx] 71 | 72 | # get the row and col indices of x_ij 73 | i = row_indices[idx] 74 | j = col_indices[idx] 75 | 76 | rij_pred = 0 77 | 78 | # compute the predicted value of R 79 | for f in range(num_factors): 80 | U_i[f] = U[i,f] 81 | V_j[f] = V[j,f] 82 | rij_pred += U[i,f]*V[j,f] 83 | 84 | # compute the prediction error 85 | err = rij - rij_pred 86 | 87 | # update the loss 88 | loss += err**2 89 | 90 | # adjust the latent factors 91 | for f in range(num_factors): 92 | U[i, f] += lrate * (err * V_j[f] - reg * U_i[f]) 93 | V[j, f] += lrate * (err * U_i[f] - reg * V_j[f]) 94 | 95 | loss /= nnz 96 | 97 | # update the learning rate 98 | lrate *= lrate_decay 99 | 100 | print("Iteration {} of {} completed in {:.2f} minutes. Loss is {:.4f}. 
Sample per second: {:.0f}".format( 101 | currentIteration, n_iterations, 102 | (time.time() - start_time_batch)/60, 103 | loss, 104 | float(nnz) / (time.time() - start_time_batch))) 105 | 106 | sys.stdout.flush() 107 | sys.stderr.flush() 108 | 109 | start_time_batch = time.time() 110 | 111 | 112 | 113 | return U, V 114 | 115 | 116 | @cython.boundscheck(False) 117 | def AsySVD_sgd(R, num_factors=50, lrate=0.01, reg=0.015, iters=10, init_mean=0.0, init_std=0.1, lrate_decay=1.0, rnd_seed=42): 118 | if not isinstance(R, sps.csr_matrix): 119 | raise ValueError('R must be an instance of scipy.sparse.csr_matrix') 120 | 121 | # use Cython MemoryViews for fast access to the sparse structure of R 122 | cdef int [:] col_indices = R.indices, indptr = R.indptr 123 | cdef float [:] data = R.data 124 | cdef int M = R.shape[0], N = R.shape[1] 125 | cdef int nnz = len(R.data) 126 | 127 | # in csr format, indices correspond to column indices 128 | # let's build the vector of row_indices 129 | cdef np.ndarray[np.int64_t, ndim=1] row_nnz = np.diff(indptr).astype(np.int64) 130 | cdef np.ndarray[np.int64_t, ndim=1] row_indices = np.repeat(np.arange(M), row_nnz).astype(np.int64) 131 | 132 | # set the seed of the random number generator 133 | np.random.seed(rnd_seed) 134 | 135 | # randomly initialize the item latent factors 136 | cdef np.ndarray[np.float32_t, ndim=2] X = np.random.normal(init_mean, init_std, (N, num_factors)).astype(np.float32) 137 | cdef np.ndarray[np.float32_t, ndim=2] Y = np.random.normal(init_mean, init_std, (N, num_factors)).astype(np.float32) 138 | 139 | # build random index to iterate over the non-zero elements in R 140 | cdef np.ndarray[np.int64_t, ndim=1] shuffled_idx = np.random.permutation(nnz).astype(np.int64) 141 | 142 | # here we define some auxiliary variables 143 | cdef int i, j, it, n, idx, n_rated, start, end 144 | cdef float rij, rij_pred, err, loss 145 | cdef np.ndarray[np.float32_t, ndim=1] X_j = np.zeros(num_factors, dtype=np.float32) 146 | cdef np.ndarray[np.float32_t, ndim=1] Y_acc = np.zeros(num_factors, dtype=np.float32) 147 | cdef np.ndarray[np.float32_t, ndim=2] Y_copy = np.zeros_like(Y, dtype=np.float32) 148 | 149 | # 150 | # Stochastic Gradient Descent starts here 151 | # 152 | for it in range(iters): # for each iteration 153 | loss = 0.0 154 | for n in range(nnz): # iterate over non-zero values in R only 155 | idx = shuffled_idx[n] 156 | rij = data[idx] 157 | # get the row and col indices of x_ij 158 | i = row_indices[idx] 159 | j = col_indices[idx] 160 | # get the latent factor of item j 161 | X_j = X[j].copy() 162 | # accumulate the item latent factors over the other items rated by i 163 | Y_acc = np.zeros(num_factors, dtype=np.float32) 164 | n_rated = 0 165 | start, end = indptr[i], indptr[i+1] 166 | for l in col_indices[start:end]: 167 | x_il = data[start + n_rated] 168 | Y_acc += x_il * Y[l] 169 | n_rated += 1 170 | if n_rated > 0: 171 | Y_acc /= np.sqrt(n_rated) 172 | # compute the predicted rating 173 | rij_pred = np.dot(X_j, Y_acc) 174 | # compute the prediction error 175 | err = rij - rij_pred 176 | # update the loss 177 | loss += err**2 178 | # adjust the latent factors 179 | X[j] += lrate * (err * Y_acc - reg * X_j) 180 | # copy the current item preference factors 181 | Y_copy = Y.copy() 182 | for l in col_indices[indptr[i]:indptr[i+1]]: 183 | Y_l = Y_copy[l] 184 | Y[l] += lrate * (err * X_j - reg * Y_l) 185 | 186 | loss /= nnz 187 | print('Iter {} - loss: {:.4f}'.format(it+1, loss)) 188 | # update the learning rate 189 | lrate *= lrate_decay 190 | 191 | 
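# Unlike FunkSVD above, AsySVD returns two item-sized factor matrices: X acts as the
# prediction-side item factors, while Y holds the preference factors that are
# accumulated over the items rated by a user, Y_acc = (1 / sqrt(n_rated)) * sum_l r_il * Y[l],
# so that the rating is predicted as dot(X[j], Y_acc). AsySVD_compute_user_factors
# below rebuilds the same Y_acc for a single user profile.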
return X, Y 192 | 193 | @cython.boundscheck(False) 194 | def AsySVD_compute_user_factors(user_profile, Y): 195 | if not isinstance(user_profile, sps.csr_matrix): 196 | raise ValueError('user_profile must be an instance of scipy.sparse.csr_matrix') 197 | assert user_profile.shape[0] == 1, 'user_profile must be a 1-dimensional vector' 198 | 199 | # use Cython MemoryViews for fast access to the sparse structure of user_profile 200 | cdef int [:] col_indices = user_profile.indices 201 | cdef float [:] data = user_profile.data 202 | 203 | # intialize the accumulated user profile 204 | cdef int num_factors = Y.shape[1] 205 | cdef np.ndarray[np.float32_t, ndim=1] Y_acc = np.zeros(num_factors, dtype=np.float32) 206 | cdef int n_rated = len(col_indices) 207 | # aux variables 208 | cdef int n 209 | # accumulate the item vectors for the items rated by the user 210 | for n in range(n_rated): 211 | ril = data[n] 212 | Y_acc += ril * Y[col_indices[n]] 213 | if n_rated > 0: 214 | Y_acc /= np.sqrt(n_rated) 215 | return Y_acc 216 | 217 | 218 | from libc.math cimport exp, log 219 | 220 | @cython.boundscheck(False) 221 | def BPRMF_sgd(R, num_factors=50, lrate=0.01, user_reg=0.015, pos_reg=0.015, neg_reg=0.0015, iters=10, 222 | sampling_type='user_uniform_item_uniform',sample_with_replacement=True, use_resampling=False, sampling_pop_alpha=1.0, 223 | init_mean=0.0, init_std=0.1, lrate_decay=1.0, rnd_seed=42,verbose=False): 224 | if not isinstance(R, sps.csr_matrix): 225 | raise ValueError('R must be an instance of scipy.sparse.csr_matrix') 226 | 227 | # use Cython MemoryViews for fast access to the sparse structure of R 228 | cdef int [:] col_indices = R.indices, indptr = R.indptr 229 | cdef float [:] data = R.data 230 | cdef int M = R.shape[0], N = R.shape[1] 231 | cdef int nnz = len(R.data) 232 | 233 | # set the seed of the random number generator 234 | np.random.seed(rnd_seed) 235 | # randomly initialize the user and item latent factors 236 | cdef np.ndarray[np.float32_t, ndim=2] X = np.random.normal(init_mean, init_std, (M, num_factors)).astype(np.float32) 237 | cdef np.ndarray[np.float32_t, ndim=2] Y = np.random.normal(init_mean, init_std, (N, num_factors)).astype(np.float32) 238 | 239 | # sample the training triples 240 | cdef np.ndarray[np.int64_t, ndim=2] sample 241 | if sampling_type == 'user_uniform_item_uniform': 242 | sample = user_uniform_item_uniform_sampling(R, nnz, replace=sample_with_replacement, seed=rnd_seed, verbose=verbose) 243 | elif sampling_type == 'user_uniform_item_pop': 244 | sample = user_uniform_item_pop_sampling(R, nnz, alpha=sampling_pop_alpha, seed=rnd_seed, verbose=verbose) 245 | else: 246 | raise RuntimeError('Unknown sampling procedure "{}"'.format(sampling_type)) 247 | 248 | # here we define some auxiliary variables 249 | cdef int i, j, k, idx, it, n 250 | cdef float rij, rik, loss, deriv 251 | cdef np.ndarray[np.float32_t, ndim=1] X_i = np.zeros(num_factors, dtype=np.float32) 252 | cdef np.ndarray[np.float32_t, ndim=1] Y_j = np.zeros(num_factors, dtype=np.float32) 253 | cdef np.ndarray[np.float32_t, ndim=1] Y_k = np.zeros(num_factors, dtype=np.float32) 254 | 255 | # 256 | # Stochastic Gradient Descent starts here 257 | # 258 | for it in range(iters): # for each iteration 259 | loss = 0.0 260 | for n in range(nnz): 261 | i, j, k = sample[n] 262 | # get the user and item factors 263 | X_i = X[i].copy() 264 | Y_j = Y[j].copy() 265 | Y_k = Y[k].copy() 266 | # compute the difference of the predicted scores 267 | diff_yjk = Y_j - Y_k 268 | zijk = np.dot(X_i, diff_yjk) 269 | # 
compute the sigmoid 270 | sig = 1. / (1. + exp(-zijk)) 271 | # update the loss 272 | loss += log(sig) 273 | 274 | # adjust the latent factors 275 | deriv = 1. - sig 276 | X[i] += lrate * (deriv * diff_yjk - user_reg * X_i) 277 | Y[j] += lrate * (deriv * X_i - pos_reg * Y_j) 278 | Y[k] += lrate * (-deriv * X_i - neg_reg * Y_k) 279 | 280 | loss /= nnz 281 | if verbose: 282 | print('Iter {} - loss: {:.4f}'.format(it+1, loss)) 283 | # update the learning rate 284 | lrate *= lrate_decay 285 | if use_resampling: 286 | if sampling_type == 'user_uniform_item_uniform': 287 | sample = user_uniform_item_uniform_sampling(R, nnz, replace=sample_with_replacement, seed=rnd_seed, verbose=verbose) 288 | elif sampling_type == 'user_uniform_item_pop': 289 | sample = user_uniform_item_pop_sampling(R, nnz, alpha=sampling_pop_alpha, seed=rnd_seed, verbose=verbose) 290 | 291 | return X, Y 292 | 293 | def user_uniform_item_uniform_sampling(R, size, replace=True, seed=1234, verbose=True): 294 | # use Cython MemoryViews for fast access to the sparse structure of R 295 | cdef int [:] col_indices = R.indices, indptr = R.indptr 296 | cdef int M = R.shape[0], N = R.shape[1] 297 | cdef int nnz = len(R.data) 298 | 299 | cdef np.ndarray[np.int64_t, ndim=2] sample = np.zeros((size, 3), dtype=np.int64) 300 | cdef np.ndarray[np.int8_t, ndim=1] is_sampled # boolean arrays are not yet supported by Cython 301 | if not replace: 302 | is_sampled = np.zeros(nnz, dtype=np.int8) 303 | 304 | # set the seed of the random number generator 305 | np.random.seed(seed) 306 | 307 | cdef int i=0, start, end, iid, jid, kid, idx 308 | cdef np.ndarray[np.int64_t, ndim=1] aux, neg_candidates 309 | cdef int [:] pos_candidates 310 | while i < size: 311 | # 1) sample a user from a uniform distribution 312 | iid = np.random.choice(M) 313 | 314 | # 2) sample a positive item uniformly at random 315 | start = indptr[iid] 316 | end = indptr[iid+1] 317 | pos_candidates = col_indices[start:end] 318 | if start == end: 319 | # empty candidate set 320 | continue 321 | if replace: 322 | # sample positive items with replacement 323 | jid = np.random.choice(pos_candidates) 324 | else: 325 | # sample positive items without replacement 326 | # use a index vector between start and end 327 | aux = np.arange(start, end) 328 | if np.all(is_sampled[aux]): 329 | # all positive items have been already sampled 330 | continue 331 | idx = np.random.choice(aux) 332 | while is_sampled[idx]: 333 | # TODO: remove idx from aux to speed up the sampling 334 | idx = np.random.choice(aux) 335 | is_sampled[idx] = 1 336 | jid = col_indices[idx] 337 | 338 | # 3) sample a negative item uniformly at random 339 | # build the candidate set of negative items 340 | # TODO: precompute the negative candidate set for speed-up 341 | neg_candidates = np.delete(np.arange(N), pos_candidates) 342 | kid = np.random.choice(neg_candidates) 343 | sample[i, :] = [iid, jid, kid] 344 | i += 1 345 | if verbose and i % 10000 == 0: 346 | print('Sampling... 
{:.2f}% complete'.format(i/size*100)) 347 | return sample 348 | 349 | 350 | def user_uniform_item_pop_sampling(R, size, alpha=1., seed=1234, verbose=True): 351 | # use Cython MemoryViews for fast access to the sparse structure of R 352 | cdef int [:] col_indices = R.indices, indptr = R.indptr 353 | cdef int M = R.shape[0], N = R.shape[1] 354 | cdef int nnz = len(R.data) 355 | 356 | cdef np.ndarray[np.int64_t, ndim=2] sample = np.zeros((size, 3), dtype=np.int64) 357 | 358 | # compute the item popularity 359 | cdef np.ndarray[np.float32_t, ndim=1] item_pop = np.asarray(np.sum(R > 0, axis=0)).squeeze().astype(np.float32) 360 | # smooth popularity with an exponential factor alpha 361 | item_pop = np.power(item_pop, alpha) 362 | 363 | # set the seed of the random number generator 364 | np.random.seed(seed) 365 | 366 | cdef int i=0, start, end, iid, jid, kid, idx 367 | cdef np.ndarray[np.int64_t, ndim=1] aux, neg_candidates 368 | cdef int [:] pos_candidates 369 | cdef np.ndarray[np.float32_t, ndim=1] p 370 | while i < size: 371 | # 1) sample a user from a uniform distribution 372 | iid = np.random.choice(M) 373 | 374 | # 2) sample a positive item proportionally to its popularity 375 | start = indptr[iid] 376 | end = indptr[iid+1] 377 | pos_candidates = col_indices[start:end] 378 | if start == end: 379 | # empty candidate set 380 | continue 381 | # always sample with replacement 382 | p = item_pop[pos_candidates] 383 | p /= np.sum(p) 384 | jid = np.random.choice(pos_candidates, p=p) 385 | 386 | # 3) sample a negative item uniformly at random 387 | # build the candidate set of negative items 388 | # TODO: precompute the negative candidate set for speed-up 389 | neg_candidates = np.delete(np.arange(N), pos_candidates) 390 | kid = np.random.choice(neg_candidates) 391 | sample[i, :] = [iid, jid, kid] 392 | i += 1 393 | if verbose and i % 10000 == 0: 394 | print('Sampling... {:.2f}% complete'.format(i/size*100)) 395 | return sample -------------------------------------------------------------------------------- /MatrixFactorization/Cython/build/temp.linux-x86_64-3.6/MF_BPR_Cython_Epoch.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MaurizioFD/RecSys_Course_2017/dfef3655df8ce32428458aafabfc94ed688231f6/MatrixFactorization/Cython/build/temp.linux-x86_64-3.6/MF_BPR_Cython_Epoch.o -------------------------------------------------------------------------------- /MatrixFactorization/Cython/compileCython.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on 16/07/2017 5 | 6 | @author: Maurizio Ferrari Dacrema 7 | """ 8 | 9 | 10 | try: 11 | from setuptools import setup 12 | from setuptools import Extension 13 | except ImportError: 14 | from distutils.core import setup 15 | from distutils.extension import Extension 16 | 17 | 18 | from Cython.Distutils import build_ext 19 | 20 | 21 | import numpy 22 | 23 | import sys 24 | import re 25 | 26 | 27 | if len(sys.argv) != 4: 28 | raise ValueError("Wrong number of paramethers received. 
Expected 4, got {}".format(sys.argv)) 29 | 30 | 31 | #fileToCompile = 'MF_BPR_Cython_Epoch.pyx' 32 | 33 | # Get the name of the file to compile 34 | fileToCompile = sys.argv[1] 35 | # Remove the argument from sys argv in order for it to contain only what setup needs 36 | del sys.argv[1] 37 | 38 | extensionName = re.sub("\.pyx", "", fileToCompile) 39 | 40 | 41 | ext_modules = Extension(extensionName, 42 | [fileToCompile], 43 | extra_compile_args=['-O3'], 44 | include_dirs=[numpy.get_include(),], 45 | ) 46 | 47 | setup( 48 | cmdclass={'build_ext': build_ext}, 49 | ext_modules=[ext_modules] 50 | ) 51 | 52 | -------------------------------------------------------------------------------- /MatrixFactorization/MatrixFactorization_RMSE.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on 23/10/17 5 | 6 | @author: Massimo Quadrana 7 | """ 8 | 9 | import logging 10 | 11 | import numpy as np 12 | from Base.Recommender_utils import check_matrix 13 | 14 | from Base.Recommender import Recommender 15 | from MatrixFactorization.Cython.MF_RMSE import FunkSVD_sgd, AsySVD_sgd, AsySVD_compute_user_factors, BPRMF_sgd 16 | 17 | logger = logging.getLogger(__name__) 18 | logging.basicConfig( 19 | level=logging.INFO, 20 | format="%(asctime)s: %(name)s: %(levelname)s: %(message)s") 21 | 22 | 23 | 24 | 25 | class FunkSVD(Recommender): 26 | ''' 27 | FunkSVD model 28 | Reference: http://sifter.org/~simon/journal/20061211.html 29 | 30 | Factorizes the rating matrix R into the dot product of two matrices U and V of latent factors. 31 | U represent the user latent factors, V the item latent factors. 32 | The model is learned by solving the following regularized Least-squares objective function with Stochastic Gradient Descent 33 | \operatornamewithlimits{argmin} \limits_{U,V}\frac{1}{2}||R - UV^T||^2_2 + \frac{\lambda}{2}(||U||^2_F + ||V||^2_F) 34 | Latent factors are initialized from a Normal distribution with given mean and std. 
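    As a sketch, each SGD step on an observed rating r_ij applies the standard FunkSVD updates
    (the exact implementation is the Cython function FunkSVD_sgd imported above):
        e_ij = r_ij - U_i . V_j
        U_i <- U_i + learning_rate * (e_ij * V_j - reg * U_i)
        V_j <- V_j + learning_rate * (e_ij * U_i - reg * V_j)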
35 | ''' 36 | 37 | # TODO: add global effects 38 | def __init__(self, URM_train): 39 | 40 | super(FunkSVD, self).__init__() 41 | 42 | self.URM_train = check_matrix(URM_train, 'csr', dtype=np.float32) 43 | 44 | 45 | 46 | def __str__(self): 47 | return "FunkSVD(num_factors={}, lrate={}, reg={}, iters={}, init_mean={}, " \ 48 | "init_std={}, lrate_decay={}, rnd_seed={})".format( 49 | self.num_factors, self.learning_rate, self.reg, self.epochs, self.init_mean, self.init_std, self.lrate_decay, 50 | self.rnd_seed 51 | ) 52 | 53 | 54 | def fit(self, num_factors=50, 55 | learning_rate=0.01, 56 | reg=0.015, 57 | epochs=10, 58 | init_mean=0.0, 59 | init_std=0.1, 60 | lrate_decay=1.0, 61 | rnd_seed=42): 62 | """ 63 | 64 | Initialize the model 65 | :param num_factors: number of latent factors 66 | :param learning_rate: initial learning rate used in SGD 67 | :param reg: regularization term 68 | :param epochs: number of iterations in training the model with SGD 69 | :param init_mean: mean used to initialize the latent factors 70 | :param init_std: standard deviation used to initialize the latent factors 71 | :param lrate_decay: learning rate decay 72 | :param rnd_seed: random seed 73 | """ 74 | 75 | self.num_factors = num_factors 76 | self.learning_rate = learning_rate 77 | self.reg = reg 78 | self.epochs = epochs 79 | self.init_mean = init_mean 80 | self.init_std = init_std 81 | self.lrate_decay = lrate_decay 82 | self.rnd_seed = rnd_seed 83 | 84 | self.U, self.V = FunkSVD_sgd(self.URM_train, self.num_factors, self.learning_rate, self.reg, self.epochs, self.init_mean, 85 | self.init_std, 86 | self.lrate_decay, self.rnd_seed) 87 | 88 | # def recommend(self, user_id, n=None, exclude_seen=True): 89 | # scores = np.dot(self.U[user_id], self.V.T) 90 | # ranking = scores.argsort()[::-1] 91 | # # rank items 92 | # if exclude_seen: 93 | # ranking = self._filter_seen(user_id, ranking) 94 | # return ranking[:n] 95 | # 96 | # 97 | # def _get_user_ratings(self, user_id): 98 | # return self.dataset[user_id] 99 | # 100 | # def _get_item_ratings(self, item_id): 101 | # return self.dataset[:, item_id] 102 | # 103 | # 104 | # def _filter_seen(self, user_id, ranking): 105 | # user_profile = self._get_user_ratings(user_id) 106 | # seen = user_profile.indices 107 | # unseen_mask = np.in1d(ranking, seen, assume_unique=True, invert=True) 108 | # return ranking[unseen_mask] 109 | 110 | 111 | 112 | 113 | 114 | def recommendBatch(self, users_in_batch, n=None, exclude_seen=True, filterTopPop = False, filterCustomItems = False): 115 | 116 | # compute the scores using the dot product 117 | user_profile_batch = self.URM_train[users_in_batch] 118 | 119 | scores_array = np.dot(self.U[users_in_batch], self.V.T) 120 | 121 | if self.normalize: 122 | raise ValueError("Not implemented") 123 | 124 | # To exclude seen items perform a boolean indexing and replace their score with -inf 125 | # Seen items will be at the bottom of the list but there is no guarantee they'll NOT be 126 | # recommended 127 | if exclude_seen: 128 | scores_array[user_profile_batch.nonzero()] = -np.inf 129 | 130 | if filterTopPop: 131 | scores_array[:,self.filterTopPop_ItemsID] = -np.inf 132 | 133 | if filterCustomItems: 134 | scores_array[:, self.filterCustomItems_ItemsID] = -np.inf 135 | 136 | 137 | # rank items and mirror column to obtain a ranking in descending score 138 | #ranking = (-scores_array).argsort(axis=1) 139 | #ranking = np.fliplr(ranking) 140 | #ranking = ranking[:,0:n] 141 | 142 | ranking = np.zeros((scores_array.shape[0],n), dtype=np.int) 143 | 144 
| for row_index in range(scores_array.shape[0]): 145 | scores = scores_array[row_index] 146 | 147 | relevant_items_partition = (-scores).argpartition(n)[0:n] 148 | relevant_items_partition_sorting = np.argsort(-scores[relevant_items_partition]) 149 | ranking[row_index] = relevant_items_partition[relevant_items_partition_sorting] 150 | 151 | 152 | return ranking 153 | 154 | 155 | 156 | def recommend(self, user_id, n=None, exclude_seen=True, filterTopPop = False, filterCustomItems = False): 157 | 158 | 159 | if n==None: 160 | n=self.URM_train.shape[1]-1 161 | 162 | scores_array = np.dot(self.U[user_id], self.V.T) 163 | 164 | if self.normalize: 165 | raise ValueError("Not implemented") 166 | 167 | 168 | if exclude_seen: 169 | scores = self._filter_seen_on_scores(user_id, scores_array) 170 | 171 | if filterTopPop: 172 | scores = self._filter_TopPop_on_scores(scores_array) 173 | 174 | if filterCustomItems: 175 | scores = self._filterCustomItems_on_scores(scores_array) 176 | 177 | 178 | # rank items and mirror column to obtain a ranking in descending score 179 | #ranking = scores.argsort() 180 | #ranking = np.flip(ranking, axis=0) 181 | 182 | # Sorting is done in three steps. Faster then plain np.argsort for higher number of items 183 | # - Partition the data to extract the set of relevant items 184 | # - Sort only the relevant items 185 | # - Get the original item index 186 | relevant_items_partition = (-scores_array).argpartition(n)[0:n] 187 | relevant_items_partition_sorting = np.argsort(-scores_array[relevant_items_partition]) 188 | ranking = relevant_items_partition[relevant_items_partition_sorting] 189 | 190 | 191 | return ranking 192 | 193 | 194 | 195 | 196 | class AsySVD(Recommender): 197 | ''' 198 | AsymmetricSVD model 199 | Reference: Factorization Meets the Neighborhood: a Multifaceted Collaborative Filtering Model (Koren, 2008) 200 | 201 | Factorizes the rating matrix R into two matrices X and Y of latent factors, which both represent item latent features. 202 | Users are represented by aggregating the latent features in Y of items they have already rated. 203 | Rating prediction is performed by computing the dot product of this accumulated user profile with the target item's 204 | latent factor in X. 205 | 206 | The model is learned by solving the following regularized Least-squares objective function with Stochastic Gradient Descent 207 | \operatornamewithlimits{argmin}\limits_{x*,y*}\frac{1}{2}\sum_{i,j \in R}(r_{ij} - x_j^T \sum_{l \in R(i)} r_{il}y_l)^2 + \frac{\lambda}{2}(\sum_{i}{||x_i||^2} + \sum_{j}{||y_j||^2}) 208 | ''' 209 | 210 | # TODO: add global effects 211 | # TODO: recommendation for new-users. 
Update the precomputed profiles online
212 |     def __init__(self,
213 |                  num_factors=50,
214 |                  lrate=0.01,
215 |                  reg=0.015,
216 |                  iters=10,
217 |                  init_mean=0.0,
218 |                  init_std=0.1,
219 |                  lrate_decay=1.0,
220 |                  rnd_seed=42):
221 |         '''
222 |         Initialize the model
223 |         :param num_factors: number of latent factors
224 |         :param lrate: initial learning rate used in SGD
225 |         :param reg: regularization term
226 |         :param iters: number of iterations in training the model with SGD
227 |         :param init_mean: mean used to initialize the latent factors
228 |         :param init_std: standard deviation used to initialize the latent factors
229 |         :param lrate_decay: learning rate decay
230 |         :param rnd_seed: random seed
231 |         '''
232 |         super(AsySVD, self).__init__()
233 |         self.num_factors = num_factors
234 |         self.lrate = lrate
235 |         self.reg = reg
236 |         self.iters = iters
237 |         self.init_mean = init_mean
238 |         self.init_std = init_std
239 |         self.lrate_decay = lrate_decay
240 |         self.rnd_seed = rnd_seed
241 | 
242 |     def __str__(self):
243 |         return "AsySVD(num_factors={}, lrate={}, reg={}, iters={}, init_mean={}, " \
244 |                "init_std={}, lrate_decay={}, rnd_seed={})".format(
245 |             self.num_factors, self.lrate, self.reg, self.iters, self.init_mean, self.init_std, self.lrate_decay,
246 |             self.rnd_seed
247 |         )
248 | 
249 |     def fit(self, R):
250 |         self.dataset = R
251 |         R = check_matrix(R, 'csr', dtype=np.float32)
252 |         self.X, self.Y = AsySVD_sgd(R, self.num_factors, self.lrate, self.reg, self.iters, self.init_mean,
253 |                                     self.init_std,
254 |                                     self.lrate_decay, self.rnd_seed)
255 |         # precompute the user factors
256 |         M = R.shape[0]
257 |         self.U = np.vstack([AsySVD_compute_user_factors(R[i], self.Y) for i in range(M)])
258 | 
259 |     def recommend(self, user_id, n=None, exclude_seen=True):
260 |         scores = np.dot(self.X, self.U[user_id].T)
261 |         ranking = scores.argsort()[::-1]
262 |         # rank items
263 |         if exclude_seen:
264 |             ranking = self._filter_seen(user_id, ranking)
265 |         return ranking[:n]
266 | 
267 | 
268 |     def _get_user_ratings(self, user_id):
269 |         return self.dataset[user_id]
270 | 
271 |     def _get_item_ratings(self, item_id):
272 |         return self.dataset[:, item_id]
273 | 
274 | 
275 |     def _filter_seen(self, user_id, ranking):
276 |         user_profile = self._get_user_ratings(user_id)
277 |         seen = user_profile.indices
278 |         unseen_mask = np.in1d(ranking, seen, assume_unique=True, invert=True)
279 |         return ranking[unseen_mask]
280 | 
281 | 
282 | 
283 | class IALS_numpy(Recommender):
284 |     '''
285 |     Implicit Alternating Least Squares (IALS) model, also known as Weighted Regularized Matrix Factorization
286 |     Reference: Collaborative Filtering for Implicit Feedback Datasets (Hu et al., 2008)
287 | 
288 |     Factorization model for implicit feedback.
289 |     First, it splits the feedback matrix R element-wise into a Preference matrix P and a Confidence matrix C.
290 |     Then it factorizes them into the dot product of two matrices X and Y of latent factors.
291 |     X represents the user latent factors, Y the item latent factors.
292 | 
293 |     The model is learned by solving the following regularized Least-squares objective function with Alternating Least Squares
294 |     \operatornamewithlimits{argmin}\limits_{x*,y*}\frac{1}{2}\sum_{i,j}{c_{ij}(p_{ij}-x_i^T y_j)^2 + \lambda(\sum_{i}{||x_i||^2} + \sum_{j}{||y_j||^2})}
295 |     '''
296 | 
297 |     # TODO: Add support for multiple confidence scaling functions (e.g.
linear and log scaling) 298 | def __init__(self, 299 | num_factors=50, 300 | reg=0.015, 301 | iters=10, 302 | scaling='linear', 303 | alpha=40, 304 | epsilon=1.0, 305 | init_mean=0.0, 306 | init_std=0.1, 307 | rnd_seed=42): 308 | ''' 309 | Initialize the model 310 | :param num_factors: number of latent factors 311 | :param reg: regularization term 312 | :param iters: number of iterations in training the model with SGD 313 | :param scaling: supported scaling modes for the observed values: 'linear' or 'log' 314 | :param alpha: scaling factor to compute confidence scores 315 | :param epsilon: epsilon used in log scaling only 316 | :param init_mean: mean used to initialize the latent factors 317 | :param init_std: standard deviation used to initialize the latent factors 318 | :param rnd_seed: random seed 319 | ''' 320 | 321 | super(IALS_numpy, self).__init__() 322 | assert scaling in ['linear', 'log'], 'Unsupported scaling: {}'.format(scaling) 323 | 324 | self.num_factors = num_factors 325 | self.reg = reg 326 | self.iters = iters 327 | self.scaling = scaling 328 | self.alpha = alpha 329 | self.epsilon = epsilon 330 | self.init_mean = init_mean 331 | self.init_std = init_std 332 | self.rnd_seed = rnd_seed 333 | 334 | def __str__(self): 335 | return "WRMF-iALS(num_factors={}, reg={}, iters={}, scaling={}, alpha={}, episilon={}, init_mean={}, " \ 336 | "init_std={}, rnd_seed={})".format( 337 | self.num_factors, self.reg, self.iters, self.scaling, self.alpha, self.epsilon, self.init_mean, 338 | self.init_std, self.rnd_seed 339 | ) 340 | 341 | def _linear_scaling(self, R): 342 | C = R.copy().tocsr() 343 | C.data *= self.alpha 344 | C.data += 1.0 345 | return C 346 | 347 | def _log_scaling(self, R): 348 | C = R.copy().tocsr() 349 | C.data = 1.0 + self.alpha * np.log(1.0 + C.data / self.epsilon) 350 | return C 351 | 352 | def fit(self, R): 353 | self.dataset = R 354 | # compute the confidence matrix 355 | if self.scaling == 'linear': 356 | C = self._linear_scaling(R) 357 | else: 358 | C = self._log_scaling(R) 359 | 360 | Ct = C.T.tocsr() 361 | M, N = R.shape 362 | 363 | # set the seed 364 | np.random.seed(self.rnd_seed) 365 | 366 | # initialize the latent factors 367 | self.X = np.random.normal(self.init_mean, self.init_std, size=(M, self.num_factors)) 368 | self.Y = np.random.normal(self.init_mean, self.init_std, size=(N, self.num_factors)) 369 | 370 | for it in range(self.iters): 371 | self.X = self._lsq_solver_fast(C, self.X, self.Y, self.reg) 372 | self.Y = self._lsq_solver_fast(Ct, self.Y, self.X, self.reg) 373 | logger.debug('Finished iter {}'.format(it + 1)) 374 | 375 | def recommend(self, user_id, n=None, exclude_seen=True): 376 | scores = np.dot(self.X[user_id], self.Y.T) 377 | ranking = scores.argsort()[::-1] 378 | # rank items 379 | if exclude_seen: 380 | ranking = self._filter_seen(user_id, ranking) 381 | return ranking[:n] 382 | 383 | def _lsq_solver(self, C, X, Y, reg): 384 | # precompute YtY 385 | rows, factors = X.shape 386 | YtY = np.dot(Y.T, Y) 387 | 388 | for i in range(rows): 389 | # accumulate YtCiY + reg*I in A 390 | A = YtY + reg * np.eye(factors) 391 | 392 | # accumulate Yt*Ci*p(i) in b 393 | b = np.zeros(factors) 394 | 395 | for j, cij in self._nonzeros(C, i): 396 | vj = Y[j] 397 | A += (cij - 1.0) * np.outer(vj, vj) 398 | b += cij * vj 399 | 400 | X[i] = np.linalg.solve(A, b) 401 | return X 402 | 403 | def _lsq_solver_fast(self, C, X, Y, reg): 404 | # precompute YtY 405 | rows, factors = X.shape 406 | YtY = np.dot(Y.T, Y) 407 | 408 | for i in range(rows): 409 | # accumulate 
YtCiY + reg*I in A 410 | A = YtY + reg * np.eye(factors) 411 | 412 | start, end = C.indptr[i], C.indptr[i + 1] 413 | j = C.indices[start:end] # indices of the non-zeros in Ci 414 | ci = C.data[start:end] # non-zeros in Ci 415 | 416 | Yj = Y[j] # only the factors with non-zero confidence 417 | # compute Yt(Ci-I)Y 418 | aux = np.dot(Yj.T, np.diag(ci - 1.0)) 419 | A += np.dot(aux, Yj) 420 | # compute YtCi 421 | b = np.dot(Yj.T, ci) 422 | 423 | X[i] = np.linalg.solve(A, b) 424 | return X 425 | 426 | def _nonzeros(self, R, row): 427 | for i in range(R.indptr[row], R.indptr[row + 1]): 428 | yield (R.indices[i], R.data[i]) 429 | 430 | 431 | def _get_user_ratings(self, user_id): 432 | return self.dataset[user_id] 433 | 434 | def _get_item_ratings(self, item_id): 435 | return self.dataset[:, item_id] 436 | 437 | 438 | def _filter_seen(self, user_id, ranking): 439 | user_profile = self._get_user_ratings(user_id) 440 | seen = user_profile.indices 441 | unseen_mask = np.in1d(ranking, seen, assume_unique=True, invert=True) 442 | return ranking[unseen_mask] 443 | 444 | 445 | 446 | class BPRMF(Recommender): 447 | ''' 448 | BPRMF model 449 | ''' 450 | 451 | # TODO: add global effects 452 | def __init__(self, 453 | num_factors=50, 454 | lrate=0.01, 455 | user_reg=0.015, 456 | pos_reg=0.015, 457 | neg_reg=0.0015, 458 | iters=10, 459 | sampling_type='user_uniform_item_uniform', 460 | sample_with_replacement=True, 461 | use_resampling=True, 462 | sampling_pop_alpha=1.0, 463 | init_mean=0.0, 464 | init_std=0.1, 465 | lrate_decay=1.0, 466 | rnd_seed=42, 467 | verbose=True): 468 | ''' 469 | Initialize the model 470 | :param num_factors: number of latent factors 471 | :param lrate: initial learning rate used in SGD 472 | :param user_reg: regularization for the user factors 473 | :param pos_reg: regularization for the factors of the positive sampled items 474 | :param neg_reg: regularization for the factors of the negative sampled items 475 | :param iters: number of iterations in training the model with SGD 476 | :param sampling_type: type of sampling. Supported types are 'user_uniform_item_uniform' and 'user_uniform_item_pop' 477 | :param sample_with_replacement: `True` to sample positive items with replacement (doesn't work with 'user_uniform_item_pop') 478 | :param use_resampling: `True` to resample at each iteration during training 479 | :param sampling_pop_alpha: float smoothing factor for popularity based samplers (e.g., 'user_uniform_item_pop') 480 | :param init_mean: mean used to initialize the latent factors 481 | :param init_std: standard deviation used to initialize the latent factors 482 | :param lrate_decay: learning rate decay 483 | :param rnd_seed: random seed 484 | :param verbose: controls verbosity in output 485 | ''' 486 | super(BPRMF, self).__init__() 487 | self.num_factors = num_factors 488 | self.lrate = lrate 489 | self.user_reg = user_reg 490 | self.pos_reg = pos_reg 491 | self.neg_reg = neg_reg 492 | self.iters = iters 493 | self.sampling_type = sampling_type 494 | self.sample_with_replacement = sample_with_replacement 495 | self.use_resampling = use_resampling 496 | self.sampling_pop_alpha = sampling_pop_alpha 497 | self.init_mean = init_mean 498 | self.init_std = init_std 499 | self.lrate_decay = lrate_decay 500 | self.rnd_seed = rnd_seed 501 | self.verbose = verbose 502 | 503 | def __str__(self): 504 | return "BPRMF(num_factors={}, lrate={}, user_reg={}. 
pos_reg={}, neg_reg={}, iters={}, " \ 505 | "sampling_type={}, sample_with_replacement={}, use_resampling={}, sampling_pop_alpha={}, init_mean={}, " \ 506 | "init_std={}, lrate_decay={}, rnd_seed={}, verbose={})".format( 507 | self.num_factors, self.lrate, self.user_reg, self.pos_reg, self.neg_reg, self.iters, 508 | self.sampling_type, self.sample_with_replacement, self.use_resampling, self.sampling_pop_alpha, 509 | self.init_mean, 510 | self.init_std, 511 | self.lrate_decay, 512 | self.rnd_seed, 513 | self.verbose 514 | ) 515 | 516 | def fit(self, R): 517 | self.dataset = R 518 | R = check_matrix(R, 'csr', dtype=np.float32) 519 | self.X, self.Y = BPRMF_sgd(R, 520 | num_factors=self.num_factors, 521 | lrate=self.lrate, 522 | user_reg=self.user_reg, 523 | pos_reg=self.pos_reg, 524 | neg_reg=self.neg_reg, 525 | iters=self.iters, 526 | sampling_type=self.sampling_type, 527 | sample_with_replacement=self.sample_with_replacement, 528 | use_resampling=self.use_resampling, 529 | sampling_pop_alpha=self.sampling_pop_alpha, 530 | init_mean=self.init_mean, 531 | init_std=self.init_std, 532 | lrate_decay=self.lrate_decay, 533 | rnd_seed=self.rnd_seed, 534 | verbose=self.verbose) 535 | 536 | def recommend(self, user_id, n=None, exclude_seen=True): 537 | scores = np.dot(self.X[user_id], self.Y.T) 538 | ranking = scores.argsort()[::-1] 539 | # rank items 540 | if exclude_seen: 541 | ranking = self._filter_seen(user_id, ranking) 542 | return ranking[:n] 543 | 544 | 545 | 546 | def _get_user_ratings(self, user_id): 547 | return self.dataset[user_id] 548 | 549 | def _get_item_ratings(self, item_id): 550 | return self.dataset[:, item_id] 551 | 552 | 553 | def _filter_seen(self, user_id, ranking): 554 | user_profile = self._get_user_ratings(user_id) 555 | seen = user_profile.indices 556 | unseen_mask = np.in1d(ranking, seen, assume_unique=True, invert=True) 557 | return ranking[unseen_mask] 558 | 559 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # RecSys Course 2017 2 | This is the official repository for the 2017 Recommender Systems course at Polimi. 3 | 4 | ## This repo is obsolete, please refer to the updated version [HERE](https://github.com/MaurizioFD/RecSys_Course_2018) 5 | 6 | 7 | #### This repo contains a Cython implementation of: 8 | - SLIM BPR: Uses a Cython tree-based sparse matrix, suitable for datasets whose number of items is too big for the 9 | dense similarity matrix to fit in memory. 10 | Dense similarity is also supported. 11 | - MF BPR: Matrix factorization optimizing BPR 12 | - FunkSVD: Matrix factorization optimizing RMSE 13 | - AsymmetricSVD 14 | 15 | #### This repo contains a Python implementation of: 16 | - Item-based KNN collaborative 17 | - Item-based KNN content 18 | - User-based KNN 19 | - SLIM_RMSE: SLIM solver using ElasticNet. The solver fits every column in the similarity matrix independently 20 | 21 | #### This repo also provides an implementation of: 22 | 23 | - Cosine Similarity, Adjusted Cosine, Pearson Correlation, Jaccard Correlation, Tanimoto Coefficient: Implemented both in Python and Cython with the same interface, Base.cosine_similarity and Base.Cython.cosine_similarity 24 | - MAP, recall, precision, ROC-AUC, MRR, RR, NDCG to be used in testing 25 | - Movielens10MReader: reads movielens 10M rating file, splits it into three URMs for train, test and validation. 26 | 27 | 28 | Cython code is already compiled for Linux. 
To recompile the code just set the recompile_cython flag to True.
29 | For other OSs such as Windows the C-imported numpy interface might be different (e.g. return type long long instead of long), therefore the code could require modifications in order to compile.
30 | 
31 | 
32 | ##### In "all_algorithms.py" you can see how to use every model.
--------------------------------------------------------------------------------
/SLIM_BPR/Cython/SLIM_BPR_Cython.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | Created on 07/09/17
5 | 
6 | @author: Maurizio Ferrari Dacrema
7 | """
8 | 
9 | from Base.Recommender_utils import similarityMatrixTopK
10 | from SLIM_BPR.SLIM_BPR_Python import SLIM_BPR_Python
11 | import subprocess
12 | import os, sys
13 | import numpy as np
14 | 
15 | 
16 | class SLIM_BPR_Cython(SLIM_BPR_Python):
17 | 
18 | 
19 |     def __init__(self, URM_train, positive_threshold=4,
20 |                  recompile_cython = False, sparse_weights = False,
21 |                  symmetric = True, sgd_mode='adagrad'):
22 | 
23 | 
24 |         super(SLIM_BPR_Cython, self).__init__(URM_train,
25 |                                               positive_threshold=positive_threshold,
26 |                                               sparse_weights = sparse_weights)
27 | 
28 | 
29 |         self.sgd_mode = sgd_mode
30 |         self.symmetric = symmetric
31 | 
32 |         if not sparse_weights:
33 | 
34 |             n_items = URM_train.shape[1]
35 |             requiredGB = 8 * n_items**2 / 1e+06
36 | 
37 |             if symmetric:
38 |                 requiredGB /=2
39 | 
40 |             print("SLIM_BPR_Cython: Estimated memory required for similarity matrix of {} items is {:.2f} MB".format(n_items, requiredGB))
41 | 
42 | 
43 | 
44 | 
45 |         if recompile_cython:
46 |             print("Compiling in Cython")
47 |             self.runCompilationScript()
48 |             print("Compilation Complete")
49 | 
50 | 
51 | 
52 |     def fit(self, epochs=30, logFile=None, URM_test=None, filterTopPop = False, minRatingsPerUser=1,
53 |             batch_size = 1000, validate_every_N_epochs = 1, start_validation_after_N_epochs = 0,
54 |             lambda_i = 0.0, lambda_j = 0.0, learning_rate = 0.01, topK = 200, sgd_mode='adagrad'):
55 | 
56 | 
57 |         # Select only positive interactions
58 |         URM_train_positive = self.URM_train.copy()
59 | 
60 |         URM_train_positive.data = URM_train_positive.data >= self.positive_threshold
61 |         URM_train_positive.eliminate_zeros()
62 | 
63 | 
64 |         self.sgd_mode = sgd_mode
65 | 
66 | 
67 |         # Import compiled module
68 |         from SLIM_BPR.Cython.SLIM_BPR_Cython_Epoch import SLIM_BPR_Cython_Epoch
69 | 
70 | 
71 |         self.cythonEpoch = SLIM_BPR_Cython_Epoch(self.URM_mask,
72 |                                                  sparse_weights = self.sparse_weights,
73 |                                                  topK=topK,
74 |                                                  learning_rate=learning_rate,
75 |                                                  li_reg = lambda_i,
76 |                                                  lj_reg = lambda_j,
77 |                                                  batch_size=1,
78 |                                                  symmetric = self.symmetric,
79 |                                                  sgd_mode = sgd_mode)
80 | 
81 | 
82 |         # Call super.fit to start training
83 |         super(SLIM_BPR_Cython, self).fit_alreadyInitialized(epochs=epochs,
84 |                                     logFile=logFile,
85 |                                     URM_test=URM_test,
86 |                                     filterTopPop=filterTopPop,
87 |                                     minRatingsPerUser=minRatingsPerUser,
88 |                                     batch_size=batch_size,
89 |                                     validate_every_N_epochs=validate_every_N_epochs,
90 |                                     start_validation_after_N_epochs=start_validation_after_N_epochs,
91 |                                     lambda_i = lambda_i,
92 |                                     lambda_j = lambda_j,
93 |                                     learning_rate = learning_rate,
94 |                                     topK = topK)
95 | 
96 | 
97 | 
98 | 
99 |     def runCompilationScript(self):
100 | 
101 |         # Run compile script setting the working directory to ensure the compiled files are contained in the
102 |         # appropriate subfolder and not the project root
103 | 
104 |         compiledModuleSubfolder = "/SLIM_BPR/Cython"
105 |         #fileToCompile_list = ['Sparse_Matrix_CSR.pyx',
'SLIM_BPR_Cython_Epoch.pyx'] 106 | fileToCompile_list = ['SLIM_BPR_Cython_Epoch.pyx'] 107 | 108 | for fileToCompile in fileToCompile_list: 109 | 110 | command = ['python', 111 | 'compileCython.py', 112 | fileToCompile, 113 | 'build_ext', 114 | '--inplace' 115 | ] 116 | 117 | 118 | output = subprocess.check_output(' '.join(command), shell=True, cwd=os.getcwd() + compiledModuleSubfolder) 119 | 120 | try: 121 | 122 | command = ['cython', 123 | fileToCompile, 124 | '-a' 125 | ] 126 | 127 | output = subprocess.check_output(' '.join(command), shell=True, cwd=os.getcwd() + compiledModuleSubfolder) 128 | 129 | except: 130 | pass 131 | 132 | 133 | print("Compiled module saved in subfolder: {}".format(compiledModuleSubfolder)) 134 | 135 | # Command to run compilation script 136 | #python compileCython.py SLIM_BPR_Cython_Epoch.pyx build_ext --inplace 137 | 138 | # Command to generate html report 139 | # cython -a SLIM_BPR_Cython_Epoch.pyx 140 | 141 | 142 | def updateSimilarityMatrix(self): 143 | 144 | self.S = self.cythonEpoch.get_S() 145 | 146 | if self.sparse_weights: 147 | self.W_sparse = self.S 148 | else: 149 | self.W = self.S 150 | 151 | 152 | 153 | def epochIteration(self): 154 | 155 | self.cythonEpoch.epochIteration_Cython() 156 | 157 | 158 | 159 | 160 | def writeCurrentConfig(self, currentEpoch, results_run, logFile): 161 | 162 | current_config = {'learn_rate': self.learning_rate, 163 | 'topK_similarity': self.topK, 164 | 'epoch': currentEpoch, 165 | 'sgd_mode': self.sgd_mode} 166 | 167 | print("Test case: {}\nResults {}\n".format(current_config, results_run)) 168 | # print("Weights: {}\n".format(str(list(self.weights)))) 169 | 170 | sys.stdout.flush() 171 | 172 | if (logFile != None): 173 | logFile.write("Test case: {}, Results {}\n".format(current_config, results_run)) 174 | # logFile.write("Weights: {}\n".format(str(list(self.weights)))) 175 | logFile.flush() 176 | -------------------------------------------------------------------------------- /SLIM_BPR/Cython/SLIM_BPR_Cython_Epoch.cpython-36m-x86_64-linux-gnu.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MaurizioFD/RecSys_Course_2017/dfef3655df8ce32428458aafabfc94ed688231f6/SLIM_BPR/Cython/SLIM_BPR_Cython_Epoch.cpython-36m-x86_64-linux-gnu.so -------------------------------------------------------------------------------- /SLIM_BPR/Cython/build/lib.linux-x86_64-3.6/SLIM_BPR_Cython_Epoch.cpython-36m-x86_64-linux-gnu.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MaurizioFD/RecSys_Course_2017/dfef3655df8ce32428458aafabfc94ed688231f6/SLIM_BPR/Cython/build/lib.linux-x86_64-3.6/SLIM_BPR_Cython_Epoch.cpython-36m-x86_64-linux-gnu.so -------------------------------------------------------------------------------- /SLIM_BPR/Cython/build/temp.linux-x86_64-3.6/SLIM_BPR_Cython_Epoch.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MaurizioFD/RecSys_Course_2017/dfef3655df8ce32428458aafabfc94ed688231f6/SLIM_BPR/Cython/build/temp.linux-x86_64-3.6/SLIM_BPR_Cython_Epoch.o -------------------------------------------------------------------------------- /SLIM_BPR/Cython/build/temp.linux-x86_64-3.6/Sparse_Matrix_CSR.o: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/MaurizioFD/RecSys_Course_2017/dfef3655df8ce32428458aafabfc94ed688231f6/SLIM_BPR/Cython/build/temp.linux-x86_64-3.6/Sparse_Matrix_CSR.o -------------------------------------------------------------------------------- /SLIM_BPR/Cython/build/temp.linux-x86_64-3.6/Sparse_Matrix_Tree_CSR.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MaurizioFD/RecSys_Course_2017/dfef3655df8ce32428458aafabfc94ed688231f6/SLIM_BPR/Cython/build/temp.linux-x86_64-3.6/Sparse_Matrix_Tree_CSR.o -------------------------------------------------------------------------------- /SLIM_BPR/Cython/build/temp.win-amd64-3.6/Release/SLIM_BPR_Cython_Epoch.cp36-win_amd64.def: -------------------------------------------------------------------------------- 1 | LIBRARY SLIM_BPR_Cython_Epoch.cp36-win_amd64.pyd 2 | EXPORTS 3 | PyInit_SLIM_BPR_Cython_Epoch 4 | -------------------------------------------------------------------------------- /SLIM_BPR/Cython/build/temp.win-amd64-3.6/Release/Sparse_Matrix_CSR.cp36-win_amd64.def: -------------------------------------------------------------------------------- 1 | LIBRARY Sparse_Matrix_CSR.cp36-win_amd64.pyd 2 | EXPORTS 3 | PyInit_Sparse_Matrix_CSR 4 | -------------------------------------------------------------------------------- /SLIM_BPR/Cython/build/temp.win-amd64-3.6/Release/Sparse_Matrix_Tree_CSR.cp36-win_amd64.def: -------------------------------------------------------------------------------- 1 | LIBRARY Sparse_Matrix_Tree_CSR.cp36-win_amd64.pyd 2 | EXPORTS 3 | PyInit_Sparse_Matrix_Tree_CSR 4 | -------------------------------------------------------------------------------- /SLIM_BPR/Cython/build/temp.win-amd64-3.6/Release/slim_bpr_cython_epoch.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MaurizioFD/RecSys_Course_2017/dfef3655df8ce32428458aafabfc94ed688231f6/SLIM_BPR/Cython/build/temp.win-amd64-3.6/Release/slim_bpr_cython_epoch.o -------------------------------------------------------------------------------- /SLIM_BPR/Cython/build/temp.win-amd64-3.6/Release/sparse_matrix_csr.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MaurizioFD/RecSys_Course_2017/dfef3655df8ce32428458aafabfc94ed688231f6/SLIM_BPR/Cython/build/temp.win-amd64-3.6/Release/sparse_matrix_csr.o -------------------------------------------------------------------------------- /SLIM_BPR/Cython/build/temp.win-amd64-3.6/Release/sparse_matrix_tree_csr.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MaurizioFD/RecSys_Course_2017/dfef3655df8ce32428458aafabfc94ed688231f6/SLIM_BPR/Cython/build/temp.win-amd64-3.6/Release/sparse_matrix_tree_csr.o -------------------------------------------------------------------------------- /SLIM_BPR/Cython/compileCython.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on 16/07/2017 5 | 6 | @author: Maurizio Ferrari Dacrema 7 | """ 8 | 9 | 10 | try: 11 | from setuptools import setup 12 | from setuptools import Extension 13 | except ImportError: 14 | from distutils.core import setup 15 | from distutils.extension import Extension 16 | 17 | 18 | from Cython.Distutils import build_ext 19 | 20 | 21 | import numpy 22 | 23 | import sys 24 | import re 25 | 26 | 27 | if 
len(sys.argv) != 4: 28 | raise ValueError("Wrong number of paramethers received. Expected 4, got {}".format(sys.argv)) 29 | 30 | 31 | #fileToCompile = 'MF_BPR_Cython_Epoch.pyx' 32 | 33 | # Get the name of the file to compile 34 | fileToCompile = sys.argv[1] 35 | # Remove the argument from sys argv in order for it to contain only what setup needs 36 | del sys.argv[1] 37 | 38 | extensionName = re.sub("\.pyx", "", fileToCompile) 39 | 40 | 41 | ext_modules = Extension(extensionName, 42 | [fileToCompile], 43 | extra_compile_args=['-O3'], 44 | include_dirs=[numpy.get_include(),], 45 | ) 46 | 47 | setup( 48 | cmdclass={'build_ext': build_ext}, 49 | ext_modules=[ext_modules] 50 | ) 51 | 52 | -------------------------------------------------------------------------------- /SLIM_BPR/SLIM_BPR_Python.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on 28 June 2017 5 | 6 | @author: Maurizio Ferrari Dacrema 7 | """ 8 | 9 | import sys 10 | import time 11 | 12 | import numpy as np 13 | import scipy.sparse as sps 14 | from Base.Recommender_utils import similarityMatrixTopK 15 | from Base.Similarity_Matrix_Recommender import Similarity_Matrix_Recommender 16 | from scipy.special import expit 17 | 18 | from BPR.BPR_Sampling import BPR_Sampling 19 | from Base.Recommender import Recommender 20 | 21 | 22 | def sigmoidFunction(x): 23 | return 1 / (1 + np.exp(-x)) 24 | 25 | 26 | class SLIM_BPR_Python(BPR_Sampling, Similarity_Matrix_Recommender, Recommender): 27 | 28 | def __init__(self, URM_train, positive_threshold=4, sparse_weights = False): 29 | super(SLIM_BPR_Python, self).__init__() 30 | 31 | """ 32 | Creates a new object for training and testing a Bayesian 33 | Personalised Ranking (BPR) SLIM 34 | 35 | This object uses the Theano library for training the model, meaning 36 | it can run on a GPU through CUDA. To make sure your Theano 37 | install is using the GPU, see: 38 | 39 | http://deeplearning.net/software/theano/tutorial/using_gpu.html 40 | 41 | When running on CPU, we recommend using OpenBLAS. 42 | 43 | http://www.openblas.net/ 44 | """ 45 | """ 46 | if objective!='sigmoid' and objective != 'logsigmoid': 47 | raise ValueError("Objective not valid. Acceptable values are 'sigmoid' and 'logsigmoid'. 
Provided value was '{}'".format(objective)) 48 | self.objective = objective 49 | """ 50 | 51 | self.URM_train = URM_train 52 | self.n_users = URM_train.shape[0] 53 | self.n_items = URM_train.shape[1] 54 | self.normalize = False 55 | self.sparse_weights = sparse_weights 56 | self.positive_threshold = positive_threshold 57 | 58 | 59 | self.URM_mask = self.URM_train.copy() 60 | 61 | self.URM_mask.data = self.URM_mask.data >= self.positive_threshold 62 | self.URM_mask.eliminate_zeros() 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | def updateSimilarityMatrix(self): 71 | 72 | if self.topK != False: 73 | if self.sparse_weights: 74 | self.W_sparse = similarityMatrixTopK(self.S.T, k=self.topK, forceSparseOutput=True) 75 | else: 76 | self.W = similarityMatrixTopK(self.S.T, k=self.topK, forceSparseOutput=False) 77 | 78 | else: 79 | if self.sparse_weights: 80 | self.W_sparse = sps.csr_matrix(self.S.T) 81 | else: 82 | self.W = self.S.T 83 | 84 | 85 | 86 | def updateWeightsLoop(self, u, i, j): 87 | """ 88 | Define the update rules to be used in the train phase and compile the train function 89 | :return: 90 | """ 91 | 92 | x_ui = self.S[i] 93 | x_uj = self.S[j] 94 | 95 | # The difference is computed on the whole row not only on the user_seen items 96 | # The performance seems to be higher this way 97 | x_uij = x_ui - x_uj 98 | 99 | # Sigmoid whose argument is minus in order for the exponent of the exponential to be positive 100 | sigmoid = expit(-x_uij) 101 | 102 | delta_i = sigmoid-self.lambda_i*self.S[i] 103 | delta_j = -sigmoid-self.lambda_j*self.S[j] 104 | 105 | # Since a shared variable may be the target of only one update rule 106 | # All the required updates are chained inside a subtensor 107 | for sampleIndex in range(self.batch_size): 108 | 109 | user_id = u[sampleIndex] 110 | 111 | for item_id in self.userSeenItems[user_id]: 112 | # Do not update items i 113 | if item_id != i[sampleIndex]: 114 | self.S[i] += self.learning_rate * delta_i 115 | 116 | # Do not update j 117 | if item_id != j[sampleIndex]: 118 | self.S[j] += self.learning_rate * delta_j 119 | 120 | 121 | def updateWeightsBatch(self, u, i, j): 122 | """ 123 | Define the update rules to be used in the train phase and compile the train function 124 | :return: 125 | """ 126 | 127 | if self.batch_size==1: 128 | seenItems = self.userSeenItems[u[0]] 129 | 130 | x_ui = self.S[i, seenItems] 131 | x_uj = self.S[j, seenItems] 132 | 133 | # The difference is computed on the user_seen items 134 | x_uij = x_ui - x_uj 135 | 136 | #x_uij = x_uij[0,seenItems] 137 | x_uij = np.sum(x_uij) 138 | 139 | # log(sigm(+x_uij)) 140 | gradient = 1 / (1 + np.exp(x_uij)) 141 | 142 | # sigm(-x_uij) 143 | #exp = np.exp(x_uij) 144 | #gradient = exp/np.power(exp+1, 2) 145 | 146 | else: 147 | 148 | x_ui = self.S[i] 149 | x_uj = self.S[j] 150 | 151 | # The difference is computed on the user_seen items 152 | x_uij = x_ui - x_uj 153 | 154 | x_uij = self.URM_mask[u,:].dot(x_uij.T).diagonal() 155 | 156 | gradient = np.sum(1 / (1 + np.exp(x_uij))) / self.batch_size 157 | 158 | # Sigmoid whose argument is minus in order for the exponent of the exponential to be positive 159 | # Best performance with: gradient = np.sum(expit(-x_uij)) / self.batch_size 160 | #gradient = np.sum(x_uij) / self.batch_size 161 | #gradient = expit(-gradient) 162 | #gradient = np.sum(expit(-x_uij)) / self.batch_size 163 | #gradient = np.sum(np.log(expit(x_uij))) / self.batch_size 164 | #gradient = np.sum(1/(1+np.exp(x_uij))) / self.batch_size 165 | #gradient = min(10, max(-10, gradient))+10 166 | 167 | 
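        # For a single sample, `gradient` computed above equals sigmoid(-x_uij) for the sampled
        # triple (u, i, j); with a larger batch it is the average over the sampled triples.
        # The BPR update below increases S[i] and decreases S[j] (by learning_rate * gradient)
        # only on the items seen by the sampled user(s), and zeroes the diagonal entries
        # S[i, i] and S[j, j] so that an item is never used to recommend itself.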
168 | if self.batch_size==1: 169 | 170 | userSeenItems = self.userSeenItems[u[0]] 171 | 172 | self.S[i, userSeenItems] += self.learning_rate * gradient 173 | self.S[i, i] = 0 174 | 175 | self.S[j, userSeenItems] -= self.learning_rate * gradient 176 | self.S[j, j] = 0 177 | 178 | 179 | 180 | else: 181 | itemsToUpdate = np.array(self.URM_mask[u, :].sum(axis=0) > 0).ravel() 182 | 183 | # Do not update items i, set all user-posItem to false 184 | # itemsToUpdate[i] = False 185 | 186 | self.S[i] += self.learning_rate * gradient * itemsToUpdate 187 | self.S[i, i] = 0 188 | 189 | # Now update i, setting all user-posItem to true 190 | # Do not update j 191 | 192 | # itemsToUpdate[i] = True 193 | # itemsToUpdate[j] = False 194 | 195 | self.S[j] -= self.learning_rate * gradient * itemsToUpdate 196 | self.S[j, j] = 0 197 | 198 | def fit(self, epochs=30, logFile=None, URM_test=None, filterTopPop = False, minRatingsPerUser=1, 199 | batch_size = 1000, validate_every_N_epochs = 1, start_validation_after_N_epochs = 0, 200 | lambda_i = 0.0025, lambda_j = 0.00025, learning_rate = 0.05, topK = False): 201 | 202 | 203 | 204 | if self.sparse_weights: 205 | self.S = sps.csr_matrix((self.n_items, self.n_items), dtype=np.float32) 206 | else: 207 | self.S = np.zeros((self.n_items, self.n_items)).astype('float32') 208 | 209 | 210 | self.initializeFastSampling(positive_threshold=self.positive_threshold) 211 | 212 | 213 | self.fit_alreadyInitialized(epochs=epochs, 214 | logFile=logFile, 215 | URM_test=URM_test, 216 | filterTopPop = filterTopPop, 217 | minRatingsPerUser=minRatingsPerUser, 218 | batch_size = batch_size, 219 | validate_every_N_epochs = validate_every_N_epochs, 220 | start_validation_after_N_epochs = start_validation_after_N_epochs, 221 | lambda_i = lambda_i, 222 | lambda_j = lambda_j, 223 | learning_rate = learning_rate, 224 | topK = topK) 225 | 226 | 227 | 228 | def fit_alreadyInitialized(self, epochs=30, logFile=None, URM_test=None, filterTopPop = False, minRatingsPerUser=1, 229 | batch_size = 1000, validate_every_N_epochs = 1, start_validation_after_N_epochs = 0, 230 | lambda_i = 0.0025, lambda_j = 0.00025, learning_rate = 0.05, topK = False): 231 | """ 232 | Fits the model performing a round of testing at the end of each epoch 233 | :param epochs: 234 | :param batch_size: 235 | :param logFile: 236 | :param URM_test: 237 | :return: 238 | """ 239 | 240 | 241 | if(topK != False and topK<1): 242 | raise ValueError("TopK not valid. Acceptable values are either False or a positive integer value. 
Provided value was '{}'".format(topK)) 243 | self.topK = topK 244 | 245 | 246 | self.batch_size = batch_size 247 | self.lambda_i = lambda_i 248 | self.lambda_j = lambda_j 249 | self.learning_rate = learning_rate 250 | 251 | 252 | start_time_train = time.time() 253 | 254 | for currentEpoch in range(epochs): 255 | 256 | start_time_epoch = time.time() 257 | 258 | if self.batch_size>0: 259 | self.epochIteration() 260 | else: 261 | print("No batch not available") 262 | 263 | 264 | if (URM_test is not None) and ((currentEpoch +1 )% validate_every_N_epochs == 0) and \ 265 | currentEpoch >= start_validation_after_N_epochs: 266 | 267 | print("Evaluation begins") 268 | 269 | self.updateSimilarityMatrix() 270 | 271 | results_run = self.evaluateRecommendations(URM_test, filterTopPop=filterTopPop, 272 | minRatingsPerUser=minRatingsPerUser) 273 | 274 | self.writeCurrentConfig(currentEpoch, results_run, logFile) 275 | 276 | print("Epoch {} of {} complete in {:.2f} minutes".format(currentEpoch+1, epochs, 277 | float(time.time() - start_time_epoch) / 60)) 278 | 279 | 280 | # Fit with no validation 281 | else: 282 | print("Epoch {} of {} complete in {:.2f} minutes".format(currentEpoch+1, epochs, 283 | float(time.time() - start_time_epoch) / 60)) 284 | 285 | self.updateSimilarityMatrix() 286 | 287 | print("Fit completed in {:.2f} minutes".format(float(time.time() - start_time_train) / 60)) 288 | 289 | sys.stdout.flush() 290 | 291 | 292 | 293 | def writeCurrentConfig(self, currentEpoch, results_run, logFile): 294 | 295 | current_config = {'lambda_i': self.lambda_i, 296 | 'lambda_j': self.lambda_j, 297 | 'batch_size': self.batch_size, 298 | 'learn_rate': self.learning_rate, 299 | 'topK_similarity': self.topK, 300 | 'epoch': currentEpoch} 301 | 302 | print("Test case: {}\nResults {}\n".format(current_config, results_run)) 303 | # print("Weights: {}\n".format(str(list(self.weights)))) 304 | 305 | sys.stdout.flush() 306 | 307 | if (logFile != None): 308 | logFile.write("Test case: {}, Results {}\n".format(current_config, results_run)) 309 | # logFile.write("Weights: {}\n".format(str(list(self.weights)))) 310 | logFile.flush() 311 | 312 | 313 | 314 | def epochIteration(self): 315 | 316 | # Get number of available interactions 317 | numPositiveIteractions = int(self.URM_mask.nnz*1) 318 | 319 | start_time_epoch = time.time() 320 | start_time_batch = time.time() 321 | 322 | totalNumberOfBatch = int(numPositiveIteractions/self.batch_size)+1 323 | 324 | # Uniform user sampling without replacement 325 | for numCurrentBatch in range(totalNumberOfBatch): 326 | 327 | sgd_users, sgd_pos_items, sgd_neg_items = self.sampleBatch() 328 | 329 | self.updateWeightsBatch( 330 | sgd_users, 331 | sgd_pos_items, 332 | sgd_neg_items 333 | ) 334 | 335 | """ 336 | self.updateWeightsLoop( 337 | sgd_users, 338 | sgd_pos_items, 339 | sgd_neg_items 340 | ) 341 | """ 342 | 343 | if(time.time() - start_time_batch >= 30 or numCurrentBatch==totalNumberOfBatch-1): 344 | print("Processed {} ( {:.2f}% ) in {:.2f} seconds. 
Sample per second: {:.0f}".format( 345 | numCurrentBatch*self.batch_size, 346 | 100.0* float(numCurrentBatch*self.batch_size)/numPositiveIteractions, 347 | time.time() - start_time_batch, 348 | float(numCurrentBatch*self.batch_size + 1) / (time.time() - start_time_epoch))) 349 | 350 | sys.stdout.flush() 351 | sys.stderr.flush() 352 | 353 | start_time_batch = time.time() 354 | 355 | 356 | 357 | self.S[np.arange(0, self.n_items), np.arange(0, self.n_items)] = 0.0 358 | 359 | 360 | 361 | -------------------------------------------------------------------------------- /SLIM_RMSE/SLIM_RMSE.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | @author: Massimo Quadrana 5 | """ 6 | 7 | 8 | import numpy as np 9 | import scipy.sparse as sps 10 | from Base.Recommender import Recommender 11 | from Base.Recommender_utils import check_matrix 12 | from sklearn.linear_model import ElasticNet 13 | 14 | from Base.Similarity_Matrix_Recommender import Similarity_Matrix_Recommender 15 | import time, sys 16 | 17 | class SLIM_RMSE(Recommender, Similarity_Matrix_Recommender): 18 | """ 19 | Train a Sparse Linear Methods (SLIM) item similarity model. 20 | NOTE: ElasticNet solver is parallel, a single intance of SLIM_RMSE will 21 | make use of half the cores available 22 | 23 | See: 24 | Efficient Top-N Recommendation by Linear Regression, 25 | M. Levy and K. Jack, LSRS workshop at RecSys 2013. 26 | 27 | SLIM: Sparse linear methods for top-n recommender systems, 28 | X. Ning and G. Karypis, ICDM 2011. 29 | http://glaros.dtc.umn.edu/gkhome/fetch/papers/SLIM2011icdm.pdf 30 | """ 31 | 32 | def __init__(self, URM_train): 33 | 34 | super(SLIM_RMSE, self).__init__() 35 | 36 | self.URM_train = URM_train 37 | 38 | 39 | def __str__(self): 40 | return "SLIM (l1_penalty={},l2_penalty={},positive_only={})".format( 41 | self.l1_penalty, self.l2_penalty, self.positive_only 42 | ) 43 | 44 | def fit(self, l1_penalty=0.1, l2_penalty=0.1, positive_only=True, topK = 100): 45 | 46 | self.l1_penalty = l1_penalty 47 | self.l2_penalty = l2_penalty 48 | self.positive_only = positive_only 49 | self.l1_ratio = self.l1_penalty / (self.l1_penalty + self.l2_penalty) 50 | self.topK = topK 51 | 52 | X = check_matrix(self.URM_train, 'csc', dtype=np.float32) 53 | 54 | n_items = X.shape[1] 55 | 56 | # initialize the ElasticNet model 57 | self.model = ElasticNet(alpha=1.0, 58 | l1_ratio=self.l1_ratio, 59 | positive=self.positive_only, 60 | fit_intercept=False, 61 | copy_X=False, 62 | precompute=True, 63 | selection='random', 64 | max_iter=100, 65 | tol=1e-4) 66 | 67 | # we'll store the W matrix into a sparse csr_matrix 68 | # let's initialize the vectors used by the sparse.csc_matrix constructor 69 | values, rows, cols = [], [], [] 70 | start_time = time.time() 71 | start_time_printBatch = start_time 72 | 73 | # fit each item's factors sequentially (not in parallel) 74 | for currentItem in range(n_items): 75 | # get the target column 76 | y = X[:, currentItem].toarray() 77 | # set the j-th column of X to zero 78 | startptr = X.indptr[currentItem] 79 | endptr = X.indptr[currentItem + 1] 80 | bak = X.data[startptr: endptr].copy() 81 | X.data[startptr: endptr] = 0.0 82 | # fit one ElasticNet model per column 83 | self.model.fit(X, y) 84 | 85 | # self.model.coef_ contains the coefficient of the ElasticNet model 86 | # let's keep only the non-zero values 87 | #nnz_idx = self.model.coef_ > 0.0 88 | 89 | # Select topK values 90 | # Sorting is done in three 
steps. Faster than plain np.argsort for a large number of items 91 | # - Partition the data to extract the set of relevant items 92 | # - Sort only the relevant items 93 | # - Get the original item index 94 | relevant_items_partition = (-self.model.coef_).argpartition(self.topK)[0:self.topK] 95 | relevant_items_partition_sorting = np.argsort(-self.model.coef_[relevant_items_partition]) 96 | ranking = relevant_items_partition[relevant_items_partition_sorting] 97 | 98 | notZerosMask = self.model.coef_[ranking] > 0.0 99 | ranking = ranking[notZerosMask] 100 | 101 | values.extend(self.model.coef_[ranking]) 102 | rows.extend(ranking) 103 | cols.extend([currentItem]*len(ranking)) 104 | 105 | # finally, replace the original values of the j-th column 106 | X.data[startptr:endptr] = bak 107 | 108 | 109 | if time.time() - start_time_printBatch > 300: 110 | print("Processed {} ( {:.2f}% ) in {:.2f} minutes. Columns per second: {:.0f}".format( 111 | currentItem, 112 | 100.0* float(currentItem)/n_items, 113 | (time.time()-start_time)/60, 114 | float(currentItem)/(time.time()-start_time))) 115 | sys.stdout.flush() 116 | sys.stderr.flush() 117 | 118 | start_time_printBatch = time.time() 119 | 120 | 121 | # generate the sparse weight matrix 122 | self.W_sparse = sps.csc_matrix((values, (rows, cols)), shape=(n_items, n_items), dtype=np.float32) 123 | 124 | 125 | 126 | 127 | import multiprocessing 128 | from multiprocessing import Pool 129 | from functools import partial 130 | 131 | 132 | class MultiThreadSLIM_RMSE(SLIM_RMSE, Similarity_Matrix_Recommender): 133 | 134 | def __init__(self, URM_train): 135 | 136 | super(MultiThreadSLIM_RMSE, self).__init__(URM_train) 137 | 138 | def __str__(self): 139 | return "SLIM_mt (l1_penalty={},l2_penalty={},positive_only={},workers={})".format( 140 | self.l1_penalty, self.l2_penalty, self.positive_only, self.workers 141 | ) 142 | 143 | def _partial_fit(self, currentItem, X, topK): 144 | model = ElasticNet(alpha=1.0, 145 | l1_ratio=self.l1_ratio, 146 | positive=self.positive_only, 147 | fit_intercept=False, 148 | copy_X=False, 149 | precompute=True, 150 | selection='random', 151 | max_iter=100, 152 | tol=1e-4) 153 | 154 | # WARNING: make a copy of X to avoid race conditions on column j 155 | # TODO: We can probably come up with something better here. 
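# (A possible, untested refinement of the TODO above: instead of copying the
#  whole matrix on every call, each worker could reuse its own copy of X and,
#  like the sequential SLIM_RMSE.fit, zero out only the slice
#  X.indptr[currentItem]:X.indptr[currentItem + 1] before fitting and restore it
#  afterwards, avoiding one full-matrix copy per column.)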
156 | X_j = X.copy() 157 | # get the target column 158 | y = X_j[:, currentItem].toarray() 159 | # set the j-th column of X to zero 160 | X_j.data[X_j.indptr[currentItem]:X_j.indptr[currentItem + 1]] = 0.0 161 | # fit one ElasticNet model per column 162 | model.fit(X_j, y) 163 | # self.model.coef_ contains the coefficient of the ElasticNet model 164 | # let's keep only the non-zero values 165 | #nnz_idx = model.coef_ > 0.0 166 | 167 | relevant_items_partition = (-model.coef_).argpartition(topK)[0:topK] 168 | relevant_items_partition_sorting = np.argsort(-model.coef_[relevant_items_partition]) 169 | ranking = relevant_items_partition[relevant_items_partition_sorting] 170 | 171 | notZerosMask = model.coef_[ranking] > 0.0 172 | ranking = ranking[notZerosMask] 173 | 174 | values = model.coef_[ranking] 175 | rows = ranking 176 | cols = [currentItem] * len(ranking) 177 | 178 | return values, rows, cols 179 | 180 | def fit(self,l1_penalty=0.1, 181 | l2_penalty=0.1, 182 | positive_only=True, 183 | topK = 100, 184 | workers=multiprocessing.cpu_count()): 185 | 186 | 187 | self.l1_penalty = l1_penalty 188 | self.l2_penalty = l2_penalty 189 | self.positive_only = positive_only 190 | self.l1_ratio = self.l1_penalty / (self.l1_penalty + self.l2_penalty) 191 | self.topK = topK 192 | 193 | self.workers = workers 194 | 195 | 196 | 197 | 198 | self.URM_train = check_matrix(self.URM_train, 'csc', dtype=np.float32) 199 | n_items = self.URM_train.shape[1] 200 | # fit each item's factors in parallel 201 | 202 | # function object with part of the input (X and topK) pre-bound 203 | _pfit = partial(self._partial_fit, X=self.URM_train, topK=self.topK) 204 | 205 | # create a pool with the requested number of worker processes 206 | pool = Pool(processes=self.workers) 207 | 208 | # start the pool, passing the function (with its fixed inputs) 209 | # and the remaining, varying parameter (the item index) 210 | res = pool.map(_pfit, np.arange(n_items)) 211 | 212 | # res contains a vector of (values, rows, cols) tuples 213 | values, rows, cols = [], [], [] 214 | for values_, rows_, cols_ in res: 215 | values.extend(values_) 216 | rows.extend(rows_) 217 | cols.extend(cols_) 218 | 219 | # generate the sparse weight matrix 220 | self.W_sparse = sps.csc_matrix((values, (rows, cols)), shape=(n_items, n_items), dtype=np.float32) 221 | 222 | -------------------------------------------------------------------------------- /all_algorithms.py: -------------------------------------------------------------------------------- 1 | 2 | from SLIM_BPR.Cython.SLIM_BPR_Cython import SLIM_BPR_Cython 3 | from SLIM_RMSE.SLIM_RMSE import SLIM_RMSE 4 | 5 | from MatrixFactorization.Cython.MF_BPR_Cython import MF_BPR_Cython 6 | from MatrixFactorization.MatrixFactorization_RMSE import FunkSVD 7 | 8 | from KNN.user_knn_CF import UserKNNCFRecommender 9 | from KNN.item_knn_CF import ItemKNNCFRecommender 10 | from KNN.item_knn_CBF import ItemKNNCBFRecommender 11 | 12 | from data.Movielens10MReader import Movielens10MReader 13 | 14 | 15 | if __name__ == '__main__': 16 | 17 | 18 | dataReader = Movielens10MReader() 19 | 20 | URM_train = dataReader.get_URM_train() 21 | URM_validation = dataReader.get_URM_validation() 22 | URM_test = dataReader.get_URM_test() 23 | 24 | recommender_list = [] 25 | recommender_list.append(ItemKNNCFRecommender(URM_train)) 26 | recommender_list.append(UserKNNCFRecommender(URM_train)) 27 | recommender_list.append(MF_BPR_Cython(URM_train)) 28 | recommender_list.append(FunkSVD(URM_train)) 29 | recommender_list.append(SLIM_BPR_Cython(URM_train, 
sparse_weights=False)) 30 | recommender_list.append(SLIM_RMSE(URM_train)) 31 | 32 | 33 | 34 | for recommender in recommender_list: 35 | 36 | print("Algorithm: {}".format(recommender.__class__)) 37 | 38 | recommender.fit() 39 | 40 | results_run = recommender.evaluateRecommendations(URM_test, at=5, exclude_seen=True) 41 | print("Algorithm: {}, results: {}".format(recommender.__class__, results_run)) 42 | 43 | -------------------------------------------------------------------------------- /data/Movielens10MReader.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on 14/09/17 5 | 6 | @author: Maurizio Ferrari Dacrema 7 | """ 8 | 9 | 10 | import numpy as np 11 | import scipy.sparse as sps 12 | import zipfile 13 | 14 | from Base.Recommender_utils import removeZeroRatingRowAndCol 15 | 16 | 17 | def loadCSVintoSparse (filePath, header = False, separator="::"): 18 | 19 | values, rows, cols = [], [], [] 20 | 21 | fileHandle = open(filePath, "r") 22 | numCells = 0 23 | 24 | if header: 25 | fileHandle.readline() 26 | 27 | for line in fileHandle: 28 | numCells += 1 29 | if (numCells % 1000000 == 0): 30 | print("Processed {} cells".format(numCells)) 31 | 32 | if (len(line)) > 1: 33 | line = line.split(separator) 34 | 35 | line[-1] = line[-1].replace("\n", "") 36 | 37 | if not line[2] == "0" and not line[2] == "NaN": 38 | rows.append(int(line[0])) 39 | cols.append(int(line[1])) 40 | values.append(float(line[2])) 41 | 42 | fileHandle.close() 43 | 44 | return sps.csr_matrix((values, (rows, cols)), dtype=np.float32) 45 | 46 | 47 | 48 | def saveSparseIntoCSV (filePath, sparse_matrix, separator=","): 49 | 50 | sparse_matrix = sparse_matrix.tocoo() 51 | 52 | fileHandle = open(filePath, "w") 53 | 54 | for index in range(len(sparse_matrix.data)): 55 | fileHandle.write("{row}{separator}{col}{separator}{value}\n".format( 56 | row = sparse_matrix.row[index], col = sparse_matrix.col[index], value = sparse_matrix.data[index], 57 | separator = separator)) 58 | 59 | 60 | 61 | 62 | class Movielens10MReader(object): 63 | 64 | def __init__(self, splitTrainTest = False, splitTrainTestValidation =[0.6, 0.2, 0.2] , loadPredefinedTrainTest = True): 65 | 66 | super(Movielens10MReader, self).__init__() 67 | 68 | if sum(splitTrainTestValidation) != 1.0 or len(splitTrainTestValidation) != 3: 69 | raise ValueError("Movielens10MReader: splitTrainTestValidation must be a probability distribution over Train, Test and Validation") 70 | 71 | print("Movielens10MReader: loading data...") 72 | 73 | dataSubfolder = "./data/" 74 | 75 | dataFile = zipfile.ZipFile(dataSubfolder + "movielens_10m.zip") 76 | URM_path = dataFile.extract("ml-10M100K/ratings.dat", path=dataSubfolder) 77 | 78 | 79 | if not loadPredefinedTrainTest: 80 | self.URM_all = loadCSVintoSparse(URM_path, separator="::") 81 | self.URM_all = removeZeroRatingRowAndCol(self.URM_all) 82 | 83 | else: 84 | 85 | try: 86 | self.URM_train = sps.load_npz(dataSubfolder + "URM_train.npz") 87 | self.URM_test = sps.load_npz(dataSubfolder + "URM_test.npz") 88 | self.URM_validation = sps.load_npz(dataSubfolder + "URM_validation.npz") 89 | 90 | return 91 | 92 | except FileNotFoundError: 93 | # Rebuild split 94 | print("Movielens10MReader: URM_train or URM_test or URM_validation not found. 
Building new ones") 95 | 96 | splitTrainTest = True 97 | self.URM_all = loadCSVintoSparse(URM_path) 98 | self.URM_all = removeZeroRatingRowAndCol(self.URM_all) 99 | 100 | 101 | 102 | if splitTrainTest: 103 | 104 | self.URM_all = self.URM_all.tocoo() 105 | 106 | numInteractions= len(self.URM_all.data) 107 | 108 | split = np.random.choice([1, 2, 3], numInteractions, p=splitTrainTestValidation) 109 | 110 | 111 | trainMask = split == 1 112 | self.URM_train = sps.coo_matrix((self.URM_all.data[trainMask], (self.URM_all.row[trainMask], self.URM_all.col[trainMask]))) 113 | self.URM_train = self.URM_train.tocsr() 114 | 115 | testMask = split == 2 116 | 117 | self.URM_test = sps.coo_matrix((self.URM_all.data[testMask], (self.URM_all.row[testMask], self.URM_all.col[testMask]))) 118 | self.URM_test = self.URM_test.tocsr() 119 | 120 | validationMask = split == 3 121 | 122 | self.URM_validation = sps.coo_matrix((self.URM_all.data[validationMask], (self.URM_all.row[validationMask], self.URM_all.col[validationMask]))) 123 | self.URM_validation = self.URM_validation.tocsr() 124 | 125 | del self.URM_all 126 | 127 | print("Movielens10MReader: saving URM_train and URM_test") 128 | sps.save_npz(dataSubfolder + "URM_train.npz", self.URM_train) 129 | sps.save_npz(dataSubfolder + "URM_test.npz", self.URM_test) 130 | sps.save_npz(dataSubfolder + "URM_validation.npz", self.URM_validation) 131 | 132 | print("Movielens10MReader: loading complete") 133 | 134 | 135 | 136 | 137 | def get_URM_train(self): 138 | return self.URM_train 139 | 140 | def get_URM_test(self): 141 | return self.URM_test 142 | 143 | def get_URM_validation(self): 144 | return self.URM_validation 145 | -------------------------------------------------------------------------------- /data/URM_test.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MaurizioFD/RecSys_Course_2017/dfef3655df8ce32428458aafabfc94ed688231f6/data/URM_test.npz -------------------------------------------------------------------------------- /data/URM_train.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MaurizioFD/RecSys_Course_2017/dfef3655df8ce32428458aafabfc94ed688231f6/data/URM_train.npz -------------------------------------------------------------------------------- /data/URM_validation.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MaurizioFD/RecSys_Course_2017/dfef3655df8ce32428458aafabfc94ed688231f6/data/URM_validation.npz -------------------------------------------------------------------------------- /data/movielens_10m.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MaurizioFD/RecSys_Course_2017/dfef3655df8ce32428458aafabfc94ed688231f6/data/movielens_10m.zip -------------------------------------------------------------------------------- /run_SLIM_BPR.py: -------------------------------------------------------------------------------- 1 | 2 | from SLIM_BPR.Cython.SLIM_BPR_Cython import SLIM_BPR_Cython 3 | from MatrixFactorization.Cython.MF_BPR_Cython import MF_BPR_Cython 4 | from data.Movielens10MReader import Movielens10MReader 5 | 6 | 7 | def run_SLIM(): 8 | 9 | dataReader = Movielens10MReader() 10 | 11 | URM_train = dataReader.get_URM_train() 12 | URM_test = dataReader.get_URM_test() 13 | 14 | recommender = SLIM_BPR_Cython(URM_train, recompile_cython=False, positive_threshold=4, 
sparse_weights=True) 15 | #recommender = MF_BPR_Cython(URM_train, recompile_cython=False, positive_threshold=4) 16 | 17 | logFile = open("Result_log.txt", "a") 18 | 19 | 20 | recommender.fit(epochs=2, validate_every_N_epochs=1, URM_test=URM_test, 21 | logFile=logFile, batch_size=1, sgd_mode='rmsprop', learning_rate=1e-4) 22 | 23 | 24 | results_run = recommender.evaluateRecommendations(URM_test, at=5) 25 | print(results_run) 26 | 27 | 28 | run_SLIM() -------------------------------------------------------------------------------- /slides/20161219_BPR.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MaurizioFD/RecSys_Course_2017/dfef3655df8ce32428458aafabfc94ed688231f6/slides/20161219_BPR.pptx -------------------------------------------------------------------------------- /slides/2017_MF.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MaurizioFD/RecSys_Course_2017/dfef3655df8ce32428458aafabfc94ed688231f6/slides/2017_MF.pdf -------------------------------------------------------------------------------- /slides/2017_MF.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MaurizioFD/RecSys_Course_2017/dfef3655df8ce32428458aafabfc94ed688231f6/slides/2017_MF.pptx -------------------------------------------------------------------------------- /slides/Amazon AWS.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MaurizioFD/RecSys_Course_2017/dfef3655df8ce32428458aafabfc94ed688231f6/slides/Amazon AWS.pdf -------------------------------------------------------------------------------- /slides/FM.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MaurizioFD/RecSys_Course_2017/dfef3655df8ce32428458aafabfc94ed688231f6/slides/FM.pdf -------------------------------------------------------------------------------- /slides/FunkSVD - 2006.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MaurizioFD/RecSys_Course_2017/dfef3655df8ce32428458aafabfc94ed688231f6/slides/FunkSVD - 2006.pdf -------------------------------------------------------------------------------- /slides/hu_koren_volinsky.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MaurizioFD/RecSys_Course_2017/dfef3655df8ce32428458aafabfc94ed688231f6/slides/hu_koren_volinsky.pdf -------------------------------------------------------------------------------- /slides/koren_sdv++.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MaurizioFD/RecSys_Course_2017/dfef3655df8ce32428458aafabfc94ed688231f6/slides/koren_sdv++.pdf -------------------------------------------------------------------------------- /slides/rendle_bpr.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MaurizioFD/RecSys_Course_2017/dfef3655df8ce32428458aafabfc94ed688231f6/slides/rendle_bpr.pdf --------------------------------------------------------------------------------
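
The top-K selection used in SLIM_RMSE.fit and MultiThreadSLIM_RMSE._partial_fit compresses the three steps its comment describes (partition, sort only the selected entries, map back to the original item indices) into three dense lines. The snippet below is a minimal, self-contained sketch of the same selection on a random vector; the names coef and topK and the array size are illustrative stand-ins rather than values taken from the repository.

import numpy as np

np.random.seed(1234)
coef = np.random.randn(1000)   # stands in for model.coef_, one column of item-item weights
topK = 100

# Step 1: partition so that the indices of the topK largest coefficients
# occupy the first topK positions, without fully sorting the array
relevant_items_partition = (-coef).argpartition(topK)[0:topK]

# Step 2: sort only those topK candidates by descending coefficient
relevant_items_partition_sorting = np.argsort(-coef[relevant_items_partition])

# Step 3: map back to the original item indices
ranking = relevant_items_partition[relevant_items_partition_sorting]

# as in the repository code, keep only the strictly positive weights
ranking = ranking[coef[ranking] > 0.0]

# sanity check: the result matches a full argsort restricted to its positive entries
full_ranking = np.argsort(-coef)[0:topK]
full_ranking = full_ranking[coef[full_ranking] > 0.0]
assert np.array_equal(ranking, full_ranking)

The payoff is asymptotic: np.argpartition runs in roughly linear time, while a full np.argsort is O(n log n), so the three-step version helps when the number of items is large and topK is comparatively small.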