├── setup.cfg
├── requirements.txt
├── matrix_factorization
│   ├── __init__.py
│   ├── utils.py
│   ├── recommender_base.py
│   ├── kernels.py
│   ├── baseline_model.py
│   └── kernel_matrix_factorization.py
├── LICENSE
├── setup.py
├── examples
│   ├── example.py
│   └── recommender-system.ipynb
├── README.md
└── .gitignore
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | description-file = README.md
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numba>=0.49.1
2 | numpy>=1.18.5
3 | pandas>=1.0.4
4 | scikit-learn>=0.23.1
5 | scipy>=1.4.1
--------------------------------------------------------------------------------
/matrix_factorization/__init__.py:
--------------------------------------------------------------------------------
1 | from .baseline_model import BaselineModel
2 | from .kernel_matrix_factorization import KernelMF
3 | from .recommender_base import RecommenderBase
4 | from .utils import train_update_test_split
5 |
6 | __all__ = [
7 | "BaselineModel",
8 | "KernelMF",
9 | "RecommenderBase",
10 | "train_update_test_split",
11 | ]
12 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2020 Quang-Vinh Do
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import os
2 | import setuptools
3 |
4 |
5 | readme_path = os.path.join(os.path.dirname(__file__), "README.md")
6 | with open(readme_path, encoding="utf-8") as f:
7 | long_description = f.read()
8 |
9 |
10 | setuptools.setup(
11 | name="matrix_factorization",
12 | version="1.3",
13 | author="Quang-Vinh Do",
14 | author_email="qdo086@uottawa.ca",
15 | description="Library for matrix factorization for recommender systems using collaborative filtering",
16 | long_description=long_description,
17 | long_description_content_type="text/markdown",
18 | url="https://github.com/Quang-Vinh/matrix-factorization",
19 | download_url="https://github.com/Quang-Vinh/matrix-factorization/archive/v1.3.tar.gz",
20 | license="MIT",
21 | packages=setuptools.find_packages(),
22 | classifiers=[
23 | "Programming Language :: Python :: 3",
24 | "Operating System :: OS Independent",
25 | "License :: OSI Approved :: MIT License",
26 | ],
27 | python_requires=">=3.6",
28 | install_requires=[
29 | "numba>=0.49.1",
30 | "numpy>=1.18.5",
31 | "pandas>=1.0.4",
32 | "scikit-learn>=0.23.1",
33 | "scipy>=1.4.1",
34 | ],
35 | )
36 |
--------------------------------------------------------------------------------
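
For development against a source checkout, the standard setuptools flow should apply; an editable install, for example:

```
git clone https://github.com/Quang-Vinh/matrix-factorization.git
cd matrix-factorization
pip install -e .
```
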
/examples/example.py:
--------------------------------------------------------------------------------
1 | from matrix_factorization import BaselineModel, KernelMF, train_update_test_split
2 |
3 | import pandas as pd
4 | from sklearn.metrics import mean_squared_error
5 |
6 | # Movie data found here https://grouplens.org/datasets/movielens/
7 | cols = ["user_id", "item_id", "rating", "timestamp"]
8 | movie_data = pd.read_csv(
9 | "../data/ml-100k/u.data", names=cols, sep="\t", usecols=[0, 1, 2], engine="python"
10 | )
11 |
12 | X = movie_data[["user_id", "item_id"]]
13 | y = movie_data["rating"]
14 |
15 | # Prepare data for online learning
16 | (
17 | X_train_initial,
18 | y_train_initial,
19 | X_train_update,
20 | y_train_update,
21 | X_test_update,
22 | y_test_update,
23 | ) = train_update_test_split(movie_data, frac_new_users=0.2)
24 |
25 | # Initial training
26 | matrix_fact = KernelMF(n_epochs=20, n_factors=100, verbose=1, lr=0.001, reg=0.005)
27 | matrix_fact.fit(X_train_initial, y_train_initial)
28 |
29 | # Update model with new users
30 | matrix_fact.update_users(
31 | X_train_update, y_train_update, lr=0.001, n_epochs=20, verbose=1
32 | )
33 | pred = matrix_fact.predict(X_test_update)
34 | rmse = mean_squared_error(y_test_update, pred, squared=False)
35 | print(f"\nTest RMSE: {rmse:.4f}")
36 |
37 | # Get recommendations
38 | user = 200
39 | items_known = X_train_initial.query("user_id == @user")["item_id"]
40 | matrix_fact.recommend(user=user, items_known=items_known)
41 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Matrix Factorization
2 | A short and simple implementation of kernel matrix factorization with online updating for collaborative-filtering recommender systems, built on top of scikit-learn.
3 |
4 | ## Prerequisites
5 | - Python 3
6 | - numba
7 | - numpy
8 | - pandas
9 | - scikit-learn
10 | - scipy
11 |
12 | ## Installation
13 | ```
14 | pip install matrix_factorization
15 | ```
16 |
17 | ## Usage
18 | ```python
19 | from matrix_factorization import BaselineModel, KernelMF, train_update_test_split
20 |
21 | import pandas as pd
22 | from sklearn.metrics import mean_squared_error
23 |
24 | # Movie data found here https://grouplens.org/datasets/movielens/
25 | cols = ["user_id", "item_id", "rating", "timestamp"]
26 | movie_data = pd.read_csv(
27 | "../data/ml-100k/u.data", names=cols, sep="\t", usecols=[0, 1, 2], engine="python"
28 | )
29 |
30 | X = movie_data[["user_id", "item_id"]]
31 | y = movie_data["rating"]
32 |
33 | # Prepare data for online learning
34 | (
35 | X_train_initial,
36 | y_train_initial,
37 | X_train_update,
38 | y_train_update,
39 | X_test_update,
40 | y_test_update,
41 | ) = train_update_test_split(movie_data, frac_new_users=0.2)
42 |
43 | # Initial training
44 | matrix_fact = KernelMF(n_epochs=20, n_factors=100, verbose=1, lr=0.001, reg=0.005)
45 | matrix_fact.fit(X_train_initial, y_train_initial)
46 |
47 | # Update model with new users
48 | matrix_fact.update_users(
49 | X_train_update, y_train_update, lr=0.001, n_epochs=20, verbose=1
50 | )
51 | pred = matrix_fact.predict(X_test_update)
52 | rmse = mean_squared_error(y_test_update, pred, squared=False)
53 | print(f"\nTest RMSE: {rmse:.4f}")
54 |
55 | # Get recommendations
56 | user = 200
57 | items_known = X_train_initial.query("user_id == @user")["item_id"]
58 | matrix_fact.recommend(user=user, items_known=items_known)
59 | ```
60 |
61 | See examples/recommender-system.ipynb for complete examples.
62 |
63 | ## License
64 | This project is licensed under the MIT License.
65 |
66 |
67 | ## References :book:
68 | - Steffen Rendle and Lars Schmidt-Thieme. "Online-updating regularized kernel matrix factorization models for large-scale recommender systems." RecSys 2008. https://dl.acm.org/doi/10.1145/1454008.1454047
69 |
--------------------------------------------------------------------------------
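
Because `RecommenderBase` subclasses scikit-learn's `BaseEstimator` and `RegressorMixin`, the models should also plug into scikit-learn's model-selection utilities. A minimal sketch of a hyperparameter search, assuming `X` and `y` are the MovieLens frames built in the usage example above (the grid values are illustrative, not tuned):

```python
from sklearn.model_selection import GridSearchCV

from matrix_factorization import KernelMF

# X, y: assumed MovieLens frames from the usage example above
# Hypothetical grid; values are illustrative, not tuned
param_grid = {
    "n_factors": [50, 100],
    "lr": [0.001, 0.01],
    "reg": [0.005, 0.05],
}

# Negated RMSE keeps the search consistent with the RMSE used during training
search = GridSearchCV(
    KernelMF(n_epochs=20, verbose=0),
    param_grid=param_grid,
    scoring="neg_root_mean_squared_error",
    cv=3,
)
search.fit(X, y)
print(search.best_params_, -search.best_score_)
```
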
/.gitignore:
--------------------------------------------------------------------------------
1 | # data
2 | /data
3 |
4 | .gitattributes
5 |
6 | # vs code
7 | .vscode
8 |
9 | # Jupyter notebook
10 | .virtual_documents
11 |
12 | # Byte-compiled / optimized / DLL files
13 | __pycache__/
14 | *.py[cod]
15 | *$py.class
16 |
17 | # C extensions
18 | *.so
19 |
20 | # Distribution / packaging
21 | .Python
22 | build/
23 | develop-eggs/
24 | dist/
25 | downloads/
26 | eggs/
27 | .eggs/
28 | lib/
29 | lib64/
30 | parts/
31 | sdist/
32 | var/
33 | wheels/
34 | pip-wheel-metadata/
35 | share/python-wheels/
36 | *.egg-info/
37 | .installed.cfg
38 | *.egg
39 | MANIFEST
40 |
41 | # PyInstaller
42 | # Usually these files are written by a python script from a template
43 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
44 | *.manifest
45 | *.spec
46 |
47 | # Installer logs
48 | pip-log.txt
49 | pip-delete-this-directory.txt
50 |
51 | # Unit test / coverage reports
52 | htmlcov/
53 | .tox/
54 | .nox/
55 | .coverage
56 | .coverage.*
57 | .cache
58 | nosetests.xml
59 | coverage.xml
60 | *.cover
61 | *.py,cover
62 | .hypothesis/
63 | .pytest_cache/
64 |
65 | # Translations
66 | *.mo
67 | *.pot
68 |
69 | # Django stuff:
70 | *.log
71 | local_settings.py
72 | db.sqlite3
73 | db.sqlite3-journal
74 |
75 | # Flask stuff:
76 | instance/
77 | .webassets-cache
78 |
79 | # Scrapy stuff:
80 | .scrapy
81 |
82 | # Sphinx documentation
83 | docs/_build/
84 |
85 | # PyBuilder
86 | target/
87 |
88 | # Jupyter Notebook
89 | .ipynb_checkpoints
90 |
91 | # IPython
92 | profile_default/
93 | ipython_config.py
94 |
95 | # pyenv
96 | .python-version
97 |
98 | # pipenv
99 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
100 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
101 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
102 | # install all needed dependencies.
103 | #Pipfile.lock
104 |
105 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
106 | __pypackages__/
107 |
108 | # Celery stuff
109 | celerybeat-schedule
110 | celerybeat.pid
111 |
112 | # SageMath parsed files
113 | *.sage.py
114 |
115 | # Environments
116 | .env
117 | .venv
118 | env/
119 | venv/
120 | ENV/
121 | env.bak/
122 | venv.bak/
123 |
124 | # Spyder project settings
125 | .spyderproject
126 | .spyproject
127 |
128 | # Rope project settings
129 | .ropeproject
130 |
131 | # mkdocs documentation
132 | /site
133 |
134 | # mypy
135 | .mypy_cache/
136 | .dmypy.json
137 | dmypy.json
138 |
139 | # Pyre type checker
140 | .pyre/
141 |
--------------------------------------------------------------------------------
/matrix_factorization/utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | from sklearn.model_selection import train_test_split
4 |
5 | from typing import Tuple
6 |
7 |
8 | def train_update_test_split(
9 | X: pd.DataFrame, frac_new_users: float
10 | ) -> Tuple[pd.DataFrame, pd.Series, pd.DataFrame, pd.Series, pd.DataFrame, pd.Series]:
11 | """
12 | Split data into 3 parts (train_initial, train_update, test_update) for testing the performance of model updates for new users. First, a set
13 | of new users is chosen, and all ratings belonging to the remaining users are assigned to train_initial. Then, for each new user, half of
14 | their ratings are stored in train_update and half are stored in test_update.
15 |
16 | To use the three sets returned:
17 | 1. Fit your model on the train_initial set.
18 | 2. Update your model with train_update.
19 | 3. Calculate predictions on test_update and compare them with the actual ratings.
20 |
21 | Args:
22 | X (pd.DataFrame): Data frame containing columns user_id, item_id
23 | frac_new_users (float): Fraction of users to not include in train_initial
24 |
25 | Returns:
26 | X_train_initial [pd.DataFrame]: Training set user_ids and item_ids for initial model fitting
27 | y_train_initial [pd.Series]: Corresponding ratings for X_train_initial
28 | X_train_update [pd.DataFrame]: Training set user_ids and item_ids for model updating. Contains users that are not in train_initial
29 | y_train_update [pd.Series]: Corresponding ratings for X_train_update
30 | X_test_update [pd.DataFrame]: Testing set user_ids and item_ids for model updating. Contains same users as train_update
31 | y_test_update [pd.Series]: Corresponding ratings for X_test_update
32 | """
33 | users = X["user_id"].unique()
34 |
35 | # Users that won't be included in the initial training
36 | users_update = np.random.choice(
37 | users, size=round(frac_new_users * len(users)), replace=False
38 | )
39 |
40 | # Initial training matrix
41 | train_initial = X.query("user_id not in @users_update").sample(
42 | frac=1, replace=False
43 | )
44 |
45 | # Train and test sets for updating model. For each new user split their ratings into two sets, one for update and one for test
46 | data_update = X.query("user_id in @users_update")
47 | train_update, test_update = train_test_split(
48 | data_update, stratify=data_update["user_id"], test_size=0.5
49 | )
50 |
51 | # Split into X and y
52 | X_train_initial, y_train_initial = (
53 | train_initial[["user_id", "item_id"]],
54 | train_initial["rating"],
55 | )
56 | X_train_update, y_train_update = (
57 | train_update[["user_id", "item_id"]],
58 | train_update["rating"],
59 | )
60 | X_test_update, y_test_update = (
61 | test_update[["user_id", "item_id"]],
62 | test_update["rating"],
63 | )
64 |
65 | return (
66 | X_train_initial,
67 | y_train_initial,
68 | X_train_update,
69 | y_train_update,
70 | X_test_update,
71 | y_test_update,
72 | )
73 |
74 |
--------------------------------------------------------------------------------
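
A quick sketch of the guarantees described in the docstring above, assuming a `ratings` DataFrame with columns user_id, item_id and rating (for example the MovieLens data used in examples/example.py): initial-training users are disjoint from the new users, the same new users appear in both the update and test sets, and each new user's ratings are split roughly in half.

```python
from matrix_factorization import train_update_test_split

# `ratings`: assumed DataFrame with user_id, item_id, rating columns (e.g. MovieLens)
X_tr, y_tr, X_up, y_up, X_te, y_te = train_update_test_split(
    ratings, frac_new_users=0.2
)

initial_users = set(X_tr["user_id"])
new_users = set(X_up["user_id"])

# New users are never seen during the initial fit
assert initial_users.isdisjoint(new_users)

# The stratified 50/50 split puts every new user in both halves
# (holds for MovieLens, where each user has at least 20 ratings)
assert new_users == set(X_te["user_id"])
assert abs(len(X_up) - len(X_te)) <= 1
```
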
/matrix_factorization/recommender_base.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | from sklearn.base import BaseEstimator, RegressorMixin
4 |
5 | from abc import ABCMeta, abstractmethod
6 | from typing import Any, Tuple, Union
7 |
8 |
9 | class RecommenderBase(BaseEstimator, RegressorMixin, metaclass=ABCMeta):
10 | """
11 | Abstract base class for all recommender models.
12 | All subclasses should implement the fit() and predict() methods
13 |
14 | Arguments:
15 | min_rating {int} -- Smallest rating possible (default: {0})
16 | max_rating {int} -- Largest rating possible (default: {5})
17 | verbose {int} -- Verbosity when fitting. Possible values are 0 to print nothing and 1 to print training progress (default: {0})
18 |
19 | Attributes:
20 | n_users {int} -- Number of users
21 | n_items {int} -- Number of items
22 | global_mean {float} -- Global mean of all ratings
23 | user_id_map {dict} -- Mapping of user ids to assigned integer ids
24 | item_id_map {dict} -- Mapping of item ids to assigned integer ids
25 | known_users {set} -- Set of known user_ids
26 | known_items {set} -- Set of known item_ids
27 | """
28 |
29 | @abstractmethod
30 | def __init__(self, min_rating: float = 0, max_rating: float = 5, verbose: int = 0):
31 | self.min_rating = min_rating
32 | self.max_rating = max_rating
33 | self.verbose = verbose
34 | return
35 |
36 | @property
37 | def known_users(self):
38 | """
39 | Set of known user_ids
40 | """
41 | return set(self.user_id_map.keys())
42 |
43 | @property
44 | def known_items(self):
45 | """
46 | Set of known item_ids
47 | """
48 | return set(self.item_id_map.keys())
49 |
50 | def contains_user(self, user_id: Any) -> bool:
51 | """
52 | Checks if model was trained on data containing given user_id
53 |
54 | Args:
55 | user_id (any): User id
56 |
57 | Returns:
58 | bool: If user_id is known
59 | """
60 | return user_id in self.known_users
61 |
62 | def contains_item(self, item_id: Any) -> bool:
63 | """
64 | Checks if model was trained on data containing given item_id
65 |
66 | Args:
67 | item_id (any): Item id
68 |
69 | Returns:
70 | bool: If item_id is known
71 | """
72 | return item_id in self.known_items
73 |
74 | def _preprocess_data(
75 | self, X: pd.DataFrame, y: pd.Series = None, type: str = "fit"
76 | ) -> Union[pd.DataFrame, Tuple[pd.DataFrame, list, list]]:
77 | """
78 | Preprocessing steps before doing fit, update or predict
79 |
80 | Arguments:
81 | X {pd.DataFrame} -- Dataframe containing columns user_id, item_id
82 | y {pd.Series} -- Series containing rating
83 | type {str} -- The type of preprocessing to do. Allowed options are ('fit', 'predict', 'update'). Defaults to 'fit'
84 |
85 | Returns:
86 | X [pd.DataFrame] -- Dataframe with columns user_id, item_id and rating
87 | known_users [list, 'on update only'] -- List containing already known users in X. Only returned for type update
88 | new_users [list, 'on update only'] -- List containing new users in X. Only returned for type update
89 | """
90 | X = X.loc[:, ["user_id", "item_id"]]
91 |
92 | if type != "predict":
93 | X["rating"] = y
94 |
95 | if type in ("fit", "update"):
96 | # Check for duplicate user-item ratings
97 | if X.duplicated(subset=["user_id", "item_id"]).sum() != 0:
98 | raise ValueError("Duplicate user-item ratings in matrix")
99 |
100 | # Shuffle rows
101 | X = X.sample(frac=1, replace=False)
102 |
103 | if type == "fit":
104 | # Create mapping of user_id and item_id to assigned integer ids
105 | user_ids = X["user_id"].unique()
106 | item_ids = X["item_id"].unique()
107 | self.user_id_map = {user_id: i for (i, user_id) in enumerate(user_ids)}
108 | self.item_id_map = {item_id: i for (i, item_id) in enumerate(item_ids)}
109 | self.n_users = len(user_ids)
110 | self.n_items = len(item_ids)
111 |
112 | elif type == "update":
113 | # Keep only item ratings for which the item is already known
114 | items = self.item_id_map.keys()
115 | X = X.query("item_id in @items").copy()
116 |
117 | # Add information on new users
118 | new_users, known_users = [], []
119 | users = X["user_id"].unique()
120 | new_user_id = max(self.user_id_map.values()) + 1
121 |
122 | for user in users:
123 | if user in self.user_id_map.keys():
124 | known_users.append(user)
125 | continue
126 |
127 | # Add to user id mapping
128 | new_users.append(user)
129 | self.user_id_map[user] = new_user_id
130 | new_user_id += 1
131 |
132 | # Remap user id and item id to assigned integer ids
133 | X.loc[:, "user_id"] = X["user_id"].map(self.user_id_map)
134 | X.loc[:, "item_id"] = X["item_id"].map(self.item_id_map)
135 |
136 | if type == "predict":
137 | # Replace missing mappings with -1
138 | X.fillna(-1, inplace=True)
139 |
140 | if type == "update":
141 | return X, known_users, new_users
142 | else:
143 | return X
144 |
145 | @abstractmethod
146 | def fit(self, X: pd.DataFrame, y: pd.Series):
147 | """
148 | Fit model to given data
149 |
150 | Args:
151 | X {pandas DataFrame} -- Dataframe containing columns user_id, item_id
152 | y {pandas Series} -- Series containing ratings
153 | """
154 | return self
155 |
156 | @abstractmethod
157 | def predict(self, X: pd.DataFrame, bound_ratings: bool = True) -> list:
158 | """
159 | Predict ratings for given users and items
160 |
161 | Args:
162 | X (pd.DataFrame): Dataframe containing columns user_id and item_id
163 | bound_ratings (bool): Whether to bound ratings in range [min_rating, max_rating] (default: True)
164 |
165 | Returns:
166 | list: List containing rating predictions for all user-item pairs, in the same order as input X
167 | """
168 | return []
169 |
170 | def recommend(
171 | self,
172 | user: Any,
173 | amount: int = 10,
174 | items_known: list = None,
175 | include_user: bool = True,
176 | bound_ratings: bool = True,
177 | ) -> pd.DataFrame:
178 | """
179 | Returns a DataFrame of item recommendations for a given user, sorted from highest to lowest predicted rating.
180 |
181 | Args:
182 | user (any): User_id to get recommendations for (not assigned user_id from self.user_id_map)
183 | items_known (list, optional): List of items already known by user and to not be considered in recommendations. Defaults to None.
184 | include_user (bool, optional): Whether to include the user_id in the output DataFrame or not. Defaults to True.
185 | bound_ratings (bool): Whether to bound ratings in range [min_rating, max_rating] (default: True)
186 |
187 | Returns:
188 | pd.DataFrame: Recommendations DataFrame for user with columns user_id (optional), item_id, rating sorted from highest to lowest rating
189 | """
190 | items = list(self.item_id_map.keys())
191 |
192 | # If items_known is provided then filter by items that the user does not know
193 | if items_known is not None:
194 | items_known = list(items_known)
195 | items = [item for item in items if item not in items_known]
196 |
197 | # Get rating predictions for given user and all unknown items
198 | items_recommend = pd.DataFrame({"user_id": user, "item_id": items})
199 | items_recommend["rating_pred"] = self.predict(
200 | X=items_recommend, bound_ratings=False
201 | )
202 |
203 | # Sort and keep top n items
204 | items_recommend.sort_values(by="rating_pred", ascending=False, inplace=True)
205 | items_recommend = items_recommend.head(amount)
206 |
207 | # Bound ratings
208 | if bound_ratings:
209 | items_recommend["rating_pred"] = items_recommend["rating_pred"].clip(
210 | lower=self.min_rating, upper=self.max_rating
211 | )
212 |
213 | if not include_user:
214 | items_recommend.drop(["user_id"], axis="columns", inplace=True)
215 |
216 | return items_recommend
217 |
218 |
--------------------------------------------------------------------------------
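
The smallest concrete subclass the `RecommenderBase` contract allows is sketched below (not part of the library): it predicts the global mean for every user-item pair, calling `_preprocess_data` the same way the shipped models do, and inherits `recommend` for free.

```python
import pandas as pd

from matrix_factorization import RecommenderBase


class GlobalMeanModel(RecommenderBase):
    """Illustrative toy model: predicts the global mean rating for every pair."""

    def __init__(self, min_rating: float = 0, max_rating: float = 5, verbose: int = 0):
        super().__init__(min_rating=min_rating, max_rating=max_rating, verbose=verbose)

    def fit(self, X: pd.DataFrame, y: pd.Series):
        # 'fit' preprocessing builds user_id_map / item_id_map and shuffles rows
        X = self._preprocess_data(X=X, y=y, type="fit")
        self.global_mean = X["rating"].mean()
        return self

    def predict(self, X: pd.DataFrame, bound_ratings: bool = True) -> list:
        # Every prediction is the global mean, so no bounding is needed
        return [self.global_mean] * X.shape[0]
```
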
/matrix_factorization/kernels.py:
--------------------------------------------------------------------------------
1 | import math
2 | import numba as nb
3 | import numpy as np
4 |
5 |
6 | @nb.njit()
7 | def sigmoid(x: float) -> float:
8 | """
9 | Calculates sigmoid function at x
10 |
11 | Args:
12 | x (float): Input x
13 |
14 | Returns:
15 | [float]: Sigmoid at x
16 | """
17 | result = 1 / (1 + math.exp(-x))
18 | return result
19 |
20 |
21 | @nb.njit()
22 | def kernel_linear(
23 | global_mean: float,
24 | user_bias: float,
25 | item_bias: float,
26 | user_feature_vec: np.ndarray,
27 | item_feature_vec: np.ndarray,
28 | ) -> float:
29 | """
30 | Calculates the linear kernel: the global mean plus user and item biases plus the dot product of the latent feature vectors
31 |
32 | Args:
33 | global_mean (float): Global mean
34 | user_bias (float): User bias
35 | item_bias (float): Item bias
36 | user_feature_vec (np.ndarray): Vector of user latent features
37 | item_feature_vec (np.ndarray): Vector of item latent features
38 |
39 | Returns:
40 | [float]: Linear kernel result
41 | """
42 | result = (
43 | global_mean + item_bias + user_bias + np.dot(user_feature_vec, item_feature_vec)
44 | )
45 | return result
46 |
47 |
48 | @nb.njit()
49 | def kernel_sigmoid(
50 | global_mean: float,
51 | user_bias: float,
52 | item_bias: float,
53 | user_feature_vec: np.ndarray,
54 | item_feature_vec: np.ndarray,
55 | a: float,
56 | c: float,
57 | ):
58 | """
59 | Calculates result with sigmoid kernel
60 |
61 | Args:
62 | global_mean (float): Global mean
63 | user_bias (float): User bias
64 | item_bias (float): Item bias
65 | user_feature_vec (np.ndarray): Vector of user latent features
66 | item_feature_vec (np.ndarray): Vector of item latent features
67 | a (float): Rescaling parameter for a + c * K(u, i)
68 | c (float): Rescaling parameter for a + c * K(u, i)
69 |
70 | Returns:
71 | [float]: Sigmoid kernel result
72 | """
73 | linear_sum = (
74 | global_mean + user_bias + item_bias + np.dot(user_feature_vec, item_feature_vec)
75 | )
76 | sigmoid_result = sigmoid(linear_sum)
77 | result = a + c * sigmoid_result
78 | return result
79 |
80 |
81 | @nb.njit()
82 | def kernel_rbf(
83 | user_feature_vec: np.ndarray,
84 | item_feature_vec: np.ndarray,
85 | gamma: float,
86 | a: float,
87 | c: float,
88 | ):
89 | """
90 | Calculates result with Radial basis function kernel
91 |
92 | Args:
93 | user_feature_vec (np.ndarray): Vector of user latent features
94 | item_feature_vec (np.ndarray): Vector of item latent features
95 | gamma (float): Kernel coefficient
96 | a (float): Rescaling parameter for a + c * K(u, i)
97 | c (float): Rescaling parameter for a + c * K(u, i)
98 |
99 | Returns:
100 | [float]: RBF kernel result
101 | """
102 | power = -gamma * np.sum(np.square(user_feature_vec - item_feature_vec))
103 | exp_result = math.exp(power)
104 | result = a + c * exp_result
105 | return result
106 |
107 |
108 | @nb.njit()
109 | def kernel_linear_sgd_update(
110 | user_id: int,
111 | item_id: int,
112 | rating: float,
113 | global_mean: float,
114 | user_biases: np.ndarray,
115 | item_biases: np.ndarray,
116 | user_features: np.ndarray,
117 | item_features: np.ndarray,
118 | lr: float,
119 | reg: float,
120 | update_user_params: bool = True,
121 | update_item_params: bool = True,
122 | ):
123 | """
124 | Performs a single update using stochastic gradient descent for a linear kernel given a user and item.
125 | Similar to https://github.com/gbolmier/funk-svd and https://github.com/NicolasHug/Surprise, we iterate over each factor manually for a given
126 | user/item instead of indexing a whole row such as user_features[user], since this has been shown to be much faster. Representing user_features
127 | and item_features as 1D arrays was also tested but proved much slower, and turning on numba's parallel option gives much worse performance as well.
128 |
129 | Args:
130 | user_id (int): User id
131 | item_id (int): Item id
132 | rating (float): Rating for user and item
133 | global_mean {float} -- Global mean of all ratings
134 | user_biases {numpy array} -- User biases vector of shape (n_users, 1)
135 | item_biases {numpy array} -- Item biases vector of shape (n_items, 1)
136 | user_features {numpy array} -- Matrix P of user features of shape (n_users, n_factors)
137 | item_features {numpy array} -- Matrix Q of item features of shape (n_items, n_factors)
138 | lr (float): Learning rate alpha
139 | reg {float} -- Regularization parameter lambda for Frobenius norm
140 | update_user_params {bool} -- Whether to update user parameters or not. Default is True.
141 | update_item_params {bool} -- Whether to update item parameters or not. Default is True.
142 | """
143 | n_factors = user_features.shape[1]
144 | user_bias = user_biases[user_id]
145 | item_bias = item_biases[item_id]
146 |
147 | # Compute predicted rating
148 | rating_pred = (
149 | global_mean
150 | + item_bias
151 | + user_bias
152 | + np.dot(user_features[user_id, :], item_features[item_id, :])
153 | )
154 |
155 | # Compute error
156 | error = rating_pred - rating
157 |
158 | # Update bias parameters
159 | if update_user_params:
160 | user_biases[user_id] -= lr * (error + reg * user_bias)
161 |
162 | if update_item_params:
163 | item_biases[item_id] -= lr * (error + reg * item_bias)
164 |
165 | # Update user and item features
166 | for f in range(n_factors):
167 | user_feature_f = user_features[user_id, f]
168 | item_feature_f = item_features[item_id, f]
169 |
170 | if update_user_params:
171 | user_features[user_id, f] -= lr * (
172 | error * item_feature_f + reg * user_feature_f
173 | )
174 |
175 | if update_item_params:
176 | item_features[item_id, f] -= lr * (
177 | error * user_feature_f + reg * item_feature_f
178 | )
179 |
180 | return
181 |
182 |
183 | @nb.njit()
184 | def kernel_sigmoid_sgd_update(
185 | user_id: int,
186 | item_id: int,
187 | rating: float,
188 | global_mean: float,
189 | user_biases: np.ndarray,
190 | item_biases: np.ndarray,
191 | user_features: np.ndarray,
192 | item_features: np.ndarray,
193 | lr: float,
194 | reg: float,
195 | a: float,
196 | c: float,
197 | update_user_params: bool = True,
198 | update_item_params: bool = True,
199 | ):
200 | """
201 | Performs a single update using stochastic gradient descent for a sigmoid kernel given a user and item.
202 |
203 | Args:
204 | user_id (int): User id
205 | item_id (int): Item id
206 | rating (float): Rating for user and item
207 | global_mean {float} -- Global mean of all ratings
208 | user_biases {numpy array} -- User biases vector of shape (n_users, 1)
209 | item_biases {numpy array} -- Item biases vector of shape (n_items, 1)
210 | user_features {numpy array} -- Matrix P of user features of shape (n_users, n_factors)
211 | item_features {numpy array} -- Matrix Q of item features of shape (n_items, n_factors)
212 | lr (float): Learning rate alpha
213 | reg {float} -- Regularization parameter lambda for Frobenius norm
214 | a (float): Rescaling parameter for a + c * K(u, i)
215 | c (float): Rescaling parameter for a + c * K(u, i)
216 | update_user_params {bool} -- Whether to update user parameters or not. Default is True.
217 | update_item_params {bool} -- Whether to update item parameters or not. Default is True.
218 | """
219 | n_factors = user_features.shape[1]
220 | user_bias = user_biases[user_id]
221 | item_bias = item_biases[item_id]
222 | user_feature_vec = user_features[user_id, :]
223 | item_feature_vec = item_features[item_id, :]
224 |
225 | # Compute predicted rating
226 | linear_sum = (
227 | global_mean + user_bias + item_bias + np.dot(user_feature_vec, item_feature_vec)
228 | )
229 | sigmoid_result = sigmoid(linear_sum)
230 | rating_pred = a + c * sigmoid_result
231 |
232 | # Compute error
233 | error = rating_pred - rating
234 |
235 | # Common term shared between all partial derivatives: d(sigmoid(x))/dx = sigmoid(x)^2 * exp(-x)
236 | deriv_base = (sigmoid_result ** 2) * math.exp(-linear_sum)
237 |
238 | # Update bias parameters
239 | if update_user_params:
240 | opt_deriv = error * deriv_base + reg * user_bias
241 | user_biases[user_id] -= lr * opt_deriv
242 |
243 | if update_item_params:
244 | opt_deriv = error * deriv_base + reg * item_bias
245 | item_biases[item_id] -= lr * opt_deriv
246 |
247 | # Update user and item features
248 | for i in range(n_factors):
249 | user_feature_f = user_features[user_id, i]
250 | item_feature_f = item_features[item_id, i]
251 |
252 | if update_user_params:
253 | user_feature_deriv = item_feature_f * deriv_base
254 | opt_deriv = error * user_feature_deriv + reg * user_feature_f
255 | user_features[user_id, i] -= lr * opt_deriv
256 |
257 | if update_item_params:
258 | item_feature_deriv = user_feature_f * deriv_base
259 | opt_deriv = error * item_feature_deriv + reg * item_feature_f
260 | item_features[item_id, i] -= lr * opt_deriv
261 |
262 | return
263 |
264 |
265 | @nb.njit()
266 | def kernel_rbf_sgd_update(
267 | user_id: int,
268 | item_id: int,
269 | rating: float,
270 | user_features: np.ndarray,
271 | item_features: np.ndarray,
272 | lr: float,
273 | reg: float,
274 | gamma: float,
275 | a: float,
276 | c: float,
277 | update_user_params: bool = True,
278 | update_item_params: bool = True,
279 | ):
280 | """
281 | Performs a single update using stochastic gradient descent for an RBF kernel given a user and item.
282 |
283 | Args:
284 | user_id (int): User id
285 | item_id (int): Item id
286 | rating (float): Rating for user and item
287 | user_features {numpy array} -- Matrix P of user features of shape (n_users, n_factors)
288 | item_features {numpy array} -- Matrix Q of item features of shape (n_items, n_factors)
289 | lr (float): Learning rate alpha
290 | reg {float} -- Regularization parameter lambda for Frobenius norm
291 | gamma (float): Kernel coefficient
292 | a (float): Rescaling parameter for a + c * K(u, i)
293 | c (float): Rescaling parameter for a + c * K(u, i)
294 | update_user_params {bool} -- Whether to update user parameters or not. Default is True.
295 | update_item_params {bool} -- Whether to update item parameters or not. Default is True.
296 | """
297 | n_factors = user_features.shape[1]
298 | user_feature_vec = user_features[user_id, :]
299 | item_feature_vec = item_features[item_id, :]
300 |
301 | # Compute predicted rating
302 | power = -gamma * np.sum(np.square(user_feature_vec - item_feature_vec))
303 | exp_result = math.exp(power)
304 | rating_pred = a + c * exp_result
305 |
306 | # Compute error
307 | error = rating_pred - rating
308 |
309 | # Common term shared between partial derivatives
310 | deriv_base = 2 * exp_result * gamma
311 |
312 | # Update user and item features params
313 | for i in range(n_factors):
314 | user_feature_f = user_features[user_id, i]
315 | item_feature_f = item_features[item_id, i]
316 |
317 | if update_user_params:
318 | user_feature_deriv = deriv_base * (item_feature_f - user_feature_f)
319 | opt_deriv = error * user_feature_deriv + reg * user_feature_f
320 | user_features[user_id, i] -= lr * opt_deriv
321 |
322 | if update_item_params:
323 | item_feature_deriv = deriv_base * (user_feature_f - item_feature_f)
324 | opt_deriv = error * item_feature_deriv + reg * item_feature_f
325 | item_features[item_id, i] -= lr * opt_deriv
326 |
327 | return
328 |
--------------------------------------------------------------------------------
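
In kernel_matrix_factorization.py the rescaling parameters are set to a = min_rating and c = max_rating - min_rating, so the sigmoid kernel (raw output in (0, 1)) and the RBF kernel (raw output in (0, 1]) already yield predictions inside the rating range, while the linear kernel is unbounded and gets clipped at prediction time instead. A small sketch with random latent vectors:

```python
import numpy as np

from matrix_factorization.kernels import kernel_linear, kernel_rbf, kernel_sigmoid

rng = np.random.default_rng(0)
u = rng.normal(0, 0.1, 100)  # user latent factor vector
v = rng.normal(0, 0.1, 100)  # item latent factor vector

min_rating, max_rating = 0.0, 5.0
a, c = min_rating, max_rating - min_rating  # as passed by the model internally

print(kernel_linear(3.5, 0.1, -0.2, u, v))         # unbounded
print(kernel_sigmoid(3.5, 0.1, -0.2, u, v, a, c))  # inside (0, 5)
print(kernel_rbf(u, v, 1 / 100, a, c))             # inside (0, 5]
```
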
/matrix_factorization/baseline_model.py:
--------------------------------------------------------------------------------
1 | import numba as nb
2 | import numpy as np
3 | import pandas as pd
4 |
5 | from .recommender_base import RecommenderBase
6 |
7 | from typing import Tuple
8 |
9 |
10 | class BaselineModel(RecommenderBase):
11 | """
12 | Simple model which models the user-item rating as r_{ui} = \mu + ubias_u + ibias_i, the sum of a global mean and the corresponding
13 | user and item biases. The global mean \mu is estimated as the mean of all ratings. The remaining parameters to be estimated, ubias and ibias,
14 | are vectors of length n_users and n_items respectively. These two vectors are estimated using stochastic gradient descent on the RMSE
15 | with regularization.
16 |
17 | NOTE: The recommend method with this model will simply recommend the most popular items for every user. This model should mainly be used
18 | for estimating the explicit rating for a given user and item.
19 |
20 | Arguments:
21 | method {str} -- Method to estimate parameters. Must be one of 'sgd' or 'als' (default: {'sgd'})
22 | n_epochs {int} -- Number of epochs to train for (default: {100})
23 | reg {float} -- Lambda parameter for L2 regularization (default: {1})
24 | lr {float} -- Learning rate for gradient optimization step (default: {0.01})
25 | min_rating {int} -- Smallest rating possible (default: {0})
26 | max_rating {int} -- Largest rating possible (default: {5})
27 | verbose {int} -- Verbosity when fitting. 0 to print nothing, 1 to print training progress (default: {1})
28 |
29 | Attributes:
30 | n_users {int} -- Number of users
31 | n_items {int} -- Number of items
32 | global_mean {float} -- Global mean of all ratings
33 | user_biases {numpy array} -- User bias vector of shape (n_users, 1)
34 | item_biases {numpy array} -- Item bias vector of shape (n_items, 1)
35 | user_id_map {dict} -- Mapping of user ids to assigned integer ids
36 | item_id_map {dict} -- Mapping of item ids to assigned integer ids
37 | train_rmse {list} -- Training rmse values
38 | predictions_possible {list} -- Boolean vector of whether both user and item were known for prediction. Only available after calling predict
39 | """
40 |
41 | def __init__(
42 | self,
43 | method: str = "sgd",
44 | n_epochs: int = 100,
45 | reg: float = 1,
46 | lr: float = 0.01,
47 | min_rating: int = 0,
48 | max_rating: int = 5,
49 | verbose: int = 1,
50 | ):
51 | # Check inputs
52 | if method not in ("sgd", "als"):
53 | raise ValueError('Method param must be either "sgd" or "als"')
54 |
55 | super().__init__(min_rating=min_rating, max_rating=max_rating, verbose=verbose)
56 |
57 | self.method = method
58 | self.n_epochs = n_epochs
59 | self.reg = reg
60 | self.lr = lr
61 | return
62 |
63 | def fit(self, X: pd.DataFrame, y: pd.Series):
64 | """
65 | Fits simple mean and bias model to given user item ratings
66 |
67 | Arguments:
68 | X {pandas DataFrame} -- Dataframe containing columns user_id, item_id
69 | y {pandas Series} -- Series containing rating
70 | """
71 | X = self._preprocess_data(X=X, y=y, type="fit")
72 | self.global_mean = X["rating"].mean()
73 |
74 | # Initialize parameters
75 | self.user_biases = np.zeros(self.n_users)
76 | self.item_biases = np.zeros(self.n_items)
77 |
78 | # Run parameter estimation
79 | if self.method == "sgd":
80 | self.user_biases, self.item_biases, self.train_rmse = _sgd(
81 | X=X.to_numpy(dtype=np.float64),
82 | global_mean=self.global_mean,
83 | user_biases=self.user_biases,
84 | item_biases=self.item_biases,
85 | n_epochs=self.n_epochs,
86 | lr=self.lr,
87 | reg=self.reg,
88 | verbose=self.verbose,
89 | )
90 |
91 | elif self.method == "als":
92 | self.user_biases, self.item_biases, self.train_rmse = _als(
93 | X=X.to_numpy(dtype=np.float64),
94 | global_mean=self.global_mean,
95 | user_biases=self.user_biases,
96 | item_biases=self.item_biases,
97 | n_epochs=self.n_epochs,
98 | reg=self.reg,
99 | verbose=self.verbose,
100 | )
101 |
102 | return self
103 |
104 | def predict(self, X: pd.DataFrame, bound_ratings: bool = True) -> list:
105 | """
106 | Predict ratings for given users and items
107 |
108 | Arguments:
109 | X {pd.DataFrame} -- Dataframe containing columns user_id and item_id
110 | bound_ratings (bool): Whether to bound ratings in range [min_rating, max_rating] (default: True)
111 |
112 | Returns:
113 | predictions [list] -- List containing rating predictions of all user, items in same order as input X
114 | """
115 | # If empty return empty list
116 | if X.shape[0] == 0:
117 | return []
118 |
119 | X = self._preprocess_data(X=X, type="predict")
120 |
121 | # Get predictions
122 | predictions, predictions_possible = _predict(
123 | X=X.to_numpy(dtype=np.float64),
124 | global_mean=self.global_mean,
125 | min_rating=self.min_rating,
126 | max_rating=self.max_rating,
127 | user_biases=self.user_biases,
128 | item_biases=self.item_biases,
129 | bound_ratings=bound_ratings,
130 | )
131 |
132 | self.predictions_possible = predictions_possible
133 |
134 | return predictions
135 |
136 | def update_users(
137 | self,
138 | X: pd.DataFrame,
139 | y: pd.Series,
140 | lr: float = 0.01,
141 | n_epochs: int = 20,
142 | verbose: int = 0,
143 | ):
144 | """
145 | Update the user biases vector with new/updated user-item ratings information using SGD. Only the user parameters corresponding to the
146 | new/updated users will be updated; item parameters are left alone.
147 |
148 | Note: If updating existing users, pass all of their user-item ratings and not just the modified ratings
149 |
150 | Args:
151 | X (pd.DataFrame): Dataframe containing columns user_id, item_id
152 | y (pd.Series): Series containing rating
153 | lr (float, optional): Learning rate alpha for gradient optimization step
154 | n_epochs (int, optional): Number of epochs to run SGD. Defaults to 20.
155 | verbose (int, optional): Verbosity when updating, 0 for nothing and 1 for training messages. Defaults to 0.
156 | """
157 | X, known_users, new_users = self._preprocess_data(X=X, y=y, type="update")
158 |
159 | # Re-initialize user bias for old users
160 | for user in known_users:
161 | user_index = self.user_id_map[user]
162 | self.user_biases[user_index] = 0
163 |
164 | # Add user bias param for new users
165 | self.user_biases = np.append(self.user_biases, np.zeros(len(new_users)))
166 |
167 | # Estimate new bias parameter
168 | self.user_biases, _, self.train_rmse = _sgd(
169 | X=X.to_numpy(dtype=np.float64),
170 | global_mean=self.global_mean,
171 | user_biases=self.user_biases,
172 | item_biases=self.item_biases,
173 | n_epochs=n_epochs,
174 | lr=lr,
175 | reg=self.reg,
176 | verbose=verbose,
177 | update_item_params=False,
178 | )
179 |
180 | return
181 |
182 |
183 | @nb.njit()
184 | def _calculate_rmse(
185 | X: np.ndarray, global_mean: float, user_biases: np.ndarray, item_biases: np.ndarray
186 | ):
187 | """
188 | Calculates root mean squared error for given data and model parameters
189 |
190 | Args:
191 | X (np.ndarray): Matrix with columns user, item and rating
192 | global_mean (float): Global mean rating
193 | user_biases (np.ndarray): User biases vector of shape (n_users, 1)
194 | item_biases (np.ndarray): Item biases vector of shape (n_items, 1)
195 |
196 | Returns:
197 | rmse [float]: Root mean squared error
198 | """
199 | n_ratings = X.shape[0]
200 | errors = np.zeros(n_ratings)
201 |
202 | # Iterate through all user-item ratings
203 | for i in range(n_ratings):
204 | user_id, item_id, rating = int(X[i, 0]), int(X[i, 1]), X[i, 2]
205 |
206 | # Calculate prediction and error
207 | pred = global_mean + user_biases[user_id] + item_biases[item_id]
208 | errors[i] = rating - pred
209 |
210 | rmse = np.sqrt(np.square(errors).mean())
211 |
212 | return rmse
213 |
214 |
215 | @nb.njit()
216 | def _sgd(
217 | X: np.ndarray,
218 | global_mean: float,
219 | user_biases: np.ndarray,
220 | item_biases: np.ndarray,
221 | n_epochs: int,
222 | lr: float,
223 | reg: float,
224 | verbose: int,
225 | update_user_params: bool = True,
226 | update_item_params: bool = True,
227 | ) -> Tuple[np.ndarray, np.ndarray, list]:
228 | """
229 | Performs Stochastic Gradient Descent to estimate the user_biases and item_biases
230 |
231 | Arguments:
232 | X {numpy array} -- User-item rating matrix
233 | global_mean {float} -- Global mean of all ratings
234 | user_biases {numpy array} -- User biases vector of shape (n_users, 1)
235 | item_biases {numpy array} -- Item biases vector of shape (n_items, 1)
236 | n_epochs {int} -- Number of epochs to run
237 | lr {float} -- Learning rate alpha
238 | reg {float} -- Regularization parameter lambda for Frobenius norm
239 | verbose {int} -- Verbosity when fitting. 0 for nothing and 1 for printing epochs
240 | update_user_params {bool} -- Whether to update user bias parameters or not. Default is True.
241 | update_item_params {bool} -- Whether to update item bias parameters or not. Default is True.
242 |
243 | Returns:
244 | user_biases [np.ndarray] -- Updated user_biases vector
245 | item_biases [np.ndarray] -- Updated item_biases vector
246 | train_rmse -- Training rmse values
247 | """
248 | train_rmse = []
249 |
250 | for epoch in range(n_epochs):
251 | # Shuffle data before each epoch
252 | np.random.shuffle(X)
253 |
254 | # Iterate through all user-item ratings
255 | for i in range(X.shape[0]):
256 | user_id, item_id, rating = int(X[i, 0]), int(X[i, 1]), X[i, 2]
257 |
258 | # Compute error
259 | rating_pred = global_mean + user_biases[user_id] + item_biases[item_id]
260 | error = rating - rating_pred
261 |
262 | # Update parameters
263 | if update_user_params:
264 | user_biases[user_id] += lr * (error - reg * user_biases[user_id])
265 | if update_item_params:
266 | item_biases[item_id] += lr * (error - reg * item_biases[item_id])
267 |
268 | # Calculate error and print
269 | rmse = _calculate_rmse(
270 | X=X,
271 | global_mean=global_mean,
272 | user_biases=user_biases,
273 | item_biases=item_biases,
274 | )
275 | train_rmse.append(rmse)
276 |
277 | if verbose == 1:
278 | print("Epoch ", epoch + 1, "/", n_epochs, " - train_rmse:", rmse)
279 |
280 | return user_biases, item_biases, train_rmse
281 |
282 |
283 | @nb.njit()
284 | def _als(
285 | X: np.ndarray,
286 | global_mean: float,
287 | user_biases: np.ndarray,
288 | item_biases: np.ndarray,
289 | n_epochs: int,
290 | reg: float,
291 | verbose: int,
292 | ) -> Tuple[np.ndarray, np.ndarray, list]:
293 | """
294 | Performs Alternating Least Squares to estimate the user_biases and item_biases. For every epoch, the item biases are held constant while
295 | solving directly for the user bias parameters using a closed-form equation. Then the user bias parameters are held constant and the same
296 | is done for the item biases. The derivation is straightforward and is given in the lecture at https://www.youtube.com/watch?v=gCaOa3W9kM0&t=32m55s,
297 | which is also similar to the implementation in Surprise.
298 |
299 | Arguments:
300 | X {numpy array} -- User-item rating matrix
301 | global_mean {float} -- Global mean of all ratings
302 | user_biases {numpy array} -- User biases vector of shape (n_users, 1)
303 | item_biases {numpy array} -- Item biases vector of shape (n_items, 1)
304 | n_epochs {int} -- Number of epochs to run
305 | reg {float} -- Regularization parameter lambda for Frobenius norm
306 | verbose {int} -- Verbosity when fitting. 0 for nothing and 1 for printing epochs
307 |
308 | Returns:
309 | user_biases [np.ndarray] -- Updated user_biases vector
310 | item_biases [np.ndarray] -- Updated item_biases vector
311 | train_rmse -- Training rmse values
312 | """
313 | n_users = user_biases.shape[0]
314 | n_items = item_biases.shape[0]
315 | train_rmse = []
316 |
317 | # Get counts of all users and items
318 | user_counts = np.zeros(n_users)
319 | item_counts = np.zeros(n_items)
320 | for i in range(X.shape[0]):
321 | user_id, item_id = int(X[i, 0]), int(X[i, 1])
322 | user_counts[user_id] += 1
323 | item_counts[item_id] += 1
324 |
325 | # For each epoch optimize User biases, and then Item biases
326 | for epoch in range(n_epochs):
327 |
328 | # Update user bias parameters
329 | user_biases = np.zeros(n_users)
330 |
331 | # Iterate through all user-item ratings
332 | for i in range(X.shape[0]):
333 | user_id, item_id, rating = int(X[i, 0]), int(X[i, 1]), X[i, 2]
334 | user_biases[user_id] += rating - global_mean - item_biases[item_id]
335 |
336 | # Set user bias estimation
337 | user_biases = user_biases / (reg + user_counts)
338 |
339 | # Update item bias parameters
340 | item_biases = np.zeros(n_items)
341 |
342 | # Iterate through all user-item ratings
343 | for i in range(X.shape[0]):
344 | user_id, item_id, rating = int(X[i, 0]), int(X[i, 1]), X[i, 2]
345 | item_biases[item_id] += rating - global_mean - user_biases[user_id]
346 |
347 | # Set item bias estimation
348 | item_biases = item_biases / (reg + item_counts)
349 |
350 | # Calculate error and print
351 | rmse = _calculate_rmse(
352 | X=X,
353 | global_mean=global_mean,
354 | user_biases=user_biases,
355 | item_biases=item_biases,
356 | )
357 | train_rmse.append(rmse)
358 |
359 | if verbose == 1:
360 | print("Epoch ", epoch + 1, "/", n_epochs, " - train_rmse:", rmse)
361 |
362 | return user_biases, item_biases, train_rmse
363 |
364 |
365 | @nb.njit()
366 | def _predict(
367 | X: np.ndarray,
368 | global_mean: float,
369 | min_rating: int,
370 | max_rating: int,
371 | user_biases: np.ndarray,
372 | item_biases: np.ndarray,
373 | bound_ratings: bool,
374 | ) -> Tuple[list, list]:
375 | """
376 | Calculate predicted ratings for each user-item pair.
377 |
378 | Arguments:
379 | X {np.ndarray} -- Matrix with columns representing (user_id, item_id)
380 | global_mean {float} -- Global mean of all ratings
381 | min_rating {int} -- Lowest rating possible
382 | max_rating {int} -- Highest rating possible
383 | user_biases {np.ndarray} -- User biases vector of length n_users
384 | item_biases {np.ndarray} -- Item biases vector of length n_items
385 | bound_ratings {boolean} -- Whether to bound predictions in between range [min_rating, max_rating]
386 |
387 | Returns:
388 | predictions [list] -- List containing rating predictions for all user-item pairs, in the same order as input X
389 | predictions_possible [list] -- List of whether both the given user and item were contained in the data that the model was fitted on
390 | """
391 |
392 | predictions = []
393 | predictions_possible = []
394 |
395 | for i in range(X.shape[0]):
396 | user_id, item_id = int(X[i, 0]), int(X[i, 1])
397 | user_known = user_id != -1
398 | item_known = item_id != -1
399 |
400 | rating_pred = global_mean
401 |
402 | if user_known:
403 | rating_pred += user_biases[user_id]
404 | if item_known:
405 | rating_pred += item_biases[item_id]
406 |
407 | # Bound ratings to min and max rating range
408 | if bound_ratings:
409 | if rating_pred > max_rating:
410 | rating_pred = max_rating
411 | elif rating_pred < min_rating:
412 | rating_pred = min_rating
413 |
414 | predictions.append(rating_pred)
415 | predictions_possible.append(user_known and item_known)
416 |
417 | return predictions, predictions_possible
418 |
--------------------------------------------------------------------------------
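
Since both estimation methods fit the same bias-only model, their training curves are directly comparable; an illustrative sketch, assuming the `X_train_initial` / `y_train_initial` frames from examples/example.py:

```python
from matrix_factorization import BaselineModel

# X_train_initial, y_train_initial: assumed frames from examples/example.py
for method in ("sgd", "als"):
    model = BaselineModel(method=method, n_epochs=20, reg=1, lr=0.01, verbose=0)
    model.fit(X_train_initial, y_train_initial)
    print(method, model.train_rmse[-1])  # final training RMSE per method
```
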
/matrix_factorization/kernel_matrix_factorization.py:
--------------------------------------------------------------------------------
1 | import math
2 | import numba as nb
3 | import numpy as np
4 | import pandas as pd
5 |
6 | from .kernels import (
7 | kernel_linear,
8 | kernel_sigmoid,
9 | kernel_rbf,
10 | kernel_linear_sgd_update,
11 | kernel_sigmoid_sgd_update,
12 | kernel_rbf_sgd_update,
13 | )
14 | from .recommender_base import RecommenderBase
15 |
16 | from typing import Tuple, Union
17 |
18 |
19 | class KernelMF(RecommenderBase):
20 | """
21 | Kernel Matrix Factorization. Finds thin matrices P and Q such that P * Q^T gives a good low-rank approximation of the user-item
22 | ratings matrix A with respect to RMSE. Despite the name, this differs from SVD: there is no constraint for the matrices P and Q to have mutually
23 | orthogonal columns.
24 |
25 | Arguments:
26 | n_factors {int} -- The number of latent factors in matrices P and Q (default: {100})
27 | n_epochs {int} -- Number of epochs to train for (default: {100})
28 | kernel {str} -- Kernel function to use between user and item features. Options are 'linear', 'sigmoid' or 'rbf'. (default: {'linear'})
29 | gamma {str or float} -- Kernel coefficient for 'rbf'. Ignored by other kernels. If 'auto' is used then will be set to 1/n_factors. (default: 'auto')
30 | reg {float} -- Regularization parameter lambda for Tikhonov regularization (default: {1})
31 | lr {float} -- Learning rate alpha for gradient optimization step (default: {0.01})
32 | init_mean {float} -- Mean of normal distribution to use for initializing parameters (default: {0})
33 | init_sd {float} -- Standard deviation of normal distribution to use for initializing parameters (default: {0.1})
34 | min_rating {int} -- Smallest rating possible (default: {0})
35 | max_rating {int} -- Largest rating possible (default: {5})
36 | verbose {int} -- Verbosity when fitting. Possible values are 0 to print nothing and 1 to print training progress (default: {1})
37 |
38 | Attributes:
39 | n_users {int} -- Number of users
40 | n_items {int} -- Number of items
41 | global_mean {float} -- Global mean of all ratings
42 | user_biases {numpy array} -- User bias vector of shape (n_users, 1)
43 | item_biases {numpy array} -- Item bias vector of shape (n_items, 1)
44 | user_features {numpy array} -- Decomposed P matrix of user features of shape (n_users, n_factors)
45 | item_features {numpy array} -- Decomposed Q matrix of item features of shape (n_items, n_factors)
46 | user_id_map {dict} -- Mapping of user ids to assigned integer ids
47 | item_id_map {dict} -- Mapping of item ids to assigned integer ids
48 | train_rmse {list} -- Training rmse values
49 | predictions_possible {list} -- Boolean vector of whether both user and item were known for prediction. Only available after calling predict
50 | """
51 |
52 | def __init__(
53 | self,
54 | n_factors: int = 100,
55 | n_epochs: int = 100,
56 | kernel: str = "linear",
57 | gamma: Union[str, float] = "auto",
58 | reg: float = 1,
59 | lr: float = 0.01,
60 | init_mean: float = 0,
61 | init_sd: float = 0.1,
62 | min_rating: int = 0,
63 | max_rating: int = 5,
64 | verbose: int = 1,
65 | ):
66 | if kernel not in ("linear", "sigmoid", "rbf"):
67 | raise ValueError("Kernel must be one of linear, sigmoid, or rbf")
68 |
69 | super().__init__(min_rating=min_rating, max_rating=max_rating, verbose=verbose)
70 |
71 | self.n_factors = n_factors
72 | self.n_epochs = n_epochs
73 | self.kernel = kernel
74 | self.gamma = 1 / n_factors if gamma == "auto" else gamma
75 | self.reg = reg
76 | self.lr = lr
77 | self.init_mean = init_mean
78 | self.init_sd = init_sd
79 | return
80 |
81 | def fit(self, X: pd.DataFrame, y: pd.Series):
82 | """
83 | Decompose user-item rating matrix into thin matrices P and Q along with user and item bias vectors
84 |
85 | Arguments:
86 | X {pandas DataFrame} -- Dataframe containing columns user_id, item_id
87 | y {pandas Series} -- Series containing ratings
88 | """
89 | X = self._preprocess_data(X=X, y=y, type="fit")
90 | self.global_mean = X["rating"].mean()
91 |
92 | # Initialize vector bias parameters
93 | self.user_biases = np.zeros(self.n_users)
94 | self.item_biases = np.zeros(self.n_items)
95 |
96 | # Initialize latent factor parameters of matrices P and Q
97 | self.user_features = np.random.normal(
98 | self.init_mean, self.init_sd, (self.n_users, self.n_factors)
99 | )
100 | self.item_features = np.random.normal(
101 | self.init_mean, self.init_sd, (self.n_items, self.n_factors)
102 | )
103 |
104 | # Perform stochastic gradient descent
105 | (
106 | self.user_features,
107 | self.item_features,
108 | self.user_biases,
109 | self.item_biases,
110 | self.train_rmse,
111 | ) = _sgd(
112 | X=X.to_numpy(dtype=np.float64),
113 | global_mean=self.global_mean,
114 | user_biases=self.user_biases,
115 | item_biases=self.item_biases,
116 | user_features=self.user_features,
117 | item_features=self.item_features,
118 | n_epochs=self.n_epochs,
119 | kernel=self.kernel,
120 | gamma=self.gamma,
121 | lr=self.lr,
122 | reg=self.reg,
123 | min_rating=self.min_rating,
124 | max_rating=self.max_rating,
125 | verbose=self.verbose,
126 | )
127 |
128 | return self
129 |
130 | def predict(self, X: pd.DataFrame, bound_ratings: bool = True) -> list:
131 | """
132 | Predict ratings for given users and items
133 |
134 | Arguments:
135 | X {pd.DataFrame} -- Dataframe containing columns user_id and item_id
136 | bound_ratings (bool): Whether to bound ratings in range [min_rating, max_rating] (default: True)
137 |
138 | Returns:
139 | predictions [list] -- List containing rating predictions for all user-item pairs, in the same order as input X
140 | """
141 | # If empty return empty list
142 | if X.shape[0] == 0:
143 | return []
144 |
145 | X = self._preprocess_data(X=X, type="predict")
146 |
147 | # Get predictions
148 | predictions, predictions_possible = _predict(
149 | X=X.to_numpy(dtype=np.float64),
150 | global_mean=self.global_mean,
151 | user_biases=self.user_biases,
152 | item_biases=self.item_biases,
153 | user_features=self.user_features,
154 | item_features=self.item_features,
155 | min_rating=self.min_rating,
156 | max_rating=self.max_rating,
157 | kernel=self.kernel,
158 | gamma=self.gamma,
159 | bound_ratings=bound_ratings,
160 | )
161 |
162 | self.predictions_possible = predictions_possible
163 | return predictions
164 |
165 | def update_users(
166 | self,
167 | X: pd.DataFrame,
168 | y: pd.Series,
169 | lr: float = 0.01,
170 | n_epochs: int = 20,
171 | verbose: int = 0,
172 | ):
173 | """
174 | Update the user features matrix P with new/updated user-item ratings information using SGD. Only the user parameters corresponding to the
175 | new/updated users will be updated; item parameters are left alone.
176 |
177 | Note: If updating existing users, pass all of their user-item ratings and not just the modified ratings
178 |
179 | Args:
180 | X (pd.DataFrame): Dataframe containing columns user_id, item_id
181 | y (pd.Series): Series containing ratings
182 | lr (float, optional): Learning rate alpha for gradient optimization step
183 | n_epochs (int, optional): Number of epochs to run SGD. Defaults to 20.
184 | verbose (int, optional): Verbosity when updating, 0 for nothing and 1 for training messages. Defaults to 0.
185 | """
186 | X, known_users, new_users = self._preprocess_data(X=X, y=y, type="update")
187 | n_new_users = len(new_users)
188 |
189 | # Re-initialize params for old users
190 | for user in known_users:
191 | user_index = self.user_id_map[user]
192 |
193 | # Initialize bias
194 | self.user_biases[user_index] = 0
195 |
196 | # Initialize latent factors vector
197 | self.user_features[user_index, :] = np.random.normal(
198 | self.init_mean, self.init_sd, (1, self.n_factors)
199 | )
200 |
201 | # Add bias parameters for new users
202 | self.user_biases = np.append(self.user_biases, np.zeros(n_new_users))
203 |
204 | # Add latent factor parameters for new users by adding rows to P matrix
205 | new_user_features = np.random.normal(
206 | self.init_mean, self.init_sd, (n_new_users, self.n_factors)
207 | )
208 | self.user_features = np.concatenate(
209 | (self.user_features, new_user_features), axis=0
210 | )
211 |
212 | # Estimate new parameters
213 | (
214 | self.user_features,
215 | self.item_features,
216 | self.user_biases,
217 | self.item_biases,
218 | self.train_rmse,
219 | ) = _sgd(
220 | X=X.to_numpy(dtype=np.float64),
221 | global_mean=self.global_mean,
222 | user_biases=self.user_biases,
223 | item_biases=self.item_biases,
224 | user_features=self.user_features,
225 | item_features=self.item_features,
226 | n_epochs=n_epochs,
227 | kernel=self.kernel,
228 | gamma=self.gamma,
229 | lr=lr,
230 | reg=self.reg,
231 | min_rating=self.min_rating,
232 | max_rating=self.max_rating,
233 | verbose=verbose,
234 | update_item_params=False,
235 | )
236 |
237 | return
238 |
239 |
240 | @nb.njit()
241 | def _calculate_rmse(
242 | X: np.ndarray,
243 | global_mean: float,
244 | user_biases: np.ndarray,
245 | item_biases: np.ndarray,
246 | user_features: np.ndarray,
247 | item_features: np.ndarray,
248 | min_rating: float,
249 | max_rating: float,
250 | kernel: str,
251 | gamma: float,
252 | ):
253 | """
254 | Calculates root mean squared error for given data and model parameters
255 |
256 | Args:
257 | X (np.ndarray): Matrix with columns user, item and rating
258 | global_mean (float): Global mean rating
259 |         user_biases (np.ndarray): User biases vector of length n_users
260 |         item_biases (np.ndarray): Item biases vector of length n_items
261 | user_features (np.ndarray): User features matrix P of size (n_users, n_factors)
262 | item_features (np.ndarray): Item features matrix Q of size (n_items, n_factors)
263 | min_rating (float): Minimum possible rating
264 | max_rating (float): Maximum possible rating
265 |         kernel (str): Kernel type. Possible options are "linear", "sigmoid", or "rbf"
266 |         gamma (float): Kernel coefficient for the "rbf" kernel; ignored by other kernels
267 |
268 | Returns:
269 | rmse [float]: Root mean squared error
270 | """
271 | n_ratings = X.shape[0]
272 | errors = np.zeros(n_ratings)
273 |
274 | # Iterate through all user-item ratings and calculate error
275 | for i in range(n_ratings):
276 | user_id, item_id, rating = int(X[i, 0]), int(X[i, 1]), X[i, 2]
277 | user_bias = user_biases[user_id]
278 | item_bias = item_biases[item_id]
279 | user_feature_vec = user_features[user_id, :]
280 | item_feature_vec = item_features[item_id, :]
281 |
282 | # Calculate predicted rating for given kernel
283 | if kernel == "linear":
284 | rating_pred = kernel_linear(
285 | global_mean=global_mean,
286 | user_bias=user_bias,
287 | item_bias=item_bias,
288 | user_feature_vec=user_feature_vec,
289 | item_feature_vec=item_feature_vec,
290 | )
291 |
292 | elif kernel == "sigmoid":
293 | rating_pred = kernel_sigmoid(
294 | global_mean=global_mean,
295 | user_bias=user_bias,
296 | item_bias=item_bias,
297 | user_feature_vec=user_feature_vec,
298 | item_feature_vec=item_feature_vec,
299 | a=min_rating,
300 | c=max_rating - min_rating,
301 | )
302 |
303 | elif kernel == "rbf":
304 | rating_pred = kernel_rbf(
305 | user_feature_vec=user_feature_vec,
306 | item_feature_vec=item_feature_vec,
307 | gamma=gamma,
308 | a=min_rating,
309 | c=max_rating - min_rating,
310 | )
311 |
312 | # Calculate error
313 | errors[i] = rating - rating_pred
314 |
315 | rmse = np.sqrt(np.square(errors).mean())
316 |
317 | return rmse
318 |
319 |
320 | @nb.njit()
321 | def _sgd(
322 | X: np.ndarray,
323 | global_mean: float,
324 | user_biases: np.ndarray,
325 | item_biases: np.ndarray,
326 | user_features: np.ndarray,
327 | item_features: np.ndarray,
328 | n_epochs: int,
329 | kernel: str,
330 | gamma: float,
331 | lr: float,
332 | reg: float,
333 | min_rating: float,
334 | max_rating: float,
335 | verbose: int,
336 | update_user_params: bool = True,
337 | update_item_params: bool = True,
338 | ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, list]:
339 | """
340 | Performs stochastic gradient descent to estimate parameters.
341 |
342 | Arguments:
343 |         X {numpy array} -- Matrix with columns user, item and rating
344 | global_mean {float} -- Global mean of all ratings
345 |         user_biases {numpy array} -- User biases vector of length n_users
346 |         item_biases {numpy array} -- Item biases vector of length n_items
347 | user_features {numpy array} -- Start matrix P of user features of shape (n_users, n_factors)
348 | item_features {numpy array} -- Start matrix Q of item features of shape (n_items, n_factors)
349 | n_epochs {int} -- Number of epochs to run
350 |         kernel {str} -- Kernel function to use between user and item features. Options are 'linear', 'sigmoid', and 'rbf'.
351 | gamma {float} -- Kernel coefficient for 'rbf'. Ignored by other kernels.
352 | lr {float} -- Learning rate alpha
353 | reg {float} -- Regularization parameter lambda for Frobenius norm
354 | min_rating {float} -- Minimum possible rating
355 |         max_rating {float} -- Maximum possible rating
356 | verbose {int} -- Verbosity when fitting. 0 for nothing and 1 for printing epochs
357 | update_user_params {bool} -- Whether to update user parameters or not. Default is True.
358 | update_item_params {bool} -- Whether to update item parameters or not. Default is True.
359 |
360 | Returns:
361 | user_features [np.ndarray] -- Updated user_features matrix P
362 | item_features [np.ndarray] -- Updated item_features matrix Q
363 | user_biases [np.ndarray] -- Updated user_biases vector
364 |         item_biases [np.ndarray] -- Updated item_biases vector
365 | train_rmse [list] -- Training rmse values
366 | """
367 | train_rmse = []
368 |
369 | for epoch in range(n_epochs):
370 | # Shuffle dataset before each epoch
371 | np.random.shuffle(X)
372 |
373 | # Iterate through all user-item ratings
374 | for i in range(X.shape[0]):
375 | user_id, item_id, rating = int(X[i, 0]), int(X[i, 1]), X[i, 2]
376 |
377 | if kernel == "linear":
378 | kernel_linear_sgd_update(
379 | user_id=user_id,
380 | item_id=item_id,
381 | rating=rating,
382 | global_mean=global_mean,
383 | user_biases=user_biases,
384 | item_biases=item_biases,
385 | user_features=user_features,
386 | item_features=item_features,
387 | lr=lr,
388 | reg=reg,
389 | update_user_params=update_user_params,
390 | update_item_params=update_item_params,
391 | )
392 |
393 | elif kernel == "sigmoid":
394 | kernel_sigmoid_sgd_update(
395 | user_id=user_id,
396 | item_id=item_id,
397 | rating=rating,
398 | global_mean=global_mean,
399 | user_biases=user_biases,
400 | item_biases=item_biases,
401 | user_features=user_features,
402 | item_features=item_features,
403 | lr=lr,
404 | reg=reg,
405 | a=min_rating,
406 | c=max_rating - min_rating,
407 | update_user_params=update_user_params,
408 | update_item_params=update_item_params,
409 | )
410 |
411 | elif kernel == "rbf":
412 | kernel_rbf_sgd_update(
413 | user_id=user_id,
414 | item_id=item_id,
415 | rating=rating,
416 | user_features=user_features,
417 | item_features=item_features,
418 | lr=lr,
419 | reg=reg,
420 | gamma=gamma,
421 | a=min_rating,
422 | c=max_rating - min_rating,
423 | update_user_params=update_user_params,
424 | update_item_params=update_item_params,
425 | )
426 |
427 | # Calculate error and print
428 | rmse = _calculate_rmse(
429 | X=X,
430 | global_mean=global_mean,
431 | user_biases=user_biases,
432 | item_biases=item_biases,
433 | user_features=user_features,
434 | item_features=item_features,
435 | min_rating=min_rating,
436 | max_rating=max_rating,
437 | kernel=kernel,
438 | gamma=gamma,
439 | )
440 | train_rmse.append(rmse)
441 |
442 | if verbose == 1:
443 | print("Epoch ", epoch + 1, "/", n_epochs, " - train_rmse:", rmse)
444 |
445 | return user_features, item_features, user_biases, item_biases, train_rmse
446 |
447 |
448 | @nb.njit()
449 | def _predict(
450 | X: np.ndarray,
451 | global_mean: float,
452 | user_biases: np.ndarray,
453 | item_biases: np.ndarray,
454 | user_features: np.ndarray,
455 | item_features: np.ndarray,
456 |     min_rating: float,
457 |     max_rating: float,
458 | kernel: str,
459 | gamma: float,
460 | bound_ratings: bool,
461 | ) -> Tuple[list, list]:
462 | """
463 | Calculate predicted ratings for each user-item pair.
464 |
465 | Arguments:
466 | X {np.ndarray} -- Matrix with columns representing (user_id, item_id)
467 | global_mean {float} -- Global mean of all ratings
468 | user_biases {np.ndarray} -- User biases vector of length n_users
469 | item_biases {np.ndarray} -- Item biases vector of length n_items
470 | user_features {np.ndarray} -- User features matrix P of shape (n_users, n_factors)
471 | item_features {np.ndarray} -- Item features matrix Q of shape (n_items, n_factors)
472 |         min_rating {float} -- Lowest rating possible
473 |         max_rating {float} -- Highest rating possible
474 | kernel {str} -- Kernel function. Options are 'linear', 'sigmoid', and 'rbf'
475 | gamma {float} -- Kernel coefficient for 'rbf' only
476 |         bound_ratings {bool} -- Whether to bound ratings in the range [min_rating, max_rating] (default: True)
477 |
478 | Returns:
479 |         predictions [list] -- List containing rating predictions for all user-item pairs, in the same order as the input X
480 |         predictions_possible [list] -- List of booleans indicating whether both the given user and item were seen in the data the model was fitted on
481 | """
482 | n_factors = user_features.shape[1]
483 | predictions = []
484 | predictions_possible = []
485 |
486 | for i in range(X.shape[0]):
487 | user_id, item_id = int(X[i, 0]), int(X[i, 1])
488 | user_known = user_id != -1
489 | item_known = item_id != -1
490 |
491 | # Default values if user or item are not known
492 | user_bias = user_biases[user_id] if user_known else 0
493 | item_bias = item_biases[item_id] if item_known else 0
494 | user_feature_vec = (
495 | user_features[user_id, :] if user_known else np.zeros(n_factors)
496 | )
497 | item_feature_vec = (
498 | item_features[item_id, :] if item_known else np.zeros(n_factors)
499 | )
500 |
501 | # Calculate predicted rating given kernel
502 | if kernel == "linear":
503 | rating_pred = kernel_linear(
504 | global_mean=global_mean,
505 | user_bias=user_bias,
506 | item_bias=item_bias,
507 | user_feature_vec=user_feature_vec,
508 | item_feature_vec=item_feature_vec,
509 | )
510 |
511 | elif kernel == "sigmoid":
512 | rating_pred = kernel_sigmoid(
513 | global_mean=global_mean,
514 | user_bias=user_bias,
515 | item_bias=item_bias,
516 | user_feature_vec=user_feature_vec,
517 | item_feature_vec=item_feature_vec,
518 | a=min_rating,
519 | c=max_rating - min_rating,
520 | )
521 |
522 | elif kernel == "rbf":
523 | rating_pred = kernel_rbf(
524 | user_feature_vec=user_feature_vec,
525 | item_feature_vec=item_feature_vec,
526 | gamma=gamma,
527 | a=min_rating,
528 | c=max_rating - min_rating,
529 | )
530 |
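        # Note (assumption based on the a/c arguments passed above): the sigmoid
        # and rbf kernels already squash predictions into
        # [a, a + c] = [min_rating, max_rating], so the clipping below mainly
        # matters for the unbounded linear kernel.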
531 | # Bound ratings to min and max rating range
532 | if bound_ratings:
533 | if rating_pred > max_rating:
534 | rating_pred = max_rating
535 | elif rating_pred < min_rating:
536 | rating_pred = min_rating
537 |
538 | predictions.append(rating_pred)
539 | predictions_possible.append(user_known and item_known)
540 |
541 | return predictions, predictions_possible
542 |
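# Minimal smoke-test sketch (illustrative only; the synthetic ratings below are
# made up and every parameter value is just an example). Fits KernelMF on
# random data and prints a few predictions.
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    demo_X = pd.DataFrame(
        {
            "user_id": rng.integers(0, 50, size=500),
            "item_id": rng.integers(0, 100, size=500),
        }
    )
    demo_y = pd.Series(rng.integers(1, 6, size=500).astype(np.float64))
    mf = KernelMF(n_epochs=5, n_factors=10, verbose=0)
    mf.fit(demo_X, demo_y)
    print(mf.predict(demo_X.head()))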
--------------------------------------------------------------------------------
/examples/recommender-system.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "tags": []
8 | },
9 | "outputs": [],
10 | "source": [
11 | "# Data manipulation\n",
12 | "import numpy as np\n",
13 | "import pandas as pd\n",
14 | "pd.options.display.max_rows = 100\n",
15 | "\n",
16 | "# Modeling\n",
17 | "from matrix_factorization import BaselineModel, KernelMF, train_update_test_split\n",
18 | "from sklearn.metrics import mean_squared_error\n",
19 | "from sklearn.model_selection import train_test_split\n",
20 | "\n",
21 | "# Other\n",
22 | "import os\n",
23 | "import random\n",
24 | "import sys\n",
25 | "\n",
26 | "# Reload imported code \n",
27 | "%load_ext autoreload\n",
28 | "%autoreload 2\n",
29 | "\n",
30 | "# Print all output\n",
31 | "from IPython.core.interactiveshell import InteractiveShell\n",
32 | "InteractiveShell.ast_node_interactivity = \"all\"\n",
33 | " \n",
34 | "rand_seed = 2\n",
35 | "np.random.seed(rand_seed)\n",
36 | "random.seed(rand_seed)"
37 | ]
38 | },
39 | {
40 | "cell_type": "markdown",
41 | "metadata": {},
42 | "source": [
43 | "# Load data"
44 | ]
45 | },
46 | {
47 | "cell_type": "markdown",
48 | "metadata": {},
49 | "source": [
50 | "**Movie data found here https://grouplens.org/datasets/movielens/**"
51 | ]
52 | },
53 | {
54 | "cell_type": "code",
55 | "execution_count": 2,
56 | "metadata": {},
57 | "outputs": [
58 | {
59 | "data": {
60 | "text/html": [
61 | "
\n",
62 | "\n",
75 | "
\n",
76 | " \n",
77 | " \n",
78 | " | \n",
79 | " user_id | \n",
80 | " item_id | \n",
81 | " rating | \n",
82 | "
\n",
83 | " \n",
84 | " \n",
85 | " \n",
86 | " | 0 | \n",
87 | " 196 | \n",
88 | " 242 | \n",
89 | " 3 | \n",
90 | "
\n",
91 | " \n",
92 | " | 1 | \n",
93 | " 186 | \n",
94 | " 302 | \n",
95 | " 3 | \n",
96 | "
\n",
97 | " \n",
98 | " | 2 | \n",
99 | " 22 | \n",
100 | " 377 | \n",
101 | " 1 | \n",
102 | "
\n",
103 | " \n",
104 | " | 3 | \n",
105 | " 244 | \n",
106 | " 51 | \n",
107 | " 2 | \n",
108 | "
\n",
109 | " \n",
110 | " | 4 | \n",
111 | " 166 | \n",
112 | " 346 | \n",
113 | " 1 | \n",
114 | "
\n",
115 | " \n",
116 | " | 5 | \n",
117 | " 298 | \n",
118 | " 474 | \n",
119 | " 4 | \n",
120 | "
\n",
121 | " \n",
122 | " | 6 | \n",
123 | " 115 | \n",
124 | " 265 | \n",
125 | " 2 | \n",
126 | "
\n",
127 | " \n",
128 | " | 7 | \n",
129 | " 253 | \n",
130 | " 465 | \n",
131 | " 5 | \n",
132 | "
\n",
133 | " \n",
134 | " | 8 | \n",
135 | " 305 | \n",
136 | " 451 | \n",
137 | " 3 | \n",
138 | "
\n",
139 | " \n",
140 | " | 9 | \n",
141 | " 6 | \n",
142 | " 86 | \n",
143 | " 3 | \n",
144 | "
\n",
145 | " \n",
146 | "
\n",
147 | "
"
148 | ],
149 | "text/plain": [
150 | " user_id item_id rating\n",
151 | "0 196 242 3\n",
152 | "1 186 302 3\n",
153 | "2 22 377 1\n",
154 | "3 244 51 2\n",
155 | "4 166 346 1\n",
156 | "5 298 474 4\n",
157 | "6 115 265 2\n",
158 | "7 253 465 5\n",
159 | "8 305 451 3\n",
160 | "9 6 86 3"
161 | ]
162 | },
163 | "execution_count": 2,
164 | "metadata": {},
165 | "output_type": "execute_result"
166 | }
167 | ],
168 | "source": [
169 | "cols = ['user_id', 'item_id', 'rating', 'timestamp']\n",
170 | "# movie_data = pd.read_csv('../data/ml-1m/ratings.dat', names = cols, sep = '::', usecols=[0, 1, 2], engine='python')\n",
171 | "movie_data = pd.read_csv('../data/ml-100k/u.data', names = cols, sep = '\\t', usecols=[0, 1, 2], engine='python')\n",
172 | "\n",
173 | "X = movie_data[['user_id', 'item_id']]\n",
174 | "y = movie_data['rating']\n",
175 | "\n",
176 | "# Prepare data\n",
177 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)\n",
178 | "\n",
179 | "# Prepare data for online learning\n",
180 | "X_train_initial, y_train_initial, X_train_update, y_train_update, X_test_update, y_test_update = train_update_test_split(movie_data, frac_new_users=0.2)\n",
181 | "\n",
182 | "movie_data.head(10)"
183 | ]
184 | },
185 | {
186 | "cell_type": "markdown",
187 | "metadata": {},
188 | "source": [
189 | "# Simple model with global mean"
190 | ]
191 | },
192 | {
193 | "cell_type": "markdown",
194 | "metadata": {},
195 | "source": [
196 | "This is similar to just the global standard deviation"
197 | ]
198 | },
199 | {
200 | "cell_type": "code",
201 | "execution_count": 3,
202 | "metadata": {
203 | "tags": []
204 | },
205 | "outputs": [
206 | {
207 | "name": "stdout",
208 | "output_type": "stream",
209 | "text": [
210 | "\n",
211 | "Test RMSE: 1.120652\n"
212 | ]
213 | }
214 | ],
215 | "source": [
216 | "global_mean = y_train.mean()\n",
217 | "pred = [global_mean for _ in range(y_test.shape[0])]\n",
218 | "\n",
219 | "rmse = mean_squared_error(y_test, pred, squared = False)\n",
220 | "\n",
221 | "print(f'\\nTest RMSE: {rmse:4f}')"
222 | ]
223 | },
224 | {
225 | "cell_type": "markdown",
226 | "metadata": {},
227 | "source": [
228 | "# Baseline Model with biases"
229 | ]
230 | },
231 | {
232 | "cell_type": "markdown",
233 | "metadata": {},
234 | "source": [
235 | "## SGD"
236 | ]
237 | },
238 | {
239 | "cell_type": "code",
240 | "execution_count": 4,
241 | "metadata": {},
242 | "outputs": [
243 | {
244 | "name": "stdout",
245 | "output_type": "stream",
246 | "text": [
247 | "Epoch 1 / 20 - train_rmse: 0.9685443987174238\n",
248 | "Epoch 2 / 20 - train_rmse: 0.945448032425675\n",
249 | "Epoch 3 / 20 - train_rmse: 0.9350744230954693\n",
250 | "Epoch 4 / 20 - train_rmse: 0.9294774771346712\n",
251 | "Epoch 5 / 20 - train_rmse: 0.9258635943145475\n",
252 | "Epoch 6 / 20 - train_rmse: 0.9235995589398913\n",
253 | "Epoch 7 / 20 - train_rmse: 0.9218589129974872\n",
254 | "Epoch 8 / 20 - train_rmse: 0.9205752967946901\n",
255 | "Epoch 9 / 20 - train_rmse: 0.9197497680553437\n",
256 | "Epoch 10 / 20 - train_rmse: 0.9189075470532244\n",
257 | "Epoch 11 / 20 - train_rmse: 0.9184605627485326\n",
258 | "Epoch 12 / 20 - train_rmse: 0.9180274072268116\n",
259 | "Epoch 13 / 20 - train_rmse: 0.9174771346162836\n",
260 | "Epoch 14 / 20 - train_rmse: 0.9172615435062336\n",
261 | "Epoch 15 / 20 - train_rmse: 0.9169118664096015\n",
262 | "Epoch 16 / 20 - train_rmse: 0.916762599540885\n",
263 | "Epoch 17 / 20 - train_rmse: 0.9165916401686293\n",
264 | "Epoch 18 / 20 - train_rmse: 0.9164009881488299\n",
265 | "Epoch 19 / 20 - train_rmse: 0.9161039428103391\n",
266 | "Epoch 20 / 20 - train_rmse: 0.9160441667784996\n",
267 | "\n",
268 | "Test RMSE: 0.9298\n",
269 | "Wall time: 3.25 s\n"
270 | ]
271 | }
272 | ],
273 | "source": [
274 | "%%time\n",
275 | "\n",
276 | "baseline_model = BaselineModel(method='sgd', n_epochs = 20, reg = 0.005, lr = 0.01, verbose=1)\n",
277 | "baseline_model.fit(X_train, y_train)\n",
278 | "\n",
279 | "pred = baseline_model.predict(X_test)\n",
280 | "rmse = mean_squared_error(y_test, pred, squared = False)\n",
281 | "\n",
282 | "print(f'\\nTest RMSE: {rmse:.4f}')"
283 | ]
284 | },
285 | {
286 | "cell_type": "code",
287 | "execution_count": 5,
288 | "metadata": {},
289 | "outputs": [
290 | {
291 | "data": {
292 | "text/html": [
293 | "\n",
294 | "\n",
307 | "
\n",
308 | " \n",
309 | " \n",
310 | " | \n",
311 | " user_id | \n",
312 | " item_id | \n",
313 | " rating_pred | \n",
314 | "
\n",
315 | " \n",
316 | " \n",
317 | " \n",
318 | " | 378 | \n",
319 | " 200 | \n",
320 | " 318 | \n",
321 | " 5.0 | \n",
322 | "
\n",
323 | " \n",
324 | " | 457 | \n",
325 | " 200 | \n",
326 | " 357 | \n",
327 | " 5.0 | \n",
328 | "
\n",
329 | " \n",
330 | " | 388 | \n",
331 | " 200 | \n",
332 | " 408 | \n",
333 | " 5.0 | \n",
334 | "
\n",
335 | " \n",
336 | " | 988 | \n",
337 | " 200 | \n",
338 | " 1449 | \n",
339 | " 5.0 | \n",
340 | "
\n",
341 | " \n",
342 | " | 281 | \n",
343 | " 200 | \n",
344 | " 483 | \n",
345 | " 5.0 | \n",
346 | "
\n",
347 | " \n",
348 | " | 790 | \n",
349 | " 200 | \n",
350 | " 114 | \n",
351 | " 5.0 | \n",
352 | "
\n",
353 | " \n",
354 | " | 109 | \n",
355 | " 200 | \n",
356 | " 127 | \n",
357 | " 5.0 | \n",
358 | "
\n",
359 | " \n",
360 | " | 562 | \n",
361 | " 200 | \n",
362 | " 12 | \n",
363 | " 5.0 | \n",
364 | "
\n",
365 | " \n",
366 | " | 212 | \n",
367 | " 200 | \n",
368 | " 169 | \n",
369 | " 5.0 | \n",
370 | "
\n",
371 | " \n",
372 | " | 54 | \n",
373 | " 200 | \n",
374 | " 603 | \n",
375 | " 5.0 | \n",
376 | "
\n",
377 | " \n",
378 | "
\n",
379 | "
"
380 | ],
381 | "text/plain": [
382 | " user_id item_id rating_pred\n",
383 | "378 200 318 5.0\n",
384 | "457 200 357 5.0\n",
385 | "388 200 408 5.0\n",
386 | "988 200 1449 5.0\n",
387 | "281 200 483 5.0\n",
388 | "790 200 114 5.0\n",
389 | "109 200 127 5.0\n",
390 | "562 200 12 5.0\n",
391 | "212 200 169 5.0\n",
392 | "54 200 603 5.0"
393 | ]
394 | },
395 | "execution_count": 5,
396 | "metadata": {},
397 | "output_type": "execute_result"
398 | }
399 | ],
400 | "source": [
401 | "baseline_model.recommend(user=200)"
402 | ]
403 | },
404 | {
405 | "cell_type": "markdown",
406 | "metadata": {},
407 | "source": [
408 | "## ALS"
409 | ]
410 | },
411 | {
412 | "cell_type": "code",
413 | "execution_count": 6,
414 | "metadata": {},
415 | "outputs": [
416 | {
417 | "name": "stdout",
418 | "output_type": "stream",
419 | "text": [
420 | "Epoch 1 / 20 - train_rmse: 0.9312489364350157\n",
421 | "Epoch 2 / 20 - train_rmse: 0.9144875214764501\n",
422 | "Epoch 3 / 20 - train_rmse: 0.9134856911195807\n",
423 | "Epoch 4 / 20 - train_rmse: 0.9133800448918423\n",
424 | "Epoch 5 / 20 - train_rmse: 0.9133615794862777\n",
425 | "Epoch 6 / 20 - train_rmse: 0.9133565857003941\n",
426 | "Epoch 7 / 20 - train_rmse: 0.9133544601244424\n",
427 | "Epoch 8 / 20 - train_rmse: 0.9133531004630441\n",
428 | "Epoch 9 / 20 - train_rmse: 0.9133519902067218\n",
429 | "Epoch 10 / 20 - train_rmse: 0.9133509792033206\n",
430 | "Epoch 11 / 20 - train_rmse: 0.9133500175542733\n",
431 | "Epoch 12 / 20 - train_rmse: 0.9133490869495551\n",
432 | "Epoch 13 / 20 - train_rmse: 0.9133481801287349\n",
433 | "Epoch 14 / 20 - train_rmse: 0.9133472939684136\n",
434 | "Epoch 15 / 20 - train_rmse: 0.9133464269599311\n",
435 | "Epoch 16 / 20 - train_rmse: 0.9133455782426871\n",
436 | "Epoch 17 / 20 - train_rmse: 0.9133447472230197\n",
437 | "Epoch 18 / 20 - train_rmse: 0.9133439334215674\n",
438 | "Epoch 19 / 20 - train_rmse: 0.9133431364114416\n",
439 | "Epoch 20 / 20 - train_rmse: 0.9133423557930989\n",
440 | "\n",
441 | "Test RMSE: 0.9294\n",
442 | "Wall time: 1.17 s\n"
443 | ]
444 | }
445 | ],
446 | "source": [
447 | "%%time\n",
448 | "\n",
449 | "baseline_model = BaselineModel(method='als', n_epochs = 20, reg = 0.5, verbose=1)\n",
450 | "baseline_model.fit(X_train, y_train)\n",
451 | "\n",
452 | "pred = baseline_model.predict(X_test)\n",
453 | "rmse = mean_squared_error(y_test, pred, squared = False)\n",
454 | "\n",
455 | "print(f'\\nTest RMSE: {rmse:.4f}')"
456 | ]
457 | },
458 | {
459 | "cell_type": "markdown",
460 | "metadata": {},
461 | "source": [
462 | "## Updating with new users"
463 | ]
464 | },
465 | {
466 | "cell_type": "code",
467 | "execution_count": 7,
468 | "metadata": {
469 | "tags": []
470 | },
471 | "outputs": [
472 | {
473 | "name": "stdout",
474 | "output_type": "stream",
475 | "text": [
476 | "Epoch 1 / 20 - train_rmse: 0.9650236406922229\n",
477 | "Epoch 2 / 20 - train_rmse: 0.9428226226596799\n",
478 | "Epoch 3 / 20 - train_rmse: 0.9331705124882925\n",
479 | "Epoch 4 / 20 - train_rmse: 0.9279749973416741\n",
480 | "Epoch 5 / 20 - train_rmse: 0.9247974571263335\n",
481 | "Epoch 6 / 20 - train_rmse: 0.9226517575035114\n",
482 | "Epoch 7 / 20 - train_rmse: 0.920835039334346\n",
483 | "Epoch 8 / 20 - train_rmse: 0.9197367786245378\n",
484 | "Epoch 9 / 20 - train_rmse: 0.9189681287833118\n",
485 | "Epoch 10 / 20 - train_rmse: 0.9181493468113285\n",
486 | "Epoch 11 / 20 - train_rmse: 0.9177119438426637\n",
487 | "Epoch 12 / 20 - train_rmse: 0.9172589415232193\n",
488 | "Epoch 13 / 20 - train_rmse: 0.9168827001131301\n",
489 | "Epoch 14 / 20 - train_rmse: 0.9164445680503323\n",
490 | "Epoch 15 / 20 - train_rmse: 0.9164404466859075\n",
491 | "Epoch 16 / 20 - train_rmse: 0.9160093360322635\n",
492 | "Epoch 17 / 20 - train_rmse: 0.9158025569643043\n",
493 | "Epoch 18 / 20 - train_rmse: 0.9157375955425434\n",
494 | "Epoch 19 / 20 - train_rmse: 0.9156845197413601\n",
495 | "Epoch 20 / 20 - train_rmse: 0.9153536272183195\n"
496 | ]
497 | },
498 | {
499 | "data": {
500 | "text/plain": [
501 | "BaselineModel(n_epochs=20, reg=0.05)"
502 | ]
503 | },
504 | "execution_count": 7,
505 | "metadata": {},
506 | "output_type": "execute_result"
507 | }
508 | ],
509 | "source": [
510 | "baseline_model = BaselineModel(method='sgd', n_epochs = 20, lr=0.01, reg = 0.05, verbose=1)\n",
511 | "baseline_model.fit(X_train_initial, y_train_initial)"
512 | ]
513 | },
514 | {
515 | "cell_type": "code",
516 | "execution_count": 8,
517 | "metadata": {
518 | "tags": []
519 | },
520 | "outputs": [
521 | {
522 | "name": "stdout",
523 | "output_type": "stream",
524 | "text": [
525 | "Epoch 1 / 20 - train_rmse: 1.0192369838658015\n",
526 | "Epoch 2 / 20 - train_rmse: 1.0025765882013635\n",
527 | "Epoch 3 / 20 - train_rmse: 0.9901259692095271\n",
528 | "Epoch 4 / 20 - train_rmse: 0.9807144030582827\n",
529 | "Epoch 5 / 20 - train_rmse: 0.9734408997442995\n",
530 | "Epoch 6 / 20 - train_rmse: 0.9677156773644434\n",
531 | "Epoch 7 / 20 - train_rmse: 0.9631030982793267\n",
532 | "Epoch 8 / 20 - train_rmse: 0.9593444020925831\n",
533 | "Epoch 9 / 20 - train_rmse: 0.9562283345776661\n",
534 | "Epoch 10 / 20 - train_rmse: 0.9536075629675317\n",
535 | "Epoch 11 / 20 - train_rmse: 0.9513672180603409\n",
536 | "Epoch 12 / 20 - train_rmse: 0.9494208315066158\n",
537 | "Epoch 13 / 20 - train_rmse: 0.9477253749191763\n",
538 | "Epoch 14 / 20 - train_rmse: 0.946229927618241\n",
539 | "Epoch 15 / 20 - train_rmse: 0.9449080911468511\n",
540 | "Epoch 16 / 20 - train_rmse: 0.943720843305453\n",
541 | "Epoch 17 / 20 - train_rmse: 0.9426516413656599\n",
542 | "Epoch 18 / 20 - train_rmse: 0.9416762680286268\n",
543 | "Epoch 19 / 20 - train_rmse: 0.9407955983703769\n",
544 | "Epoch 20 / 20 - train_rmse: 0.9399846956755161\n",
545 | "\n",
546 | "Test RMSE: 0.9484\n",
547 | "Wall time: 965 ms\n"
548 | ]
549 | }
550 | ],
551 | "source": [
552 | "%%time\n",
553 | "baseline_model.update_users(X_train_update, y_train_update, n_epochs=20, lr=0.001, verbose=1)\n",
554 | "pred = baseline_model.predict(X_test_update)\n",
555 | "rmse = mean_squared_error(y_test_update, pred, squared = False)\n",
556 | "\n",
557 | "print(f'\\nTest RMSE: {rmse:.4f}')"
558 | ]
559 | },
560 | {
561 | "cell_type": "markdown",
562 | "metadata": {},
563 | "source": [
564 | "# Matrix Factorization"
565 | ]
566 | },
567 | {
568 | "cell_type": "markdown",
569 | "metadata": {},
570 | "source": [
571 | "## Linear Kernel"
572 | ]
573 | },
574 | {
575 | "cell_type": "code",
576 | "execution_count": 9,
577 | "metadata": {},
578 | "outputs": [
579 | {
580 | "name": "stdout",
581 | "output_type": "stream",
582 | "text": [
583 | "Epoch 1 / 20 - train_rmse: 1.0801330309911932\n",
584 | "Epoch 2 / 20 - train_rmse: 1.0473476509450943\n",
585 | "Epoch 3 / 20 - train_rmse: 1.0244646832888804\n",
586 | "Epoch 4 / 20 - train_rmse: 1.0074920647400105\n",
587 | "Epoch 5 / 20 - train_rmse: 0.994246835724601\n",
588 | "Epoch 6 / 20 - train_rmse: 0.9835051043916838\n",
589 | "Epoch 7 / 20 - train_rmse: 0.9745225390156432\n",
590 | "Epoch 8 / 20 - train_rmse: 0.9668223717422572\n",
591 | "Epoch 9 / 20 - train_rmse: 0.9600683414209181\n",
592 | "Epoch 10 / 20 - train_rmse: 0.9540555205061302\n",
593 | "Epoch 11 / 20 - train_rmse: 0.9486137679667849\n",
594 | "Epoch 12 / 20 - train_rmse: 0.9436380921221055\n",
595 | "Epoch 13 / 20 - train_rmse: 0.9390299858326666\n",
596 | "Epoch 14 / 20 - train_rmse: 0.9347250023203936\n",
597 | "Epoch 15 / 20 - train_rmse: 0.9306721252709302\n",
598 | "Epoch 16 / 20 - train_rmse: 0.9268329678953544\n",
599 | "Epoch 17 / 20 - train_rmse: 0.9231713443339361\n",
600 | "Epoch 18 / 20 - train_rmse: 0.919660317751421\n",
601 | "Epoch 19 / 20 - train_rmse: 0.9162775396770947\n",
602 | "Epoch 20 / 20 - train_rmse: 0.9130048063578868\n",
603 | "\n",
604 | "Test RMSE: 0.9534\n",
605 | "Wall time: 15.7 s\n"
606 | ]
607 | }
608 | ],
609 | "source": [
610 | "%%time \n",
611 | "matrix_fact = KernelMF(n_epochs = 20, n_factors = 100, verbose = 1, lr = 0.001, reg = 0.005)\n",
612 | "matrix_fact.fit(X_train, y_train)\n",
613 | "\n",
614 | "pred = matrix_fact.predict(X_test)\n",
615 | "rmse = mean_squared_error(y_test, pred, squared = False)\n",
616 | "\n",
617 | "print(f'\\nTest RMSE: {rmse:.4f}')"
618 | ]
619 | },
620 | {
621 | "cell_type": "markdown",
622 | "metadata": {},
623 | "source": [
624 | "## Getting list of recommendations for a user"
625 | ]
626 | },
627 | {
628 | "cell_type": "code",
629 | "execution_count": 10,
630 | "metadata": {},
631 | "outputs": [
632 | {
633 | "data": {
634 | "text/html": [
635 | "\n",
636 | "\n",
649 | "
\n",
650 | " \n",
651 | " \n",
652 | " | \n",
653 | " user_id | \n",
654 | " item_id | \n",
655 | " rating_pred | \n",
656 | "
\n",
657 | " \n",
658 | " \n",
659 | " \n",
660 | " | 37 | \n",
661 | " 200 | \n",
662 | " 64 | \n",
663 | " 5.000000 | \n",
664 | "
\n",
665 | " \n",
666 | " | 242 | \n",
667 | " 200 | \n",
668 | " 357 | \n",
669 | " 4.953382 | \n",
670 | "
\n",
671 | " \n",
672 | " | 11 | \n",
673 | " 200 | \n",
674 | " 127 | \n",
675 | " 4.914760 | \n",
676 | "
\n",
677 | " \n",
678 | " | 61 | \n",
679 | " 200 | \n",
680 | " 272 | \n",
681 | " 4.904249 | \n",
682 | "
\n",
683 | " \n",
684 | " | 710 | \n",
685 | " 200 | \n",
686 | " 479 | \n",
687 | " 4.837060 | \n",
688 | "
\n",
689 | " \n",
690 | " | 395 | \n",
691 | " 200 | \n",
692 | " 480 | \n",
693 | " 4.836522 | \n",
694 | "
\n",
695 | " \n",
696 | " | 275 | \n",
697 | " 200 | \n",
698 | " 12 | \n",
699 | " 4.816657 | \n",
700 | "
\n",
701 | " \n",
702 | " | 655 | \n",
703 | " 200 | \n",
704 | " 427 | \n",
705 | " 4.808555 | \n",
706 | "
\n",
707 | " \n",
708 | " | 55 | \n",
709 | " 200 | \n",
710 | " 511 | \n",
711 | " 4.804192 | \n",
712 | "
\n",
713 | " \n",
714 | " | 144 | \n",
715 | " 200 | \n",
716 | " 285 | \n",
717 | " 4.797472 | \n",
718 | "
\n",
719 | " \n",
720 | "
\n",
721 | "
"
722 | ],
723 | "text/plain": [
724 | " user_id item_id rating_pred\n",
725 | "37 200 64 5.000000\n",
726 | "242 200 357 4.953382\n",
727 | "11 200 127 4.914760\n",
728 | "61 200 272 4.904249\n",
729 | "710 200 479 4.837060\n",
730 | "395 200 480 4.836522\n",
731 | "275 200 12 4.816657\n",
732 | "655 200 427 4.808555\n",
733 | "55 200 511 4.804192\n",
734 | "144 200 285 4.797472"
735 | ]
736 | },
737 | "execution_count": 10,
738 | "metadata": {},
739 | "output_type": "execute_result"
740 | }
741 | ],
742 | "source": [
743 | "user = 200\n",
744 | "items_known = X_train.query('user_id == @user')['item_id']\n",
745 | "matrix_fact.recommend(user=user, items_known=items_known)"
746 | ]
747 | },
748 | {
749 | "cell_type": "markdown",
750 | "metadata": {},
751 | "source": [
752 | "## Updating with new users"
753 | ]
754 | },
755 | {
756 | "cell_type": "code",
757 | "execution_count": 11,
758 | "metadata": {},
759 | "outputs": [
760 | {
761 | "name": "stdout",
762 | "output_type": "stream",
763 | "text": [
764 | "Epoch 1 / 20 - train_rmse: 1.0706518319392073\n",
765 | "Epoch 2 / 20 - train_rmse: 1.0382624779438394\n",
766 | "Epoch 3 / 20 - train_rmse: 1.016232308328001\n",
767 | "Epoch 4 / 20 - train_rmse: 0.9999366805279928\n",
768 | "Epoch 5 / 20 - train_rmse: 0.9872308367922817\n",
769 | "Epoch 6 / 20 - train_rmse: 0.9769357406601346\n",
770 | "Epoch 7 / 20 - train_rmse: 0.9683129631342807\n",
771 | "Epoch 8 / 20 - train_rmse: 0.9609022720622064\n",
772 | "Epoch 9 / 20 - train_rmse: 0.9543972792347011\n",
773 | "Epoch 10 / 20 - train_rmse: 0.9485809462916166\n",
774 | "Epoch 11 / 20 - train_rmse: 0.9433059630075376\n",
775 | "Epoch 12 / 20 - train_rmse: 0.9384619306949283\n",
776 | "Epoch 13 / 20 - train_rmse: 0.9339642725110164\n",
777 | "Epoch 14 / 20 - train_rmse: 0.9297503741854064\n",
778 | "Epoch 15 / 20 - train_rmse: 0.9257711482478324\n",
779 | "Epoch 16 / 20 - train_rmse: 0.921985910287917\n",
780 | "Epoch 17 / 20 - train_rmse: 0.9183647974387779\n",
781 | "Epoch 18 / 20 - train_rmse: 0.9148839852245906\n",
782 | "Epoch 19 / 20 - train_rmse: 0.9115179356050906\n",
783 | "Epoch 20 / 20 - train_rmse: 0.9082510051903396\n"
784 | ]
785 | },
786 | {
787 | "data": {
788 | "text/plain": [
789 | "KernelMF(gamma=0.01, lr=0.001, n_epochs=20, reg=0.005)"
790 | ]
791 | },
792 | "execution_count": 11,
793 | "metadata": {},
794 | "output_type": "execute_result"
795 | }
796 | ],
797 | "source": [
798 | "matrix_fact = KernelMF(n_epochs = 20, n_factors = 100, verbose = 1, lr = 0.001, reg = 0.005)\n",
799 | "matrix_fact.fit(X_train_initial, y_train_initial)"
800 | ]
801 | },
802 | {
803 | "cell_type": "code",
804 | "execution_count": 12,
805 | "metadata": {},
806 | "outputs": [
807 | {
808 | "name": "stdout",
809 | "output_type": "stream",
810 | "text": [
811 | "Epoch 1 / 20 - train_rmse: 1.0397682004761137\n",
812 | "Epoch 2 / 20 - train_rmse: 1.0204525249402976\n",
813 | "Epoch 3 / 20 - train_rmse: 1.0058106710178145\n",
814 | "Epoch 4 / 20 - train_rmse: 0.9945551189184298\n",
815 | "Epoch 5 / 20 - train_rmse: 0.9856789782783212\n",
816 | "Epoch 6 / 20 - train_rmse: 0.9785788434701258\n",
817 | "Epoch 7 / 20 - train_rmse: 0.9727422998246427\n",
818 | "Epoch 8 / 20 - train_rmse: 0.9678438078577599\n",
819 | "Epoch 9 / 20 - train_rmse: 0.9636632891501984\n",
820 | "Epoch 10 / 20 - train_rmse: 0.9600308660297464\n",
821 | "Epoch 11 / 20 - train_rmse: 0.9568136464702428\n",
822 | "Epoch 12 / 20 - train_rmse: 0.9539161652784045\n",
823 | "Epoch 13 / 20 - train_rmse: 0.9512904364030054\n",
824 | "Epoch 14 / 20 - train_rmse: 0.9488745417666238\n",
825 | "Epoch 15 / 20 - train_rmse: 0.9466285136632905\n",
826 | "Epoch 16 / 20 - train_rmse: 0.94452616338993\n",
827 | "Epoch 17 / 20 - train_rmse: 0.9425492611358841\n",
828 | "Epoch 18 / 20 - train_rmse: 0.9406751136767649\n",
829 | "Epoch 19 / 20 - train_rmse: 0.9388943623139107\n",
830 | "Epoch 20 / 20 - train_rmse: 0.9371880494897803\n",
831 | "\n",
832 | "Test RMSE: 0.9677\n",
833 | "Wall time: 2.01 s\n"
834 | ]
835 | }
836 | ],
837 | "source": [
838 | "%%time\n",
839 | "# Update model with new users\n",
840 | "matrix_fact.update_users(X_train_update, y_train_update, lr=0.001, n_epochs=20, verbose=1)\n",
841 | "pred = matrix_fact.predict(X_test_update)\n",
842 | "rmse = mean_squared_error(y_test_update, pred, squared = False)\n",
843 | "\n",
844 | "print(f'\\nTest RMSE: {rmse:.4f}')"
845 | ]
846 | },
847 | {
848 | "cell_type": "markdown",
849 | "metadata": {},
850 | "source": [
851 | "## Sigmoid kernel"
852 | ]
853 | },
854 | {
855 | "cell_type": "code",
856 | "execution_count": 13,
857 | "metadata": {},
858 | "outputs": [
859 | {
860 | "name": "stdout",
861 | "output_type": "stream",
862 | "text": [
863 | "Epoch 1 / 20 - train_rmse: 1.7254842363611376\n",
864 | "Epoch 2 / 20 - train_rmse: 1.700347578847924\n",
865 | "Epoch 3 / 20 - train_rmse: 1.6622359141199023\n",
866 | "Epoch 4 / 20 - train_rmse: 1.6210456578773018\n",
867 | "Epoch 5 / 20 - train_rmse: 1.5756597449133936\n",
868 | "Epoch 6 / 20 - train_rmse: 1.523379818791774\n",
869 | "Epoch 7 / 20 - train_rmse: 1.4657317754887915\n",
870 | "Epoch 8 / 20 - train_rmse: 1.4093479432787581\n",
871 | "Epoch 9 / 20 - train_rmse: 1.358332738938575\n",
872 | "Epoch 10 / 20 - train_rmse: 1.3133318818212163\n",
873 | "Epoch 11 / 20 - train_rmse: 1.2739396811494321\n",
874 | "Epoch 12 / 20 - train_rmse: 1.2393300382279362\n",
875 | "Epoch 13 / 20 - train_rmse: 1.2087120677746743\n",
876 | "Epoch 14 / 20 - train_rmse: 1.181458675550588\n",
877 | "Epoch 15 / 20 - train_rmse: 1.1570300259298787\n",
878 | "Epoch 16 / 20 - train_rmse: 1.1349358601708097\n",
879 | "Epoch 17 / 20 - train_rmse: 1.114946996505043\n",
880 | "Epoch 18 / 20 - train_rmse: 1.0966573702646067\n",
881 | "Epoch 19 / 20 - train_rmse: 1.079843880247601\n",
882 | "Epoch 20 / 20 - train_rmse: 1.0642701656384883\n",
883 | "\n",
884 | "Test RMSE: 1.1110\n",
885 | "Wall time: 1.77 s\n"
886 | ]
887 | }
888 | ],
889 | "source": [
890 | "%%time \n",
891 | "matrix_fact = KernelMF(n_epochs = 20, n_factors = 100, verbose = 1, lr = 0.01, reg = 0.005, kernel='sigmoid')\n",
892 | "matrix_fact.fit(X_train, y_train)\n",
893 | "\n",
894 | "pred = matrix_fact.predict(X_test)\n",
895 | "rmse = mean_squared_error(y_test, pred, squared = False)\n",
896 | "\n",
897 | "print(f'\\nTest RMSE: {rmse:.4f}')"
898 | ]
899 | },
900 | {
901 | "cell_type": "markdown",
902 | "metadata": {},
903 | "source": [
904 | "## RBF Kernel"
905 | ]
906 | },
907 | {
908 | "cell_type": "code",
909 | "execution_count": 14,
910 | "metadata": {},
911 | "outputs": [
912 | {
913 | "name": "stdout",
914 | "output_type": "stream",
915 | "text": [
916 | "Epoch 1 / 20 - train_rmse: 1.261497709751721\n",
917 | "Epoch 2 / 20 - train_rmse: 1.1098240081612984\n",
918 | "Epoch 3 / 20 - train_rmse: 1.0469994987862579\n",
919 | "Epoch 4 / 20 - train_rmse: 1.005181914551291\n",
920 | "Epoch 5 / 20 - train_rmse: 0.9752579187861348\n",
921 | "Epoch 6 / 20 - train_rmse: 0.9515686603321364\n",
922 | "Epoch 7 / 20 - train_rmse: 0.9340638617221303\n",
923 | "Epoch 8 / 20 - train_rmse: 0.9213238773972364\n",
924 | "Epoch 9 / 20 - train_rmse: 0.9115143003092134\n",
925 | "Epoch 10 / 20 - train_rmse: 0.9039437993331968\n",
926 | "Epoch 11 / 20 - train_rmse: 0.899792715730062\n",
927 | "Epoch 12 / 20 - train_rmse: 0.8949836709174682\n",
928 | "Epoch 13 / 20 - train_rmse: 0.8934174679325033\n",
929 | "Epoch 14 / 20 - train_rmse: 0.8897947618902249\n",
930 | "Epoch 15 / 20 - train_rmse: 0.8861334672817339\n",
931 | "Epoch 16 / 20 - train_rmse: 0.8850958002049469\n",
932 | "Epoch 17 / 20 - train_rmse: 0.883513182070616\n",
933 | "Epoch 18 / 20 - train_rmse: 0.8818590959179743\n",
934 | "Epoch 19 / 20 - train_rmse: 0.8817834058789318\n",
935 | "Epoch 20 / 20 - train_rmse: 0.8826416261286896\n",
936 | "\n",
937 | "Test RMSE: 0.9696\n",
938 | "Wall time: 3.55 s\n"
939 | ]
940 | }
941 | ],
942 | "source": [
943 | "%%time \n",
944 | "matrix_fact = KernelMF(n_epochs = 20, n_factors = 100, verbose = 1, lr = 0.5, reg = 0.005, kernel='rbf')\n",
945 | "matrix_fact.fit(X_train, y_train)\n",
946 | "\n",
947 | "pred = matrix_fact.predict(X_test)\n",
948 | "rmse = mean_squared_error(y_test, pred, squared = False)\n",
949 | "\n",
950 | "print(f'\\nTest RMSE: {rmse:.4f}')"
951 | ]
952 | },
953 | {
954 | "cell_type": "markdown",
955 | "metadata": {},
956 | "source": [
957 | "# Scikit-learn compatability"
958 | ]
959 | },
960 | {
961 | "cell_type": "code",
962 | "execution_count": 15,
963 | "metadata": {},
964 | "outputs": [
965 | {
966 | "name": "stdout",
967 | "output_type": "stream",
968 | "text": [
969 | "Fitting 5 folds for each of 81 candidates, totalling 405 fits\n"
970 | ]
971 | },
972 | {
973 | "name": "stderr",
974 | "output_type": "stream",
975 | "text": [
976 | "[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.\n",
977 | "[Parallel(n_jobs=-1)]: Done 42 tasks | elapsed: 36.8s\n",
978 | "[Parallel(n_jobs=-1)]: Done 192 tasks | elapsed: 1.5min\n",
979 | "[Parallel(n_jobs=-1)]: Done 405 out of 405 | elapsed: 3.4min finished\n"
980 | ]
981 | },
982 | {
983 | "data": {
984 | "text/plain": [
985 | "GridSearchCV(cv=5, estimator=KernelMF(gamma=0.01, verbose=0), n_jobs=-1,\n",
986 | " param_grid={'kernel': ['linear', 'sigmoid', 'rbf'],\n",
987 | " 'n_epochs': [10, 20, 50], 'n_factors': [10, 20, 50],\n",
988 | " 'reg': [0, 0.005, 0.1]},\n",
989 | " scoring='neg_root_mean_squared_error', verbose=1)"
990 | ]
991 | },
992 | "execution_count": 15,
993 | "metadata": {},
994 | "output_type": "execute_result"
995 | }
996 | ],
997 | "source": [
998 | "from sklearn.model_selection import GridSearchCV, ParameterGrid\n",
999 | "\n",
1000 | "param_grid = {\n",
1001 | " 'kernel': ['linear', 'sigmoid', 'rbf'],\n",
1002 | " 'n_factors': [10, 20, 50],\n",
1003 | " 'n_epochs': [10, 20, 50],\n",
1004 | " 'reg': [0, 0.005, 0.1]\n",
1005 | "}\n",
1006 | "\n",
1007 | "grid_search = GridSearchCV(KernelMF(verbose=0), scoring = 'neg_root_mean_squared_error', param_grid=param_grid, n_jobs=-1, cv=5, verbose=1)\n",
1008 | "grid_search.fit(X_train, y_train)"
1009 | ]
1010 | },
1011 | {
1012 | "cell_type": "code",
1013 | "execution_count": 16,
1014 | "metadata": {},
1015 | "outputs": [
1016 | {
1017 | "data": {
1018 | "text/plain": [
1019 | "-0.9252872735695155"
1020 | ]
1021 | },
1022 | "execution_count": 16,
1023 | "metadata": {},
1024 | "output_type": "execute_result"
1025 | },
1026 | {
1027 | "data": {
1028 | "text/plain": [
1029 | "{'kernel': 'linear', 'n_epochs': 50, 'n_factors': 50, 'reg': 0.1}"
1030 | ]
1031 | },
1032 | "execution_count": 16,
1033 | "metadata": {},
1034 | "output_type": "execute_result"
1035 | }
1036 | ],
1037 | "source": [
1038 | "grid_search.best_score_\n",
1039 | "grid_search.best_params_"
1040 | ]
1041 | }
1042 | ],
1043 | "metadata": {
1044 | "kernelspec": {
1045 | "display_name": "Python [conda env:recommend]",
1046 | "language": "python",
1047 | "name": "conda-env-recommend-py"
1048 | },
1049 | "language_info": {
1050 | "codemirror_mode": {
1051 | "name": "ipython",
1052 | "version": 3
1053 | },
1054 | "file_extension": ".py",
1055 | "mimetype": "text/x-python",
1056 | "name": "python",
1057 | "nbconvert_exporter": "python",
1058 | "pygments_lexer": "ipython3",
1059 | "version": "3.7.7"
1060 | },
1061 | "toc-autonumbering": true,
1062 | "toc-showcode": false,
1063 | "toc-showmarkdowntxt": false,
1064 | "toc-showtags": false
1065 | },
1066 | "nbformat": 4,
1067 | "nbformat_minor": 4
1068 | }
1069 |
--------------------------------------------------------------------------------