├── setup.cfg
├── requirements.txt
├── matrix_factorization
│   ├── __init__.py
│   ├── utils.py
│   ├── recommender_base.py
│   ├── kernels.py
│   ├── baseline_model.py
│   └── kernel_matrix_factorization.py
├── LICENSE
├── setup.py
├── examples
│   ├── example.py
│   └── recommender-system.ipynb
├── README.md
└── .gitignore
/setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numba>=0.49.1 2 | numpy>=1.18.5 3 | pandas>=1.0.4 4 | scikit-learn>=0.23.1 5 | scipy>=1.4.1 -------------------------------------------------------------------------------- /matrix_factorization/__init__.py: -------------------------------------------------------------------------------- 1 | from .baseline_model import BaselineModel 2 | from .kernel_matrix_factorization import KernelMF 3 | from .recommender_base import RecommenderBase 4 | from .utils import train_update_test_split 5 | 6 | __all__ = [ 7 | "BaselineModel", 8 | "KernelMF", 9 | "RecommenderBase", 10 | "train_update_test_split", 11 | ] 12 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Quang-Vinh Do 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import setuptools 3 | 4 | 5 | readme_path = os.path.join(os.path.dirname(__file__), "README.md") 6 | with open(readme_path) as f: 7 | long_description = f.read() 8 | 9 | 10 | setuptools.setup( 11 | name="matrix_factorization", 12 | version="1.3", 13 | author="Quang-Vinh Do", 14 | author_email="qdo086@uottawa.ca", 15 | description="Library for matrix factorization for recommender systems using collaborative filtering", 16 | long_description=long_description, 17 | long_description_content_type="text/markdown", 18 | url="https://github.com/Quang-Vinh/matrix-factorization", 19 | download_url="https://github.com/Quang-Vinh/matrix-factorization/archive/v1.3.tar.gz", 20 | license="MIT", 21 | packages=setuptools.find_packages(), 22 | classifiers=[ 23 | "Programming Language :: Python :: 3", 24 | "Operating System :: OS Independent", 25 | "License :: OSI Approved :: MIT License", 26 | ], 27 | python_requires=">=3.6", 28 | install_requires=[ 29 | "numba>=0.49.1", 30 | "numpy>=1.18.5", 31 | "pandas>=1.0.4", 32 | "scikit-learn>=0.23.1", 33 | "scipy>=1.4.1", 34 | ], 35 | ) 36 | -------------------------------------------------------------------------------- /examples/example.py: -------------------------------------------------------------------------------- 1 | from matrix_factorization import BaselineModel, KernelMF, train_update_test_split 2 | 3 | import pandas as pd 4 | from sklearn.metrics import mean_squared_error 5 | 6 | # Movie data found here https://grouplens.org/datasets/movielens/ 7 | cols = ["user_id", "item_id", "rating", "timestamp"] 8 | movie_data = pd.read_csv( 9 | "../data/ml-100k/u.data", names=cols, sep="\t", usecols=[0, 1, 2], engine="python" 10 | ) 11 | 12 | X = movie_data[["user_id", "item_id"]] 13 | y = movie_data["rating"] 14 | 15 | # Prepare data for online learning 16 | ( 17 | X_train_initial, 18 | y_train_initial, 19 | X_train_update, 20 | y_train_update, 21 | X_test_update, 22 | y_test_update, 23 | ) = train_update_test_split(movie_data, frac_new_users=0.2) 24 | 25 | # Initial training 26 | matrix_fact = KernelMF(n_epochs=20, n_factors=100, verbose=1, lr=0.001, reg=0.005) 27 | matrix_fact.fit(X_train_initial, y_train_initial) 28 | 29 | # Update model with new users 30 | matrix_fact.update_users( 31 | X_train_update, y_train_update, lr=0.001, n_epochs=20, verbose=1 32 | ) 33 | pred = matrix_fact.predict(X_test_update) 34 | rmse = mean_squared_error(y_test_update, pred, squared=False) 35 | print(f"\nTest RMSE: {rmse:.4f}") 36 | 37 | # Get recommendations 38 | user = 200 39 | items_known = X_train_initial.query("user_id == @user")["item_id"] 40 | matrix_fact.recommend(user=user, items_known=items_known) 41 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Matrix Factorization 2 | Short and simple implementation of kernel matrix factorization with online-updating for use in collaborative recommender systems built on top of scikit-learn. 
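With the default linear kernel a rating is modelled as r_ui = global_mean + b_u + b_i + p_u · q_i (the global mean plus user and item biases plus the dot product of the latent factor vectors); the sigmoid and RBF kernels rescale their output into the [min_rating, max_rating] range (see matrix_factorization/kernels.py).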
3 | 4 | ## Prerequisites 5 | - Python 3 6 | - numba 7 | - numpy 8 | - pandas 9 | - scikit-learn 10 | - scipy 11 | 12 | ## Installation 13 | ``` 14 | pip install matrix_factorization 15 | ``` 16 | 17 | ## Usage 18 | ```python 19 | from matrix_factorization import BaselineModel, KernelMF, train_update_test_split 20 | 21 | import pandas as pd 22 | from sklearn.metrics import mean_squared_error 23 | 24 | # Movie data found here https://grouplens.org/datasets/movielens/ 25 | cols = ["user_id", "item_id", "rating", "timestamp"] 26 | movie_data = pd.read_csv( 27 | "../data/ml-100k/u.data", names=cols, sep="\t", usecols=[0, 1, 2], engine="python" 28 | ) 29 | 30 | X = movie_data[["user_id", "item_id"]] 31 | y = movie_data["rating"] 32 | 33 | # Prepare data for online learning 34 | ( 35 | X_train_initial, 36 | y_train_initial, 37 | X_train_update, 38 | y_train_update, 39 | X_test_update, 40 | y_test_update, 41 | ) = train_update_test_split(movie_data, frac_new_users=0.2) 42 | 43 | # Initial training 44 | matrix_fact = KernelMF(n_epochs=20, n_factors=100, verbose=1, lr=0.001, reg=0.005) 45 | matrix_fact.fit(X_train_initial, y_train_initial) 46 | 47 | # Update model with new users 48 | matrix_fact.update_users( 49 | X_train_update, y_train_update, lr=0.001, n_epochs=20, verbose=1 50 | ) 51 | pred = matrix_fact.predict(X_test_update) 52 | rmse = mean_squared_error(y_test_update, pred, squared=False) 53 | print(f"\nTest RMSE: {rmse:.4f}") 54 | 55 | # Get recommendations 56 | user = 200 57 | items_known = X_train_initial.query("user_id == @user")["item_id"] 58 | matrix_fact.recommend(user=user, items_known=items_known) 59 | ``` 60 | 61 | See examples/recommender-system.ipynb for complete examples. 62 | 63 | ## License 64 | This project is licensed under the MIT License. 65 | 66 | 67 | ## References :book: 68 | - Steffen Rendle, Lars Schmidt-Thieme. Online-updating regularized kernel matrix factorization models for large-scale recommender systems. https://dl.acm.org/doi/10.1145/1454008.1454047 -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # data 2 | /data 3 | 4 | .gitattributes 5 | 6 | # vs code 7 | .vscode 8 | 9 | # Jupyter notebook 10 | .virtual_documents 11 | 12 | # Byte-compiled / optimized / DLL files 13 | __pycache__/ 14 | *.py[cod] 15 | *$py.class 16 | 17 | # C extensions 18 | *.so 19 | 20 | # Distribution / packaging 21 | .Python 22 | build/ 23 | develop-eggs/ 24 | dist/ 25 | downloads/ 26 | eggs/ 27 | .eggs/ 28 | lib/ 29 | lib64/ 30 | parts/ 31 | sdist/ 32 | var/ 33 | wheels/ 34 | pip-wheel-metadata/ 35 | share/python-wheels/ 36 | *.egg-info/ 37 | .installed.cfg 38 | *.egg 39 | MANIFEST 40 | 41 | # PyInstaller 42 | # Usually these files are written by a python script from a template 43 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
44 | *.manifest 45 | *.spec 46 | 47 | # Installer logs 48 | pip-log.txt 49 | pip-delete-this-directory.txt 50 | 51 | # Unit test / coverage reports 52 | htmlcov/ 53 | .tox/ 54 | .nox/ 55 | .coverage 56 | .coverage.* 57 | .cache 58 | nosetests.xml 59 | coverage.xml 60 | *.cover 61 | *.py,cover 62 | .hypothesis/ 63 | .pytest_cache/ 64 | 65 | # Translations 66 | *.mo 67 | *.pot 68 | 69 | # Django stuff: 70 | *.log 71 | local_settings.py 72 | db.sqlite3 73 | db.sqlite3-journal 74 | 75 | # Flask stuff: 76 | instance/ 77 | .webassets-cache 78 | 79 | # Scrapy stuff: 80 | .scrapy 81 | 82 | # Sphinx documentation 83 | docs/_build/ 84 | 85 | # PyBuilder 86 | target/ 87 | 88 | # Jupyter Notebook 89 | .ipynb_checkpoints 90 | 91 | # IPython 92 | profile_default/ 93 | ipython_config.py 94 | 95 | # pyenv 96 | .python-version 97 | 98 | # pipenv 99 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 100 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 101 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 102 | # install all needed dependencies. 103 | #Pipfile.lock 104 | 105 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 106 | __pypackages__/ 107 | 108 | # Celery stuff 109 | celerybeat-schedule 110 | celerybeat.pid 111 | 112 | # SageMath parsed files 113 | *.sage.py 114 | 115 | # Environments 116 | .env 117 | .venv 118 | env/ 119 | venv/ 120 | ENV/ 121 | env.bak/ 122 | venv.bak/ 123 | 124 | # Spyder project settings 125 | .spyderproject 126 | .spyproject 127 | 128 | # Rope project settings 129 | .ropeproject 130 | 131 | # mkdocs documentation 132 | /site 133 | 134 | # mypy 135 | .mypy_cache/ 136 | .dmypy.json 137 | dmypy.json 138 | 139 | # Pyre type checker 140 | .pyre/ 141 | -------------------------------------------------------------------------------- /matrix_factorization/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from sklearn.model_selection import train_test_split 4 | 5 | from typing import Tuple 6 | 7 | 8 | def train_update_test_split( 9 | X: pd.DataFrame, frac_new_users: float 10 | ) -> Tuple[pd.DataFrame, pd.Series, pd.DataFrame, pd.Series, pd.DataFrame, pd.Series]: 11 | """ 12 | Split data into 3 parts (train_initial, train_update, test_update) for testing performance of model update for new users. First, a set of new 13 | users is selected and all ratings corresponding to all other users are assigned to train_initial. Then, for each new user half of their ratings are 14 | stored in train_update and half are stored in test_update. 15 | 16 | To use the three sets returned: 17 | 1. Fit your model to the train_initial set. 18 | 2. Update your model with train_update. 19 | 3. Calculate predictions on test_update and compare them with the actual ratings. 20 | 21 | Args: 22 | X (pd.DataFrame): Data frame containing columns user_id, item_id 23 | frac_new_users (float): Fraction of users to not include in train_initial 24 | 25 | Returns: 26 | X_train_initial [pd.DataFrame]: Training set user_ids and item_ids for initial model fitting 27 | y_train_initial [pd.Series]: Corresponding ratings for X_train_initial 28 | X_train_update [pd.DataFrame]: Training set user_ids and item_ids for model updating. 
Contains users that are not in train_initial 29 | y_train_update [pd.Series]: Corresponding ratings for X_train_update 30 | X_test_update [pd.DataFrame]: Testing set user_ids and item_ids for model updating. Contains the same users as train_update 31 | y_test_update [pd.Series]: Corresponding ratings for X_test_update 32 | """ 33 | users = X["user_id"].unique() 34 | 35 | # Users that won't be included in the initial training 36 | users_update = np.random.choice( 37 | users, size=round(frac_new_users * len(users)), replace=False 38 | ) 39 | 40 | # Initial training matrix 41 | train_initial = X.query("user_id not in @users_update").sample( 42 | frac=1, replace=False 43 | ) 44 | 45 | # Train and test sets for updating model. For each new user split their ratings into two sets, one for update and one for test 46 | data_update = X.query("user_id in @users_update") 47 | train_update, test_update = train_test_split( 48 | data_update, stratify=data_update["user_id"], test_size=0.5 49 | ) 50 | 51 | # Split into X and y 52 | X_train_initial, y_train_initial = ( 53 | train_initial[["user_id", "item_id"]], 54 | train_initial["rating"], 55 | ) 56 | X_train_update, y_train_update = ( 57 | train_update[["user_id", "item_id"]], 58 | train_update["rating"], 59 | ) 60 | X_test_update, y_test_update = ( 61 | test_update[["user_id", "item_id"]], 62 | test_update["rating"], 63 | ) 64 | 65 | return ( 66 | X_train_initial, 67 | y_train_initial, 68 | X_train_update, 69 | y_train_update, 70 | X_test_update, 71 | y_test_update, 72 | ) 73 | 74 | -------------------------------------------------------------------------------- /matrix_factorization/recommender_base.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from sklearn.base import BaseEstimator, RegressorMixin 4 | 5 | from abc import ABCMeta, abstractmethod 6 | from typing import Any, Tuple, Union 7 | 8 | 9 | class RecommenderBase(BaseEstimator, RegressorMixin, metaclass=ABCMeta): 10 | """ 11 | Abstract base class for all recommender models. 12 | All subclasses should implement the fit() and predict() methods 13 | 14 | Arguments: 15 | min_rating {int} -- Smallest rating possible (default: {0}) 16 | max_rating {int} -- Largest rating possible (default: {5}) 17 | verbose {int} -- Verbosity when fitting. 
Possible values are 0 to not print anything and 1 to print fitting messages (default: {0}) 18 | 19 | Attributes: 20 | n_users {int} -- Number of users 21 | n_items {int} -- Number of items 22 | global_mean {float} -- Global mean of all ratings 23 | user_id_map {dict} -- Mapping of user ids to assigned integer ids 24 | item_id_map {dict} -- Mapping of item ids to assigned integer ids 25 | known_users {set} -- Set of known user_ids 26 | known_items {set} -- Set of known item_ids 27 | """ 28 | 29 | @abstractmethod 30 | def __init__(self, min_rating: float = 0, max_rating: float = 5, verbose: int = 0): 31 | self.min_rating = min_rating 32 | self.max_rating = max_rating 33 | self.verbose = verbose 34 | return 35 | 36 | @property 37 | def known_users(self): 38 | """ 39 | Set of known user_ids 40 | """ 41 | return set(self.user_id_map.keys()) 42 | 43 | @property 44 | def known_items(self): 45 | """ 46 | Set of known item_ids 47 | """ 48 | return set(self.item_id_map.keys()) 49 | 50 | def contains_user(self, user_id: Any) -> bool: 51 | """ 52 | Checks if model was trained on data containing given user_id 53 | 54 | Args: 55 | user_id (any): User id 56 | 57 | Returns: 58 | bool: If user_id is known 59 | """ 60 | return user_id in self.known_users 61 | 62 | def contains_item(self, item_id: Any) -> bool: 63 | """ 64 | Checks if model was trained on data containing given item_id 65 | 66 | Args: 67 | item_id (any): Item id 68 | 69 | Returns: 70 | bool: If item_id is known 71 | """ 72 | return item_id in self.known_items 73 | 74 | def _preprocess_data( 75 | self, X: pd.DataFrame, y: pd.Series = None, type: str = "fit" 76 | ) -> Union[pd.DataFrame, Tuple[pd.DataFrame, list, list]]: 77 | """ 78 | Preprocessing steps before doing fit, update or predict 79 | 80 | Arguments: 81 | X {pd.DataFrame} -- Dataframe containing columns user_id, item_id 82 | y {pd.Series} -- Series containing rating 83 | type {str} -- The type of preprocessing to do. Allowed options are ('fit', 'predict', 'update'). Defaults to 'fit' 84 | 85 | Returns: 86 | X [pd.DataFrame] -- Dataframe with columns user_id, item_id and rating 87 | known_users [list, 'on update only'] -- List containing already known users in X. Only returned for type update 88 | new_users [list, 'on update only'] -- List containing new users in X. 
Only returned for type update 89 | """ 90 | X = X.loc[:, ["user_id", "item_id"]] 91 | 92 | if type != "predict": 93 | X["rating"] = y 94 | 95 | if type in ("fit", "update"): 96 | # Check for duplicate user-item ratings 97 | if X.duplicated(subset=["user_id", "item_id"]).sum() != 0: 98 | raise ValueError("Duplicate user-item ratings in matrix") 99 | 100 | # Shuffle rows 101 | X = X.sample(frac=1, replace=False) 102 | 103 | if type == "fit": 104 | # Create mapping of user_id and item_id to assigned integer ids 105 | user_ids = X["user_id"].unique() 106 | item_ids = X["item_id"].unique() 107 | self.user_id_map = {user_id: i for (i, user_id) in enumerate(user_ids)} 108 | self.item_id_map = {item_id: i for (i, item_id) in enumerate(item_ids)} 109 | self.n_users = len(user_ids) 110 | self.n_items = len(item_ids) 111 | 112 | elif type == "update": 113 | # Keep only item ratings for which the item is already known 114 | items = self.item_id_map.keys() 115 | X = X.query("item_id in @items").copy() 116 | 117 | # Add information on new users 118 | new_users, known_users = [], [] 119 | users = X["user_id"].unique() 120 | new_user_id = max(self.user_id_map.values()) + 1 121 | 122 | for user in users: 123 | if user in self.user_id_map.keys(): 124 | known_users.append(user) 125 | continue 126 | 127 | # Add to user id mapping 128 | new_users.append(user) 129 | self.user_id_map[user] = new_user_id 130 | new_user_id += 1 131 | 132 | # Remap user id and item id to assigned integer ids 133 | X.loc[:, "user_id"] = X["user_id"].map(self.user_id_map) 134 | X.loc[:, "item_id"] = X["item_id"].map(self.item_id_map) 135 | 136 | if type == "predict": 137 | # Replace missing mappings with -1 138 | X.fillna(-1, inplace=True) 139 | 140 | if type == "update": 141 | return X, known_users, new_users 142 | else: 143 | return X 144 | 145 | @abstractmethod 146 | def fit(self, X: pd.DataFrame, y: pd.Series): 147 | """ 148 | Fit model to given data 149 | 150 | Args: 151 | X {pandas DataFrame} -- Dataframe containing columns user_id, item_id 152 | y {pandas Series} -- Series containing rating 153 | """ 154 | return self 155 | 156 | @abstractmethod 157 | def predict(self, X: pd.DataFrame, bound_ratings: bool = True) -> list: 158 | """ 159 | Predict ratings for given users and items 160 | 161 | Args: 162 | X (pd.DataFrame): Dataframe containing columns user_id and item_id 163 | bound_ratings (bool): Whether to bound ratings in range [min_rating, max_rating] (default: True) 164 | 165 | Returns: 166 | list: List containing rating predictions for all user-item pairs in the same order as input X 167 | """ 168 | return [] 169 | 170 | def recommend( 171 | self, 172 | user: Any, 173 | amount: int = 10, 174 | items_known: list = None, 175 | include_user: bool = True, 176 | bound_ratings: bool = True, 177 | ) -> pd.DataFrame: 178 | """ 179 | Returns a DataFrame of recommendations of items for a given user sorted from highest to lowest. 180 | 181 | Args: 182 | user (any): User_id to get recommendations for (not assigned user_id from self.user_id_map) amount (int, optional): Number of recommendations to return. Defaults to 10. 183 | items_known (list, optional): List of items already known by user and to not be considered in recommendations. Defaults to None. 184 | include_user (bool, optional): Whether to include the user_id in the output DataFrame or not. Defaults to True. 
185 | bound_ratings (bool): Whether to bound ratings in range [min_rating, max_rating] (default: True) 186 | 187 | Returns: 188 | pd.DataFrame: Recommendations DataFrame for user with columns user_id (optional), item_id, rating sorted from highest to lowest rating 189 | """ 190 | items = list(self.item_id_map.keys()) 191 | 192 | # If items_known is provided then filter by items that the user does not know 193 | if items_known is not None: 194 | items_known = list(items_known) 195 | items = [item for item in items if item not in items_known] 196 | 197 | # Get rating predictions for given user and all unknown items 198 | items_recommend = pd.DataFrame({"user_id": user, "item_id": items}) 199 | items_recommend["rating_pred"] = self.predict( 200 | X=items_recommend, bound_ratings=False 201 | ) 202 | 203 | # Sort and keep top n items 204 | items_recommend.sort_values(by="rating_pred", ascending=False, inplace=True) 205 | items_recommend = items_recommend.head(amount) 206 | 207 | # Bound ratings 208 | if bound_ratings: 209 | items_recommend["rating_pred"] = items_recommend["rating_pred"].clip( 210 | lower=self.min_rating, upper=self.max_rating 211 | ) 212 | 213 | if not include_user: 214 | items_recommend.drop(["user_id"], axis="columns", inplace=True) 215 | 216 | return items_recommend 217 | 218 | -------------------------------------------------------------------------------- /matrix_factorization/kernels.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numba as nb 3 | import numpy as np 4 | 5 | 6 | @nb.njit() 7 | def sigmoid(x: float) -> float: 8 | """ 9 | Calculates sigmoid function at x 10 | 11 | Args: 12 | x (float): Input x 13 | 14 | Returns: 15 | [float]: Sigmoid at x 16 | """ 17 | result = 1 / (1 + math.exp(-x)) 18 | return result 19 | 20 | 21 | @nb.njit() 22 | def kernel_linear( 23 | global_mean: float, 24 | user_bias: float, 25 | item_bias: float, 26 | user_feature_vec: np.ndarray, 27 | item_feature_vec: np.ndarray, 28 | ) -> float: 29 | """ 30 | Calculates result with a linear kernel: the global mean plus user and item biases plus the dot product of the feature vectors 31 | 32 | Args: 33 | global_mean (float): Global mean 34 | user_bias (float): User bias 35 | item_bias (float): Item bias 36 | user_feature_vec (np.ndarray): Vector of user latent features 37 | item_feature_vec (np.ndarray): Vector of item latent features 38 | 39 | Returns: 40 | [float]: Linear kernel result 41 | """ 42 | result = ( 43 | global_mean + item_bias + user_bias + np.dot(user_feature_vec, item_feature_vec) 44 | ) 45 | return result 46 | 47 | 48 | @nb.njit() 49 | def kernel_sigmoid( 50 | global_mean: float, 51 | user_bias: float, 52 | item_bias: float, 53 | user_feature_vec: np.ndarray, 54 | item_feature_vec: np.ndarray, 55 | a: float, 56 | c: float, 57 | ): 58 | """ 59 | Calculates result with sigmoid kernel 60 | 61 | Args: 62 | global_mean (float): Global mean 63 | user_bias (float): User bias 64 | item_bias (float): Item bias 65 | user_feature_vec (np.ndarray): Vector of user latent features 66 | item_feature_vec (np.ndarray): Vector of item latent features 67 | a (float): Rescaling parameter for a + c * K(u, i) 68 | c (float): Rescaling parameter for a + c * K(u, i) 69 | 70 | Returns: 71 | [float]: Sigmoid kernel result 72 | """ 73 | linear_sum = ( 74 | global_mean + user_bias + item_bias + np.dot(user_feature_vec, item_feature_vec) 75 | ) 76 | sigmoid_result = sigmoid(linear_sum) 77 | result = a + c * sigmoid_result 78 | return result 79 | 80 | 81 | @nb.njit() 82 | def kernel_rbf( 
83 | user_feature_vec: np.ndarray, 84 | item_feature_vec: np.ndarray, 85 | gamma: float, 86 | a: float, 87 | c: float, 88 | ): 89 | """ 90 | Calculates result with Radial basis function kernel 91 | 92 | Args: 93 | user_feature_vec (np.ndarray): Vector of user latent features 94 | item_feature_vec (np.ndarray): Vector of item latent features 95 | gamma (float): Kernel coefficient 96 | a (float): Rescaling parameter for a + c * K(u, i) 97 | c (float): Rescaling parameter for a + c * K(u, i) 98 | 99 | Returns: 100 | [float]: RBF kernel result 101 | """ 102 | power = -gamma * np.sum(np.square(user_feature_vec - item_feature_vec)) 103 | exp_result = math.exp(power) 104 | result = a + c * exp_result 105 | return result 106 | 107 | 108 | @nb.njit() 109 | def kernel_linear_sgd_update( 110 | user_id: int, 111 | item_id: int, 112 | rating: float, 113 | global_mean: float, 114 | user_biases: np.ndarray, 115 | item_biases: np.ndarray, 116 | user_features: np.ndarray, 117 | item_features: np.ndarray, 118 | lr: float, 119 | reg: float, 120 | update_user_params: bool = True, 121 | update_item_params: bool = True, 122 | ): 123 | """ 124 | Performs a single update using stochastic gradient descent for a linear kernel given a user and item. 125 | Similar to https://github.com/gbolmier/funk-svd and https://github.com/NicolasHug/Surprise we iterate over each factor manually for a given 126 | user/item instead of indexing by a row such as user_feature[user] since it has been shown to be much faster. We have also tested with representing 127 | user_features and item_features as 1D arrays but that is also much slower. Turning on numba's parallel option gives much worse performance as well. 128 | 129 | Args: 130 | user_id (int): User id 131 | item_id (int): Item id 132 | rating (float): Rating for user and item 133 | global_mean {float} -- Global mean of all ratings 134 | user_biases {numpy array} -- User biases vector of shape (n_users, 1) 135 | item_biases {numpy array} -- Item biases vector of shape (n_items, 1) 136 | user_features {numpy array} -- Matrix P of user features of shape (n_users, n_factors) 137 | item_features {numpy array} -- Matrix Q of item features of shape (n_items, n_factors) 138 | lr (float): Learning rate alpha 139 | reg {float} -- Regularization parameter lambda for Frobenius norm 140 | update_user_params {bool} -- Whether to update user parameters or not. Default is True. 141 | update_item_params {bool} -- Whether to update item parameters or not. Default is True. 
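With prediction error e = (global_mean + b_u + b_i + p_u . q_i) - rating, the updates performed by this function for each factor f are: b_u <- b_u - lr * (e + reg * b_u); b_i <- b_i - lr * (e + reg * b_i); p_u[f] <- p_u[f] - lr * (e * q_i[f] + reg * p_u[f]); q_i[f] <- q_i[f] - lr * (e * p_u[f] + reg * q_i[f]). 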
142 | """ 143 | n_factors = user_features.shape[1] 144 | user_bias = user_biases[user_id] 145 | item_bias = item_biases[item_id] 146 | 147 | # Compute predicted rating 148 | rating_pred = ( 149 | global_mean 150 | + item_bias 151 | + user_bias 152 | + np.dot(user_features[user_id, :], item_features[item_id, :]) 153 | ) 154 | 155 | # Compute error 156 | error = rating_pred - rating 157 | 158 | # Update bias parameters 159 | if update_user_params: 160 | user_biases[user_id] -= lr * (error + reg * user_bias) 161 | 162 | if update_item_params: 163 | item_biases[item_id] -= lr * (error + reg * item_bias) 164 | 165 | # Update user and item features 166 | for f in range(n_factors): 167 | user_feature_f = user_features[user_id, f] 168 | item_feature_f = item_features[item_id, f] 169 | 170 | if update_user_params: 171 | user_features[user_id, f] -= lr * ( 172 | error * item_feature_f + reg * user_feature_f 173 | ) 174 | 175 | if update_item_params: 176 | item_features[item_id, f] -= lr * ( 177 | error * user_feature_f + reg * item_feature_f 178 | ) 179 | 180 | return 181 | 182 | 183 | @nb.njit() 184 | def kernel_sigmoid_sgd_update( 185 | user_id: int, 186 | item_id: int, 187 | rating: float, 188 | global_mean: float, 189 | user_biases: np.ndarray, 190 | item_biases: np.ndarray, 191 | user_features: np.ndarray, 192 | item_features: np.ndarray, 193 | lr: float, 194 | reg: float, 195 | a: float, 196 | c: float, 197 | update_user_params: bool = True, 198 | update_item_params: bool = True, 199 | ): 200 | """ 201 | Performs a single update using stochastic gradient descent for a sigmoid kernel given a user and item. 202 | 203 | Args: 204 | user_id (int): User id 205 | item_id (int): Item id 206 | rating (float): Rating for user and item 207 | global_mean {float} -- Global mean of all ratings 208 | user_biases {numpy array} -- User biases vector of shape (n_users, 1) 209 | item_biases {numpy array} -- Item biases vector of shape (n_items, 1) 210 | user_features {numpy array} -- Matrix P of user features of shape (n_users, n_factors) 211 | item_features {numpy array} -- Matrix Q of item features of shape (n_items, n_factors) 212 | lr (float): Learning rate alpha 213 | reg {float} -- Regularization parameter lambda for Frobenius norm 214 | a (float): Rescaling parameter for a + c * K(u, i) 215 | c (float): Rescaling parameter for a + c * K(u, i) 216 | update_user_params {bool} -- Whether to update user parameters or not. Default is True. 217 | update_item_params {bool} -- Whether to update item parameters or not. Default is True. 
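The prediction is r_pred = a + c * sigmoid(s) with s = global_mean + b_u + b_i + p_u . q_i. Using e = r_pred - rating and sigmoid'(s) = sigmoid(s)^2 * exp(-s), the updates performed are: b_u <- b_u - lr * (e * sigmoid'(s) + reg * b_u); b_i <- b_i - lr * (e * sigmoid'(s) + reg * b_i); p_u[f] <- p_u[f] - lr * (e * sigmoid'(s) * q_i[f] + reg * p_u[f]); q_i[f] <- q_i[f] - lr * (e * sigmoid'(s) * p_u[f] + reg * q_i[f]). 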
218 | """ 219 | n_factors = user_features.shape[1] 220 | user_bias = user_biases[user_id] 221 | item_bias = item_biases[item_id] 222 | user_feature_vec = user_features[user_id, :] 223 | item_feature_vec = item_features[item_id, :] 224 | 225 | # Compute predicted rating 226 | linear_sum = ( 227 | global_mean + user_bias + item_bias + np.dot(user_feature_vec, item_feature_vec) 228 | ) 229 | sigmoid_result = sigmoid(linear_sum) 230 | rating_pred = a + c * sigmoid_result 231 | 232 | # Compute error 233 | error = rating_pred - rating 234 | 235 | # Common term shared between all partial derivatives 236 | deriv_base = (sigmoid_result ** 2) * math.exp(-linear_sum) 237 | 238 | # Update bias parameters 239 | if update_user_params: 240 | opt_deriv = error * deriv_base + reg * user_bias 241 | user_biases[user_id] -= lr * opt_deriv 242 | 243 | if update_item_params: 244 | opt_deriv = error * deriv_base + reg * item_bias 245 | item_biases[item_id] -= lr * opt_deriv 246 | 247 | # Update user and item features 248 | for i in range(n_factors): 249 | user_feature_f = user_features[user_id, i] 250 | item_feature_f = item_features[item_id, i] 251 | 252 | if update_user_params: 253 | user_feature_deriv = item_feature_f * deriv_base 254 | opt_deriv = error * user_feature_deriv + reg * user_feature_f 255 | user_features[user_id, i] -= lr * opt_deriv 256 | 257 | if update_item_params: 258 | item_feature_deriv = user_feature_f * deriv_base 259 | opt_deriv = error * item_feature_deriv + reg * item_feature_f 260 | item_features[item_id, i] -= lr * opt_deriv 261 | 262 | return 263 | 264 | 265 | @nb.njit() 266 | def kernel_rbf_sgd_update( 267 | user_id: int, 268 | item_id: int, 269 | rating: float, 270 | user_features: np.ndarray, 271 | item_features: np.ndarray, 272 | lr: float, 273 | reg: float, 274 | gamma: float, 275 | a: float, 276 | c: float, 277 | update_user_params: bool = True, 278 | update_item_params: bool = True, 279 | ): 280 | """ 281 | Performs a single update using stochastic gradient descent for an RBF kernel given a user and item. 282 | 283 | Args: 284 | user_id (int): User id 285 | item_id (int): Item id 286 | rating (float): Rating for user and item 287 | user_features {numpy array} -- Matrix P of user features of shape (n_users, n_factors) 288 | item_features {numpy array} -- Matrix Q of item features of shape (n_items, n_factors) 289 | lr (float): Learning rate alpha 290 | reg {float} -- Regularization parameter lambda for Frobenius norm 291 | gamma (float): Kernel coefficient 292 | a (float): Rescaling parameter for a + c * K(u, i) 293 | c (float): Rescaling parameter for a + c * K(u, i) 294 | update_user_params {bool} -- Whether to update user parameters or not. Default is True. 295 | update_item_params {bool} -- Whether to update item parameters or not. Default is True. 
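The prediction is r_pred = a + c * K with K = exp(-gamma * ||p_u - q_i||^2); no bias terms are used by this kernel. With e = r_pred - rating and dK/dp_u[f] = 2 * gamma * K * (q_i[f] - p_u[f]), the factor updates performed are: p_u[f] <- p_u[f] - lr * (e * 2 * gamma * K * (q_i[f] - p_u[f]) + reg * p_u[f]); q_i[f] <- q_i[f] - lr * (e * 2 * gamma * K * (p_u[f] - q_i[f]) + reg * q_i[f]). 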
296 | """ 297 | n_factors = user_features.shape[1] 298 | user_feature_vec = user_features[user_id, :] 299 | item_feature_vec = item_features[item_id, :] 300 | 301 | # Compute predicted rating 302 | power = -gamma * np.sum(np.square(user_feature_vec - item_feature_vec)) 303 | exp_result = math.exp(power) 304 | rating_pred = a + c * exp_result 305 | 306 | # Compute error 307 | error = rating_pred - rating 308 | 309 | # Common term shared between partial derivatives 310 | deriv_base = 2 * exp_result * gamma 311 | 312 | # Update user and item features params 313 | for i in range(n_factors): 314 | user_feature_f = user_features[user_id, i] 315 | item_feature_f = item_features[item_id, i] 316 | 317 | if update_user_params: 318 | user_feature_deriv = deriv_base * (item_feature_f - user_feature_f) 319 | opt_deriv = error * user_feature_deriv + reg * user_feature_f 320 | user_features[user_id, i] -= lr * opt_deriv 321 | 322 | if update_item_params: 323 | item_feature_deriv = deriv_base * (user_feature_f - item_feature_f) 324 | opt_deriv = error * item_feature_deriv + reg * item_feature_f 325 | item_features[item_id, i] -= lr * opt_deriv 326 | 327 | return 328 | -------------------------------------------------------------------------------- /matrix_factorization/baseline_model.py: -------------------------------------------------------------------------------- 1 | import numba as nb 2 | import numpy as np 3 | import pandas as pd 4 | 5 | from .recommender_base import RecommenderBase 6 | 7 | from typing import Tuple 8 | 9 | 10 | class BaselineModel(RecommenderBase): 11 | """ 12 | Simple model which models the user-item rating as r_{ui} = \mu + ubias_u + ibias_i, which is the sum of the global mean and the corresponding 13 | user and item biases. The global mean \mu is estimated as the mean of all ratings. The other parameters to be estimated, ubias and ibias, 14 | are vectors of length n_users and n_items respectively. These two vectors are estimated using stochastic gradient descent on the RMSE 15 | with regularization. 16 | 17 | NOTE: The recommend method with this model will simply recommend the most popular items for every user. This model should mainly be used 18 | for estimating the explicit rating for a given user and item 19 | 20 | Arguments: 21 | method {str} -- Method to estimate parameters. Can be one of 'sgd' or 'als' (default: {'sgd'}) 22 | n_epochs {int} -- Number of epochs to train for (default: {100}) 23 | reg {float} -- Lambda parameter for L2 regularization (default: {1}) 24 | lr {float} -- Learning rate for gradient optimization step (default: {0.01}) 25 | min_rating {int} -- Smallest rating possible (default: {0}) 26 | max_rating {int} -- Largest rating possible (default: {5}) 27 | verbose {int} -- Verbosity when fitting. 0 to not print anything, 1 to print fitting messages (default: {1}) 28 | 29 | Attributes: 30 | n_users {int} -- Number of users 31 | n_items {int} -- Number of items 32 | global_mean {float} -- Global mean of all ratings 33 | user_biases {numpy array} -- User bias vector of shape (n_users, 1) 34 | item_biases {numpy array} -- Item bias vector of shape (n_items, 1) 35 | user_id_map {dict} -- Mapping of user ids to assigned integer ids 36 | item_id_map {dict} -- Mapping of item ids to assigned integer ids 37 | train_rmse {list} -- Training rmse values 38 | predictions_possible {list} -- Boolean vector of whether both user and item were known for prediction. 
Only available after calling predict 39 | """ 40 | 41 | def __init__( 42 | self, 43 | method: str = "sgd", 44 | n_epochs: int = 100, 45 | reg: float = 1, 46 | lr: float = 0.01, 47 | min_rating: int = 0, 48 | max_rating: int = 5, 49 | verbose: int = 1, 50 | ): 51 | # Check inputs 52 | if method not in ("sgd", "als"): 53 | raise ValueError('Method param must be either "sgd" or "als"') 54 | 55 | super().__init__(min_rating=min_rating, max_rating=max_rating, verbose=verbose) 56 | 57 | self.method = method 58 | self.n_epochs = n_epochs 59 | self.reg = reg 60 | self.lr = lr 61 | return 62 | 63 | def fit(self, X: pd.DataFrame, y: pd.Series): 64 | """ 65 | Fits the simple mean and bias model to the given user-item ratings 66 | 67 | Arguments: 68 | X {pandas DataFrame} -- Dataframe containing columns user_id, item_id 69 | y {pandas Series} -- Series containing rating 70 | """ 71 | X = self._preprocess_data(X=X, y=y, type="fit") 72 | self.global_mean = X["rating"].mean() 73 | 74 | # Initialize parameters 75 | self.user_biases = np.zeros(self.n_users) 76 | self.item_biases = np.zeros(self.n_items) 77 | 78 | # Run parameter estimation 79 | if self.method == "sgd": 80 | self.user_biases, self.item_biases, self.train_rmse = _sgd( 81 | X=X.to_numpy(dtype=np.float64), 82 | global_mean=self.global_mean, 83 | user_biases=self.user_biases, 84 | item_biases=self.item_biases, 85 | n_epochs=self.n_epochs, 86 | lr=self.lr, 87 | reg=self.reg, 88 | verbose=self.verbose, 89 | ) 90 | 91 | elif self.method == "als": 92 | self.user_biases, self.item_biases, self.train_rmse = _als( 93 | X=X.to_numpy(dtype=np.float64), 94 | global_mean=self.global_mean, 95 | user_biases=self.user_biases, 96 | item_biases=self.item_biases, 97 | n_epochs=self.n_epochs, 98 | reg=self.reg, 99 | verbose=self.verbose, 100 | ) 101 | 102 | return self 103 | 104 | def predict(self, X: pd.DataFrame, bound_ratings: bool = True) -> list: 105 | """ 106 | Predict ratings for given users and items 107 | 108 | Arguments: 109 | X {pd.DataFrame} -- Dataframe containing columns user_id and item_id 110 | bound_ratings (bool): Whether to bound ratings in range [min_rating, max_rating] (default: True) 111 | 112 | Returns: 113 | predictions [list] -- List containing rating predictions for all user-item pairs in the same order as input X 114 | """ 115 | # If empty return empty list 116 | if X.shape[0] == 0: 117 | return [] 118 | 119 | X = self._preprocess_data(X=X, type="predict") 120 | 121 | # Get predictions 122 | predictions, predictions_possible = _predict( 123 | X=X.to_numpy(dtype=np.float64), 124 | global_mean=self.global_mean, 125 | min_rating=self.min_rating, 126 | max_rating=self.max_rating, 127 | user_biases=self.user_biases, 128 | item_biases=self.item_biases, 129 | bound_ratings=bound_ratings, 130 | ) 131 | 132 | self.predictions_possible = predictions_possible 133 | 134 | return predictions 135 | 136 | def update_users( 137 | self, 138 | X: pd.DataFrame, 139 | y: pd.Series, 140 | lr: float = 0.01, 141 | n_epochs: int = 20, 142 | verbose: int = 0, 143 | ): 144 | """ 145 | Update user biases vector with new/updated user-item ratings information using SGD. Only the user parameters corresponding to the 146 | new/updated users will be updated and item parameters will be left alone. 
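Previously known users passed in X have their bias re-initialized to zero before re-estimation, while genuinely new users get a new entry appended to the user bias vector. 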
147 | 148 | Note: If updating old users then pass all user-item ratings for old users and not just modified ratings 149 | 150 | Args: 151 | X (pd.DataFrame): Dataframe containing columns user_id, item_id 152 | y (pd.Series): Series containing rating 153 | lr (float, optional): Learning rate alpha for gradient optimization step 154 | n_epochs (int, optional): Number of epochs to run SGD. Defaults to 20. 155 | verbose (int, optional): Verbosity when updating, 0 for nothing and 1 for training messages. Defaults to 0. 156 | """ 157 | X, known_users, new_users = self._preprocess_data(X=X, y=y, type="update") 158 | 159 | # Re-initialize user bias for old users 160 | for user in known_users: 161 | user_index = self.user_id_map[user] 162 | self.user_biases[user_index] = 0 163 | 164 | # Add user bias param for new users 165 | self.user_biases = np.append(self.user_biases, np.zeros(len(new_users))) 166 | 167 | # Estimate new bias parameter 168 | self.user_biases, _, self.train_rmse = _sgd( 169 | X=X.to_numpy(dtype=np.float64), 170 | global_mean=self.global_mean, 171 | user_biases=self.user_biases, 172 | item_biases=self.item_biases, 173 | n_epochs=n_epochs, 174 | lr=lr, 175 | reg=self.reg, 176 | verbose=verbose, 177 | update_item_params=False, 178 | ) 179 | 180 | return 181 | 182 | 183 | @nb.njit() 184 | def _calculate_rmse( 185 | X: np.ndarray, global_mean: float, user_biases: np.ndarray, item_biases: np.ndarray 186 | ): 187 | """ 188 | Calculates root mean squared error for given data and model parameters 189 | 190 | Args: 191 | X (np.ndarray): Matrix with columns user, item and rating 192 | global_mean (float): Global mean rating 193 | user_biases (np.ndarray): User biases vector of shape (n_users, 1) 194 | item_biases (np.ndarray): Item biases vector of shape (n_items, 1) 195 | 196 | Returns: 197 | rmse [float]: Root mean squared error 198 | """ 199 | n_ratings = X.shape[0] 200 | errors = np.zeros(n_ratings) 201 | 202 | # Iterate through all user-item ratings 203 | for i in range(n_ratings): 204 | user_id, item_id, rating = int(X[i, 0]), int(X[i, 1]), X[i, 2] 205 | 206 | # Calculate prediction and error 207 | pred = global_mean + user_biases[user_id] + item_biases[item_id] 208 | errors[i] = rating - pred 209 | 210 | rmse = np.sqrt(np.square(errors).mean()) 211 | 212 | return rmse 213 | 214 | 215 | @nb.njit() 216 | def _sgd( 217 | X: np.ndarray, 218 | global_mean: float, 219 | user_biases: np.ndarray, 220 | item_biases: np.ndarray, 221 | n_epochs: int, 222 | lr: float, 223 | reg: float, 224 | verbose: int, 225 | update_user_params: bool = True, 226 | update_item_params: bool = True, 227 | ) -> Tuple[np.ndarray, np.ndarray, list]: 228 | """ 229 | Performs Stochastic Gradient Descent to estimate the user_biases and item_biases 230 | 231 | Arguments: 232 | X {numpy array} -- User-item rating matrix 233 | global_mean {float} -- Global mean of all ratings 234 | user_biases {numpy array} -- User biases vector of shape (n_users, 1) 235 | item_biases {numpy array} -- Item biases vector of shape (n_items, 1) 236 | n_epochs {int} -- Number of epochs to run 237 | lr {float} -- Learning rate alpha 238 | reg {float} -- Regularization parameter lambda for Frobenius norm 239 | verbose {int} -- Verbosity when fitting. 0 for nothing and 1 for printing epochs 240 | update_user_params {bool} -- Whether to update user bias parameters or not. Default is True. 241 | update_item_params {bool} -- Whether to update item bias parameters or not. Default is True. 
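Each observed rating triggers the updates (with e = rating - (global_mean + b_u + b_i)): b_u <- b_u + lr * (e - reg * b_u); b_i <- b_i + lr * (e - reg * b_i). 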
242 | 243 | Returns: 244 | user_biases [np.ndarray] -- Updated user_biases vector 245 | item_biases [np.ndarray] -- Updated item_biases vector 246 | train_rmse [list] -- Training rmse values 247 | """ 248 | train_rmse = [] 249 | 250 | for epoch in range(n_epochs): 251 | # Shuffle data before each epoch 252 | np.random.shuffle(X) 253 | 254 | # Iterate through all user-item ratings 255 | for i in range(X.shape[0]): 256 | user_id, item_id, rating = int(X[i, 0]), int(X[i, 1]), X[i, 2] 257 | 258 | # Compute error 259 | rating_pred = global_mean + user_biases[user_id] + item_biases[item_id] 260 | error = rating - rating_pred 261 | 262 | # Update parameters 263 | if update_user_params: 264 | user_biases[user_id] += lr * (error - reg * user_biases[user_id]) 265 | if update_item_params: 266 | item_biases[item_id] += lr * (error - reg * item_biases[item_id]) 267 | 268 | # Calculate error and print 269 | rmse = _calculate_rmse( 270 | X=X, 271 | global_mean=global_mean, 272 | user_biases=user_biases, 273 | item_biases=item_biases, 274 | ) 275 | train_rmse.append(rmse) 276 | 277 | if verbose == 1: 278 | print("Epoch ", epoch + 1, "/", n_epochs, " - train_rmse:", rmse) 279 | 280 | return user_biases, item_biases, train_rmse 281 | 282 | 283 | @nb.njit() 284 | def _als( 285 | X: np.ndarray, 286 | global_mean: float, 287 | user_biases: np.ndarray, 288 | item_biases: np.ndarray, 289 | n_epochs: int, 290 | reg: float, 291 | verbose: int, 292 | ) -> Tuple[np.ndarray, np.ndarray, list]: 293 | """ 294 | Performs Alternating Least Squares to estimate the user_biases and item_biases. For every epoch, the item biases are held constant while 295 | solving directly for the user bias parameters using a closed form equation. Then the user bias parameters are held constant and the same 296 | is done for the item biases. This can be derived easily and is given in the lecture here https://www.youtube.com/watch?v=gCaOa3W9kM0&t=32m55s 297 | which is also similar to the implementation in Surprise. 298 | 299 | Arguments: 300 | X {numpy array} -- User-item rating matrix 301 | global_mean {float} -- Global mean of all ratings 302 | user_biases {numpy array} -- User biases vector of shape (n_users, 1) 303 | item_biases {numpy array} -- Item biases vector of shape (n_items, 1) 304 | n_epochs {int} -- Number of epochs to run 305 | reg {float} -- Regularization parameter lambda for Frobenius norm 306 | verbose {int} -- Verbosity when fitting. 
0 for nothing and 1 for printing epochs 307 | 308 | Returns: 309 | user_biases [np.ndarray] -- Updated user_biases vector 310 | item_biases [np.ndarray] -- Updated item_bases vector 311 | train_rmse -- Training rmse values 312 | """ 313 | n_users = user_biases.shape[0] 314 | n_items = item_biases.shape[0] 315 | train_rmse = [] 316 | 317 | # Get counts of all users and items 318 | user_counts = np.zeros(n_users) 319 | item_counts = np.zeros(n_items) 320 | for i in range(X.shape[0]): 321 | user_id, item_id = int(X[i, 0]), int(X[i, 1]) 322 | user_counts[user_id] += 1 323 | item_counts[item_id] += 1 324 | 325 | # For each epoch optimize User biases, and then Item biases 326 | for epoch in range(n_epochs): 327 | 328 | # Update user bias parameters 329 | user_biases = np.zeros(n_users) 330 | 331 | # Iterate through all user-item ratings 332 | for i in range(X.shape[0]): 333 | user_id, item_id, rating = int(X[i, 0]), int(X[i, 1]), X[i, 2] 334 | user_biases[user_id] += rating - global_mean - item_biases[item_id] 335 | 336 | # Set user bias estimation 337 | user_biases = user_biases / (reg + user_counts) 338 | 339 | # Update item bias parameters 340 | item_biases = np.zeros(n_items) 341 | 342 | # Iterate through all user-item ratings 343 | for i in range(X.shape[0]): 344 | user_id, item_id, rating = int(X[i, 0]), int(X[i, 1]), X[i, 2] 345 | item_biases[item_id] += rating - global_mean - user_biases[user_id] 346 | 347 | # Set item bias estimation 348 | item_biases = item_biases / (reg + item_counts) 349 | 350 | # Calculate error and print 351 | rmse = _calculate_rmse( 352 | X=X, 353 | global_mean=global_mean, 354 | user_biases=user_biases, 355 | item_biases=item_biases, 356 | ) 357 | train_rmse.append(rmse) 358 | 359 | if verbose == 1: 360 | print("Epoch ", epoch + 1, "/", n_epochs, " - train_rmse:", rmse) 361 | 362 | return user_biases, item_biases, train_rmse 363 | 364 | 365 | @nb.njit() 366 | def _predict( 367 | X: np.ndarray, 368 | global_mean: float, 369 | min_rating: int, 370 | max_rating: int, 371 | user_biases: np.ndarray, 372 | item_biases: np.ndarray, 373 | bound_ratings: bool, 374 | ) -> Tuple[list, list]: 375 | """ 376 | Calculate predicted ratings for each user-item pair. 
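Unknown users or items (mapped to -1 during preprocessing) simply drop their bias term from the prediction, so a pair where both are unknown falls back to the global mean. 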
377 | 378 | Arguments: 379 | X {np.ndarray} -- Matrix with columns representing (user_id, item_id) 380 | global_mean {float} -- Global mean of all ratings 381 | min_rating {int} -- Lowest rating possible 382 | max_rating {int} -- Highest rating possible 383 | user_biases {np.ndarray} -- User biases vector of length n_users 384 | item_biases {np.ndarray} -- Item biases vector of length n_items 385 | bound_ratings {boolean} -- Whether to bound predictions within the range [min_rating, max_rating] 386 | 387 | Returns: 388 | predictions [np.ndarray] -- Vector containing rating predictions for all user-item pairs in the same order as input X 389 | predictions_possible [np.ndarray] -- Vector of whether both given user and item were contained in the data that the model was fitted on 390 | """ 391 | 392 | predictions = [] 393 | predictions_possible = [] 394 | 395 | for i in range(X.shape[0]): 396 | user_id, item_id = int(X[i, 0]), int(X[i, 1]) 397 | user_known = user_id != -1 398 | item_known = item_id != -1 399 | 400 | rating_pred = global_mean 401 | 402 | if user_known: 403 | rating_pred += user_biases[user_id] 404 | if item_known: 405 | rating_pred += item_biases[item_id] 406 | 407 | # Bound ratings to min and max rating range 408 | if bound_ratings: 409 | if rating_pred > max_rating: 410 | rating_pred = max_rating 411 | elif rating_pred < min_rating: 412 | rating_pred = min_rating 413 | 414 | predictions.append(rating_pred) 415 | predictions_possible.append(user_known and item_known) 416 | 417 | return predictions, predictions_possible 418 | -------------------------------------------------------------------------------- /matrix_factorization/kernel_matrix_factorization.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numba as nb 3 | import numpy as np 4 | import pandas as pd 5 | 6 | from .kernels import ( 7 | kernel_linear, 8 | kernel_sigmoid, 9 | kernel_rbf, 10 | kernel_linear_sgd_update, 11 | kernel_sigmoid_sgd_update, 12 | kernel_rbf_sgd_update, 13 | ) 14 | from .recommender_base import RecommenderBase 15 | 16 | from typing import Tuple, Union 17 | 18 | 19 | class KernelMF(RecommenderBase): 20 | """ 21 | Kernel Matrix Factorization. Finds the thin matrices P and Q such that P * Q^T gives a good low-rank approximation to the user-item 22 | ratings matrix A based on RMSE. Despite the name, this is different from SVD, as there is no constraint for matrices P and Q to have mutually 23 | orthogonal columns. 24 | 25 | Arguments: 26 | n_factors {int} -- The number of latent factors in matrices P and Q (default: {100}) 27 | n_epochs {int} -- Number of epochs to train for (default: {100}) 28 | kernel {str} -- Kernel function to use between user and item features. Options are 'linear', 'sigmoid' or 'rbf'. (default: {'linear'}) 29 | gamma {str or float} -- Kernel coefficient for 'rbf'. Ignored by other kernels. If 'auto' is used then it will be set to 1/n_factors. 
(default: 'auto') 30 | reg {float} -- Regularization parameter lambda for Tikhonov regularization (default: {1}) 31 | lr {float} -- Learning rate alpha for gradient optimization step (default: {0.01}) 32 | init_mean {float} -- Mean of normal distribution to use for initializing parameters (default: {0}) 33 | init_sd {float} -- Standard deviation of normal distribution to use for initializing parameters (default: {0.1}) 34 | min_rating {int} -- Smallest rating possible (default: {0}) 35 | max_rating {int} -- Largest rating possible (default: {5}) 36 | verbose {int} -- Verbosity when fitting. Possible values are 0 to not print anything and 1 to print fitting messages (default: {1}) 37 | 38 | Attributes: 39 | n_users {int} -- Number of users 40 | n_items {int} -- Number of items 41 | global_mean {float} -- Global mean of all ratings 42 | user_biases {numpy array} -- User bias vector of shape (n_users, 1) 43 | item_biases {numpy array} -- Item bias vector of shape (n_items, 1) 44 | user_features {numpy array} -- Decomposed P matrix of user features of shape (n_users, n_factors) 45 | item_features {numpy array} -- Decomposed Q matrix of item features of shape (n_items, n_factors) 46 | user_id_map {dict} -- Mapping of user ids to assigned integer ids 47 | item_id_map {dict} -- Mapping of item ids to assigned integer ids 48 | train_rmse {list} -- Training rmse values 49 | predictions_possible {list} -- Boolean vector of whether both user and item were known for prediction. Only available after calling predict 50 | """ 51 | 52 | def __init__( 53 | self, 54 | n_factors: int = 100, 55 | n_epochs: int = 100, 56 | kernel: str = "linear", 57 | gamma: Union[str, float] = "auto", 58 | reg: float = 1, 59 | lr: float = 0.01, 60 | init_mean: float = 0, 61 | init_sd: float = 0.1, 62 | min_rating: int = 0, 63 | max_rating: int = 5, 64 | verbose: int = 1, 65 | ): 66 | if kernel not in ("linear", "sigmoid", "rbf"): 67 | raise ValueError("Kernel must be one of linear, sigmoid, or rbf") 68 | 69 | super().__init__(min_rating=min_rating, max_rating=max_rating, verbose=verbose) 70 | 71 | self.n_factors = n_factors 72 | self.n_epochs = n_epochs 73 | self.kernel = kernel 74 | self.gamma = 1 / n_factors if gamma == "auto" else gamma 75 | self.reg = reg 76 | self.lr = lr 77 | self.init_mean = init_mean 78 | self.init_sd = init_sd 79 | return 80 | 81 | def fit(self, X: pd.DataFrame, y: pd.Series): 82 | """ 83 | Decompose user-item rating matrix into thin matrices P and Q along with user and item bias vectors 84 | 85 | Arguments: 86 | X {pandas DataFrame} -- Dataframe containing columns user_id, item_id 87 | y {pandas Series} -- Series containing ratings 88 | """ 89 | X = self._preprocess_data(X=X, y=y, type="fit") 90 | self.global_mean = X["rating"].mean() 91 | 92 | # Initialize vector bias parameters 93 | self.user_biases = np.zeros(self.n_users) 94 | self.item_biases = np.zeros(self.n_items) 95 | 96 | # Initialize latent factor parameters of matrices P and Q 97 | self.user_features = np.random.normal( 98 | self.init_mean, self.init_sd, (self.n_users, self.n_factors) 99 | ) 100 | self.item_features = np.random.normal( 101 | self.init_mean, self.init_sd, (self.n_items, self.n_factors) 102 | ) 103 | 104 | # Perform stochastic gradient descent 105 | ( 106 | self.user_features, 107 | self.item_features, 108 | self.user_biases, 109 | self.item_biases, 110 | self.train_rmse, 111 | ) = _sgd( 112 | X=X.to_numpy(dtype=np.float64), 113 | global_mean=self.global_mean, 114 | user_biases=self.user_biases, 115 | item_biases=self.item_biases, 
116 | user_features=self.user_features, 117 | item_features=self.item_features, 118 | n_epochs=self.n_epochs, 119 | kernel=self.kernel, 120 | gamma=self.gamma, 121 | lr=self.lr, 122 | reg=self.reg, 123 | min_rating=self.min_rating, 124 | max_rating=self.max_rating, 125 | verbose=self.verbose, 126 | ) 127 | 128 | return self 129 | 130 | def predict(self, X: pd.DataFrame, bound_ratings: bool = True) -> list: 131 | """ 132 | Predict ratings for given users and items 133 | 134 | Arguments: 135 | X {pd.DataFrame} -- Dataframe containing columns user_id and item_id 136 | bound_ratings (bool): Whether to bound ratings in range [min_rating, max_rating] (default: True) 137 | 138 | Returns: 139 | predictions [list] -- List containing rating predictions for all user-item pairs in the same order as input X 140 | """ 141 | # If empty return empty list 142 | if X.shape[0] == 0: 143 | return [] 144 | 145 | X = self._preprocess_data(X=X, type="predict") 146 | 147 | # Get predictions 148 | predictions, predictions_possible = _predict( 149 | X=X.to_numpy(dtype=np.float64), 150 | global_mean=self.global_mean, 151 | user_biases=self.user_biases, 152 | item_biases=self.item_biases, 153 | user_features=self.user_features, 154 | item_features=self.item_features, 155 | min_rating=self.min_rating, 156 | max_rating=self.max_rating, 157 | kernel=self.kernel, 158 | gamma=self.gamma, 159 | bound_ratings=bound_ratings, 160 | ) 161 | 162 | self.predictions_possible = predictions_possible 163 | return predictions 164 | 165 | def update_users( 166 | self, 167 | X: pd.DataFrame, 168 | y: pd.Series, 169 | lr: float = 0.01, 170 | n_epochs: int = 20, 171 | verbose: int = 0, 172 | ): 173 | """ 174 | Update the user features matrix P with new/updated user-item ratings information using SGD. Only the user parameters corresponding to the 175 | new/updated users will be updated and item parameters will be left alone. 176 | 177 | Note: If updating old users then pass all user-item ratings for old users and not just modified ratings 178 | 179 | Args: 180 | X (pd.DataFrame): Dataframe containing columns user_id, item_id 181 | y (pd.Series): Series containing ratings 182 | lr (float, optional): Learning rate alpha for gradient optimization step 183 | n_epochs (int, optional): Number of epochs to run SGD. Defaults to 20. 184 | verbose (int, optional): Verbosity when updating, 0 for nothing and 1 for training messages. Defaults to 0. 
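Example (mirroring the README's online-learning flow): matrix_fact.update_users(X_train_update, y_train_update, lr=0.001, n_epochs=20, verbose=1) 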
185 | """ 186 | X, known_users, new_users = self._preprocess_data(X=X, y=y, type="update") 187 | n_new_users = len(new_users) 188 | 189 | # Re-initialize params for old users 190 | for user in known_users: 191 | user_index = self.user_id_map[user] 192 | 193 | # Initialize bias 194 | self.user_biases[user_index] = 0 195 | 196 | # Initialize latent factors vector 197 | self.user_features[user_index, :] = np.random.normal( 198 | self.init_mean, self.init_sd, (1, self.n_factors) 199 | ) 200 | 201 | # Add bias parameters for new users 202 | self.user_biases = np.append(self.user_biases, np.zeros(n_new_users)) 203 | 204 | # Add latent factor parameters for new users by adding rows to P matrix 205 | new_user_features = np.random.normal( 206 | self.init_mean, self.init_sd, (n_new_users, self.n_factors) 207 | ) 208 | self.user_features = np.concatenate( 209 | (self.user_features, new_user_features), axis=0 210 | ) 211 | 212 | # Estimate new parameters 213 | ( 214 | self.user_features, 215 | self.item_features, 216 | self.user_biases, 217 | self.item_biases, 218 | self.train_rmse, 219 | ) = _sgd( 220 | X=X.to_numpy(dtype=np.float64), 221 | global_mean=self.global_mean, 222 | user_biases=self.user_biases, 223 | item_biases=self.item_biases, 224 | user_features=self.user_features, 225 | item_features=self.item_features, 226 | n_epochs=n_epochs, 227 | kernel=self.kernel, 228 | gamma=self.gamma, 229 | lr=lr, 230 | reg=self.reg, 231 | min_rating=self.min_rating, 232 | max_rating=self.max_rating, 233 | verbose=verbose, 234 | update_item_params=False, 235 | ) 236 | 237 | return 238 | 239 | 240 | @nb.njit() 241 | def _calculate_rmse( 242 | X: np.ndarray, 243 | global_mean: float, 244 | user_biases: np.ndarray, 245 | item_biases: np.ndarray, 246 | user_features: np.ndarray, 247 | item_features: np.ndarray, 248 | min_rating: float, 249 | max_rating: float, 250 | kernel: str, 251 | gamma: float, 252 | ): 253 | """ 254 | Calculates root mean squared error for given data and model parameters 255 | 256 | Args: 257 | X (np.ndarray): Matrix with columns user, item and rating 258 | global_mean (float): Global mean rating 259 | user_biases (np.ndarray): User biases vector of shape (n_users, 1) 260 | item_biases (np.ndarray): Item biases vector of shape (n_items, 1) 261 | user_features (np.ndarray): User features matrix P of size (n_users, n_factors) 262 | item_features (np.ndarray): Item features matrix Q of size (n_items, n_factors) 263 | min_rating (float): Minimum possible rating 264 | max_rating (float): Maximum possible rating 265 | kernel (str): Kernel type. 
Possible options are "linear", "sigmoid" or "rbf"
266 |         gamma (float): Kernel coefficient, used only by the "rbf" kernel
267 | 
268 |     Returns:
269 |         rmse [float]: Root mean squared error
270 |     """
271 |     n_ratings = X.shape[0]
272 |     errors = np.zeros(n_ratings)
273 | 
274 |     # Iterate through all user-item ratings and calculate error
275 |     for i in range(n_ratings):
276 |         user_id, item_id, rating = int(X[i, 0]), int(X[i, 1]), X[i, 2]
277 |         user_bias = user_biases[user_id]
278 |         item_bias = item_biases[item_id]
279 |         user_feature_vec = user_features[user_id, :]
280 |         item_feature_vec = item_features[item_id, :]
281 | 
282 |         # Calculate predicted rating for given kernel
283 |         if kernel == "linear":
284 |             rating_pred = kernel_linear(
285 |                 global_mean=global_mean,
286 |                 user_bias=user_bias,
287 |                 item_bias=item_bias,
288 |                 user_feature_vec=user_feature_vec,
289 |                 item_feature_vec=item_feature_vec,
290 |             )
291 | 
292 |         elif kernel == "sigmoid":
293 |             rating_pred = kernel_sigmoid(
294 |                 global_mean=global_mean,
295 |                 user_bias=user_bias,
296 |                 item_bias=item_bias,
297 |                 user_feature_vec=user_feature_vec,
298 |                 item_feature_vec=item_feature_vec,
299 |                 a=min_rating,
300 |                 c=max_rating - min_rating,
301 |             )
302 | 
303 |         elif kernel == "rbf":
304 |             rating_pred = kernel_rbf(
305 |                 user_feature_vec=user_feature_vec,
306 |                 item_feature_vec=item_feature_vec,
307 |                 gamma=gamma,
308 |                 a=min_rating,
309 |                 c=max_rating - min_rating,
310 |             )
311 | 
312 |         # Calculate error
313 |         errors[i] = rating - rating_pred
314 | 
315 |     rmse = np.sqrt(np.square(errors).mean())
316 | 
317 |     return rmse
318 | 
319 | 
320 | @nb.njit()
321 | def _sgd(
322 |     X: np.ndarray,
323 |     global_mean: float,
324 |     user_biases: np.ndarray,
325 |     item_biases: np.ndarray,
326 |     user_features: np.ndarray,
327 |     item_features: np.ndarray,
328 |     n_epochs: int,
329 |     kernel: str,
330 |     gamma: float,
331 |     lr: float,
332 |     reg: float,
333 |     min_rating: float,
334 |     max_rating: float,
335 |     verbose: int,
336 |     update_user_params: bool = True,
337 |     update_item_params: bool = True,
338 | ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, list]:
339 |     """
340 |     Performs stochastic gradient descent to estimate parameters.
341 | 
342 |     Arguments:
343 |         X {numpy array} -- User-item rating matrix with columns user, item and rating
344 |         global_mean {float} -- Global mean of all ratings
345 |         user_biases {numpy array} -- User biases vector of shape (n_users, 1)
346 |         item_biases {numpy array} -- Item biases vector of shape (n_items, 1)
347 |         user_features {numpy array} -- Start matrix P of user features of shape (n_users, n_factors)
348 |         item_features {numpy array} -- Start matrix Q of item features of shape (n_items, n_factors)
349 |         n_epochs {int} -- Number of epochs to run
350 |         kernel {str} -- Kernel function to use between user and item features. Options are 'linear', 'sigmoid', and 'rbf'.
351 |         gamma {float} -- Kernel coefficient for 'rbf'. Ignored by other kernels.
352 |         lr {float} -- Learning rate alpha
353 |         reg {float} -- Regularization parameter lambda for Frobenius norm
354 |         min_rating {float} -- Minimum possible rating
355 |         max_rating {float} -- Maximum possible rating
356 |         verbose {int} -- Verbosity when fitting. 0 for nothing and 1 for printing epochs
357 |         update_user_params {bool} -- Whether to update user parameters or not. Default is True.
358 |         update_item_params {bool} -- Whether to update item parameters or not. Default is True.
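
    Note:
        For reference, the predicted rating under each kernel -- a sketch assuming the
        definitions in kernels.py follow the standard kernel matrix factorization
        formulation, with a = min_rating and c = max_rating - min_rating, b_u / b_i the
        user/item biases and p_u / q_i the user/item feature vectors:

            linear:  r_hat = global_mean + b_u + b_i + dot(p_u, q_i)
            sigmoid: r_hat = a + c * sigmoid(global_mean + b_u + b_i + dot(p_u, q_i))
            rbf:     r_hat = a + c * exp(-gamma * ||p_u - q_i||^2)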
359 | 
360 |     Returns:
361 |         user_features [np.ndarray] -- Updated user_features matrix P
362 |         item_features [np.ndarray] -- Updated item_features matrix Q
363 |         user_biases [np.ndarray] -- Updated user_biases vector
364 |         item_biases [np.ndarray] -- Updated item_biases vector
365 |         train_rmse [list] -- Training rmse values
366 |     """
367 |     train_rmse = []
368 | 
369 |     for epoch in range(n_epochs):
370 |         # Shuffle dataset before each epoch
371 |         np.random.shuffle(X)
372 | 
373 |         # Iterate through all user-item ratings
374 |         for i in range(X.shape[0]):
375 |             user_id, item_id, rating = int(X[i, 0]), int(X[i, 1]), X[i, 2]
376 | 
377 |             if kernel == "linear":
378 |                 kernel_linear_sgd_update(
379 |                     user_id=user_id,
380 |                     item_id=item_id,
381 |                     rating=rating,
382 |                     global_mean=global_mean,
383 |                     user_biases=user_biases,
384 |                     item_biases=item_biases,
385 |                     user_features=user_features,
386 |                     item_features=item_features,
387 |                     lr=lr,
388 |                     reg=reg,
389 |                     update_user_params=update_user_params,
390 |                     update_item_params=update_item_params,
391 |                 )
392 | 
393 |             elif kernel == "sigmoid":
394 |                 kernel_sigmoid_sgd_update(
395 |                     user_id=user_id,
396 |                     item_id=item_id,
397 |                     rating=rating,
398 |                     global_mean=global_mean,
399 |                     user_biases=user_biases,
400 |                     item_biases=item_biases,
401 |                     user_features=user_features,
402 |                     item_features=item_features,
403 |                     lr=lr,
404 |                     reg=reg,
405 |                     a=min_rating,
406 |                     c=max_rating - min_rating,
407 |                     update_user_params=update_user_params,
408 |                     update_item_params=update_item_params,
409 |                 )
410 | 
411 |             elif kernel == "rbf":
412 |                 kernel_rbf_sgd_update(
413 |                     user_id=user_id,
414 |                     item_id=item_id,
415 |                     rating=rating,
416 |                     user_features=user_features,
417 |                     item_features=item_features,
418 |                     lr=lr,
419 |                     reg=reg,
420 |                     gamma=gamma,
421 |                     a=min_rating,
422 |                     c=max_rating - min_rating,
423 |                     update_user_params=update_user_params,
424 |                     update_item_params=update_item_params,
425 |                 )
426 | 
427 |         # Calculate training rmse for the epoch and print if verbose
428 |         rmse = _calculate_rmse(
429 |             X=X,
430 |             global_mean=global_mean,
431 |             user_biases=user_biases,
432 |             item_biases=item_biases,
433 |             user_features=user_features,
434 |             item_features=item_features,
435 |             min_rating=min_rating,
436 |             max_rating=max_rating,
437 |             kernel=kernel,
438 |             gamma=gamma,
439 |         )
440 |         train_rmse.append(rmse)
441 | 
442 |         if verbose == 1:
443 |             print("Epoch ", epoch + 1, "/", n_epochs, " - train_rmse:", rmse)
444 | 
445 |     return user_features, item_features, user_biases, item_biases, train_rmse
446 | 
447 | 
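# Illustrative sketch (not part of the library): the single-rating update that
# kernel_linear_sgd_update in kernels.py is assumed to apply -- the standard
# biased matrix factorization gradient step with learning rate lr and
# regularization reg, gated by update_user_params / update_item_params:
#
#     error             = rating - (global_mean + user_biases[u] + item_biases[i]
#                                   + user_features[u] @ item_features[i])
#     user_biases[u]   += lr * (error - reg * user_biases[u])
#     item_biases[i]   += lr * (error - reg * item_biases[i])
#     p_u_old           = user_features[u].copy()
#     user_features[u] += lr * (error * item_features[i] - reg * user_features[u])
#     item_features[i] += lr * (error * p_u_old - reg * item_features[i])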
448 | @nb.njit()
449 | def _predict(
450 |     X: np.ndarray,
451 |     global_mean: float,
452 |     user_biases: np.ndarray,
453 |     item_biases: np.ndarray,
454 |     user_features: np.ndarray,
455 |     item_features: np.ndarray,
456 |     min_rating: float,
457 |     max_rating: float,
458 |     kernel: str,
459 |     gamma: float,
460 |     bound_ratings: bool,
461 | ) -> Tuple[list, list]:
462 |     """
463 |     Calculate predicted ratings for each user-item pair.
464 | 
465 |     Arguments:
466 |         X {np.ndarray} -- Matrix with columns representing (user_id, item_id)
467 |         global_mean {float} -- Global mean of all ratings
468 |         user_biases {np.ndarray} -- User biases vector of length n_users
469 |         item_biases {np.ndarray} -- Item biases vector of length n_items
470 |         user_features {np.ndarray} -- User features matrix P of shape (n_users, n_factors)
471 |         item_features {np.ndarray} -- Item features matrix Q of shape (n_items, n_factors)
472 |         min_rating {float} -- Lowest rating possible
473 |         max_rating {float} -- Highest rating possible
474 |         kernel {str} -- Kernel function. Options are 'linear', 'sigmoid', and 'rbf'
475 |         gamma {float} -- Kernel coefficient for 'rbf' only
476 |         bound_ratings {bool} -- Whether to bound ratings to the range [min_rating, max_rating]
477 | 
478 |     Returns:
479 |         predictions [list] -- List of rating predictions for each (user, item) pair, in the same order as the input X
480 |         predictions_possible [list] -- List of booleans indicating whether both the user and the item were present in the data the model was fitted on
481 |     """
482 |     n_factors = user_features.shape[1]
483 |     predictions = []
484 |     predictions_possible = []
485 | 
486 |     for i in range(X.shape[0]):
487 |         user_id, item_id = int(X[i, 0]), int(X[i, 1])
488 |         user_known = user_id != -1
489 |         item_known = item_id != -1
490 | 
491 |         # Default values if user or item are not known
492 |         user_bias = user_biases[user_id] if user_known else 0
493 |         item_bias = item_biases[item_id] if item_known else 0
494 |         user_feature_vec = (
495 |             user_features[user_id, :] if user_known else np.zeros(n_factors)
496 |         )
497 |         item_feature_vec = (
498 |             item_features[item_id, :] if item_known else np.zeros(n_factors)
499 |         )
500 | 
501 |         # Calculate predicted rating given kernel
502 |         if kernel == "linear":
503 |             rating_pred = kernel_linear(
504 |                 global_mean=global_mean,
505 |                 user_bias=user_bias,
506 |                 item_bias=item_bias,
507 |                 user_feature_vec=user_feature_vec,
508 |                 item_feature_vec=item_feature_vec,
509 |             )
510 | 
511 |         elif kernel == "sigmoid":
512 |             rating_pred = kernel_sigmoid(
513 |                 global_mean=global_mean,
514 |                 user_bias=user_bias,
515 |                 item_bias=item_bias,
516 |                 user_feature_vec=user_feature_vec,
517 |                 item_feature_vec=item_feature_vec,
518 |                 a=min_rating,
519 |                 c=max_rating - min_rating,
520 |             )
521 | 
522 |         elif kernel == "rbf":
523 |             rating_pred = kernel_rbf(
524 |                 user_feature_vec=user_feature_vec,
525 |                 item_feature_vec=item_feature_vec,
526 |                 gamma=gamma,
527 |                 a=min_rating,
528 |                 c=max_rating - min_rating,
529 |             )
530 | 
531 |         # Bound ratings to min and max rating range
532 |         if bound_ratings:
533 |             if rating_pred > max_rating:
534 |                 rating_pred = max_rating
535 |             elif rating_pred < min_rating:
536 |                 rating_pred = min_rating
537 | 
538 |         predictions.append(rating_pred)
539 |         predictions_possible.append(user_known and item_known)
540 | 
541 |     return predictions, predictions_possible
542 | 
--------------------------------------------------------------------------------
/examples/recommender-system.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "code",
5 |    "execution_count": 1,
6 |    "metadata": {
7 |     "tags": []
8 |    },
9 |    "outputs": [],
10 |    "source": [
11 |     "# Data manipulation\n",
12 |     "import numpy as np\n",
13 |     "import pandas as pd\n",
14 |     "pd.options.display.max_rows = 100\n",
15 |     "\n",
16 |     "# Modeling\n",
17 |     "from matrix_factorization import BaselineModel, KernelMF, train_update_test_split\n",
18 |     "from sklearn.metrics import mean_squared_error\n",
19 |     "from sklearn.model_selection import train_test_split\n",
20 |     "\n",
21 |     "# Other\n",
22 |     "import os\n",
23 |     "import random\n",
24 |     "import sys\n",
25 |     "\n",
26 |     "# Reload imported code \n",
27 |     "%load_ext autoreload\n",
28 |     "%autoreload 2\n",
29 |     "\n",
30 |     "# Print all output\n",
31 |     "from IPython.core.interactiveshell import InteractiveShell\n",
32 |     "InteractiveShell.ast_node_interactivity = \"all\"\n",
33 |     " \n",
34 |     "rand_seed = 2\n",
35 |     "np.random.seed(rand_seed)\n",
36 |     "random.seed(rand_seed)"
37 |    ]
38 |   },
39 |   {
40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "# Load data" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "**Movie data found here https://grouplens.org/datasets/movielens/**" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 2, 56 | "metadata": {}, 57 | "outputs": [ 58 | { 59 | "data": { 60 | "text/html": [ 61 | "
\n", 62 | "\n", 75 | "\n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | "
user_iditem_idrating
01962423
11863023
2223771
3244512
41663461
52984744
61152652
72534655
83054513
96863
\n", 147 | "
" 148 | ], 149 | "text/plain": [ 150 | " user_id item_id rating\n", 151 | "0 196 242 3\n", 152 | "1 186 302 3\n", 153 | "2 22 377 1\n", 154 | "3 244 51 2\n", 155 | "4 166 346 1\n", 156 | "5 298 474 4\n", 157 | "6 115 265 2\n", 158 | "7 253 465 5\n", 159 | "8 305 451 3\n", 160 | "9 6 86 3" 161 | ] 162 | }, 163 | "execution_count": 2, 164 | "metadata": {}, 165 | "output_type": "execute_result" 166 | } 167 | ], 168 | "source": [ 169 | "cols = ['user_id', 'item_id', 'rating', 'timestamp']\n", 170 | "# movie_data = pd.read_csv('../data/ml-1m/ratings.dat', names = cols, sep = '::', usecols=[0, 1, 2], engine='python')\n", 171 | "movie_data = pd.read_csv('../data/ml-100k/u.data', names = cols, sep = '\\t', usecols=[0, 1, 2], engine='python')\n", 172 | "\n", 173 | "X = movie_data[['user_id', 'item_id']]\n", 174 | "y = movie_data['rating']\n", 175 | "\n", 176 | "# Prepare data\n", 177 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)\n", 178 | "\n", 179 | "# Prepare data for online learning\n", 180 | "X_train_initial, y_train_initial, X_train_update, y_train_update, X_test_update, y_test_update = train_update_test_split(movie_data, frac_new_users=0.2)\n", 181 | "\n", 182 | "movie_data.head(10)" 183 | ] 184 | }, 185 | { 186 | "cell_type": "markdown", 187 | "metadata": {}, 188 | "source": [ 189 | "# Simple model with global mean" 190 | ] 191 | }, 192 | { 193 | "cell_type": "markdown", 194 | "metadata": {}, 195 | "source": [ 196 | "This is similar to just the global standard deviation" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": 3, 202 | "metadata": { 203 | "tags": [] 204 | }, 205 | "outputs": [ 206 | { 207 | "name": "stdout", 208 | "output_type": "stream", 209 | "text": [ 210 | "\n", 211 | "Test RMSE: 1.120652\n" 212 | ] 213 | } 214 | ], 215 | "source": [ 216 | "global_mean = y_train.mean()\n", 217 | "pred = [global_mean for _ in range(y_test.shape[0])]\n", 218 | "\n", 219 | "rmse = mean_squared_error(y_test, pred, squared = False)\n", 220 | "\n", 221 | "print(f'\\nTest RMSE: {rmse:4f}')" 222 | ] 223 | }, 224 | { 225 | "cell_type": "markdown", 226 | "metadata": {}, 227 | "source": [ 228 | "# Baseline Model with biases" 229 | ] 230 | }, 231 | { 232 | "cell_type": "markdown", 233 | "metadata": {}, 234 | "source": [ 235 | "## SGD" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": 4, 241 | "metadata": {}, 242 | "outputs": [ 243 | { 244 | "name": "stdout", 245 | "output_type": "stream", 246 | "text": [ 247 | "Epoch 1 / 20 - train_rmse: 0.9685443987174238\n", 248 | "Epoch 2 / 20 - train_rmse: 0.945448032425675\n", 249 | "Epoch 3 / 20 - train_rmse: 0.9350744230954693\n", 250 | "Epoch 4 / 20 - train_rmse: 0.9294774771346712\n", 251 | "Epoch 5 / 20 - train_rmse: 0.9258635943145475\n", 252 | "Epoch 6 / 20 - train_rmse: 0.9235995589398913\n", 253 | "Epoch 7 / 20 - train_rmse: 0.9218589129974872\n", 254 | "Epoch 8 / 20 - train_rmse: 0.9205752967946901\n", 255 | "Epoch 9 / 20 - train_rmse: 0.9197497680553437\n", 256 | "Epoch 10 / 20 - train_rmse: 0.9189075470532244\n", 257 | "Epoch 11 / 20 - train_rmse: 0.9184605627485326\n", 258 | "Epoch 12 / 20 - train_rmse: 0.9180274072268116\n", 259 | "Epoch 13 / 20 - train_rmse: 0.9174771346162836\n", 260 | "Epoch 14 / 20 - train_rmse: 0.9172615435062336\n", 261 | "Epoch 15 / 20 - train_rmse: 0.9169118664096015\n", 262 | "Epoch 16 / 20 - train_rmse: 0.916762599540885\n", 263 | "Epoch 17 / 20 - train_rmse: 0.9165916401686293\n", 264 | "Epoch 18 / 20 - train_rmse: 0.9164009881488299\n", 265 | 
"Epoch 19 / 20 - train_rmse: 0.9161039428103391\n", 266 | "Epoch 20 / 20 - train_rmse: 0.9160441667784996\n", 267 | "\n", 268 | "Test RMSE: 0.9298\n", 269 | "Wall time: 3.25 s\n" 270 | ] 271 | } 272 | ], 273 | "source": [ 274 | "%%time\n", 275 | "\n", 276 | "baseline_model = BaselineModel(method='sgd', n_epochs = 20, reg = 0.005, lr = 0.01, verbose=1)\n", 277 | "baseline_model.fit(X_train, y_train)\n", 278 | "\n", 279 | "pred = baseline_model.predict(X_test)\n", 280 | "rmse = mean_squared_error(y_test, pred, squared = False)\n", 281 | "\n", 282 | "print(f'\\nTest RMSE: {rmse:.4f}')" 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": 5, 288 | "metadata": {}, 289 | "outputs": [ 290 | { 291 | "data": { 292 | "text/html": [ 293 | "
\n", 294 | "\n", 307 | "\n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | "
user_iditem_idrating_pred
3782003185.0
4572003575.0
3882004085.0
98820014495.0
2812004835.0
7902001145.0
1092001275.0
562200125.0
2122001695.0
542006035.0
\n", 379 | "
" 380 | ], 381 | "text/plain": [ 382 | " user_id item_id rating_pred\n", 383 | "378 200 318 5.0\n", 384 | "457 200 357 5.0\n", 385 | "388 200 408 5.0\n", 386 | "988 200 1449 5.0\n", 387 | "281 200 483 5.0\n", 388 | "790 200 114 5.0\n", 389 | "109 200 127 5.0\n", 390 | "562 200 12 5.0\n", 391 | "212 200 169 5.0\n", 392 | "54 200 603 5.0" 393 | ] 394 | }, 395 | "execution_count": 5, 396 | "metadata": {}, 397 | "output_type": "execute_result" 398 | } 399 | ], 400 | "source": [ 401 | "baseline_model.recommend(user=200)" 402 | ] 403 | }, 404 | { 405 | "cell_type": "markdown", 406 | "metadata": {}, 407 | "source": [ 408 | "## ALS" 409 | ] 410 | }, 411 | { 412 | "cell_type": "code", 413 | "execution_count": 6, 414 | "metadata": {}, 415 | "outputs": [ 416 | { 417 | "name": "stdout", 418 | "output_type": "stream", 419 | "text": [ 420 | "Epoch 1 / 20 - train_rmse: 0.9312489364350157\n", 421 | "Epoch 2 / 20 - train_rmse: 0.9144875214764501\n", 422 | "Epoch 3 / 20 - train_rmse: 0.9134856911195807\n", 423 | "Epoch 4 / 20 - train_rmse: 0.9133800448918423\n", 424 | "Epoch 5 / 20 - train_rmse: 0.9133615794862777\n", 425 | "Epoch 6 / 20 - train_rmse: 0.9133565857003941\n", 426 | "Epoch 7 / 20 - train_rmse: 0.9133544601244424\n", 427 | "Epoch 8 / 20 - train_rmse: 0.9133531004630441\n", 428 | "Epoch 9 / 20 - train_rmse: 0.9133519902067218\n", 429 | "Epoch 10 / 20 - train_rmse: 0.9133509792033206\n", 430 | "Epoch 11 / 20 - train_rmse: 0.9133500175542733\n", 431 | "Epoch 12 / 20 - train_rmse: 0.9133490869495551\n", 432 | "Epoch 13 / 20 - train_rmse: 0.9133481801287349\n", 433 | "Epoch 14 / 20 - train_rmse: 0.9133472939684136\n", 434 | "Epoch 15 / 20 - train_rmse: 0.9133464269599311\n", 435 | "Epoch 16 / 20 - train_rmse: 0.9133455782426871\n", 436 | "Epoch 17 / 20 - train_rmse: 0.9133447472230197\n", 437 | "Epoch 18 / 20 - train_rmse: 0.9133439334215674\n", 438 | "Epoch 19 / 20 - train_rmse: 0.9133431364114416\n", 439 | "Epoch 20 / 20 - train_rmse: 0.9133423557930989\n", 440 | "\n", 441 | "Test RMSE: 0.9294\n", 442 | "Wall time: 1.17 s\n" 443 | ] 444 | } 445 | ], 446 | "source": [ 447 | "%%time\n", 448 | "\n", 449 | "baseline_model = BaselineModel(method='als', n_epochs = 20, reg = 0.5, verbose=1)\n", 450 | "baseline_model.fit(X_train, y_train)\n", 451 | "\n", 452 | "pred = baseline_model.predict(X_test)\n", 453 | "rmse = mean_squared_error(y_test, pred, squared = False)\n", 454 | "\n", 455 | "print(f'\\nTest RMSE: {rmse:.4f}')" 456 | ] 457 | }, 458 | { 459 | "cell_type": "markdown", 460 | "metadata": {}, 461 | "source": [ 462 | "## Updating with new users" 463 | ] 464 | }, 465 | { 466 | "cell_type": "code", 467 | "execution_count": 7, 468 | "metadata": { 469 | "tags": [] 470 | }, 471 | "outputs": [ 472 | { 473 | "name": "stdout", 474 | "output_type": "stream", 475 | "text": [ 476 | "Epoch 1 / 20 - train_rmse: 0.9650236406922229\n", 477 | "Epoch 2 / 20 - train_rmse: 0.9428226226596799\n", 478 | "Epoch 3 / 20 - train_rmse: 0.9331705124882925\n", 479 | "Epoch 4 / 20 - train_rmse: 0.9279749973416741\n", 480 | "Epoch 5 / 20 - train_rmse: 0.9247974571263335\n", 481 | "Epoch 6 / 20 - train_rmse: 0.9226517575035114\n", 482 | "Epoch 7 / 20 - train_rmse: 0.920835039334346\n", 483 | "Epoch 8 / 20 - train_rmse: 0.9197367786245378\n", 484 | "Epoch 9 / 20 - train_rmse: 0.9189681287833118\n", 485 | "Epoch 10 / 20 - train_rmse: 0.9181493468113285\n", 486 | "Epoch 11 / 20 - train_rmse: 0.9177119438426637\n", 487 | "Epoch 12 / 20 - train_rmse: 0.9172589415232193\n", 488 | "Epoch 13 / 20 - train_rmse: 0.9168827001131301\n", 489 
| "Epoch 14 / 20 - train_rmse: 0.9164445680503323\n", 490 | "Epoch 15 / 20 - train_rmse: 0.9164404466859075\n", 491 | "Epoch 16 / 20 - train_rmse: 0.9160093360322635\n", 492 | "Epoch 17 / 20 - train_rmse: 0.9158025569643043\n", 493 | "Epoch 18 / 20 - train_rmse: 0.9157375955425434\n", 494 | "Epoch 19 / 20 - train_rmse: 0.9156845197413601\n", 495 | "Epoch 20 / 20 - train_rmse: 0.9153536272183195\n" 496 | ] 497 | }, 498 | { 499 | "data": { 500 | "text/plain": [ 501 | "BaselineModel(n_epochs=20, reg=0.05)" 502 | ] 503 | }, 504 | "execution_count": 7, 505 | "metadata": {}, 506 | "output_type": "execute_result" 507 | } 508 | ], 509 | "source": [ 510 | "baseline_model = BaselineModel(method='sgd', n_epochs = 20, lr=0.01, reg = 0.05, verbose=1)\n", 511 | "baseline_model.fit(X_train_initial, y_train_initial)" 512 | ] 513 | }, 514 | { 515 | "cell_type": "code", 516 | "execution_count": 8, 517 | "metadata": { 518 | "tags": [] 519 | }, 520 | "outputs": [ 521 | { 522 | "name": "stdout", 523 | "output_type": "stream", 524 | "text": [ 525 | "Epoch 1 / 20 - train_rmse: 1.0192369838658015\n", 526 | "Epoch 2 / 20 - train_rmse: 1.0025765882013635\n", 527 | "Epoch 3 / 20 - train_rmse: 0.9901259692095271\n", 528 | "Epoch 4 / 20 - train_rmse: 0.9807144030582827\n", 529 | "Epoch 5 / 20 - train_rmse: 0.9734408997442995\n", 530 | "Epoch 6 / 20 - train_rmse: 0.9677156773644434\n", 531 | "Epoch 7 / 20 - train_rmse: 0.9631030982793267\n", 532 | "Epoch 8 / 20 - train_rmse: 0.9593444020925831\n", 533 | "Epoch 9 / 20 - train_rmse: 0.9562283345776661\n", 534 | "Epoch 10 / 20 - train_rmse: 0.9536075629675317\n", 535 | "Epoch 11 / 20 - train_rmse: 0.9513672180603409\n", 536 | "Epoch 12 / 20 - train_rmse: 0.9494208315066158\n", 537 | "Epoch 13 / 20 - train_rmse: 0.9477253749191763\n", 538 | "Epoch 14 / 20 - train_rmse: 0.946229927618241\n", 539 | "Epoch 15 / 20 - train_rmse: 0.9449080911468511\n", 540 | "Epoch 16 / 20 - train_rmse: 0.943720843305453\n", 541 | "Epoch 17 / 20 - train_rmse: 0.9426516413656599\n", 542 | "Epoch 18 / 20 - train_rmse: 0.9416762680286268\n", 543 | "Epoch 19 / 20 - train_rmse: 0.9407955983703769\n", 544 | "Epoch 20 / 20 - train_rmse: 0.9399846956755161\n", 545 | "\n", 546 | "Test RMSE: 0.9484\n", 547 | "Wall time: 965 ms\n" 548 | ] 549 | } 550 | ], 551 | "source": [ 552 | "%%time\n", 553 | "baseline_model.update_users(X_train_update, y_train_update, n_epochs=20, lr=0.001, verbose=1)\n", 554 | "pred = baseline_model.predict(X_test_update)\n", 555 | "rmse = mean_squared_error(y_test_update, pred, squared = False)\n", 556 | "\n", 557 | "print(f'\\nTest RMSE: {rmse:.4f}')" 558 | ] 559 | }, 560 | { 561 | "cell_type": "markdown", 562 | "metadata": {}, 563 | "source": [ 564 | "# Matrix Factorization" 565 | ] 566 | }, 567 | { 568 | "cell_type": "markdown", 569 | "metadata": {}, 570 | "source": [ 571 | "## Linear Kernel" 572 | ] 573 | }, 574 | { 575 | "cell_type": "code", 576 | "execution_count": 9, 577 | "metadata": {}, 578 | "outputs": [ 579 | { 580 | "name": "stdout", 581 | "output_type": "stream", 582 | "text": [ 583 | "Epoch 1 / 20 - train_rmse: 1.0801330309911932\n", 584 | "Epoch 2 / 20 - train_rmse: 1.0473476509450943\n", 585 | "Epoch 3 / 20 - train_rmse: 1.0244646832888804\n", 586 | "Epoch 4 / 20 - train_rmse: 1.0074920647400105\n", 587 | "Epoch 5 / 20 - train_rmse: 0.994246835724601\n", 588 | "Epoch 6 / 20 - train_rmse: 0.9835051043916838\n", 589 | "Epoch 7 / 20 - train_rmse: 0.9745225390156432\n", 590 | "Epoch 8 / 20 - train_rmse: 0.9668223717422572\n", 591 | "Epoch 9 / 20 - train_rmse: 
0.9600683414209181\n", 592 | "Epoch 10 / 20 - train_rmse: 0.9540555205061302\n", 593 | "Epoch 11 / 20 - train_rmse: 0.9486137679667849\n", 594 | "Epoch 12 / 20 - train_rmse: 0.9436380921221055\n", 595 | "Epoch 13 / 20 - train_rmse: 0.9390299858326666\n", 596 | "Epoch 14 / 20 - train_rmse: 0.9347250023203936\n", 597 | "Epoch 15 / 20 - train_rmse: 0.9306721252709302\n", 598 | "Epoch 16 / 20 - train_rmse: 0.9268329678953544\n", 599 | "Epoch 17 / 20 - train_rmse: 0.9231713443339361\n", 600 | "Epoch 18 / 20 - train_rmse: 0.919660317751421\n", 601 | "Epoch 19 / 20 - train_rmse: 0.9162775396770947\n", 602 | "Epoch 20 / 20 - train_rmse: 0.9130048063578868\n", 603 | "\n", 604 | "Test RMSE: 0.9534\n", 605 | "Wall time: 15.7 s\n" 606 | ] 607 | } 608 | ], 609 | "source": [ 610 | "%%time \n", 611 | "matrix_fact = KernelMF(n_epochs = 20, n_factors = 100, verbose = 1, lr = 0.001, reg = 0.005)\n", 612 | "matrix_fact.fit(X_train, y_train)\n", 613 | "\n", 614 | "pred = matrix_fact.predict(X_test)\n", 615 | "rmse = mean_squared_error(y_test, pred, squared = False)\n", 616 | "\n", 617 | "print(f'\\nTest RMSE: {rmse:.4f}')" 618 | ] 619 | }, 620 | { 621 | "cell_type": "markdown", 622 | "metadata": {}, 623 | "source": [ 624 | "## Getting list of recommendations for a user" 625 | ] 626 | }, 627 | { 628 | "cell_type": "code", 629 | "execution_count": 10, 630 | "metadata": {}, 631 | "outputs": [ 632 | { 633 | "data": { 634 | "text/html": [ 635 | "
\n", 636 | "\n", 649 | "\n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | "
user_iditem_idrating_pred
37200645.000000
2422003574.953382
112001274.914760
612002724.904249
7102004794.837060
3952004804.836522
275200124.816657
6552004274.808555
552005114.804192
1442002854.797472
\n", 721 | "
" 722 | ], 723 | "text/plain": [ 724 | " user_id item_id rating_pred\n", 725 | "37 200 64 5.000000\n", 726 | "242 200 357 4.953382\n", 727 | "11 200 127 4.914760\n", 728 | "61 200 272 4.904249\n", 729 | "710 200 479 4.837060\n", 730 | "395 200 480 4.836522\n", 731 | "275 200 12 4.816657\n", 732 | "655 200 427 4.808555\n", 733 | "55 200 511 4.804192\n", 734 | "144 200 285 4.797472" 735 | ] 736 | }, 737 | "execution_count": 10, 738 | "metadata": {}, 739 | "output_type": "execute_result" 740 | } 741 | ], 742 | "source": [ 743 | "user = 200\n", 744 | "items_known = X_train.query('user_id == @user')['item_id']\n", 745 | "matrix_fact.recommend(user=user, items_known=items_known)" 746 | ] 747 | }, 748 | { 749 | "cell_type": "markdown", 750 | "metadata": {}, 751 | "source": [ 752 | "## Updating with new users" 753 | ] 754 | }, 755 | { 756 | "cell_type": "code", 757 | "execution_count": 11, 758 | "metadata": {}, 759 | "outputs": [ 760 | { 761 | "name": "stdout", 762 | "output_type": "stream", 763 | "text": [ 764 | "Epoch 1 / 20 - train_rmse: 1.0706518319392073\n", 765 | "Epoch 2 / 20 - train_rmse: 1.0382624779438394\n", 766 | "Epoch 3 / 20 - train_rmse: 1.016232308328001\n", 767 | "Epoch 4 / 20 - train_rmse: 0.9999366805279928\n", 768 | "Epoch 5 / 20 - train_rmse: 0.9872308367922817\n", 769 | "Epoch 6 / 20 - train_rmse: 0.9769357406601346\n", 770 | "Epoch 7 / 20 - train_rmse: 0.9683129631342807\n", 771 | "Epoch 8 / 20 - train_rmse: 0.9609022720622064\n", 772 | "Epoch 9 / 20 - train_rmse: 0.9543972792347011\n", 773 | "Epoch 10 / 20 - train_rmse: 0.9485809462916166\n", 774 | "Epoch 11 / 20 - train_rmse: 0.9433059630075376\n", 775 | "Epoch 12 / 20 - train_rmse: 0.9384619306949283\n", 776 | "Epoch 13 / 20 - train_rmse: 0.9339642725110164\n", 777 | "Epoch 14 / 20 - train_rmse: 0.9297503741854064\n", 778 | "Epoch 15 / 20 - train_rmse: 0.9257711482478324\n", 779 | "Epoch 16 / 20 - train_rmse: 0.921985910287917\n", 780 | "Epoch 17 / 20 - train_rmse: 0.9183647974387779\n", 781 | "Epoch 18 / 20 - train_rmse: 0.9148839852245906\n", 782 | "Epoch 19 / 20 - train_rmse: 0.9115179356050906\n", 783 | "Epoch 20 / 20 - train_rmse: 0.9082510051903396\n" 784 | ] 785 | }, 786 | { 787 | "data": { 788 | "text/plain": [ 789 | "KernelMF(gamma=0.01, lr=0.001, n_epochs=20, reg=0.005)" 790 | ] 791 | }, 792 | "execution_count": 11, 793 | "metadata": {}, 794 | "output_type": "execute_result" 795 | } 796 | ], 797 | "source": [ 798 | "matrix_fact = KernelMF(n_epochs = 20, n_factors = 100, verbose = 1, lr = 0.001, reg = 0.005)\n", 799 | "matrix_fact.fit(X_train_initial, y_train_initial)" 800 | ] 801 | }, 802 | { 803 | "cell_type": "code", 804 | "execution_count": 12, 805 | "metadata": {}, 806 | "outputs": [ 807 | { 808 | "name": "stdout", 809 | "output_type": "stream", 810 | "text": [ 811 | "Epoch 1 / 20 - train_rmse: 1.0397682004761137\n", 812 | "Epoch 2 / 20 - train_rmse: 1.0204525249402976\n", 813 | "Epoch 3 / 20 - train_rmse: 1.0058106710178145\n", 814 | "Epoch 4 / 20 - train_rmse: 0.9945551189184298\n", 815 | "Epoch 5 / 20 - train_rmse: 0.9856789782783212\n", 816 | "Epoch 6 / 20 - train_rmse: 0.9785788434701258\n", 817 | "Epoch 7 / 20 - train_rmse: 0.9727422998246427\n", 818 | "Epoch 8 / 20 - train_rmse: 0.9678438078577599\n", 819 | "Epoch 9 / 20 - train_rmse: 0.9636632891501984\n", 820 | "Epoch 10 / 20 - train_rmse: 0.9600308660297464\n", 821 | "Epoch 11 / 20 - train_rmse: 0.9568136464702428\n", 822 | "Epoch 12 / 20 - train_rmse: 0.9539161652784045\n", 823 | "Epoch 13 / 20 - train_rmse: 0.9512904364030054\n", 824 | "Epoch 14 
/ 20 - train_rmse: 0.9488745417666238\n", 825 | "Epoch 15 / 20 - train_rmse: 0.9466285136632905\n", 826 | "Epoch 16 / 20 - train_rmse: 0.94452616338993\n", 827 | "Epoch 17 / 20 - train_rmse: 0.9425492611358841\n", 828 | "Epoch 18 / 20 - train_rmse: 0.9406751136767649\n", 829 | "Epoch 19 / 20 - train_rmse: 0.9388943623139107\n", 830 | "Epoch 20 / 20 - train_rmse: 0.9371880494897803\n", 831 | "\n", 832 | "Test RMSE: 0.9677\n", 833 | "Wall time: 2.01 s\n" 834 | ] 835 | } 836 | ], 837 | "source": [ 838 | "%%time\n", 839 | "# Update model with new users\n", 840 | "matrix_fact.update_users(X_train_update, y_train_update, lr=0.001, n_epochs=20, verbose=1)\n", 841 | "pred = matrix_fact.predict(X_test_update)\n", 842 | "rmse = mean_squared_error(y_test_update, pred, squared = False)\n", 843 | "\n", 844 | "print(f'\\nTest RMSE: {rmse:.4f}')" 845 | ] 846 | }, 847 | { 848 | "cell_type": "markdown", 849 | "metadata": {}, 850 | "source": [ 851 | "## Sigmoid kernel" 852 | ] 853 | }, 854 | { 855 | "cell_type": "code", 856 | "execution_count": 13, 857 | "metadata": {}, 858 | "outputs": [ 859 | { 860 | "name": "stdout", 861 | "output_type": "stream", 862 | "text": [ 863 | "Epoch 1 / 20 - train_rmse: 1.7254842363611376\n", 864 | "Epoch 2 / 20 - train_rmse: 1.700347578847924\n", 865 | "Epoch 3 / 20 - train_rmse: 1.6622359141199023\n", 866 | "Epoch 4 / 20 - train_rmse: 1.6210456578773018\n", 867 | "Epoch 5 / 20 - train_rmse: 1.5756597449133936\n", 868 | "Epoch 6 / 20 - train_rmse: 1.523379818791774\n", 869 | "Epoch 7 / 20 - train_rmse: 1.4657317754887915\n", 870 | "Epoch 8 / 20 - train_rmse: 1.4093479432787581\n", 871 | "Epoch 9 / 20 - train_rmse: 1.358332738938575\n", 872 | "Epoch 10 / 20 - train_rmse: 1.3133318818212163\n", 873 | "Epoch 11 / 20 - train_rmse: 1.2739396811494321\n", 874 | "Epoch 12 / 20 - train_rmse: 1.2393300382279362\n", 875 | "Epoch 13 / 20 - train_rmse: 1.2087120677746743\n", 876 | "Epoch 14 / 20 - train_rmse: 1.181458675550588\n", 877 | "Epoch 15 / 20 - train_rmse: 1.1570300259298787\n", 878 | "Epoch 16 / 20 - train_rmse: 1.1349358601708097\n", 879 | "Epoch 17 / 20 - train_rmse: 1.114946996505043\n", 880 | "Epoch 18 / 20 - train_rmse: 1.0966573702646067\n", 881 | "Epoch 19 / 20 - train_rmse: 1.079843880247601\n", 882 | "Epoch 20 / 20 - train_rmse: 1.0642701656384883\n", 883 | "\n", 884 | "Test RMSE: 1.1110\n", 885 | "Wall time: 1.77 s\n" 886 | ] 887 | } 888 | ], 889 | "source": [ 890 | "%%time \n", 891 | "matrix_fact = KernelMF(n_epochs = 20, n_factors = 100, verbose = 1, lr = 0.01, reg = 0.005, kernel='sigmoid')\n", 892 | "matrix_fact.fit(X_train, y_train)\n", 893 | "\n", 894 | "pred = matrix_fact.predict(X_test)\n", 895 | "rmse = mean_squared_error(y_test, pred, squared = False)\n", 896 | "\n", 897 | "print(f'\\nTest RMSE: {rmse:.4f}')" 898 | ] 899 | }, 900 | { 901 | "cell_type": "markdown", 902 | "metadata": {}, 903 | "source": [ 904 | "## RBF Kernel" 905 | ] 906 | }, 907 | { 908 | "cell_type": "code", 909 | "execution_count": 14, 910 | "metadata": {}, 911 | "outputs": [ 912 | { 913 | "name": "stdout", 914 | "output_type": "stream", 915 | "text": [ 916 | "Epoch 1 / 20 - train_rmse: 1.261497709751721\n", 917 | "Epoch 2 / 20 - train_rmse: 1.1098240081612984\n", 918 | "Epoch 3 / 20 - train_rmse: 1.0469994987862579\n", 919 | "Epoch 4 / 20 - train_rmse: 1.005181914551291\n", 920 | "Epoch 5 / 20 - train_rmse: 0.9752579187861348\n", 921 | "Epoch 6 / 20 - train_rmse: 0.9515686603321364\n", 922 | "Epoch 7 / 20 - train_rmse: 0.9340638617221303\n", 923 | "Epoch 8 / 20 - train_rmse: 
0.9213238773972364\n",
924 | "Epoch 9 / 20 - train_rmse: 0.9115143003092134\n",
925 | "Epoch 10 / 20 - train_rmse: 0.9039437993331968\n",
926 | "Epoch 11 / 20 - train_rmse: 0.899792715730062\n",
927 | "Epoch 12 / 20 - train_rmse: 0.8949836709174682\n",
928 | "Epoch 13 / 20 - train_rmse: 0.8934174679325033\n",
929 | "Epoch 14 / 20 - train_rmse: 0.8897947618902249\n",
930 | "Epoch 15 / 20 - train_rmse: 0.8861334672817339\n",
931 | "Epoch 16 / 20 - train_rmse: 0.8850958002049469\n",
932 | "Epoch 17 / 20 - train_rmse: 0.883513182070616\n",
933 | "Epoch 18 / 20 - train_rmse: 0.8818590959179743\n",
934 | "Epoch 19 / 20 - train_rmse: 0.8817834058789318\n",
935 | "Epoch 20 / 20 - train_rmse: 0.8826416261286896\n",
936 | "\n",
937 | "Test RMSE: 0.9696\n",
938 | "Wall time: 3.55 s\n"
939 | ]
940 | }
941 | ],
942 | "source": [
943 | "%%time \n",
944 | "matrix_fact = KernelMF(n_epochs = 20, n_factors = 100, verbose = 1, lr = 0.5, reg = 0.005, kernel='rbf')\n",
945 | "matrix_fact.fit(X_train, y_train)\n",
946 | "\n",
947 | "pred = matrix_fact.predict(X_test)\n",
948 | "rmse = mean_squared_error(y_test, pred, squared = False)\n",
949 | "\n",
950 | "print(f'\\nTest RMSE: {rmse:.4f}')"
951 | ]
952 | },
953 | {
954 | "cell_type": "markdown",
955 | "metadata": {},
956 | "source": [
957 | "# Scikit-learn compatibility"
958 | ]
959 | },
960 | {
961 | "cell_type": "code",
962 | "execution_count": 15,
963 | "metadata": {},
964 | "outputs": [
965 | {
966 | "name": "stdout",
967 | "output_type": "stream",
968 | "text": [
969 | "Fitting 5 folds for each of 81 candidates, totalling 405 fits\n"
970 | ]
971 | },
972 | {
973 | "name": "stderr",
974 | "output_type": "stream",
975 | "text": [
976 | "[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.\n",
977 | "[Parallel(n_jobs=-1)]: Done 42 tasks | elapsed: 36.8s\n",
978 | "[Parallel(n_jobs=-1)]: Done 192 tasks | elapsed: 1.5min\n",
979 | "[Parallel(n_jobs=-1)]: Done 405 out of 405 | elapsed: 3.4min finished\n"
980 | ]
981 | },
982 | {
983 | "data": {
984 | "text/plain": [
985 | "GridSearchCV(cv=5, estimator=KernelMF(gamma=0.01, verbose=0), n_jobs=-1,\n",
986 | " param_grid={'kernel': ['linear', 'sigmoid', 'rbf'],\n",
987 | " 'n_epochs': [10, 20, 50], 'n_factors': [10, 20, 50],\n",
988 | " 'reg': [0, 0.005, 0.1]},\n",
989 | " scoring='neg_root_mean_squared_error', verbose=1)"
990 | ]
991 | },
992 | "execution_count": 15,
993 | "metadata": {},
994 | "output_type": "execute_result"
995 | }
996 | ],
997 | "source": [
998 | "from sklearn.model_selection import GridSearchCV\n",
999 | "\n",
1000 | "param_grid = {\n",
1001 | " 'kernel': ['linear', 'sigmoid', 'rbf'],\n",
1002 | " 'n_factors': [10, 20, 50],\n",
1003 | " 'n_epochs': [10, 20, 50],\n",
1004 | " 'reg': [0, 0.005, 0.1]\n",
1005 | "}\n",
1006 | "\n",
1007 | "grid_search = GridSearchCV(KernelMF(verbose=0), scoring = 'neg_root_mean_squared_error', param_grid=param_grid, n_jobs=-1, cv=5, verbose=1)\n",
1008 | "grid_search.fit(X_train, y_train)"
1009 | ]
1010 | },
1011 | {
1012 | "cell_type": "code",
1013 | "execution_count": 16,
1014 | "metadata": {},
1015 | "outputs": [
1016 | {
1017 | "data": {
1018 | "text/plain": [
1019 | "-0.9252872735695155"
1020 | ]
1021 | },
1022 | "execution_count": 16,
1023 | "metadata": {},
1024 | "output_type": "execute_result"
1025 | },
1026 | {
1027 | "data": {
1028 | "text/plain": [
1029 | "{'kernel': 'linear', 'n_epochs': 50, 'n_factors': 50, 'reg': 0.1}"
1030 | ]
1031 | },
1032 | "execution_count": 16,
1033 | "metadata": {},
1034 | "output_type":
"execute_result" 1035 | } 1036 | ], 1037 | "source": [ 1038 | "grid_search.best_score_\n", 1039 | "grid_search.best_params_" 1040 | ] 1041 | } 1042 | ], 1043 | "metadata": { 1044 | "kernelspec": { 1045 | "display_name": "Python [conda env:recommend]", 1046 | "language": "python", 1047 | "name": "conda-env-recommend-py" 1048 | }, 1049 | "language_info": { 1050 | "codemirror_mode": { 1051 | "name": "ipython", 1052 | "version": 3 1053 | }, 1054 | "file_extension": ".py", 1055 | "mimetype": "text/x-python", 1056 | "name": "python", 1057 | "nbconvert_exporter": "python", 1058 | "pygments_lexer": "ipython3", 1059 | "version": "3.7.7" 1060 | }, 1061 | "toc-autonumbering": true, 1062 | "toc-showcode": false, 1063 | "toc-showmarkdowntxt": false, 1064 | "toc-showtags": false 1065 | }, 1066 | "nbformat": 4, 1067 | "nbformat_minor": 4 1068 | } 1069 | --------------------------------------------------------------------------------