├── tfcf
│   ├── __init__.py
│   ├── models
│   │   ├── __init__.py
│   │   ├── model_base.py
│   │   ├── svdpp.py
│   │   └── svd.py
│   ├── utils
│   │   ├── __init__.py
│   │   └── data_utils.py
│   ├── datasets
│   │   ├── __init__.py
│   │   ├── ml100k.py
│   │   └── ml1m.py
│   ├── config.py
│   └── metrics.py
├── setup.cfg
├── setup.py
├── LICENSE
├── .gitignore
└── README.md

/tfcf/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/tfcf/models/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/tfcf/utils/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/tfcf/datasets/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
[metadata]
description-file = README.md
--------------------------------------------------------------------------------
/tfcf/datasets/ml100k.py:
--------------------------------------------------------------------------------
import pandas as pd

from ..utils.data_utils import get_zip_file


def load_data():
    """Loads the MovieLens 100K dataset.

    Returns:
        A tuple of numpy arrays (x, y).
    """

    URL = 'http://files.grouplens.org/datasets/movielens/ml-100k.zip'
    FILE_PATH = 'ml-100k/u.data'

    file = get_zip_file(URL, FILE_PATH)
    df = pd.read_csv(file, sep='\t', header=None)

    return df.iloc[:, :2].values, df.iloc[:, 2].values
--------------------------------------------------------------------------------
/tfcf/config.py:
--------------------------------------------------------------------------------
import numpy as np


class Config(object):
    """Configuration class for collaborative filtering models.
    """

    num_users = None
    num_items = None
    num_factors = 15

    # minimum and maximum values for clipping predictions
    min_value = -np.inf
    max_value = np.inf

    # regularization scales
    reg_b_u = 0.0001
    reg_b_i = 0.0001
    reg_p_u = 0.005
    reg_q_i = 0.005
    reg_y_u = 0.005
    reg_g_i = 0.005
--------------------------------------------------------------------------------
/tfcf/datasets/ml1m.py:
--------------------------------------------------------------------------------
import pandas as pd

from ..utils.data_utils import get_zip_file


def load_data():
    """Loads the MovieLens 1M dataset.

    Returns:
        A tuple of numpy arrays (x, y).
    """

    URL = 'http://files.grouplens.org/datasets/movielens/ml-1m.zip'
    FILE_PATH = 'ml-1m/ratings.dat'

    file = get_zip_file(URL, FILE_PATH)
    df = pd.read_csv(file, sep='::', header=None, engine='python')

    return df.iloc[:, :2].values, df.iloc[:, 2].values
--------------------------------------------------------------------------------
/tfcf/metrics.py:
--------------------------------------------------------------------------------
"""Operations related to evaluating predictions.
"""

import numpy as np


def mse(y, y_pred):
    """Returns the mean squared error between
    ground truths and predictions.
    """
    return np.mean((y - y_pred) ** 2)


def rmse(y, y_pred):
    """Returns the root mean squared error between
    ground truths and predictions.
    """
    return np.sqrt(mse(y, y_pred))


def mae(y, y_pred):
    """Returns the mean absolute error between
    ground truths and predictions.
    """
    return np.mean(np.fabs(y - y_pred))
--------------------------------------------------------------------------------
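A minimal usage sketch of these metrics; the arrays below are made-up toy values, not repo data:

```python
import numpy as np
from tfcf.metrics import mse, rmse, mae

y = np.array([4.0, 3.0, 5.0])        # ground-truth ratings
y_pred = np.array([3.5, 3.0, 4.0])   # model predictions

print(mse(y, y_pred))   # mean((y - y_pred) ** 2)  -> 0.4166...
print(rmse(y, y_pred))  # sqrt(mse)                -> 0.6454...
print(mae(y, y_pred))   # mean(|y - y_pred|)       -> 0.5
```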
/setup.py:
--------------------------------------------------------------------------------
from setuptools import setup
from setuptools import find_packages


setup(
    name='tfcf',
    packages=find_packages(),

    version='0.0.0',

    license='MIT',

    description='A TensorFlow-based recommender system.',

    author='Tzu-Wei Sung',
    author_email='windqaq@gmail.com',

    url='https://github.com/WindQAQ/tf-recsys',

    keywords=['recommender system', 'collaborative filtering',
              'tensorflow', 'SVD', 'SVD++'],

    install_requires=[
        'requests',
        'pandas',
        'numpy',
        'tensorflow>=1.2.0',
    ],

    classifiers=[
        'Development Status :: 3 - Alpha',
        'License :: OSI Approved :: MIT License',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.4',
        'Programming Language :: Python :: 3.5',
        'Programming Language :: Python :: 3.6'
    ],

    python_requires='>=3',
)
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2017 WindQAQ

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
.static_storage/
.media/
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
--------------------------------------------------------------------------------
/tfcf/utils/data_utils.py:
--------------------------------------------------------------------------------
"""Classes and operations related to processing data.
"""

from io import BytesIO
from io import StringIO
from zipfile import ZipFile

import numpy as np
import requests


def get_zip_file(url, filepath):
    """Gets a file out of a remote zip archive.

    Args:
        url: A string, the url of the zip file.
        filepath: A string, the file path inside the zip file.

    Returns:
        A StringIO object holding the content of the wanted file.
    """

    zipfile = ZipFile(BytesIO(requests.get(url).content))
    file = zipfile.open(filepath).read().decode('utf8')

    return StringIO(file)


class BatchGenerator(object):
    """Generator for batched data.
    """

    def __init__(self, x, y=None, batch_size=1024, shuffle=True):
        if y is not None and x.shape[0] != y.shape[0]:
            raise ValueError('x and y should have the same '
                             'number of samples (shape[0]).')

        self.x = x
        self.y = y
        self.length = x.shape[0]
        self.batch_size = batch_size
        self.shuffle = shuffle

    def next(self):
        start = end = 0
        length = self.length
        batch_size = self.batch_size

        if self.shuffle:
            permutation = np.random.permutation(length)
            self.x = self.x[permutation]
            if self.y is not None:
                self.y = self.y[permutation]

        flag = False
        while not flag:
            end += batch_size

            # Truncate the last batch instead of dropping the final
            # sample or yielding an empty batch.
            if end >= length:
                end = length
                flag = True

            yield self._get_batch(start, end)

            start = end

    def _get_batch(self, start, end):
        if self.y is not None:
            return self.x[start:end], self.y[start:end]
        else:
            return self.x[start:end]
--------------------------------------------------------------------------------
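A minimal sketch of iterating over BatchGenerator; the toy arrays here are for illustration only:

```python
import numpy as np
from tfcf.utils.data_utils import BatchGenerator

x = np.array([[0, 1], [1, 2], [2, 0], [0, 2], [1, 0]])  # (user, item) pairs
y = np.array([4.0, 3.0, 5.0, 2.0, 4.0])                 # ratings

gen = BatchGenerator(x, y, batch_size=2, shuffle=False)
for batch_x, batch_y in gen.next():
    print(batch_x.shape, batch_y.shape)  # (2, 2) (2,), (2, 2) (2,), then (1, 2) (1,)
```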
/tfcf/models/model_base.py:
--------------------------------------------------------------------------------
import os
import inspect
import tensorflow as tf


def _class_vars(obj):
    return {k: v for k, v in inspect.getmembers(obj)
            if not k.startswith('__') and not callable(v)}


class BaseModel(object):
    """Base model for SVD and SVD++.
    """

    def __init__(self, config):
        self._built = False
        self._saver = None

        for attr in _class_vars(config):
            name = attr if not attr.startswith('_') else attr[1:]
            setattr(self, name, getattr(config, attr))

    def save_model(self, model_dir):
        """Saves the Tensorflow model.

        Args:
            model_dir: A string, the path of the saving directory.
        """

        if not self._built:
            raise RuntimeError('The model must be trained '
                               'before saving.')

        self._saver = tf.train.Saver()

        model_name = type(self).__name__

        if not os.path.exists(model_dir):
            os.makedirs(model_dir)

        model_path = os.path.join(model_dir, model_name)

        self._saver.save(self._sess, model_path)

    def load_model(self, model_dir):
        """Loads the Tensorflow model.

        Args:
            model_dir: A string, the path of the saving directory.
        """

        tensor_names = ['placeholder/users:0', 'placeholder/items:0',
                        'placeholder/ratings:0', 'prediction/pred:0']
        operation_names = ['optimizer/optimizer']

        model_name = type(self).__name__

        model_path = os.path.join(model_dir, model_name)

        self._saver = tf.train.import_meta_graph(model_path + '.meta')
        self._saver.restore(self._sess, model_path)

        for name in tensor_names:
            attr = '_' + name.split('/')[1].split(':')[0]
            setattr(self, attr, tf.get_default_graph().get_tensor_by_name(name))

        for name in operation_names:
            attr = '_' + name.split('/')[1].split(':')[0]
            setattr(self, attr, tf.get_default_graph(
            ).get_operation_by_name(name))

        self._built = True
--------------------------------------------------------------------------------
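A minimal save/restore sketch built on these two methods, assuming a trained SVD model and the `config`, `x_train`, `y_train`, `x_test` objects from the README example further below:

```python
import tensorflow as tf
from tfcf.models.svd import SVD

with tf.Session() as sess:
    model = SVD(config, sess)
    model.train(x_train, y_train, epochs=20)
    model.save_model('model/')  # checkpoint files are named after the class, e.g. model/SVD.*

tf.reset_default_graph()
with tf.Session() as sess:
    model = SVD(config, sess)
    model.load_model('model/')  # restores the graph and re-binds the named tensors
    y_pred = model.predict(x_test)
```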
/README.md:
--------------------------------------------------------------------------------
# tf-recsys

## Overview
**tf-recsys** contains collaborative filtering (CF) models based on the famous SVD and SVD++ algorithms. Both are implemented with [TensorFlow][Tensorflow] in order to utilize GPU acceleration.

## Installation

```
pip install tfcf
```

Note that if you want to use a GPU, please pre-install the GPU version of [TensorFlow][Tensorflow], that is, run

```
pip install tensorflow-gpu
```

or follow the instructions at [Installing TensorFlow](https://www.tensorflow.org/install/).

## Algorithms

### SVD

The SVD algorithm does matrix factorization via the following formula:

![SVD](http://latex.codecogs.com/gif.latex?r_%7Bui%7D%20%3D%20%5Cmu%20+%20b_u%20+%20b_i%20+%20p_u%20q_i)

The LHS is the predicted rating. The objective function is the sum of the L2 loss between predictions and real ratings and the regularization terms; gradient descent is used to minimize it.

### SVD++

SVD++ extends SVD by incorporating the *implicit feedback* of users:

![SVD++](http://latex.codecogs.com/gif.latex?r_%7Bui%7D%20%3D%20%5Cmu%20+%20b_u%20+%20b_i%20+%20%28p_u%20+%20%5Cfrac%7B1%7D%7B%5Csqrt%7B%7CN%28u%29%7C%7D%7D%20%5Csum_%7Bj%20%5Cin%20N%28u%29%7D%20y_j%29%20q_i)

Here ![implicit feedback of user](http://latex.codecogs.com/gif.latex?N%28u%29) is the set of items on which user u has given implicit feedback.

In this package, we also provide a *dual* option for SVD++, which additionally incorporates the implicit feedback of items. The equation can be re-written as follows:

![dual SVD++](http://latex.codecogs.com/gif.latex?r_%7Bui%7D%20%3D%20%5Cmu%20+%20b_u%20+%20b_i%20+%20%28p_u%20+%20%5Cfrac%7B1%7D%7B%5Csqrt%7B%7CN%28u%29%7C%7D%7D%20%5Csum_%7Bj%20%5Cin%20N%28u%29%7D%20y_j%29%20%28q_i%20+%20%5Cfrac%7B1%7D%7B%5Csqrt%7B%7CH%28i%29%7C%7D%7D%20%5Csum_%7Bj%20%5Cin%20H%28i%29%7D%20g_j%29)

where ![implicit feedback of item](http://latex.codecogs.com/gif.latex?H%28i%29) is the set of users who have given implicit feedback on item i.

In our experiments, dual SVD++ outperforms both the original SVD++ and SVD, at the cost of a slower training procedure.
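To make the dual SVD++ prediction concrete, here is a small NumPy sketch of a single rating prediction; all numbers are made-up toy values:

```python
import numpy as np

mu, b_u, b_i = 3.5, 0.2, -0.1            # global mean and the two biases
p_u = np.array([0.1, -0.3])              # explicit user factors
q_i = np.array([0.2, 0.4])               # explicit item factors
y = np.array([[0.05, 0.1], [0.0, 0.2]])  # y_j for the two items j in N(u)
g = np.array([[0.1, 0.0]])               # g_j for the single user j in H(i)

user_part = p_u + y.sum(axis=0) / np.sqrt(len(y))  # p_u + |N(u)|^(-1/2) * sum y_j
item_part = q_i + g.sum(axis=0) / np.sqrt(len(g))  # q_i + |H(i)|^(-1/2) * sum g_j

r_hat = mu + b_u + b_i + np.dot(user_part, item_part)
```

Dropping the `g` term recovers plain SVD++, and dropping the `y` term as well recovers SVD.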
## Example

```python
import numpy as np
import tensorflow as tf
from tfcf.metrics import mae
from tfcf.metrics import rmse
from tfcf.datasets import ml1m
from tfcf.config import Config
from tfcf.models.svd import SVD
from tfcf.models.svdpp import SVDPP
from sklearn.model_selection import train_test_split

# Note that x is a 2D numpy array:
# x[i, :] contains the (user, item) pair, and y[i] is the corresponding rating.
x, y = ml1m.load_data()

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0)

config = Config()
config.num_users = np.max(x[:, 0]) + 1
config.num_items = np.max(x[:, 1]) + 1
config.min_value = np.min(y)
config.max_value = np.max(y)

with tf.Session() as sess:
    # For the SVD++ algorithm, if `dual` is True, the dual term of items'
    # implicit feedback will be added to the original SVD++ algorithm.
    # model = SVDPP(config, sess, dual=False)
    # model = SVDPP(config, sess, dual=True)
    model = SVD(config, sess)
    model.train(x_train, y_train, validation_data=(
        x_test, y_test), epochs=20, batch_size=1024)

    y_pred = model.predict(x_test)
    print('rmse: {}, mae: {}'.format(rmse(y_test, y_pred), mae(y_test, y_pred)))

    # Save model
    model.save_model('model/')

    # Load model
    # model.load_model('model/')
```

## Performance

The experiments are set up on [MovieLens 100K][MovieLens100K] and [MovieLens 1M][MovieLens1M]. The results reported here are averaged over 5-fold cross validation with random seed 0. All models use the default configuration. For [MovieLens 100K][MovieLens100K], the batch size is 128; for the much larger [MovieLens 1M][MovieLens1M], it is 1024. With GPU acceleration, both SVD and SVD++ speed up significantly compared with [Surprise][Surprise], whose implementation is based on Cython. The following is the performance on a GTX 1080:

### MovieLens 100K

|            | RMSE    | MAE     | Time (sec/epoch) |
|:----------:|:-------:|:-------:|:----------------:|
| SVD        | 0.91572 | 0.71964 | < 1              |
| SVD++      | 0.90484 | 0.70982 | 4                |
| Dual SVD++ | 0.89334 | 0.70020 | 7                |

### MovieLens 1M

|            | RMSE    | MAE     | Time (sec/epoch) |
|:----------:|:-------:|:-------:|:----------------:|
| SVD        | 0.85524 | 0.66922 | 4                |
| SVD++      | 0.84846 | 0.66306 | 40               |
| Dual SVD++ | 0.83672 | 0.65256 | 50               |

Some similar experiments can be found at [MyMediaLite][MyMediaLite], [Surprise][Surprise] and [LibRec][LibRec].
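The 5-fold protocol above can be sketched roughly as follows, reusing `x`, `y`, `config`, and the imports from the example; the per-fold graph reset is our assumption about how folds are kept independent:

```python
from sklearn.model_selection import KFold

rmses = []
for train_idx, test_idx in KFold(n_splits=5, shuffle=True, random_state=0).split(x):
    tf.reset_default_graph()  # build a fresh model for every fold
    with tf.Session() as sess:
        model = SVD(config, sess)
        model.train(x[train_idx], y[train_idx], epochs=20, batch_size=1024)
        rmses.append(rmse(y[test_idx], model.predict(x[test_idx])))

print('mean rmse: {}'.format(np.mean(rmses)))
```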
## References

[Tensorflow][Tensorflow]

[MyMediaLite][MyMediaLite]

[Surprise][Surprise]

[LibRec][LibRec]

Also see my [ML2017][ML2017] repo; there is a [Keras][Keras] implementation of SVD and SVD++ in [hw6][hw6].

[MovieLens100K]: https://grouplens.org/datasets/movielens/100k/
[MovieLens1M]: https://grouplens.org/datasets/movielens/1m/
[LibRec]: https://www.librec.net/release/v1.3/example.html
[Tensorflow]: https://www.tensorflow.org/
[Keras]: https://keras.io/
[MyMediaLite]: http://www.mymedialite.net/examples/datasets.html
[Surprise]: https://github.com/NicolasHug/Surprise
[ML2017]: https://github.com/WindQAQ/ML2017
[hw6]: https://github.com/WindQAQ/ML2017/blob/master/hw6/train.py

## Contact
Issues and pull requests are welcome. Feel free to [contact me](mailto:windqaq@gmail.com) if there are any problems.
--------------------------------------------------------------------------------
/tfcf/models/svdpp.py:
--------------------------------------------------------------------------------
import numpy as np
import tensorflow as tf

try:
    from tensorflow.keras import utils
except ImportError:
    from tensorflow.contrib.keras import utils

from .svd import SVD
from ..utils.data_utils import BatchGenerator
from ..metrics import mae
from ..metrics import rmse


def _convert_to_sparse_format(x):
    """Converts a list of lists into sparse format.

    Args:
        x: A list of lists.

    Returns:
        A dictionary that contains three fields, which are
        the indices, the values, and the dense shape of the
        sparse matrix.
    """

    sparse = {
        'indices': [],
        'values': []
    }

    for row, x_i in enumerate(x):
        for col, x_ij in enumerate(x_i):
            sparse['indices'].append((row, col))
            sparse['values'].append(x_ij)

    max_col = np.max([len(x_i) for x_i in x]).astype(np.int32)

    sparse['dense_shape'] = (len(x), max_col)

    return sparse


def _get_implicit_feedback(x, num_users, num_items, dual):
    """Gets implicit feedback from (users, items) pairs.

    Args:
        x: A numpy array of shape `(samples, 2)`.
        num_users: An integer, the total number of users.
        num_items: An integer, the total number of items.
        dual: A bool, deciding whether to also return the
            dual term of the implicit feedback of items.

    Returns:
        A dictionary, the sparse format of the implicit
        feedback of users, if `dual` is False.
        A tuple of dictionaries, the sparse formats of the
        implicit feedback of users and of items, otherwise.
    """

    if not dual:
        N = [[] for u in range(num_users)]
        for u, i in zip(x[:, 0], x[:, 1]):
            N[u].append(i)

        return _convert_to_sparse_format(N)
    else:
        N = [[] for u in range(num_users)]
        H = [[] for u in range(num_items)]
        for u, i in zip(x[:, 0], x[:, 1]):
            N[u].append(i)
            H[i].append(u)

        return _convert_to_sparse_format(N), _convert_to_sparse_format(H)
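
# Illustration (toy input): _convert_to_sparse_format([[1, 2], [3]]) returns
#     {'indices': [(0, 0), (0, 1), (1, 0)],
#      'values': [1, 2, 3],
#      'dense_shape': (2, 2)}
# which can be splatted straight into tf.SparseTensor(**sparse) below.
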
class SVDPP(SVD):
    """Collaborative filtering model based on the SVD++ algorithm.
    """

    def __init__(self, config, sess, dual=False):
        super(SVDPP, self).__init__(config, sess)
        self.dual = dual

    def _create_implicit_feedback(self, implicit_feedback, dual=False):
        """Returns the (tuple of) sparse tensor(s) of implicit feedback.
        """
        with tf.variable_scope('implicit_feedback'):
            if not dual:
                N = tf.SparseTensor(**implicit_feedback)

                return N
            else:
                N = tf.SparseTensor(**implicit_feedback[0])
                H = tf.SparseTensor(**implicit_feedback[1])

                return N, H

    def _create_user_terms(self, users, N):
        num_items = self.num_items
        num_factors = self.num_factors

        p_u, b_u = super(SVDPP, self)._create_user_terms(users)

        with tf.variable_scope('user'):
            implicit_feedback_embeddings = tf.get_variable(
                name='implicit_feedback_embedding',
                shape=[num_items, num_factors],
                initializer=tf.zeros_initializer(),
                regularizer=tf.contrib.layers.l2_regularizer(self.reg_y_u))

            y_u = tf.gather(
                tf.nn.embedding_lookup_sparse(
                    implicit_feedback_embeddings,
                    N,
                    sp_weights=None,
                    combiner='sqrtn'),
                users,
                name='y_u'
            )

        return p_u, b_u, y_u

    def _create_item_terms(self, items, H=None):
        num_users = self.num_users
        num_factors = self.num_factors

        q_i, b_i = super(SVDPP, self)._create_item_terms(items)

        if H is None:
            return q_i, b_i
        else:
            with tf.variable_scope('item'):
                implicit_feedback_embeddings = tf.get_variable(
                    name='implicit_feedback_embedding',
                    shape=[num_users, num_factors],
                    initializer=tf.zeros_initializer(),
                    regularizer=tf.contrib.layers.l2_regularizer(self.reg_g_i))

                g_i = tf.gather(
                    tf.nn.embedding_lookup_sparse(
                        implicit_feedback_embeddings,
                        H,
                        sp_weights=None,
                        combiner='sqrtn'),
                    items,
                    name='g_i'
                )

            return q_i, b_i, g_i

    def _create_prediction(self, mu, b_u, b_i, p_u, q_i, y_u, g_i=None):
        with tf.variable_scope('prediction'):
            if g_i is None:
                pred = tf.reduce_sum(
                    tf.multiply(tf.add(p_u, y_u), q_i),
                    axis=1)
            else:
                pred = tf.reduce_sum(
                    tf.multiply(tf.add(p_u, y_u), tf.add(q_i, g_i)),
                    axis=1)

            pred = tf.add_n([b_u, b_i, pred])

            pred = tf.add(pred, mu, name='pred')

        return pred

    def _build_graph(self, mu, implicit_feedback):
        _mu = super(SVDPP, self)._create_constants(mu)

        self._users, self._items, self._ratings = super(
            SVDPP, self)._create_placeholders()

        if not self.dual:
            N = self._create_implicit_feedback(implicit_feedback)

            p_u, b_u, y_u = self._create_user_terms(self._users, N)
            q_i, b_i = self._create_item_terms(self._items)

            self._pred = self._create_prediction(_mu, b_u, b_i, p_u, q_i, y_u)
        else:
            N, H = self._create_implicit_feedback(implicit_feedback, True)

            p_u, b_u, y_u = self._create_user_terms(self._users, N)
            q_i, b_i, g_i = self._create_item_terms(self._items, H)

            self._pred = self._create_prediction(
                _mu, b_u, b_i, p_u, q_i, y_u, g_i)

        loss = super(SVDPP, self)._create_loss(self._pred, self._ratings)

        self._optimizer = super(SVDPP, self)._create_optimizer(loss)

        self._built = True

    def train(self, x, y, epochs=100, batch_size=1024, validation_data=None):

        if x.shape[0] != y.shape[0] or x.shape[1] != 2:
            raise ValueError('The shape of x should be (samples, 2) and '
                             'the shape of y should be (samples,).')

        if not self._built:
            implicit_feedback = _get_implicit_feedback(
                x, self.num_users, self.num_items, self.dual)
            self._build_graph(np.mean(y), implicit_feedback)

        self._run_train(x, y, epochs, batch_size, validation_data)
--------------------------------------------------------------------------------
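A note on the `combiner='sqrtn'` argument used above: with `sp_weights=None` every weight is 1, so `tf.nn.embedding_lookup_sparse` divides the sum of the gathered embeddings by the square root of their count, which is exactly the 1/√|N(u)| (and 1/√|H(i)|) scaling in the SVD++ formula. A toy NumPy sketch of the equivalence, with made-up values:

```python
import numpy as np

# y_j for the items j in N(u), with |N(u)| = 3.
y = np.array([[0.1, 0.2],
              [0.3, 0.0],
              [0.1, 0.1]])

sqrtn_combined = y.sum(axis=0) / np.sqrt(len(y))        # what 'sqrtn' computes
formula_term = (1.0 / np.sqrt(len(y))) * y.sum(axis=0)  # the SVD++ term

assert np.allclose(sqrtn_combined, formula_term)
```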
/tfcf/models/svd.py:
--------------------------------------------------------------------------------
import numpy as np
import tensorflow as tf

try:
    from tensorflow.keras import utils
except ImportError:
    from tensorflow.contrib.keras import utils

from .model_base import BaseModel
from ..utils.data_utils import BatchGenerator
from ..metrics import mae
from ..metrics import rmse


class SVD(BaseModel):
    """Collaborative filtering model based on the SVD algorithm.
    """

    def __init__(self, config, sess):
        super(SVD, self).__init__(config)
        self._sess = sess

    def _create_placeholders(self):
        """Returns the placeholders.
        """
        with tf.variable_scope('placeholder'):
            users = tf.placeholder(tf.int32, shape=[None, ], name='users')
            items = tf.placeholder(tf.int32, shape=[None, ], name='items')
            ratings = tf.placeholder(
                tf.float32, shape=[None, ], name='ratings')

        return users, items, ratings

    def _create_constants(self, mu):
        """Returns the constants.
        """
        with tf.variable_scope('constant'):
            _mu = tf.constant(mu, shape=[], dtype=tf.float32)

        return _mu

    def _create_user_terms(self, users):
        """Returns the tensors related to users.
        """
        num_users = self.num_users
        num_factors = self.num_factors

        with tf.variable_scope('user'):
            user_embeddings = tf.get_variable(
                name='embedding',
                shape=[num_users, num_factors],
                initializer=tf.contrib.layers.xavier_initializer(),
                regularizer=tf.contrib.layers.l2_regularizer(self.reg_p_u))

            user_bias = tf.get_variable(
                name='bias',
                shape=[num_users, ],
                initializer=tf.contrib.layers.xavier_initializer(),
                regularizer=tf.contrib.layers.l2_regularizer(self.reg_b_u))

            p_u = tf.nn.embedding_lookup(
                user_embeddings,
                users,
                name='p_u')

            b_u = tf.nn.embedding_lookup(
                user_bias,
                users,
                name='b_u')

        return p_u, b_u

    def _create_item_terms(self, items):
        """Returns the tensors related to items.
        """
        num_items = self.num_items
        num_factors = self.num_factors

        with tf.variable_scope('item'):
            item_embeddings = tf.get_variable(
                name='embedding',
                shape=[num_items, num_factors],
                initializer=tf.contrib.layers.xavier_initializer(),
                regularizer=tf.contrib.layers.l2_regularizer(self.reg_q_i))

            item_bias = tf.get_variable(
                name='bias',
                shape=[num_items, ],
                initializer=tf.contrib.layers.xavier_initializer(),
                regularizer=tf.contrib.layers.l2_regularizer(self.reg_b_i))

            q_i = tf.nn.embedding_lookup(
                item_embeddings,
                items,
                name='q_i')

            b_i = tf.nn.embedding_lookup(
                item_bias,
                items,
                name='b_i')

        return q_i, b_i
    def _create_prediction(self, mu, b_u, b_i, p_u, q_i):
        """Returns the tensor of predictions.

        Note that the prediction is
            r_hat = mu + b_u + b_i + p_u * q_i
        """
        with tf.variable_scope('prediction'):
            pred = tf.reduce_sum(
                tf.multiply(p_u, q_i),
                axis=1)

            pred = tf.add_n([b_u, b_i, pred])

            pred = tf.add(pred, mu, name='pred')

        return pred

    def _create_loss(self, pred, ratings):
        """Returns the L2 loss of the difference between
        ground truths and predictions.

        The formula is:
            L2 = sum((r - r_hat) ** 2) / 2
        """
        with tf.variable_scope('loss'):
            loss = tf.nn.l2_loss(tf.subtract(ratings, pred), name='loss')

        return loss

    def _create_optimizer(self, loss):
        """Returns the optimizer.

        The objective function is defined as the sum of the
        loss and the regularizers' losses.
        """
        with tf.variable_scope('optimizer'):
            objective = tf.add(
                loss,
                tf.add_n(tf.get_collection(
                    tf.GraphKeys.REGULARIZATION_LOSSES)),
                name='objective')

            try:
                optimizer = tf.contrib.keras.optimizers.Nadam(
                ).minimize(objective, name='optimizer')
            except Exception:
                optimizer = tf.train.AdamOptimizer().minimize(objective, name='optimizer')

        return optimizer
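
    # Sanity check of the loss convention above (toy numbers): for
    # ratings = [4., 3.] and pred = [3., 5.], tf.nn.l2_loss(ratings - pred)
    # evaluates to (1 ** 2 + 2 ** 2) / 2 = 2.5, i.e. sum((r - r_hat) ** 2) / 2,
    # matching the formula in the _create_loss docstring. The regularization
    # losses collected from tf.GraphKeys.REGULARIZATION_LOSSES are then added
    # on top of this in _create_optimizer to form the objective.
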
    def _build_graph(self, mu):
        _mu = self._create_constants(mu)

        self._users, self._items, self._ratings = self._create_placeholders()

        p_u, b_u = self._create_user_terms(self._users)
        q_i, b_i = self._create_item_terms(self._items)

        self._pred = self._create_prediction(_mu, b_u, b_i, p_u, q_i)

        loss = self._create_loss(self._pred, self._ratings)

        self._optimizer = self._create_optimizer(loss)

        self._built = True

    def _run_train(self, x, y, epochs, batch_size, validation_data):
        train_gen = BatchGenerator(x, y, batch_size)
        steps_per_epoch = np.ceil(train_gen.length / batch_size).astype(int)

        self._sess.run(tf.global_variables_initializer())

        for e in range(1, epochs + 1):
            print('Epoch {}/{}'.format(e, epochs))

            pbar = utils.Progbar(steps_per_epoch)

            for step, batch in enumerate(train_gen.next(), 1):
                users = batch[0][:, 0]
                items = batch[0][:, 1]
                ratings = batch[1]

                self._sess.run(
                    self._optimizer,
                    feed_dict={
                        self._users: users,
                        self._items: items,
                        self._ratings: ratings
                    })

                pred = self.predict(batch[0])

                update_values = [
                    ('rmse', rmse(ratings, pred)),
                    ('mae', mae(ratings, pred))
                ]

                if validation_data is not None and step == steps_per_epoch:
                    valid_x, valid_y = validation_data
                    valid_pred = self.predict(valid_x)

                    update_values += [
                        ('val_rmse', rmse(valid_y, valid_pred)),
                        ('val_mae', mae(valid_y, valid_pred))
                    ]

                pbar.update(step, values=update_values,
                            force=(step == steps_per_epoch))

    def train(self, x, y, epochs=100, batch_size=1024, validation_data=None):

        if x.shape[0] != y.shape[0] or x.shape[1] != 2:
            raise ValueError('The shape of x should be (samples, 2) and '
                             'the shape of y should be (samples,).')

        if not self._built:
            self._build_graph(np.mean(y))

        self._run_train(x, y, epochs, batch_size, validation_data)

    def predict(self, x):
        if not self._built:
            raise RuntimeError('The model must be trained '
                               'before prediction.')

        if x.shape[1] != 2:
            raise ValueError('The shape of x should be '
                             '(samples, 2).')

        pred = self._sess.run(
            self._pred,
            feed_dict={
                self._users: x[:, 0],
                self._items: x[:, 1]
            })

        pred = pred.clip(min=self.min_value, max=self.max_value)

        return pred
--------------------------------------------------------------------------------