├── tfcf
│   ├── __init__.py
│   ├── models
│   │   ├── __init__.py
│   │   ├── model_base.py
│   │   ├── svdpp.py
│   │   └── svd.py
│   ├── utils
│   │   ├── __init__.py
│   │   └── data_utils.py
│   ├── datasets
│   │   ├── __init__.py
│   │   ├── ml100k.py
│   │   └── ml1m.py
│   ├── config.py
│   └── metrics.py
├── setup.cfg
├── setup.py
├── LICENSE
├── .gitignore
└── README.md

/tfcf/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/tfcf/models/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/tfcf/utils/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/tfcf/datasets/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
[metadata]
description-file = README.md
--------------------------------------------------------------------------------
/tfcf/datasets/ml100k.py:
--------------------------------------------------------------------------------
import pandas as pd

from ..utils.data_utils import get_zip_file


def load_data():
    """Loads the MovieLens 100K dataset.

    Returns:
        A tuple of numpy arrays (x, y).
    """

    URL = 'http://files.grouplens.org/datasets/movielens/ml-100k.zip'
    FILE_PATH = 'ml-100k/u.data'

    file = get_zip_file(URL, FILE_PATH)
    df = pd.read_csv(file, sep='\t', header=None)

    return df.iloc[:, :2].values, df.iloc[:, 2].values
--------------------------------------------------------------------------------
/tfcf/config.py:
--------------------------------------------------------------------------------
import numpy as np


class Config(object):
    """Configuration class for collaborative filtering models.
    """

    num_users = None
    num_items = None
    num_factors = 15

    # minimum and maximum values for clipping predictions
    min_value = -np.inf
    max_value = np.inf

    # regularization scales
    reg_b_u = 0.0001
    reg_b_i = 0.0001
    reg_p_u = 0.005
    reg_q_i = 0.005
    reg_y_u = 0.005
    reg_g_i = 0.005
--------------------------------------------------------------------------------
/tfcf/datasets/ml1m.py:
--------------------------------------------------------------------------------
import pandas as pd

from ..utils.data_utils import get_zip_file


def load_data():
    """Loads the MovieLens 1M dataset.

    Returns:
        A tuple of numpy arrays (x, y).
    """

    URL = 'http://files.grouplens.org/datasets/movielens/ml-1m.zip'
    FILE_PATH = 'ml-1m/ratings.dat'

    file = get_zip_file(URL, FILE_PATH)
    df = pd.read_csv(file, sep='::', header=None, engine='python')

    return df.iloc[:, :2].values, df.iloc[:, 2].values
--------------------------------------------------------------------------------
/tfcf/metrics.py:
--------------------------------------------------------------------------------
"""Operations related to evaluating predictions.
"""

import numpy as np


def mse(y, y_pred):
    """Returns the mean squared error between
    ground truths and predictions.
    """
    return np.mean((y - y_pred) ** 2)


def rmse(y, y_pred):
    """Returns the root mean squared error between
    ground truths and predictions.
    """
    return np.sqrt(mse(y, y_pred))


def mae(y, y_pred):
    """Returns the mean absolute error between
    ground truths and predictions.
    """
    return np.mean(np.fabs(y - y_pred))
--------------------------------------------------------------------------------
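A minimal usage sketch of these metrics; the arrays below are made-up toy values, not repo data:

```python
import numpy as np
from tfcf.metrics import mse, rmse, mae

y = np.array([4.0, 3.0, 5.0])        # ground-truth ratings
y_pred = np.array([3.5, 3.0, 4.0])   # model predictions

print(mse(y, y_pred))   # mean((y - y_pred) ** 2)  -> 0.4166...
print(rmse(y, y_pred))  # sqrt(mse)                -> 0.6454...
print(mae(y, y_pred))   # mean(|y - y_pred|)       -> 0.5
```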
/setup.py:
--------------------------------------------------------------------------------
from setuptools import setup
from setuptools import find_packages


setup(
    name='tfcf',
    packages=find_packages(),

    version='0.0.0',

    license='MIT',

    description='A TensorFlow-based recommender system.',

    author='Tzu-Wei Sung',
    author_email='windqaq@gmail.com',

    url='https://github.com/WindQAQ/tf-recsys',

    keywords=['recommender system', 'collaborative filtering',
              'tensorflow', 'SVD', 'SVD++'],

    install_requires=[
        'requests',
        'pandas',
        'numpy',
        'tensorflow>=1.2.0',
    ],

    classifiers=[
        'Development Status :: 3 - Alpha',
        'License :: OSI Approved :: MIT License',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.4',
        'Programming Language :: Python :: 3.5',
        'Programming Language :: Python :: 3.6'
    ],

    python_requires='>=3',
)
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2017 WindQAQ

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
.static_storage/
.media/
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
--------------------------------------------------------------------------------
/tfcf/utils/data_utils.py:
--------------------------------------------------------------------------------
"""Classes and operations related to processing data.
"""

from io import BytesIO
from io import StringIO
from zipfile import ZipFile

import numpy as np
import requests


def get_zip_file(url, filepath):
    """Gets a file out of a remote zip archive.

    Args:
        url: A string, the url of the zip file.
        filepath: A string, the file path inside the zip file.

    Returns:
        A StringIO object holding the content of the wanted file.
    """

    zipfile = ZipFile(BytesIO(requests.get(url).content))
    file = zipfile.open(filepath).read().decode('utf8')

    return StringIO(file)


class BatchGenerator(object):
    """Generator for batched data.
    """

    def __init__(self, x, y=None, batch_size=1024, shuffle=True):
        if y is not None and x.shape[0] != y.shape[0]:
            raise ValueError('x and y should have the same '
                             'number of samples (shape[0]).')

        self.x = x
        self.y = y
        self.length = x.shape[0]
        self.batch_size = batch_size
        self.shuffle = shuffle

    def next(self):
        start = end = 0
        length = self.length
        batch_size = self.batch_size

        if self.shuffle:
            permutation = np.random.permutation(length)
            self.x = self.x[permutation]
            if self.y is not None:
                self.y = self.y[permutation]

        flag = False
        while not flag:
            end += batch_size

            # Truncate the last batch instead of dropping the final
            # sample or yielding an empty batch.
            if end >= length:
                end = length
                flag = True

            yield self._get_batch(start, end)

            start = end

    def _get_batch(self, start, end):
        if self.y is not None:
            return self.x[start:end], self.y[start:end]
        else:
            return self.x[start:end]
--------------------------------------------------------------------------------
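A minimal sketch of iterating over BatchGenerator; the toy arrays here are for illustration only:

```python
import numpy as np
from tfcf.utils.data_utils import BatchGenerator

x = np.array([[0, 1], [1, 2], [2, 0], [0, 2], [1, 0]])  # (user, item) pairs
y = np.array([4.0, 3.0, 5.0, 2.0, 4.0])                 # ratings

gen = BatchGenerator(x, y, batch_size=2, shuffle=False)
for batch_x, batch_y in gen.next():
    print(batch_x.shape, batch_y.shape)  # (2, 2) (2,), (2, 2) (2,), then (1, 2) (1,)
```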
/tfcf/models/model_base.py:
--------------------------------------------------------------------------------
import os
import inspect
import tensorflow as tf


def _class_vars(obj):
    return {k: v for k, v in inspect.getmembers(obj)
            if not k.startswith('__') and not callable(v)}


class BaseModel(object):
    """Base model for SVD and SVD++.
    """

    def __init__(self, config):
        self._built = False
        self._saver = None

        for attr in _class_vars(config):
            name = attr if not attr.startswith('_') else attr[1:]
            setattr(self, name, getattr(config, attr))

    def save_model(self, model_dir):
        """Saves the Tensorflow model.

        Args:
            model_dir: A string, the path of the saving directory.
        """

        if not self._built:
            raise RuntimeError('The model must be trained '
                               'before saving.')

        self._saver = tf.train.Saver()

        model_name = type(self).__name__

        if not os.path.exists(model_dir):
            os.makedirs(model_dir)

        model_path = os.path.join(model_dir, model_name)

        self._saver.save(self._sess, model_path)

    def load_model(self, model_dir):
        """Loads the Tensorflow model.

        Args:
            model_dir: A string, the path of the saving directory.
        """

        tensor_names = ['placeholder/users:0', 'placeholder/items:0',
                        'placeholder/ratings:0', 'prediction/pred:0']
        operation_names = ['optimizer/optimizer']

        model_name = type(self).__name__

        model_path = os.path.join(model_dir, model_name)

        self._saver = tf.train.import_meta_graph(model_path + '.meta')
        self._saver.restore(self._sess, model_path)

        for name in tensor_names:
            attr = '_' + name.split('/')[1].split(':')[0]
            setattr(self, attr, tf.get_default_graph().get_tensor_by_name(name))

        for name in operation_names:
            attr = '_' + name.split('/')[1].split(':')[0]
            setattr(self, attr, tf.get_default_graph(
            ).get_operation_by_name(name))

        self._built = True
--------------------------------------------------------------------------------
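A minimal save/restore sketch built on these two methods, assuming a trained SVD model and the `config`, `x_train`, `y_train`, `x_test` objects from the README example further below:

```python
import tensorflow as tf
from tfcf.models.svd import SVD

with tf.Session() as sess:
    model = SVD(config, sess)
    model.train(x_train, y_train, epochs=20)
    model.save_model('model/')  # checkpoint files are named after the class, e.g. model/SVD.*

tf.reset_default_graph()
with tf.Session() as sess:
    model = SVD(config, sess)
    model.load_model('model/')  # restores the graph and re-binds the named tensors
    y_pred = model.predict(x_test)
```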
/README.md:
--------------------------------------------------------------------------------
# tf-recsys

## Overview
**tf-recsys** contains collaborative filtering (CF) models based on the famous SVD and SVD++ algorithms. Both are implemented with [TensorFlow][Tensorflow] in order to utilize GPU acceleration.

## Installation

```
pip install tfcf
```

Note that if you want to use a GPU, please pre-install the GPU version of [TensorFlow][Tensorflow], that is, run

```
pip install tensorflow-gpu
```

or follow the instructions at [Installing TensorFlow](https://www.tensorflow.org/install/).

## Algorithms

### SVD

The SVD algorithm does matrix factorization via the following formula:

![SVD](http://latex.codecogs.com/gif.latex?r_%7Bui%7D%20%3D%20%5Cmu%20+%20b_u%20+%20b_i%20+%20p_u%20q_i)

The LHS is the predicted rating. The objective function is the sum of the L2 loss between predictions and real ratings and the regularization terms; gradient descent is used to minimize it.

### SVD++

SVD++ extends SVD by incorporating the *implicit feedback* of users:

![SVD++](http://latex.codecogs.com/gif.latex?r_%7Bui%7D%20%3D%20%5Cmu%20+%20b_u%20+%20b_i%20+%20%28p_u%20+%20%5Cfrac%7B1%7D%7B%5Csqrt%7B%7CN%28u%29%7C%7D%7D%20%5Csum_%7Bj%20%5Cin%20N%28u%29%7D%20y_j%29%20q_i)

Here ![implicit feedback of user](http://latex.codecogs.com/gif.latex?N%28u%29) is the set of items on which user u has given implicit feedback.

In this package, we also provide a *dual* option for SVD++, which additionally incorporates the implicit feedback of items. The equation can be re-written as follows:

![dual SVD++](http://latex.codecogs.com/gif.latex?r_%7Bui%7D%20%3D%20%5Cmu%20+%20b_u%20+%20b_i%20+%20%28p_u%20+%20%5Cfrac%7B1%7D%7B%5Csqrt%7B%7CN%28u%29%7C%7D%7D%20%5Csum_%7Bj%20%5Cin%20N%28u%29%7D%20y_j%29%20%28q_i%20+%20%5Cfrac%7B1%7D%7B%5Csqrt%7B%7CH%28i%29%7C%7D%7D%20%5Csum_%7Bj%20%5Cin%20H%28i%29%7D%20g_j%29)

where ![implicit feedback of item](http://latex.codecogs.com/gif.latex?H%28i%29) is the set of users who have given implicit feedback on item i.

In our experiments, dual SVD++ outperforms both the original SVD++ and SVD, at the cost of a slower training procedure.
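To make the dual SVD++ prediction concrete, here is a small NumPy sketch of a single rating prediction; all numbers are made-up toy values:

```python
import numpy as np

mu, b_u, b_i = 3.5, 0.2, -0.1            # global mean and the two biases
p_u = np.array([0.1, -0.3])              # explicit user factors
q_i = np.array([0.2, 0.4])               # explicit item factors
y = np.array([[0.05, 0.1], [0.0, 0.2]])  # y_j for the two items j in N(u)
g = np.array([[0.1, 0.0]])               # g_j for the single user j in H(i)

user_part = p_u + y.sum(axis=0) / np.sqrt(len(y))  # p_u + |N(u)|^(-1/2) * sum y_j
item_part = q_i + g.sum(axis=0) / np.sqrt(len(g))  # q_i + |H(i)|^(-1/2) * sum g_j

r_hat = mu + b_u + b_i + np.dot(user_part, item_part)
```

Dropping the `g` term recovers plain SVD++, and dropping the `y` term as well recovers SVD.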
## Example

```python
import numpy as np
import tensorflow as tf
from tfcf.metrics import mae
from tfcf.metrics import rmse
from tfcf.datasets import ml1m
from tfcf.config import Config
from tfcf.models.svd import SVD
from tfcf.models.svdpp import SVDPP
from sklearn.model_selection import train_test_split

# Note that x is a 2D numpy array:
# x[i, :] contains the (user, item) pair, and y[i] is the corresponding rating.
x, y = ml1m.load_data()

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0)

config = Config()
config.num_users = np.max(x[:, 0]) + 1
config.num_items = np.max(x[:, 1]) + 1
config.min_value = np.min(y)
config.max_value = np.max(y)

with tf.Session() as sess:
    # For the SVD++ algorithm, if `dual` is True, the dual term of items'
    # implicit feedback will be added to the original SVD++ algorithm.
    # model = SVDPP(config, sess, dual=False)
    # model = SVDPP(config, sess, dual=True)
    model = SVD(config, sess)
    model.train(x_train, y_train, validation_data=(
        x_test, y_test), epochs=20, batch_size=1024)

    y_pred = model.predict(x_test)
    print('rmse: {}, mae: {}'.format(rmse(y_test, y_pred), mae(y_test, y_pred)))

    # Save model
    model.save_model('model/')

    # Load model
    # model.load_model('model/')
```

## Performance

The experiments are set up on [MovieLens 100K][MovieLens100K] and [MovieLens 1M][MovieLens1M]. The results reported here are averaged over 5-fold cross validation with random seed 0. All models use the default configuration. For [MovieLens 100K][MovieLens100K], the batch size is 128; for the much larger [MovieLens 1M][MovieLens1M], it is 1024. With GPU acceleration, both SVD and SVD++ speed up significantly compared with [Surprise][Surprise], whose implementation is based on Cython. The following is the performance on a GTX 1080:

### MovieLens 100K

|            | RMSE    | MAE     | Time (sec/epoch) |
|:----------:|:-------:|:-------:|:----------------:|
| SVD        | 0.91572 | 0.71964 | < 1              |
| SVD++      | 0.90484 | 0.70982 | 4                |
| Dual SVD++ | 0.89334 | 0.70020 | 7                |

### MovieLens 1M

|            | RMSE    | MAE     | Time (sec/epoch) |
|:----------:|:-------:|:-------:|:----------------:|
| SVD        | 0.85524 | 0.66922 | 4                |
| SVD++      | 0.84846 | 0.66306 | 40               |
| Dual SVD++ | 0.83672 | 0.65256 | 50               |

Some similar experiments can be found at [MyMediaLite][MyMediaLite], [Surprise][Surprise] and [LibRec][LibRec].
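The 5-fold protocol above can be sketched roughly as follows, reusing `x`, `y`, `config`, and the imports from the example; the per-fold graph reset is our assumption about how folds are kept independent:

```python
from sklearn.model_selection import KFold

rmses = []
for train_idx, test_idx in KFold(n_splits=5, shuffle=True, random_state=0).split(x):
    tf.reset_default_graph()  # build a fresh model for every fold
    with tf.Session() as sess:
        model = SVD(config, sess)
        model.train(x[train_idx], y[train_idx], epochs=20, batch_size=1024)
        rmses.append(rmse(y[test_idx], model.predict(x[test_idx])))

print('mean rmse: {}'.format(np.mean(rmses)))
```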
## References

[Tensorflow][Tensorflow]

[MyMediaLite][MyMediaLite]

[Surprise][Surprise]

[LibRec][LibRec]

Also see my [ML2017][ML2017] repo; there is a [Keras][Keras] implementation of SVD and SVD++ in [hw6][hw6].

[MovieLens100K]: https://grouplens.org/datasets/movielens/100k/
[MovieLens1M]: https://grouplens.org/datasets/movielens/1m/
[LibRec]: https://www.librec.net/release/v1.3/example.html
[Tensorflow]: https://www.tensorflow.org/
[Keras]: https://keras.io/
[MyMediaLite]: http://www.mymedialite.net/examples/datasets.html
[Surprise]: https://github.com/NicolasHug/Surprise
[ML2017]: https://github.com/WindQAQ/ML2017
[hw6]: https://github.com/WindQAQ/ML2017/blob/master/hw6/train.py

## Contact
Issues and pull requests are welcome. Feel free to [contact me](mailto:windqaq@gmail.com) if there are any problems.
--------------------------------------------------------------------------------
/tfcf/models/svdpp.py:
--------------------------------------------------------------------------------
import numpy as np
import tensorflow as tf

try:
    from tensorflow.keras import utils
except ImportError:
    from tensorflow.contrib.keras import utils

from .svd import SVD
from ..utils.data_utils import BatchGenerator
from ..metrics import mae
from ..metrics import rmse


def _convert_to_sparse_format(x):
    """Converts a list of lists into sparse format.

    Args:
        x: A list of lists.

    Returns:
        A dictionary that contains three fields, which are
        the indices, the values, and the dense shape of the
        sparse matrix.
    """

    sparse = {
        'indices': [],
        'values': []
    }

    for row, x_i in enumerate(x):
        for col, x_ij in enumerate(x_i):
            sparse['indices'].append((row, col))
            sparse['values'].append(x_ij)

    max_col = np.max([len(x_i) for x_i in x]).astype(np.int32)

    sparse['dense_shape'] = (len(x), max_col)

    return sparse


def _get_implicit_feedback(x, num_users, num_items, dual):
    """Gets implicit feedback from (users, items) pairs.

    Args:
        x: A numpy array of shape `(samples, 2)`.
        num_users: An integer, the total number of users.
        num_items: An integer, the total number of items.
        dual: A bool, deciding whether to also return the
            dual term of the implicit feedback of items.

    Returns:
        A dictionary, the sparse format of the implicit
        feedback of users, if `dual` is False.
        A tuple of dictionaries, the sparse formats of the
        implicit feedback of users and of items, otherwise.
    """

    if not dual:
        N = [[] for u in range(num_users)]
        for u, i in zip(x[:, 0], x[:, 1]):
            N[u].append(i)

        return _convert_to_sparse_format(N)
    else:
        N = [[] for u in range(num_users)]
        H = [[] for u in range(num_items)]
        for u, i in zip(x[:, 0], x[:, 1]):
            N[u].append(i)
            H[i].append(u)

        return _convert_to_sparse_format(N), _convert_to_sparse_format(H)
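
# Illustration (toy input): _convert_to_sparse_format([[1, 2], [3]]) returns
#     {'indices': [(0, 0), (0, 1), (1, 0)],
#      'values': [1, 2, 3],
#      'dense_shape': (2, 2)}
# which can be splatted straight into tf.SparseTensor(**sparse) below.
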
class SVDPP(SVD):
    """Collaborative filtering model based on the SVD++ algorithm.
    """

    def __init__(self, config, sess, dual=False):
        super(SVDPP, self).__init__(config, sess)
        self.dual = dual

    def _create_implicit_feedback(self, implicit_feedback, dual=False):
        """Returns the (tuple of) sparse tensor(s) of implicit feedback.
        """
        with tf.variable_scope('implicit_feedback'):
            if not dual:
                N = tf.SparseTensor(**implicit_feedback)

                return N
            else:
                N = tf.SparseTensor(**implicit_feedback[0])
                H = tf.SparseTensor(**implicit_feedback[1])

                return N, H

    def _create_user_terms(self, users, N):
        num_items = self.num_items
        num_factors = self.num_factors

        p_u, b_u = super(SVDPP, self)._create_user_terms(users)

        with tf.variable_scope('user'):
            implicit_feedback_embeddings = tf.get_variable(
                name='implicit_feedback_embedding',
                shape=[num_items, num_factors],
                initializer=tf.zeros_initializer(),
                regularizer=tf.contrib.layers.l2_regularizer(self.reg_y_u))

            y_u = tf.gather(
                tf.nn.embedding_lookup_sparse(
                    implicit_feedback_embeddings,
                    N,
                    sp_weights=None,
                    combiner='sqrtn'),
                users,
                name='y_u'
            )

        return p_u, b_u, y_u

    def _create_item_terms(self, items, H=None):
        num_users = self.num_users
        num_factors = self.num_factors

        q_i, b_i = super(SVDPP, self)._create_item_terms(items)

        if H is None:
            return q_i, b_i
        else:
            with tf.variable_scope('item'):
                implicit_feedback_embeddings = tf.get_variable(
                    name='implicit_feedback_embedding',
                    shape=[num_users, num_factors],
                    initializer=tf.zeros_initializer(),
                    regularizer=tf.contrib.layers.l2_regularizer(self.reg_g_i))

                g_i = tf.gather(
                    tf.nn.embedding_lookup_sparse(
                        implicit_feedback_embeddings,
                        H,
                        sp_weights=None,
                        combiner='sqrtn'),
                    items,
                    name='g_i'
                )

            return q_i, b_i, g_i

    def _create_prediction(self, mu, b_u, b_i, p_u, q_i, y_u, g_i=None):
        with tf.variable_scope('prediction'):
            if g_i is None:
                pred = tf.reduce_sum(
                    tf.multiply(tf.add(p_u, y_u), q_i),
                    axis=1)
            else:
                pred = tf.reduce_sum(
                    tf.multiply(tf.add(p_u, y_u), tf.add(q_i, g_i)),
                    axis=1)

            pred = tf.add_n([b_u, b_i, pred])

            pred = tf.add(pred, mu, name='pred')

        return pred

    def _build_graph(self, mu, implicit_feedback):
        _mu = super(SVDPP, self)._create_constants(mu)

        self._users, self._items, self._ratings = super(
            SVDPP, self)._create_placeholders()

        if not self.dual:
            N = self._create_implicit_feedback(implicit_feedback)

            p_u, b_u, y_u = self._create_user_terms(self._users, N)
            q_i, b_i = self._create_item_terms(self._items)

            self._pred = self._create_prediction(_mu, b_u, b_i, p_u, q_i, y_u)
        else:
            N, H = self._create_implicit_feedback(implicit_feedback, True)

            p_u, b_u, y_u = self._create_user_terms(self._users, N)
            q_i, b_i, g_i = self._create_item_terms(self._items, H)

            self._pred = self._create_prediction(
                _mu, b_u, b_i, p_u, q_i, y_u, g_i)

        loss = super(SVDPP, self)._create_loss(self._pred, self._ratings)

        self._optimizer = super(SVDPP, self)._create_optimizer(loss)

        self._built = True

    def train(self, x, y, epochs=100, batch_size=1024, validation_data=None):

        if x.shape[0] != y.shape[0] or x.shape[1] != 2:
            raise ValueError('The shape of x should be (samples, 2) and '
                             'the shape of y should be (samples,).')

        if not self._built:
            implicit_feedback = _get_implicit_feedback(
                x, self.num_users, self.num_items, self.dual)
            self._build_graph(np.mean(y), implicit_feedback)

        self._run_train(x, y, epochs, batch_size, validation_data)
--------------------------------------------------------------------------------
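A note on the `combiner='sqrtn'` argument used above: with `sp_weights=None` every weight is 1, so `tf.nn.embedding_lookup_sparse` divides the sum of the gathered embeddings by the square root of their count, which is exactly the 1/√|N(u)| (and 1/√|H(i)|) scaling in the SVD++ formula. A toy NumPy sketch of the equivalence, with made-up values:

```python
import numpy as np

# y_j for the items j in N(u), with |N(u)| = 3.
y = np.array([[0.1, 0.2],
              [0.3, 0.0],
              [0.1, 0.1]])

sqrtn_combined = y.sum(axis=0) / np.sqrt(len(y))        # what 'sqrtn' computes
formula_term = (1.0 / np.sqrt(len(y))) * y.sum(axis=0)  # the SVD++ term

assert np.allclose(sqrtn_combined, formula_term)
```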
/tfcf/models/svd.py:
--------------------------------------------------------------------------------
import numpy as np
import tensorflow as tf

try:
    from tensorflow.keras import utils
except ImportError:
    from tensorflow.contrib.keras import utils

from .model_base import BaseModel
from ..utils.data_utils import BatchGenerator
from ..metrics import mae
from ..metrics import rmse


class SVD(BaseModel):
    """Collaborative filtering model based on the SVD algorithm.
    """

    def __init__(self, config, sess):
        super(SVD, self).__init__(config)
        self._sess = sess

    def _create_placeholders(self):
        """Returns the placeholders.
        """
        with tf.variable_scope('placeholder'):
            users = tf.placeholder(tf.int32, shape=[None, ], name='users')
            items = tf.placeholder(tf.int32, shape=[None, ], name='items')
            ratings = tf.placeholder(
                tf.float32, shape=[None, ], name='ratings')

        return users, items, ratings

    def _create_constants(self, mu):
        """Returns the constants.
        """
        with tf.variable_scope('constant'):
            _mu = tf.constant(mu, shape=[], dtype=tf.float32)

        return _mu

    def _create_user_terms(self, users):
        """Returns the tensors related to users.
        """
        num_users = self.num_users
        num_factors = self.num_factors

        with tf.variable_scope('user'):
            user_embeddings = tf.get_variable(
                name='embedding',
                shape=[num_users, num_factors],
                initializer=tf.contrib.layers.xavier_initializer(),
                regularizer=tf.contrib.layers.l2_regularizer(self.reg_p_u))

            user_bias = tf.get_variable(
                name='bias',
                shape=[num_users, ],
                initializer=tf.contrib.layers.xavier_initializer(),
                regularizer=tf.contrib.layers.l2_regularizer(self.reg_b_u))

            p_u = tf.nn.embedding_lookup(
                user_embeddings,
                users,
                name='p_u')

            b_u = tf.nn.embedding_lookup(
                user_bias,
                users,
                name='b_u')

        return p_u, b_u

    def _create_item_terms(self, items):
        """Returns the tensors related to items.
        """
        num_items = self.num_items
        num_factors = self.num_factors

        with tf.variable_scope('item'):
            item_embeddings = tf.get_variable(
                name='embedding',
                shape=[num_items, num_factors],
                initializer=tf.contrib.layers.xavier_initializer(),
                regularizer=tf.contrib.layers.l2_regularizer(self.reg_q_i))

            item_bias = tf.get_variable(
                name='bias',
                shape=[num_items, ],
                initializer=tf.contrib.layers.xavier_initializer(),
                regularizer=tf.contrib.layers.l2_regularizer(self.reg_b_i))

            q_i = tf.nn.embedding_lookup(
                item_embeddings,
                items,
                name='q_i')

            b_i = tf.nn.embedding_lookup(
                item_bias,
                items,
                name='b_i')

        return q_i, b_i
    def _create_prediction(self, mu, b_u, b_i, p_u, q_i):
        """Returns the tensor of predictions.

        Note that the prediction is
            r_hat = mu + b_u + b_i + p_u * q_i
        """
        with tf.variable_scope('prediction'):
            pred = tf.reduce_sum(
                tf.multiply(p_u, q_i),
                axis=1)

            pred = tf.add_n([b_u, b_i, pred])

            pred = tf.add(pred, mu, name='pred')

        return pred

    def _create_loss(self, pred, ratings):
        """Returns the L2 loss of the difference between
        ground truths and predictions.

        The formula is:
            L2 = sum((r - r_hat) ** 2) / 2
        """
        with tf.variable_scope('loss'):
            loss = tf.nn.l2_loss(tf.subtract(ratings, pred), name='loss')

        return loss

    def _create_optimizer(self, loss):
        """Returns the optimizer.

        The objective function is defined as the sum of the
        loss and the regularizers' losses.
        """
        with tf.variable_scope('optimizer'):
            objective = tf.add(
                loss,
                tf.add_n(tf.get_collection(
                    tf.GraphKeys.REGULARIZATION_LOSSES)),
                name='objective')

            try:
                optimizer = tf.contrib.keras.optimizers.Nadam(
                ).minimize(objective, name='optimizer')
            except Exception:
                optimizer = tf.train.AdamOptimizer().minimize(objective, name='optimizer')

        return optimizer
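
    # Sanity check of the loss convention above (toy numbers): for
    # ratings = [4., 3.] and pred = [3., 5.], tf.nn.l2_loss(ratings - pred)
    # evaluates to (1 ** 2 + 2 ** 2) / 2 = 2.5, i.e. sum((r - r_hat) ** 2) / 2,
    # matching the formula in the _create_loss docstring. The regularization
    # losses collected from tf.GraphKeys.REGULARIZATION_LOSSES are then added
    # on top of this in _create_optimizer to form the objective.
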
    def _build_graph(self, mu):
        _mu = self._create_constants(mu)

        self._users, self._items, self._ratings = self._create_placeholders()

        p_u, b_u = self._create_user_terms(self._users)
        q_i, b_i = self._create_item_terms(self._items)

        self._pred = self._create_prediction(_mu, b_u, b_i, p_u, q_i)

        loss = self._create_loss(self._pred, self._ratings)

        self._optimizer = self._create_optimizer(loss)

        self._built = True

    def _run_train(self, x, y, epochs, batch_size, validation_data):
        train_gen = BatchGenerator(x, y, batch_size)
        steps_per_epoch = np.ceil(train_gen.length / batch_size).astype(int)

        self._sess.run(tf.global_variables_initializer())

        for e in range(1, epochs + 1):
            print('Epoch {}/{}'.format(e, epochs))

            pbar = utils.Progbar(steps_per_epoch)

            for step, batch in enumerate(train_gen.next(), 1):
                users = batch[0][:, 0]
                items = batch[0][:, 1]
                ratings = batch[1]

                self._sess.run(
                    self._optimizer,
                    feed_dict={
                        self._users: users,
                        self._items: items,
                        self._ratings: ratings
                    })

                pred = self.predict(batch[0])

                update_values = [
                    ('rmse', rmse(ratings, pred)),
                    ('mae', mae(ratings, pred))
                ]

                if validation_data is not None and step == steps_per_epoch:
                    valid_x, valid_y = validation_data
                    valid_pred = self.predict(valid_x)

                    update_values += [
                        ('val_rmse', rmse(valid_y, valid_pred)),
                        ('val_mae', mae(valid_y, valid_pred))
                    ]

                pbar.update(step, values=update_values,
                            force=(step == steps_per_epoch))

    def train(self, x, y, epochs=100, batch_size=1024, validation_data=None):

        if x.shape[0] != y.shape[0] or x.shape[1] != 2:
            raise ValueError('The shape of x should be (samples, 2) and '
                             'the shape of y should be (samples,).')

        if not self._built:
            self._build_graph(np.mean(y))

        self._run_train(x, y, epochs, batch_size, validation_data)

    def predict(self, x):
        if not self._built:
            raise RuntimeError('The model must be trained '
                               'before prediction.')

        if x.shape[1] != 2:
            raise ValueError('The shape of x should be '
                             '(samples, 2).')

        pred = self._sess.run(
            self._pred,
            feed_dict={
                self._users: x[:, 0],
                self._items: x[:, 1]
            })

        pred = pred.clip(min=self.min_value, max=self.max_value)

        return pred
--------------------------------------------------------------------------------