├── reclab ├── recommenders │ ├── llorma │ │ ├── llorma_lib │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── train_utils.py │ │ │ └── anchor.py │ │ ├── __init__.py │ │ └── llorma.py │ ├── cfnade │ │ ├── __init__.py │ │ ├── cfnade_lib │ │ │ ├── nade.py │ │ │ └── utils.py │ │ └── cfnade.py │ ├── autorec │ │ ├── __init__.py │ │ ├── autorec_lib │ │ │ └── autorec.py │ │ └── autorec.py │ ├── __init__.py │ ├── top_pop.py │ ├── baseline.py │ ├── README.md │ ├── sparse.py │ ├── knn_recommender.py │ └── libfm.py ├── __init__.py ├── environments │ ├── __init__.py │ ├── fixed_rating.py │ ├── contextual.py │ ├── README.md │ ├── beta_rank.py │ ├── schmit.py │ ├── registry.py │ ├── topics.py │ └── latent_factors.py └── data_utils.py ├── MANIFEST.in ├── setup.cfg ├── tests ├── __init__.py ├── test_ease.py ├── test_slim.py ├── test_simple_example.py ├── test_knn.py ├── test_cfnade.py ├── test_contextual.py ├── test_autorec.py ├── test_llorma.py ├── test_top_pop.py ├── test_beta_rank.py ├── test_fixed.py ├── test_libfm.py ├── utils.py └── test_topics.py ├── figures └── RecSys.png ├── models └── ml-100k │ └── fm_model.npz ├── update_docs.sh ├── .gitignore ├── lint.sh ├── LICENSE.txt ├── .travis.yml ├── requirements.txt ├── setup.py ├── README.md └── .pylintrc /reclab/recommenders/llorma/llorma_lib/README.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include data/ml-100k-model/fm_model.npz 2 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md 3 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | """A set of tests for both recommenders and environments.""" 2 | -------------------------------------------------------------------------------- /figures/RecSys.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/berkeley-reclab/RecLab/HEAD/figures/RecSys.png -------------------------------------------------------------------------------- /models/ml-100k/fm_model.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/berkeley-reclab/RecLab/HEAD/models/ml-100k/fm_model.npz -------------------------------------------------------------------------------- /update_docs.sh: -------------------------------------------------------------------------------- 1 | pdoc --force --html --output-dir docs reclab 2 | mv -r docs/reclab/* ../berkeley-reclab.github.io/docs/ 3 | -------------------------------------------------------------------------------- /reclab/__init__.py: -------------------------------------------------------------------------------- 1 | """This package contains environments and models for recommendation.""" 2 | from .environments import make 3 | -------------------------------------------------------------------------------- /reclab/recommenders/cfnade/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | The package for the Cfnade recommender. 3 | 4 | See https://arxiv.org/abs/1605.09477 for details. 
5 | """ 6 | from .cfnade import Cfnade 7 | -------------------------------------------------------------------------------- /reclab/recommenders/autorec/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | The package for the Autorec recommender. 3 | 4 | See https://doi.org/10.1145/2740908.2742726 for details. 5 | """ 6 | from .autorec import Autorec 7 | -------------------------------------------------------------------------------- /reclab/recommenders/llorma/__init__.py: -------------------------------------------------------------------------------- 1 | """The package for the Global LLORMA recommender. 2 | 3 | Code modified from https://github.com/JoonyoungYi/LLORMA-tensorflow 4 | """ 5 | from .llorma import Llorma 6 | -------------------------------------------------------------------------------- /reclab/recommenders/llorma/llorma_lib/__init__.py: -------------------------------------------------------------------------------- 1 | """ Init 2 | """ 3 | from .llorma_g import Llorma, LocalModel, BatchManager 4 | from .anchor import AnchorManager 5 | from .train_utils import init_latent_mat, init_session, get_train_op 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /**/__pycache__/ 2 | 3 | models 4 | train.libfm 5 | test.libfm 6 | predictions 7 | 8 | .ipynb_checkpoints 9 | *.swo 10 | *.swp 11 | *.egg-info 12 | 13 | results 14 | experiments/data 15 | experiments/results 16 | 17 | .coverage 18 | .venv 19 | model.h5 20 | dist/ 21 | -------------------------------------------------------------------------------- /reclab/environments/__init__.py: -------------------------------------------------------------------------------- 1 | """The package that contains all environments.""" 2 | from .beta_rank import BetaRank 3 | from .contextual import Contextual 4 | from .environment import DictEnvironment 5 | from .environment import Environment 6 | from .fixed_rating import FixedRating 7 | from .latent_factors import LatentFactorBehavior, DatasetLatentFactor 8 | from .registry import make 9 | from .schmit import Schmit 10 | from .topics import Topics 11 | -------------------------------------------------------------------------------- /lint.sh: -------------------------------------------------------------------------------- 1 | pylint --rcfile=.pylintrc reclab -f parseable -r n --load-plugins pylint_quotes 2 | pycodestyle reclab --max-line-length=100 --exclude=reclab/recommenders/autorec/autorec_lib,reclab/recommenders/cfnade/cfnade_lib,reclab/recommenders/llorma/llorma_lib 3 | pydocstyle reclab --match-dir="^(?!autorec_lib|cfnade_lib|llorma_lib).*" 4 | pylint --rcfile=.pylintrc tests -f parseable -r n --load-plugins pylint_quotes 5 | pycodestyle tests --max-line-length=100 6 | pydocstyle tests 7 | -------------------------------------------------------------------------------- /tests/test_ease.py: -------------------------------------------------------------------------------- 1 | """Tests for the EASE recommender.""" 2 | from reclab.recommenders import EASE 3 | from . 
import utils 4 | 5 | 6 | def test_predict(): 7 | """Test that EASE predicts well and that it gets better with more data.""" 8 | recommender = EASE(lam=100, binarize=True) 9 | utils.test_binary_recommend_ml100k(recommender, 0.1) 10 | 11 | 12 | def test_recommend(): 13 | """Test that EASE will recommend reasonable items.""" 14 | recommender = EASE(lam=100) 15 | utils.test_recommend_simple(recommender) 16 | -------------------------------------------------------------------------------- /tests/test_slim.py: -------------------------------------------------------------------------------- 1 | """Tests for the SLIM recommender.""" 2 | from reclab.recommenders import SLIM 3 | from . import utils 4 | 5 | 6 | def test_predict(): 7 | """Test that SLIM predicts well and that it gets better with more data.""" 8 | recommender = SLIM(alpha=0.1, l1_ratio=1e-3, seed=0) 9 | utils.test_binary_recommend_ml100k(recommender, 0.1) 10 | 11 | 12 | def test_recommend(): 13 | """Test that SLIM will recommend reasonable items.""" 14 | recommender = SLIM(alpha=0.1, l1_ratio=1e-3, seed=0) 15 | utils.test_recommend_simple(recommender) 16 | -------------------------------------------------------------------------------- /reclab/recommenders/__init__.py: -------------------------------------------------------------------------------- 1 | """A set of recommender to be used in conjunction with environments.""" 2 | from .baseline import RandomRec 3 | from .baseline import PerfectRec 4 | from .knn_recommender import KNNRecommender 5 | from .recommender import Recommender 6 | from .recommender import PredictRecommender 7 | from .top_pop import TopPop 8 | 9 | try: 10 | from .autorec import Autorec 11 | from .cfnade import Cfnade 12 | from .libfm import LibFM 13 | from .llorma import Llorma 14 | from .sparse import SLIM, EASE 15 | except ImportError: 16 | pass 17 | -------------------------------------------------------------------------------- /tests/test_simple_example.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test the basic example found in the README 3 | """ 4 | import numpy as np 5 | import reclab 6 | 7 | 8 | def test_basic_example(): 9 | """Test the basic example in the README.""" 10 | env = reclab.make('topics-dynamic-v1') 11 | items, users, ratings = env.reset() 12 | for i in range(1): 13 | online_users = env.online_users 14 | # Your recommendation algorithm here. This recommends 10 random items to each online user. 15 | recommendations = np.random.choice(list(items), size=(len(online_users), 10)) 16 | _, _, ratings, info = env.step(recommendations) 17 | env.close() 18 | -------------------------------------------------------------------------------- /tests/test_knn.py: -------------------------------------------------------------------------------- 1 | """Tests for the KNN recommender.""" 2 | from reclab.recommenders import KNNRecommender 3 | from . 
import utils 4 | 5 | 6 | def test_user_predict(): 7 | """Test that KNN-user predicts well and that it gets better with more data.""" 8 | recommender = KNNRecommender(user_based=True) 9 | utils.test_predict_ml100k(recommender, rmse_threshold=1.1, test_dense=True) 10 | 11 | 12 | def test_item_predict(): 13 | """Test that KNN-user predicts well and that it gets better with more data.""" 14 | recommender = KNNRecommender(user_based=False, shrinkage=0.1) 15 | utils.test_predict_ml100k(recommender, rmse_threshold=1.5, test_dense=True) 16 | 17 | 18 | def test_user_recommend(): 19 | """Test that KNN-item will recommend reasonable items.""" 20 | recommender = KNNRecommender(user_based=True) 21 | utils.test_recommend_simple(recommender) 22 | 23 | 24 | def test_item_recommend(): 25 | """Test that KNN-item will recommend reasonable items.""" 26 | recommender = KNNRecommender(user_based=True) 27 | utils.test_recommend_simple(recommender) 28 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Karl Krauth 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | 3 | matrix: 4 | include: 5 | - os: osx 6 | language: generic 7 | env: PYTHON_VERSION=3.8 8 | - os: linux 9 | dist: xenial 10 | python: 3.8 11 | env: PYTHON_VERSION=3.8 12 | 13 | # Command to install dependencies 14 | install: 15 | - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then 16 | wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh; 17 | else 18 | wget http://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -O miniconda.sh; 19 | fi 20 | - bash miniconda.sh -b -p $HOME/miniconda 21 | - export PATH="$HOME/miniconda/bin:$PATH" 22 | - hash -r 23 | - conda config --set always_yes yes --set changeps1 no 24 | - conda update -q conda 25 | - conda info -a 26 | - conda create -q -n test-environment python=$PYTHON_VERSION 27 | - source activate test-environment 28 | - pip --version 29 | - pip install -r requirements.txt 30 | 31 | # Command to run tests 32 | script: 33 | - bash lint.sh 34 | - pytest --durations=0 --cov=reclab tests 35 | after_success: 36 | - coveralls 37 | notifications: 38 | email: false 39 | -------------------------------------------------------------------------------- /tests/test_cfnade.py: -------------------------------------------------------------------------------- 1 | """Tests for the CFNADE recommender.""" 2 | from reclab.recommenders.cfnade import Cfnade 3 | from . import utils 4 | 5 | 6 | def test_cfnade_predict(): 7 | """Test that CFNADE predicts well and that it gets better with more data.""" 8 | recommender = Cfnade(num_users=utils.NUM_USERS_ML100K, 9 | num_items=utils.NUM_ITEMS_ML100K, 10 | batch_size=64, 11 | train_epoch=10, 12 | rating_bucket=5, 13 | hidden_dim=250, 14 | learning_rate=0.001, 15 | random_seed=0) 16 | utils.test_predict_ml100k(recommender, rmse_threshold=1.2) 17 | 18 | 19 | def test_cfnade_recommend(): 20 | """Test that CFNADE will recommend reasonable items.""" 21 | recommender = Cfnade(num_users=utils.NUM_USERS_SIMPLE, 22 | num_items=utils.NUM_ITEMS_SIMPLE, 23 | batch_size=1, 24 | train_epoch=10, 25 | rating_bucket=5, 26 | hidden_dim=250, 27 | learning_rate=0.001, 28 | random_seed=0) 29 | utils.test_recommend_simple(recommender) 30 | -------------------------------------------------------------------------------- /tests/test_contextual.py: -------------------------------------------------------------------------------- 1 | """Tests for the Contextual environment.""" 2 | import numpy as np 3 | 4 | from reclab.environments import Contextual 5 | 6 | 7 | def test_contextual_wiki(): 8 | """Test contextual instantiated with Wiki10-31k.""" 9 | env = Contextual('wiki10-31k') 10 | assert env.name == 'contextual' 11 | users, items, ratings = env.reset() 12 | 13 | # Test that the users and items have empty features. 14 | assert users[0].shape == (0,) 15 | assert items[0].shape == (0,) 16 | 17 | # Test that contexts have a given size. 18 | assert env.online_users[0].shape == (101938,) 19 | context = env.online_users[0] 20 | 21 | # Test the number of users and items. 22 | assert len(env.online_users) == 1 23 | assert len(users) == 1 24 | assert len(items) == 30938 25 | 26 | # Recommend item 0, we should a new user and no new items. 
27 | users, items, ratings, _ = env.step(np.array([[0]])) 28 | assert len(users) == 1 29 | assert 1 in users 30 | assert len(items) == 0 31 | 32 | # The first user should have left. 33 | assert 0 not in env.users 34 | 35 | # We should only have received one rating of 0. 36 | assert len(ratings) == 1 37 | assert ratings[(0, 0)][0] == 0.0 38 | assert np.array_equal(ratings[(0, 0)][1], context) 39 | -------------------------------------------------------------------------------- /tests/test_autorec.py: -------------------------------------------------------------------------------- 1 | """Tests for the Autorec recommender.""" 2 | from reclab.recommenders import Autorec 3 | from . import utils 4 | 5 | 6 | def test_predict(): 7 | """Test that Autorec predicts well and that it gets better with more data.""" 8 | recommender = Autorec(utils.NUM_USERS_ML100K, 9 | utils.NUM_ITEMS_ML100K, 10 | hidden_neuron=500, 11 | lambda_value=20, 12 | train_epoch=50, 13 | batch_size=20, 14 | grad_clip=False, 15 | base_lr=1e-4, 16 | random_seed=0) 17 | utils.test_predict_ml100k(recommender, rmse_threshold=1.3) 18 | 19 | 20 | def test_recommend(): 21 | """Test that Autorec will recommend reasonable items.""" 22 | recommender = Autorec(utils.NUM_USERS_SIMPLE, 23 | utils.NUM_ITEMS_SIMPLE, 24 | hidden_neuron=500, 25 | lambda_value=20, 26 | train_epoch=1000, 27 | batch_size=20, 28 | grad_clip=False, 29 | base_lr=1e-4, 30 | random_seed=0) 31 | utils.test_recommend_simple(recommender) 32 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py==0.9.0 2 | astroid==2.4.2 3 | astunparse==1.6.3 4 | attrs==19.3.0 5 | cachetools==4.1.1 6 | certifi==2020.6.20 7 | chardet==3.0.4 8 | coveralls==2.1.2 9 | future==0.18.2 10 | gast==0.3.3 11 | google-auth==1.19.1 12 | google-auth-oauthlib==0.4.1 13 | google-pasta==0.2.0 14 | grpcio==1.30.0 15 | h5py==2.10.0 16 | idna==2.10 17 | importlib-metadata==1.7.0 18 | isort==4.3.21 19 | joblib==0.16.0 20 | Keras==2.4.3 21 | Keras-Preprocessing==1.1.2 22 | lazy-object-proxy==1.4.3 23 | Markdown==3.2.2 24 | mccabe==0.6.1 25 | more-itertools==8.4.0 26 | numpy==1.19.0 27 | oauthlib==3.1.0 28 | opt-einsum==3.2.1 29 | packaging==20.4 30 | pandas==1.0.5 31 | pluggy==0.13.1 32 | protobuf==3.15.0 33 | py==1.10.0 34 | pyasn1==0.4.8 35 | pyasn1-modules==0.2.8 36 | pybind11==2.5.0 37 | pycodestyle==2.6.0 38 | pydocstyle==5.0.2 39 | pylint==2.5.3 40 | pylint-quotes==0.2.1 41 | pyparsing==2.4.7 42 | pytest==5.4.3 43 | pytest-cov==2.10.1 44 | pytest-mock==3.3.0 45 | python-dateutil==2.8.1 46 | pytz==2020.1 47 | PyYAML==5.4 48 | requests==2.24.0 49 | requests-oauthlib==1.3.0 50 | rsa==4.7 51 | scikit-learn==0.23.1 52 | scipy==1.4.1 53 | six==1.15.0 54 | sklearn==0.0 55 | snowballstemmer==2.0.0 56 | tensorboard==2.2.2 57 | tensorboard-plugin-wit==1.7.0 58 | tensorflow>=2.2.1 59 | tensorflow-estimator==2.2.0 60 | termcolor==1.1.0 61 | threadpoolctl==2.1.0 62 | toml==0.10.1 63 | torch==1.5.1 64 | typed-ast==1.4.1 65 | urllib3==1.26.5 66 | wcwidth==0.2.5 67 | Werkzeug==1.0.1 68 | wpyfm==0.1.9 69 | wrapt==1.12.1 70 | zipp==3.1.0 71 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | 3 | 4 | setup( 5 | name='RecLab', 6 | version='0.1.2', 7 | author='Karl Krauth', 8 | author_email='karl.krauth@gmail.com', 9 
| description='A simulation framework for recommender systems.', 10 | license='MIT', 11 | download_url= 'https://github.com/berkeley-reclab/RecLab/archive/v0.1.2.tar.gz', 12 | packages=find_packages(), 13 | include_package_data=True, 14 | url='https://berkeley-reclab.github.io/', 15 | keywords=[ 16 | 'recommender', 17 | 'recommendation', 18 | 'simulation', 19 | 'evaluation' 20 | ], 21 | install_requires=[ 22 | 'numpy>=1.19.0', 23 | 'pandas>=1.0.5', 24 | 'scipy>=1.4.1', 25 | ], 26 | extras_require={ 27 | 'recommenders': [ 28 | 'keras>=2.4.3', 29 | 'scikit-learn>=0.23.1', 30 | 'tensorflow>=2.2.0', 31 | 'torch>=1.5.1', 32 | 'wpyfm>=0.1.9', 33 | ] 34 | }, 35 | tests_require=[ 36 | 'pytest>=5.4.3', 37 | 'pytest-mock>=3.3.0', 38 | ], 39 | python_requires='>=3.6', 40 | classifiers=[ 41 | 'Development Status :: 3 - Alpha', 42 | 'Intended Audience :: Science/Research', 43 | 'Topic :: Scientific/Engineering :: Artificial Intelligence', 44 | 'License :: OSI Approved :: MIT License', 45 | 'Programming Language :: Python :: 3', 46 | 'Programming Language :: Python :: 3.6', 47 | 'Programming Language :: Python :: 3.7', 48 | 'Programming Language :: Python :: 3.8', 49 | ], 50 | ) 51 | -------------------------------------------------------------------------------- /reclab/recommenders/top_pop.py: -------------------------------------------------------------------------------- 1 | """An implementation of the top popularity baseline recommender.""" 2 | 3 | import numpy as np 4 | import scipy.sparse 5 | 6 | from . import recommender 7 | 8 | 9 | # TODO: add flag to allow this to also be based on number of times rated. 10 | class TopPop(recommender.PredictRecommender): 11 | """The top popularity recommendation model based on ratings.""" 12 | 13 | @property 14 | def name(self): # noqa: D102 15 | return 'top-pop' 16 | 17 | @property 18 | def dense_predictions(self): # noqa: D102 19 | if self._dense_predictions is None: 20 | item_vector = self._average_item_ratings() 21 | self._dense_predictions = np.vstack([item_vector] * self._ratings.shape[0]) 22 | return self._dense_predictions 23 | 24 | def _average_item_ratings(self): 25 | # Compute average rating of each item 26 | row, col = self._ratings.nonzero() 27 | data = np.ones(len(row)) 28 | binary_ratings = scipy.sparse.csr_matrix((data, (row, col)), shape=self._ratings.shape) 29 | 30 | summed_item_ratings = self._ratings.sum(0) 31 | num_times_rated = binary_ratings.sum(0) 32 | 33 | item_vector = np.mean(self._ratings) * np.ones(num_times_rated.shape) 34 | idx_rated = np.where(num_times_rated > 0) 35 | item_vector[idx_rated] = summed_item_ratings[idx_rated] / num_times_rated[idx_rated] 36 | 37 | return item_vector.flatten() 38 | 39 | def _predict(self, user_item): # noqa: D102 40 | # Predict on all user-item pairs. 41 | average_item_ratings = self._average_item_ratings() 42 | predictions = [] 43 | for _, item_id, _ in user_item: 44 | predictions.append(average_item_ratings[item_id]) 45 | 46 | return np.array(predictions) 47 | -------------------------------------------------------------------------------- /tests/test_llorma.py: -------------------------------------------------------------------------------- 1 | """Tests for the LLORMA recommender.""" 2 | from reclab.recommenders.llorma import Llorma 3 | from . 
import utils 4 | 5 | 6 | def test_llorma_predict(): 7 | """Test that LLORMA predicts well and that it gets better with more data.""" 8 | recommender = Llorma(max_user=utils.NUM_USERS_ML100K, 9 | max_item=utils.NUM_ITEMS_ML100K, 10 | n_anchor=10, 11 | pre_rank=10, 12 | pre_learning_rate=3e-4, 13 | pre_lambda_val=0.01, 14 | pre_train_steps=70, 15 | rank=20, 16 | learning_rate=2e-2, 17 | lambda_val=1e-4, 18 | train_steps=50, 19 | batch_size=1000, 20 | use_cache=False, 21 | result_path='results', 22 | random_seed=0) 23 | utils.test_predict_ml100k(recommender, rmse_threshold=1.1) 24 | 25 | 26 | def test_llorma_recommend(): 27 | """Test that LLORMA will recommend reasonable items.""" 28 | recommender = Llorma(max_user=utils.NUM_USERS_ML100K, 29 | max_item=utils.NUM_ITEMS_ML100K, 30 | n_anchor=10, 31 | pre_rank=10, 32 | pre_learning_rate=3e-4, 33 | pre_lambda_val=0.01, 34 | pre_train_steps=70, 35 | rank=20, 36 | learning_rate=2e-2, 37 | lambda_val=1e-4, 38 | train_steps=50, 39 | batch_size=1000, 40 | use_cache=False, 41 | result_path='results', 42 | random_seed=0) 43 | utils.test_recommend_simple(recommender) -------------------------------------------------------------------------------- /reclab/recommenders/llorma/llorma_lib/train_utils.py: -------------------------------------------------------------------------------- 1 | """ LLORMA training utils 2 | """ 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | import math 8 | import tensorflow as tf 9 | 10 | 11 | def init_session(): 12 | """Initializes TF session 13 | 14 | Returns 15 | ------- 16 | obj: tf.Session 17 | Returns TF Session 18 | """ 19 | # gpu_options = tf.GPUOptions( 20 | # per_process_gpu_memory_fraction=GPU_MEMORY_FRAC) 21 | # gpu_config = tf.ConfigProto(gpu_options=gpu_options) 22 | # session = tf.Session(config=gpu_config) 23 | 24 | config = tf.compat.v1.ConfigProto() 25 | config.gpu_options.allow_growth = True 26 | 27 | session = tf.compat.v1.Session(config=config) 28 | session.run(tf.compat.v1.global_variables_initializer()) 29 | return session 30 | 31 | 32 | def get_train_op(optimizer, loss, var_list): 33 | """ Get a train operation 34 | 35 | Parameters 36 | ---------- 37 | optimizer : obj 38 | Valid TensorFlow optimizer, 39 | e.g. 
tf.train.GradientDescentOptimizer 40 | loss : obj 41 | TF variable 42 | var_list : obj 43 | List of TF variables 44 | """ 45 | gvs = optimizer.compute_gradients(loss, var_list=var_list) 46 | # capped_gvs = [(tf.clip_by_value(grad, -100.0, 100.0), var) 47 | # for grad, var in gvs] 48 | capped_gvs = gvs 49 | train_op = optimizer.apply_gradients(capped_gvs) 50 | return train_op 51 | 52 | 53 | def init_latent_mat(n, rank, mu_val, std_val): 54 | """Initialize a matrix for the latent factors 55 | 56 | Parameters 57 | ---------- 58 | n : int 59 | Number of user/items 60 | rank : int 61 | Size of the latent dimension 62 | mu_val : float 63 | Unscaled mean value 64 | std_val : float 65 | Unscaled standard deviation value 66 | """ 67 | _mu = math.sqrt(mu_val / rank) 68 | _std = math.sqrt((math.sqrt(mu_val * mu_val + std_val * std_val) - mu_val) / rank) 69 | return tf.Variable( 70 | tf.compat.v1.truncated_normal([n, rank], _mu, _std, dtype=tf.float64)) 71 | -------------------------------------------------------------------------------- /reclab/recommenders/autorec/autorec_lib/autorec.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | class AutoRec(torch.nn.Module): 4 | def __init__(self, num_users, num_items, 5 | seen_users, seen_items, 6 | hidden_neuron, 7 | dropout=0.05, random_seed=0): 8 | super(AutoRec, self).__init__() 9 | self.num_users = num_users 10 | self.num_items = num_items 11 | self.seen_users = seen_users 12 | self.seen_items = seen_items 13 | 14 | self.hidden_neuron = hidden_neuron 15 | self.random_seed = random_seed 16 | self.dropout_p = dropout 17 | self.sigmoid = torch.nn.Sigmoid() 18 | 19 | def loss(self, pred, test, mask, lambda_value=1): 20 | mse = (((pred * mask) - test) ** 2).sum() 21 | reg_value_enc = torch.mul(lambda_value / 2, list(self.encoder.parameters())[0].norm(p='fro') ** 2) 22 | reg_value_dec = torch.mul(lambda_value / 2, list(self.decoder.parameters())[0].norm(p='fro') ** 2) 23 | return torch.add(mse, torch.add(reg_value_enc, reg_value_dec)) 24 | 25 | def prepare_model(self): 26 | self.encoder = torch.nn.Linear(self.num_users, self.hidden_neuron, bias=True) 27 | self.dropout = torch.nn.Dropout(p=self.dropout_p) 28 | self.decoder = torch.nn.Linear(self.hidden_neuron, self.num_users, bias=True) 29 | 30 | def forward(self, x): 31 | x = self.encoder(x) 32 | x = self.sigmoid(x) 33 | x = self.dropout(x) 34 | x = self.decoder(x) 35 | return x 36 | 37 | def predict(self, user_item, test_data): 38 | users = [triple[0] for triple in user_item] 39 | items = [triple[1] for triple in user_item] 40 | 41 | user_item = zip(users, items) 42 | user_idx = set(users) 43 | item_idx = set(items) 44 | Estimated_R = self.forward(test_data) 45 | for item in range(test_data.shape[0]): 46 | for user in range(test_data.shape[1]): 47 | if user not in self.seen_users and item not in self.seen_items: 48 | Estimated_R[item, user] = 3 49 | idx = [tuple(users), tuple(items)] 50 | Estimated_R = Estimated_R.clamp(1, 5) 51 | return Estimated_R.T[idx].cpu().detach().numpy() 52 | -------------------------------------------------------------------------------- /tests/test_top_pop.py: -------------------------------------------------------------------------------- 1 | """Tests for the TopPop recommender.""" 2 | import collections 3 | 4 | import numpy as np 5 | 6 | from reclab.recommenders import TopPop 7 | 8 | 9 | def test_top_pop_one_step(): 10 | """Test a single recommendation step.""" 11 | users = {0: np.zeros((0,)), 12 | 1: np.zeros((0,)), 13 | 
2: np.zeros((0,))} 14 | items = {0: np.zeros((0,)), 15 | 1: np.zeros((0,)), 16 | 2: np.zeros((0,))} 17 | ratings = {(0, 0): (5, np.zeros((0,))), 18 | (0, 1): (4, np.zeros((0,))), 19 | (1, 1): (4, np.zeros((0,))), 20 | (1, 2): (3, np.zeros((0,)))} 21 | user_contexts = collections.OrderedDict([(0, np.zeros((0,))), 22 | (1, np.zeros((0,))), 23 | (2, np.zeros((0,)))]) 24 | 25 | recommender = TopPop() 26 | recommender.reset(users, items, ratings) 27 | recs, _ = recommender.recommend(user_contexts, 1) 28 | assert recs.shape == (3, 1) 29 | assert recs[0, 0] == 2 30 | assert recs[1, 0] == 0 31 | assert recs[2, 0] == 0 32 | 33 | 34 | def test_top_pop_multi_step(): 35 | """Test multiple rounds of recommending and rating.""" 36 | users = {0: np.zeros((0,)), 37 | 1: np.zeros((0,))} 38 | items = {0: np.zeros((0,)), 39 | 1: np.zeros((0,)), 40 | 2: np.zeros((0,))} 41 | ratings = {(0, 0): (5, np.zeros((0,))), 42 | (1, 1): (3, np.zeros((0,)))} 43 | user_contexts = collections.OrderedDict([(0, np.zeros((0,))), 44 | (1, np.zeros((0,)))]) 45 | 46 | recommender = TopPop() 47 | recommender.reset(users, items, ratings) 48 | recs, _ = recommender.recommend(user_contexts, 1) 49 | assert recs.shape == (2, 1) 50 | assert recs[0, 0] == 1 51 | assert recs[1, 0] == 0 52 | user_contexts[2] = np.zeros((0,)) 53 | recommender.update(users={2: np.zeros((0,))}, 54 | ratings={(0, 1): (5, np.zeros((0,))), 55 | (1, 0): (1, np.zeros((0,)))}) 56 | recs, _ = recommender.recommend(user_contexts, 1) 57 | assert recs.shape == (3, 1) 58 | assert recs[0, 0] == 2 59 | assert recs[1, 0] == 2 60 | assert recs[2, 0] == 1 61 | -------------------------------------------------------------------------------- /reclab/environments/fixed_rating.py: -------------------------------------------------------------------------------- 1 | """A simple environment for debugging. Each user will either always rate an item a 1 or a 5.""" 2 | import numpy as np 3 | 4 | from . import environment 5 | 6 | 7 | class FixedRating(environment.DictEnvironment): 8 | """An environment in which half the users rate all items with a 1 and the other half with a 5. 9 | 10 | Parameters 11 | ---------- 12 | num_users : int 13 | The number of users in the environment. 14 | num_items : int 15 | The number of items in the environment. 16 | rating_frequency : float 17 | What proportion of users will need a recommendation at each step. 18 | num_init_ratings : int 19 | The number of initial ratings available when the environment is reset. 20 | 21 | """ 22 | 23 | def __init__(self, num_users, num_items, 24 | rating_frequency=0.2, num_init_ratings=0): 25 | """Create a FixedRating environment.""" 26 | super().__init__(rating_frequency, num_init_ratings) 27 | self._num_users = num_users 28 | self._num_items = num_items 29 | 30 | @property 31 | def name(self): # noqa: D102 32 | return 'fixed' 33 | 34 | def _get_dense_ratings(self): # noqa: D102 35 | ratings = np.ones([self._num_users, self._num_items]) 36 | ratings[:, self._num_items // 2:] = 5.0 37 | return ratings 38 | 39 | def _reset_state(self): # noqa: D102 40 | self._users = {user_id: np.zeros((0,)) for user_id in range(self._num_users)} 41 | self._items = {item_id: np.zeros((0,)) for item_id in range(self._num_items)} 42 | 43 | def _rate_items(self, user_id, item_ids): # noqa: D102 44 | # Find the largest item id that has not yet been rated. 
45 | max_id = None 46 | for item_id in sorted(item_ids, reverse=True): 47 | if (user_id, item_id) not in self._ratings: 48 | max_id = item_id 49 | break 50 | 51 | # If we have found an unrated item, rate it either 1 or 5. 52 | ratings = np.ones(len(item_ids)) * np.nan 53 | if max_id is not None: 54 | if max_id >= self._num_items // 2: 55 | ratings[item_ids == max_id] = 5.0 56 | else: 57 | ratings[item_ids == max_id] = 1.0 58 | 59 | return ratings 60 | -------------------------------------------------------------------------------- /tests/test_beta_rank.py: -------------------------------------------------------------------------------- 1 | """Tests for the BetaRank environment.""" 2 | import numpy as np 3 | 4 | from reclab.environments import BetaRank 5 | 6 | 7 | def test_beta_simple(): 8 | """Test BetaRank with only one user.""" 9 | env = BetaRank(dimension=10, 10 | num_users=1, 11 | num_items=2, 12 | rating_frequency=1.0, 13 | num_init_ratings=0) 14 | assert env.name == 'beta-rank' 15 | users, items, ratings = env.reset() 16 | 17 | # Test that the users and items have empty features. 18 | assert users[0].shape == (0,) 19 | assert items[0].shape == (0,) 20 | assert env.online_users[0].shape == (0,) 21 | 22 | # Recommend item 0, we shouldn't observe new users or items. 23 | users, items, ratings, _ = env.step(np.array([[0]])) 24 | assert users == {} 25 | assert items == {} 26 | 27 | # Test that item 0 falls in the [0, 1] range. 28 | assert ratings[(0, 0)][0] <= 1 and ratings[(0, 0)][0] >= 0 29 | 30 | 31 | def test_fixed_slates(): 32 | """Test FixedRating with slate recommendations.""" 33 | env = BetaRank(dimension=10, 34 | num_users=1, 35 | num_items=100, 36 | rating_frequency=1.0, 37 | num_init_ratings=0) 38 | env.seed(0) 39 | env.reset() 40 | assert ((env.dense_ratings >= 0) & (env.dense_ratings <= 1)).all() 41 | # Sort item ids from best to worst. 42 | item_ids = env.dense_ratings[0].argsort() 43 | # Swap the second largest and second smallest elements. 44 | item_ids[1], item_ids[-2] = item_ids[-2], item_ids[1] 45 | # The environment should pick the second item here since it will 46 | # have a larger value than other highly ranked items. 47 | _, _, ratings, _ = env.step(np.array([item_ids])) 48 | assert len(ratings) == 1 49 | assert (0, item_ids[1]) in ratings 50 | # Swap the tenth largest and tenth smallest elements. 51 | item_ids[9], item_ids[-10] = item_ids[-10], item_ids[9] 52 | # The environment should pick the tenth item here since it will 53 | # have a larger value than other highly ranked items, except for 54 | # the second item which has already been rated. 55 | _, _, ratings, _ = env.step(np.array([item_ids])) 56 | assert len(ratings) == 1 57 | assert (0, item_ids[9]) in ratings 58 | -------------------------------------------------------------------------------- /reclab/environments/contextual.py: -------------------------------------------------------------------------------- 1 | """ 2 | Contains the implementation for the Contextual environment. 3 | 4 | In a contextual environment, only one user is on the platform at a time. 5 | The user has no state, and only stays for one timestep. However, each user comes 6 | with a context that is predictive of its preferences for items. 7 | 8 | """ 9 | import collections 10 | 11 | import numpy as np 12 | 13 | from .. import data_utils 14 | from . import environment 15 | 16 | 17 | class Contextual(environment.DictEnvironment): 18 | """ 19 | An environment that implements the contextual bandit assumption. 
20 | 21 | Parameters 22 | ---------- 23 | name: string 24 | The dataset to instantiate the environment with. Can be one of: 'wiki10-31k'. 25 | user_dist_choice : str 26 | The choice of user distribution for selecting online users. By default, the subset of 27 | online users is chosen from a uniform distribution. Currently supports normal and lognormal. 28 | 29 | """ 30 | 31 | def __init__(self, name, user_dist_choice='uniform'): 32 | """Create a Contextual environment.""" 33 | self._features, self._full_ratings = data_utils.read_bandit_dataset(name) 34 | self._curr_user = 0 35 | super().__init__(rating_frequency=1, 36 | num_init_ratings=0, 37 | memory_length=0, 38 | user_dist_choice=user_dist_choice) 39 | 40 | @property 41 | def name(self): # noqa: D102 42 | return 'contextual' 43 | 44 | def _get_dense_ratings(self): # noqa: D102 45 | return self._full_ratings[:self._curr_user + 1].toarray() 46 | 47 | def _reset_state(self): # noqa: D102 48 | self._curr_user = 0 49 | self._users = collections.OrderedDict([(self._curr_user, np.zeros(0))]) 50 | self._items = collections.OrderedDict((item_id, np.zeros(0)) 51 | for item_id in range(self._full_ratings.shape[1])) 52 | 53 | def _rate_items(self, user_id, item_ids): # noqa: D102 54 | assert user_id in self._users 55 | assert len(item_ids) == 1 56 | rating = self._full_ratings[user_id, item_ids[0]] 57 | return np.array([rating]) 58 | 59 | def _rating_context(self, user_id): # noqa: D102 60 | return self._features[self._curr_user].toarray().flatten() 61 | 62 | def _update_state(self): # noqa: D102 63 | self._curr_user += 1 64 | self._users = collections.OrderedDict([(self._curr_user, np.zeros(0))]) 65 | return self._users.copy(), collections.OrderedDict() 66 | -------------------------------------------------------------------------------- /reclab/recommenders/baseline.py: -------------------------------------------------------------------------------- 1 | """An implementation of baseline perfect and random recommenders.""" 2 | import numpy as np 3 | 4 | from . import recommender 5 | 6 | 7 | class RandomRec(recommender.PredictRecommender): 8 | """A random recommendation model. 9 | 10 | Parameters 11 | ---------- 12 | range : tuple 13 | Upper and lower bounds for the uniformly random predictions. 14 | seed : int 15 | The random seed to use for recommendations. 16 | 17 | """ 18 | 19 | def __init__(self, rating_range=(0, 5), seed=0): 20 | """Create a random recommender.""" 21 | self._range = rating_range 22 | np.random.seed(seed) 23 | super().__init__() 24 | 25 | @property 26 | def name(self): # noqa: D102 27 | return 'random' 28 | 29 | @property 30 | def dense_predictions(self): # noqa: D102 31 | if self._dense_predictions is None: 32 | num_users = len(self._users) 33 | num_items = len(self._items) 34 | self._dense_predictions = np.random.uniform(low=self._range[0], 35 | high=self._range[1], 36 | size=[num_users, num_items]) 37 | return self._dense_predictions 38 | 39 | def _predict(self, user_item): # noqa: D102 40 | # Random predictions for all pairs. 41 | all_predictions = self.dense_predictions 42 | predictions = [] 43 | for user_id, item_id, _ in user_item: 44 | predictions.append(all_predictions[user_id, item_id]) 45 | return np.array(predictions) 46 | 47 | 48 | class PerfectRec(recommender.PredictRecommender): 49 | """A perfect recommendation model. 50 | 51 | Parameters 52 | ---------- 53 | dense_rating_function : function 54 | The function which generates true user ratings. 
55 | 56 | """ 57 | 58 | def __init__(self, dense_rating_function): 59 | """Create a perfect recommender.""" 60 | self._dense_rating_function = dense_rating_function 61 | super().__init__() 62 | 63 | @property 64 | def name(self): # noqa: D102 65 | return 'perfect' 66 | 67 | @property 68 | def dense_predictions(self): # noqa: D102 69 | if self._dense_predictions is None: 70 | self._dense_predictions = self._dense_rating_function() 71 | return self._dense_predictions 72 | 73 | def _predict(self, user_item): # noqa: D102 74 | # Use provided function to predict for all pairs. 75 | all_predictions = self.dense_predictions 76 | predictions = [] 77 | for user_id, item_id, _ in user_item: 78 | predictions.append(all_predictions[user_id, item_id]) 79 | return np.array(predictions) 80 | -------------------------------------------------------------------------------- /reclab/environments/README.md: -------------------------------------------------------------------------------- 1 | ## List of Environments 2 | 3 | All the provided environments inherit from `DictEnvironment`, which is an environment where data is passed around as dictionaries. 4 | 5 | ### [Topics](reclab/environments/topics.py) 6 | In the `Topics` environment, each item is uniformly assigned to one of *K* topics and users prefer certain topics. 7 | The preference of user *u* for item *i* of topic *k_i* is initialized as *p(u,k_i) ~ Unif(0.5, 5.5)*, while the topic *k* of item *i* is chosen randomly from the set of all topics. When user *u* is recommended item *i*, it will rate the item as 8 | *r_t(u,i) = clip(p(u,k_i) + eps)*, where *eps* is normally distributed independent noise. 9 | 10 | User preferences can change as a result of the items they consume. We model the fact that users might become more interested in a topic through repeated exposure (`topic_change` parameter). The item rating also models negative effects arising from boredom, which is controlled by three parameters: `memory_length, boredom_threshold, boredom_penalty`. 11 | 12 | ### [Latent Factor](reclab/environments/latent_factors.py) 13 | In the `LatentFactorBehavior` environment, users and items have latent factors and biases. Ratings of items are modelled as: 14 | *r_t(u,i) = < p_u, q_i > + b_u + b_i + b_0*, where *p_u* is a user's latent factor, *q_i* is an item's latent factor, *b_u* is the user bias, *b_i* is the item bias, and *b_0* is the global bias. 15 | 16 | The `DatasetLatentFactor` environment initializes latent factors and biases by fitting a Matrix Factorization model on a rating dataset. Currently, it supports environment initialization based on [`MovieLens 100K`, `MovieLens 10M`](https://grouplens.org/datasets/movielens/) and [`Last FM`](http://millionsongdataset.com/lastfm/) data. 17 | 18 | Similarly to the `Topics` environment, latent factor based environments allow for changes in user preferences: `affinity_change` models increased interest in a topic via alignment of user factors with item factors, while `memory_length, boredom_threshold, boredom_penalty` model user boredom.
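For intuition, the sketch below shows how the two rating models above generate a single rating. It is not the environments' actual implementation (`topics.py` and `latent_factors.py` also model preference dynamics and boredom); the factor dimension, noise scale, and the [1, 5] clipping range are illustrative assumptions.

```python
import numpy as np

rng = np.random.default_rng(0)
num_users, num_items, num_topics, rank = 5, 8, 3, 4

# Topics-style model: each item belongs to one topic and each user draws a
# preference for every topic from Unif(0.5, 5.5).
item_topic = rng.integers(num_topics, size=num_items)
topic_preference = rng.uniform(0.5, 5.5, size=(num_users, num_topics))

def topics_rating(user, item, noise_std=0.5):
    # r_t(u, i) = clip(p(u, k_i) + eps); the noise scale and clip range are assumptions.
    return np.clip(topic_preference[user, item_topic[item]] + rng.normal(0.0, noise_std), 1, 5)

# Latent-factor-style model: r_t(u, i) = <p_u, q_i> + b_u + b_i + b_0.
user_factors = rng.normal(size=(num_users, rank))
item_factors = rng.normal(size=(num_items, rank))
user_bias = rng.normal(size=num_users)
item_bias = rng.normal(size=num_items)
global_bias = 3.5

def latent_factor_rating(user, item):
    return user_factors[user] @ item_factors[item] + user_bias[user] + item_bias[item] + global_bias

print(topics_rating(0, 1), latent_factor_rating(0, 1))
```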
19 | 20 | ### Partial Information 21 | #### [Schmit](reclab/environments/schmit.py) 22 | `Schmit` contains the implementation for the environment in ["Human Interaction with Recommendation Systems"](https://arxiv.org/pdf/1703.00535.pdf). It is a slate-based environment and is similar to a latent-factor environment in that users and items are described by latent factors. However, users have only partial knowledge of an item's value to them. Rather than using the true rating, they use this partial information, along with the recommender's predicted score, to select an item from a slate of recommended items. 23 | 24 | #### [Engelhardt](reclab/environments/engelhardt.py) 25 | `Engelhardt` contains the implementation for the environment in [How Algorithmic Confounding in Recommendation Systems Increases Homogeneity and Decreases Utility](https://arxiv.org/abs/1710.11214). It is similar to `Schmit`: users know part of the value of each item, and users/items are represented by latent vectors. The rating of user *u* on item *i* is given by: 26 | *r(u, i) ~ Beta(< p_u, q_i >, s^2)* 27 | where *p_u* is the latent vector for user *u*, *q_i* is the latent vector for item *i*, and the Beta distribution is parametrized according to its mean and variance. In this setting users choose from a slate of items based upon their observed utility and the recommender's ranking. 28 | 29 | ### Debug environments 30 | [`FixedRating`](reclab/environments/fixed_rating.py) is a simple environment for debugging, in which half the users rate all items with a 1 and the other half with a 5.
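The debug environment is also the quickest way to see the environment API end to end. The sketch below is adapted from `tests/test_fixed.py` in this repository; only the print loop is new.

```python
import numpy as np
from reclab.environments import FixedRating

env = FixedRating(num_users=1, num_items=4, rating_frequency=1.0, num_init_ratings=0)
users, items, ratings = env.reset()

# Recommend a slate of all four items to user 0. The user rates exactly one of them,
# giving a 5 to items in the upper half of the id range and a 1 otherwise.
_, _, ratings, _ = env.step(np.array([[0, 1, 2, 3]]))
for (user_id, item_id), (rating, _context) in ratings.items():
    print(user_id, item_id, rating)  # e.g. 0 3 5.0
env.close()
```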
31 | -------------------------------------------------------------------------------- /reclab/environments/beta_rank.py: -------------------------------------------------------------------------------- 1 | """ 2 | Contains the implementation for the BetaRank environment from the algorithmic confounding paper. 3 | 4 | In this environment users have a hidden preference for each topic and each item has a 5 | hidden topic assigned to it. 6 | """ 7 | import collections 8 | 9 | import numpy as np 10 | 11 | from . import environment 12 | 13 | 14 | class BetaRank(environment.DictEnvironment): 15 | """ 16 | Implementation of environment with known and unknown user utility, static over time. 17 | 18 | Based on "How Algorithmic Confounding in Recommendation Systems Increases Homogeneity 19 | and Decreases Utility" by Chaney, Stewart, and Engelhardt (2018). 20 | 21 | """ 22 | 23 | def __init__(self, dimension, num_users, num_items, rating_frequency=0.2, 24 | num_init_ratings=0, known_mean=0.98, user_dist_choice='uniform'): 25 | """Create a BetaRank environment.""" 26 | super().__init__(rating_frequency, num_init_ratings, 0, user_dist_choice) 27 | self._dimension = dimension 28 | self._num_users = num_users 29 | self._num_items = num_items 30 | self._known_mean = known_mean 31 | self._user_preferences = None 32 | self._item_preferences = None 33 | 34 | @property 35 | def name(self): # noqa: D102 36 | return 'beta-rank' 37 | 38 | def _get_dense_ratings(self): # noqa: D102 39 | return np.clip(np.round(20 * (self._user_preferences @ self._item_preferences.T) + 1), 1, 5) 40 | 41 | def _reset_state(self): # noqa: D102 42 | # TODO: We should probably pass the magic numbers below as parameters. 43 | self._user_preferences = self._init_random.dirichlet( 44 | 100 * self._init_random.dirichlet(np.ones(self._dimension)), 45 | size=self._num_users 46 | ) 47 | self._item_preferences = self._init_random.dirichlet( 48 | 0.1 * self._init_random.dirichlet(100 * np.ones(self._dimension)), 49 | size=self._num_items 50 | ) 51 | self._users = collections.OrderedDict((user_id, np.zeros(0)) 52 | for user_id in range(self._num_users)) 53 | self._items = collections.OrderedDict((item_id, np.zeros(0)) 54 | for item_id in range(self._num_items)) 55 | 56 | def _rate_items(self, user_id, item_ids): # noqa: D102 57 | # Compute the user's known values for each item and sort them accordingly. 58 | means = self._item_preferences[item_ids] @ self._user_preferences[user_id].T 59 | values = self._beta_prime(means) 60 | known = self._beta_prime(self._known_mean, size=len(item_ids)) 61 | sorted_idxs = reversed( 62 | np.argsort(np.arange(1, len(item_ids) + 1) ** (-0.8) * known * values)) 63 | 64 | # Find the index of the item with the highest known value that hasn't been rated yet. 65 | chosen_idx = None 66 | for idx in sorted_idxs: 67 | if (user_id, item_ids[idx]) not in self._ratings: 68 | chosen_idx = idx 69 | break 70 | 71 | # Rate the chosen item and don't rate anything else. 72 | ratings = np.ones(len(item_ids)) * np.nan 73 | if chosen_idx is not None: 74 | ratings[chosen_idx] = values[chosen_idx] 75 | return np.clip(np.round((ratings).flatten() * 20 + 1), 1, 5) 76 | 77 | def _beta_prime(self, mean, std_dev=1e-5, size=None): 78 | alpha = ((1 - mean) / std_dev ** 2 - 1 / mean) * mean ** 2 + 1e-6 79 | beta = alpha * (1 / mean - 1) 80 | return self._dynamics_random.beta(alpha, beta, size=size) 81 | -------------------------------------------------------------------------------- /reclab/recommenders/README.md: -------------------------------------------------------------------------------- 1 | ## List of Recommenders 2 | 3 | All provided recommenders are subclasses of `PredictRecommender`, which uses rating predictions to make recommendations. It supports both deterministic and stochastic item selection policies.
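Concretely, every recommender below is driven through the same reset / recommend / update cycle. The sketch below is adapted from `tests/test_top_pop.py` in this repository; it uses `TopPop` only because it needs no hyperparameters, and the empty `np.zeros((0,))` arrays stand in for user, item, and rating features.

```python
import collections
import numpy as np
from reclab.recommenders import TopPop

# Three users, three items, and a few historical ratings keyed by (user_id, item_id).
users = {user_id: np.zeros((0,)) for user_id in range(3)}
items = {item_id: np.zeros((0,)) for item_id in range(3)}
ratings = {(0, 0): (5, np.zeros((0,))),
           (0, 1): (4, np.zeros((0,))),
           (1, 1): (4, np.zeros((0,))),
           (1, 2): (3, np.zeros((0,)))}

recommender = TopPop()
recommender.reset(users, items, ratings)

# Ask for one recommendation per online user.
online_users = collections.OrderedDict((user_id, np.zeros((0,))) for user_id in range(3))
recommendations, _ = recommender.recommend(online_users, 1)

# Report new users and new ratings before the next round of recommendations.
recommender.update(users={3: np.zeros((0,))}, ratings={(2, 0): (1, np.zeros((0,)))})
```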
4 | 5 | ### Baseline Recommenders 6 | 7 | #### [RandomRec](reclab/recommenders/baseline.py) 8 | A recommender that returns a random item from the list of unconsumed items for each online user. It is a useful baseline for calibrating lower bounds of recommender performance. 9 | 10 | #### [TopPop](reclab/recommenders/top_pop.py) 11 | A recommender that uses historical ratings to build a global ranking of items and recommends the items with the highest overall popularity. This is a useful baseline for measuring the benefits of personalization. 12 | 13 | #### [PerfectRec](reclab/recommenders/baseline.py) 14 | A recommender that is instantiated with a `dense_rating_function`, which provides the true ratings of the users for all items. It is a useful baseline for calibrating upper bounds of recommender performance. 15 | 16 | ### [Neighborhood-based recommenders](reclab/recommenders/knn_recommender.py) 17 | `KNNRecommender` is a neighborhood-based collaborative filtering algorithm. The class supports both user- and item-based collaborative filtering. In a `user_based` KNN recommender, user features are stacked and pairwise similarity metrics between users are measured. An online user is thus recommended an item that was highly rated by a similar user. Conversely, in an `item_based` KNN recommender, item features are stacked and pairwise similarity metrics between items are measured. An online user is thus recommended an item that is highly similar to the user's other highly rated items. 18 | 19 | ### [Matrix Factorization](reclab/recommenders/libfm.py) 20 | This is a wrapper for the [LibFM recommender](https://www.csie.ntu.edu.tw/~b97053/paper/Factorization%20Machines%20with%20libFM.pdf). See www.libfm.org for implementation details. We built a pip-installable Python package, **`wpyfm`**, based on [this C++ implementation](https://github.com/srendle/libfm), which might be of interest in its own right. 21 | 22 | At each step of the simulation the `LibFM` recommender re-trains a matrix factorization model. It computes rating predictions as the inner product of user and item factors plus bias terms. 23 | 24 | ### [AutoRec](reclab/recommenders/autorec/autorec.py) 25 | `AutoRec` is an autoencoder framework for collaborative filtering proposed by this [paper](https://dl.acm.org/doi/10.1145/2740908.2742726). It can be seen as a non-linear generalization of factorization models. We adapted a publicly available [implementation](https://github.com/mesuvash/NNRec). 26 | 27 | ### [CF-NADE](/reclab/recommenders/cfnade/cfnade.py) 28 | `Cfnade` is a neural autoregressive architecture for collaborative filtering proposed by this [paper](https://arxiv.org/pdf/1605.09477.pdf). We adapted a publicly available [implementation](https://github.com/JoonyoungYi/CFNADE-keras). 29 | 30 | ### [LLORMA](reclab/recommenders/llorma/llorma.py) 31 | `Llorma` is a generalization of low-rank matrix factorization techniques based on this [paper](http://jmlr.org/papers/v17/14-301.html). The LLORMA algorithm approximates the observed rating matrix as a weighted sum of low-rank matrices, each of which is limited to a local region of the observed matrix. We adapted a publicly available [implementation](https://github.com/JoonyoungYi/LLORMA-tensorflow). 32 | 33 | ### [Sparse Recommenders](reclab/recommenders/sparse.py) 34 | `SLIM` is a sparse linear recommendation model based on this [paper](http://glaros.dtc.umn.edu/gkhome/node/774). For a user *u*, it models the predicted rating of an unseen item *i* as a weighted average of the ratings of the items previously rated by user *u*. 35 | 36 | `EASE` predicts ratings with an item-item similarity model based on this [paper](https://arxiv.org/pdf/1905.03375.pdf). Assuming the historical data contains *N* users and *M* items in an *NxM* rating matrix *X*, the model computes an *MxM* self-similarity matrix *B*. Unseen ratings are predicted as *XB*. 37 | -------------------------------------------------------------------------------- /tests/test_fixed.py: -------------------------------------------------------------------------------- 1 | """Tests for the FixedRating environment. 2 | 3 | The primary intent of these tests is to validate the code in the DictEnvironment 4 | parent class instead of only testing FixedRating. 5 | """ 6 | import numpy as np 7 | 8 | from reclab.environments import FixedRating 9 | from . import utils 10 | 11 | 12 | def test_fixed_simple(): 13 | """Test FixedRating with only two items.""" 14 | env = FixedRating(num_users=1, 15 | num_items=2, 16 | rating_frequency=1.0, 17 | num_init_ratings=0) 18 | assert env.name == 'fixed' 19 | users, items, ratings = env.reset() 20 | 21 | # Test that the users and items have empty features. 22 | assert users[0].shape == (0,) 23 | assert items[0].shape == (0,) 24 | assert env.online_users[0].shape == (0,) 25 | 26 | # Recommend item 0, we shouldn't observe new users or items. 27 | users, items, ratings, _ = env.step(np.array([[0]])) 28 | assert users == {} 29 | assert items == {} 30 | 31 | # Test that item 0 will have a rating of 1. 32 | assert ratings[(0, 0)][0] == 1 33 | 34 | # Recommend item 1, the environment should rate it 5. 35 | users, items, ratings, _ = env.step(np.array([[1]])) 36 | assert users == {} 37 | assert items == {} 38 | assert ratings[(0, 1)][0] == 5 39 | 40 | # Test the internal state of the environment.
41 | assert len(env.users) == 1 42 | assert env.users[0].shape == (0,) 43 | assert len(env.items) == 2 44 | assert env.items[0].shape == (0,) 45 | assert len(env.ratings) == 2 46 | assert env.ratings[0, 0][0] == 1 47 | assert env.ratings[0, 1][0] == 5 48 | 49 | 50 | def test_fixed_two_users(mocker): 51 | """Test FixedRating with two users.""" 52 | mocker.patch('reclab.environments.FixedRating._select_online_users', 53 | utils.mock_select_online_users) 54 | env = FixedRating(num_users=2, 55 | num_items=2, 56 | rating_frequency=0.5, 57 | num_init_ratings=0) 58 | env.reset() 59 | assert env.dense_ratings.shape == (2, 2) 60 | assert (env.dense_ratings[:, 0] == 1).all() 61 | assert (env.dense_ratings[:, 1] == 5).all() 62 | assert len(env.online_users) == 1 63 | assert 0 in env.online_users 64 | env.step(np.array([[0]])) 65 | assert len(env.online_users) == 1 66 | assert 1 in env.online_users 67 | env.step(np.array([[1]])) 68 | assert len(env.ratings) == 2 69 | assert env.ratings[0, 0][0] == 1 70 | assert env.ratings[1, 1][0] == 5 71 | 72 | 73 | def test_fixed_slates(): 74 | """Test FixedRating with slate recommendations.""" 75 | env = FixedRating(num_users=1, 76 | num_items=4, 77 | rating_frequency=1.0, 78 | num_init_ratings=0) 79 | env.reset() 80 | _, _, ratings, _ = env.step(np.array([[0, 1, 2, 3]])) 81 | assert len(ratings) == 1 82 | assert ratings[0, 3][0] == 5 83 | _, _, ratings, _ = env.step(np.array([[0, 1, 2, 3]])) 84 | assert len(ratings) == 1 85 | assert ratings[0, 2][0] == 5 86 | _, _, ratings, _ = env.step(np.array([[0, 2, 3]])) 87 | assert len(ratings) == 1 88 | assert ratings[0, 0][0] == 1 89 | _, _, ratings, _ = env.step(np.array([[0, 1, 2, 3]])) 90 | assert len(ratings) == 1 91 | assert ratings[0, 1][0] == 1 92 | 93 | 94 | def test_init_ratings(): 95 | """Test FixedRating properly initializes ratings.""" 96 | env = FixedRating(num_users=50, 97 | num_items=50, 98 | rating_frequency=1.0, 99 | num_init_ratings=100) 100 | env.seed(0) 101 | _, _, ratings = env.reset() 102 | assert len(ratings) == 100 103 | for (user_id, item_id), (rating, context) in ratings.items(): 104 | assert context.shape == (0,) 105 | assert user_id < 50 106 | assert item_id < 50 107 | if rating == 5.0: 108 | assert item_id >= 25 109 | else: 110 | assert item_id < 25 111 | -------------------------------------------------------------------------------- /tests/test_libfm.py: -------------------------------------------------------------------------------- 1 | """Tests for the LibFM recommender.""" 2 | from reclab.recommenders import LibFM 3 | from . 
import utils 4 | 5 | 6 | def test_sgd_predict(): 7 | """Test that LibFM trained with SGD predicts well and that it gets better with more data.""" 8 | recommender = LibFM(num_user_features=0, 9 | num_item_features=0, 10 | num_rating_features=0, 11 | max_num_users=utils.NUM_USERS_ML100K, 12 | max_num_items=utils.NUM_ITEMS_ML100K, 13 | method='sgd', 14 | learning_rate=0.003, 15 | num_two_way_factors=8, 16 | bias_reg=0.04, 17 | one_way_reg=0.04, 18 | two_way_reg=0.04, 19 | num_iter=128, 20 | seed=0) 21 | utils.test_predict_ml100k(recommender, rmse_threshold=1.1) 22 | 23 | 24 | def test_sgd_recommend(): 25 | """Test that LibFM trained with SGD will recommend reasonable items.""" 26 | recommender = LibFM(num_user_features=0, 27 | num_item_features=0, 28 | num_rating_features=0, 29 | max_num_users=utils.NUM_USERS_SIMPLE, 30 | max_num_items=utils.NUM_ITEMS_SIMPLE, 31 | method='sgd', 32 | learning_rate=0.01, 33 | num_two_way_factors=8, 34 | num_iter=128, 35 | seed=0) 36 | utils.test_recommend_simple(recommender) 37 | 38 | 39 | def test_mcmc_predict(): 40 | """Test that LibFM trained with MCMC predicts well and that it gets better with more data.""" 41 | recommender = LibFM(num_user_features=0, 42 | num_item_features=0, 43 | num_rating_features=0, 44 | max_num_users=utils.NUM_USERS_ML100K, 45 | max_num_items=utils.NUM_ITEMS_ML100K, 46 | method='mcmc', 47 | num_two_way_factors=8, 48 | num_iter=128, 49 | seed=0) 50 | utils.test_predict_ml100k(recommender, rmse_threshold=1.1) 51 | 52 | 53 | def test_mcmc_recommend(): 54 | """Test that LibFM trained with MCMC will recommend reasonable items.""" 55 | recommender = LibFM(num_user_features=0, 56 | num_item_features=0, 57 | num_rating_features=0, 58 | max_num_users=utils.NUM_USERS_SIMPLE, 59 | max_num_items=utils.NUM_ITEMS_SIMPLE, 60 | method='mcmc', 61 | num_two_way_factors=8, 62 | num_iter=128, 63 | seed=0) 64 | utils.test_recommend_simple(recommender) 65 | 66 | 67 | def test_als_predict(): 68 | """Test that LibFM trained with ALS predicts well and that it gets better with more data.""" 69 | recommender = LibFM(num_user_features=0, 70 | num_item_features=0, 71 | num_rating_features=0, 72 | max_num_users=utils.NUM_USERS_ML100K, 73 | max_num_items=utils.NUM_ITEMS_ML100K, 74 | method='als', 75 | num_two_way_factors=8, 76 | reg=0.02, 77 | num_iter=128, 78 | seed=0) 79 | utils.test_predict_ml100k(recommender, rmse_threshold=1.4) 80 | 81 | 82 | def test_als_recommend(): 83 | """Test that LibFM trained with ALS will recommend reasonable items.""" 84 | recommender = LibFM(num_user_features=0, 85 | num_item_features=0, 86 | num_rating_features=0, 87 | max_num_users=utils.NUM_USERS_SIMPLE, 88 | max_num_items=utils.NUM_ITEMS_SIMPLE, 89 | method='als', 90 | num_two_way_factors=8, 91 | num_iter=128, 92 | seed=0) 93 | utils.test_recommend_simple(recommender) 94 | -------------------------------------------------------------------------------- /reclab/recommenders/cfnade/cfnade_lib/nade.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from keras.engine import Layer, InputSpec 3 | from keras import backend as K 4 | from keras import initializers 5 | from keras import regularizers 6 | from keras import constraints 7 | 8 | # def dot_product(x, kernel): 9 | # """ 10 | # Wrapper for dot product operation, in order to be compatible with both 11 | # Theano and Tensorflow 12 | # Args: 13 | # x (): input 14 | # kernel (): weights 15 | # Returns: 16 | # """ 17 | # return K.squeeze(K.dot(x, K.expand_dims(kernel)), 
axis=-1) 18 | 19 | 20 | class NADE(Layer): 21 | def __init__(self, 22 | hidden_dim, 23 | activation, 24 | W_regularizer=None, 25 | V_regularizer=None, 26 | b_regularizer=None, 27 | c_regularizer=None, 28 | bias=False, 29 | normalized_layer=False, 30 | **kwargs): 31 | 32 | self.init = initializers.get('uniform') 33 | 34 | self.bias = bias 35 | self.activation = activation 36 | self.hidden_dim = hidden_dim 37 | 38 | self.W_regularizer = regularizers.get(W_regularizer) 39 | self.V_regularizer = regularizers.get(V_regularizer) 40 | self.b_regularizer = regularizers.get(b_regularizer) 41 | self.c_regularizer = regularizers.get(c_regularizer) 42 | 43 | self.normalized_layer = normalized_layer 44 | 45 | super(NADE, self).__init__(**kwargs) 46 | 47 | def build(self, input_shape): 48 | self.input_dim1 = input_shape[1] 49 | self.input_dim2 = input_shape[2] 50 | 51 | self.W = self.add_weight( 52 | shape=(self.input_dim1, self.input_dim2, self.hidden_dim), 53 | initializer=self.init, 54 | name='{}_W'.format(self.name), 55 | regularizer=self.W_regularizer) 56 | if self.bias: 57 | self.c = self.add_weight( 58 | shape=(self.hidden_dim, ), 59 | initializer=self.init, 60 | name='{}_c'.format(self.name), 61 | regularizer=self.c_regularizer) 62 | 63 | if self.bias: 64 | self.b = self.add_weight( 65 | shape=(self.input_dim1, self.input_dim2), 66 | initializer=self.init, 67 | name='{}_b'.format(self.name), 68 | regularizer=self.b_regularizer) 69 | 70 | self.V = self.add_weight( 71 | shape=(self.hidden_dim, self.input_dim1, self.input_dim2), 72 | initializer=self.init, 73 | name='{}_V'.format(self.name), 74 | regularizer=self.V_regularizer) 75 | 76 | super().build(input_shape) 77 | 78 | def call(self, original_x): 79 | 80 | x = K.cumsum(original_x[:, :, ::-1], axis=2)[:, :, ::-1] 81 | # x.shape = (?,6040,5) 82 | # W.shape = (6040, 5, 500) 83 | # c.shape = (500,) 84 | output_ = tf.tensordot(x, self.W, axes=[[1, 2], [0, 1]]) 85 | 86 | if self.normalized_layer: 87 | output_ /= tf.matmul( 88 | tf.maximum( 89 | tf.reshape( 90 | tf.reduce_sum( 91 | tf.reduce_sum(original_x, axis=2), axis=1), 92 | [-1, 1]), 1), tf.ones([1, output_.shape[1]])) 93 | 94 | if self.bias: 95 | output_ = output_ + self.c 96 | 97 | h_out = tf.reshape(output_, [-1, self.hidden_dim]) 98 | #tf.cast(indices, tf.float32) 99 | # output_.shape = (?,500) 100 | 101 | h_out_act = K.tanh(h_out) 102 | # h_out_act.shape = (?,500) 103 | # V.shape = (500, 6040, 5) 104 | # b.shape = (6040,5) 105 | if self.bias: 106 | output = tf.tensordot(h_out_act, self.V, axes=[[1], [0]]) + self.b 107 | else: 108 | output = tf.tensordot(h_out_act, self.V, axes=[[1], [0]]) 109 | # output.shape = (?,6040,5) 110 | output = tf.reshape(output, [-1, self.input_dim1, self.input_dim2]) 111 | return output 112 | 113 | def compute_output_shape(self, input_shape): 114 | return (input_shape[0], input_shape[1], input_shape[2]) 115 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![Build Status](https://travis-ci.com/berkeley-reclab/RecLab.svg?branch=master) 2 | [![Coverage Status](https://coveralls.io/repos/github/berkeley-reclab/RecLab/badge.svg?branch=master)](https://coveralls.io/github/berkeley-reclab/RecLab?branch=master) 3 | 4 | # RecLab 5 | RecLab is a simulation framework used to evaluate recommendation algorithms. The framework makes 6 | no platform-specific assumptions. 
As such, it can be used to evaluate recommendation algorithms 7 | implemented with any computational library. 8 | 9 | Reclab is under active development. If you find a bug or would like to request a new feature 10 | please file an [issue](https://github.com/berkeley-reclab/reclab/issues). Furthermore, we welcome a 11 | broad set of contributions including: documentation, tests, new environments, reproduced 12 | recommenders, and code quality improvements. Simply fork the repo and make a 13 | [pull request](https://github.com/berkeley-reclab/reclab/pulls). 14 | 15 | ## Getting Started 16 | This section contains a brief guide on how to get started with RecLab. 17 | 18 | ### Setup 19 | RecLab was developed and tested in Python 3.8. To install RecLab run 20 | ``` 21 | pip install reclab 22 | ``` 23 | RecLab also implements a set of benchmark recommender systems, however the default 24 | `pip install` command will not fetch the necessary dependencies. To fetch these dependencies 25 | you must have g++ 5.0 or higher and [python3-dev](https://stackoverflow.com/a/21530768) 26 | installed. You should then run 27 | ``` 28 | pip install reclab[recommenders] 29 | ``` 30 | which will install both the core reclab framework and the benchmark recommendation algorithms. 31 | 32 | ### Example 33 | The code below shows a simple use-case with random recommendations. 34 | ```python 35 | import numpy as np 36 | import reclab 37 | env = reclab.make('topics-dynamic-v1') 38 | items, users, ratings = env.reset() 39 | for i in range(1000): 40 | online_users = env.online_users 41 | # Your recommendation algorithm here. This recommends 10 random items to each online user. 42 | recommendations = np.random.choice(list(items), size=(len(online_users), 10)) 43 | _, _, ratings, info = env.step(recommendations) 44 | env.close() 45 | ``` 46 | 47 | ## RecLab Design 48 | This section briefly outlines the overall design of RecLab, and how to add new environments. 49 | 50 | ### Basics 51 | Evaluation in RecLab consists of two basic components: **Environments** and **Recommenders**. 52 | An environment consists of a set of users and items. A recommender and an environment interact 53 | iteratively. At each time-step the environment specifies a set of _online users_ that need to be 54 | recommended an item. The recommender uses the history of user-item interactions to either recommend 55 | a single item (top-1 recommendation), or a set of items (slate-based recommendation) to each online 56 | user. The environment then provides ratings to some of, or all, the recommended items. 57 | 58 | Below is a visualization of the interaction between environment and recommender. 59 | 60 | ![Flowchart](/figures/RecSys.png) 61 | 62 | #### Environments 63 | In RecLab all environments inherit from the [`Environment`](reclab/environments/environment.py) interface. The following methods must be implemented: 64 | - `reset`: Reset the environment to its original state. Must be called before the first step of the simulation. 65 | - `online_users`: Return a list of available users at each timestep. 66 | - `step(recommendations)`: Given `recommendations`, update the internal state of the environment and return the following data: 67 | - `users`: New users and users whose information got updated this timestep, along with any side information about each user. 68 | - `items`: New items and items whose information got updated this timestep, along with any side information about each item. 
69 | - `ratings`: New ratings and ratings whose information got updated this timestep, along with any side information about each rating. 70 | - `info`: Extra information that can be used for debugging but should not be made accessible to the recommender. 71 | 72 | To see a description of available environments see the [list of enviroments](reclab/environments/README.md). 73 | 74 | #### Recommenders 75 | RecLab does not assume recommendation algorithms are implemented in any specific way. However, we 76 | also provide a [convenient interface](reclab/recommenders/recommender.py) to simplify the design of 77 | new recommendation algorithms. 78 | 79 | To see a description of available recommenders see the 80 | [list of recommenders](reclab/recommenders/README.md). Note that you must install the optional 81 | dependencies to use some of these recommenders as outline under the [setup section](#Setup). 82 | 83 | **Coming soon:** More functionality for running experiments and custom performance metrics. 84 | -------------------------------------------------------------------------------- /reclab/recommenders/llorma/llorma.py: -------------------------------------------------------------------------------- 1 | """Tensorflow implementation of AutoRec recommender.""" 2 | import numpy as np 3 | from .llorma_lib import llorma_g 4 | from .. import recommender 5 | 6 | 7 | class Llorma(recommender.PredictRecommender): 8 | """Many local low rank models averaged via kernels. 9 | 10 | Parameters 11 | ---------- 12 | max_user : int 13 | Maximum number of users in the environment 14 | max_item : int 15 | Maximum number of items in the environment 16 | n_anchor : int 17 | Number of local model to build in the train phase 18 | pre_rank : int 19 | Dimension of the pre-train user/item latent factors 20 | rank : int 21 | Dimension of the train user/item factors 22 | pre_lambda_val : float 23 | Regularization parameter for the pre-train matrix factorization 24 | lambda_val : float 25 | Regularization parameter for the train model 26 | pre_learning_rate : float 27 | Learning rate when optimizing the pre-train matrix factorization 28 | learning_rate : float 29 | Learning rate for the the train model 30 | pre_train_steps : int 31 | Number of epochs in the pre-train phase 32 | train_steps : int 33 | Number of epochs in the training phase 34 | batch_size : int 35 | Batch size in training phase 36 | use_cache : bool 37 | If True use stored pre-trained item/user latent factors 38 | results_path : str 39 | Folder to save model outputs and checkpoints. 40 | kernel_fun : callable 41 | kernel function used for similarity, 42 | 43 | """ 44 | 45 | def __init__(self, 46 | max_user, 47 | max_item, 48 | n_anchor=10, 49 | pre_rank=5, 50 | pre_learning_rate=1e-3, 51 | pre_lambda_val=10, 52 | pre_train_steps=10, 53 | rank=10, 54 | learning_rate=1e-2, 55 | lambda_val=1e-3, 56 | train_steps=10, 57 | batch_size=128, 58 | use_cache=True, 59 | result_path='results', 60 | kernel_fun=None, 61 | random_seed=0): 62 | """Create new Local Low-Rank Matrix Approximation (LLORMA) recommender.""" 63 | super().__init__() 64 | 65 | self.model = llorma_g.Llorma(max_user, max_item, n_anchor, pre_rank, 66 | pre_learning_rate, pre_lambda_val, pre_train_steps, 67 | rank, learning_rate, lambda_val, train_steps, 68 | batch_size, use_cache, result_path, kernel_fun) 69 | self._hyperparameters.update(locals()) 70 | 71 | # We only want the function arguments so remove class related objects. 
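# (`self._hyperparameters.update(locals())` above also captured `self` and
# `__class__`, hence the two deletions below.)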
72 | del self._hyperparameters['self'] 73 | del self._hyperparameters['__class__'] 74 | np.random.seed(random_seed) 75 | 76 | @property 77 | def name(self): # noqa: D102 78 | return 'llorma' 79 | 80 | def _predict(self, user_item): # noqa: D102 81 | users, items, _ = list(zip(*user_item)) 82 | users = np.array(users) 83 | items = np.array(items) 84 | # Check that both the item and the user have been seen in historical data. 85 | is_seen_uid = np.array(users <= (self.model.batch_manager.n_user - 1)) 86 | is_seen_iid = np.array(items <= (self.model.batch_manager.n_item - 1)) 87 | is_seen_id = np.logical_and(is_seen_iid, is_seen_uid) 88 | 89 | seen_user_item = np.column_stack((users[is_seen_id], items[is_seen_id])) 90 | seen_estimate = self.model.predict(seen_user_item) 91 | # Choose the mean of the seen values as the estimate for the unseen ids. 92 | unseen_estimate = np.mean(seen_estimate) 93 | estimate = np.ones(len(users))*unseen_estimate 94 | estimate[is_seen_id] = seen_estimate 95 | print('Low: {:.3f}, Mean: {:.3f}, High: {:.3f}'.format(np.quantile(seen_estimate, 0.25), 96 | np.quantile(seen_estimate, 0.5), 97 | np.quantile(seen_estimate, 0.75))) 98 | return estimate 99 | 100 | def update(self, users=None, items=None, ratings=None): # noqa: D102 101 | super().update(users, items, ratings) 102 | updated_ratings = dict(self._ratings) 103 | user_items = np.array(list(updated_ratings.keys())) 104 | rating_arr = list(updated_ratings.values()) 105 | 106 | data = np.column_stack((user_items, rating_arr)) 107 | self.model.reset_data(data, data, data) 108 | self.model.train() 109 | -------------------------------------------------------------------------------- /tests/utils.py: -------------------------------------------------------------------------------- 1 | """A set of utility functions for testing.""" 2 | import collections 3 | import numpy as np 4 | 5 | from reclab import data_utils 6 | 7 | NUM_USERS_ML100K = 943 8 | NUM_ITEMS_ML100K = 1682 9 | 10 | NUM_USERS_SIMPLE = 2 11 | NUM_ITEMS_SIMPLE = 3 12 | 13 | 14 | def test_predict_ml100k(recommender, rmse_threshold=1.1, seed=None, test_dense=False): 15 | """Test that recommender predicts well and that it gets better with more data.""" 16 | users, items, ratings = data_utils.read_dataset('ml-100k') 17 | assert NUM_USERS_ML100K == len(users) 18 | assert NUM_ITEMS_ML100K == len(items) 19 | train_ratings, test_ratings = data_utils.split_ratings(ratings, 0.9, shuffle=True, seed=seed) 20 | train_ratings_1, train_ratings_2 = data_utils.split_ratings(train_ratings, 0.5) 21 | recommender.reset(users, items, train_ratings_1) 22 | user_item = [(key[0], key[1], val[1]) for key, val in test_ratings.items()] 23 | preds = recommender.predict(user_item) 24 | targets = [t[0] for t in test_ratings.values()] 25 | rmse1 = rmse(preds, targets) 26 | 27 | # We should get a relatively low RMSE here. 28 | assert rmse1 < rmse_threshold 29 | 30 | recommender.update(ratings=train_ratings_2) 31 | preds = recommender.predict(user_item) 32 | rmse2 = rmse(preds, targets) 33 | 34 | # The RMSE should have reduced. 35 | assert rmse1 > rmse2 36 | 37 | if test_dense: 38 | # Test that the dense predictions work as well. 39 | dense = recommender.dense_predictions 40 | preds = np.array([dense[key[0] - 1, key[1] - 1] for key in test_ratings]) 41 | rmse3 = rmse(preds, targets) 42 | # The RMSE should have reduced. 
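# (dense_predictions is computed after the update, so it should beat the stage-1 RMSE.)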
43 | assert rmse1 > rmse3 44 | 45 | 46 | def test_binary_recommend_ml100k(recommender, hit_rate_threshold, seed=None): 47 | """Test that the recommender will recommend good items and it gets better with more data.""" 48 | users, items, ratings = data_utils.read_dataset('ml-100k') 49 | assert NUM_USERS_ML100K == len(users) 50 | assert NUM_ITEMS_ML100K == len(items) 51 | train_ratings, test_ratings = data_utils.split_ratings(ratings, 0.9, shuffle=True, seed=seed) 52 | train_ratings_1, train_ratings_2 = data_utils.split_ratings(train_ratings, 0.5) 53 | all_contexts = collections.OrderedDict([(user_id, np.zeros(0)) for user_id in users]) 54 | 55 | recommender.reset(users, items, train_ratings_1) 56 | recs, _ = recommender.recommend(all_contexts, 1) 57 | num_hits = sum((user_id, rec) in test_ratings for user_id, rec in zip(users, recs[:, 0])) 58 | hit_rate1 = num_hits / NUM_USERS_ML100K 59 | 60 | # We should get a relatively low hit rate here. 61 | assert hit_rate1 > hit_rate_threshold, hit_rate1 62 | 63 | recommender.reset(users, items, train_ratings_1) 64 | recommender.update(ratings=train_ratings_2) 65 | recs, _ = recommender.recommend(all_contexts, 1) 66 | num_hits = sum((user_id, rec) in test_ratings for user_id, rec in zip(users, recs[:, 0])) 67 | hit_rate2 = num_hits / NUM_USERS_ML100K 68 | 69 | # The hit rate should have increased. 70 | assert hit_rate1 < hit_rate2, hit_rate2 71 | 72 | 73 | def test_recommend_simple(recommender): 74 | """Test that recommender will recommend reasonable items in simple setting.""" 75 | users = {0: np.zeros((0,)), 76 | 1: np.zeros((0,))} 77 | items = {0: np.zeros((0,)), 78 | 1: np.zeros((0,)), 79 | 2: np.zeros((0,))} 80 | assert NUM_USERS_SIMPLE == len(users) 81 | assert NUM_ITEMS_SIMPLE == len(items) 82 | ratings = {(0, 0): (5, np.zeros((0,))), 83 | (0, 1): (1, np.zeros((0,))), 84 | (0, 2): (5, np.zeros((0,))), 85 | (1, 0): (5, np.zeros((0,)))} 86 | recommender.reset(users, items, ratings) 87 | user_contexts = collections.OrderedDict([(1, np.zeros((0,)))]) 88 | recs, _ = recommender.recommend(user_contexts, 1) 89 | recommender.predict([(1, 1, np.zeros(0,)), (1, 2, np.zeros(0,))]) 90 | assert recs.shape == (1, 1) 91 | # The recommender should have recommended the item that user0 rated the highest. 92 | assert recs[0, 0] == 2 93 | 94 | 95 | def rmse(predictions, targets): 96 | """Compute the root mean squared error (RMSE) between prediction and target vectors.""" 97 | return np.sqrt(((predictions - targets) ** 2).mean()) 98 | 99 | 100 | def mock_select_online_users(self): 101 | """Return the users online at a given timestep. 102 | 103 | This functions is meant to replace the _select_online_users method in an environment 104 | when used for testing. 105 | """ 106 | # pylint: disable=protected-access 107 | num_online = int(len(self._users) * self._rating_frequency) 108 | start_id = (num_online * (self._timestep + 1)) % len(self._users) 109 | end_id = min(start_id + num_online, len(self._users)) 110 | return np.arange(start_id, end_id) 111 | -------------------------------------------------------------------------------- /reclab/environments/schmit.py: -------------------------------------------------------------------------------- 1 | """ 2 | Contains implementation for environment in "Human Interaction with Recommendation Systems". 3 | 4 | https://arxiv.org/pdf/1703.00535.pdf 5 | """ 6 | 7 | import numpy as np 8 | 9 | from . 
import environment 10 | 11 | 12 | class Schmit(environment.DictEnvironment): 13 | """ 14 | Implementation of environment with static private user preferences and user-item interactions. 15 | 16 | Based on "Human Interaction with Recommendation Systems" by Schmit and Riquelme (2018). 17 | 18 | Parameters 19 | ---------- 20 | num_users : int 21 | The number of users in the environment. 22 | num_items : int 23 | The number of items in the environment. 24 | rating_frequency : float 25 | What proportion of users will need a recommendation at each step. 26 | num_init_ratings: : int 27 | The number of initial ratings available when the environment is reset. 28 | rank : int 29 | Rank of user preferences. 30 | sigma : float 31 | Variance of the Gaussian noise added to determine user-item value. 32 | user_dist_choice : str 33 | The choice of user distribution for selecting online users. By default, the subset of 34 | online users is chosen from a uniform distribution. Currently supports normal and lognormal. 35 | 36 | """ 37 | 38 | def __init__(self, num_users, num_items, rating_frequency=0.2, 39 | num_init_ratings=0, rank=10, sigma=0.2, 40 | user_dist_choice='uniform'): 41 | """Create an environment.""" 42 | super().__init__(rating_frequency, num_init_ratings, 0, user_dist_choice) 43 | self._num_users = num_users 44 | self._num_items = num_items 45 | 46 | self.rank = rank 47 | self.sigma = sigma 48 | 49 | # constants 50 | self.item_bias = self._init_random.randn(num_items, 1) / 1.5 51 | self.user_bias = self._init_random.randn(num_users, 1) / 3 52 | 53 | # unobserved by agents 54 | self.U = self._init_random.randn(num_users, rank) / np.sqrt(self.rank) 55 | self.V = self._init_random.randn(num_items, rank) / np.sqrt(self.rank) 56 | 57 | # observed by agents 58 | self.X = self._init_random.randn(num_users, rank) / np.sqrt(self.rank) 59 | self.Y = self._init_random.randn(num_items, rank) / np.sqrt(self.rank) 60 | 61 | @property 62 | def name(self): 63 | """Name of environment, used for saving.""" 64 | return 'schmit' 65 | 66 | def true_score(self, user, item): 67 | """ 68 | Calculate true score. 69 | 70 | Parameters 71 | ---------- 72 | user : int 73 | User id for calculating preferences. 74 | item : int 75 | Item id. 76 | 77 | Returns 78 | ------- 79 | score : float 80 | The true score of the item for the user. 81 | 82 | """ 83 | return float(self.item_bias[item] + self.user_bias[user] + self.U[user] @ self.V[item].T) 84 | 85 | def value(self, user, item): 86 | """ 87 | Add private user preferences and Gaussian noise to true score. 88 | 89 | Parameters 90 | ---------- 91 | user : int 92 | User id for calculating preferences. 93 | item : int 94 | Item id. 95 | 96 | Returns 97 | ------- 98 | value : float 99 | The (noisy) value of the item to the user. 
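
        Notes
        -----
        Computed as ``true_score(user, item) + X[user] @ Y[item].T`` plus Gaussian
        noise with scale ``sigma`` and a constant offset of 3, clipped to [1, 5].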
100 | 101 | """ 102 | ratings = float(self.true_score(user, item) + self.X[user] @ self.Y[item].T + 103 | self._dynamics_random.normal(loc=0, scale=self.sigma) + 3) 104 | return np.clip(ratings, 1, 5) 105 | 106 | def _reset_state(self): 107 | self._users = {user_id: np.zeros((0,)) 108 | for user_id in range(self._num_users)} 109 | self._items = {item_id: np.zeros((0,)) 110 | for item_id in range(self._num_items)} 111 | 112 | self.item_bias = self._init_random.randn(self._num_items, 1) / 1.5 113 | self.user_bias = self._init_random.randn(self._num_users, 1) / 3 114 | 115 | self.U = self._init_random.randn( 116 | self._num_users, self.rank) / np.sqrt(self.rank) 117 | self.V = self._init_random.randn( 118 | self._num_items, self.rank) / np.sqrt(self.rank) 119 | self.X = self._init_random.randn( 120 | self._num_users, self.rank) / np.sqrt(self.rank) 121 | self.Y = self._init_random.randn( 122 | self._num_items, self.rank) / np.sqrt(self.rank) 123 | 124 | def _rate_items(self, user_id, item_id): 125 | return self.value(user_id, item_id) 126 | 127 | def _get_dense_ratings(self): 128 | """Compute all the true ratings on every user-item pair at the current timestep. 129 | 130 | A true rating is defined as the rating a user would make with all noise removed. 131 | 132 | Returns 133 | ------- 134 | dense_ratings : np.ndarray 135 | The array of all true ratings where true_ratings[i, j] is the rating by user i 136 | on item j. 137 | 138 | """ 139 | dense_ratings = np.zeros([self._num_users, self._num_items]) 140 | for u in range(self._num_users): 141 | for i in range(self._num_items): 142 | dense_ratings[u, i] = self.true_score( 143 | u, i) + self.X[u] @ self.Y[i].T + 3 144 | return dense_ratings 145 | -------------------------------------------------------------------------------- /reclab/environments/registry.py: -------------------------------------------------------------------------------- 1 | """Contains make, a function to instantiate a standardized environment from a string.""" 2 | from .beta_rank import BetaRank 3 | from .latent_factors import LatentFactorBehavior, DatasetLatentFactor 4 | from .schmit import Schmit 5 | from .topics import Topics 6 | 7 | NAMED_ENV_DICT = { 8 | 'topics-static-v1': ( 9 | Topics, 10 | dict(num_topics=19, 11 | num_users=1000, 12 | num_items=1700, 13 | rating_frequency=0.2, 14 | num_init_ratings=100000, 15 | noise=0.5, 16 | topic_change=0, 17 | memory_length=0, 18 | boredom_threshold=0, 19 | boredom_penalty=0) 20 | ), 21 | 'topics-static-v1-small': ( 22 | Topics, 23 | dict(num_topics=19, 24 | num_users=100, 25 | num_items=170, 26 | rating_frequency=0.2, 27 | num_init_ratings=5000, 28 | noise=0.5, 29 | topic_change=0, 30 | memory_length=0, 31 | boredom_threshold=0, 32 | boredom_penalty=0) 33 | ), 34 | 'topics-dynamic-v1': ( 35 | Topics, 36 | dict(num_topics=19, 37 | num_users=1000, 38 | num_items=1700, 39 | rating_frequency=0.2, 40 | num_init_ratings=100000, 41 | noise=0.5, 42 | topic_change=0.1, 43 | memory_length=5, 44 | boredom_threshold=2, 45 | boredom_penalty=1) 46 | ), 47 | 'topics-satiation-v1': ( 48 | Topics, 49 | dict(num_topics=19, 50 | num_users=1000, 51 | num_items=1700, 52 | rating_frequency=0.2, 53 | num_init_ratings=100000, 54 | noise=0.5, 55 | satiation_factor=3, 56 | satiation_decay=0.5, 57 | satiation_noise=0.1) 58 | ), 59 | 'topics-sensitization-v1': ( 60 | Topics, 61 | dict(num_topics=19, 62 | num_users=1000, 63 | num_items=1700, 64 | rating_frequency=0.2, 65 | num_init_ratings=100000, 66 | noise=0.5, 67 | satiation_factor=3, 68 | 
satiation_decay=(0.1, 0.5), 69 | satiation_noise=0.1, 70 | switch_probability=(0.05, 0.2)) 71 | ), 72 | 'latent-static-v1': ( 73 | LatentFactorBehavior, 74 | dict(latent_dim=100, 75 | num_users=943, 76 | num_items=1682, 77 | rating_frequency=0.2, 78 | num_init_ratings=100000, 79 | noise=0.5, 80 | affinity_change=0, 81 | memory_length=0, 82 | boredom_threshold=0, 83 | boredom_penalty=0) 84 | ), 85 | 'latent-dynamic-v1': ( 86 | LatentFactorBehavior, 87 | dict(latent_dim=100, 88 | num_users=943, 89 | num_items=1682, 90 | rating_frequency=0.2, 91 | num_init_ratings=100000, 92 | noise=0.5, 93 | affinity_change=0.2, 94 | memory_length=5, 95 | boredom_threshold=0, 96 | boredom_penalty=2) 97 | ), 98 | 'ml-100k-v1': ( 99 | DatasetLatentFactor, 100 | dict(name='ml-100k', 101 | latent_dim=0, 102 | rating_frequency=0.00107, 103 | num_init_ratings=0, 104 | noise=0.5, 105 | affinity_change=0, 106 | memory_length=0, 107 | boredom_threshold=0, 108 | boredom_penalty=0) 109 | ), 110 | 'latent-score-v1': ( 111 | Schmit, 112 | dict(num_users=1000, 113 | num_items=1700, 114 | rating_frequency=0.2, 115 | num_init_ratings=100000, 116 | rank=10, 117 | sigma=0.2) 118 | ), 119 | 'beta-rank-v1': ( 120 | BetaRank, 121 | dict(num_users=1000, 122 | num_items=1700, 123 | dimension=19, 124 | rating_frequency=0.001, 125 | num_init_ratings=0, 126 | known_mean=0.98) 127 | ), 128 | 'beta-rank-lowdata-v1': ( 129 | BetaRank, 130 | dict(num_users=1000, 131 | num_items=1700, 132 | dimension=19, 133 | rating_frequency=0.001, 134 | num_init_ratings=0, 135 | known_mean=0.98) 136 | ), 137 | 'beta-rank-small-v1': ( 138 | BetaRank, 139 | dict(num_users=100, 140 | num_items=170, 141 | dimension=19, 142 | rating_frequency=0.01, 143 | num_init_ratings=0, 144 | known_mean=0.98) 145 | ), 146 | } 147 | 148 | 149 | def make(name, **kwargs): 150 | """ 151 | Create an environment by name. 152 | 153 | You may optionally override the arguments for the environment constructor by specifying kwargs. 154 | 155 | Parameters 156 | ---------- 157 | name : str 158 | The name of the environment. 159 | 160 | Returns 161 | ------ 162 | env : Environment 163 | The constructed environment. 164 | 165 | """ 166 | if name not in NAMED_ENV_DICT: 167 | raise ValueError('{} is not a valid environment name. '.format(name) + 168 | 'Valid named environments: {}'.format(NAMED_ENV_DICT.keys())) 169 | env_class, params = NAMED_ENV_DICT[name] 170 | params.update(kwargs) 171 | return env_class(**params) 172 | -------------------------------------------------------------------------------- /reclab/recommenders/cfnade/cfnade_lib/utils.py: -------------------------------------------------------------------------------- 1 | """ Util functions for class Cfnade""" 2 | from itertools import islice 3 | import numpy as np 4 | import keras 5 | from keras import backend as K 6 | from keras.callbacks import Callback 7 | 8 | class DataSet(Callback): 9 | """ 10 | A datagenerator the feeds data in batches. 
11 | 12 | ratings_df: rating matrix, num_iters * num_users, entry is input rating rounded to integer 13 | batch_size: int, batch size, default is 64 14 | num_users: int, number of users 15 | num_items: int, number of items 16 | mode: int, 0 for train, 1 for eval, 2 for test 17 | """ 18 | def __init__(self,ratings_df, 19 | num_users, 20 | num_items, 21 | batch_size, 22 | rating_bucket, 23 | mode): 24 | 25 | self.num_users = num_users 26 | self.num_items = num_items 27 | self.batch_size = batch_size 28 | self.ratings_df = ratings_df 29 | self.rating_bucket = rating_bucket 30 | self.mode = mode 31 | 32 | def generate(self, eval=False): 33 | """ 34 | a generator function yields ratings_df for each batch 35 | 36 | """ 37 | line_pointer = 0 38 | while True: 39 | next_n_data_lines = list(islice(self.ratings_df, line_pointer, line_pointer+self.batch_size)) 40 | if not next_n_data_lines: 41 | if self.mode == 0 and eval==False: 42 | line_pointer = 0 43 | next_n_data_lines = list(islice(self.ratings_df, line_pointer, line_pointer+self.batch_size)) 44 | else: 45 | break 46 | input_ranking_vectors = np.zeros((self.batch_size, self.num_users, self.rating_bucket), dtype='int8') 47 | output_ranking_vectors = np.zeros((self.batch_size, self.num_users, self.rating_bucket), dtype='int8') 48 | input_mask_vectors = np.zeros((self.batch_size, self.num_users), dtype='int8') 49 | output_mask_vectors = np.zeros((self.batch_size, self.num_users), dtype='int8') 50 | for i, line in enumerate(next_n_data_lines): 51 | user_ids = np.nonzero(line)[0] 52 | ratings_line = line[line != 0] 53 | 54 | if self.mode == 0 and len(user_ids) != 0: 55 | # a random ordered list 0 to len(user_ids)-1 56 | 57 | ordering = np.random.permutation(np.arange(len(user_ids))) 58 | random_num = np.random.randint(0, len(ordering)) 59 | flag_in = (ordering < random_num) 60 | flag_out = (ordering >= random_num) 61 | 62 | input_mask_vectors[i][user_ids] = flag_in 63 | output_mask_vectors[i][user_ids] = flag_out 64 | 65 | for j, (user_id, value) in enumerate(zip(user_ids, ratings_line)): 66 | if flag_in[j]: 67 | input_ranking_vectors[i, user_id, (value-1)] = 1 68 | else: 69 | output_ranking_vectors[i, user_id, (value-1)] = 1 70 | if self.mode == 2: 71 | for j, (user_id, value) in enumerate(zip(user_ids, ratings_line)): 72 | input_ranking_vectors[i, user_id, (value-1)] = 1 73 | 74 | inputs = { 75 | 'input_ratings': input_ranking_vectors, 76 | 'output_ratings': output_ranking_vectors, 77 | 'input_masks': input_mask_vectors, 78 | 'output_masks': output_mask_vectors} 79 | 80 | outputs = {'nade_loss': np.zeros([self.batch_size])} 81 | yield (inputs, outputs) 82 | line_pointer = line_pointer + self.batch_size 83 | 84 | 85 | def prediction_layer(x): 86 | # x.shape = (?,6040,5) 87 | x_cumsum = K.cumsum(x, axis=2) 88 | # x_cumsum.shape = (?,6040,5) 89 | 90 | output = K.softmax(x_cumsum) 91 | # output = (?,6040,5) 92 | return output 93 | 94 | 95 | def prediction_output_shape(input_shape): 96 | 97 | return input_shape 98 | 99 | 100 | def d_layer(x): 101 | 102 | return K.sum(x, axis=1) 103 | 104 | 105 | def d_output_shape(input_shape): 106 | 107 | return (input_shape[0], ) 108 | 109 | 110 | def D_layer(x): 111 | 112 | return K.sum(x, axis=1) 113 | 114 | 115 | def D_output_shape(input_shape): 116 | 117 | return (input_shape[0],) 118 | 119 | 120 | def rating_cost_lambda_func(args): 121 | alpha=0.01 #in the paper they reported alpha = 0.01 and std = 1.0. THis is what was used in the repo. 
122 | std=1.0 123 | pred_score, true_ratings, input_masks, output_masks, D, d = args 124 | pred_score_cum = K.cumsum(pred_score, axis=2) 125 | prob_item_ratings = K.softmax(pred_score_cum) 126 | accu_prob_1N = K.cumsum(prob_item_ratings, axis=2) 127 | accu_prob_N1 = K.cumsum(prob_item_ratings[:, :, ::-1], axis=2)[:, :, ::-1] 128 | mask1N = K.cumsum(true_ratings[:, :, ::-1], axis=2)[:, :, ::-1] 129 | maskN1 = K.cumsum(true_ratings, axis=2) 130 | cost_ordinal_1N = -K.sum((K.log(prob_item_ratings) - K.log(accu_prob_1N)) * mask1N, axis=2) 131 | cost_ordinal_N1 = -K.sum((K.log(prob_item_ratings) - K.log(accu_prob_N1)) * maskN1, axis=2) 132 | cost_ordinal = cost_ordinal_1N + cost_ordinal_N1 133 | nll_item_ratings = K.sum(-(true_ratings * K.log(prob_item_ratings)), axis=2) 134 | nll = std * K.sum(nll_item_ratings, axis=1) * 1.0 * D / (D - d + 1e-6) \ 135 | + alpha * K.sum(cost_ordinal, axis=1) * 1.0 * D / (D - d + 1e-6) 136 | cost = K.mean(nll) 137 | cost = K.expand_dims(cost, 0) 138 | 139 | return cost 140 | -------------------------------------------------------------------------------- /reclab/recommenders/autorec/autorec.py: -------------------------------------------------------------------------------- 1 | """Pytorch implementation of AutoRec recommender.""" 2 | 3 | import math 4 | import numpy as np 5 | import torch 6 | 7 | from .autorec_lib import autorec 8 | from .. import recommender 9 | 10 | 11 | class Autorec(recommender.PredictRecommender): 12 | """The Autorec recommender. 13 | 14 | Parameters 15 | ---------- 16 | num_users : int 17 | Number of users in the environment. 18 | num_items : int 19 | Number of items in the environment. 20 | hidden_neuron : int 21 | Output dimension of hidden layer. 22 | lambda_value : float 23 | Coefficient for regularization while training layers. 24 | train_epoch : int 25 | Number of epochs to train for each call. 26 | batch_size : int 27 | Batch size during initial training phase. 28 | optimizer_method : str 29 | Optimizer for training model; either Adam or RMSProp. 30 | grad_clip : bool 31 | Set to true to clip gradients to [-5, 5]. 32 | base_lr : float 33 | Base learning rate for optimizer. 34 | lr_decay : float 35 | Rate for decaying learning rate during training. 36 | dropout : float 37 | Probability to initialize dropout layer. Set to 0 for no dropout. 38 | random_seed : int 39 | Random seed to reproduce results. 40 | 41 | """ 42 | 43 | def __init__(self, num_users, num_items, 44 | hidden_neuron=500, lambda_value=1, 45 | train_epoch=1000, batch_size=1000, optimizer_method='RMSProp', 46 | grad_clip=False, base_lr=1e-3, lr_decay=1e-2, 47 | dropout=0.05, random_seed=0): 48 | """Create new Autorec recommender.""" 49 | super().__init__() 50 | 51 | # We only want the function arguments so remove class related objects. 
52 | self._hyperparameters.update(locals()) 53 | del self._hyperparameters['self'] 54 | del self._hyperparameters['__class__'] 55 | 56 | self.model = autorec.AutoRec(num_users, 57 | num_items, 58 | seen_users=set(), 59 | seen_items=set(), 60 | hidden_neuron=hidden_neuron, 61 | dropout=dropout, 62 | random_seed=random_seed) 63 | self.lambda_value = lambda_value 64 | self.num_users = num_users 65 | self.num_items = num_items 66 | self.train_epoch = train_epoch 67 | self.batch_size = batch_size 68 | self.num_batch = int(math.ceil(self.num_items / float(self.batch_size))) 69 | self.base_lr = base_lr 70 | self.optimizer_method = optimizer_method 71 | self.random_seed = random_seed 72 | 73 | self.lr_decay = lr_decay 74 | self.grad_clip = grad_clip 75 | np.random.seed(self.random_seed) 76 | # pylint: disable=no-member 77 | self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') 78 | 79 | def train_model(self, data): 80 | """Train for all epochs in train_epoch.""" 81 | self.model.train() 82 | if self.optimizer_method == 'Adam': 83 | optimizer = torch.optim.Adam(self.model.parameters(), lr=self.base_lr) 84 | 85 | elif self.optimizer_method == 'RMSProp': 86 | optimizer = torch.optim.RMSprop(self.model.parameters(), lr=self.base_lr) 87 | else: 88 | raise ValueError('Optimizer Key ERROR') 89 | 90 | scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=100, gamma=self.lr_decay) 91 | 92 | self.model.to(self.device) 93 | for epoch in range(self.train_epoch): 94 | self.train(data, optimizer, scheduler) 95 | 96 | def train(self, data, optimizer, scheduler): 97 | """Train for a single epoch.""" 98 | random_perm_doc_idx = np.random.permutation(self.num_items) 99 | for i in range(self.num_batch): 100 | if i == self.num_batch - 1: 101 | batch_set_idx = random_perm_doc_idx[i * self.batch_size:] 102 | elif i < self.num_batch - 1: 103 | batch_set_idx = random_perm_doc_idx[i * self.batch_size:(i+1) * self.batch_size] 104 | 105 | batch = data[batch_set_idx, :].to(self.device) 106 | output = self.model.forward(batch) 107 | mask = self.mask_ratings[batch_set_idx, :].to(self.device) 108 | loss = self.model.loss(output, 109 | batch, 110 | mask, 111 | lambda_value=self.lambda_value) 112 | 113 | loss.backward() 114 | if self.grad_clip: 115 | torch.nn.utils.clip_grad_norm_(self.model.parameters(), 5) 116 | 117 | optimizer.step() 118 | scheduler.step() 119 | 120 | @property 121 | def name(self): # noqa: D102 122 | return 'autorec' 123 | 124 | def _predict(self, user_item): 125 | self.model = self.model.eval() 126 | return self.model.predict(user_item, self.ratings.to(self.device)) 127 | 128 | def reset(self, users=None, items=None, ratings=None): # noqa: D102 129 | self.model.prepare_model() 130 | super().reset(users, items, ratings) 131 | 132 | def update(self, users=None, items=None, ratings=None): # noqa: D102 133 | super().update(users, items, ratings) 134 | self.model.prepare_model() 135 | self.model = self.model.train() 136 | for user_item in ratings: 137 | self.model.seen_users.add(user_item[0]) 138 | self.model.seen_items.add(user_item[1]) 139 | 140 | ratings = self._ratings.toarray() 141 | # Item-based autorec expects rows that represent items 142 | # pylint: disable=no-member 143 | self.ratings = torch.FloatTensor(ratings.T) 144 | # pylint: disable=no-member 145 | self.mask_ratings = torch.FloatTensor(ratings.T).clamp(0, 1) 146 | 147 | self.train_model(self.ratings) 148 | -------------------------------------------------------------------------------- 
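Not part of the repository, but a minimal sketch of how a `PredictRecommender` such as `Autorec` might be driven against a RecLab environment, combining the `reset`/`recommend`/`update` calls used in `tests/utils.py` with the environment loop from the README. The environment name comes from the registry; treating `env.online_users` as the context dict expected by `recommend()`, and importing `Autorec` from `reclab.recommenders`, are assumptions.

```python
# Hypothetical usage sketch (not part of the repo).
import reclab
from reclab.recommenders import Autorec  # assumption: Autorec is re-exported here

env = reclab.make('topics-static-v1-small')  # 100 users, 170 items per the registry
items, users, ratings = env.reset()

recommender = Autorec(num_users=len(users), num_items=len(items), train_epoch=50)
recommender.reset(users, items, ratings)

for _ in range(10):
    # Assumption: env.online_users is the dict of user_id -> context that
    # recommend() expects, as in tests/utils.py.
    recs, _ = recommender.recommend(env.online_users, 1)
    users, items, ratings, _ = env.step(recs)
    recommender.update(users, items, ratings)
env.close()
```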
/tests/test_topics.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=protected-access 2 | """Tests for the Topics environment.""" 3 | import copy 4 | import numpy as np 5 | 6 | from reclab.environments import Topics 7 | 8 | 9 | def _test_dimension_consistency(environment): 10 | """ Basic Helper Test to check if dimension of 11 | various environment properties.""" 12 | env = copy.deepcopy(environment) 13 | 14 | assert env.name == 'topics' 15 | users, items, _ = env.reset() 16 | 17 | # Test that the users and items have empty features. 18 | num_users = len(env.users) 19 | num_items = len(env.items) 20 | num_topics = env._num_topics 21 | assert users[0].shape == (0,) 22 | assert items[0].shape == (0,) 23 | assert env.online_users[0].shape == (0,) 24 | 25 | # Test that item topics and user preferences are of the correct size. 26 | assert env._item_topics.shape == (num_items,) 27 | assert env._user_preferences.shape == (num_users, num_topics) 28 | 29 | # Recommend item 0, we shouldn't observe new users or items. 30 | users, items, _, _ = env.step(np.array([[0]])) 31 | assert users == {} 32 | assert items == {} 33 | 34 | 35 | def test_topics_static_simple(): 36 | """Test Topics with only one user, with no preference shifts 37 | and no topic change and no boredom.""" 38 | env = Topics(num_topics=2, 39 | num_users=1, 40 | num_items=2, 41 | rating_frequency=1.0, 42 | num_init_ratings=0, 43 | noise=0.0, 44 | topic_change=0.0, 45 | memory_length=0, 46 | boredom_threshold=0, 47 | boredom_penalty=0.0, 48 | user_dist_choice='uniform', 49 | shift_steps=1, 50 | shift_frequency=0.0, 51 | shift_weight=0.0) 52 | 53 | _test_dimension_consistency(env) 54 | env.reset() 55 | 56 | old_user_preferences = copy.deepcopy(env._user_preferences) 57 | old_dense_ratings = env._get_dense_ratings() 58 | 59 | # Recommend item 0 60 | env.step(np.array([[0]])) 61 | 62 | # Test that the preferences didn't change 63 | assert np.array_equal(old_user_preferences, env._user_preferences) 64 | # Test that the dense ratings didn't change 65 | assert np.array_equal(old_dense_ratings, env._get_dense_ratings()) 66 | 67 | 68 | def test_topics_shift(): 69 | """Test Topics with random preference shifts""" 70 | env = Topics(num_topics=2, 71 | num_users=1, 72 | num_items=10, 73 | rating_frequency=1.0, 74 | num_init_ratings=0, 75 | noise=0.0, 76 | topic_change=0.0, 77 | memory_length=0, 78 | boredom_threshold=0, 79 | boredom_penalty=0.0, 80 | user_dist_choice='uniform', 81 | shift_steps=2, 82 | shift_frequency=1, 83 | shift_weight=0.5, 84 | user_bias_type='normal') 85 | 86 | _test_dimension_consistency(env) 87 | env.reset() 88 | 89 | old_user_preferences = copy.deepcopy(env._user_preferences) 90 | old_user_biases = copy.deepcopy(env._user_biases) 91 | 92 | # Recommend item 0. 93 | env.step(np.array([[0]])) 94 | 95 | # Test that the preferences and biases didn't change. 96 | assert np.array_equal(old_user_preferences, env._user_preferences) 97 | assert np.array_equal(old_user_biases, env._user_biases) 98 | 99 | # Recommend another item and check that preferences have changed. 
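# (shift_steps=2 and shift_frequency=1 above, so the preference shift is expected
# to take effect on this second step.)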
100 | env.step(np.array([[1]])) 101 | assert not np.array_equal(old_user_preferences, env._user_preferences) 102 | assert not np.array_equal(old_user_biases, env._user_biases) 103 | 104 | 105 | def test_topics_boredom(): 106 | """Test Topics with boredom shifts""" 107 | env = Topics(num_topics=2, 108 | num_users=1, 109 | num_items=10, 110 | rating_frequency=1.0, 111 | num_init_ratings=0, 112 | noise=0.0, 113 | topic_change=0.0, 114 | memory_length=3, 115 | boredom_threshold=1, 116 | boredom_penalty=1, 117 | user_dist_choice='uniform', 118 | shift_steps=1, 119 | shift_frequency=0, 120 | shift_weight=0) 121 | 122 | _test_dimension_consistency(env) 123 | env.reset() 124 | # Change all the item types to type 0. 125 | env._item_topics = np.zeros(len(env.items), dtype=int) 126 | 127 | old_ratings = env._get_dense_ratings() 128 | 129 | # Recommend item 0 and check that ratings don't change. 130 | env.step(np.array([[0]])) 131 | assert np.array_equal(old_ratings, env._get_dense_ratings()) 132 | 133 | # Recommend item 1 and check that dense ratings decrease by the 134 | # same amount as the boredom penalty. 135 | env.step(np.array([[1]])) 136 | assert np.array_equal(old_ratings-env._boredom_penalty, env._get_dense_ratings()) 137 | 138 | 139 | def test_topics_change(): 140 | """Test Topics with topic change""" 141 | env = Topics(num_topics=2, 142 | num_users=1, 143 | num_items=10, 144 | rating_frequency=1.0, 145 | num_init_ratings=0, 146 | noise=0.0, 147 | topic_change=0.5, 148 | memory_length=0, 149 | boredom_threshold=0, 150 | boredom_penalty=0, 151 | user_dist_choice='uniform', 152 | shift_steps=1, 153 | shift_frequency=0, 154 | shift_weight=0) 155 | 156 | _test_dimension_consistency(env) 157 | env.reset() 158 | # Change all the item types to type 0. 159 | env._item_topics = np.zeros(len(env.items), dtype=int) 160 | 161 | old_user_preferences = copy.deepcopy(env._user_preferences) 162 | 163 | # Recommend item 0 and check that preferences for the recommended topic have 164 | # increased while the preference for the other topic decreased. 165 | env.step(np.array([[0]])) 166 | topic = env._item_topics[0] 167 | new_user_preferences = env._user_preferences 168 | assert new_user_preferences[0][topic] >= old_user_preferences[0][topic] 169 | assert new_user_preferences[0][1-topic] <= old_user_preferences[0][1-topic] 170 | -------------------------------------------------------------------------------- /reclab/recommenders/sparse.py: -------------------------------------------------------------------------------- 1 | """An implementation of SLIM and EASE sparse linear recommenders. 2 | 3 | For details, see: 4 | - http://glaros.dtc.umn.edu/gkhome/node/774 5 | - https://arxiv.org/pdf/1905.03375.pdf 6 | """ 7 | import warnings 8 | import numpy as np 9 | import scipy.sparse 10 | import sklearn.linear_model 11 | from sklearn.exceptions import ConvergenceWarning 12 | 13 | from . import recommender 14 | 15 | warnings.simplefilter('ignore', category=ConvergenceWarning) 16 | 17 | 18 | class SLIM(recommender.PredictRecommender): 19 | """The SLIM recommendation model which is a sparse linear method. 20 | 21 | Parameters 22 | ---------- 23 | binarize : boolean 24 | Determines whether to binarize ratings before fitting a model. 25 | alpha : float 26 | Constant that multiplies the regularization terms. 27 | l1_ratio : float 28 | The ratio of the L1 regularization term with respect to the L2 regularization. 29 | max_iter : int 30 | The maximum number of iterations to train the model for. 
31 | tol : float 32 | The tolerance below which the optimization will stop. 33 | seed : int 34 | The random seed to use when training the model. 35 | 36 | """ 37 | 38 | def __init__(self, 39 | binarize=False, 40 | alpha=1.0, 41 | l1_ratio=0.1, 42 | positive=True, 43 | max_iter=100, 44 | tol=1e-4, 45 | seed=0): 46 | """Create a SLIM recommender.""" 47 | super().__init__() 48 | self._binarize = binarize 49 | self._model = sklearn.linear_model.ElasticNet(alpha=alpha, 50 | l1_ratio=l1_ratio, 51 | positive=positive, 52 | fit_intercept=False, 53 | copy_X=False, 54 | precompute=True, 55 | selection='random', 56 | max_iter=max_iter, 57 | tol=tol, 58 | random_state=seed) 59 | self._weights = None 60 | self._hyperparameters.update(locals()) 61 | 62 | # We only want the function arguments so remove class related objects. 63 | del self._hyperparameters['self'] 64 | del self._hyperparameters['__class__'] 65 | 66 | @property 67 | def name(self): # noqa: D102 68 | return 'slim' 69 | 70 | def update(self, users=None, items=None, ratings=None): # noqa: D102 71 | super().update(users, items, ratings) 72 | num_items = len(self._items) 73 | self._weights = scipy.sparse.dok_matrix((num_items, num_items)) 74 | if self._binarize: 75 | row, col = self._ratings.nonzero() 76 | data = np.ones(len(row)) 77 | ratings = scipy.sparse.csr_matrix((data, (row, col)), shape=self._ratings.shape).tolil() 78 | else: 79 | ratings = self._ratings.tolil() 80 | for item_id in range(num_items): 81 | target = ratings[:, item_id].toarray() 82 | # Zero out the column of the current item to prevent a trivial solution. 83 | ratings[:, item_id] = 0 84 | # Fit the mode and save the weights 85 | # This currently takes 0.02s/item on ML100k 86 | self._model.fit(ratings, target) 87 | self._weights[:, item_id] = self._model.sparse_coef_.T 88 | self._weights[item_id, item_id] = 0 89 | # Restore the rating column. 90 | ratings[:, item_id] = target 91 | self._weights = scipy.sparse.csr_matrix(self._weights) 92 | 93 | @property 94 | def dense_predictions(self): # noqa: D102 95 | if self._dense_predictions is None: 96 | self._dense_predictions = (self._ratings @ self._weights).todense() 97 | return self._dense_predictions 98 | 99 | def _predict(self, user_item): # noqa: D102 100 | # Predict on all user-item pairs. 101 | all_predictions = self.dense_predictions 102 | predictions = [] 103 | for user_id, item_id, _ in user_item: 104 | predictions.append(all_predictions[user_id, item_id]) 105 | 106 | return np.array(predictions) 107 | 108 | 109 | class EASE(recommender.PredictRecommender): 110 | """The EASE recommendation model which is a simple linear method. 111 | 112 | Parameters 113 | ---------- 114 | binarize : boolean 115 | Determines whether to binarize ratings before fitting a model. 116 | lam : float 117 | Constant that multiplies the regularization terms. 118 | 119 | """ 120 | 121 | def __init__(self, 122 | binarize=False, 123 | lam=1.0): 124 | """Create an EASE recommender.""" 125 | super().__init__() 126 | 127 | self._binarize = binarize 128 | self._lam = lam 129 | 130 | self._weights = None 131 | self._hyperparameters.update(locals()) 132 | 133 | # We only want the function arguments so remove class related objects. 
134 | del self._hyperparameters['self'] 135 | del self._hyperparameters['__class__'] 136 | 137 | @property 138 | def name(self): # noqa: D102 139 | return 'ease' 140 | 141 | def update(self, users=None, items=None, ratings=None): # noqa: D102 142 | super().update(users, items, ratings) 143 | 144 | if self._binarize: 145 | row, col = self._ratings.nonzero() 146 | data = np.ones(len(row)) 147 | ratings = scipy.sparse.csr_matrix((data, (row, col)), shape=self._ratings.shape) 148 | else: 149 | ratings = self._ratings 150 | 151 | item_products = ratings.T @ ratings 152 | 153 | diag_ind = np.diag_indices(item_products.shape[0]) 154 | item_products[diag_ind] += self._lam 155 | inverse_mat = np.linalg.inv(item_products.todense()) 156 | self._weights = inverse_mat / (-np.diag(inverse_mat)) 157 | self._weights[diag_ind] = 0 158 | 159 | @property 160 | def dense_predictions(self): # noqa: D102 161 | if self._dense_predictions is None: 162 | self._dense_predictions = (self._ratings @ self._weights) 163 | return self._dense_predictions 164 | 165 | def _predict(self, user_item): # noqa: D102 166 | # Predict on all user-item pairs. 167 | all_predictions = self.dense_predictions 168 | predictions = [] 169 | for user_id, item_id, _ in user_item: 170 | predictions.append(all_predictions[user_id, item_id]) 171 | 172 | return np.array(predictions) 173 | -------------------------------------------------------------------------------- /reclab/recommenders/cfnade/cfnade.py: -------------------------------------------------------------------------------- 1 | """Implementation of the CF-NADE recommender using Keras.""" 2 | from keras.layers import Input, Dropout, Lambda, add 3 | from keras.models import Model 4 | import keras.regularizers 5 | from tensorflow.keras.optimizers import Adam 6 | import numpy as np 7 | 8 | from .cfnade_lib.nade import NADE 9 | from .cfnade_lib import utils 10 | from .. import recommender 11 | 12 | 13 | class Cfnade(recommender.PredictRecommender): 14 | """ 15 | A Neural Autoregressive Distribution Estimator (NADE) for collaborative filtering (CF) tasks. 16 | 17 | Parameters 18 | --------- 19 | num_users : int 20 | Number of users in the environment. 21 | num_items : int 22 | Number of items in the environment. 23 | train_set : np.matrix 24 | Matrix of shape (num_users, num_items) populated with user ratings. 25 | train_epoch : int 26 | Number of epochs to train for each call. 27 | batch_size : int 28 | Batch size during initial training phase. 
29 | rating_bucket: int 30 | number of rating buckets 31 | rate_score: array of float 32 | An array of corresponding rating score for each bucket 33 | hidden_dim: int 34 | hidden dimension to construct the layer 35 | learning_rate: float 36 | learning rate 37 | 38 | """ 39 | 40 | def __init__( 41 | self, num_users, num_items, 42 | batch_size=64, train_epoch=10, 43 | rating_bucket=5, hidden_dim=500, 44 | learning_rate=0.001, normalized_layer=False, 45 | random_seed=0): 46 | """Create new Cfnade recommender.""" 47 | super().__init__() 48 | self._num_users = num_users 49 | self._num_items = num_items 50 | self._batch_size = batch_size 51 | if num_items <= batch_size: 52 | self._batch_size = num_items 53 | self._input_dim0 = num_users 54 | self._rating_bucket = rating_bucket 55 | self._rate_score = np.array(np.arange(1, rating_bucket+1), np.float32) 56 | self._hidden_dim = hidden_dim 57 | self._learning_rate = learning_rate 58 | self._train_epoch = train_epoch 59 | self._hyperparameters.update(locals()) 60 | self._new_items = np.zeros(num_items) 61 | np.random.seed(random_seed) 62 | 63 | # We only want the function arguments so remove class related objects. 64 | del self._hyperparameters['self'] 65 | del self._hyperparameters['__class__'] 66 | 67 | # Prepare model 68 | input_layer = Input(shape=(self._input_dim0, self._rating_bucket), name='input_ratings') 69 | output_ratings = Input(shape=(self._input_dim0, self._rating_bucket), name='output_ratings') 70 | input_masks = Input(shape=(self._input_dim0,), name='input_masks') 71 | output_masks = Input(shape=(self._input_dim0,), name='output_masks') 72 | nade_layer = Dropout(0.0)(input_layer) 73 | nade_layer = NADE( 74 | hidden_dim=self._hidden_dim, activation='tanh', bias=True, 75 | W_regularizer=keras.regularizers.l2(0.02), 76 | V_regularizer=keras.regularizers.l2(0.02), 77 | b_regularizer=keras.regularizers.l2(0.02), 78 | c_regularizer=keras.regularizers.l2(0.02), 79 | normalized_layer=normalized_layer)(nade_layer) 80 | 81 | predicted_ratings = Lambda( 82 | utils.prediction_layer, 83 | output_shape=utils.prediction_output_shape, 84 | name='predicted_ratings')(nade_layer) 85 | 86 | func_d = Lambda( 87 | utils.d_layer, output_shape=utils.d_output_shape, 88 | name='func_d')(input_masks) 89 | sum_masks = add([input_masks, output_masks]) 90 | func_d_2 = Lambda( 91 | utils.D_layer, output_shape=utils.D_output_shape, 92 | name='func_d_2')(sum_masks) 93 | loss_out = Lambda( 94 | utils.rating_cost_lambda_func, output_shape=(1, ), 95 | name='nade_loss')([nade_layer, output_ratings, 96 | input_masks, output_masks, func_d_2, func_d]) 97 | 98 | self._cf_nade_model = Model( 99 | inputs=[input_layer, output_ratings, input_masks, output_masks], 100 | outputs=[loss_out, predicted_ratings]) 101 | optimizer = Adam(self._learning_rate, 0.9, 0.999, 1e-8) 102 | self._cf_nade_model.compile( 103 | loss={'nade_loss': lambda y_true, y_pred: y_pred}, 104 | optimizer=optimizer) 105 | self._cf_nade_model.save_weights('model.h5') 106 | 107 | @property 108 | def name(self): # noqa: D102 109 | return 'cfnade' 110 | 111 | def update(self, users=None, items=None, ratings=None): # noqa: D102 112 | super().update(users, items, ratings) 113 | self._cf_nade_model.load_weights('model.h5') 114 | 115 | ratings_matrix = self._ratings.toarray() 116 | ratings_matrix = np.around(ratings_matrix.transpose()) 117 | ratings_matrix = ratings_matrix.astype(int) 118 | 119 | train_set = utils.DataSet(ratings_matrix, 120 | num_users=self._num_users, 121 | num_items=self._num_items, 122 | 
batch_size=self._batch_size, 123 | rating_bucket=self._rating_bucket, 124 | mode=0) 125 | self._cf_nade_model.fit_generator(train_set.generate(), 126 | steps_per_epoch=(self._num_items // self._batch_size), 127 | epochs=self._train_epoch, 128 | callbacks=[train_set], verbose=1) 129 | 130 | def _predict(self, user_item): # noqa: D102 131 | ratings_matrix = self._ratings.toarray() 132 | ratings_matrix = np.around(ratings_matrix.transpose()) 133 | ratings_matrix = ratings_matrix.astype(int) 134 | 135 | # keep track of unseen items in ratings 136 | ratings_matrix_total = ratings_matrix.transpose().sum(axis=1) 137 | self._new_items = np.where(ratings_matrix_total == 0)[0] 138 | 139 | test_set = utils.DataSet(ratings_matrix, 140 | num_users=self._num_users, 141 | num_items=self._num_items, 142 | batch_size=self._batch_size, 143 | rating_bucket=self._rating_bucket, 144 | mode=2) 145 | pred_rating = [] 146 | for batch in test_set.generate(): 147 | pred_matrix = self._cf_nade_model.predict(batch[0])[1] 148 | pred_rating_batch = pred_matrix * self._rate_score[np.newaxis, np.newaxis, :] 149 | pred_rating_batch = pred_rating_batch.sum(axis=2) 150 | pred_rating.append(pred_rating_batch) 151 | pred_rating = np.concatenate(pred_rating, axis=0) 152 | 153 | predictions = [] 154 | for user, item, _ in user_item: 155 | if item in self._new_items: 156 | predictions.append(3) 157 | else: 158 | predictions.append(pred_rating[item, user]) 159 | 160 | return np.array(predictions) 161 | -------------------------------------------------------------------------------- /reclab/recommenders/knn_recommender.py: -------------------------------------------------------------------------------- 1 | """The implementation for a neighborhood based recommender.""" 2 | import heapq 3 | 4 | import numpy as np 5 | import scipy.sparse 6 | import scipy.sparse.linalg 7 | 8 | from . import recommender 9 | 10 | 11 | class KNNRecommender(recommender.PredictRecommender): 12 | """A neighborhood based collaborative filtering algorithm. 13 | 14 | The class supports both user and item based collaborative filtering. 15 | 16 | Parameters 17 | ---------- 18 | shrinkage : float 19 | The shrinkage parameter applied to the similarity measure. 20 | neighborhood_size : int 21 | The number of users/items to consider when estimating a rating. 22 | user_based : bool 23 | If this variable is set to true the created object will use user-based collaborative 24 | filtering, otherwise it will use item-based collaborative filtering. 25 | use_content : bool 26 | Whether to use the user/item features when computing the similarity measure. 27 | use_means : bool 28 | Whether to adjust the ratings based on the mean rating of each user/item. 29 | 30 | """ 31 | 32 | def __init__(self, shrinkage=0, neighborhood_size=40, 33 | user_based=True, use_content=True, use_means=True, 34 | **kwargs): 35 | """Create a new neighborhood recommender.""" 36 | super().__init__(**kwargs) 37 | self._shrinkage = shrinkage 38 | self._neighborhood_size = neighborhood_size 39 | self._user_based = user_based 40 | self._use_content = use_content 41 | self._use_means = use_means 42 | self._feature_matrix = scipy.sparse.csr_matrix((0, 0)) 43 | self._means = np.empty(0) 44 | self._similarity_matrix = np.empty((0, 0)) 45 | self._ratings_matrix = np.empty((0, 0)) 46 | self._hyperparameters.update(locals()) 47 | 48 | # We only want the function arguments so remove class related objects. 
49 | del self._hyperparameters['self'] 50 | del self._hyperparameters['__class__'] 51 | 52 | @property 53 | def name(self): # noqa: D102 54 | return 'knn' 55 | 56 | @property 57 | def dense_predictions(self): # noqa: D102 58 | if self._dense_predictions is not None: 59 | return self._dense_predictions 60 | 61 | # Set up whether we will loop over users or items. 62 | if self._user_based: 63 | loop_range = range(len(self._users)) 64 | ratings_matrix = self._ratings_matrix 65 | else: 66 | loop_range = range(len(self._items)) 67 | ratings_matrix = self._ratings_matrix.T 68 | 69 | preds = [] 70 | for idx in loop_range: 71 | relevant_idxs = nlargest_indices( 72 | self._neighborhood_size, self._similarity_matrix[idx]) 73 | ratings = ratings_matrix[relevant_idxs] 74 | # We only care about means and similarities with corresponding nonzero ratings. 75 | zero = ratings == 0 76 | 77 | # Create a matrix of means that can easily be subtracted by the ratings. 78 | relevant_means = self._means[relevant_idxs] 79 | relevant_means = np.tile(relevant_means, (ratings_matrix.shape[1], 1)).T 80 | relevant_means[zero] = 0.0 81 | 82 | # Create a matrix of relevant similarities that can easily be multiplied with ratings. 83 | similarities = self._similarity_matrix[relevant_idxs, idx] 84 | similarities = np.tile(similarities, (ratings_matrix.shape[1], 1)).T 85 | similarities[zero] = 0.0 86 | 87 | # Ensure that we aren't weighting by all 0. 88 | zero = np.all(np.isclose(similarities, 0), axis=0) 89 | similarities[:, zero] = 1.0 90 | 91 | # Compute the predictions. 92 | if self._use_means: 93 | ratings_sum = self._means[idx] + (ratings - relevant_means) 94 | else: 95 | ratings_sum = ratings 96 | preds.append((ratings_sum * similarities).sum(axis=0) / similarities.sum(axis=0)) 97 | 98 | preds = np.array(preds) 99 | if not self._user_based: 100 | preds = preds.T 101 | 102 | self._dense_predictions = preds 103 | return preds 104 | 105 | def reset(self, users=None, items=None, ratings=None): # noqa: D102 106 | self._feature_matrix = scipy.sparse.csr_matrix((0, 0)) 107 | self._similarity_matrix = np.empty((0, 0)) 108 | self._means = np.empty(0) 109 | self._ratings_matrix = np.empty((0, 0)) 110 | super().reset(users, items, ratings) 111 | 112 | def update(self, users=None, items=None, ratings=None): # noqa: D102 113 | super().update(users, items, ratings) 114 | if self._user_based: 115 | self._feature_matrix = scipy.sparse.csr_matrix(self._ratings) 116 | else: 117 | self._feature_matrix = scipy.sparse.csr_matrix(self._ratings.T) 118 | self._means = divide_zero(flatten(self._feature_matrix.sum(axis=1)), 119 | self._feature_matrix.getnnz(axis=1)) 120 | if self._use_content: 121 | if self._user_based: 122 | self._feature_matrix = scipy.sparse.hstack([self._feature_matrix, self._users]) 123 | else: 124 | self._feature_matrix = scipy.sparse.hstack([self._feature_matrix, self._items]) 125 | self._similarity_matrix = cosine_similarity(self._feature_matrix, self._feature_matrix, 126 | self._shrinkage) 127 | np.fill_diagonal(self._similarity_matrix, 0) 128 | # TODO: this may not be the best way to store ratings, but it does speed access 129 | self._ratings_matrix = self._ratings.A 130 | 131 | def _predict(self, user_item): # noqa: D102 132 | preds = [] 133 | relevant_idxs_cache = {} 134 | for user_id, item_id, _ in user_item: 135 | if self._user_based: 136 | if user_id not in relevant_idxs_cache: 137 | relevant_idxs_cache[user_id] = nlargest_indices( 138 | self._neighborhood_size, self._similarity_matrix[user_id]) 139 | 
relevant_idxs = relevant_idxs_cache[user_id] 140 | similarities = self._similarity_matrix[relevant_idxs, user_id] 141 | ratings = self._ratings_matrix[relevant_idxs, item_id].ravel() 142 | mean = self._means[user_id] 143 | else: 144 | if item_id not in relevant_idxs_cache: 145 | relevant_idxs_cache[item_id] = nlargest_indices( 146 | self._neighborhood_size, self._similarity_matrix[item_id]) 147 | relevant_idxs = relevant_idxs_cache[item_id] 148 | similarities = self._similarity_matrix[relevant_idxs, item_id] 149 | ratings = self._ratings_matrix.T[relevant_idxs, user_id].ravel() 150 | mean = self._means[item_id] 151 | relevant_means = self._means[relevant_idxs] 152 | nonzero = ratings != 0 153 | ratings = ratings[nonzero] 154 | similarities = similarities[nonzero] 155 | # ensure that we aren't weighting by all 0 156 | if np.all(np.isclose(similarities, 0)): 157 | similarities = np.ones_like(similarities) 158 | if self._use_means: 159 | if len(ratings) == 0: 160 | preds.append(mean) 161 | else: 162 | preds.append(mean + np.average(ratings - relevant_means[nonzero], 163 | weights=similarities)) 164 | else: 165 | if len(ratings) == 0: 166 | preds.append(0) 167 | else: 168 | preds.append(np.average(ratings, weights=similarities)) 169 | 170 | return np.array(preds) 171 | 172 | 173 | def cosine_similarity(X, Y, shrinkage): 174 | """Compute the cosine similarity between each row vector in each matrix X and Y. 175 | 176 | Parameters 177 | ---------- 178 | X : np.matrix 179 | The first matrix for which to compute the cosine similarity. 180 | Y : np.matrix 181 | The second matrix for which to compute the cosine similarity. 182 | shrinkage : float 183 | The amount of shrinkage to apply to the similarity computation. 184 | 185 | Returns 186 | ------- 187 | similarity : np.ndarray 188 | The similarity array between each pairs of row, where similarity[i, j] 189 | is the cosine similarity between X[i] and Y[j]. 190 | 191 | """ 192 | return divide_zero((X @ Y.T).A, scipy.sparse.linalg.norm(X, axis=1)[:, np.newaxis] * 193 | scipy.sparse.linalg.norm(Y, axis=1)[np.newaxis, :] + shrinkage) 194 | 195 | 196 | def nlargest_indices(n, iterable): 197 | """Given an iterable, computes the indices of the n largest items. 198 | 199 | Parameters 200 | ---------- 201 | n : int 202 | How many indices to retrieve. 203 | iterable : iterable 204 | The iterable from which to compute the n largest indices. 205 | 206 | Returns 207 | ------- 208 | largest : list of int 209 | The n largest indices where largest[i] is the index of the i-th largest index. 210 | 211 | """ 212 | nlargest = heapq.nlargest(n, enumerate(iterable), 213 | key=lambda x: x[1]) 214 | return [i[0] for i in nlargest] 215 | 216 | 217 | def flatten(matrix): 218 | """Given a matrix return a flattened numpy array.""" 219 | return matrix.A.ravel() 220 | 221 | 222 | def divide_zero(num, denom): 223 | """Divide a and b but return 0 instead of nan for divide by 0.""" 224 | # TODO: is this the desired zero-division behavior? 
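# With out=zeros and where=(denom != 0), np.divide only evaluates entries whose denominator
# is nonzero; every other entry keeps the preallocated 0, so no NaN/inf values
# (and no divide-by-zero warnings) are produced.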
225 | return np.divide(num, denom, out=np.zeros_like(num), where=(denom != 0)) 226 | -------------------------------------------------------------------------------- /reclab/recommenders/llorma/llorma_lib/anchor.py: -------------------------------------------------------------------------------- 1 | """Anchor Manager module 2 | """ 3 | import random 4 | 5 | import numpy as np 6 | from sklearn.preprocessing import normalize 7 | from scipy.spatial import distance_matrix 8 | 9 | 10 | def _init_anchor_points(data, n_anchor, row_k, col_k): 11 | """ Helper function that 12 | 13 | Parameters 14 | ---------- 15 | data : array-like, shape [n_ratings, 3] 16 | Rating data 17 | Each row is of the form [user_id, item_id, rating] 18 | n_anchor : int 19 | Number of anchor points 20 | row_k : array-like, shape [n_users, n_users] 21 | Symmetric kernel matrix where entry (i,j) is 22 | the similarity between user_i and user_j 23 | col_k : array-like, shape [n_items, n_items] 24 | Symmetric kernel matrix where entry (i, j) id 25 | the similarity between item_i and item_j 26 | 27 | Returns 28 | ------- 29 | np.ndarray, shape (n_anchor,) 30 | Array of anchor indices, indexed according 31 | to their order in the rating data 32 | """ 33 | user_ids = data[:, 0].astype(np.int64) 34 | item_ids = data[:, 1].astype(np.int64) 35 | 36 | anchor_idxs = [] 37 | while len(anchor_idxs) < n_anchor: 38 | anchor_idx = random.randint(0, data.shape[0] - 1) 39 | if anchor_idx in anchor_idxs: 40 | continue 41 | 42 | anchor_row = data[anchor_idx] 43 | uid = int(anchor_row[0]) 44 | iid = int(anchor_row[1]) 45 | 46 | k = np.multiply(row_k[uid][user_ids], 47 | col_k[iid][item_ids]) 48 | sum_a_of_anchor = np.sum(k) 49 | if sum_a_of_anchor < 1: 50 | continue 51 | 52 | #print('>> %10d\t%d' % (anchor_idx, sum_a_of_anchor)) 53 | anchor_idxs.append(anchor_idx) 54 | 55 | return anchor_idxs 56 | 57 | 58 | def _get_distance_matrix(latent): 59 | """Helper function to compute a matrix 60 | of pairwise cosine distances between latent 61 | factors of a pair of users of a pair of items 62 | 63 | Parameters 64 | ---------- 65 | latent : array-like, shape (N, latent_dim) 66 | Matrix of latent factors 67 | Number of rows is the number of users or items 68 | Number of columns is the latent dimension 69 | 70 | Returns 71 | ------- 72 | array-like, shape (N, N) 73 | Matrix of cosine distances between every 74 | pair of users (items) 75 | """ 76 | _normalized_latent = normalize(latent, axis=1) 77 | 78 | d_mat = distance_matrix(_normalized_latent, _normalized_latent) 79 | assert np.count_nonzero(np.isnan(d_mat)) == 0 80 | return d_mat 81 | 82 | 83 | def _get_k_from_distance(d_mat): 84 | """Helper function to compute kernel matrix from distance matrix 85 | 86 | Parameters 87 | ---------- 88 | d_mat : array-like, shape [N, N] 89 | Matrix of cosine distances between every 90 | pair of users (items) 91 | 92 | Returns 93 | ------- 94 | np.ndarray, shape [N, N] 95 | Kernel matrix corresponding to the distance matrix 96 | """ 97 | m_mat = np.zeros(d_mat.shape) 98 | m_mat[d_mat < 0.9] = 1 99 | k_mat = np.multiply(np.subtract(np.ones(d_mat.shape), np.square(d_mat)), m_mat) 100 | return k_mat 101 | 102 | def _get_rbf_k(latent, gamma=None, scaled=True): 103 | """Helper function to compute scaled 104 | Gaussian Kernel matrix for latent factors 105 | 106 | Parameters 107 | ---------- 108 | latent : array-like, shape (N, latent_dim) 109 | Matrix of latent factors 110 | Number of rows is the number of users or items 111 | Number of columns is the latent dimension 112 | 
gamma : float, optional 113 | parameter for the , by default None 114 | scaled : bool, optional 115 | if true, the kernel is scaled by the norms of the factors 116 | by default True 117 | """ 118 | 119 | if gamma is None: 120 | gamma = 1 121 | d_mat = _get_distance_matrix(latent) 122 | 123 | rbf_mat = np.exp(-1*gamma*d_mat) 124 | row_norms = np.linalg.norm(latent, axis=1) 125 | if scaled: 126 | norms_mat = np.outer(row_norms, row_norms) 127 | k_mat = np.multiply(rbf_mat, norms_mat) 128 | else: k_mat = rbf_mat 129 | 130 | # normalize such that diagonals have value 1 131 | row_avg = np.mean(k_mat, axis=1, keepdims=True).reshape(-1, 1) 132 | col_avg = np.mean(k_mat, axis=0, keepdims=True).reshape(1, -1) 133 | avg = np.mean(k_mat) 134 | k_mat = k_mat-col_avg-row_avg+2*avg 135 | k_diag = np.sqrt(np.diagonal(k_mat)) 136 | k_diag_outer = np.outer(k_diag, k_diag) 137 | k_mat = np.divide(k_mat, k_diag_outer) 138 | # return (k_mat - 1)*2 139 | return(k_mat) 140 | 141 | 142 | 143 | def _get_ks_from_latents(row_latent, col_latent): 144 | """Helper function to get kernels 145 | 146 | Parameters 147 | ---------- 148 | row_latent : array-like, shape (N_users, rank) 149 | Matrix of latent factors corresponding to users 150 | col_latent : array-like, shape (N_items, rank) 151 | Matrix of latent factors corresponding to items 152 | 153 | Returns 154 | ------- 155 | (row_k, col_k): array-like, (N_users, N_users), (N_items, N_items) 156 | Returns two square matrices corresponding to similarity kernels 157 | row_k: entry (i,j) is the similarity between user_i and user_j 158 | col_k: entry (i,j) is the similarity between item_i and item_j 159 | """ 160 | # row_d = _get_distance_matrix(row_latent) 161 | # col_d = _get_distance_matrix(col_latent) 162 | 163 | # row_k = _get_k_from_distance(row_d) 164 | # col_k = _get_k_from_distance(col_d) 165 | 166 | row_k = _get_rbf_k(row_latent) 167 | col_k = _get_rbf_k(col_latent) 168 | 169 | return row_k, col_k 170 | 171 | 172 | class AnchorManager: 173 | """ AnchorManager class 174 | 175 | Parameters 176 | ---------- 177 | n_anchor : int 178 | number of anchor points 179 | batch_manager : obj: BatchManager 180 | an instance of BatchManager class 181 | row_latent_init : array-like, shape (n_users, latent_dim) 182 | Matrix of latent factors for users. 183 | Typically this is set to factors pre-trained in a 184 | pre-train Matrix Factorization step 185 | col_latent_init : array-like, shape (n_item, latent_dim) 186 | Matrix of latent factors for items. 
187 | Typically this is set to factors pre-trained in a 188 | pre-train Matrix Factorization step 189 | """ 190 | 191 | def __init__( 192 | self, 193 | n_anchor, 194 | batch_manager, 195 | row_latent_init, 196 | col_latent_init, 197 | kernel_fun): 198 | """ Instantiate an AnchorManager 199 | """ 200 | 201 | train_data = batch_manager.train_data 202 | 203 | row_latent = row_latent_init 204 | col_latent = col_latent_init 205 | 206 | if kernel_fun is None: 207 | row_k, col_k = _get_ks_from_latents(row_latent, col_latent) 208 | else: 209 | row_k = kernel_fun(row_latent) 210 | col_k = kernel_fun(col_latent) 211 | 212 | anchor_idxs = _init_anchor_points(train_data, n_anchor, row_k, col_k) 213 | assert len(anchor_idxs) == n_anchor 214 | anchor_points = train_data[anchor_idxs] 215 | 216 | self.train_data = train_data 217 | self.valid_data = batch_manager.valid_data 218 | self.test_data = batch_manager.test_data 219 | 220 | self.anchor_idxs = anchor_idxs 221 | self.anchor_points = anchor_points 222 | 223 | self.row_k = row_k 224 | self.col_k = col_k 225 | 226 | def get_k(self, anchor_idx, user_item_data): 227 | """Returns the Kernel similarity between the 228 | anchor user_item pair and the user_item pairs 229 | in the user_item data 230 | 231 | Parameters 232 | ---------- 233 | anchor_idx : Array-like, shape (2,) 234 | (user_id, item_id) of the anchor point 235 | user_item_data : Array-like, shape (N_ratings, >2) 236 | Array where first 2 columns are (user_id, item_id) pairs 237 | 238 | Returns 239 | ------- 240 | np.ndarray, shape (N_ratings,) 241 | Returns an array of kernel weights corresponding to 242 | the chosen anchor for each user_item pair in the data 243 | """ 244 | row_k = self.row_k 245 | col_k = self.col_k 246 | anchor_point = self.anchor_points[anchor_idx] 247 | 248 | anchor_uid = int(anchor_point[0]) 249 | anchor_iid = int(anchor_point[1]) 250 | 251 | user_ids = user_item_data[:, 0].astype(np.int64) 252 | item_ids = user_item_data[:, 1].astype(np.int64) 253 | 254 | return np.multiply(row_k[anchor_uid][user_ids], col_k[anchor_iid][item_ids]) 255 | 256 | def get_train_k(self, anchor_idx): 257 | """ Get Kernel matrix of the train_data of a given anchor 258 | 259 | Parameters 260 | ---------- 261 | anchor_idx : Array-like, shape (2,) 262 | (user_id, item_id) of the anchor point 263 | 264 | Returns 265 | ------- 266 | np.ndarray, shape (N_ratings,) 267 | Returns an array of kernel weights corresponding to 268 | the chosen anchor for each user_item pair in the train data 269 | """ 270 | return self.get_k(anchor_idx, self.train_data) 271 | 272 | def get_valid_k(self, anchor_idx): 273 | """ Get Kernel matrix of the validation_data of a given anchor 274 | 275 | Parameters 276 | ---------- 277 | anchor_idx : Array-like, shape (2,) 278 | (user_id, item_id) of the anchor point 279 | 280 | Returns 281 | ------- 282 | np.ndarray, shape (N_ratings,) 283 | Returns an array of kernel weights corresponding to 284 | the chosen anchor for each user_item pair in the valid data 285 | """ 286 | return self.get_k(anchor_idx, self.valid_data) 287 | 288 | def get_test_k(self, anchor_idx): 289 | """ Get Kernel matrix of the test_data of a given anchor 290 | 291 | Parameters 292 | ---------- 293 | anchor_idx : Array-like, shape (2,) 294 | (user_id, item_id) of the anchor point 295 | 296 | Returns 297 | ------- 298 | np.ndarray, shape (N_ratings,) 299 | Returns an array of kernel weights corresponding to 300 | the chosen anchor for each user_item pair in the test data 301 | """ 302 | return 
self.get_k(anchor_idx, self.test_data) 303 | -------------------------------------------------------------------------------- /reclab/recommenders/libfm.py: -------------------------------------------------------------------------------- 1 | """A wrapper for the LibFM recommender. See www.libfm.org for implementation details.""" 2 | import numpy as np 3 | import scipy.sparse 4 | 5 | import wpyfm 6 | from . import recommender 7 | 8 | 9 | class LibFM(recommender.PredictRecommender): 10 | """The libFM recommendation model which is a factorization machine. 11 | 12 | Parameters 13 | ---------- 14 | num_user_features : int 15 | The number of features that describe each user. 16 | num_item_features : int 17 | The number of features that describe each item. 18 | num_rating_features : int 19 | The number of features that describe the context in which each rating occurs. 20 | max_num_users : int 21 | The maximum number of users that we will be making predictions for. Note that 22 | setting this value to be too large will lead to a degradation in performance. 23 | max_num_items : int 24 | The maximum number of items that we will be making predictions for. Note that 25 | setting this value to be too large will lead to a degradation in performance. 26 | method : str 27 | The method to learn parameters. Can be one of: 'sgd', 'sgda', or 'mcmc'. 28 | use_global_bias : bool 29 | Whether to use a global bias term. 30 | use_one_way : bool 31 | Whether to use one way interactions. 32 | num_two_way_factors : int 33 | The number of factors to use for the two way interactions. 34 | learning_rate : float 35 | The learning rate for sgd or sgda. 36 | reg : float 37 | The regularization across all parameters. Will be overwritten for their respective 38 | parameters if bias_reg, one_way_reg, or two_way_reg is not None. 39 | bias_reg : float 40 | The regularization for the global bias. 41 | one_way_reg : float 42 | The regularization for the one-way interactions. 43 | two_way_reg : float 44 | The regularization for the two-way interactions. 45 | init_stdev : float 46 | Standard deviation for initialization of the 2-way factors. 47 | num_iter : int 48 | The number of iterations to train the model for. 49 | seed : int 50 | The random seed to use when training the model. 
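    Examples
    --------
    A minimal usage sketch; the dataset sizes and method below are illustrative
    assumptions rather than recommended settings, and users, items and ratings
    stand for data in the dictionary format used throughout reclab::

        recommender = LibFM(num_user_features=0, num_item_features=0,
                            num_rating_features=0, max_num_users=943,
                            max_num_items=1682, method='sgd')
        recommender.reset(users, items, ratings)
        recommender.update(ratings=new_ratings)
        global_bias, weights, pairwise_interactions = recommender.model_parameters()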
51 | 52 | """ 53 | 54 | def __init__(self, 55 | num_user_features, 56 | num_item_features, 57 | num_rating_features, 58 | max_num_users, 59 | max_num_items, 60 | method='sgd', 61 | use_global_bias=True, 62 | use_one_way=True, 63 | num_two_way_factors=8, 64 | learning_rate=0.1, 65 | reg=0.0, 66 | bias_reg=None, 67 | one_way_reg=None, 68 | two_way_reg=None, 69 | init_stdev=0.1, 70 | num_iter=100, 71 | seed=0, 72 | **kwargs): 73 | """Create a LibFM recommender.""" 74 | super().__init__(**kwargs) 75 | if bias_reg is None: 76 | bias_reg = reg 77 | if one_way_reg is None: 78 | one_way_reg = reg 79 | if two_way_reg is None: 80 | two_way_reg = reg 81 | self._max_num_users = max_num_users 82 | self._max_num_items = max_num_items 83 | self._train_data = None 84 | self._num_features = (self._max_num_users + num_user_features + self._max_num_items + 85 | num_item_features + num_rating_features) 86 | self._model = wpyfm.PyFM(method=method, 87 | dim=(use_global_bias, use_one_way, num_two_way_factors), 88 | lr=learning_rate, 89 | reg=(bias_reg, one_way_reg, two_way_reg), 90 | init_stdev=init_stdev, 91 | num_iter=num_iter, 92 | seed=seed) 93 | self._hyperparameters.update(locals()) 94 | self._has_xt = method in ('mcmc', 'als') 95 | 96 | # We only want the function arguments so remove class related objects. 97 | del self._hyperparameters['self'] 98 | del self._hyperparameters['__class__'] 99 | 100 | # Each row of rating_inputs has the following structure: 101 | # (user_id, user_features, item_id, item_features, rating_features). 102 | # Where user_id and item_id are one hot encoded. 103 | rating_inputs = scipy.sparse.csr_matrix((0, self._num_features)) 104 | # Each row of rating_outputs consists of the numerical value assigned to that interaction. 105 | rating_outputs = np.empty((0,)) 106 | self._train_data = wpyfm.Data(rating_inputs, rating_outputs, has_xt=self._has_xt) 107 | 108 | @property 109 | def name(self): # noqa: D102 110 | return 'libfm' 111 | 112 | def reset(self, users=None, items=None, ratings=None): # noqa: D102 113 | rating_inputs = scipy.sparse.csr_matrix((0, self._num_features)) 114 | rating_outputs = np.empty((0,)) 115 | self._train_data = wpyfm.Data(rating_inputs, rating_outputs, has_xt=self._has_xt) 116 | super().reset(users, items, ratings) 117 | 118 | def update(self, users=None, items=None, ratings=None, retrain=True): # noqa: D102 119 | super().update(users, items, ratings) 120 | self._retrain = retrain 121 | 122 | if ratings is not None: 123 | data = [] 124 | row_col = [[], []] 125 | new_rating_outputs = [] 126 | # TODO: create internal _update function for dealing with inner ids 127 | for row, ((user_id_outer, item_id_outer), 128 | (rating, rating_context)) in enumerate(ratings.items()): 129 | user_id = self._outer_to_inner_uid[user_id_outer] 130 | item_id = self._outer_to_inner_iid[item_id_outer] 131 | user_features = self._users[user_id] 132 | item_features = self._items[item_id] 133 | row_col[0].append(row) 134 | row_col[1].append(user_id) 135 | data.append(1) 136 | for i, feature in enumerate(user_features): 137 | row_col[0].append(row) 138 | row_col[1].append(self._max_num_users + i) 139 | data.append(feature) 140 | row_col[0].append(row) 141 | row_col[1].append(self._max_num_users + len(user_features) + item_id) 142 | data.append(1) 143 | for i, feature in enumerate(item_features): 144 | row_col[0].append(row) 145 | row_col[1].append(self._max_num_users + len(user_features) + 146 | self._max_num_items + i) 147 | data.append(feature) 148 | for i, feature in 
enumerate(rating_context): 149 | row_col[0].append(row) 150 | row_col[1].append(self._max_num_users + len(user_features) + 151 | self._max_num_items + len(item_features) + i) 152 | data.append(feature) 153 | 154 | new_rating_outputs.append(rating) 155 | 156 | new_rating_inputs = scipy.sparse.csr_matrix((data, row_col), 157 | shape=(len(ratings), self._num_features)) 158 | new_rating_outputs = np.array(new_rating_outputs) 159 | # TODO: We need to account for when the same rating gets added again. Right now 160 | # this will just add duplicate rows with different ratings. 161 | self._train_data.add_rows(new_rating_inputs, new_rating_outputs) 162 | 163 | def _predict(self, user_item): # noqa: D102 164 | # Create a test_inputs array that can be parsed by our output function. 165 | test_inputs = [] 166 | data = [] 167 | row_col = [[], []] 168 | for row, (user_id, item_id, rating_context) in enumerate(user_item): 169 | user_features = self._users[user_id] 170 | item_features = self._items[item_id] 171 | row_col[0].append(row) 172 | row_col[1].append(user_id) 173 | data.append(1) 174 | for i, feature in enumerate(user_features): 175 | row_col[0].append(row) 176 | row_col[1].append(self._max_num_users + i) 177 | data.append(feature) 178 | row_col[0].append(row) 179 | row_col[1].append(self._max_num_users + len(user_features) + item_id) 180 | data.append(1) 181 | for i, feature in enumerate(item_features): 182 | row_col[0].append(row) 183 | row_col[1].append(self._max_num_users + len(user_features) + 184 | self._max_num_items + i) 185 | data.append(feature) 186 | for i, feature in enumerate(rating_context): 187 | row_col[0].append(row) 188 | row_col[1].append(self._max_num_users + len(user_features) + 189 | self._max_num_items + len(item_features) + i) 190 | data.append(feature) 191 | 192 | test_inputs = scipy.sparse.csr_matrix((data, row_col), 193 | shape=(len(user_item), self._num_features)) 194 | test_data = wpyfm.Data(test_inputs, np.zeros(test_inputs.shape[0]), has_xt=self._has_xt) 195 | 196 | if self._retrain: 197 | if self._has_xt: 198 | self._model.train(self._train_data, test=test_data) 199 | else: 200 | self._model.train(self._train_data) 201 | predictions = self._model.predict(test_data) 202 | 203 | return predictions 204 | 205 | def model_parameters(self): 206 | """Train a libfm model and get the resulting model's parameters. 207 | 208 | The degree-2 factorization machine model predicts a rating by 209 | 210 | r(x) = b_0 + w^T x + Ind(j = i) Ind(k = u) V_j^T V_k 211 | 212 | where b_0 is the global bias, w is the weights, and 213 | V is the pairwise interactions with dimension k * (m+n) 214 | V_j is the j^th row of V 215 | x is defined as the concatenation of two one-hot encodings e_i and e_u, 216 | and w^T x correpond to the user and item biases. 217 | 218 | Returns 219 | ------- 220 | global_bias : float 221 | Global bias term in the model. 222 | weights : np.ndarray 223 | Linear terms in the model (related to user/item biases). 224 | pairwise_interactions : np.ndarray 225 | Interaction term in the model (related to user/item factors). 226 | 227 | """ 228 | self._model.train(self._train_data) 229 | return self._model.parameters() 230 | -------------------------------------------------------------------------------- /reclab/environments/topics.py: -------------------------------------------------------------------------------- 1 | """Contains the implementation for the Topics environment. 
2 | 3 | In this environment users have a hidden preference for each topic and each item has a 4 | hidden topic assigned to it. 5 | """ 6 | import collections 7 | import numpy as np 8 | 9 | from . import environment 10 | 11 | 12 | class Topics(environment.DictEnvironment): 13 | """ 14 | An environment where items have a single topic and users prefer certain topics. 15 | 16 | The user preference for any given topic is initialized as Unif(0.5, 5.5) while 17 | topics are uniformly assigned to items. Users will 18 | also have a changing preference for topics they get recommended based on the topic_change 19 | parameter. Users and items can have biases, and there can also be a global bias. 20 | 21 | Ratings are generated as 22 | r = clip( user preference for a given topic + b_u + b_i + b_0, 1, 5) 23 | where b_u is a user bias, b_i is an item bias, and b_0 is a global bias. 24 | 25 | Parameters 26 | ---------- 27 | num_topics : int 28 | The number of topics items can be assigned to. 29 | num_users : int 30 | The number of users in the environment. 31 | num_items : int 32 | The number of items in the environment. 33 | rating_frequency : float 34 | The proportion of users that will need a recommendation at each step. 35 | Must be between 0 and 1. 36 | num_init_ratings : int 37 | The number of ratings available from the start. User-item pairs are randomly selected. 38 | noise : float 39 | The standard deviation of the noise added to ratings. 40 | topic_change : float 41 | How much the user's preference for a topic changes each time that topic is recommended 42 | to them. The same amount is subtracted, split evenly across all other topics. 43 | memory_length : int 44 | The number of recent topics a user remembers, which affect the rating. 45 | boredom_threshold : int 46 | The number of times a topic has to be seen within the memory to incur a 47 | penalty. 48 | boredom_penalty : float 49 | The penalty applied to the rating when a user is bored. 50 | satiation_factor : float 51 | The extent to which satiation affects user ratings. 52 | satiation_decay : float or tuple 53 | A number between 0 and 1 that indicates how quickly satiation decays. 54 | If a tuple, the decay will alternate between the two values depending on the user's 55 | sensitization state. 56 | satiation_noise : float 57 | The standard deviation of the noise influencing satiation at each timestep. 58 | switch_probability : tuple 59 | Represents a probability matrix where index 0 is the conditional probability of a user 60 | switching from a state of sensitization (S) to a state of boredom (B): P(B | S). 61 | Similarly, index 1 is P(S | B). The probability of staying in a state is 1 - P(switching). 62 | user_dist_choice : str 63 | The choice of user distribution for selecting online users. By default, the subset of 64 | online users is chosen from a uniform distribution. Currently supports normal and lognormal. 65 | initial_sampling: str or array 66 | How the initial ratings should be sampled. Can be 'uniform', 'powerlaw', or an 67 | array of tuples where arr[i][0] and arr[i][1] are the user-id and item-id respectively 68 | of the i-th initial rating. If initial_sampling is a string, then users are sampled 69 | according to user_dist_choice and items are sampled according to initial_sampling. 70 | shift_steps : int 71 | The number of timesteps to wait between each user preference shift. 72 | shift_frequency : float 73 | The proportion of users whose preference we wish to change during a preference shift.
74 | shift_weight : float 75 | The weight to assign to a user's new preferences after a preference shift. 76 | User's old preferences get assigned a weight of 1 - shift_weight. 77 | user_bias_type : normal or power 78 | distribution type for user biases. 79 | normal is normal distribution with default mean zero and variance 0.5 80 | power is power law distribution 81 | item_bias_type : normal or power 82 | distribution type for item biases. 83 | normal is normal distribution with default mean zero and variance 0.5 84 | power is power law distribution 85 | 86 | """ 87 | 88 | def __init__(self, 89 | num_topics, 90 | num_users, 91 | num_items, 92 | rating_frequency=1.0, 93 | num_init_ratings=0, 94 | noise=0.0, 95 | topic_change=0.0, 96 | memory_length=0, 97 | boredom_threshold=0, 98 | boredom_penalty=0.0, 99 | satiation_factor=0.0, 100 | satiation_decay=0.0, 101 | satiation_noise=0.0, 102 | switch_probability=(0.0, 0.0), 103 | user_dist_choice='uniform', 104 | initial_sampling='uniform', 105 | shift_steps=1, 106 | shift_frequency=0.0, 107 | shift_weight=0.0, 108 | user_bias_type='none', 109 | item_bias_type='none'): 110 | """Create a Topics environment.""" 111 | super().__init__(rating_frequency=rating_frequency, 112 | num_init_ratings=num_init_ratings, 113 | memory_length=memory_length, 114 | user_dist_choice=user_dist_choice, 115 | initial_sampling=initial_sampling) 116 | self._num_topics = num_topics 117 | self._num_users = num_users 118 | self._num_items = num_items 119 | self._topic_change = topic_change 120 | self._noise = noise 121 | self._user_preferences = None 122 | self._item_topics = None 123 | self._boredom_threshold = boredom_threshold 124 | self._boredom_penalty = boredom_penalty 125 | self._satiation_factor = satiation_factor 126 | self._satiation_decay = satiation_decay 127 | self._satiation_noise = satiation_noise 128 | self._satiations = None 129 | self._switch_probability = switch_probability 130 | self._sensitization_state = None 131 | self._shift_steps = shift_steps 132 | self._shift_frequency = shift_frequency 133 | self._shift_weight = shift_weight 134 | self._user_biases = None 135 | self._item_biases = None 136 | self._offset = None 137 | self._user_bias_type = user_bias_type 138 | self._item_bias_type = item_bias_type 139 | 140 | @property 141 | def name(self): # noqa: D102 142 | return 'topics' 143 | 144 | def _get_dense_ratings(self): # noqa: D102 145 | ratings = np.zeros([self._num_users, self._num_items]) 146 | for item_id in range(self._num_items): 147 | topic = self._item_topics[item_id] 148 | ratings[:, item_id] = (self._user_preferences[:, topic] + 149 | self._satiation_factor * self._satiations[:, topic] + 150 | np.full((self._num_users), self._item_biases[item_id]) + 151 | self._user_biases + np.full((self._num_users), self._offset)) 152 | 153 | # Account for boredom. 
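# A topic counts as "recent" if it appears in the user's rolling history (of length
# memory_length); any topic seen more than boredom_threshold times there has
# boredom_penalty subtracted from that user's rating for every item with that topic.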
154 | for user_id in range(self._num_users): 155 | recent_topics = [self._item_topics[item] 156 | for item in self._user_histories[user_id]] 157 | recent_topics, counts = np.unique( 158 | recent_topics, return_counts=True) 159 | recent_topics = recent_topics[counts > self._boredom_threshold] 160 | for topic_id in recent_topics: 161 | ratings[user_id, self._item_topics == 162 | topic_id] -= self._boredom_penalty 163 | 164 | return ratings 165 | 166 | def _get_rating(self, user_id, item_id): # noqa: D102 167 | topic = self._item_topics[item_id] 168 | rating = (self._user_preferences[user_id, topic] - 169 | self._satiation_factor * self._satiations[user_id, topic] + 170 | self._user_biases[user_id] + self._item_biases[item_id] + self._offset) 171 | recent_topics = [self._item_topics[item] 172 | for item in self._user_histories[user_id]] 173 | if len(recent_topics) > 0: 174 | recent_topics = list(np.concatenate(recent_topics)) 175 | if recent_topics.count(topic) > self._boredom_threshold: 176 | rating -= self._boredom_penalty 177 | rating = np.clip(rating + self._dynamics_random.randn() 178 | * self._noise, 1, 5) 179 | return rating 180 | 181 | def _rate_items(self, user_id, item_ids): # noqa: D102 182 | # TODO: Add support for slates of size greater than 1. 183 | item_id = [item_ids[0]] 184 | rating = self._get_rating(user_id, item_id) 185 | topic = self._item_topics[item_id] 186 | 187 | # Determine satiation decay based on sensitization state. 188 | if isinstance(self._satiation_decay, (tuple, list)): 189 | 190 | # State transition function for sensitization v boredom. 191 | # The user's state for all topics (not just the one recommended) 192 | # switches based on self._switch_probability. 193 | sensitized = np.where(self._sensitization_state[user_id] == 0)[0] 194 | bored = np.where(self._sensitization_state[user_id] == 1)[0] 195 | self._sensitization_state[user_id, sensitized] = self._dynamics_random.choice( 196 | [0, 1], size=len(sensitized), p=[1 - self._switch_probability[0], self._switch_probability[0]]) 197 | self._sensitization_state[user_id, bored] = self._dynamics_random.choice( 198 | [0, 1], size=len(bored), p=[self._switch_probability[1], 1 - self._switch_probability[1]]) 199 | 200 | decay = self._satiation_decay[int( 201 | self._sensitization_state[user_id, topic])] 202 | 203 | else: 204 | decay = self._satiation_decay 205 | 206 | # Update satiation. 207 | recommended = np.zeros(self._num_topics) 208 | recommended[topic] = 1 209 | self._satiations[user_id] = (decay * (self._satiations[user_id] + recommended) + 210 | self._dynamics_random.randn(self._num_topics) * self._satiation_noise) 211 | 212 | # Update underlying preference.
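# The recommended topic gains topic_change (as long as the preference has not already
# exceeded 5) while every other topic loses topic_change / (num_topics - 1), so growing
# interest in one topic is offset by waning interest in the rest.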
213 | preference = self._user_preferences[user_id, topic] 214 | if preference <= 5: 215 | self._user_preferences[user_id, topic] += self._topic_change 216 | not_topic = np.arange(self._num_topics) != topic 217 | self._user_preferences[user_id, not_topic] -= ( 218 | self._topic_change / (self._num_topics - 1)) 219 | 220 | return rating 221 | 222 | def _reset_state(self): # noqa: D102 223 | if self._user_bias_type == 'normal': 224 | self._user_biases = self._init_random.normal( 225 | loc=0., scale=0.5, size=self._num_users) 226 | elif self._user_bias_type == 'power': 227 | self._user_biases = 1 - \ 228 | self._init_random.power(5, size=self._num_users) 229 | elif self._user_bias_type == 'none': 230 | self._user_biases = np.zeros(self._num_users) 231 | else: 232 | print('User bias distribution is not supported') 233 | 234 | if self._item_bias_type == 'normal': 235 | self._item_biases = self._init_random.normal( 236 | loc=0., scale=0.5, size=self._num_items) 237 | elif self._item_bias_type == 'power': 238 | self._item_biases = 1 - \ 239 | self._init_random.power(5, size=self._num_items) 240 | elif self._item_bias_type == 'none': 241 | self._item_biases = np.zeros(self._num_items) 242 | else: 243 | print('Item bias distribution is not supported') 244 | 245 | self._offset = 0 246 | self._satiations = np.zeros((self._num_users, self._num_topics)) 247 | self._sensitization_state = np.zeros( 248 | (self._num_users, self._num_topics), dtype=int) 249 | self._user_preferences = self._init_random.uniform( 250 | low=0.5, high=5.5, size=(self._num_users, self._num_topics)) 251 | self._item_topics = self._init_random.choice( 252 | self._num_topics, size=self._num_items) 253 | self._users = collections.OrderedDict( 254 | (user_id, np.zeros(0)) for user_id in range(self._num_users)) 255 | self._items = collections.OrderedDict( 256 | (item_id, np.zeros(0)) for item_id in range(self._num_items)) 257 | 258 | def _update_state(self): # noqa: D102 259 | if (self._timestep + 1) % self._shift_steps == 0: 260 | # Apply preference and bias shift to a fraction of users. 261 | shifted_users = self._dynamics_random.choice( 262 | self._num_users, int(self._num_users * self._shift_frequency)) 263 | new_preferences = self._init_random.uniform( 264 | low=0.5, high=5.5, size=(len(shifted_users), self._num_topics)) 265 | if self._user_bias_type == 'normal': 266 | new_user_biases = self._init_random.normal( 267 | loc=0, scale=0.5, size=len(shifted_users)) 268 | elif self._user_bias_type == 'power': 269 | new_user_biases = 1 - \ 270 | self._init_random.power(5, size=len(shifted_users)) 271 | elif self._user_bias_type == 'none': 272 | new_user_biases = np.zeros(len(shifted_users)) 273 | else: 274 | print('User bias distribution is not supported') 275 | 276 | self._user_preferences[shifted_users] = ( 277 | self._shift_weight * self._user_preferences[shifted_users] + 278 | (1 - self._shift_weight) * new_preferences) 279 | 280 | self._user_biases[shifted_users] = ( 281 | self._shift_weight * self._user_biases[shifted_users] + 282 | (1 - self._shift_weight) * new_user_biases) 283 | 284 | return collections.OrderedDict(), collections.OrderedDict() 285 | -------------------------------------------------------------------------------- /reclab/environments/latent_factors.py: -------------------------------------------------------------------------------- 1 | """Contains the implementation for the Latent Behavior environment.
2 | 3 | In this environment users and items both have latent vectors, and 4 | the rating is determined by the inner product. Users and items both 5 | have bias terms, and there is a global bias as well. 6 | """ 7 | import collections 8 | import json 9 | import os 10 | 11 | import numpy as np 12 | 13 | from . import environment 14 | from .. import data_utils 15 | 16 | 17 | class LatentFactorBehavior(environment.DictEnvironment): 18 | """An environment where users and items have latent factors and biases. 19 | 20 | Ratings are generated as 21 | r = clip(<p_u, q_i> + b_u + b_i + b_0, 1, 5) 22 | where p_u is a user's latent factor, q_i is an item's latent factor, 23 | b_u is a user bias, b_i is an item bias, and b_0 is a global bias. 24 | 25 | Parameters 26 | ---------- 27 | latent_dim : int 28 | Size of latent factors p, q. 29 | num_users : int 30 | The number of users in the environment. 31 | num_items : int 32 | The number of items in the environment. 33 | rating_frequency : float 34 | The proportion of users that will need a recommendation at each step. 35 | Must be between 0 and 1. 36 | num_init_ratings : int 37 | The number of ratings available from the start. User-item pairs are randomly selected. 38 | noise : float 39 | The standard deviation of the noise added to ratings. 40 | affinity_change : float 41 | How much the user's latent factor is shifted towards that of an item. 42 | memory_length : int 43 | The number of recent items a user remembers which affect the rating. 44 | boredom_threshold : int 45 | The cosine similarity between a new item and an item in the 46 | user's history needed to trigger a boredom response. 47 | boredom_penalty : float 48 | The scaling factor for the rating penalty when a user is bored. The penalty 49 | is the average of the similarity values which exceed the boredom_threshold, and the decrease 50 | in rating is the penalty multiplied by this factor. 51 | user_dist_choice : str 52 | The choice of user distribution for selecting online users. By default, the subset of 53 | online users is chosen from a uniform distribution. Currently supports normal and lognormal. 54 | 55 | """ 56 | 57 | def __init__(self, latent_dim, num_users, num_items, 58 | rating_frequency=0.02, num_init_ratings=0, 59 | noise=0.0, memory_length=0, affinity_change=0.0, 60 | boredom_threshold=0, boredom_penalty=0.0, user_dist_choice='uniform'): 61 | """Create a Latent Factor environment.""" 62 | super().__init__(rating_frequency, num_init_ratings, memory_length, user_dist_choice) 63 | self._latent_dim = latent_dim 64 | self._num_users = num_users 65 | self._num_items = num_items 66 | self._noise = noise 67 | self._affinity_change = affinity_change 68 | self._boredom_threshold = boredom_threshold 69 | self._boredom_penalty = boredom_penalty 70 | if self._memory_length > 0: 71 | self._boredom_penalty /= self._memory_length 72 | self._user_factors = None 73 | self._user_biases = None 74 | self._item_factors = None 75 | self._item_biases = None 76 | self._offset = None 77 | 78 | @property 79 | def name(self): 80 | """Name of environment, used for saving.""" 81 | return 'latent' 82 | 83 | def _get_dense_ratings(self): # noqa: D102 84 | ratings = (self._user_factors @ self._item_factors.T + self._user_biases[:, np.newaxis] + 85 | self._item_biases[np.newaxis, :] + self._offset) 86 | # Compute the boredom penalties.
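# Item factors are L2-normalized so that their pairwise inner products are cosine
# similarities; for each item in a user's recent history, the amount by which its
# similarity to every candidate item exceeds boredom_threshold (scaled by
# boredom_penalty) is subtracted from that user's row of ratings.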
87 | item_norms = np.linalg.norm(self._item_factors, axis=1) 88 | normalized_items = self._item_factors / item_norms[:, np.newaxis] 89 | similarities = normalized_items @ normalized_items.T 90 | similarities -= self._boredom_threshold 91 | similarities[similarities < 0] = 0 92 | penalties = self._boredom_penalty * similarities 93 | for user_id in range(self._num_users): 94 | for item_id in self._user_histories[user_id]: 95 | if item_id is not None: 96 | ratings[user_id] -= penalties[item_id] 97 | 98 | return ratings 99 | 100 | def _get_rating(self, user_id, item_id): 101 | """Compute user's rating of item based on model. 102 | 103 | Parameters 104 | ---------- 105 | user_id : int 106 | The id of the user making the rating. 107 | item_id : int 108 | The id of the item being rated. 109 | 110 | Returns 111 | ------- 112 | rating : int 113 | The rating the item was given by the user. 114 | 115 | """ 116 | raw_rating = (self._user_factors[user_id] @ self._item_factors[item_id] 117 | + self._user_biases[user_id] + self._item_biases[item_id] + self._offset) 118 | 119 | # Compute the boredom penalty. 120 | boredom_penalty = 0 121 | for item_id_hist in self._user_histories[user_id]: 122 | item_factor = self._item_factors[item_id_hist] 123 | if item_factor is not None: 124 | similarity = ((self._item_factors[item_id] @ item_factor) 125 | / np.linalg.norm(item_factor) 126 | / np.linalg.norm(self._item_factors[item_id])) 127 | if similarity > self._boredom_threshold: 128 | boredom_penalty += (similarity - self._boredom_threshold) 129 | boredom_penalty *= self._boredom_penalty 130 | rating = np.clip(raw_rating - boredom_penalty + self._dynamics_random.randn() * 131 | self._noise, 1, 5) 132 | 133 | return rating 134 | 135 | def _rate_items(self, user_id, item_ids): 136 | """Get a user to rate an item and update the internal rating state. 137 | 138 | Parameters 139 | ---------- 140 | user_id : int 141 | The id of the user making the rating. 142 | item_id : int 143 | The id of the item being rated. 144 | 145 | Returns 146 | ------- 147 | rating : int 148 | The rating the item was given by the user. 149 | 150 | """ 151 | # TODO: Add support for slates of size greater than 1. 
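# Until then, only the first item of the recommended slate is rated.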
152 | item_id = item_ids[0] 153 | rating = self._get_rating(user_id, item_id) 154 | 155 | # Updating underlying affinity 156 | self._user_factors[user_id] = ((1.0 - self._affinity_change) * self._user_factors[user_id] 157 | + self._affinity_change * self._item_factors[item_id]) 158 | return np.array([rating]) 159 | 160 | def _reset_state(self): 161 | """Reset the state of the environment.""" 162 | user_factors, user_bias, item_factors, item_bias, offset = self._generate_latent_factors() 163 | self._user_factors = user_factors 164 | self._user_biases = user_bias 165 | self._item_factors = item_factors 166 | self._item_biases = item_bias 167 | self._offset = offset 168 | 169 | self._users = collections.OrderedDict((user_id, np.zeros(0)) 170 | for user_id in range(self._num_users)) 171 | self._items = collections.OrderedDict((item_id, np.zeros(0)) 172 | for item_id in range(self._num_items)) 173 | 174 | def _generate_latent_factors(self): 175 | """Generate random latent factors.""" 176 | # Initialization size determined such that ratings generally fall in 0-5 range 177 | factor_sd = np.sqrt(np.sqrt(0.5 / self._latent_dim)) 178 | # User latent factors are normally distributed 179 | user_bias = self._init_random.normal(loc=0., scale=0.5, size=self._num_users) 180 | user_factors = self._init_random.normal(loc=0., scale=factor_sd, 181 | size=(self._num_users, self._latent_dim)) 182 | # Item latent factors are normally distributed 183 | item_bias = self._init_random.normal(loc=0., scale=0.5, size=self._num_items) 184 | item_factors = self._init_random.normal(loc=0., scale=factor_sd, 185 | size=(self._num_items, self._latent_dim)) 186 | # Shift up the mean 187 | offset = 3.0 188 | return user_factors, user_bias, item_factors, item_bias, offset 189 | 190 | 191 | class DatasetLatentFactor(LatentFactorBehavior): 192 | """An environment where user behavior is based on a dataset. 193 | 194 | Latent factor model of behavior with parameters fit directly from full dataset. 195 | 196 | Parameters 197 | ---------- 198 | name : str 199 | The name of the dataset. Must be one of: 'ml-100k', 'ml-10m', 'lastfm'. 200 | latent_dim : int 201 | Size of latent factors p, q. 202 | datapath : str 203 | The path to the directory containing datafiles 204 | force_retrain : bool 205 | Forces retraining the latent factor model 206 | max_num_users : int 207 | The maximum number of users for the environment, if not the number in the dataset. 208 | max_num_items : int 209 | The maximum number of items for the environment, if not the number in the dataset. 210 | 211 | """ 212 | 213 | def __init__(self, name, latent_dim=128, datapath=data_utils.DATA_DIR, force_retrain=False, 214 | max_num_users=np.inf, max_num_items=np.inf, **kwargs): 215 | """Create a ML100K Latent Factor environment.""" 216 | self.dataset_name = name 217 | modelpath = datapath 218 | if name == 'ml-100k': 219 | self.datapath = os.path.expanduser(os.path.join(datapath, 'ml-100k')) 220 | self.modelpath = os.path.join(modelpath, 'ml-100k') 221 | latent_dim = 100 if latent_dim is None else latent_dim 222 | self._full_num_users = 943 223 | self._full_num_items = 1682 224 | # These parameters are the result of tuning. 
225 | reg = 0.1 226 | learn_rate = 0.005 227 | self.train_params = dict(bias_reg=reg, one_way_reg=reg, two_way_reg=reg, 228 | learning_rate=learn_rate, num_iter=100) 229 | elif name == 'ml-10m': 230 | self.datapath = os.path.expanduser(os.path.join(datapath, 'ml-10M100K')) 231 | self.modelpath = os.path.join(modelpath, 'ml-10M100K') 232 | latent_dim = 128 if latent_dim is None else latent_dim 233 | self._full_num_users = 69878 234 | self._full_num_items = 10677 235 | # these parameters are presented in "On the Difficulty of Baselines" by Rendle et al. 236 | reg = 0.04 237 | learn_rate = 0.003 238 | self.train_params = dict(bias_reg=reg, one_way_reg=reg, two_way_reg=reg, 239 | learning_rate=learn_rate, num_iter=128) 240 | elif name == 'lastfm': 241 | self.datapath = os.path.expanduser(os.path.join(datapath, 'lastfm-dataset-1K')) 242 | self.modelpath = os.path.join(modelpath, 'lastfm-dataset-1K') 243 | latent_dim = 128 if latent_dim is None else latent_dim 244 | self._full_num_users = 992 245 | self._full_num_items = 177023 246 | # These parameters are presented in "Recommendations and User Agency" by Dean et al. 247 | reg = 0.08 248 | learn_rate = 0.001 249 | self.train_params = dict(bias_reg=reg, one_way_reg=reg, two_way_reg=reg, 250 | learning_rate=learn_rate, num_iter=128) 251 | else: 252 | raise ValueError('dataset name not recognized') 253 | self._force_retrain = force_retrain 254 | 255 | num_users = min(self._full_num_users, max_num_users) 256 | num_items = min(self._full_num_items, max_num_items) 257 | 258 | super().__init__(latent_dim, num_users, num_items, **kwargs) 259 | 260 | @property 261 | def name(self): 262 | """Name of environment, used for saving.""" 263 | return 'latent-{}'.format(self.dataset_name) 264 | 265 | def _generate_latent_factors(self): 266 | full_model_params = dict(num_user_features=0, num_item_features=0, num_rating_features=0, 267 | max_num_users=self._full_num_users, 268 | max_num_items=self._full_num_items, 269 | num_two_way_factors=self._latent_dim, **self.train_params) 270 | 271 | model_file = os.path.join(self.modelpath, 'fm_model.npz') 272 | res = load_latent_factors(model_file) 273 | if res is None or self._force_retrain: 274 | print('Training model from scratch, either due to force_retrain flag or') 275 | print('\tdid not find model file at {}'.format(model_file)) 276 | res = generate_latent_factors_from_data(self.dataset_name, model_file, 277 | full_model_params) 278 | user_factors, user_bias, item_factors, item_bias, offset = res 279 | else: 280 | user_factors, user_bias, item_factors, item_bias, offset = res 281 | 282 | if self._num_users < self._full_num_users or self._num_items < self._full_num_items: 283 | num_users, num_items = (min(self._num_users, self._full_num_users), 284 | min(self._num_items, self._full_num_items)) 285 | # TODO: may want to reduce the number in some other way 286 | # e.g. 
related to popularity 287 | user_indices = self._init_random.choice(user_factors.shape[0], size=num_users, 288 | replace=False) 289 | item_indices = self._init_random.choice(item_factors.shape[0], size=num_items, 290 | replace=False) 291 | user_factors = user_factors[user_indices] 292 | user_bias = user_bias[user_indices] 293 | item_factors = item_factors[item_indices] 294 | item_bias = item_bias[item_indices] 295 | return user_factors, user_bias, item_factors, item_bias, offset 296 | 297 | 298 | def load_latent_factors(model_file): 299 | """Load pretrained latent factor model.""" 300 | if not os.path.isfile(model_file): 301 | return None 302 | model = np.load(model_file) 303 | print('Loading model from {} trained via:\n{}.'.format(model_file, model['params'])) 304 | 305 | user_factors = model['user_factors'] 306 | user_bias = model['user_bias'] 307 | item_factors = model['item_factors'] 308 | item_bias = model['item_bias'] 309 | offset = model['offset'] 310 | 311 | return user_factors, user_bias, item_factors, item_bias, offset 312 | 313 | 314 | def generate_latent_factors_from_data(dataset_name, model_file, params): 315 | """Create latent factors based on a dataset.""" 316 | from ..recommenders import LibFM 317 | 318 | users, items, ratings = data_utils.read_dataset(dataset_name) 319 | print('Initializing latent factor model') 320 | recommender = LibFM(**params) 321 | recommender.reset(users, items, ratings) 322 | print('Training latent factor model with parameters: {}'.format(params)) 323 | 324 | global_bias, weights, pairwise_interactions = recommender.model_parameters() 325 | if len(weights) == 0: 326 | weights = np.zeros(pairwise_interactions.shape[0]) 327 | 328 | # TODO: this logic is only correct if there are no additional user/item/rating features 329 | # Note that we discard the original data's user_ids and item_ids at this step 330 | user_indices = np.arange(params['max_num_users']) 331 | item_indices = np.arange(params['max_num_users'], 332 | params['max_num_users'] + params['max_num_items']) 333 | 334 | user_factors = pairwise_interactions[user_indices] 335 | user_bias = weights[user_indices] 336 | item_factors = pairwise_interactions[item_indices] 337 | item_bias = weights[item_indices] 338 | offset = global_bias 339 | params = json.dumps(recommender.hyperparameters) 340 | 341 | np.savez(model_file, user_factors=user_factors, user_bias=user_bias, 342 | item_factors=item_factors, item_bias=item_bias, offset=offset, 343 | params=params) 344 | 345 | return user_factors, user_bias, item_factors, item_bias, offset 346 | -------------------------------------------------------------------------------- /.pylintrc: -------------------------------------------------------------------------------- 1 | [MASTER] 2 | 3 | # A comma-separated list of package or module names from where C extensions may 4 | # be loaded. Extensions are loading into the active Python interpreter and may 5 | # run arbitrary code. 6 | extension-pkg-whitelist=numpy 7 | 8 | # Add files or directories to the blacklist. They should be base names, not 9 | # paths. 10 | ignore=autorec_lib,cfnade_lib,llorma_lib,experiment_scripts 11 | 12 | # Add files or directories matching the regex patterns to the blacklist. The 13 | # regex matches against base names, not paths. 14 | ignore-patterns= 15 | 16 | # Python code to execute, usually for sys.path manipulation such as 17 | # pygtk.require(). 18 | #init-hook= 19 | 20 | # Use multiple processes to speed up Pylint. 
Specifying 0 will auto-detect the 21 | # number of processors available to use. 22 | jobs=0 23 | 24 | # Control the amount of potential inferred values when inferring a single 25 | # object. This can help the performance when dealing with large functions or 26 | # complex, nested conditions. 27 | limit-inference-results=100 28 | # List of plugins (as comma separated values of python modules names) to load, 29 | # usually to register additional checkers. 30 | load-plugins= 31 | 32 | # Pickle collected data for later comparisons. 33 | persistent=yes 34 | 35 | # Specify a configuration file. 36 | #rcfile= 37 | 38 | # When enabled, pylint would attempt to guess common misconfiguration and emit 39 | # user-friendly hints instead of false-positive error messages. 40 | suggestion-mode=yes 41 | 42 | # Allow loading of arbitrary C extensions. Extensions are imported into the 43 | # active Python interpreter and may run arbitrary code. 44 | unsafe-load-any-extension=no 45 | 46 | 47 | [MESSAGES CONTROL] 48 | 49 | # Only show warnings with the listed confidence levels. Leave empty to show 50 | # all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED. 51 | confidence= 52 | 53 | # Disable the message, report, category or checker with the given id(s). You 54 | # can either give multiple identifiers separated by comma (,) or put this 55 | # option multiple times (only on the command line, not in the configuration 56 | # file where it should appear only once). You can also use "--disable=all" to 57 | # disable everything first and then reenable specific checks. For example, if 58 | # you want to run only the similarities checker, you can use "--disable=all 59 | # --enable=similarities". If you want to run only the classes checker, but have 60 | # no Warning level messages displayed, use "--disable=all --enable=classes 61 | # --disable=W". 62 | disable= 63 | 64 | # Enable the message, report, category or checker with the given id(s). You can 65 | # either give multiple identifier separated by comma (,) or put this option 66 | # multiple time (only on the command line, not in the configuration file where 67 | # it should appear only once). See also the "--disable" option for examples. 68 | enable=c-extension-no-member 69 | 70 | 71 | [REPORTS] 72 | 73 | # Python expression which should return a note less than 10 (10 is the highest 74 | # note). You have access to the variables errors warning, statement which 75 | # respectively contain the number of errors / warnings messages and the total 76 | # number of statements analyzed. This is used by the global evaluation report 77 | # (RP0004). 78 | evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10) 79 | 80 | # Template used to display messages. This is a python new-style format string 81 | # used to format the message information. See doc for all details. 82 | #msg-template= 83 | 84 | # Set the output format. Available formats are text, parseable, colorized, json 85 | # and msvs (visual studio). You can also give a reporter class, e.g. 86 | # mypackage.mymodule.MyReporterClass. 87 | output-format=text 88 | 89 | # Tells whether to display a full report or only the messages. 90 | reports=no 91 | 92 | # Activate the evaluation score. 93 | score=yes 94 | 95 | 96 | [REFACTORING] 97 | 98 | # Maximum number of nested blocks for function / method body 99 | max-nested-blocks=5 100 | 101 | # Complete name of functions that never returns. 
When checking for 102 | # inconsistent-return-statements if a never returning function is called then 103 | # it will be considered as an explicit return statement and no message will be 104 | # printed. 105 | never-returning-functions=sys.exit 106 | 107 | 108 | [SPELLING] 109 | 110 | # Limits count of emitted suggestions for spelling mistakes. 111 | max-spelling-suggestions=4 112 | 113 | # Spelling dictionary name. Available dictionaries: none. To make it working 114 | # install python-enchant package.. 115 | spelling-dict= 116 | 117 | # List of comma separated words that should not be checked. 118 | spelling-ignore-words= 119 | 120 | # A path to a file that contains private dictionary; one word per line. 121 | spelling-private-dict-file= 122 | 123 | # Tells whether to store unknown words to indicated private dictionary in 124 | # --spelling-private-dict-file option instead of raising a message. 125 | spelling-store-unknown-words=no 126 | 127 | 128 | [MISCELLANEOUS] 129 | 130 | # List of note tags to take in consideration, separated by a comma. 131 | notes= 132 | 133 | 134 | [TYPECHECK] 135 | 136 | # List of decorators that produce context managers, such as 137 | # contextlib.contextmanager. Add to this list to register other decorators that 138 | # produce valid context managers. 139 | contextmanager-decorators=contextlib.contextmanager 140 | 141 | # List of members which are set dynamically and missed by pylint inference 142 | # system, and so shouldn't trigger E1101 when accessed. Python regular 143 | # expressions are accepted. 144 | generated-members= 145 | 146 | # Tells whether missing members accessed in mixin class should be ignored. A 147 | # mixin class is detected if its name ends with "mixin" (case insensitive). 148 | ignore-mixin-members=yes 149 | 150 | # Tells whether to warn about missing members when the owner of the attribute 151 | # is inferred to be None. 152 | ignore-none=yes 153 | 154 | # This flag controls whether pylint should warn about no-member and similar 155 | # checks whenever an opaque object is returned when inferring. The inference 156 | # can return multiple potential results while evaluating a Python object, but 157 | # some branches might not be evaluated, which results in partial inference. In 158 | # that case, it might be useful to still emit no-member and other checks for 159 | # the rest of the inferred objects. 160 | ignore-on-opaque-inference=yes 161 | 162 | # List of class names for which member attributes should not be checked (useful 163 | # for classes with dynamically set attributes). This supports the use of 164 | # qualified names. 165 | ignored-classes=optparse.Values,thread._local,_thread._local 166 | 167 | # List of module names for which member attributes should not be checked 168 | # (useful for modules/projects where namespaces are manipulated during runtime 169 | # and thus existing member attributes cannot be deduced by static analysis. It 170 | # supports qualified module names, as well as Unix pattern matching. 171 | ignored-modules= numpy 172 | 173 | # Show a hint with possible names when a member name was not found. The aspect 174 | # of finding the hint is based on edit distance. 175 | missing-member-hint=yes 176 | 177 | # The minimum edit distance a name should have in order to be considered a 178 | # similar match for a missing member name. 179 | missing-member-hint-distance=1 180 | 181 | # The total number of similar names that should be taken in consideration when 182 | # showing a hint for a missing member. 
183 | missing-member-max-choices=1 184 | 185 | 186 | [BASIC] 187 | 188 | # Naming style matching correct argument names. 189 | argument-naming-style=snake_case 190 | 191 | # Regular expression matching correct argument names. Overrides argument- 192 | # naming-style. 193 | #argument-rgx= 194 | 195 | # Naming style matching correct attribute names. 196 | attr-naming-style=snake_case 197 | 198 | # Regular expression matching correct attribute names. Overrides attr-naming- 199 | # style. 200 | #attr-rgx= 201 | 202 | # Bad variable names which should always be refused, separated by a comma. 203 | bad-names=foo, 204 | bar, 205 | baz, 206 | toto, 207 | tutu, 208 | tata 209 | 210 | # Naming style matching correct class attribute names. 211 | class-attribute-naming-style=any 212 | 213 | # Regular expression matching correct class attribute names. Overrides class- 214 | # attribute-naming-style. 215 | #class-attribute-rgx= 216 | 217 | # Naming style matching correct class names. 218 | class-naming-style=PascalCase 219 | 220 | # Regular expression matching correct class names. Overrides class-naming- 221 | # style. 222 | #class-rgx= 223 | 224 | # Naming style matching correct constant names. 225 | const-naming-style=UPPER_CASE 226 | 227 | # Regular expression matching correct constant names. Overrides const-naming- 228 | # style. 229 | #const-rgx= 230 | 231 | # Minimum line length for functions/classes that require docstrings, shorter 232 | # ones are exempt. 233 | docstring-min-length=-1 234 | 235 | # Naming style matching correct function names. 236 | function-naming-style=snake_case 237 | 238 | # Regular expression matching correct function names. Overrides function- 239 | # naming-style. 240 | #function-rgx= 241 | 242 | # Good variable names which should always be accepted, separated by a comma. 243 | good-names=i, 244 | j, 245 | k, 246 | ex, 247 | Run, 248 | _, 249 | X, 250 | Y, 251 | Z, 252 | n, 253 | x, 254 | y, 255 | z 256 | 257 | # Include a hint for the correct naming format with invalid-name. 258 | include-naming-hint=no 259 | 260 | # Naming style matching correct inline iteration names. 261 | inlinevar-naming-style=any 262 | 263 | # Regular expression matching correct inline iteration names. Overrides 264 | # inlinevar-naming-style. 265 | #inlinevar-rgx= 266 | 267 | # Naming style matching correct method names. 268 | method-naming-style=snake_case 269 | 270 | # Regular expression matching correct method names. Overrides method-naming- 271 | # style. 272 | #method-rgx= 273 | 274 | # Naming style matching correct module names. 275 | module-naming-style=snake_case 276 | 277 | # Regular expression matching correct module names. Overrides module-naming- 278 | # style. 279 | #module-rgx= 280 | 281 | # Colon-delimited sets of names that determine each other's naming style when 282 | # the name regexes allow several styles. 283 | name-group= 284 | 285 | # Regular expression which should only match function or class names that do 286 | # not require a docstring. 287 | no-docstring-rgx=^_ 288 | 289 | # List of decorators that produce properties, such as abc.abstractproperty. Add 290 | # to this list to register other decorators that produce valid properties. 291 | # These decorators are taken in consideration only for invalid-name. 292 | property-classes=abc.abstractproperty 293 | 294 | # Naming style matching correct variable names. 295 | variable-naming-style=snake_case 296 | 297 | # Regular expression matching correct variable names. Overrides variable- 298 | # naming-style. 
299 | #variable-rgx= 300 | 301 | 302 | [VARIABLES] 303 | 304 | # List of additional names supposed to be defined in builtins. Remember that 305 | # you should avoid defining new builtins when possible. 306 | additional-builtins= 307 | 308 | # Tells whether unused global variables should be treated as a violation. 309 | allow-global-unused-variables=yes 310 | 311 | # List of strings which can identify a callback function by name. A callback 312 | # name must start or end with one of those strings. 313 | callbacks=cb_, 314 | _cb 315 | 316 | # A regular expression matching the name of dummy variables (i.e. expected to 317 | # not be used). 318 | dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_ 319 | 320 | # Argument names that match this expression will be ignored. Default to name 321 | # with leading underscore. 322 | ignored-argument-names=_.*|^ignored_|^unused_ 323 | 324 | # Tells whether we should check for unused import in __init__ files. 325 | init-import=no 326 | 327 | # List of qualified module names which can have objects that can redefine 328 | # builtins. 329 | redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io 330 | 331 | 332 | [SIMILARITIES] 333 | 334 | # Ignore comments when computing similarities. 335 | ignore-comments=yes 336 | 337 | # Ignore docstrings when computing similarities. 338 | ignore-docstrings=yes 339 | 340 | # Ignore imports when computing similarities. 341 | ignore-imports=no 342 | 343 | # Minimum lines number of a similarity. 344 | min-similarity-lines=4 345 | 346 | 347 | [FORMAT] 348 | 349 | # Expected format of line ending, e.g. empty (any line ending), LF or CRLF. 350 | expected-line-ending-format= 351 | 352 | # Regexp for a line that is allowed to be longer than the limit. 353 | ignore-long-lines=^\s*(# )?<?https?://\S+>?$ 354 | 355 | # Number of spaces of indent required inside a hanging or continued line. 356 | indent-after-paren=4 357 | 358 | # String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 359 | # tab). 360 | indent-string=' ' 361 | 362 | # Maximum number of characters on a single line. 363 | max-line-length=100 364 | 365 | # Maximum number of lines in a module. 366 | max-module-lines=1500 367 | 368 | # List of optional constructs for which whitespace checking is disabled. `dict- 369 | # separator` is used to allow tabulation in dicts, etc.: {1 : 1,\n222: 2}. 370 | # `trailing-comma` allows a space between comma and closing bracket: (a, ). 371 | # `empty-line` allows space-only lines. 372 | no-space-check=trailing-comma, 373 | dict-separator 374 | 375 | # Allow the body of a class to be on the same line as the declaration if body 376 | # contains single statement. 377 | single-line-class-stmt=no 378 | 379 | # Allow the body of an if to be on the same line as the test if there is no 380 | # else. 381 | single-line-if-stmt=no 382 | 383 | 384 | [LOGGING] 385 | 386 | # Format style used to check logging format string. `old` means using % 387 | # formatting, while `new` is for `{}` formatting. 388 | logging-format-style=old 389 | 390 | # Logging modules to check that the string format arguments are in logging 391 | # function parameter format. 392 | logging-modules=logging 393 | 394 | 395 | [IMPORTS] 396 | 397 | # Allow wildcard imports from modules that define __all__. 398 | allow-wildcard-with-all=no 399 | 400 | # Analyse import fallback blocks.
This can be used to support both Python 2 and 401 | # 3 compatible code, which means that the block might have code that exists 402 | # only in one or another interpreter, leading to false positives when analysed. 403 | analyse-fallback-blocks=no 404 | 405 | # Deprecated modules which should not be used, separated by a comma. 406 | deprecated-modules=optparse,tkinter.tix 407 | 408 | # Create a graph of external dependencies in the given file (report RP0402 must 409 | # not be disabled). 410 | ext-import-graph= 411 | 412 | # Create a graph of every (i.e. internal and external) dependencies in the 413 | # given file (report RP0402 must not be disabled). 414 | import-graph= 415 | 416 | # Create a graph of internal dependencies in the given file (report RP0402 must 417 | # not be disabled). 418 | int-import-graph= 419 | 420 | # Force import order to recognize a module as part of the standard 421 | # compatibility libraries. 422 | known-standard-library= 423 | 424 | # Force import order to recognize a module as part of a third party library. 425 | known-third-party=enchant 426 | 427 | 428 | [DESIGN] 429 | 430 | # Maximum number of arguments for function / method. 431 | max-args=20 432 | 433 | # Maximum number of attributes for a class (see R0902). 434 | max-attributes=20 435 | 436 | # Maximum number of boolean expressions in an if statement. 437 | max-bool-expr=5 438 | 439 | # Maximum number of branch for function / method body. 440 | max-branches=12 441 | 442 | # Maximum number of locals for function / method body. 443 | max-locals=20 444 | 445 | # Maximum number of parents for a class (see R0901). 446 | max-parents=7 447 | 448 | # Maximum number of public methods for a class (see R0904). 449 | max-public-methods=20 450 | 451 | # Maximum number of return / yield for function / method body. 452 | max-returns=6 453 | 454 | # Maximum number of statements in function / method body. 455 | max-statements=50 456 | 457 | # Minimum number of public methods for a class (see R0903). 458 | min-public-methods=2 459 | 460 | 461 | [CLASSES] 462 | 463 | # List of method names used to declare (i.e. assign) instance attributes. 464 | defining-attr-methods=__init__, 465 | __new__, 466 | setUp 467 | 468 | # List of member names, which should be excluded from the protected access 469 | # warning. 470 | exclude-protected=_asdict, 471 | _fields, 472 | _replace, 473 | _source, 474 | _make 475 | 476 | # List of valid names for the first argument in a class method. 477 | valid-classmethod-first-arg=cls 478 | 479 | # List of valid names for the first argument in a metaclass class method. 480 | valid-metaclass-classmethod-first-arg=cls 481 | 482 | 483 | [EXCEPTIONS] 484 | 485 | # Exceptions that will emit a warning when being caught. Defaults to 486 | # "Exception". 
487 | overgeneral-exceptions=Exception 488 | 489 | # Set the linting for string quotes 490 | string-quote=single 491 | triple-quote=double 492 | docstring-quote=double 493 | -------------------------------------------------------------------------------- /reclab/data_utils.py: -------------------------------------------------------------------------------- 1 | """A utility module for loading and manipulating various datasets.""" 2 | import collections 3 | import os 4 | import urllib.request 5 | import zipfile 6 | 7 | import numpy as np 8 | import pandas as pd 9 | import scipy.sparse 10 | 11 | DATA_DIR = os.environ.get('RECLAB_DATA_PATH') 12 | if DATA_DIR is None: 13 | DATA_DIR = os.path.dirname(__file__) 14 | 15 | def read_dataset(name, shuffle=True, seed=0): 16 | """Read a dataset as specified by name. 17 | 18 | Parameters 19 | ---------- 20 | name : str 21 | The name of the dataset. Must be one of: 'ml-100k', 'ml-10m', 'citeulike-a', 22 | 'pinterest', or 'lastfm'. 23 | shuffle : bool, optional 24 | A flag to indicate whether the dataset should be shuffled after loading, 25 | true by default. 26 | 27 | Returns 28 | ------- 29 | users : dict 30 | The dict of all users where the key is the user-id and the value is the user's features. 31 | items : dict 32 | The dict of all items where the key is the item-id and the value is the item's features. 33 | ratings : dict 34 | The dict of all ratings where the key is a tuple whose first element is the user-id 35 | and whose second element is the item id. The value is a tuple whose first element is the 36 | rating value and whose second element is the rating context (in this case an empty array). 37 | 38 | """ 39 | data = get_data(name) 40 | 41 | return dataset_from_dataframe(data, shuffle=shuffle, seed=seed) 42 | 43 | def dataset_from_dataframe(data, shuffle=True, seed=0): 44 | """Read a dataset as specified by name. 45 | 46 | Parameters 47 | ---------- 48 | data : dataframe 49 | The dataset, with columns user_id, item_id, and rating 50 | shuffle : bool, optional 51 | A flag to indicate whether the dataset should be shuffled after loading, 52 | true by default. 53 | 54 | Returns 55 | ------- 56 | users : dict 57 | The dict of all users where the key is the user-id and the value is the user's features. 58 | items : dict 59 | The dict of all items where the key is the item-id and the value is the item's features. 60 | ratings : dict 61 | The dict of all ratings where the key is a tuple whose first element is the user-id 62 | and whose second element is the item id. The value is a tuple whose first element is the 63 | rating value and whose second element is the rating context (in this case an empty array). 64 | 65 | """ 66 | 67 | if shuffle: 68 | data = data.sample(frac=1, random_state=seed).reset_index(drop=True) 69 | 70 | users = {user_id: np.zeros(0) for user_id in np.unique(data['user_id'])} 71 | items = {item_id: np.zeros(0) for item_id in np.unique(data['item_id'])} 72 | 73 | # Fill the rating array with initial data. 74 | ratings = {} 75 | for user_id, item_id, rating in zip(data['user_id'], data['item_id'], data['rating']): 76 | # TODO: may want to eventually a rating context depending on dataset (e.g. time) 77 | ratings[user_id, item_id] = (rating, np.zeros(0)) 78 | 79 | return users, items, ratings 80 | 81 | 82 | def read_bandit_dataset(name): 83 | """Read a bandit dataset as specified by name. 84 | 85 | Parameters 86 | ---------- 87 | name : str 88 | The name of the dataset. Must be one of: 'wiki10-31k'. 
89 | 90 | Returns 91 | ------- 92 | features : scipy.sparse.dok_matrix 93 | The features at each timestep. 94 | ratings : scipy.sparse.dok_matrix 95 | The ratings at each timestep. 96 | 97 | """ 98 | if name == 'wiki10-31k': 99 | with open_zipped(zipped_dir_name='wiki10-31k', 100 | data_name='features.npz', 101 | data_url='https://kkrauth.s3-us-west-2.amazonaws.com/wiki10-31k.zip', 102 | mode='rb') as feature_file: 103 | features = scipy.sparse.load_npz(feature_file).tocsr() 104 | 105 | with open_zipped(zipped_dir_name='wiki10-31k', 106 | data_name='ratings.npz', 107 | data_url='https://kkrauth.s3-us-west-2.amazonaws.com/wiki10-31k.zip', 108 | mode='rb') as ratings_file: 109 | ratings = scipy.sparse.load_npz(ratings_file).tocsr() 110 | else: 111 | raise ValueError('Dataset name not recognized.') 112 | 113 | return features, ratings 114 | 115 | 116 | def split_ratings(ratings, proportion, shuffle=False, seed=None): 117 | """Split a group of ratings into two groups. 118 | 119 | Parameters 120 | ---------- 121 | ratings : dict 122 | The ratings to split. 123 | proportion : float 124 | The proportion of ratings that will be in the first group. Must be between 0 and 1. 125 | shuffle : bool 126 | Whether to shuffle the rating data. 127 | 128 | Returns 129 | ------- 130 | ratings_1 : OrderedDict 131 | The first set of ratings. 132 | ratings_2 : OrderedDict 133 | The second set of ratings. 134 | 135 | """ 136 | split_1 = collections.OrderedDict() 137 | split_2 = collections.OrderedDict() 138 | split_1_end = int(proportion * len(ratings)) 139 | iterator = list(ratings.items()) 140 | 141 | if shuffle: 142 | if seed is not None: 143 | np.random.seed(seed) 144 | np.random.shuffle(iterator) 145 | 146 | for i, (key, val) in enumerate(iterator): 147 | if i < split_1_end: 148 | split_1[key] = val 149 | else: 150 | split_2[key] = val 151 | 152 | return split_1, split_2 153 | 154 | 155 | def read_zipped_csv(zipped_dir_name, data_name, data_url, csv_params): 156 | """Locate or download zipped file and load csv into DataFrame. 157 | 158 | Parameters 159 | ---------- 160 | zipped_dir_name : str 161 | The directory within the downloaded zip. 162 | data_name : str 163 | The name of the data file to be loaded from the directory. 164 | data_url : str 165 | The location of the download. 166 | csv_params : str 167 | Parameters for loading csv into DataFrame. 168 | 169 | Returns 170 | ------- 171 | data : DataFrame 172 | Dataset of interest. 173 | 174 | """ 175 | data_file = os.path.join(DATA_DIR, zipped_dir_name, data_name) 176 | fetch_zip(zipped_dir_name, data_url) 177 | return pd.read_csv(data_file, **csv_params) 178 | 179 | 180 | def open_zipped(zipped_dir_name, data_name, data_url, mode): 181 | """Download a zipped file and open it. 182 | 183 | Parameters 184 | ---------- 185 | zipped_dir_name : str 186 | The directory within the downloaded zip. 187 | data_name : str 188 | The name of the data file to be loaded from the directory. 189 | data_url : str 190 | The location of the download. 191 | mode: str 192 | The mode to open the file in. 193 | 194 | Returns 195 | ------- 196 | file : file 197 | The file of interest. 198 | 199 | """ 200 | data_file = os.path.join(DATA_DIR, zipped_dir_name, data_name) 201 | fetch_zip(zipped_dir_name, data_url) 202 | return open(data_file, mode) 203 | 204 | 205 | def fetch_zip(zipped_dir_name, data_url): 206 | """Download a zipped directory and extract it. 207 | 208 | Parameters 209 | ---------- 210 | zipped_dir_name : str 211 | The directory within the downloaded zip. 
212 | data_url : str 213 | The location of the download. 214 | 215 | """ 216 | data_dir = os.path.join(DATA_DIR, zipped_dir_name) 217 | if not os.path.isdir(data_dir): 218 | os.makedirs(DATA_DIR, exist_ok=True) 219 | 220 | download_location = os.path.join('{}.zip'.format(data_dir)) 221 | urllib.request.urlretrieve(data_url, 222 | filename=download_location) 223 | with zipfile.ZipFile(download_location, 'r') as zip_ref: 224 | zip_ref.extractall(DATA_DIR) 225 | os.remove(download_location) 226 | 227 | 228 | def find_npz(dir_name, data_name, data_url, np_params): 229 | """Locate or download npz file and load into DataFrame. 230 | 231 | Parameters 232 | ---------- 233 | dir_name : str 234 | The directory to put the .npz file. 235 | data_name : str 236 | The name of the .npz file. 237 | data_url : str 238 | The location of the download. 239 | csv_params : str 240 | Parameters for loading the numpy array into DataFrame. 241 | 242 | Returns 243 | ------- 244 | data : DataFrame 245 | Dataset of interest. 246 | 247 | """ 248 | download_dir = os.path.join(DATA_DIR, dir_name) 249 | datafile = os.path.join(download_dir, data_name) 250 | if not os.path.isfile(datafile): 251 | os.makedirs(download_dir, exist_ok=True) 252 | urllib.request.urlretrieve(data_url, filename=datafile) 253 | data_np = np.load(datafile, allow_pickle=True)['train_data'] 254 | data = pd.DataFrame(data_np, **np_params) 255 | # TODO: deal better with implicit ratings 256 | data['rating'] = 1 257 | return data 258 | 259 | def find_txt(dir_name, data_name, data_url, csv_params): 260 | """Locate or download txt file and load into DataFrame. 261 | 262 | Parameters 263 | ---------- 264 | dir_name : str 265 | The directory to put the .txt file. 266 | data_name : str 267 | The name of the .txt file. 268 | data_url : str 269 | The location of the download. 270 | csv_params : str 271 | Parameters for loading the csv into DataFrame. 272 | 273 | Returns 274 | ------- 275 | data : DataFrame 276 | Dataset of interest. 277 | 278 | """ 279 | download_dir = os.path.join(DATA_DIR, dir_name) 280 | datafile = os.path.join(download_dir, data_name) 281 | if not os.path.isfile(datafile): 282 | os.makedirs(download_dir, exist_ok=True) 283 | urllib.request.urlretrieve(data_url, filename=datafile) 284 | data = pd.read_csv(datafile, **csv_params) 285 | return data 286 | 287 | 288 | def get_data(name, load_attributes=False): 289 | """Read a dataset specified by name into pandas dataframe. 290 | 291 | Parameters 292 | ---------- 293 | name : str 294 | The name of the dataset. Must be one of: 'ml-100k', 'ml-10m', 'ml-1m', 295 | 'citeulike-a', 'pinterest', or 'lastfm'. 296 | 297 | Returns 298 | ------- 299 | data : DataFrame 300 | Dataset of interest. 
301 | 302 | """ 303 | if name == 'ml-100k': 304 | zipped_dir_name = 'ml-100k' 305 | data_name = 'u.data' 306 | data_url = 'http://files.grouplens.org/datasets/movielens/ml-100k.zip' 307 | csv_params = dict(sep='\t', header=None, usecols=[0, 1, 2, 3], 308 | names=['user_id', 'item_id', 'rating', 'timestamp']) 309 | data = read_zipped_csv(zipped_dir_name, data_name, data_url, csv_params) 310 | if load_attributes: 311 | user_attributes = read_zipped_csv(zipped_dir_name, 'u.user', data_url, 312 | dict(sep='|', header=None, usecols=[0, 1, 2, 3, 4], 313 | names=['user_id', 'age', 'gender', 'occupation', 'zip_code'])) 314 | item_attributes = read_zipped_csv(zipped_dir_name, 'u.item', data_url, 315 | dict(sep='|', header=None, usecols=[0, 1, 2, 3, 4], encoding='latin-1', 316 | names=['item_id', 'title', 'release', 'video release', 'IMDb URL'])) 317 | data = (data, user_attributes, item_attributes) 318 | elif name == 'ml-10m': 319 | zipped_dir_name = 'ml-10M100K' 320 | data_name = 'ratings.dat' 321 | data_url = 'http://files.grouplens.org/datasets/movielens/ml-10m.zip' 322 | csv_params = dict(sep='::', header=None, usecols=[0, 1, 2, 3], 323 | names=['user_id', 'item_id', 'rating', 'timestamp'], engine='python') 324 | data = read_zipped_csv(zipped_dir_name, data_name, data_url, csv_params) 325 | if load_attributes: 326 | item_attributes = read_zipped_csv(zipped_dir_name, 'movies.dat', data_url, 327 | dict(sep='::', header=None, usecols=[0, 1, 2], 328 | names=['item_id', 'title', 'genre'])) 329 | data = (data, None, item_attributes) 330 | elif name == 'ml-1m': 331 | zipped_dir_name = 'ml-1m' 332 | data_name = 'ratings.dat' 333 | data_url = 'http://files.grouplens.org/datasets/movielens/ml-1m.zip' 334 | csv_params = dict(sep='::', header=None, usecols=[0, 1, 2, 3], 335 | names=['user_id', 'item_id', 'rating', 'timestamp'], engine='python') 336 | data = read_zipped_csv(zipped_dir_name, data_name, data_url, csv_params) 337 | if load_attributes: 338 | user_attributes = read_zipped_csv(zipped_dir_name, 'users.dat', data_url, 339 | dict(sep='::', header=None, usecols=[0, 1, 2, 3, 4], 340 | names=['user_id', 'gender', 'age', 'occupation', 'zip code'])) 341 | item_attributes = read_zipped_csv(zipped_dir_name, 'movies.dat', data_url, 342 | dict(sep='::', header=None, usecols=[0, 1, 2], 343 | names=['item_id', 'title', 'genre'])) 344 | data = (data,user_attributes, item_attributes) 345 | elif name == 'citeulike-a': 346 | dir_name = 'citeulike-a' 347 | data_name = 'data.npz' 348 | data_url = ('https://raw.githubusercontent.com/tebesu/CollaborativeMemoryNetwork/' 349 | 'master/data/citeulike-a.npz') 350 | np_params = dict(columns=['user_id', 'item_id']) 351 | data = find_npz(dir_name, data_name, data_url, np_params) 352 | # TODO: additional info on users or items? 353 | elif name == 'pinterest': 354 | dir_name = 'pinterest' 355 | data_name = 'data.npz' 356 | data_url = ('https://raw.githubusercontent.com/tebesu/CollaborativeMemoryNetwork/' 357 | 'master/data/pinterest.npz') 358 | np_params = dict(columns=['user_id', 'item_id']) 359 | data = find_npz(dir_name, data_name, data_url, np_params) 360 | # TODO: additional info on users or items? 
361 | elif name == 'lastfm-360k': 362 | dir_name = 'lastfm-360k' 363 | data_name = 'LastFM360k-Le75.txt' 364 | data_url = ('https://zenodo.org/record/3964506/files/LastFM360k-Le75.txt?download=1') 365 | csv_params = dict(sep=',', header=0, usecols=[0, 1, 2], 366 | names=['user_id', 'item_id', 'rating']) 367 | data = find_txt(dir_name, data_name, data_url, csv_params) 368 | # log transform for better scaling 369 | data['rating'] = np.log(1 + data['rating']) 370 | if load_attributes: 371 | item_attributes = find_txt(dir_name, 'LastFM360k-MB-artists.txt', 372 | 'https://zenodo.org/record/3964506/files/LastFM360k-MB-artists.txt?download=1', 373 | dict(sep='\t', header=0, usecols=[0, 1, 2], 374 | names=['item_id', 'artist_name', 'gender'])) 375 | data = (data, None, item_attributes) 376 | elif name == 'lastfm': 377 | data_name = 'lastfm-dataset-1K/lfm1k-play-counts.csv' 378 | csv_params = dict(header=0, usecols=[0, 1, 2], 379 | names=['user_id', 'item_id', 'rating']) 380 | datafile = os.path.join(DATA_DIR, data_name) 381 | try: 382 | data = pd.read_csv(datafile, **csv_params) 383 | # log transform for better scaling 384 | data['rating'] = np.log(1 + data['rating']) 385 | # TODO: remove artists with less than 50 total listens? 386 | # otherwise should probably retrain for hyperparameter tuning... 387 | except FileNotFoundError as error: 388 | print(('LastFM data must be downloaded and preprocessed locally, ' 389 | 'get files from https://drive.google.com/open?id=1qxmsQHe' 390 | 'D8O-81CbHxvaFP8omMvMxgEh0')) 391 | raise error 392 | else: 393 | raise ValueError('dataset name not recognized') 394 | return data 395 | 396 | 397 | def get_time_split_dataset(name, shuffle=True, binarize=False): 398 | """Get a time-based test/train split of a dataset as specified by name. 399 | 400 | Parameters 401 | ---------- 402 | name : str 403 | The name of the dataset. Must be one of: 'ml-100k', 'ml-10m', 'citeulike-a', 404 | 'pinterest', or 'lastfm'. 405 | shuffle : bool, optional 406 | A flag to indicate whether the dataset should be shuffled after loading, 407 | true by default. 408 | binarize : bool, optional 409 | A flag to indicate whether to binarize the ratings to be 0 or 1, 410 | false by default. 411 | 412 | Returns 413 | ------- 414 | users : dict 415 | The dict of all users where the key is the user-id and the value is the user's features. 416 | items : dict 417 | The dict of all items where the key is the item-id and the value is the item's features. 418 | train_ratings : dict 419 | The dict of all training ratings. 420 | test_ratings : dict 421 | The dict of all testing ratings. 422 | 423 | """ 424 | data = get_data(name) 425 | if binarize: 426 | data['rating'] = 1 427 | 428 | users = {user_id: np.zeros(0) for user_id in np.unique(data['user_id'])} 429 | items = {item_id: np.zeros(0) for item_id in np.unique(data['item_id'])} 430 | 431 | # Add final rating to test set 432 | test_idx = [] 433 | for uid in np.unique(data['user_id']): 434 | last_rating_idx = data[data['user_id'] == uid]['timestamp'].idxmax() 435 | test_idx.append(last_rating_idx) 436 | data_test = data.loc[test_idx] 437 | data_train = data.drop(test_idx) 438 | 439 | # Shuffle remaining data 440 | if shuffle: 441 | data_train = data_train.sample(frac=1).reset_index(drop=True) 442 | 443 | # Fill the rating array with initial data.
444 | train_ratings = {} 445 | for user_id, item_id, rating in zip(data_train['user_id'], data_train['item_id'], 446 | data_train['rating']): 447 | # TODO: may want to eventually add a rating context depending on dataset (e.g. time) 448 | train_ratings[user_id, item_id] = (rating, np.zeros(0)) 449 | 450 | # Fill the rating array with initial data. 451 | test_ratings = {} 452 | for user_id, item_id, rating in zip(data_test['user_id'], data_test['item_id'], 453 | data_test['rating']): 454 | # TODO: may want to eventually add a rating context depending on dataset (e.g. time) 455 | test_ratings[user_id, item_id] = (rating, np.zeros(0)) 456 | 457 | return users, items, train_ratings, test_ratings 458 | --------------------------------------------------------------------------------
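A minimal usage sketch (not a file in the repository): the loaders above return plain dicts, so a basic offline experiment with MovieLens 100K only needs a download plus a split. This assumes reclab is installed so that reclab.data_utils is importable and that the ml-100k download succeeds; the 80/20 proportion and the seeds are illustrative.

from reclab.data_utils import read_dataset, split_ratings

# users/items map ids to (currently empty) feature arrays; ratings maps
# (user_id, item_id) pairs to (rating_value, rating_context) tuples.
users, items, ratings = read_dataset('ml-100k', shuffle=True, seed=0)

# Hold out 20% of the ratings for evaluation; both halves keep the same
# key/value structure, so they can be fed to a recommender unchanged.
train_ratings, test_ratings = split_ratings(ratings, 0.8, shuffle=True, seed=0)
print(len(users), len(items), len(train_ratings), len(test_ratings))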
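For a time-aware evaluation, get_time_split_dataset instead holds out each user's final rating (by timestamp) as the test set. A sketch under the same assumptions; it only applies to datasets whose ratings carry a timestamp column, such as the MovieLens variants.

from reclab.data_utils import get_time_split_dataset

# Each user's last-timestamped rating goes to test_ratings; the remaining
# ratings (optionally shuffled) go to train_ratings.
users, items, train_ratings, test_ratings = get_time_split_dataset('ml-100k', shuffle=True)
print(len(train_ratings), len(test_ratings))  # expect one held-out rating per user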