├── reclab ├── recommenders │ ├── llorma │ │ ├── llorma_lib │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── train_utils.py │ │ │ └── anchor.py │ │ ├── __init__.py │ │ └── llorma.py │ ├── cfnade │ │ ├── __init__.py │ │ ├── cfnade_lib │ │ │ ├── nade.py │ │ │ └── utils.py │ │ └── cfnade.py │ ├── autorec │ │ ├── __init__.py │ │ ├── autorec_lib │ │ │ └── autorec.py │ │ └── autorec.py │ ├── __init__.py │ ├── top_pop.py │ ├── baseline.py │ ├── README.md │ ├── sparse.py │ ├── knn_recommender.py │ └── libfm.py ├── __init__.py ├── environments │ ├── __init__.py │ ├── fixed_rating.py │ ├── contextual.py │ ├── README.md │ ├── beta_rank.py │ ├── schmit.py │ ├── registry.py │ ├── topics.py │ └── latent_factors.py └── data_utils.py ├── MANIFEST.in ├── setup.cfg ├── tests ├── __init__.py ├── test_ease.py ├── test_slim.py ├── test_simple_example.py ├── test_knn.py ├── test_cfnade.py ├── test_contextual.py ├── test_autorec.py ├── test_llorma.py ├── test_top_pop.py ├── test_beta_rank.py ├── test_fixed.py ├── test_libfm.py ├── utils.py └── test_topics.py ├── figures └── RecSys.png ├── models └── ml-100k │ └── fm_model.npz ├── update_docs.sh ├── .gitignore ├── lint.sh ├── LICENSE.txt ├── .travis.yml ├── requirements.txt ├── setup.py ├── README.md └── .pylintrc /reclab/recommenders/llorma/llorma_lib/README.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include data/ml-100k-model/fm_model.npz 2 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md 3 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | """A set of tests for both recommenders and environments.""" 2 | -------------------------------------------------------------------------------- /figures/RecSys.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/berkeley-reclab/RecLab/HEAD/figures/RecSys.png -------------------------------------------------------------------------------- /models/ml-100k/fm_model.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/berkeley-reclab/RecLab/HEAD/models/ml-100k/fm_model.npz -------------------------------------------------------------------------------- /update_docs.sh: -------------------------------------------------------------------------------- 1 | pdoc --force --html --output-dir docs reclab 2 | mv -r docs/reclab/* ../berkeley-reclab.github.io/docs/ 3 | -------------------------------------------------------------------------------- /reclab/__init__.py: -------------------------------------------------------------------------------- 1 | """This package contains environments and models for recommendation.""" 2 | from .environments import make 3 | -------------------------------------------------------------------------------- /reclab/recommenders/cfnade/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | The package for the Cfnade recommender. 3 | 4 | See https://arxiv.org/abs/1605.09477 for details. 
5 | """ 6 | from .cfnade import Cfnade 7 | -------------------------------------------------------------------------------- /reclab/recommenders/autorec/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | The package for the Autorec recommender. 3 | 4 | See https://doi.org/10.1145/2740908.2742726 for details. 5 | """ 6 | from .autorec import Autorec 7 | -------------------------------------------------------------------------------- /reclab/recommenders/llorma/__init__.py: -------------------------------------------------------------------------------- 1 | """The package for the Global LLORMA recommender. 2 | 3 | Code modified from https://github.com/JoonyoungYi/LLORMA-tensorflow 4 | """ 5 | from .llorma import Llorma 6 | -------------------------------------------------------------------------------- /reclab/recommenders/llorma/llorma_lib/__init__.py: -------------------------------------------------------------------------------- 1 | """ Init 2 | """ 3 | from .llorma_g import Llorma, LocalModel, BatchManager 4 | from .anchor import AnchorManager 5 | from .train_utils import init_latent_mat, init_session, get_train_op 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /**/__pycache__/ 2 | 3 | models 4 | train.libfm 5 | test.libfm 6 | predictions 7 | 8 | .ipynb_checkpoints 9 | *.swo 10 | *.swp 11 | *.egg-info 12 | 13 | results 14 | experiments/data 15 | experiments/results 16 | 17 | .coverage 18 | .venv 19 | model.h5 20 | dist/ 21 | -------------------------------------------------------------------------------- /reclab/environments/__init__.py: -------------------------------------------------------------------------------- 1 | """The package that contains all environments.""" 2 | from .beta_rank import BetaRank 3 | from .contextual import Contextual 4 | from .environment import DictEnvironment 5 | from .environment import Environment 6 | from .fixed_rating import FixedRating 7 | from .latent_factors import LatentFactorBehavior, DatasetLatentFactor 8 | from .registry import make 9 | from .schmit import Schmit 10 | from .topics import Topics 11 | -------------------------------------------------------------------------------- /lint.sh: -------------------------------------------------------------------------------- 1 | pylint --rcfile=.pylintrc reclab -f parseable -r n --load-plugins pylint_quotes 2 | pycodestyle reclab --max-line-length=100 --exclude=reclab/recommenders/autorec/autorec_lib,reclab/recommenders/cfnade/cfnade_lib,reclab/recommenders/llorma/llorma_lib 3 | pydocstyle reclab --match-dir="^(?!autorec_lib|cfnade_lib|llorma_lib).*" 4 | pylint --rcfile=.pylintrc tests -f parseable -r n --load-plugins pylint_quotes 5 | pycodestyle tests --max-line-length=100 6 | pydocstyle tests 7 | -------------------------------------------------------------------------------- /tests/test_ease.py: -------------------------------------------------------------------------------- 1 | """Tests for the EASE recommender.""" 2 | from reclab.recommenders import EASE 3 | from . 
import utils 4 | 5 | 6 | def test_predict(): 7 | """Test that EASE predicts well and that it gets better with more data.""" 8 | recommender = EASE(lam=100, binarize=True) 9 | utils.test_binary_recommend_ml100k(recommender, 0.1) 10 | 11 | 12 | def test_recommend(): 13 | """Test that EASE will recommend reasonable items.""" 14 | recommender = EASE(lam=100) 15 | utils.test_recommend_simple(recommender) 16 | -------------------------------------------------------------------------------- /tests/test_slim.py: -------------------------------------------------------------------------------- 1 | """Tests for the SLIM recommender.""" 2 | from reclab.recommenders import SLIM 3 | from . import utils 4 | 5 | 6 | def test_predict(): 7 | """Test that SLIM predicts well and that it gets better with more data.""" 8 | recommender = SLIM(alpha=0.1, l1_ratio=1e-3, seed=0) 9 | utils.test_binary_recommend_ml100k(recommender, 0.1) 10 | 11 | 12 | def test_recommend(): 13 | """Test that SLIM will recommend reasonable items.""" 14 | recommender = SLIM(alpha=0.1, l1_ratio=1e-3, seed=0) 15 | utils.test_recommend_simple(recommender) 16 | -------------------------------------------------------------------------------- /reclab/recommenders/__init__.py: -------------------------------------------------------------------------------- 1 | """A set of recommender to be used in conjunction with environments.""" 2 | from .baseline import RandomRec 3 | from .baseline import PerfectRec 4 | from .knn_recommender import KNNRecommender 5 | from .recommender import Recommender 6 | from .recommender import PredictRecommender 7 | from .top_pop import TopPop 8 | 9 | try: 10 | from .autorec import Autorec 11 | from .cfnade import Cfnade 12 | from .libfm import LibFM 13 | from .llorma import Llorma 14 | from .sparse import SLIM, EASE 15 | except ImportError: 16 | pass 17 | -------------------------------------------------------------------------------- /tests/test_simple_example.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test the basic example found in the README 3 | """ 4 | import numpy as np 5 | import reclab 6 | 7 | 8 | def test_basic_example(): 9 | """Test the basic example in the README.""" 10 | env = reclab.make('topics-dynamic-v1') 11 | items, users, ratings = env.reset() 12 | for i in range(1): 13 | online_users = env.online_users 14 | # Your recommendation algorithm here. This recommends 10 random items to each online user. 15 | recommendations = np.random.choice(list(items), size=(len(online_users), 10)) 16 | _, _, ratings, info = env.step(recommendations) 17 | env.close() 18 | -------------------------------------------------------------------------------- /tests/test_knn.py: -------------------------------------------------------------------------------- 1 | """Tests for the KNN recommender.""" 2 | from reclab.recommenders import KNNRecommender 3 | from . 
import utils 4 | 5 | 6 | def test_user_predict(): 7 | """Test that KNN-user predicts well and that it gets better with more data.""" 8 | recommender = KNNRecommender(user_based=True) 9 | utils.test_predict_ml100k(recommender, rmse_threshold=1.1, test_dense=True) 10 | 11 | 12 | def test_item_predict(): 13 | """Test that KNN-user predicts well and that it gets better with more data.""" 14 | recommender = KNNRecommender(user_based=False, shrinkage=0.1) 15 | utils.test_predict_ml100k(recommender, rmse_threshold=1.5, test_dense=True) 16 | 17 | 18 | def test_user_recommend(): 19 | """Test that KNN-item will recommend reasonable items.""" 20 | recommender = KNNRecommender(user_based=True) 21 | utils.test_recommend_simple(recommender) 22 | 23 | 24 | def test_item_recommend(): 25 | """Test that KNN-item will recommend reasonable items.""" 26 | recommender = KNNRecommender(user_based=True) 27 | utils.test_recommend_simple(recommender) 28 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Karl Krauth 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | 3 | matrix: 4 | include: 5 | - os: osx 6 | language: generic 7 | env: PYTHON_VERSION=3.8 8 | - os: linux 9 | dist: xenial 10 | python: 3.8 11 | env: PYTHON_VERSION=3.8 12 | 13 | # Command to install dependencies 14 | install: 15 | - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then 16 | wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh; 17 | else 18 | wget http://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -O miniconda.sh; 19 | fi 20 | - bash miniconda.sh -b -p $HOME/miniconda 21 | - export PATH="$HOME/miniconda/bin:$PATH" 22 | - hash -r 23 | - conda config --set always_yes yes --set changeps1 no 24 | - conda update -q conda 25 | - conda info -a 26 | - conda create -q -n test-environment python=$PYTHON_VERSION 27 | - source activate test-environment 28 | - pip --version 29 | - pip install -r requirements.txt 30 | 31 | # Command to run tests 32 | script: 33 | - bash lint.sh 34 | - pytest --durations=0 --cov=reclab tests 35 | after_success: 36 | - coveralls 37 | notifications: 38 | email: false 39 | -------------------------------------------------------------------------------- /tests/test_cfnade.py: -------------------------------------------------------------------------------- 1 | """Tests for the CFNADE recommender.""" 2 | from reclab.recommenders.cfnade import Cfnade 3 | from . import utils 4 | 5 | 6 | def test_cfnade_predict(): 7 | """Test that CFNADE predicts well and that it gets better with more data.""" 8 | recommender = Cfnade(num_users=utils.NUM_USERS_ML100K, 9 | num_items=utils.NUM_ITEMS_ML100K, 10 | batch_size=64, 11 | train_epoch=10, 12 | rating_bucket=5, 13 | hidden_dim=250, 14 | learning_rate=0.001, 15 | random_seed=0) 16 | utils.test_predict_ml100k(recommender, rmse_threshold=1.2) 17 | 18 | 19 | def test_cfnade_recommend(): 20 | """Test that CFNADE will recommend reasonable items.""" 21 | recommender = Cfnade(num_users=utils.NUM_USERS_SIMPLE, 22 | num_items=utils.NUM_ITEMS_SIMPLE, 23 | batch_size=1, 24 | train_epoch=10, 25 | rating_bucket=5, 26 | hidden_dim=250, 27 | learning_rate=0.001, 28 | random_seed=0) 29 | utils.test_recommend_simple(recommender) 30 | -------------------------------------------------------------------------------- /tests/test_contextual.py: -------------------------------------------------------------------------------- 1 | """Tests for the Contextual environment.""" 2 | import numpy as np 3 | 4 | from reclab.environments import Contextual 5 | 6 | 7 | def test_contextual_wiki(): 8 | """Test contextual instantiated with Wiki10-31k.""" 9 | env = Contextual('wiki10-31k') 10 | assert env.name == 'contextual' 11 | users, items, ratings = env.reset() 12 | 13 | # Test that the users and items have empty features. 14 | assert users[0].shape == (0,) 15 | assert items[0].shape == (0,) 16 | 17 | # Test that contexts have a given size. 18 | assert env.online_users[0].shape == (101938,) 19 | context = env.online_users[0] 20 | 21 | # Test the number of users and items. 22 | assert len(env.online_users) == 1 23 | assert len(users) == 1 24 | assert len(items) == 30938 25 | 26 | # Recommend item 0, we should a new user and no new items. 
27 | users, items, ratings, _ = env.step(np.array([[0]])) 28 | assert len(users) == 1 29 | assert 1 in users 30 | assert len(items) == 0 31 | 32 | # The first user should have left. 33 | assert 0 not in env.users 34 | 35 | # We should only have received one rating of 0. 36 | assert len(ratings) == 1 37 | assert ratings[(0, 0)][0] == 0.0 38 | assert np.array_equal(ratings[(0, 0)][1], context) 39 | -------------------------------------------------------------------------------- /tests/test_autorec.py: -------------------------------------------------------------------------------- 1 | """Tests for the Autorec recommender.""" 2 | from reclab.recommenders import Autorec 3 | from . import utils 4 | 5 | 6 | def test_predict(): 7 | """Test that Autorec predicts well and that it gets better with more data.""" 8 | recommender = Autorec(utils.NUM_USERS_ML100K, 9 | utils.NUM_ITEMS_ML100K, 10 | hidden_neuron=500, 11 | lambda_value=20, 12 | train_epoch=50, 13 | batch_size=20, 14 | grad_clip=False, 15 | base_lr=1e-4, 16 | random_seed=0) 17 | utils.test_predict_ml100k(recommender, rmse_threshold=1.3) 18 | 19 | 20 | def test_recommend(): 21 | """Test that Autorec will recommend reasonable items.""" 22 | recommender = Autorec(utils.NUM_USERS_SIMPLE, 23 | utils.NUM_ITEMS_SIMPLE, 24 | hidden_neuron=500, 25 | lambda_value=20, 26 | train_epoch=1000, 27 | batch_size=20, 28 | grad_clip=False, 29 | base_lr=1e-4, 30 | random_seed=0) 31 | utils.test_recommend_simple(recommender) 32 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py==0.9.0 2 | astroid==2.4.2 3 | astunparse==1.6.3 4 | attrs==19.3.0 5 | cachetools==4.1.1 6 | certifi==2020.6.20 7 | chardet==3.0.4 8 | coveralls==2.1.2 9 | future==0.18.2 10 | gast==0.3.3 11 | google-auth==1.19.1 12 | google-auth-oauthlib==0.4.1 13 | google-pasta==0.2.0 14 | grpcio==1.30.0 15 | h5py==2.10.0 16 | idna==2.10 17 | importlib-metadata==1.7.0 18 | isort==4.3.21 19 | joblib==0.16.0 20 | Keras==2.4.3 21 | Keras-Preprocessing==1.1.2 22 | lazy-object-proxy==1.4.3 23 | Markdown==3.2.2 24 | mccabe==0.6.1 25 | more-itertools==8.4.0 26 | numpy==1.19.0 27 | oauthlib==3.1.0 28 | opt-einsum==3.2.1 29 | packaging==20.4 30 | pandas==1.0.5 31 | pluggy==0.13.1 32 | protobuf==3.15.0 33 | py==1.10.0 34 | pyasn1==0.4.8 35 | pyasn1-modules==0.2.8 36 | pybind11==2.5.0 37 | pycodestyle==2.6.0 38 | pydocstyle==5.0.2 39 | pylint==2.5.3 40 | pylint-quotes==0.2.1 41 | pyparsing==2.4.7 42 | pytest==5.4.3 43 | pytest-cov==2.10.1 44 | pytest-mock==3.3.0 45 | python-dateutil==2.8.1 46 | pytz==2020.1 47 | PyYAML==5.4 48 | requests==2.24.0 49 | requests-oauthlib==1.3.0 50 | rsa==4.7 51 | scikit-learn==0.23.1 52 | scipy==1.4.1 53 | six==1.15.0 54 | sklearn==0.0 55 | snowballstemmer==2.0.0 56 | tensorboard==2.2.2 57 | tensorboard-plugin-wit==1.7.0 58 | tensorflow>=2.2.1 59 | tensorflow-estimator==2.2.0 60 | termcolor==1.1.0 61 | threadpoolctl==2.1.0 62 | toml==0.10.1 63 | torch==1.5.1 64 | typed-ast==1.4.1 65 | urllib3==1.26.5 66 | wcwidth==0.2.5 67 | Werkzeug==1.0.1 68 | wpyfm==0.1.9 69 | wrapt==1.12.1 70 | zipp==3.1.0 71 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | 3 | 4 | setup( 5 | name='RecLab', 6 | version='0.1.2', 7 | author='Karl Krauth', 8 | author_email='karl.krauth@gmail.com', 9 
| description='A simulation framework for recommender systems.', 10 | license='MIT', 11 | download_url= 'https://github.com/berkeley-reclab/RecLab/archive/v0.1.2.tar.gz', 12 | packages=find_packages(), 13 | include_package_data=True, 14 | url='https://berkeley-reclab.github.io/', 15 | keywords=[ 16 | 'recommender', 17 | 'recommendation', 18 | 'simulation', 19 | 'evaluation' 20 | ], 21 | install_requires=[ 22 | 'numpy>=1.19.0', 23 | 'pandas>=1.0.5', 24 | 'scipy>=1.4.1', 25 | ], 26 | extras_require={ 27 | 'recommenders': [ 28 | 'keras>=2.4.3', 29 | 'scikit-learn>=0.23.1', 30 | 'tensorflow>=2.2.0', 31 | 'torch>=1.5.1', 32 | 'wpyfm>=0.1.9', 33 | ] 34 | }, 35 | tests_require=[ 36 | 'pytest>=5.4.3', 37 | 'pytest-mock>=3.3.0', 38 | ], 39 | python_requires='>=3.6', 40 | classifiers=[ 41 | 'Development Status :: 3 - Alpha', 42 | 'Intended Audience :: Science/Research', 43 | 'Topic :: Scientific/Engineering :: Artificial Intelligence', 44 | 'License :: OSI Approved :: MIT License', 45 | 'Programming Language :: Python :: 3', 46 | 'Programming Language :: Python :: 3.6', 47 | 'Programming Language :: Python :: 3.7', 48 | 'Programming Language :: Python :: 3.8', 49 | ], 50 | ) 51 | -------------------------------------------------------------------------------- /reclab/recommenders/top_pop.py: -------------------------------------------------------------------------------- 1 | """An implementation of the top popularity baseline recommender.""" 2 | 3 | import numpy as np 4 | import scipy.sparse 5 | 6 | from . import recommender 7 | 8 | 9 | # TODO: add flag to allow this to also be based on number of times rated. 10 | class TopPop(recommender.PredictRecommender): 11 | """The top popularity recommendation model based on ratings.""" 12 | 13 | @property 14 | def name(self): # noqa: D102 15 | return 'top-pop' 16 | 17 | @property 18 | def dense_predictions(self): # noqa: D102 19 | if self._dense_predictions is None: 20 | item_vector = self._average_item_ratings() 21 | self._dense_predictions = np.vstack([item_vector] * self._ratings.shape[0]) 22 | return self._dense_predictions 23 | 24 | def _average_item_ratings(self): 25 | # Compute average rating of each item 26 | row, col = self._ratings.nonzero() 27 | data = np.ones(len(row)) 28 | binary_ratings = scipy.sparse.csr_matrix((data, (row, col)), shape=self._ratings.shape) 29 | 30 | summed_item_ratings = self._ratings.sum(0) 31 | num_times_rated = binary_ratings.sum(0) 32 | 33 | item_vector = np.mean(self._ratings) * np.ones(num_times_rated.shape) 34 | idx_rated = np.where(num_times_rated > 0) 35 | item_vector[idx_rated] = summed_item_ratings[idx_rated] / num_times_rated[idx_rated] 36 | 37 | return item_vector.flatten() 38 | 39 | def _predict(self, user_item): # noqa: D102 40 | # Predict on all user-item pairs. 41 | average_item_ratings = self._average_item_ratings() 42 | predictions = [] 43 | for _, item_id, _ in user_item: 44 | predictions.append(average_item_ratings[item_id]) 45 | 46 | return np.array(predictions) 47 | -------------------------------------------------------------------------------- /tests/test_llorma.py: -------------------------------------------------------------------------------- 1 | """Tests for the LLORMA recommender.""" 2 | from reclab.recommenders.llorma import Llorma 3 | from . 
import utils 4 | 5 | 6 | def test_llorma_predict(): 7 | """Test that LLORMA predicts well and that it gets better with more data.""" 8 | recommender = Llorma(max_user=utils.NUM_USERS_ML100K, 9 | max_item=utils.NUM_ITEMS_ML100K, 10 | n_anchor=10, 11 | pre_rank=10, 12 | pre_learning_rate=3e-4, 13 | pre_lambda_val=0.01, 14 | pre_train_steps=70, 15 | rank=20, 16 | learning_rate=2e-2, 17 | lambda_val=1e-4, 18 | train_steps=50, 19 | batch_size=1000, 20 | use_cache=False, 21 | result_path='results', 22 | random_seed=0) 23 | utils.test_predict_ml100k(recommender, rmse_threshold=1.1) 24 | 25 | 26 | def test_llorma_recommend(): 27 | """Test that LLORMA will recommend reasonable items.""" 28 | recommender = Llorma(max_user=utils.NUM_USERS_ML100K, 29 | max_item=utils.NUM_ITEMS_ML100K, 30 | n_anchor=10, 31 | pre_rank=10, 32 | pre_learning_rate=3e-4, 33 | pre_lambda_val=0.01, 34 | pre_train_steps=70, 35 | rank=20, 36 | learning_rate=2e-2, 37 | lambda_val=1e-4, 38 | train_steps=50, 39 | batch_size=1000, 40 | use_cache=False, 41 | result_path='results', 42 | random_seed=0) 43 | utils.test_recommend_simple(recommender) -------------------------------------------------------------------------------- /reclab/recommenders/llorma/llorma_lib/train_utils.py: -------------------------------------------------------------------------------- 1 | """ LLORMA training utils 2 | """ 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | import math 8 | import tensorflow as tf 9 | 10 | 11 | def init_session(): 12 | """Initializes TF session 13 | 14 | Returns 15 | ------- 16 | obj: tf.Session 17 | Returns TF Session 18 | """ 19 | # gpu_options = tf.GPUOptions( 20 | # per_process_gpu_memory_fraction=GPU_MEMORY_FRAC) 21 | # gpu_config = tf.ConfigProto(gpu_options=gpu_options) 22 | # session = tf.Session(config=gpu_config) 23 | 24 | config = tf.compat.v1.ConfigProto() 25 | config.gpu_options.allow_growth = True 26 | 27 | session = tf.compat.v1.Session(config=config) 28 | session.run(tf.compat.v1.global_variables_initializer()) 29 | return session 30 | 31 | 32 | def get_train_op(optimizer, loss, var_list): 33 | """ Get a train operation 34 | 35 | Parameters 36 | ---------- 37 | optimizer : obj 38 | Valid TensorFlow optimizer, 39 | e.g. 
tf.train.GradientDescentOptimizer 40 | loss : obj 41 | TF variable 42 | var_list : obj 43 | List of TF variables 44 | """ 45 | gvs = optimizer.compute_gradients(loss, var_list=var_list) 46 | # capped_gvs = [(tf.clip_by_value(grad, -100.0, 100.0), var) 47 | # for grad, var in gvs] 48 | capped_gvs = gvs 49 | train_op = optimizer.apply_gradients(capped_gvs) 50 | return train_op 51 | 52 | 53 | def init_latent_mat(n, rank, mu_val, std_val): 54 | """Initialize a matrix for the latent factors 55 | 56 | Parameters 57 | ---------- 58 | n : int 59 | Number of user/items 60 | rank : int 61 | Size of the latent dimension 62 | mu_val : float 63 | Unscaled mean value 64 | std_val : float 65 | Unscaled standard deviation value 66 | """ 67 | _mu = math.sqrt(mu_val / rank) 68 | _std = math.sqrt((math.sqrt(mu_val * mu_val + std_val * std_val) - mu_val) / rank) 69 | return tf.Variable( 70 | tf.compat.v1.truncated_normal([n, rank], _mu, _std, dtype=tf.float64)) 71 | -------------------------------------------------------------------------------- /reclab/recommenders/autorec/autorec_lib/autorec.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | class AutoRec(torch.nn.Module): 4 | def __init__(self, num_users, num_items, 5 | seen_users, seen_items, 6 | hidden_neuron, 7 | dropout=0.05, random_seed=0): 8 | super(AutoRec, self).__init__() 9 | self.num_users = num_users 10 | self.num_items = num_items 11 | self.seen_users = seen_users 12 | self.seen_items = seen_items 13 | 14 | self.hidden_neuron = hidden_neuron 15 | self.random_seed = random_seed 16 | self.dropout_p = dropout 17 | self.sigmoid = torch.nn.Sigmoid() 18 | 19 | def loss(self, pred, test, mask, lambda_value=1): 20 | mse = (((pred * mask) - test) ** 2).sum() 21 | reg_value_enc = torch.mul(lambda_value / 2, list(self.encoder.parameters())[0].norm(p='fro') ** 2) 22 | reg_value_dec = torch.mul(lambda_value / 2, list(self.decoder.parameters())[0].norm(p='fro') ** 2) 23 | return torch.add(mse, torch.add(reg_value_enc, reg_value_dec)) 24 | 25 | def prepare_model(self): 26 | self.encoder = torch.nn.Linear(self.num_users, self.hidden_neuron, bias=True) 27 | self.dropout = torch.nn.Dropout(p=self.dropout_p) 28 | self.decoder = torch.nn.Linear(self.hidden_neuron, self.num_users, bias=True) 29 | 30 | def forward(self, x): 31 | x = self.encoder(x) 32 | x = self.sigmoid(x) 33 | x = self.dropout(x) 34 | x = self.decoder(x) 35 | return x 36 | 37 | def predict(self, user_item, test_data): 38 | users = [triple[0] for triple in user_item] 39 | items = [triple[1] for triple in user_item] 40 | 41 | user_item = zip(users, items) 42 | user_idx = set(users) 43 | item_idx = set(items) 44 | Estimated_R = self.forward(test_data) 45 | for item in range(test_data.shape[0]): 46 | for user in range(test_data.shape[1]): 47 | if user not in self.seen_users and item not in self.seen_items: 48 | Estimated_R[item, user] = 3 49 | idx = [tuple(users), tuple(items)] 50 | Estimated_R = Estimated_R.clamp(1, 5) 51 | return Estimated_R.T[idx].cpu().detach().numpy() 52 | -------------------------------------------------------------------------------- /tests/test_top_pop.py: -------------------------------------------------------------------------------- 1 | """Tests for the TopPop recommender.""" 2 | import collections 3 | 4 | import numpy as np 5 | 6 | from reclab.recommenders import TopPop 7 | 8 | 9 | def test_top_pop_one_step(): 10 | """Test a single recommendation step.""" 11 | users = {0: np.zeros((0,)), 12 | 1: np.zeros((0,)), 13 | 
2: np.zeros((0,))} 14 | items = {0: np.zeros((0,)), 15 | 1: np.zeros((0,)), 16 | 2: np.zeros((0,))} 17 | ratings = {(0, 0): (5, np.zeros((0,))), 18 | (0, 1): (4, np.zeros((0,))), 19 | (1, 1): (4, np.zeros((0,))), 20 | (1, 2): (3, np.zeros((0,)))} 21 | user_contexts = collections.OrderedDict([(0, np.zeros((0,))), 22 | (1, np.zeros((0,))), 23 | (2, np.zeros((0,)))]) 24 | 25 | recommender = TopPop() 26 | recommender.reset(users, items, ratings) 27 | recs, _ = recommender.recommend(user_contexts, 1) 28 | assert recs.shape == (3, 1) 29 | assert recs[0, 0] == 2 30 | assert recs[1, 0] == 0 31 | assert recs[2, 0] == 0 32 | 33 | 34 | def test_top_pop_multi_step(): 35 | """Test multiple rounds of recommending and rating.""" 36 | users = {0: np.zeros((0,)), 37 | 1: np.zeros((0,))} 38 | items = {0: np.zeros((0,)), 39 | 1: np.zeros((0,)), 40 | 2: np.zeros((0,))} 41 | ratings = {(0, 0): (5, np.zeros((0,))), 42 | (1, 1): (3, np.zeros((0,)))} 43 | user_contexts = collections.OrderedDict([(0, np.zeros((0,))), 44 | (1, np.zeros((0,)))]) 45 | 46 | recommender = TopPop() 47 | recommender.reset(users, items, ratings) 48 | recs, _ = recommender.recommend(user_contexts, 1) 49 | assert recs.shape == (2, 1) 50 | assert recs[0, 0] == 1 51 | assert recs[1, 0] == 0 52 | user_contexts[2] = np.zeros((0,)) 53 | recommender.update(users={2: np.zeros((0,))}, 54 | ratings={(0, 1): (5, np.zeros((0,))), 55 | (1, 0): (1, np.zeros((0,)))}) 56 | recs, _ = recommender.recommend(user_contexts, 1) 57 | assert recs.shape == (3, 1) 58 | assert recs[0, 0] == 2 59 | assert recs[1, 0] == 2 60 | assert recs[2, 0] == 1 61 | -------------------------------------------------------------------------------- /reclab/environments/fixed_rating.py: -------------------------------------------------------------------------------- 1 | """A simple environment for debugging. Each user will either always rate an item a 1 or a 5.""" 2 | import numpy as np 3 | 4 | from . import environment 5 | 6 | 7 | class FixedRating(environment.DictEnvironment): 8 | """An environment in which half the users rate all items with a 1 and the other half with a 5. 9 | 10 | Parameters 11 | ---------- 12 | num_users : int 13 | The number of users in the environment. 14 | num_items : int 15 | The number of items in the environment. 16 | rating_frequency : float 17 | What proportion of users will need a recommendation at each step. 18 | num_init_ratings : int 19 | The number of initial ratings available when the environment is reset. 20 | 21 | """ 22 | 23 | def __init__(self, num_users, num_items, 24 | rating_frequency=0.2, num_init_ratings=0): 25 | """Create a FixedRating environment.""" 26 | super().__init__(rating_frequency, num_init_ratings) 27 | self._num_users = num_users 28 | self._num_items = num_items 29 | 30 | @property 31 | def name(self): # noqa: D102 32 | return 'fixed' 33 | 34 | def _get_dense_ratings(self): # noqa: D102 35 | ratings = np.ones([self._num_users, self._num_items]) 36 | ratings[:, self._num_items // 2:] = 5.0 37 | return ratings 38 | 39 | def _reset_state(self): # noqa: D102 40 | self._users = {user_id: np.zeros((0,)) for user_id in range(self._num_users)} 41 | self._items = {item_id: np.zeros((0,)) for item_id in range(self._num_items)} 42 | 43 | def _rate_items(self, user_id, item_ids): # noqa: D102 44 | # Find the largest item id that has not yet been rated. 
45 | max_id = None 46 | for item_id in sorted(item_ids, reverse=True): 47 | if (user_id, item_id) not in self._ratings: 48 | max_id = item_id 49 | break 50 | 51 | # If we have found an unrated item, rate it either 1 or 5. 52 | ratings = np.ones(len(item_ids)) * np.nan 53 | if max_id is not None: 54 | if max_id >= self._num_items // 2: 55 | ratings[item_ids == max_id] = 5.0 56 | else: 57 | ratings[item_ids == max_id] = 1.0 58 | 59 | return ratings 60 | -------------------------------------------------------------------------------- /tests/test_beta_rank.py: -------------------------------------------------------------------------------- 1 | """Tests for the BetaRank environment.""" 2 | import numpy as np 3 | 4 | from reclab.environments import BetaRank 5 | 6 | 7 | def test_beta_simple(): 8 | """Test BetaRank with only one user.""" 9 | env = BetaRank(dimension=10, 10 | num_users=1, 11 | num_items=2, 12 | rating_frequency=1.0, 13 | num_init_ratings=0) 14 | assert env.name == 'beta-rank' 15 | users, items, ratings = env.reset() 16 | 17 | # Test that the users and items have empty features. 18 | assert users[0].shape == (0,) 19 | assert items[0].shape == (0,) 20 | assert env.online_users[0].shape == (0,) 21 | 22 | # Recommend item 0, we shouldn't observe new users or items. 23 | users, items, ratings, _ = env.step(np.array([[0]])) 24 | assert users == {} 25 | assert items == {} 26 | 27 | # Test that item 0 falls in the [0, 1] range. 28 | assert ratings[(0, 0)][0] <= 1 and ratings[(0, 0)][0] >= 0 29 | 30 | 31 | def test_fixed_slates(): 32 | """Test FixedRating with slate recommendations.""" 33 | env = BetaRank(dimension=10, 34 | num_users=1, 35 | num_items=100, 36 | rating_frequency=1.0, 37 | num_init_ratings=0) 38 | env.seed(0) 39 | env.reset() 40 | assert ((env.dense_ratings >= 0) & (env.dense_ratings <= 1)).all() 41 | # Sort item ids from best to worst. 42 | item_ids = env.dense_ratings[0].argsort() 43 | # Swap the second largest and second smallest elements. 44 | item_ids[1], item_ids[-2] = item_ids[-2], item_ids[1] 45 | # The environment should pick the second item here since it will 46 | # have a larger value than other highly ranked items. 47 | _, _, ratings, _ = env.step(np.array([item_ids])) 48 | assert len(ratings) == 1 49 | assert (0, item_ids[1]) in ratings 50 | # Swap the tenth largest and tenth smallest elements. 51 | item_ids[9], item_ids[-10] = item_ids[-10], item_ids[9] 52 | # The environment should pick the tenth item here since it will 53 | # have a larger value than other highly ranked items, except for 54 | # the second item which has already been rated. 55 | _, _, ratings, _ = env.step(np.array([item_ids])) 56 | assert len(ratings) == 1 57 | assert (0, item_ids[9]) in ratings 58 | -------------------------------------------------------------------------------- /reclab/environments/contextual.py: -------------------------------------------------------------------------------- 1 | """ 2 | Contains the implementation for the Contextual environment. 3 | 4 | In a contextual environment, only one user is on the platform at a time. 5 | The user has no state, and only stays for one timestep. However, each user comes 6 | with a context that is predictive of its preferences for items. 7 | 8 | """ 9 | import collections 10 | 11 | import numpy as np 12 | 13 | from .. import data_utils 14 | from . import environment 15 | 16 | 17 | class Contextual(environment.DictEnvironment): 18 | """ 19 | An environment that implements the contextual bandit assumption. 
20 | 21 | Parameters 22 | ---------- 23 | name: string 24 | The dataset to instantiate the environment with. Can be one of: 'wiki10-31k'. 25 | user_dist_choice : str 26 | The choice of user distribution for selecting online users. By default, the subset of 27 | online users is chosen from a uniform distribution. Currently supports normal and lognormal. 28 | 29 | """ 30 | 31 | def __init__(self, name, user_dist_choice='uniform'): 32 | """Create a Contextual environment.""" 33 | self._features, self._full_ratings = data_utils.read_bandit_dataset(name) 34 | self._curr_user = 0 35 | super().__init__(rating_frequency=1, 36 | num_init_ratings=0, 37 | memory_length=0, 38 | user_dist_choice=user_dist_choice) 39 | 40 | @property 41 | def name(self): # noqa: D102 42 | return 'contextual' 43 | 44 | def _get_dense_ratings(self): # noqa: D102 45 | return self._full_ratings[:self._curr_user + 1].toarray() 46 | 47 | def _reset_state(self): # noqa: D102 48 | self._curr_user = 0 49 | self._users = collections.OrderedDict([(self._curr_user, np.zeros(0))]) 50 | self._items = collections.OrderedDict((item_id, np.zeros(0)) 51 | for item_id in range(self._full_ratings.shape[1])) 52 | 53 | def _rate_items(self, user_id, item_ids): # noqa: D102 54 | assert user_id in self._users 55 | assert len(item_ids) == 1 56 | rating = self._full_ratings[user_id, item_ids[0]] 57 | return np.array([rating]) 58 | 59 | def _rating_context(self, user_id): # noqa: D102 60 | return self._features[self._curr_user].toarray().flatten() 61 | 62 | def _update_state(self): # noqa: D102 63 | self._curr_user += 1 64 | self._users = collections.OrderedDict([(self._curr_user, np.zeros(0))]) 65 | return self._users.copy(), collections.OrderedDict() 66 | -------------------------------------------------------------------------------- /reclab/recommenders/baseline.py: -------------------------------------------------------------------------------- 1 | """An implementation of baseline perfect and random recommenders.""" 2 | import numpy as np 3 | 4 | from . import recommender 5 | 6 | 7 | class RandomRec(recommender.PredictRecommender): 8 | """A random recommendation model. 9 | 10 | Parameters 11 | ---------- 12 | range : tuple 13 | Upper and lower bounds for the uniformly random predictions. 14 | seed : int 15 | The random seed to use for recommendations. 16 | 17 | """ 18 | 19 | def __init__(self, rating_range=(0, 5), seed=0): 20 | """Create a random recommender.""" 21 | self._range = rating_range 22 | np.random.seed(seed) 23 | super().__init__() 24 | 25 | @property 26 | def name(self): # noqa: D102 27 | return 'random' 28 | 29 | @property 30 | def dense_predictions(self): # noqa: D102 31 | if self._dense_predictions is None: 32 | num_users = len(self._users) 33 | num_items = len(self._items) 34 | self._dense_predictions = np.random.uniform(low=self._range[0], 35 | high=self._range[1], 36 | size=[num_users, num_items]) 37 | return self._dense_predictions 38 | 39 | def _predict(self, user_item): # noqa: D102 40 | # Random predictions for all pairs. 41 | all_predictions = self.dense_predictions 42 | predictions = [] 43 | for user_id, item_id, _ in user_item: 44 | predictions.append(all_predictions[user_id, item_id]) 45 | return np.array(predictions) 46 | 47 | 48 | class PerfectRec(recommender.PredictRecommender): 49 | """A perfect recommendation model. 50 | 51 | Parameters 52 | ---------- 53 | dense_rating_function : function 54 | The function which generates true user ratings. 
55 | 56 | """ 57 | 58 | def __init__(self, dense_rating_function): 59 | """Create a perfect recommender.""" 60 | self._dense_rating_function = dense_rating_function 61 | super().__init__() 62 | 63 | @property 64 | def name(self): # noqa: D102 65 | return 'perfect' 66 | 67 | @property 68 | def dense_predictions(self): # noqa: D102 69 | if self._dense_predictions is None: 70 | self._dense_predictions = self._dense_rating_function() 71 | return self._dense_predictions 72 | 73 | def _predict(self, user_item): # noqa: D102 74 | # Use provided function to predict for all pairs. 75 | all_predictions = self.dense_predictions 76 | predictions = [] 77 | for user_id, item_id, _ in user_item: 78 | predictions.append(all_predictions[user_id, item_id]) 79 | return np.array(predictions) 80 | -------------------------------------------------------------------------------- /reclab/environments/README.md: -------------------------------------------------------------------------------- 1 | ## List of Environments 2 | 3 | All the provided environments inherit from `DictEnvironment`, which is an environment where data is passed around as dictionaries. 4 | 5 | ### [Topics](reclab/environments/topics.py) 6 | In the `Topics` environment, each item is uniformly assigned to one of *K* topics and users prefer certain topics. 7 | The preference of user *u* for item *i* of topic *k_i* is initialized as *p(u,k_i) ~ Unif(0.5, 5.5)*, while the topic *k* of item *i* is chosen randomly from the set of all topics. When user *u* is recommended item *i*, it will rate the item as 8 | *r_t(u,i) = clip(p(u,k_i) + eps)*, where *eps* is normally distributed independent noise. 9 | 10 | User preferences can change as a result of the items they consume. We model the fact that users might become more interested in a topic through repeated exposure (`topic_change` parameter). The item rating also models negative effects arising from boredom, which is controlled by three parameters: `memory_length, boredom_threshold, boredom_penalty`. 11 | 12 | ### [Latent Factor](reclab/environments/latent_factors.py) 13 | In the `LatentFactorBehavior` environment, users and items have latent factors and biases. Ratings of items are modelled as: 14 | *r_t(u,i) = < p_u, q_i > + b_u + b_i + b_0*, where *p_u* is a user's latent factor, *q_i* is an item's latent factor, *b_u* is the user bias, *b_i* is the item bias, and *b_0* is the global bias. 15 | 16 | The `DatasetLatentFactor` environment initializes latent factors and biases by fitting a Matrix Factorization model on a rating dataset. Currently, it supports environment initialization based on [`MovieLens 100K`, `MovieLens 10M`](https://grouplens.org/datasets/movielens/) and [`Last FM`](http://millionsongdataset.com/lastfm/) data. 17 | 18 | Similarly to the `Topics` environment, latent factor based environments allow for changes in user preferences: `affinity_change` models increased interest in a topic via alignment of user factors with item factors, while `memory_length, boredom_threshold, boredom_penalty` model user boredom.
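For intuition, the sketch below shows how the two rating models above generate a single rating. It is not the environments' actual implementation (`topics.py` and `latent_factors.py` also model preference dynamics and boredom); the factor dimension, noise scale, and the [1, 5] clipping range are illustrative assumptions.

```python
import numpy as np

rng = np.random.default_rng(0)
num_users, num_items, num_topics, rank = 5, 8, 3, 4

# Topics-style model: each item belongs to one topic and each user draws a
# preference for every topic from Unif(0.5, 5.5).
item_topic = rng.integers(num_topics, size=num_items)
topic_preference = rng.uniform(0.5, 5.5, size=(num_users, num_topics))

def topics_rating(user, item, noise_std=0.5):
    # r_t(u, i) = clip(p(u, k_i) + eps); the noise scale and clip range are assumptions.
    return np.clip(topic_preference[user, item_topic[item]] + rng.normal(0.0, noise_std), 1, 5)

# Latent-factor-style model: r_t(u, i) = <p_u, q_i> + b_u + b_i + b_0.
user_factors = rng.normal(size=(num_users, rank))
item_factors = rng.normal(size=(num_items, rank))
user_bias = rng.normal(size=num_users)
item_bias = rng.normal(size=num_items)
global_bias = 3.5

def latent_factor_rating(user, item):
    return user_factors[user] @ item_factors[item] + user_bias[user] + item_bias[item] + global_bias

print(topics_rating(0, 1), latent_factor_rating(0, 1))
```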
19 | 20 | ### Partial Information 21 | #### [Schmit](reclab/environments/schmit.py) 22 | `Schmit` contains the implementation for the environment in ["Human Interaction with Recommendation Systems"](https://arxiv.org/pdf/1703.00535.pdf). It is a slate-based environment and is similar to a latent-factor environment in that users and items are described by latent factors. However, users have only partial knowledge of an item's value to them. Rather than using the true rating, they use this partial information, along with the recommender's predicted score, to select an item from a slate of recommended items. 23 | 24 | #### [Engelhardt](reclab/environments/engelhardt.py) 25 | `Engelhardt` contains the implementation for the environment in [How Algorithmic Confounding in Recommendation Systems Increases Homogeneity and Decreases Utility](https://arxiv.org/abs/1710.11214). It is similar to `Schmit`: users know part of the value of each item, and users/items are represented by latent vectors. The rating of user *u* on item *i* is given by: 26 | *r(u, i) ~ Beta(< p_u, q_i >, s^2)* 27 | where *p_u* is the latent vector for user *u*, *q_i* is the latent vector for item *i*, and the Beta distribution is parametrized according to its mean and variance. In this setting users choose from a slate of items based upon their observed utility and the recommender's ranking. 28 | 29 | ### Debug environments 30 | [`FixedRating`](reclab/environments/fixed_rating.py) is a simple environment for debugging, in which half the users rate all items with a 1 and the other half with a 5.
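The debug environment is also the quickest way to see the environment API end to end. The sketch below is adapted from `tests/test_fixed.py` in this repository; only the print loop is new.

```python
import numpy as np
from reclab.environments import FixedRating

env = FixedRating(num_users=1, num_items=4, rating_frequency=1.0, num_init_ratings=0)
users, items, ratings = env.reset()

# Recommend a slate of all four items to user 0. The user rates exactly one of them,
# giving a 5 to items in the upper half of the id range and a 1 otherwise.
_, _, ratings, _ = env.step(np.array([[0, 1, 2, 3]]))
for (user_id, item_id), (rating, _context) in ratings.items():
    print(user_id, item_id, rating)  # e.g. 0 3 5.0
env.close()
```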
31 | -------------------------------------------------------------------------------- /reclab/environments/beta_rank.py: -------------------------------------------------------------------------------- 1 | """ 2 | Contains the implementation for the BetaRank environment from the algorithmic confounding paper. 3 | 4 | In this environment users have a hidden preference for each topic and each item has a 5 | hidden topic assigned to it. 6 | """ 7 | import collections 8 | 9 | import numpy as np 10 | 11 | from . import environment 12 | 13 | 14 | class BetaRank(environment.DictEnvironment): 15 | """ 16 | Implementation of environment with known and unknown user utility, static over time. 17 | 18 | Based on "How Algorithmic Confounding in Recommendation Systems Increases Homogeneity 19 | and Decreases Utility" by Chaney, Stewart, and Engelhardt (2018). 20 | 21 | """ 22 | 23 | def __init__(self, dimension, num_users, num_items, rating_frequency=0.2, 24 | num_init_ratings=0, known_mean=0.98, user_dist_choice='uniform'): 25 | """Create a BetaRank environment.""" 26 | super().__init__(rating_frequency, num_init_ratings, 0, user_dist_choice) 27 | self._dimension = dimension 28 | self._num_users = num_users 29 | self._num_items = num_items 30 | self._known_mean = known_mean 31 | self._user_preferences = None 32 | self._item_preferences = None 33 | 34 | @property 35 | def name(self): # noqa: D102 36 | return 'beta-rank' 37 | 38 | def _get_dense_ratings(self): # noqa: D102 39 | return np.clip(np.round(20 * (self._user_preferences @ self._item_preferences.T) + 1), 1, 5) 40 | 41 | def _reset_state(self): # noqa: D102 42 | # TODO: We should probably pass the magic numbers below as parameters. 43 | self._user_preferences = self._init_random.dirichlet( 44 | 100 * self._init_random.dirichlet(np.ones(self._dimension)), 45 | size=self._num_users 46 | ) 47 | self._item_preferences = self._init_random.dirichlet( 48 | 0.1 * self._init_random.dirichlet(100 * np.ones(self._dimension)), 49 | size=self._num_items 50 | ) 51 | self._users = collections.OrderedDict((user_id, np.zeros(0)) 52 | for user_id in range(self._num_users)) 53 | self._items = collections.OrderedDict((item_id, np.zeros(0)) 54 | for item_id in range(self._num_items)) 55 | 56 | def _rate_items(self, user_id, item_ids): # noqa: D102 57 | # Compute the user's known values for each item and sort them accordingly. 58 | means = self._item_preferences[item_ids] @ self._user_preferences[user_id].T 59 | values = self._beta_prime(means) 60 | known = self._beta_prime(self._known_mean, size=len(item_ids)) 61 | sorted_idxs = reversed( 62 | np.argsort(np.arange(1, len(item_ids) + 1) ** (-0.8) * known * values)) 63 | 64 | # Find the index of the item with the highest known value that hasn't been rated yet. 65 | chosen_idx = None 66 | for idx in sorted_idxs: 67 | if (user_id, item_ids[idx]) not in self._ratings: 68 | chosen_idx = idx 69 | break 70 | 71 | # Rate the chosen item and don't rate anything else. 72 | ratings = np.ones(len(item_ids)) * np.nan 73 | if chosen_idx is not None: 74 | ratings[chosen_idx] = values[chosen_idx] 75 | return np.clip(np.round((ratings).flatten() * 20 + 1), 1, 5) 76 | 77 | def _beta_prime(self, mean, std_dev=1e-5, size=None): 78 | alpha = ((1 - mean) / std_dev ** 2 - 1 / mean) * mean ** 2 + 1e-6 79 | beta = alpha * (1 / mean - 1) 80 | return self._dynamics_random.beta(alpha, beta, size=size) 81 | -------------------------------------------------------------------------------- /reclab/recommenders/README.md: -------------------------------------------------------------------------------- 1 | ## List of Recommenders 2 | 3 | All provided recommenders are subclasses of `PredictRecommender`, which uses rating predictions to make recommendations. It supports both deterministic and stochastic item selection policies.
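Concretely, every recommender below is driven through the same reset / recommend / update cycle. The sketch below is adapted from `tests/test_top_pop.py` in this repository; it uses `TopPop` only because it needs no hyperparameters, and the empty `np.zeros((0,))` arrays stand in for user, item, and rating features.

```python
import collections
import numpy as np
from reclab.recommenders import TopPop

# Three users, three items, and a few historical ratings keyed by (user_id, item_id).
users = {user_id: np.zeros((0,)) for user_id in range(3)}
items = {item_id: np.zeros((0,)) for item_id in range(3)}
ratings = {(0, 0): (5, np.zeros((0,))),
           (0, 1): (4, np.zeros((0,))),
           (1, 1): (4, np.zeros((0,))),
           (1, 2): (3, np.zeros((0,)))}

recommender = TopPop()
recommender.reset(users, items, ratings)

# Ask for one recommendation per online user.
online_users = collections.OrderedDict((user_id, np.zeros((0,))) for user_id in range(3))
recommendations, _ = recommender.recommend(online_users, 1)

# Report new users and new ratings before the next round of recommendations.
recommender.update(users={3: np.zeros((0,))}, ratings={(2, 0): (1, np.zeros((0,)))})
```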
4 | 5 | ### Baseline Recommenders 6 | 7 | #### [RandomRec](reclab/recommenders/baseline.py) 8 | A recommender that returns a random item from the list of unconsumed items for each online user. It is a useful baseline for calibrating lower bounds of recommender performance. 9 | 10 | #### [TopPop](reclab/recommenders/top_pop.py) 11 | A recommender that uses historical ratings to build a global ranking of items and recommends the items with the highest overall popularity. This is a useful baseline for measuring the benefits of personalization. 12 | 13 | #### [PerfectRec](reclab/recommenders/baseline.py) 14 | A recommender that is instantiated with a `dense_rating_function`, which provides the true ratings of the users for all items. It is a useful baseline for calibrating upper bounds of recommender performance. 15 | 16 | ### [Neighborhood-based recommenders](reclab/recommenders/knn_recommender.py) 17 | `KNNRecommender` is a neighborhood-based collaborative filtering algorithm. The class supports both user- and item-based collaborative filtering. In a `user_based` KNN recommender, user features are stacked and pairwise similarity metrics between users are measured. An online user is thus recommended an item that was highly rated by a similar user. Conversely, in an `item_based` KNN recommender, item features are stacked and pairwise similarity metrics between items are measured. An online user is thus recommended an item that is highly similar to the user's other highly rated items. 18 | 19 | ### [Matrix Factorization](reclab/recommenders/libfm.py) 20 | This is a wrapper for the [LibFM recommender](https://www.csie.ntu.edu.tw/~b97053/paper/Factorization%20Machines%20with%20libFM.pdf). See www.libfm.org for implementation details. We built a pip-installable Python package, **`wpyfm`**, based on [this C++ implementation](https://github.com/srendle/libfm), which might be of interest in its own right. 21 | 22 | At each step of the simulation the `LibFM` recommender re-trains a matrix factorization model. It computes rating predictions as the inner product of user and item factors plus bias terms. 23 | 24 | ### [AutoRec](reclab/recommenders/autorec/autorec.py) 25 | `AutoRec` is an autoencoder framework for collaborative filtering proposed by this [paper](https://dl.acm.org/doi/10.1145/2740908.2742726). It can be seen as a non-linear generalization of factorization models. We adapted a publicly available [implementation](https://github.com/mesuvash/NNRec). 26 | 27 | ### [CF-NADE](/reclab/recommenders/cfnade/cfnade.py) 28 | `Cfnade` is a neural autoregressive architecture for collaborative filtering proposed by this [paper](https://arxiv.org/pdf/1605.09477.pdf). We adapted a publicly available [implementation](https://github.com/JoonyoungYi/CFNADE-keras). 29 | 30 | ### [LLORMA](reclab/recommenders/llorma/llorma.py) 31 | `Llorma` is a generalization of low-rank matrix factorization techniques based on this [paper](http://jmlr.org/papers/v17/14-301.html). The LLORMA algorithm approximates the observed rating matrix as a weighted sum of low-rank matrices, each of which is limited to a local region of the observed matrix. We adapted a publicly available [implementation](https://github.com/JoonyoungYi/LLORMA-tensorflow). 32 | 33 | ### [Sparse Recommenders](reclab/recommenders/sparse.py) 34 | `SLIM` is a sparse linear recommendation model based on this [paper](http://glaros.dtc.umn.edu/gkhome/node/774). For a user *u*, it models the predicted rating of an unseen item *i* as a weighted average of the ratings of the items previously rated by user *u*. 35 | 36 | `EASE` predicts ratings with an item-item similarity model based on this [paper](https://arxiv.org/pdf/1905.03375.pdf). Assuming the historical data contains *N* users and *M* items in an *NxM* rating matrix *X*, the model computes an *MxM* self-similarity matrix *B*. Unseen ratings are predicted as *XB*. 37 | -------------------------------------------------------------------------------- /tests/test_fixed.py: -------------------------------------------------------------------------------- 1 | """Tests for the FixedRating environment. 2 | 3 | The primary intent of these tests is to validate the code in the DictEnvironment 4 | parent class instead of only testing FixedRating. 5 | """ 6 | import numpy as np 7 | 8 | from reclab.environments import FixedRating 9 | from . import utils 10 | 11 | 12 | def test_fixed_simple(): 13 | """Test FixedRating with only two items.""" 14 | env = FixedRating(num_users=1, 15 | num_items=2, 16 | rating_frequency=1.0, 17 | num_init_ratings=0) 18 | assert env.name == 'fixed' 19 | users, items, ratings = env.reset() 20 | 21 | # Test that the users and items have empty features. 22 | assert users[0].shape == (0,) 23 | assert items[0].shape == (0,) 24 | assert env.online_users[0].shape == (0,) 25 | 26 | # Recommend item 0, we shouldn't observe new users or items. 27 | users, items, ratings, _ = env.step(np.array([[0]])) 28 | assert users == {} 29 | assert items == {} 30 | 31 | # Test that item 0 will have a rating of 1. 32 | assert ratings[(0, 0)][0] == 1 33 | 34 | # Recommend item 1, the environment should rate it 5. 35 | users, items, ratings, _ = env.step(np.array([[1]])) 36 | assert users == {} 37 | assert items == {} 38 | assert ratings[(0, 1)][0] == 5 39 | 40 | # Test the internal state of the environment.
41 | assert len(env.users) == 1 42 | assert env.users[0].shape == (0,) 43 | assert len(env.items) == 2 44 | assert env.items[0].shape == (0,) 45 | assert len(env.ratings) == 2 46 | assert env.ratings[0, 0][0] == 1 47 | assert env.ratings[0, 1][0] == 5 48 | 49 | 50 | def test_fixed_two_users(mocker): 51 | """Test FixedRating with two users.""" 52 | mocker.patch('reclab.environments.FixedRating._select_online_users', 53 | utils.mock_select_online_users) 54 | env = FixedRating(num_users=2, 55 | num_items=2, 56 | rating_frequency=0.5, 57 | num_init_ratings=0) 58 | env.reset() 59 | assert env.dense_ratings.shape == (2, 2) 60 | assert (env.dense_ratings[:, 0] == 1).all() 61 | assert (env.dense_ratings[:, 1] == 5).all() 62 | assert len(env.online_users) == 1 63 | assert 0 in env.online_users 64 | env.step(np.array([[0]])) 65 | assert len(env.online_users) == 1 66 | assert 1 in env.online_users 67 | env.step(np.array([[1]])) 68 | assert len(env.ratings) == 2 69 | assert env.ratings[0, 0][0] == 1 70 | assert env.ratings[1, 1][0] == 5 71 | 72 | 73 | def test_fixed_slates(): 74 | """Test FixedRating with slate recommendations.""" 75 | env = FixedRating(num_users=1, 76 | num_items=4, 77 | rating_frequency=1.0, 78 | num_init_ratings=0) 79 | env.reset() 80 | _, _, ratings, _ = env.step(np.array([[0, 1, 2, 3]])) 81 | assert len(ratings) == 1 82 | assert ratings[0, 3][0] == 5 83 | _, _, ratings, _ = env.step(np.array([[0, 1, 2, 3]])) 84 | assert len(ratings) == 1 85 | assert ratings[0, 2][0] == 5 86 | _, _, ratings, _ = env.step(np.array([[0, 2, 3]])) 87 | assert len(ratings) == 1 88 | assert ratings[0, 0][0] == 1 89 | _, _, ratings, _ = env.step(np.array([[0, 1, 2, 3]])) 90 | assert len(ratings) == 1 91 | assert ratings[0, 1][0] == 1 92 | 93 | 94 | def test_init_ratings(): 95 | """Test FixedRating properly initializes ratings.""" 96 | env = FixedRating(num_users=50, 97 | num_items=50, 98 | rating_frequency=1.0, 99 | num_init_ratings=100) 100 | env.seed(0) 101 | _, _, ratings = env.reset() 102 | assert len(ratings) == 100 103 | for (user_id, item_id), (rating, context) in ratings.items(): 104 | assert context.shape == (0,) 105 | assert user_id < 50 106 | assert item_id < 50 107 | if rating == 5.0: 108 | assert item_id >= 25 109 | else: 110 | assert item_id < 25 111 | -------------------------------------------------------------------------------- /tests/test_libfm.py: -------------------------------------------------------------------------------- 1 | """Tests for the LibFM recommender.""" 2 | from reclab.recommenders import LibFM 3 | from . 
import utils 4 | 5 | 6 | def test_sgd_predict(): 7 | """Test that LibFM trained with SGD predicts well and that it gets better with more data.""" 8 | recommender = LibFM(num_user_features=0, 9 | num_item_features=0, 10 | num_rating_features=0, 11 | max_num_users=utils.NUM_USERS_ML100K, 12 | max_num_items=utils.NUM_ITEMS_ML100K, 13 | method='sgd', 14 | learning_rate=0.003, 15 | num_two_way_factors=8, 16 | bias_reg=0.04, 17 | one_way_reg=0.04, 18 | two_way_reg=0.04, 19 | num_iter=128, 20 | seed=0) 21 | utils.test_predict_ml100k(recommender, rmse_threshold=1.1) 22 | 23 | 24 | def test_sgd_recommend(): 25 | """Test that LibFM trained with SGD will recommend reasonable items.""" 26 | recommender = LibFM(num_user_features=0, 27 | num_item_features=0, 28 | num_rating_features=0, 29 | max_num_users=utils.NUM_USERS_SIMPLE, 30 | max_num_items=utils.NUM_ITEMS_SIMPLE, 31 | method='sgd', 32 | learning_rate=0.01, 33 | num_two_way_factors=8, 34 | num_iter=128, 35 | seed=0) 36 | utils.test_recommend_simple(recommender) 37 | 38 | 39 | def test_mcmc_predict(): 40 | """Test that LibFM trained with MCMC predicts well and that it gets better with more data.""" 41 | recommender = LibFM(num_user_features=0, 42 | num_item_features=0, 43 | num_rating_features=0, 44 | max_num_users=utils.NUM_USERS_ML100K, 45 | max_num_items=utils.NUM_ITEMS_ML100K, 46 | method='mcmc', 47 | num_two_way_factors=8, 48 | num_iter=128, 49 | seed=0) 50 | utils.test_predict_ml100k(recommender, rmse_threshold=1.1) 51 | 52 | 53 | def test_mcmc_recommend(): 54 | """Test that LibFM trained with MCMC will recommend reasonable items.""" 55 | recommender = LibFM(num_user_features=0, 56 | num_item_features=0, 57 | num_rating_features=0, 58 | max_num_users=utils.NUM_USERS_SIMPLE, 59 | max_num_items=utils.NUM_ITEMS_SIMPLE, 60 | method='mcmc', 61 | num_two_way_factors=8, 62 | num_iter=128, 63 | seed=0) 64 | utils.test_recommend_simple(recommender) 65 | 66 | 67 | def test_als_predict(): 68 | """Test that LibFM trained with ALS predicts well and that it gets better with more data.""" 69 | recommender = LibFM(num_user_features=0, 70 | num_item_features=0, 71 | num_rating_features=0, 72 | max_num_users=utils.NUM_USERS_ML100K, 73 | max_num_items=utils.NUM_ITEMS_ML100K, 74 | method='als', 75 | num_two_way_factors=8, 76 | reg=0.02, 77 | num_iter=128, 78 | seed=0) 79 | utils.test_predict_ml100k(recommender, rmse_threshold=1.4) 80 | 81 | 82 | def test_als_recommend(): 83 | """Test that LibFM trained with ALS will recommend reasonable items.""" 84 | recommender = LibFM(num_user_features=0, 85 | num_item_features=0, 86 | num_rating_features=0, 87 | max_num_users=utils.NUM_USERS_SIMPLE, 88 | max_num_items=utils.NUM_ITEMS_SIMPLE, 89 | method='als', 90 | num_two_way_factors=8, 91 | num_iter=128, 92 | seed=0) 93 | utils.test_recommend_simple(recommender) 94 | -------------------------------------------------------------------------------- /reclab/recommenders/cfnade/cfnade_lib/nade.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from keras.engine import Layer, InputSpec 3 | from keras import backend as K 4 | from keras import initializers 5 | from keras import regularizers 6 | from keras import constraints 7 | 8 | # def dot_product(x, kernel): 9 | # """ 10 | # Wrapper for dot product operation, in order to be compatible with both 11 | # Theano and Tensorflow 12 | # Args: 13 | # x (): input 14 | # kernel (): weights 15 | # Returns: 16 | # """ 17 | # return K.squeeze(K.dot(x, K.expand_dims(kernel)), 
axis=-1) 18 | 19 | 20 | class NADE(Layer): 21 | def __init__(self, 22 | hidden_dim, 23 | activation, 24 | W_regularizer=None, 25 | V_regularizer=None, 26 | b_regularizer=None, 27 | c_regularizer=None, 28 | bias=False, 29 | normalized_layer=False, 30 | **kwargs): 31 | 32 | self.init = initializers.get('uniform') 33 | 34 | self.bias = bias 35 | self.activation = activation 36 | self.hidden_dim = hidden_dim 37 | 38 | self.W_regularizer = regularizers.get(W_regularizer) 39 | self.V_regularizer = regularizers.get(V_regularizer) 40 | self.b_regularizer = regularizers.get(b_regularizer) 41 | self.c_regularizer = regularizers.get(c_regularizer) 42 | 43 | self.normalized_layer = normalized_layer 44 | 45 | super(NADE, self).__init__(**kwargs) 46 | 47 | def build(self, input_shape): 48 | self.input_dim1 = input_shape[1] 49 | self.input_dim2 = input_shape[2] 50 | 51 | self.W = self.add_weight( 52 | shape=(self.input_dim1, self.input_dim2, self.hidden_dim), 53 | initializer=self.init, 54 | name='{}_W'.format(self.name), 55 | regularizer=self.W_regularizer) 56 | if self.bias: 57 | self.c = self.add_weight( 58 | shape=(self.hidden_dim, ), 59 | initializer=self.init, 60 | name='{}_c'.format(self.name), 61 | regularizer=self.c_regularizer) 62 | 63 | if self.bias: 64 | self.b = self.add_weight( 65 | shape=(self.input_dim1, self.input_dim2), 66 | initializer=self.init, 67 | name='{}_b'.format(self.name), 68 | regularizer=self.b_regularizer) 69 | 70 | self.V = self.add_weight( 71 | shape=(self.hidden_dim, self.input_dim1, self.input_dim2), 72 | initializer=self.init, 73 | name='{}_V'.format(self.name), 74 | regularizer=self.V_regularizer) 75 | 76 | super().build(input_shape) 77 | 78 | def call(self, original_x): 79 | 80 | x = K.cumsum(original_x[:, :, ::-1], axis=2)[:, :, ::-1] 81 | # x.shape = (?,6040,5) 82 | # W.shape = (6040, 5, 500) 83 | # c.shape = (500,) 84 | output_ = tf.tensordot(x, self.W, axes=[[1, 2], [0, 1]]) 85 | 86 | if self.normalized_layer: 87 | output_ /= tf.matmul( 88 | tf.maximum( 89 | tf.reshape( 90 | tf.reduce_sum( 91 | tf.reduce_sum(original_x, axis=2), axis=1), 92 | [-1, 1]), 1), tf.ones([1, output_.shape[1]])) 93 | 94 | if self.bias: 95 | output_ = output_ + self.c 96 | 97 | h_out = tf.reshape(output_, [-1, self.hidden_dim]) 98 | #tf.cast(indices, tf.float32) 99 | # output_.shape = (?,500) 100 | 101 | h_out_act = K.tanh(h_out) 102 | # h_out_act.shape = (?,500) 103 | # V.shape = (500, 6040, 5) 104 | # b.shape = (6040,5) 105 | if self.bias: 106 | output = tf.tensordot(h_out_act, self.V, axes=[[1], [0]]) + self.b 107 | else: 108 | output = tf.tensordot(h_out_act, self.V, axes=[[1], [0]]) 109 | # output.shape = (?,6040,5) 110 | output = tf.reshape(output, [-1, self.input_dim1, self.input_dim2]) 111 | return output 112 | 113 | def compute_output_shape(self, input_shape): 114 | return (input_shape[0], input_shape[1], input_shape[2]) 115 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![Build Status](https://travis-ci.com/berkeley-reclab/RecLab.svg?branch=master) 2 | [![Coverage Status](https://coveralls.io/repos/github/berkeley-reclab/RecLab/badge.svg?branch=master)](https://coveralls.io/github/berkeley-reclab/RecLab?branch=master) 3 | 4 | # RecLab 5 | RecLab is a simulation framework used to evaluate recommendation algorithms. The framework makes 6 | no platform-specific assumptions. 
As such, it can be used to evaluate recommendation algorithms 7 | implemented with any computational library. 8 | 9 | Reclab is under active development. If you find a bug or would like to request a new feature 10 | please file an [issue](https://github.com/berkeley-reclab/reclab/issues). Furthermore, we welcome a 11 | broad set of contributions including: documentation, tests, new environments, reproduced 12 | recommenders, and code quality improvements. Simply fork the repo and make a 13 | [pull request](https://github.com/berkeley-reclab/reclab/pulls). 14 | 15 | ## Getting Started 16 | This section contains a brief guide on how to get started with RecLab. 17 | 18 | ### Setup 19 | RecLab was developed and tested in Python 3.8. To install RecLab run 20 | ``` 21 | pip install reclab 22 | ``` 23 | RecLab also implements a set of benchmark recommender systems, however the default 24 | `pip install` command will not fetch the necessary dependencies. To fetch these dependencies 25 | you must have g++ 5.0 or higher and [python3-dev](https://stackoverflow.com/a/21530768) 26 | installed. You should then run 27 | ``` 28 | pip install reclab[recommenders] 29 | ``` 30 | which will install both the core reclab framework and the benchmark recommendation algorithms. 31 | 32 | ### Example 33 | The code below shows a simple use-case with random recommendations. 34 | ```python 35 | import numpy as np 36 | import reclab 37 | env = reclab.make('topics-dynamic-v1') 38 | items, users, ratings = env.reset() 39 | for i in range(1000): 40 | online_users = env.online_users 41 | # Your recommendation algorithm here. This recommends 10 random items to each online user. 42 | recommendations = np.random.choice(list(items), size=(len(online_users), 10)) 43 | _, _, ratings, info = env.step(recommendations) 44 | env.close() 45 | ``` 46 | 47 | ## RecLab Design 48 | This section briefly outlines the overall design of RecLab, and how to add new environments. 49 | 50 | ### Basics 51 | Evaluation in RecLab consists of two basic components: **Environments** and **Recommenders**. 52 | An environment consists of a set of users and items. A recommender and an environment interact 53 | iteratively. At each time-step the environment specifies a set of _online users_ that need to be 54 | recommended an item. The recommender uses the history of user-item interactions to either recommend 55 | a single item (top-1 recommendation), or a set of items (slate-based recommendation) to each online 56 | user. The environment then provides ratings to some of, or all, the recommended items. 57 | 58 | Below is a visualization of the interaction between environment and recommender. 59 | 60 | ![Flowchart](/figures/RecSys.png) 61 | 62 | #### Environments 63 | In RecLab all environments inherit from the [`Environment`](reclab/environments/environment.py) interface. The following methods must be implemented: 64 | - `reset`: Reset the environment to its original state. Must be called before the first step of the simulation. 65 | - `online_users`: Return a list of available users at each timestep. 66 | - `step(recommendations)`: Given `recommendations`, update the internal state of the environment and return the following data: 67 | - `users`: New users and users whose information got updated this timestep, along with any side information about each user. 68 | - `items`: New items and items whose information got updated this timestep, along with any side information about each item. 
69 | - `ratings`: New ratings and ratings whose information got updated this timestep, along with any side information about each rating. 70 | - `info`: Extra information that can be used for debugging but should not be made accessible to the recommender. 71 | 72 | To see a description of available environments see the [list of enviroments](reclab/environments/README.md). 73 | 74 | #### Recommenders 75 | RecLab does not assume recommendation algorithms are implemented in any specific way. However, we 76 | also provide a [convenient interface](reclab/recommenders/recommender.py) to simplify the design of 77 | new recommendation algorithms. 78 | 79 | To see a description of available recommenders see the 80 | [list of recommenders](reclab/recommenders/README.md). Note that you must install the optional 81 | dependencies to use some of these recommenders as outline under the [setup section](#Setup). 82 | 83 | **Coming soon:** More functionality for running experiments and custom performance metrics. 84 | -------------------------------------------------------------------------------- /reclab/recommenders/llorma/llorma.py: -------------------------------------------------------------------------------- 1 | """Tensorflow implementation of AutoRec recommender.""" 2 | import numpy as np 3 | from .llorma_lib import llorma_g 4 | from .. import recommender 5 | 6 | 7 | class Llorma(recommender.PredictRecommender): 8 | """Many local low rank models averaged via kernels. 9 | 10 | Parameters 11 | ---------- 12 | max_user : int 13 | Maximum number of users in the environment 14 | max_item : int 15 | Maximum number of items in the environment 16 | n_anchor : int 17 | Number of local model to build in the train phase 18 | pre_rank : int 19 | Dimension of the pre-train user/item latent factors 20 | rank : int 21 | Dimension of the train user/item factors 22 | pre_lambda_val : float 23 | Regularization parameter for the pre-train matrix factorization 24 | lambda_val : float 25 | Regularization parameter for the train model 26 | pre_learning_rate : float 27 | Learning rate when optimizing the pre-train matrix factorization 28 | learning_rate : float 29 | Learning rate for the the train model 30 | pre_train_steps : int 31 | Number of epochs in the pre-train phase 32 | train_steps : int 33 | Number of epochs in the training phase 34 | batch_size : int 35 | Batch size in training phase 36 | use_cache : bool 37 | If True use stored pre-trained item/user latent factors 38 | results_path : str 39 | Folder to save model outputs and checkpoints. 40 | kernel_fun : callable 41 | kernel function used for similarity, 42 | 43 | """ 44 | 45 | def __init__(self, 46 | max_user, 47 | max_item, 48 | n_anchor=10, 49 | pre_rank=5, 50 | pre_learning_rate=1e-3, 51 | pre_lambda_val=10, 52 | pre_train_steps=10, 53 | rank=10, 54 | learning_rate=1e-2, 55 | lambda_val=1e-3, 56 | train_steps=10, 57 | batch_size=128, 58 | use_cache=True, 59 | result_path='results', 60 | kernel_fun=None, 61 | random_seed=0): 62 | """Create new Local Low-Rank Matrix Approximation (LLORMA) recommender.""" 63 | super().__init__() 64 | 65 | self.model = llorma_g.Llorma(max_user, max_item, n_anchor, pre_rank, 66 | pre_learning_rate, pre_lambda_val, pre_train_steps, 67 | rank, learning_rate, lambda_val, train_steps, 68 | batch_size, use_cache, result_path, kernel_fun) 69 | self._hyperparameters.update(locals()) 70 | 71 | # We only want the function arguments so remove class related objects. 
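# (`self._hyperparameters.update(locals())` above also captured `self` and
# `__class__`, hence the two deletions below.)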
72 | del self._hyperparameters['self'] 73 | del self._hyperparameters['__class__'] 74 | np.random.seed(random_seed) 75 | 76 | @property 77 | def name(self): # noqa: D102 78 | return 'llorma' 79 | 80 | def _predict(self, user_item): # noqa: D102 81 | users, items, _ = list(zip(*user_item)) 82 | users = np.array(users) 83 | items = np.array(items) 84 | # Check that both the item and the user have been seen in historical data. 85 | is_seen_uid = np.array(users <= (self.model.batch_manager.n_user - 1)) 86 | is_seen_iid = np.array(items <= (self.model.batch_manager.n_item - 1)) 87 | is_seen_id = np.logical_and(is_seen_iid, is_seen_uid) 88 | 89 | seen_user_item = np.column_stack((users[is_seen_id], items[is_seen_id])) 90 | seen_estimate = self.model.predict(seen_user_item) 91 | # Choose the mean of the seen values as the estimate for the unseen ids. 92 | unseen_estimate = np.mean(seen_estimate) 93 | estimate = np.ones(len(users))*unseen_estimate 94 | estimate[is_seen_id] = seen_estimate 95 | print('Low: {:.3f}, Mean: {:.3f}, High: {:.3f}'.format(np.quantile(seen_estimate, 0.25), 96 | np.quantile(seen_estimate, 0.5), 97 | np.quantile(seen_estimate, 0.75))) 98 | return estimate 99 | 100 | def update(self, users=None, items=None, ratings=None): # noqa: D102 101 | super().update(users, items, ratings) 102 | updated_ratings = dict(self._ratings) 103 | user_items = np.array(list(updated_ratings.keys())) 104 | rating_arr = list(updated_ratings.values()) 105 | 106 | data = np.column_stack((user_items, rating_arr)) 107 | self.model.reset_data(data, data, data) 108 | self.model.train() 109 | -------------------------------------------------------------------------------- /tests/utils.py: -------------------------------------------------------------------------------- 1 | """A set of utility functions for testing.""" 2 | import collections 3 | import numpy as np 4 | 5 | from reclab import data_utils 6 | 7 | NUM_USERS_ML100K = 943 8 | NUM_ITEMS_ML100K = 1682 9 | 10 | NUM_USERS_SIMPLE = 2 11 | NUM_ITEMS_SIMPLE = 3 12 | 13 | 14 | def test_predict_ml100k(recommender, rmse_threshold=1.1, seed=None, test_dense=False): 15 | """Test that recommender predicts well and that it gets better with more data.""" 16 | users, items, ratings = data_utils.read_dataset('ml-100k') 17 | assert NUM_USERS_ML100K == len(users) 18 | assert NUM_ITEMS_ML100K == len(items) 19 | train_ratings, test_ratings = data_utils.split_ratings(ratings, 0.9, shuffle=True, seed=seed) 20 | train_ratings_1, train_ratings_2 = data_utils.split_ratings(train_ratings, 0.5) 21 | recommender.reset(users, items, train_ratings_1) 22 | user_item = [(key[0], key[1], val[1]) for key, val in test_ratings.items()] 23 | preds = recommender.predict(user_item) 24 | targets = [t[0] for t in test_ratings.values()] 25 | rmse1 = rmse(preds, targets) 26 | 27 | # We should get a relatively low RMSE here. 28 | assert rmse1 < rmse_threshold 29 | 30 | recommender.update(ratings=train_ratings_2) 31 | preds = recommender.predict(user_item) 32 | rmse2 = rmse(preds, targets) 33 | 34 | # The RMSE should have reduced. 35 | assert rmse1 > rmse2 36 | 37 | if test_dense: 38 | # Test that the dense predictions work as well. 39 | dense = recommender.dense_predictions 40 | preds = np.array([dense[key[0] - 1, key[1] - 1] for key in test_ratings]) 41 | rmse3 = rmse(preds, targets) 42 | # The RMSE should have reduced. 
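# (dense_predictions is computed after the update, so it should beat the stage-1 RMSE.)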
43 | assert rmse1 > rmse3 44 | 45 | 46 | def test_binary_recommend_ml100k(recommender, hit_rate_threshold, seed=None): 47 | """Test that the recommender will recommend good items and it gets better with more data.""" 48 | users, items, ratings = data_utils.read_dataset('ml-100k') 49 | assert NUM_USERS_ML100K == len(users) 50 | assert NUM_ITEMS_ML100K == len(items) 51 | train_ratings, test_ratings = data_utils.split_ratings(ratings, 0.9, shuffle=True, seed=seed) 52 | train_ratings_1, train_ratings_2 = data_utils.split_ratings(train_ratings, 0.5) 53 | all_contexts = collections.OrderedDict([(user_id, np.zeros(0)) for user_id in users]) 54 | 55 | recommender.reset(users, items, train_ratings_1) 56 | recs, _ = recommender.recommend(all_contexts, 1) 57 | num_hits = sum((user_id, rec) in test_ratings for user_id, rec in zip(users, recs[:, 0])) 58 | hit_rate1 = num_hits / NUM_USERS_ML100K 59 | 60 | # We should get a relatively low hit rate here. 61 | assert hit_rate1 > hit_rate_threshold, hit_rate1 62 | 63 | recommender.reset(users, items, train_ratings_1) 64 | recommender.update(ratings=train_ratings_2) 65 | recs, _ = recommender.recommend(all_contexts, 1) 66 | num_hits = sum((user_id, rec) in test_ratings for user_id, rec in zip(users, recs[:, 0])) 67 | hit_rate2 = num_hits / NUM_USERS_ML100K 68 | 69 | # The hit rate should have increased. 70 | assert hit_rate1 < hit_rate2, hit_rate2 71 | 72 | 73 | def test_recommend_simple(recommender): 74 | """Test that recommender will recommend reasonable items in simple setting.""" 75 | users = {0: np.zeros((0,)), 76 | 1: np.zeros((0,))} 77 | items = {0: np.zeros((0,)), 78 | 1: np.zeros((0,)), 79 | 2: np.zeros((0,))} 80 | assert NUM_USERS_SIMPLE == len(users) 81 | assert NUM_ITEMS_SIMPLE == len(items) 82 | ratings = {(0, 0): (5, np.zeros((0,))), 83 | (0, 1): (1, np.zeros((0,))), 84 | (0, 2): (5, np.zeros((0,))), 85 | (1, 0): (5, np.zeros((0,)))} 86 | recommender.reset(users, items, ratings) 87 | user_contexts = collections.OrderedDict([(1, np.zeros((0,)))]) 88 | recs, _ = recommender.recommend(user_contexts, 1) 89 | recommender.predict([(1, 1, np.zeros(0,)), (1, 2, np.zeros(0,))]) 90 | assert recs.shape == (1, 1) 91 | # The recommender should have recommended the item that user0 rated the highest. 92 | assert recs[0, 0] == 2 93 | 94 | 95 | def rmse(predictions, targets): 96 | """Compute the root mean squared error (RMSE) between prediction and target vectors.""" 97 | return np.sqrt(((predictions - targets) ** 2).mean()) 98 | 99 | 100 | def mock_select_online_users(self): 101 | """Return the users online at a given timestep. 102 | 103 | This functions is meant to replace the _select_online_users method in an environment 104 | when used for testing. 105 | """ 106 | # pylint: disable=protected-access 107 | num_online = int(len(self._users) * self._rating_frequency) 108 | start_id = (num_online * (self._timestep + 1)) % len(self._users) 109 | end_id = min(start_id + num_online, len(self._users)) 110 | return np.arange(start_id, end_id) 111 | -------------------------------------------------------------------------------- /reclab/environments/schmit.py: -------------------------------------------------------------------------------- 1 | """ 2 | Contains implementation for environment in "Human Interaction with Recommendation Systems". 3 | 4 | https://arxiv.org/pdf/1703.00535.pdf 5 | """ 6 | 7 | import numpy as np 8 | 9 | from . 
import environment 10 | 11 | 12 | class Schmit(environment.DictEnvironment): 13 | """ 14 | Implementation of environment with static private user preferences and user-item interactions. 15 | 16 | Based on "Human Interaction with Recommendation Systems" by Schmit and Riquelme (2018). 17 | 18 | Parameters 19 | ---------- 20 | num_users : int 21 | The number of users in the environment. 22 | num_items : int 23 | The number of items in the environment. 24 | rating_frequency : float 25 | What proportion of users will need a recommendation at each step. 26 | num_init_ratings: : int 27 | The number of initial ratings available when the environment is reset. 28 | rank : int 29 | Rank of user preferences. 30 | sigma : float 31 | Variance of the Gaussian noise added to determine user-item value. 32 | user_dist_choice : str 33 | The choice of user distribution for selecting online users. By default, the subset of 34 | online users is chosen from a uniform distribution. Currently supports normal and lognormal. 35 | 36 | """ 37 | 38 | def __init__(self, num_users, num_items, rating_frequency=0.2, 39 | num_init_ratings=0, rank=10, sigma=0.2, 40 | user_dist_choice='uniform'): 41 | """Create an environment.""" 42 | super().__init__(rating_frequency, num_init_ratings, 0, user_dist_choice) 43 | self._num_users = num_users 44 | self._num_items = num_items 45 | 46 | self.rank = rank 47 | self.sigma = sigma 48 | 49 | # constants 50 | self.item_bias = self._init_random.randn(num_items, 1) / 1.5 51 | self.user_bias = self._init_random.randn(num_users, 1) / 3 52 | 53 | # unobserved by agents 54 | self.U = self._init_random.randn(num_users, rank) / np.sqrt(self.rank) 55 | self.V = self._init_random.randn(num_items, rank) / np.sqrt(self.rank) 56 | 57 | # observed by agents 58 | self.X = self._init_random.randn(num_users, rank) / np.sqrt(self.rank) 59 | self.Y = self._init_random.randn(num_items, rank) / np.sqrt(self.rank) 60 | 61 | @property 62 | def name(self): 63 | """Name of environment, used for saving.""" 64 | return 'schmit' 65 | 66 | def true_score(self, user, item): 67 | """ 68 | Calculate true score. 69 | 70 | Parameters 71 | ---------- 72 | user : int 73 | User id for calculating preferences. 74 | item : int 75 | Item id. 76 | 77 | Returns 78 | ------- 79 | score : float 80 | The true score of the item for the user. 81 | 82 | """ 83 | return float(self.item_bias[item] + self.user_bias[user] + self.U[user] @ self.V[item].T) 84 | 85 | def value(self, user, item): 86 | """ 87 | Add private user preferences and Gaussian noise to true score. 88 | 89 | Parameters 90 | ---------- 91 | user : int 92 | User id for calculating preferences. 93 | item : int 94 | Item id. 95 | 96 | Returns 97 | ------- 98 | value : float 99 | The (noisy) value of the item to the user. 
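
        Notes
        -----
        Computed as ``true_score(user, item) + X[user] @ Y[item].T`` plus Gaussian
        noise with scale ``sigma`` and a constant offset of 3, clipped to [1, 5].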
100 | 101 | """ 102 | ratings = float(self.true_score(user, item) + self.X[user] @ self.Y[item].T + 103 | self._dynamics_random.normal(loc=0, scale=self.sigma) + 3) 104 | return np.clip(ratings, 1, 5) 105 | 106 | def _reset_state(self): 107 | self._users = {user_id: np.zeros((0,)) 108 | for user_id in range(self._num_users)} 109 | self._items = {item_id: np.zeros((0,)) 110 | for item_id in range(self._num_items)} 111 | 112 | self.item_bias = self._init_random.randn(self._num_items, 1) / 1.5 113 | self.user_bias = self._init_random.randn(self._num_users, 1) / 3 114 | 115 | self.U = self._init_random.randn( 116 | self._num_users, self.rank) / np.sqrt(self.rank) 117 | self.V = self._init_random.randn( 118 | self._num_items, self.rank) / np.sqrt(self.rank) 119 | self.X = self._init_random.randn( 120 | self._num_users, self.rank) / np.sqrt(self.rank) 121 | self.Y = self._init_random.randn( 122 | self._num_items, self.rank) / np.sqrt(self.rank) 123 | 124 | def _rate_items(self, user_id, item_id): 125 | return self.value(user_id, item_id) 126 | 127 | def _get_dense_ratings(self): 128 | """Compute all the true ratings on every user-item pair at the current timestep. 129 | 130 | A true rating is defined as the rating a user would make with all noise removed. 131 | 132 | Returns 133 | ------- 134 | dense_ratings : np.ndarray 135 | The array of all true ratings where true_ratings[i, j] is the rating by user i 136 | on item j. 137 | 138 | """ 139 | dense_ratings = np.zeros([self._num_users, self._num_items]) 140 | for u in range(self._num_users): 141 | for i in range(self._num_items): 142 | dense_ratings[u, i] = self.true_score( 143 | u, i) + self.X[u] @ self.Y[i].T + 3 144 | return dense_ratings 145 | -------------------------------------------------------------------------------- /reclab/environments/registry.py: -------------------------------------------------------------------------------- 1 | """Contains make, a function to instantiate a standardized environment from a string.""" 2 | from .beta_rank import BetaRank 3 | from .latent_factors import LatentFactorBehavior, DatasetLatentFactor 4 | from .schmit import Schmit 5 | from .topics import Topics 6 | 7 | NAMED_ENV_DICT = { 8 | 'topics-static-v1': ( 9 | Topics, 10 | dict(num_topics=19, 11 | num_users=1000, 12 | num_items=1700, 13 | rating_frequency=0.2, 14 | num_init_ratings=100000, 15 | noise=0.5, 16 | topic_change=0, 17 | memory_length=0, 18 | boredom_threshold=0, 19 | boredom_penalty=0) 20 | ), 21 | 'topics-static-v1-small': ( 22 | Topics, 23 | dict(num_topics=19, 24 | num_users=100, 25 | num_items=170, 26 | rating_frequency=0.2, 27 | num_init_ratings=5000, 28 | noise=0.5, 29 | topic_change=0, 30 | memory_length=0, 31 | boredom_threshold=0, 32 | boredom_penalty=0) 33 | ), 34 | 'topics-dynamic-v1': ( 35 | Topics, 36 | dict(num_topics=19, 37 | num_users=1000, 38 | num_items=1700, 39 | rating_frequency=0.2, 40 | num_init_ratings=100000, 41 | noise=0.5, 42 | topic_change=0.1, 43 | memory_length=5, 44 | boredom_threshold=2, 45 | boredom_penalty=1) 46 | ), 47 | 'topics-satiation-v1': ( 48 | Topics, 49 | dict(num_topics=19, 50 | num_users=1000, 51 | num_items=1700, 52 | rating_frequency=0.2, 53 | num_init_ratings=100000, 54 | noise=0.5, 55 | satiation_factor=3, 56 | satiation_decay=0.5, 57 | satiation_noise=0.1) 58 | ), 59 | 'topics-sensitization-v1': ( 60 | Topics, 61 | dict(num_topics=19, 62 | num_users=1000, 63 | num_items=1700, 64 | rating_frequency=0.2, 65 | num_init_ratings=100000, 66 | noise=0.5, 67 | satiation_factor=3, 68 | 
satiation_decay=(0.1, 0.5), 69 | satiation_noise=0.1, 70 | switch_probability=(0.05, 0.2)) 71 | ), 72 | 'latent-static-v1': ( 73 | LatentFactorBehavior, 74 | dict(latent_dim=100, 75 | num_users=943, 76 | num_items=1682, 77 | rating_frequency=0.2, 78 | num_init_ratings=100000, 79 | noise=0.5, 80 | affinity_change=0, 81 | memory_length=0, 82 | boredom_threshold=0, 83 | boredom_penalty=0) 84 | ), 85 | 'latent-dynamic-v1': ( 86 | LatentFactorBehavior, 87 | dict(latent_dim=100, 88 | num_users=943, 89 | num_items=1682, 90 | rating_frequency=0.2, 91 | num_init_ratings=100000, 92 | noise=0.5, 93 | affinity_change=0.2, 94 | memory_length=5, 95 | boredom_threshold=0, 96 | boredom_penalty=2) 97 | ), 98 | 'ml-100k-v1': ( 99 | DatasetLatentFactor, 100 | dict(name='ml-100k', 101 | latent_dim=0, 102 | rating_frequency=0.00107, 103 | num_init_ratings=0, 104 | noise=0.5, 105 | affinity_change=0, 106 | memory_length=0, 107 | boredom_threshold=0, 108 | boredom_penalty=0) 109 | ), 110 | 'latent-score-v1': ( 111 | Schmit, 112 | dict(num_users=1000, 113 | num_items=1700, 114 | rating_frequency=0.2, 115 | num_init_ratings=100000, 116 | rank=10, 117 | sigma=0.2) 118 | ), 119 | 'beta-rank-v1': ( 120 | BetaRank, 121 | dict(num_users=1000, 122 | num_items=1700, 123 | dimension=19, 124 | rating_frequency=0.001, 125 | num_init_ratings=0, 126 | known_mean=0.98) 127 | ), 128 | 'beta-rank-lowdata-v1': ( 129 | BetaRank, 130 | dict(num_users=1000, 131 | num_items=1700, 132 | dimension=19, 133 | rating_frequency=0.001, 134 | num_init_ratings=0, 135 | known_mean=0.98) 136 | ), 137 | 'beta-rank-small-v1': ( 138 | BetaRank, 139 | dict(num_users=100, 140 | num_items=170, 141 | dimension=19, 142 | rating_frequency=0.01, 143 | num_init_ratings=0, 144 | known_mean=0.98) 145 | ), 146 | } 147 | 148 | 149 | def make(name, **kwargs): 150 | """ 151 | Create an environment by name. 152 | 153 | You may optionally override the arguments for the environment constructor by specifying kwargs. 154 | 155 | Parameters 156 | ---------- 157 | name : str 158 | The name of the environment. 159 | 160 | Returns 161 | ------ 162 | env : Environment 163 | The constructed environment. 164 | 165 | """ 166 | if name not in NAMED_ENV_DICT: 167 | raise ValueError('{} is not a valid environment name. '.format(name) + 168 | 'Valid named environments: {}'.format(NAMED_ENV_DICT.keys())) 169 | env_class, params = NAMED_ENV_DICT[name] 170 | params.update(kwargs) 171 | return env_class(**params) 172 | -------------------------------------------------------------------------------- /reclab/recommenders/cfnade/cfnade_lib/utils.py: -------------------------------------------------------------------------------- 1 | """ Util functions for class Cfnade""" 2 | from itertools import islice 3 | import numpy as np 4 | import keras 5 | from keras import backend as K 6 | from keras.callbacks import Callback 7 | 8 | class DataSet(Callback): 9 | """ 10 | A datagenerator the feeds data in batches. 
11 | 12 | ratings_df: rating matrix, num_iters * num_users, entry is input rating rounded to integer 13 | batch_size: int, batch size, default is 64 14 | num_users: int, number of users 15 | num_items: int, number of items 16 | mode: int, 0 for train, 1 for eval, 2 for test 17 | """ 18 | def __init__(self,ratings_df, 19 | num_users, 20 | num_items, 21 | batch_size, 22 | rating_bucket, 23 | mode): 24 | 25 | self.num_users = num_users 26 | self.num_items = num_items 27 | self.batch_size = batch_size 28 | self.ratings_df = ratings_df 29 | self.rating_bucket = rating_bucket 30 | self.mode = mode 31 | 32 | def generate(self, eval=False): 33 | """ 34 | a generator function yields ratings_df for each batch 35 | 36 | """ 37 | line_pointer = 0 38 | while True: 39 | next_n_data_lines = list(islice(self.ratings_df, line_pointer, line_pointer+self.batch_size)) 40 | if not next_n_data_lines: 41 | if self.mode == 0 and eval==False: 42 | line_pointer = 0 43 | next_n_data_lines = list(islice(self.ratings_df, line_pointer, line_pointer+self.batch_size)) 44 | else: 45 | break 46 | input_ranking_vectors = np.zeros((self.batch_size, self.num_users, self.rating_bucket), dtype='int8') 47 | output_ranking_vectors = np.zeros((self.batch_size, self.num_users, self.rating_bucket), dtype='int8') 48 | input_mask_vectors = np.zeros((self.batch_size, self.num_users), dtype='int8') 49 | output_mask_vectors = np.zeros((self.batch_size, self.num_users), dtype='int8') 50 | for i, line in enumerate(next_n_data_lines): 51 | user_ids = np.nonzero(line)[0] 52 | ratings_line = line[line != 0] 53 | 54 | if self.mode == 0 and len(user_ids) != 0: 55 | # a random ordered list 0 to len(user_ids)-1 56 | 57 | ordering = np.random.permutation(np.arange(len(user_ids))) 58 | random_num = np.random.randint(0, len(ordering)) 59 | flag_in = (ordering < random_num) 60 | flag_out = (ordering >= random_num) 61 | 62 | input_mask_vectors[i][user_ids] = flag_in 63 | output_mask_vectors[i][user_ids] = flag_out 64 | 65 | for j, (user_id, value) in enumerate(zip(user_ids, ratings_line)): 66 | if flag_in[j]: 67 | input_ranking_vectors[i, user_id, (value-1)] = 1 68 | else: 69 | output_ranking_vectors[i, user_id, (value-1)] = 1 70 | if self.mode == 2: 71 | for j, (user_id, value) in enumerate(zip(user_ids, ratings_line)): 72 | input_ranking_vectors[i, user_id, (value-1)] = 1 73 | 74 | inputs = { 75 | 'input_ratings': input_ranking_vectors, 76 | 'output_ratings': output_ranking_vectors, 77 | 'input_masks': input_mask_vectors, 78 | 'output_masks': output_mask_vectors} 79 | 80 | outputs = {'nade_loss': np.zeros([self.batch_size])} 81 | yield (inputs, outputs) 82 | line_pointer = line_pointer + self.batch_size 83 | 84 | 85 | def prediction_layer(x): 86 | # x.shape = (?,6040,5) 87 | x_cumsum = K.cumsum(x, axis=2) 88 | # x_cumsum.shape = (?,6040,5) 89 | 90 | output = K.softmax(x_cumsum) 91 | # output = (?,6040,5) 92 | return output 93 | 94 | 95 | def prediction_output_shape(input_shape): 96 | 97 | return input_shape 98 | 99 | 100 | def d_layer(x): 101 | 102 | return K.sum(x, axis=1) 103 | 104 | 105 | def d_output_shape(input_shape): 106 | 107 | return (input_shape[0], ) 108 | 109 | 110 | def D_layer(x): 111 | 112 | return K.sum(x, axis=1) 113 | 114 | 115 | def D_output_shape(input_shape): 116 | 117 | return (input_shape[0],) 118 | 119 | 120 | def rating_cost_lambda_func(args): 121 | alpha=0.01 #in the paper they reported alpha = 0.01 and std = 1.0. THis is what was used in the repo. 
122 | std=1.0 123 | pred_score, true_ratings, input_masks, output_masks, D, d = args 124 | pred_score_cum = K.cumsum(pred_score, axis=2) 125 | prob_item_ratings = K.softmax(pred_score_cum) 126 | accu_prob_1N = K.cumsum(prob_item_ratings, axis=2) 127 | accu_prob_N1 = K.cumsum(prob_item_ratings[:, :, ::-1], axis=2)[:, :, ::-1] 128 | mask1N = K.cumsum(true_ratings[:, :, ::-1], axis=2)[:, :, ::-1] 129 | maskN1 = K.cumsum(true_ratings, axis=2) 130 | cost_ordinal_1N = -K.sum((K.log(prob_item_ratings) - K.log(accu_prob_1N)) * mask1N, axis=2) 131 | cost_ordinal_N1 = -K.sum((K.log(prob_item_ratings) - K.log(accu_prob_N1)) * maskN1, axis=2) 132 | cost_ordinal = cost_ordinal_1N + cost_ordinal_N1 133 | nll_item_ratings = K.sum(-(true_ratings * K.log(prob_item_ratings)), axis=2) 134 | nll = std * K.sum(nll_item_ratings, axis=1) * 1.0 * D / (D - d + 1e-6) \ 135 | + alpha * K.sum(cost_ordinal, axis=1) * 1.0 * D / (D - d + 1e-6) 136 | cost = K.mean(nll) 137 | cost = K.expand_dims(cost, 0) 138 | 139 | return cost 140 | -------------------------------------------------------------------------------- /reclab/recommenders/autorec/autorec.py: -------------------------------------------------------------------------------- 1 | """Pytorch implementation of AutoRec recommender.""" 2 | 3 | import math 4 | import numpy as np 5 | import torch 6 | 7 | from .autorec_lib import autorec 8 | from .. import recommender 9 | 10 | 11 | class Autorec(recommender.PredictRecommender): 12 | """The Autorec recommender. 13 | 14 | Parameters 15 | ---------- 16 | num_users : int 17 | Number of users in the environment. 18 | num_items : int 19 | Number of items in the environment. 20 | hidden_neuron : int 21 | Output dimension of hidden layer. 22 | lambda_value : float 23 | Coefficient for regularization while training layers. 24 | train_epoch : int 25 | Number of epochs to train for each call. 26 | batch_size : int 27 | Batch size during initial training phase. 28 | optimizer_method : str 29 | Optimizer for training model; either Adam or RMSProp. 30 | grad_clip : bool 31 | Set to true to clip gradients to [-5, 5]. 32 | base_lr : float 33 | Base learning rate for optimizer. 34 | lr_decay : float 35 | Rate for decaying learning rate during training. 36 | dropout : float 37 | Probability to initialize dropout layer. Set to 0 for no dropout. 38 | random_seed : int 39 | Random seed to reproduce results. 40 | 41 | """ 42 | 43 | def __init__(self, num_users, num_items, 44 | hidden_neuron=500, lambda_value=1, 45 | train_epoch=1000, batch_size=1000, optimizer_method='RMSProp', 46 | grad_clip=False, base_lr=1e-3, lr_decay=1e-2, 47 | dropout=0.05, random_seed=0): 48 | """Create new Autorec recommender.""" 49 | super().__init__() 50 | 51 | # We only want the function arguments so remove class related objects. 
52 | self._hyperparameters.update(locals()) 53 | del self._hyperparameters['self'] 54 | del self._hyperparameters['__class__'] 55 | 56 | self.model = autorec.AutoRec(num_users, 57 | num_items, 58 | seen_users=set(), 59 | seen_items=set(), 60 | hidden_neuron=hidden_neuron, 61 | dropout=dropout, 62 | random_seed=random_seed) 63 | self.lambda_value = lambda_value 64 | self.num_users = num_users 65 | self.num_items = num_items 66 | self.train_epoch = train_epoch 67 | self.batch_size = batch_size 68 | self.num_batch = int(math.ceil(self.num_items / float(self.batch_size))) 69 | self.base_lr = base_lr 70 | self.optimizer_method = optimizer_method 71 | self.random_seed = random_seed 72 | 73 | self.lr_decay = lr_decay 74 | self.grad_clip = grad_clip 75 | np.random.seed(self.random_seed) 76 | # pylint: disable=no-member 77 | self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') 78 | 79 | def train_model(self, data): 80 | """Train for all epochs in train_epoch.""" 81 | self.model.train() 82 | if self.optimizer_method == 'Adam': 83 | optimizer = torch.optim.Adam(self.model.parameters(), lr=self.base_lr) 84 | 85 | elif self.optimizer_method == 'RMSProp': 86 | optimizer = torch.optim.RMSprop(self.model.parameters(), lr=self.base_lr) 87 | else: 88 | raise ValueError('Optimizer Key ERROR') 89 | 90 | scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=100, gamma=self.lr_decay) 91 | 92 | self.model.to(self.device) 93 | for epoch in range(self.train_epoch): 94 | self.train(data, optimizer, scheduler) 95 | 96 | def train(self, data, optimizer, scheduler): 97 | """Train for a single epoch.""" 98 | random_perm_doc_idx = np.random.permutation(self.num_items) 99 | for i in range(self.num_batch): 100 | if i == self.num_batch - 1: 101 | batch_set_idx = random_perm_doc_idx[i * self.batch_size:] 102 | elif i < self.num_batch - 1: 103 | batch_set_idx = random_perm_doc_idx[i * self.batch_size:(i+1) * self.batch_size] 104 | 105 | batch = data[batch_set_idx, :].to(self.device) 106 | output = self.model.forward(batch) 107 | mask = self.mask_ratings[batch_set_idx, :].to(self.device) 108 | loss = self.model.loss(output, 109 | batch, 110 | mask, 111 | lambda_value=self.lambda_value) 112 | 113 | loss.backward() 114 | if self.grad_clip: 115 | torch.nn.utils.clip_grad_norm_(self.model.parameters(), 5) 116 | 117 | optimizer.step() 118 | scheduler.step() 119 | 120 | @property 121 | def name(self): # noqa: D102 122 | return 'autorec' 123 | 124 | def _predict(self, user_item): 125 | self.model = self.model.eval() 126 | return self.model.predict(user_item, self.ratings.to(self.device)) 127 | 128 | def reset(self, users=None, items=None, ratings=None): # noqa: D102 129 | self.model.prepare_model() 130 | super().reset(users, items, ratings) 131 | 132 | def update(self, users=None, items=None, ratings=None): # noqa: D102 133 | super().update(users, items, ratings) 134 | self.model.prepare_model() 135 | self.model = self.model.train() 136 | for user_item in ratings: 137 | self.model.seen_users.add(user_item[0]) 138 | self.model.seen_items.add(user_item[1]) 139 | 140 | ratings = self._ratings.toarray() 141 | # Item-based autorec expects rows that represent items 142 | # pylint: disable=no-member 143 | self.ratings = torch.FloatTensor(ratings.T) 144 | # pylint: disable=no-member 145 | self.mask_ratings = torch.FloatTensor(ratings.T).clamp(0, 1) 146 | 147 | self.train_model(self.ratings) 148 | -------------------------------------------------------------------------------- 
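Not part of the repository, but a minimal sketch of how a `PredictRecommender` such as `Autorec` might be driven against a RecLab environment, combining the `reset`/`recommend`/`update` calls used in `tests/utils.py` with the environment loop from the README. The environment name comes from the registry; treating `env.online_users` as the context dict expected by `recommend()`, and importing `Autorec` from `reclab.recommenders`, are assumptions.

```python
# Hypothetical usage sketch (not part of the repo).
import reclab
from reclab.recommenders import Autorec  # assumption: Autorec is re-exported here

env = reclab.make('topics-static-v1-small')  # 100 users, 170 items per the registry
items, users, ratings = env.reset()

recommender = Autorec(num_users=len(users), num_items=len(items), train_epoch=50)
recommender.reset(users, items, ratings)

for _ in range(10):
    # Assumption: env.online_users is the dict of user_id -> context that
    # recommend() expects, as in tests/utils.py.
    recs, _ = recommender.recommend(env.online_users, 1)
    users, items, ratings, _ = env.step(recs)
    recommender.update(users, items, ratings)
env.close()
```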
/tests/test_topics.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=protected-access 2 | """Tests for the Topics environment.""" 3 | import copy 4 | import numpy as np 5 | 6 | from reclab.environments import Topics 7 | 8 | 9 | def _test_dimension_consistency(environment): 10 | """ Basic Helper Test to check if dimension of 11 | various environment properties.""" 12 | env = copy.deepcopy(environment) 13 | 14 | assert env.name == 'topics' 15 | users, items, _ = env.reset() 16 | 17 | # Test that the users and items have empty features. 18 | num_users = len(env.users) 19 | num_items = len(env.items) 20 | num_topics = env._num_topics 21 | assert users[0].shape == (0,) 22 | assert items[0].shape == (0,) 23 | assert env.online_users[0].shape == (0,) 24 | 25 | # Test that item topics and user preferences are of the correct size. 26 | assert env._item_topics.shape == (num_items,) 27 | assert env._user_preferences.shape == (num_users, num_topics) 28 | 29 | # Recommend item 0, we shouldn't observe new users or items. 30 | users, items, _, _ = env.step(np.array([[0]])) 31 | assert users == {} 32 | assert items == {} 33 | 34 | 35 | def test_topics_static_simple(): 36 | """Test Topics with only one user, with no preference shifts 37 | and no topic change and no boredom.""" 38 | env = Topics(num_topics=2, 39 | num_users=1, 40 | num_items=2, 41 | rating_frequency=1.0, 42 | num_init_ratings=0, 43 | noise=0.0, 44 | topic_change=0.0, 45 | memory_length=0, 46 | boredom_threshold=0, 47 | boredom_penalty=0.0, 48 | user_dist_choice='uniform', 49 | shift_steps=1, 50 | shift_frequency=0.0, 51 | shift_weight=0.0) 52 | 53 | _test_dimension_consistency(env) 54 | env.reset() 55 | 56 | old_user_preferences = copy.deepcopy(env._user_preferences) 57 | old_dense_ratings = env._get_dense_ratings() 58 | 59 | # Recommend item 0 60 | env.step(np.array([[0]])) 61 | 62 | # Test that the preferences didn't change 63 | assert np.array_equal(old_user_preferences, env._user_preferences) 64 | # Test that the dense ratings didn't change 65 | assert np.array_equal(old_dense_ratings, env._get_dense_ratings()) 66 | 67 | 68 | def test_topics_shift(): 69 | """Test Topics with random preference shifts""" 70 | env = Topics(num_topics=2, 71 | num_users=1, 72 | num_items=10, 73 | rating_frequency=1.0, 74 | num_init_ratings=0, 75 | noise=0.0, 76 | topic_change=0.0, 77 | memory_length=0, 78 | boredom_threshold=0, 79 | boredom_penalty=0.0, 80 | user_dist_choice='uniform', 81 | shift_steps=2, 82 | shift_frequency=1, 83 | shift_weight=0.5, 84 | user_bias_type='normal') 85 | 86 | _test_dimension_consistency(env) 87 | env.reset() 88 | 89 | old_user_preferences = copy.deepcopy(env._user_preferences) 90 | old_user_biases = copy.deepcopy(env._user_biases) 91 | 92 | # Recommend item 0. 93 | env.step(np.array([[0]])) 94 | 95 | # Test that the preferences and biases didn't change. 96 | assert np.array_equal(old_user_preferences, env._user_preferences) 97 | assert np.array_equal(old_user_biases, env._user_biases) 98 | 99 | # Recommend another item and check that preferences have changed. 
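# (shift_steps=2 and shift_frequency=1 above, so the preference shift is expected
# to take effect on this second step.)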
100 | env.step(np.array([[1]])) 101 | assert not np.array_equal(old_user_preferences, env._user_preferences) 102 | assert not np.array_equal(old_user_biases, env._user_biases) 103 | 104 | 105 | def test_topics_boredom(): 106 | """Test Topics with boredom shifts""" 107 | env = Topics(num_topics=2, 108 | num_users=1, 109 | num_items=10, 110 | rating_frequency=1.0, 111 | num_init_ratings=0, 112 | noise=0.0, 113 | topic_change=0.0, 114 | memory_length=3, 115 | boredom_threshold=1, 116 | boredom_penalty=1, 117 | user_dist_choice='uniform', 118 | shift_steps=1, 119 | shift_frequency=0, 120 | shift_weight=0) 121 | 122 | _test_dimension_consistency(env) 123 | env.reset() 124 | # Change all the item types to type 0. 125 | env._item_topics = np.zeros(len(env.items), dtype=int) 126 | 127 | old_ratings = env._get_dense_ratings() 128 | 129 | # Recommend item 0 and check that ratings don't change. 130 | env.step(np.array([[0]])) 131 | assert np.array_equal(old_ratings, env._get_dense_ratings()) 132 | 133 | # Recommend item 1 and check that dense ratings decrease by the 134 | # same amount as the boredom penalty. 135 | env.step(np.array([[1]])) 136 | assert np.array_equal(old_ratings-env._boredom_penalty, env._get_dense_ratings()) 137 | 138 | 139 | def test_topics_change(): 140 | """Test Topics with topic change""" 141 | env = Topics(num_topics=2, 142 | num_users=1, 143 | num_items=10, 144 | rating_frequency=1.0, 145 | num_init_ratings=0, 146 | noise=0.0, 147 | topic_change=0.5, 148 | memory_length=0, 149 | boredom_threshold=0, 150 | boredom_penalty=0, 151 | user_dist_choice='uniform', 152 | shift_steps=1, 153 | shift_frequency=0, 154 | shift_weight=0) 155 | 156 | _test_dimension_consistency(env) 157 | env.reset() 158 | # Change all the item types to type 0. 159 | env._item_topics = np.zeros(len(env.items), dtype=int) 160 | 161 | old_user_preferences = copy.deepcopy(env._user_preferences) 162 | 163 | # Recommend item 0 and check that preferences for the recommended topic have 164 | # increased while the preference for the other topic decreased. 165 | env.step(np.array([[0]])) 166 | topic = env._item_topics[0] 167 | new_user_preferences = env._user_preferences 168 | assert new_user_preferences[0][topic] >= old_user_preferences[0][topic] 169 | assert new_user_preferences[0][1-topic] <= old_user_preferences[0][1-topic] 170 | -------------------------------------------------------------------------------- /reclab/recommenders/sparse.py: -------------------------------------------------------------------------------- 1 | """An implementation of SLIM and EASE sparse linear recommenders. 2 | 3 | For details, see: 4 | - http://glaros.dtc.umn.edu/gkhome/node/774 5 | - https://arxiv.org/pdf/1905.03375.pdf 6 | """ 7 | import warnings 8 | import numpy as np 9 | import scipy.sparse 10 | import sklearn.linear_model 11 | from sklearn.exceptions import ConvergenceWarning 12 | 13 | from . import recommender 14 | 15 | warnings.simplefilter('ignore', category=ConvergenceWarning) 16 | 17 | 18 | class SLIM(recommender.PredictRecommender): 19 | """The SLIM recommendation model which is a sparse linear method. 20 | 21 | Parameters 22 | ---------- 23 | binarize : boolean 24 | Determines whether to binarize ratings before fitting a model. 25 | alpha : float 26 | Constant that multiplies the regularization terms. 27 | l1_ratio : float 28 | The ratio of the L1 regularization term with respect to the L2 regularization. 29 | max_iter : int 30 | The maximum number of iterations to train the model for. 
31 | tol : float 32 | The tolerance below which the optimization will stop. 33 | seed : int 34 | The random seed to use when training the model. 35 | 36 | """ 37 | 38 | def __init__(self, 39 | binarize=False, 40 | alpha=1.0, 41 | l1_ratio=0.1, 42 | positive=True, 43 | max_iter=100, 44 | tol=1e-4, 45 | seed=0): 46 | """Create a SLIM recommender.""" 47 | super().__init__() 48 | self._binarize = binarize 49 | self._model = sklearn.linear_model.ElasticNet(alpha=alpha, 50 | l1_ratio=l1_ratio, 51 | positive=positive, 52 | fit_intercept=False, 53 | copy_X=False, 54 | precompute=True, 55 | selection='random', 56 | max_iter=max_iter, 57 | tol=tol, 58 | random_state=seed) 59 | self._weights = None 60 | self._hyperparameters.update(locals()) 61 | 62 | # We only want the function arguments so remove class related objects. 63 | del self._hyperparameters['self'] 64 | del self._hyperparameters['__class__'] 65 | 66 | @property 67 | def name(self): # noqa: D102 68 | return 'slim' 69 | 70 | def update(self, users=None, items=None, ratings=None): # noqa: D102 71 | super().update(users, items, ratings) 72 | num_items = len(self._items) 73 | self._weights = scipy.sparse.dok_matrix((num_items, num_items)) 74 | if self._binarize: 75 | row, col = self._ratings.nonzero() 76 | data = np.ones(len(row)) 77 | ratings = scipy.sparse.csr_matrix((data, (row, col)), shape=self._ratings.shape).tolil() 78 | else: 79 | ratings = self._ratings.tolil() 80 | for item_id in range(num_items): 81 | target = ratings[:, item_id].toarray() 82 | # Zero out the column of the current item to prevent a trivial solution. 83 | ratings[:, item_id] = 0 84 | # Fit the mode and save the weights 85 | # This currently takes 0.02s/item on ML100k 86 | self._model.fit(ratings, target) 87 | self._weights[:, item_id] = self._model.sparse_coef_.T 88 | self._weights[item_id, item_id] = 0 89 | # Restore the rating column. 90 | ratings[:, item_id] = target 91 | self._weights = scipy.sparse.csr_matrix(self._weights) 92 | 93 | @property 94 | def dense_predictions(self): # noqa: D102 95 | if self._dense_predictions is None: 96 | self._dense_predictions = (self._ratings @ self._weights).todense() 97 | return self._dense_predictions 98 | 99 | def _predict(self, user_item): # noqa: D102 100 | # Predict on all user-item pairs. 101 | all_predictions = self.dense_predictions 102 | predictions = [] 103 | for user_id, item_id, _ in user_item: 104 | predictions.append(all_predictions[user_id, item_id]) 105 | 106 | return np.array(predictions) 107 | 108 | 109 | class EASE(recommender.PredictRecommender): 110 | """The EASE recommendation model which is a simple linear method. 111 | 112 | Parameters 113 | ---------- 114 | binarize : boolean 115 | Determines whether to binarize ratings before fitting a model. 116 | lam : float 117 | Constant that multiplies the regularization terms. 118 | 119 | """ 120 | 121 | def __init__(self, 122 | binarize=False, 123 | lam=1.0): 124 | """Create an EASE recommender.""" 125 | super().__init__() 126 | 127 | self._binarize = binarize 128 | self._lam = lam 129 | 130 | self._weights = None 131 | self._hyperparameters.update(locals()) 132 | 133 | # We only want the function arguments so remove class related objects. 
134 | del self._hyperparameters['self'] 135 | del self._hyperparameters['__class__'] 136 | 137 | @property 138 | def name(self): # noqa: D102 139 | return 'ease' 140 | 141 | def update(self, users=None, items=None, ratings=None): # noqa: D102 142 | super().update(users, items, ratings) 143 | 144 | if self._binarize: 145 | row, col = self._ratings.nonzero() 146 | data = np.ones(len(row)) 147 | ratings = scipy.sparse.csr_matrix((data, (row, col)), shape=self._ratings.shape) 148 | else: 149 | ratings = self._ratings 150 | 151 | item_products = ratings.T @ ratings 152 | 153 | diag_ind = np.diag_indices(item_products.shape[0]) 154 | item_products[diag_ind] += self._lam 155 | inverse_mat = np.linalg.inv(item_products.todense()) 156 | self._weights = inverse_mat / (-np.diag(inverse_mat)) 157 | self._weights[diag_ind] = 0 158 | 159 | @property 160 | def dense_predictions(self): # noqa: D102 161 | if self._dense_predictions is None: 162 | self._dense_predictions = (self._ratings @ self._weights) 163 | return self._dense_predictions 164 | 165 | def _predict(self, user_item): # noqa: D102 166 | # Predict on all user-item pairs. 167 | all_predictions = self.dense_predictions 168 | predictions = [] 169 | for user_id, item_id, _ in user_item: 170 | predictions.append(all_predictions[user_id, item_id]) 171 | 172 | return np.array(predictions) 173 | -------------------------------------------------------------------------------- /reclab/recommenders/cfnade/cfnade.py: -------------------------------------------------------------------------------- 1 | """Implementation of the CF-NADE recommender using Keras.""" 2 | from keras.layers import Input, Dropout, Lambda, add 3 | from keras.models import Model 4 | import keras.regularizers 5 | from tensorflow.keras.optimizers import Adam 6 | import numpy as np 7 | 8 | from .cfnade_lib.nade import NADE 9 | from .cfnade_lib import utils 10 | from .. import recommender 11 | 12 | 13 | class Cfnade(recommender.PredictRecommender): 14 | """ 15 | A Neural Autoregressive Distribution Estimator (NADE) for collaborative filtering (CF) tasks. 16 | 17 | Parameters 18 | --------- 19 | num_users : int 20 | Number of users in the environment. 21 | num_items : int 22 | Number of items in the environment. 23 | train_set : np.matrix 24 | Matrix of shape (num_users, num_items) populated with user ratings. 25 | train_epoch : int 26 | Number of epochs to train for each call. 27 | batch_size : int 28 | Batch size during initial training phase. 
29 | rating_bucket: int 30 | number of rating buckets 31 | rate_score: array of float 32 | An array of corresponding rating score for each bucket 33 | hidden_dim: int 34 | hidden dimension to construct the layer 35 | learning_rate: float 36 | learning rate 37 | 38 | """ 39 | 40 | def __init__( 41 | self, num_users, num_items, 42 | batch_size=64, train_epoch=10, 43 | rating_bucket=5, hidden_dim=500, 44 | learning_rate=0.001, normalized_layer=False, 45 | random_seed=0): 46 | """Create new Cfnade recommender.""" 47 | super().__init__() 48 | self._num_users = num_users 49 | self._num_items = num_items 50 | self._batch_size = batch_size 51 | if num_items <= batch_size: 52 | self._batch_size = num_items 53 | self._input_dim0 = num_users 54 | self._rating_bucket = rating_bucket 55 | self._rate_score = np.array(np.arange(1, rating_bucket+1), np.float32) 56 | self._hidden_dim = hidden_dim 57 | self._learning_rate = learning_rate 58 | self._train_epoch = train_epoch 59 | self._hyperparameters.update(locals()) 60 | self._new_items = np.zeros(num_items) 61 | np.random.seed(random_seed) 62 | 63 | # We only want the function arguments so remove class related objects. 64 | del self._hyperparameters['self'] 65 | del self._hyperparameters['__class__'] 66 | 67 | # Prepare model 68 | input_layer = Input(shape=(self._input_dim0, self._rating_bucket), name='input_ratings') 69 | output_ratings = Input(shape=(self._input_dim0, self._rating_bucket), name='output_ratings') 70 | input_masks = Input(shape=(self._input_dim0,), name='input_masks') 71 | output_masks = Input(shape=(self._input_dim0,), name='output_masks') 72 | nade_layer = Dropout(0.0)(input_layer) 73 | nade_layer = NADE( 74 | hidden_dim=self._hidden_dim, activation='tanh', bias=True, 75 | W_regularizer=keras.regularizers.l2(0.02), 76 | V_regularizer=keras.regularizers.l2(0.02), 77 | b_regularizer=keras.regularizers.l2(0.02), 78 | c_regularizer=keras.regularizers.l2(0.02), 79 | normalized_layer=normalized_layer)(nade_layer) 80 | 81 | predicted_ratings = Lambda( 82 | utils.prediction_layer, 83 | output_shape=utils.prediction_output_shape, 84 | name='predicted_ratings')(nade_layer) 85 | 86 | func_d = Lambda( 87 | utils.d_layer, output_shape=utils.d_output_shape, 88 | name='func_d')(input_masks) 89 | sum_masks = add([input_masks, output_masks]) 90 | func_d_2 = Lambda( 91 | utils.D_layer, output_shape=utils.D_output_shape, 92 | name='func_d_2')(sum_masks) 93 | loss_out = Lambda( 94 | utils.rating_cost_lambda_func, output_shape=(1, ), 95 | name='nade_loss')([nade_layer, output_ratings, 96 | input_masks, output_masks, func_d_2, func_d]) 97 | 98 | self._cf_nade_model = Model( 99 | inputs=[input_layer, output_ratings, input_masks, output_masks], 100 | outputs=[loss_out, predicted_ratings]) 101 | optimizer = Adam(self._learning_rate, 0.9, 0.999, 1e-8) 102 | self._cf_nade_model.compile( 103 | loss={'nade_loss': lambda y_true, y_pred: y_pred}, 104 | optimizer=optimizer) 105 | self._cf_nade_model.save_weights('model.h5') 106 | 107 | @property 108 | def name(self): # noqa: D102 109 | return 'cfnade' 110 | 111 | def update(self, users=None, items=None, ratings=None): # noqa: D102 112 | super().update(users, items, ratings) 113 | self._cf_nade_model.load_weights('model.h5') 114 | 115 | ratings_matrix = self._ratings.toarray() 116 | ratings_matrix = np.around(ratings_matrix.transpose()) 117 | ratings_matrix = ratings_matrix.astype(int) 118 | 119 | train_set = utils.DataSet(ratings_matrix, 120 | num_users=self._num_users, 121 | num_items=self._num_items, 122 | 
batch_size=self._batch_size, 123 | rating_bucket=self._rating_bucket, 124 | mode=0) 125 | self._cf_nade_model.fit_generator(train_set.generate(), 126 | steps_per_epoch=(self._num_items // self._batch_size), 127 | epochs=self._train_epoch, 128 | callbacks=[train_set], verbose=1) 129 | 130 | def _predict(self, user_item): # noqa: D102 131 | ratings_matrix = self._ratings.toarray() 132 | ratings_matrix = np.around(ratings_matrix.transpose()) 133 | ratings_matrix = ratings_matrix.astype(int) 134 | 135 | # keep track of unseen items in ratings 136 | ratings_matrix_total = ratings_matrix.transpose().sum(axis=1) 137 | self._new_items = np.where(ratings_matrix_total == 0)[0] 138 | 139 | test_set = utils.DataSet(ratings_matrix, 140 | num_users=self._num_users, 141 | num_items=self._num_items, 142 | batch_size=self._batch_size, 143 | rating_bucket=self._rating_bucket, 144 | mode=2) 145 | pred_rating = [] 146 | for batch in test_set.generate(): 147 | pred_matrix = self._cf_nade_model.predict(batch[0])[1] 148 | pred_rating_batch = pred_matrix * self._rate_score[np.newaxis, np.newaxis, :] 149 | pred_rating_batch = pred_rating_batch.sum(axis=2) 150 | pred_rating.append(pred_rating_batch) 151 | pred_rating = np.concatenate(pred_rating, axis=0) 152 | 153 | predictions = [] 154 | for user, item, _ in user_item: 155 | if item in self._new_items: 156 | predictions.append(3) 157 | else: 158 | predictions.append(pred_rating[item, user]) 159 | 160 | return np.array(predictions) 161 | -------------------------------------------------------------------------------- /reclab/recommenders/knn_recommender.py: -------------------------------------------------------------------------------- 1 | """The implementation for a neighborhood based recommender.""" 2 | import heapq 3 | 4 | import numpy as np 5 | import scipy.sparse 6 | import scipy.sparse.linalg 7 | 8 | from . import recommender 9 | 10 | 11 | class KNNRecommender(recommender.PredictRecommender): 12 | """A neighborhood based collaborative filtering algorithm. 13 | 14 | The class supports both user and item based collaborative filtering. 15 | 16 | Parameters 17 | ---------- 18 | shrinkage : float 19 | The shrinkage parameter applied to the similarity measure. 20 | neighborhood_size : int 21 | The number of users/items to consider when estimating a rating. 22 | user_based : bool 23 | If this variable is set to true the created object will use user-based collaborative 24 | filtering, otherwise it will use item-based collaborative filtering. 25 | use_content : bool 26 | Whether to use the user/item features when computing the similarity measure. 27 | use_means : bool 28 | Whether to adjust the ratings based on the mean rating of each user/item. 29 | 30 | """ 31 | 32 | def __init__(self, shrinkage=0, neighborhood_size=40, 33 | user_based=True, use_content=True, use_means=True, 34 | **kwargs): 35 | """Create a new neighborhood recommender.""" 36 | super().__init__(**kwargs) 37 | self._shrinkage = shrinkage 38 | self._neighborhood_size = neighborhood_size 39 | self._user_based = user_based 40 | self._use_content = use_content 41 | self._use_means = use_means 42 | self._feature_matrix = scipy.sparse.csr_matrix((0, 0)) 43 | self._means = np.empty(0) 44 | self._similarity_matrix = np.empty((0, 0)) 45 | self._ratings_matrix = np.empty((0, 0)) 46 | self._hyperparameters.update(locals()) 47 | 48 | # We only want the function arguments so remove class related objects. 
49 | del self._hyperparameters['self'] 50 | del self._hyperparameters['__class__'] 51 | 52 | @property 53 | def name(self): # noqa: D102 54 | return 'knn' 55 | 56 | @property 57 | def dense_predictions(self): # noqa: D102 58 | if self._dense_predictions is not None: 59 | return self._dense_predictions 60 | 61 | # Set up whether we will loop over users or items. 62 | if self._user_based: 63 | loop_range = range(len(self._users)) 64 | ratings_matrix = self._ratings_matrix 65 | else: 66 | loop_range = range(len(self._items)) 67 | ratings_matrix = self._ratings_matrix.T 68 | 69 | preds = [] 70 | for idx in loop_range: 71 | relevant_idxs = nlargest_indices( 72 | self._neighborhood_size, self._similarity_matrix[idx]) 73 | ratings = ratings_matrix[relevant_idxs] 74 | # We only care about means and similarities with corresponding nonzero ratings. 75 | zero = ratings == 0 76 | 77 | # Create a matrix of means that can easily be subtracted by the ratings. 78 | relevant_means = self._means[relevant_idxs] 79 | relevant_means = np.tile(relevant_means, (ratings_matrix.shape[1], 1)).T 80 | relevant_means[zero] = 0.0 81 | 82 | # Create a matrix of relevant similarities that can easily be multiplied with ratings. 83 | similarities = self._similarity_matrix[relevant_idxs, idx] 84 | similarities = np.tile(similarities, (ratings_matrix.shape[1], 1)).T 85 | similarities[zero] = 0.0 86 | 87 | # Ensure that we aren't weighting by all 0. 88 | zero = np.all(np.isclose(similarities, 0), axis=0) 89 | similarities[:, zero] = 1.0 90 | 91 | # Compute the predictions. 92 | if self._use_means: 93 | ratings_sum = self._means[idx] + (ratings - relevant_means) 94 | else: 95 | ratings_sum = ratings 96 | preds.append((ratings_sum * similarities).sum(axis=0) / similarities.sum(axis=0)) 97 | 98 | preds = np.array(preds) 99 | if not self._user_based: 100 | preds = preds.T 101 | 102 | self._dense_predictions = preds 103 | return preds 104 | 105 | def reset(self, users=None, items=None, ratings=None): # noqa: D102 106 | self._feature_matrix = scipy.sparse.csr_matrix((0, 0)) 107 | self._similarity_matrix = np.empty((0, 0)) 108 | self._means = np.empty(0) 109 | self._ratings_matrix = np.empty((0, 0)) 110 | super().reset(users, items, ratings) 111 | 112 | def update(self, users=None, items=None, ratings=None): # noqa: D102 113 | super().update(users, items, ratings) 114 | if self._user_based: 115 | self._feature_matrix = scipy.sparse.csr_matrix(self._ratings) 116 | else: 117 | self._feature_matrix = scipy.sparse.csr_matrix(self._ratings.T) 118 | self._means = divide_zero(flatten(self._feature_matrix.sum(axis=1)), 119 | self._feature_matrix.getnnz(axis=1)) 120 | if self._use_content: 121 | if self._user_based: 122 | self._feature_matrix = scipy.sparse.hstack([self._feature_matrix, self._users]) 123 | else: 124 | self._feature_matrix = scipy.sparse.hstack([self._feature_matrix, self._items]) 125 | self._similarity_matrix = cosine_similarity(self._feature_matrix, self._feature_matrix, 126 | self._shrinkage) 127 | np.fill_diagonal(self._similarity_matrix, 0) 128 | # TODO: this may not be the best way to store ratings, but it does speed access 129 | self._ratings_matrix = self._ratings.A 130 | 131 | def _predict(self, user_item): # noqa: D102 132 | preds = [] 133 | relevant_idxs_cache = {} 134 | for user_id, item_id, _ in user_item: 135 | if self._user_based: 136 | if user_id not in relevant_idxs_cache: 137 | relevant_idxs_cache[user_id] = nlargest_indices( 138 | self._neighborhood_size, self._similarity_matrix[user_id]) 139 | 
relevant_idxs = relevant_idxs_cache[user_id] 140 | similarities = self._similarity_matrix[relevant_idxs, user_id] 141 | ratings = self._ratings_matrix[relevant_idxs, item_id].ravel() 142 | mean = self._means[user_id] 143 | else: 144 | if item_id not in relevant_idxs_cache: 145 | relevant_idxs_cache[item_id] = nlargest_indices( 146 | self._neighborhood_size, self._similarity_matrix[item_id]) 147 | relevant_idxs = relevant_idxs_cache[item_id] 148 | similarities = self._similarity_matrix[relevant_idxs, item_id] 149 | ratings = self._ratings_matrix.T[relevant_idxs, user_id].ravel() 150 | mean = self._means[item_id] 151 | relevant_means = self._means[relevant_idxs] 152 | nonzero = ratings != 0 153 | ratings = ratings[nonzero] 154 | similarities = similarities[nonzero] 155 | # ensure that we aren't weighting by all 0 156 | if np.all(np.isclose(similarities, 0)): 157 | similarities = np.ones_like(similarities) 158 | if self._use_means: 159 | if len(ratings) == 0: 160 | preds.append(mean) 161 | else: 162 | preds.append(mean + np.average(ratings - relevant_means[nonzero], 163 | weights=similarities)) 164 | else: 165 | if len(ratings) == 0: 166 | preds.append(0) 167 | else: 168 | preds.append(np.average(ratings, weights=similarities)) 169 | 170 | return np.array(preds) 171 | 172 | 173 | def cosine_similarity(X, Y, shrinkage): 174 | """Compute the cosine similarity between each row vector in each matrix X and Y. 175 | 176 | Parameters 177 | ---------- 178 | X : np.matrix 179 | The first matrix for which to compute the cosine similarity. 180 | Y : np.matrix 181 | The second matrix for which to compute the cosine similarity. 182 | shrinkage : float 183 | The amount of shrinkage to apply to the similarity computation. 184 | 185 | Returns 186 | ------- 187 | similarity : np.ndarray 188 | The similarity array between each pairs of row, where similarity[i, j] 189 | is the cosine similarity between X[i] and Y[j]. 190 | 191 | """ 192 | return divide_zero((X @ Y.T).A, scipy.sparse.linalg.norm(X, axis=1)[:, np.newaxis] * 193 | scipy.sparse.linalg.norm(Y, axis=1)[np.newaxis, :] + shrinkage) 194 | 195 | 196 | def nlargest_indices(n, iterable): 197 | """Given an iterable, computes the indices of the n largest items. 198 | 199 | Parameters 200 | ---------- 201 | n : int 202 | How many indices to retrieve. 203 | iterable : iterable 204 | The iterable from which to compute the n largest indices. 205 | 206 | Returns 207 | ------- 208 | largest : list of int 209 | The n largest indices where largest[i] is the index of the i-th largest index. 210 | 211 | """ 212 | nlargest = heapq.nlargest(n, enumerate(iterable), 213 | key=lambda x: x[1]) 214 | return [i[0] for i in nlargest] 215 | 216 | 217 | def flatten(matrix): 218 | """Given a matrix return a flattened numpy array.""" 219 | return matrix.A.ravel() 220 | 221 | 222 | def divide_zero(num, denom): 223 | """Divide a and b but return 0 instead of nan for divide by 0.""" 224 | # TODO: is this the desired zero-division behavior? 
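# With out=zeros and where=(denom != 0), np.divide only evaluates entries whose denominator
# is nonzero; every other entry keeps the preallocated 0, so no NaN/inf values
# (and no divide-by-zero warnings) are produced.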
225 | return np.divide(num, denom, out=np.zeros_like(num), where=(denom != 0)) 226 | -------------------------------------------------------------------------------- /reclab/recommenders/llorma/llorma_lib/anchor.py: -------------------------------------------------------------------------------- 1 | """Anchor Manager module 2 | """ 3 | import random 4 | 5 | import numpy as np 6 | from sklearn.preprocessing import normalize 7 | from scipy.spatial import distance_matrix 8 | 9 | 10 | def _init_anchor_points(data, n_anchor, row_k, col_k): 11 | """ Helper function that 12 | 13 | Parameters 14 | ---------- 15 | data : array-like, shape [n_ratings, 3] 16 | Rating data 17 | Each row is of the form [user_id, item_id, rating] 18 | n_anchor : int 19 | Number of anchor points 20 | row_k : array-like, shape [n_users, n_users] 21 | Symmetric kernel matrix where entry (i,j) is 22 | the similarity between user_i and user_j 23 | col_k : array-like, shape [n_items, n_items] 24 | Symmetric kernel matrix where entry (i, j) id 25 | the similarity between item_i and item_j 26 | 27 | Returns 28 | ------- 29 | np.ndarray, shape (n_anchor,) 30 | Array of anchor indices, indexed according 31 | to their order in the rating data 32 | """ 33 | user_ids = data[:, 0].astype(np.int64) 34 | item_ids = data[:, 1].astype(np.int64) 35 | 36 | anchor_idxs = [] 37 | while len(anchor_idxs) < n_anchor: 38 | anchor_idx = random.randint(0, data.shape[0] - 1) 39 | if anchor_idx in anchor_idxs: 40 | continue 41 | 42 | anchor_row = data[anchor_idx] 43 | uid = int(anchor_row[0]) 44 | iid = int(anchor_row[1]) 45 | 46 | k = np.multiply(row_k[uid][user_ids], 47 | col_k[iid][item_ids]) 48 | sum_a_of_anchor = np.sum(k) 49 | if sum_a_of_anchor < 1: 50 | continue 51 | 52 | #print('>> %10d\t%d' % (anchor_idx, sum_a_of_anchor)) 53 | anchor_idxs.append(anchor_idx) 54 | 55 | return anchor_idxs 56 | 57 | 58 | def _get_distance_matrix(latent): 59 | """Helper function to compute a matrix 60 | of pairwise cosine distances between latent 61 | factors of a pair of users of a pair of items 62 | 63 | Parameters 64 | ---------- 65 | latent : array-like, shape (N, latent_dim) 66 | Matrix of latent factors 67 | Number of rows is the number of users or items 68 | Number of columns is the latent dimension 69 | 70 | Returns 71 | ------- 72 | array-like, shape (N, N) 73 | Matrix of cosine distances between every 74 | pair of users (items) 75 | """ 76 | _normalized_latent = normalize(latent, axis=1) 77 | 78 | d_mat = distance_matrix(_normalized_latent, _normalized_latent) 79 | assert np.count_nonzero(np.isnan(d_mat)) == 0 80 | return d_mat 81 | 82 | 83 | def _get_k_from_distance(d_mat): 84 | """Helper function to compute kernel matrix from distance matrix 85 | 86 | Parameters 87 | ---------- 88 | d_mat : array-like, shape [N, N] 89 | Matrix of cosine distances between every 90 | pair of users (items) 91 | 92 | Returns 93 | ------- 94 | np.ndarray, shape [N, N] 95 | Kernel matrix corresponding to the distance matrix 96 | """ 97 | m_mat = np.zeros(d_mat.shape) 98 | m_mat[d_mat < 0.9] = 1 99 | k_mat = np.multiply(np.subtract(np.ones(d_mat.shape), np.square(d_mat)), m_mat) 100 | return k_mat 101 | 102 | def _get_rbf_k(latent, gamma=None, scaled=True): 103 | """Helper function to compute scaled 104 | Gaussian Kernel matrix for latent factors 105 | 106 | Parameters 107 | ---------- 108 | latent : array-like, shape (N, latent_dim) 109 | Matrix of latent factors 110 | Number of rows is the number of users or items 111 | Number of columns is the latent dimension 112 | 
gamma : float, optional 113 | parameter for the , by default None 114 | scaled : bool, optional 115 | if true, the kernel is scaled by the norms of the factors 116 | by default True 117 | """ 118 | 119 | if gamma is None: 120 | gamma = 1 121 | d_mat = _get_distance_matrix(latent) 122 | 123 | rbf_mat = np.exp(-1*gamma*d_mat) 124 | row_norms = np.linalg.norm(latent, axis=1) 125 | if scaled: 126 | norms_mat = np.outer(row_norms, row_norms) 127 | k_mat = np.multiply(rbf_mat, norms_mat) 128 | else: k_mat = rbf_mat 129 | 130 | # normalize such that diagonals have value 1 131 | row_avg = np.mean(k_mat, axis=1, keepdims=True).reshape(-1, 1) 132 | col_avg = np.mean(k_mat, axis=0, keepdims=True).reshape(1, -1) 133 | avg = np.mean(k_mat) 134 | k_mat = k_mat-col_avg-row_avg+2*avg 135 | k_diag = np.sqrt(np.diagonal(k_mat)) 136 | k_diag_outer = np.outer(k_diag, k_diag) 137 | k_mat = np.divide(k_mat, k_diag_outer) 138 | # return (k_mat - 1)*2 139 | return(k_mat) 140 | 141 | 142 | 143 | def _get_ks_from_latents(row_latent, col_latent): 144 | """Helper function to get kernels 145 | 146 | Parameters 147 | ---------- 148 | row_latent : array-like, shape (N_users, rank) 149 | Matrix of latent factors corresponding to users 150 | col_latent : array-like, shape (N_items, rank) 151 | Matrix of latent factors corresponding to items 152 | 153 | Returns 154 | ------- 155 | (row_k, col_k): array-like, (N_users, N_users), (N_items, N_items) 156 | Returns two square matrices corresponding to similarity kernels 157 | row_k: entry (i,j) is the similarity between user_i and user_j 158 | col_k: entry (i,j) is the similarity between item_i and item_j 159 | """ 160 | # row_d = _get_distance_matrix(row_latent) 161 | # col_d = _get_distance_matrix(col_latent) 162 | 163 | # row_k = _get_k_from_distance(row_d) 164 | # col_k = _get_k_from_distance(col_d) 165 | 166 | row_k = _get_rbf_k(row_latent) 167 | col_k = _get_rbf_k(col_latent) 168 | 169 | return row_k, col_k 170 | 171 | 172 | class AnchorManager: 173 | """ AnchorManager class 174 | 175 | Parameters 176 | ---------- 177 | n_anchor : int 178 | number of anchor points 179 | batch_manager : obj: BatchManager 180 | an instance of BatchManager class 181 | row_latent_init : array-like, shape (n_users, latent_dim) 182 | Matrix of latent factors for users. 183 | Typically this is set to factors pre-trained in a 184 | pre-train Matrix Factorization step 185 | col_latent_init : array-like, shape (n_item, latent_dim) 186 | Matrix of latent factors for items. 
187 | Typically this is set to factors pre-trained in a 188 | pre-train Matrix Factorization step 189 | """ 190 | 191 | def __init__( 192 | self, 193 | n_anchor, 194 | batch_manager, 195 | row_latent_init, 196 | col_latent_init, 197 | kernel_fun): 198 | """ Instantiate an AnchorManager 199 | """ 200 | 201 | train_data = batch_manager.train_data 202 | 203 | row_latent = row_latent_init 204 | col_latent = col_latent_init 205 | 206 | if kernel_fun is None: 207 | row_k, col_k = _get_ks_from_latents(row_latent, col_latent) 208 | else: 209 | row_k = kernel_fun(row_latent) 210 | col_k = kernel_fun(col_latent) 211 | 212 | anchor_idxs = _init_anchor_points(train_data, n_anchor, row_k, col_k) 213 | assert len(anchor_idxs) == n_anchor 214 | anchor_points = train_data[anchor_idxs] 215 | 216 | self.train_data = train_data 217 | self.valid_data = batch_manager.valid_data 218 | self.test_data = batch_manager.test_data 219 | 220 | self.anchor_idxs = anchor_idxs 221 | self.anchor_points = anchor_points 222 | 223 | self.row_k = row_k 224 | self.col_k = col_k 225 | 226 | def get_k(self, anchor_idx, user_item_data): 227 | """Returns the Kernel similarity between the 228 | anchor user_item pair and the user_item pairs 229 | in the user_item data 230 | 231 | Parameters 232 | ---------- 233 | anchor_idx : Array-like, shape (2,) 234 | (user_id, item_id) of the anchor point 235 | user_item_data : Array-like, shape (N_ratings, >2) 236 | Array where first 2 columns are (user_id, item_id) pairs 237 | 238 | Returns 239 | ------- 240 | np.ndarray, shape (N_ratings,) 241 | Returns an array of kernel weights corresponding to 242 | the chosen anchor for each user_item pair in the data 243 | """ 244 | row_k = self.row_k 245 | col_k = self.col_k 246 | anchor_point = self.anchor_points[anchor_idx] 247 | 248 | anchor_uid = int(anchor_point[0]) 249 | anchor_iid = int(anchor_point[1]) 250 | 251 | user_ids = user_item_data[:, 0].astype(np.int64) 252 | item_ids = user_item_data[:, 1].astype(np.int64) 253 | 254 | return np.multiply(row_k[anchor_uid][user_ids], col_k[anchor_iid][item_ids]) 255 | 256 | def get_train_k(self, anchor_idx): 257 | """ Get Kernel matrix of the train_data of a given anchor 258 | 259 | Parameters 260 | ---------- 261 | anchor_idx : Array-like, shape (2,) 262 | (user_id, item_id) of the anchor point 263 | 264 | Returns 265 | ------- 266 | np.ndarray, shape (N_ratings,) 267 | Returns an array of kernel weights corresponding to 268 | the chosen anchor for each user_item pair in the train data 269 | """ 270 | return self.get_k(anchor_idx, self.train_data) 271 | 272 | def get_valid_k(self, anchor_idx): 273 | """ Get Kernel matrix of the validation_data of a given anchor 274 | 275 | Parameters 276 | ---------- 277 | anchor_idx : Array-like, shape (2,) 278 | (user_id, item_id) of the anchor point 279 | 280 | Returns 281 | ------- 282 | np.ndarray, shape (N_ratings,) 283 | Returns an array of kernel weights corresponding to 284 | the chosen anchor for each user_item pair in the valid data 285 | """ 286 | return self.get_k(anchor_idx, self.valid_data) 287 | 288 | def get_test_k(self, anchor_idx): 289 | """ Get Kernel matrix of the test_data of a given anchor 290 | 291 | Parameters 292 | ---------- 293 | anchor_idx : Array-like, shape (2,) 294 | (user_id, item_id) of the anchor point 295 | 296 | Returns 297 | ------- 298 | np.ndarray, shape (N_ratings,) 299 | Returns an array of kernel weights corresponding to 300 | the chosen anchor for each user_item pair in the test data 301 | """ 302 | return 
self.get_k(anchor_idx, self.test_data) 303 | -------------------------------------------------------------------------------- /reclab/recommenders/libfm.py: -------------------------------------------------------------------------------- 1 | """A wrapper for the LibFM recommender. See www.libfm.org for implementation details.""" 2 | import numpy as np 3 | import scipy.sparse 4 | 5 | import wpyfm 6 | from . import recommender 7 | 8 | 9 | class LibFM(recommender.PredictRecommender): 10 | """The libFM recommendation model which is a factorization machine. 11 | 12 | Parameters 13 | ---------- 14 | num_user_features : int 15 | The number of features that describe each user. 16 | num_item_features : int 17 | The number of features that describe each item. 18 | num_rating_features : int 19 | The number of features that describe the context in which each rating occurs. 20 | max_num_users : int 21 | The maximum number of users that we will be making predictions for. Note that 22 | setting this value to be too large will lead to a degradation in performance. 23 | max_num_items : int 24 | The maximum number of items that we will be making predictions for. Note that 25 | setting this value to be too large will lead to a degradation in performance. 26 | method : str 27 | The method to learn parameters. Can be one of: 'sgd', 'sgda', or 'mcmc'. 28 | use_global_bias : bool 29 | Whether to use a global bias term. 30 | use_one_way : bool 31 | Whether to use one way interactions. 32 | num_two_way_factors : int 33 | The number of factors to use for the two way interactions. 34 | learning_rate : float 35 | The learning rate for sgd or sgda. 36 | reg : float 37 | The regularization across all parameters. Will be overwritten for their respective 38 | parameters if bias_reg, one_way_reg, or two_way_reg is not None. 39 | bias_reg : float 40 | The regularization for the global bias. 41 | one_way_reg : float 42 | The regularization for the one-way interactions. 43 | two_way_reg : float 44 | The regularization for the two-way interactions. 45 | init_stdev : float 46 | Standard deviation for initialization of the 2-way factors. 47 | num_iter : int 48 | The number of iterations to train the model for. 49 | seed : int 50 | The random seed to use when training the model. 
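    Examples
    --------
    A minimal usage sketch; the dataset sizes and method below are illustrative
    assumptions rather than recommended settings, and users, items and ratings
    stand for data in the dictionary format used throughout reclab::

        recommender = LibFM(num_user_features=0, num_item_features=0,
                            num_rating_features=0, max_num_users=943,
                            max_num_items=1682, method='sgd')
        recommender.reset(users, items, ratings)
        recommender.update(ratings=new_ratings)
        global_bias, weights, pairwise_interactions = recommender.model_parameters()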
51 | 52 | """ 53 | 54 | def __init__(self, 55 | num_user_features, 56 | num_item_features, 57 | num_rating_features, 58 | max_num_users, 59 | max_num_items, 60 | method='sgd', 61 | use_global_bias=True, 62 | use_one_way=True, 63 | num_two_way_factors=8, 64 | learning_rate=0.1, 65 | reg=0.0, 66 | bias_reg=None, 67 | one_way_reg=None, 68 | two_way_reg=None, 69 | init_stdev=0.1, 70 | num_iter=100, 71 | seed=0, 72 | **kwargs): 73 | """Create a LibFM recommender.""" 74 | super().__init__(**kwargs) 75 | if bias_reg is None: 76 | bias_reg = reg 77 | if one_way_reg is None: 78 | one_way_reg = reg 79 | if two_way_reg is None: 80 | two_way_reg = reg 81 | self._max_num_users = max_num_users 82 | self._max_num_items = max_num_items 83 | self._train_data = None 84 | self._num_features = (self._max_num_users + num_user_features + self._max_num_items + 85 | num_item_features + num_rating_features) 86 | self._model = wpyfm.PyFM(method=method, 87 | dim=(use_global_bias, use_one_way, num_two_way_factors), 88 | lr=learning_rate, 89 | reg=(bias_reg, one_way_reg, two_way_reg), 90 | init_stdev=init_stdev, 91 | num_iter=num_iter, 92 | seed=seed) 93 | self._hyperparameters.update(locals()) 94 | self._has_xt = method in ('mcmc', 'als') 95 | 96 | # We only want the function arguments so remove class related objects. 97 | del self._hyperparameters['self'] 98 | del self._hyperparameters['__class__'] 99 | 100 | # Each row of rating_inputs has the following structure: 101 | # (user_id, user_features, item_id, item_features, rating_features). 102 | # Where user_id and item_id are one hot encoded. 103 | rating_inputs = scipy.sparse.csr_matrix((0, self._num_features)) 104 | # Each row of rating_outputs consists of the numerical value assigned to that interaction. 105 | rating_outputs = np.empty((0,)) 106 | self._train_data = wpyfm.Data(rating_inputs, rating_outputs, has_xt=self._has_xt) 107 | 108 | @property 109 | def name(self): # noqa: D102 110 | return 'libfm' 111 | 112 | def reset(self, users=None, items=None, ratings=None): # noqa: D102 113 | rating_inputs = scipy.sparse.csr_matrix((0, self._num_features)) 114 | rating_outputs = np.empty((0,)) 115 | self._train_data = wpyfm.Data(rating_inputs, rating_outputs, has_xt=self._has_xt) 116 | super().reset(users, items, ratings) 117 | 118 | def update(self, users=None, items=None, ratings=None, retrain=True): # noqa: D102 119 | super().update(users, items, ratings) 120 | self._retrain = retrain 121 | 122 | if ratings is not None: 123 | data = [] 124 | row_col = [[], []] 125 | new_rating_outputs = [] 126 | # TODO: create internal _update function for dealing with inner ids 127 | for row, ((user_id_outer, item_id_outer), 128 | (rating, rating_context)) in enumerate(ratings.items()): 129 | user_id = self._outer_to_inner_uid[user_id_outer] 130 | item_id = self._outer_to_inner_iid[item_id_outer] 131 | user_features = self._users[user_id] 132 | item_features = self._items[item_id] 133 | row_col[0].append(row) 134 | row_col[1].append(user_id) 135 | data.append(1) 136 | for i, feature in enumerate(user_features): 137 | row_col[0].append(row) 138 | row_col[1].append(self._max_num_users + i) 139 | data.append(feature) 140 | row_col[0].append(row) 141 | row_col[1].append(self._max_num_users + len(user_features) + item_id) 142 | data.append(1) 143 | for i, feature in enumerate(item_features): 144 | row_col[0].append(row) 145 | row_col[1].append(self._max_num_users + len(user_features) + 146 | self._max_num_items + i) 147 | data.append(feature) 148 | for i, feature in 
enumerate(rating_context): 149 | row_col[0].append(row) 150 | row_col[1].append(self._max_num_users + len(user_features) + 151 | self._max_num_items + len(item_features) + i) 152 | data.append(feature) 153 | 154 | new_rating_outputs.append(rating) 155 | 156 | new_rating_inputs = scipy.sparse.csr_matrix((data, row_col), 157 | shape=(len(ratings), self._num_features)) 158 | new_rating_outputs = np.array(new_rating_outputs) 159 | # TODO: We need to account for when the same rating gets added again. Right now 160 | # this will just add duplicate rows with different ratings. 161 | self._train_data.add_rows(new_rating_inputs, new_rating_outputs) 162 | 163 | def _predict(self, user_item): # noqa: D102 164 | # Create a test_inputs array that can be parsed by our output function. 165 | test_inputs = [] 166 | data = [] 167 | row_col = [[], []] 168 | for row, (user_id, item_id, rating_context) in enumerate(user_item): 169 | user_features = self._users[user_id] 170 | item_features = self._items[item_id] 171 | row_col[0].append(row) 172 | row_col[1].append(user_id) 173 | data.append(1) 174 | for i, feature in enumerate(user_features): 175 | row_col[0].append(row) 176 | row_col[1].append(self._max_num_users + i) 177 | data.append(feature) 178 | row_col[0].append(row) 179 | row_col[1].append(self._max_num_users + len(user_features) + item_id) 180 | data.append(1) 181 | for i, feature in enumerate(item_features): 182 | row_col[0].append(row) 183 | row_col[1].append(self._max_num_users + len(user_features) + 184 | self._max_num_items + i) 185 | data.append(feature) 186 | for i, feature in enumerate(rating_context): 187 | row_col[0].append(row) 188 | row_col[1].append(self._max_num_users + len(user_features) + 189 | self._max_num_items + len(item_features) + i) 190 | data.append(feature) 191 | 192 | test_inputs = scipy.sparse.csr_matrix((data, row_col), 193 | shape=(len(user_item), self._num_features)) 194 | test_data = wpyfm.Data(test_inputs, np.zeros(test_inputs.shape[0]), has_xt=self._has_xt) 195 | 196 | if self._retrain: 197 | if self._has_xt: 198 | self._model.train(self._train_data, test=test_data) 199 | else: 200 | self._model.train(self._train_data) 201 | predictions = self._model.predict(test_data) 202 | 203 | return predictions 204 | 205 | def model_parameters(self): 206 | """Train a libfm model and get the resulting model's parameters. 207 | 208 | The degree-2 factorization machine model predicts a rating by 209 | 210 | r(x) = b_0 + w^T x + Ind(j = i) Ind(k = u) V_j^T V_k 211 | 212 | where b_0 is the global bias, w is the weights, and 213 | V is the pairwise interactions with dimension k * (m+n) 214 | V_j is the j^th row of V 215 | x is defined as the concatenation of two one-hot encodings e_i and e_u, 216 | and w^T x correpond to the user and item biases. 217 | 218 | Returns 219 | ------- 220 | global_bias : float 221 | Global bias term in the model. 222 | weights : np.ndarray 223 | Linear terms in the model (related to user/item biases). 224 | pairwise_interactions : np.ndarray 225 | Interaction term in the model (related to user/item factors). 226 | 227 | """ 228 | self._model.train(self._train_data) 229 | return self._model.parameters() 230 | -------------------------------------------------------------------------------- /reclab/environments/topics.py: -------------------------------------------------------------------------------- 1 | """Contains the implementation for the Topics environment. 
2 | 3 | In this environment users have a hidden preference for each topic and each item has a 4 | hidden topic assigned to it. 5 | """ 6 | import collections 7 | import numpy as np 8 | 9 | from . import environment 10 | 11 | 12 | class Topics(environment.DictEnvironment): 13 | """ 14 | An environment where items have a single topic and users prefer certain topics. 15 | 16 | The user preference for any given topic is initialized as Unif(0.5, 5.5) while 17 | topics are uniformly assigned to items. Users will 18 | also have a changing preference for topics they get recommended based on the topic_change 19 | parameter. Users and items can have biases, and there can also be a global bias. 20 | 21 | Ratings are generated as 22 | r = clip( user preference for a given topic + b_u + b_i + b_0, 1, 5) 23 | where b_u is a user bias, b_i is an item bias, and b_0 is a global bias. 24 | 25 | Parameters 26 | ---------- 27 | num_topics : int 28 | The number of topics items can be assigned to. 29 | num_users : int 30 | The number of users in the environment. 31 | num_items : int 32 | The number of items in the environment. 33 | rating_frequency : float 34 | The proportion of users that will need a recommendation at each step. 35 | Must be between 0 and 1. 36 | num_init_ratings : int 37 | The number of ratings available from the start. User-item pairs are randomly selected. 38 | noise : float 39 | The standard deviation of the noise added to ratings. 40 | topic_change : float 41 | How much the user's preference for a topic changes each time that topic is recommended 42 | to them. The same amount is subtracted, split evenly across all other topics. 43 | memory_length : int 44 | The number of recent topics a user remembers, which affect the rating. 45 | boredom_threshold : int 46 | The number of times a topic has to be seen within the memory to incur a 47 | penalty. 48 | boredom_penalty : float 49 | The penalty applied to the rating when a user is bored. 50 | satiation_factor : float 51 | The extent to which satiation affects user ratings. 52 | satiation_decay : float or tuple 53 | A number between 0 and 1 that indicates how quickly satiation decays. 54 | If a tuple, the decay will alternate between the two values depending on the user's 55 | sensitization state. 56 | satiation_noise : float 57 | The standard deviation of the noise influencing satiation at each timestep. 58 | switch_probability : tuple 59 | Represents a probability matrix where index 0 is the conditional probability of a user 60 | switching from a state of sensitization (S) to a state of boredom (B): P(B | S). 61 | Similarly, index 1 is P(S | B). The probability of staying in a state is 1 - P(switching). 62 | user_dist_choice : str 63 | The choice of user distribution for selecting online users. By default, the subset of 64 | online users is chosen from a uniform distribution. Currently supports normal and lognormal. 65 | initial_sampling: str or array 66 | How the initial ratings should be sampled. Can be 'uniform', 'powerlaw', or an 67 | array of tuples where arr[i][0] and arr[i][1] are the user-id and item-id respectively 68 | of the i-th initial rating. If initial_sampling is a string, then users are sampled 69 | according to user_dist_choice and items are sampled according to initial_sampling. 70 | shift_steps : int 71 | The number of timesteps to wait between each user preference shift. 72 | shift_frequency : float 73 | The proportion of users whose preference we wish to change during a preference shift.
74 | shift_weight : float 75 | The weight to assign to a user's new preferences after a preference shift. 76 | User's old preferences get assigned a weight of 1 - shift_weight. 77 | user_bias_type : normal or power 78 | distribution type for user biases. 79 | normal is normal distribution with default mean zero and variance 0.5 80 | power is power law distribution 81 | item_bias_type : normal or power 82 | distribution type for item biases. 83 | normal is normal distribution with default mean zero and variance 0.5 84 | power is power law distribution 85 | 86 | """ 87 | 88 | def __init__(self, 89 | num_topics, 90 | num_users, 91 | num_items, 92 | rating_frequency=1.0, 93 | num_init_ratings=0, 94 | noise=0.0, 95 | topic_change=0.0, 96 | memory_length=0, 97 | boredom_threshold=0, 98 | boredom_penalty=0.0, 99 | satiation_factor=0.0, 100 | satiation_decay=0.0, 101 | satiation_noise=0.0, 102 | switch_probability=(0.0, 0.0), 103 | user_dist_choice='uniform', 104 | initial_sampling='uniform', 105 | shift_steps=1, 106 | shift_frequency=0.0, 107 | shift_weight=0.0, 108 | user_bias_type='none', 109 | item_bias_type='none'): 110 | """Create a Topics environment.""" 111 | super().__init__(rating_frequency=rating_frequency, 112 | num_init_ratings=num_init_ratings, 113 | memory_length=memory_length, 114 | user_dist_choice=user_dist_choice, 115 | initial_sampling=initial_sampling) 116 | self._num_topics = num_topics 117 | self._num_users = num_users 118 | self._num_items = num_items 119 | self._topic_change = topic_change 120 | self._noise = noise 121 | self._user_preferences = None 122 | self._item_topics = None 123 | self._boredom_threshold = boredom_threshold 124 | self._boredom_penalty = boredom_penalty 125 | self._satiation_factor = satiation_factor 126 | self._satiation_decay = satiation_decay 127 | self._satiation_noise = satiation_noise 128 | self._satiations = None 129 | self._switch_probability = switch_probability 130 | self._sensitization_state = None 131 | self._shift_steps = shift_steps 132 | self._shift_frequency = shift_frequency 133 | self._shift_weight = shift_weight 134 | self._user_biases = None 135 | self._item_biases = None 136 | self._offset = None 137 | self._user_bias_type = user_bias_type 138 | self._item_bias_type = item_bias_type 139 | 140 | @property 141 | def name(self): # noqa: D102 142 | return 'topics' 143 | 144 | def _get_dense_ratings(self): # noqa: D102 145 | ratings = np.zeros([self._num_users, self._num_items]) 146 | for item_id in range(self._num_items): 147 | topic = self._item_topics[item_id] 148 | ratings[:, item_id] = (self._user_preferences[:, topic] + 149 | self._satiation_factor * self._satiations[:, topic] + 150 | np.full((self._num_users), self._item_biases[item_id]) + 151 | self._user_biases + np.full((self._num_users), self._offset)) 152 | 153 | # Account for boredom. 
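# A topic counts as "recent" if it appears in the user's rolling history (of length
# memory_length); any topic seen more than boredom_threshold times there has
# boredom_penalty subtracted from that user's rating for every item with that topic.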
154 | for user_id in range(self._num_users): 155 | recent_topics = [self._item_topics[item] 156 | for item in self._user_histories[user_id]] 157 | recent_topics, counts = np.unique( 158 | recent_topics, return_counts=True) 159 | recent_topics = recent_topics[counts > self._boredom_threshold] 160 | for topic_id in recent_topics: 161 | ratings[user_id, self._item_topics == 162 | topic_id] -= self._boredom_penalty 163 | 164 | return ratings 165 | 166 | def _get_rating(self, user_id, item_id): # noqa: D102 167 | topic = self._item_topics[item_id] 168 | rating = (self._user_preferences[user_id, topic] - 169 | self._satiation_factor * self._satiations[user_id, topic] + 170 | self._user_biases[user_id] + self._item_biases[item_id] + self._offset) 171 | recent_topics = [self._item_topics[item] 172 | for item in self._user_histories[user_id]] 173 | if len(recent_topics) > 0: 174 | recent_topics = list(np.concatenate(recent_topics)) 175 | if recent_topics.count(topic) > self._boredom_threshold: 176 | rating -= self._boredom_penalty 177 | rating = np.clip(rating + self._dynamics_random.randn() 178 | * self._noise, 1, 5) 179 | return rating 180 | 181 | def _rate_items(self, user_id, item_ids): # noqa: D102 182 | # TODO: Add support for slates of size greater than 1. 183 | item_id = [item_ids[0]] 184 | rating = self._get_rating(user_id, item_id) 185 | topic = self._item_topics[item_id] 186 | 187 | # Determine satiation decay based on sensitization state. 188 | if isinstance(self._satiation_decay, (tuple, list)): 189 | 190 | # State transition function for sensitization v boredom. 191 | # The user's state for all topics (not just the one recommended) 192 | # switches based on self._switch_probability. 193 | sensitized = np.where(self._sensitization_state[user_id] == 0)[0] 194 | bored = np.where(self._sensitization_state[user_id] == 1)[0] 195 | self._sensitization_state[user_id, sensitized] = self._dynamics_random.choice( 196 | [0, 1], size=len(sensitized), p=[1 - self._switch_probability[0], self._switch_probability[0]]) 197 | self._sensitization_state[user_id, bored] = self._dynamics_random.choice( 198 | [0, 1], size=len(bored), p=[self._switch_probability[1], 1 - self._switch_probability[1]]) 199 | 200 | decay = self._satiation_decay[int( 201 | self._sensitization_state[user_id, topic])] 202 | 203 | else: 204 | decay = self._satiation_decay 205 | 206 | # Update satiation. 207 | recommended = np.zeros(self._num_topics) 208 | recommended[topic] = 1 209 | self._satiations[user_id] = (decay * (self._satiations[user_id] + recommended) + 210 | self._dynamics_random.randn(self._num_topics) * self._satiation_noise) 211 | 212 | # Update underlying preference.
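# The recommended topic gains topic_change (as long as the preference has not already
# exceeded 5) while every other topic loses topic_change / (num_topics - 1), so growing
# interest in one topic is offset by waning interest in the rest.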
213 | preference = self._user_preferences[user_id, topic] 214 | if preference <= 5: 215 | self._user_preferences[user_id, topic] += self._topic_change 216 | not_topic = np.arange(self._num_topics) != topic 217 | self._user_preferences[user_id, not_topic] -= ( 218 | self._topic_change / (self._num_topics - 1)) 219 | 220 | return rating 221 | 222 | def _reset_state(self): # noqa: D102 223 | if self._user_bias_type == 'normal': 224 | self._user_biases = self._init_random.normal( 225 | loc=0., scale=0.5, size=self._num_users) 226 | elif self._user_bias_type == 'power': 227 | self._user_biases = 1 - \ 228 | self._init_random.power(5, size=self._num_users) 229 | elif self._user_bias_type == 'none': 230 | self._user_biases = np.zeros(self._num_users) 231 | else: 232 | print('User bias distribution is not supported') 233 | 234 | if self._item_bias_type == 'normal': 235 | self._item_biases = self._init_random.normal( 236 | loc=0., scale=0.5, size=self._num_items) 237 | elif self._item_bias_type == 'power': 238 | self._item_biases = 1 - \ 239 | self._init_random.power(5, size=self._num_items) 240 | elif self._item_bias_type == 'none': 241 | self._item_biases = np.zeros(self._num_items) 242 | else: 243 | print('Item bias distribution is not supported') 244 | 245 | self._offset = 0 246 | self._satiations = np.zeros((self._num_users, self._num_topics)) 247 | self._sensitization_state = np.zeros( 248 | (self._num_users, self._num_topics), dtype=int) 249 | self._user_preferences = self._init_random.uniform( 250 | low=0.5, high=5.5, size=(self._num_users, self._num_topics)) 251 | self._item_topics = self._init_random.choice( 252 | self._num_topics, size=self._num_items) 253 | self._users = collections.OrderedDict( 254 | (user_id, np.zeros(0)) for user_id in range(self._num_users)) 255 | self._items = collections.OrderedDict( 256 | (item_id, np.zeros(0)) for item_id in range(self._num_items)) 257 | 258 | def _update_state(self): # noqa: D102 259 | if (self._timestep + 1) % self._shift_steps == 0: 260 | # Apply preference and bias shift to a fraction of users. 261 | shifted_users = self._dynamics_random.choice( 262 | self._num_users, int(self._num_users * self._shift_frequency)) 263 | new_preferences = self._init_random.uniform( 264 | low=0.5, high=5.5, size=(len(shifted_users), self._num_topics)) 265 | if self._user_bias_type == 'normal': 266 | new_user_biases = self._init_random.normal( 267 | loc=0, scale=0.5, size=len(shifted_users)) 268 | elif self._user_bias_type == 'power': 269 | new_user_biases = 1 - \ 270 | self._init_random.power(5, size=len(shifted_users)) 271 | elif self._user_bias_type == 'none': 272 | new_user_biases = np.zeros(len(shifted_users)) 273 | else: 274 | print('User bias distribution is not supported') 275 | 276 | self._user_preferences[shifted_users] = ( 277 | self._shift_weight * self._user_preferences[shifted_users] + 278 | (1 - self._shift_weight) * new_preferences) 279 | 280 | self._user_biases[shifted_users] = ( 281 | self._shift_weight * self._user_biases[shifted_users] + 282 | (1 - self._shift_weight) * new_user_biases) 283 | 284 | return collections.OrderedDict(), collections.OrderedDict() 285 | -------------------------------------------------------------------------------- /reclab/environments/latent_factors.py: -------------------------------------------------------------------------------- 1 | """Contains the implementation for the Latent Behavior environment.
2 | 3 | In this environment users and items both have latent vectors, and 4 | the rating is determined by the inner product. Users and items both 5 | have bias terms, and there is a global bias as well. 6 | """ 7 | import collections 8 | import json 9 | import os 10 | 11 | import numpy as np 12 | 13 | from . import environment 14 | from .. import data_utils 15 | 16 | 17 | class LatentFactorBehavior(environment.DictEnvironment): 18 | """An environment where users and items have latent factors and biases. 19 | 20 | Ratings are generated as 21 | r = clip(<p_u, q_i> + b_u + b_i + b_0, 1, 5) 22 | where p_u is a user's latent factor, q_i is an item's latent factor, 23 | b_u is a user bias, b_i is an item bias, and b_0 is a global bias. 24 | 25 | Parameters 26 | ---------- 27 | latent_dim : int 28 | Size of latent factors p, q. 29 | num_users : int 30 | The number of users in the environment. 31 | num_items : int 32 | The number of items in the environment. 33 | rating_frequency : float 34 | The proportion of users that will need a recommendation at each step. 35 | Must be between 0 and 1. 36 | num_init_ratings : int 37 | The number of ratings available from the start. User-item pairs are randomly selected. 38 | noise : float 39 | The standard deviation of the noise added to ratings. 40 | affinity_change : float 41 | How much the user's latent factor is shifted towards that of an item. 42 | memory_length : int 43 | The number of recent items a user remembers which affect the rating. 44 | boredom_threshold : int 45 | The cosine similarity between a new item and an item in the 46 | user's history needed to trigger a boredom response. 47 | boredom_penalty : float 48 | The scaling factor for the rating penalty when a user is bored. The penalty 49 | is the average of the similarity values which exceed the boredom_threshold, and the decrease 50 | in rating is the penalty multiplied by this factor. 51 | user_dist_choice : str 52 | The choice of user distribution for selecting online users. By default, the subset of 53 | online users is chosen from a uniform distribution. Currently supports normal and lognormal. 54 | 55 | """ 56 | 57 | def __init__(self, latent_dim, num_users, num_items, 58 | rating_frequency=0.02, num_init_ratings=0, 59 | noise=0.0, memory_length=0, affinity_change=0.0, 60 | boredom_threshold=0, boredom_penalty=0.0, user_dist_choice='uniform'): 61 | """Create a Latent Factor environment.""" 62 | super().__init__(rating_frequency, num_init_ratings, memory_length, user_dist_choice) 63 | self._latent_dim = latent_dim 64 | self._num_users = num_users 65 | self._num_items = num_items 66 | self._noise = noise 67 | self._affinity_change = affinity_change 68 | self._boredom_threshold = boredom_threshold 69 | self._boredom_penalty = boredom_penalty 70 | if self._memory_length > 0: 71 | self._boredom_penalty /= self._memory_length 72 | self._user_factors = None 73 | self._user_biases = None 74 | self._item_factors = None 75 | self._item_biases = None 76 | self._offset = None 77 | 78 | @property 79 | def name(self): 80 | """Name of environment, used for saving.""" 81 | return 'latent' 82 | 83 | def _get_dense_ratings(self): # noqa: D102 84 | ratings = (self._user_factors @ self._item_factors.T + self._user_biases[:, np.newaxis] + 85 | self._item_biases[np.newaxis, :] + self._offset) 86 | # Compute the boredom penalties.
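# Item factors are L2-normalized so that their pairwise inner products are cosine
# similarities; for each item in a user's recent history, the amount by which its
# similarity to every candidate item exceeds boredom_threshold (scaled by
# boredom_penalty) is subtracted from that user's row of ratings.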
87 | item_norms = np.linalg.norm(self._item_factors, axis=1) 88 | normalized_items = self._item_factors / item_norms[:, np.newaxis] 89 | similarities = normalized_items @ normalized_items.T 90 | similarities -= self._boredom_threshold 91 | similarities[similarities < 0] = 0 92 | penalties = self._boredom_penalty * similarities 93 | for user_id in range(self._num_users): 94 | for item_id in self._user_histories[user_id]: 95 | if item_id is not None: 96 | ratings[user_id] -= penalties[item_id] 97 | 98 | return ratings 99 | 100 | def _get_rating(self, user_id, item_id): 101 | """Compute user's rating of item based on model. 102 | 103 | Parameters 104 | ---------- 105 | user_id : int 106 | The id of the user making the rating. 107 | item_id : int 108 | The id of the item being rated. 109 | 110 | Returns 111 | ------- 112 | rating : int 113 | The rating the item was given by the user. 114 | 115 | """ 116 | raw_rating = (self._user_factors[user_id] @ self._item_factors[item_id] 117 | + self._user_biases[user_id] + self._item_biases[item_id] + self._offset) 118 | 119 | # Compute the boredom penalty. 120 | boredom_penalty = 0 121 | for item_id_hist in self._user_histories[user_id]: 122 | item_factor = self._item_factors[item_id_hist] 123 | if item_factor is not None: 124 | similarity = ((self._item_factors[item_id] @ item_factor) 125 | / np.linalg.norm(item_factor) 126 | / np.linalg.norm(self._item_factors[item_id])) 127 | if similarity > self._boredom_threshold: 128 | boredom_penalty += (similarity - self._boredom_threshold) 129 | boredom_penalty *= self._boredom_penalty 130 | rating = np.clip(raw_rating - boredom_penalty + self._dynamics_random.randn() * 131 | self._noise, 1, 5) 132 | 133 | return rating 134 | 135 | def _rate_items(self, user_id, item_ids): 136 | """Get a user to rate an item and update the internal rating state. 137 | 138 | Parameters 139 | ---------- 140 | user_id : int 141 | The id of the user making the rating. 142 | item_id : int 143 | The id of the item being rated. 144 | 145 | Returns 146 | ------- 147 | rating : int 148 | The rating the item was given by the user. 149 | 150 | """ 151 | # TODO: Add support for slates of size greater than 1. 
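# Until then, only the first item of the recommended slate is rated.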
152 | item_id = item_ids[0] 153 | rating = self._get_rating(user_id, item_id) 154 | 155 | # Updating underlying affinity 156 | self._user_factors[user_id] = ((1.0 - self._affinity_change) * self._user_factors[user_id] 157 | + self._affinity_change * self._item_factors[item_id]) 158 | return np.array([rating]) 159 | 160 | def _reset_state(self): 161 | """Reset the state of the environment.""" 162 | user_factors, user_bias, item_factors, item_bias, offset = self._generate_latent_factors() 163 | self._user_factors = user_factors 164 | self._user_biases = user_bias 165 | self._item_factors = item_factors 166 | self._item_biases = item_bias 167 | self._offset = offset 168 | 169 | self._users = collections.OrderedDict((user_id, np.zeros(0)) 170 | for user_id in range(self._num_users)) 171 | self._items = collections.OrderedDict((item_id, np.zeros(0)) 172 | for item_id in range(self._num_items)) 173 | 174 | def _generate_latent_factors(self): 175 | """Generate random latent factors.""" 176 | # Initialization size determined such that ratings generally fall in 0-5 range 177 | factor_sd = np.sqrt(np.sqrt(0.5 / self._latent_dim)) 178 | # User latent factors are normally distributed 179 | user_bias = self._init_random.normal(loc=0., scale=0.5, size=self._num_users) 180 | user_factors = self._init_random.normal(loc=0., scale=factor_sd, 181 | size=(self._num_users, self._latent_dim)) 182 | # Item latent factors are normally distributed 183 | item_bias = self._init_random.normal(loc=0., scale=0.5, size=self._num_items) 184 | item_factors = self._init_random.normal(loc=0., scale=factor_sd, 185 | size=(self._num_items, self._latent_dim)) 186 | # Shift up the mean 187 | offset = 3.0 188 | return user_factors, user_bias, item_factors, item_bias, offset 189 | 190 | 191 | class DatasetLatentFactor(LatentFactorBehavior): 192 | """An environment where user behavior is based on a dataset. 193 | 194 | Latent factor model of behavior with parameters fit directly from full dataset. 195 | 196 | Parameters 197 | ---------- 198 | name : str 199 | The name of the dataset. Must be one of: 'ml-100k', 'ml-10m', 'lastfm'. 200 | latent_dim : int 201 | Size of latent factors p, q. 202 | datapath : str 203 | The path to the directory containing datafiles 204 | force_retrain : bool 205 | Forces retraining the latent factor model 206 | max_num_users : int 207 | The maximum number of users for the environment, if not the number in the dataset. 208 | max_num_items : int 209 | The maximum number of items for the environment, if not the number in the dataset. 210 | 211 | """ 212 | 213 | def __init__(self, name, latent_dim=128, datapath=data_utils.DATA_DIR, force_retrain=False, 214 | max_num_users=np.inf, max_num_items=np.inf, **kwargs): 215 | """Create a ML100K Latent Factor environment.""" 216 | self.dataset_name = name 217 | modelpath = datapath 218 | if name == 'ml-100k': 219 | self.datapath = os.path.expanduser(os.path.join(datapath, 'ml-100k')) 220 | self.modelpath = os.path.join(modelpath, 'ml-100k') 221 | latent_dim = 100 if latent_dim is None else latent_dim 222 | self._full_num_users = 943 223 | self._full_num_items = 1682 224 | # These parameters are the result of tuning. 
225 | reg = 0.1 226 | learn_rate = 0.005 227 | self.train_params = dict(bias_reg=reg, one_way_reg=reg, two_way_reg=reg, 228 | learning_rate=learn_rate, num_iter=100) 229 | elif name == 'ml-10m': 230 | self.datapath = os.path.expanduser(os.path.join(datapath, 'ml-10M100K')) 231 | self.modelpath = os.path.join(modelpath, 'ml-10M100K') 232 | latent_dim = 128 if latent_dim is None else latent_dim 233 | self._full_num_users = 69878 234 | self._full_num_items = 10677 235 | # these parameters are presented in "On the Difficulty of Baselines" by Rendle et al. 236 | reg = 0.04 237 | learn_rate = 0.003 238 | self.train_params = dict(bias_reg=reg, one_way_reg=reg, two_way_reg=reg, 239 | learning_rate=learn_rate, num_iter=128) 240 | elif name == 'lastfm': 241 | self.datapath = os.path.expanduser(os.path.join(datapath, 'lastfm-dataset-1K')) 242 | self.modelpath = os.path.join(modelpath, 'lastfm-dataset-1K') 243 | latent_dim = 128 if latent_dim is None else latent_dim 244 | self._full_num_users = 992 245 | self._full_num_items = 177023 246 | # These parameters are presented in "Recommendations and User Agency" by Dean et al. 247 | reg = 0.08 248 | learn_rate = 0.001 249 | self.train_params = dict(bias_reg=reg, one_way_reg=reg, two_way_reg=reg, 250 | learning_rate=learn_rate, num_iter=128) 251 | else: 252 | raise ValueError('dataset name not recognized') 253 | self._force_retrain = force_retrain 254 | 255 | num_users = min(self._full_num_users, max_num_users) 256 | num_items = min(self._full_num_items, max_num_items) 257 | 258 | super().__init__(latent_dim, num_users, num_items, **kwargs) 259 | 260 | @property 261 | def name(self): 262 | """Name of environment, used for saving.""" 263 | return 'latent-{}'.format(self.dataset_name) 264 | 265 | def _generate_latent_factors(self): 266 | full_model_params = dict(num_user_features=0, num_item_features=0, num_rating_features=0, 267 | max_num_users=self._full_num_users, 268 | max_num_items=self._full_num_items, 269 | num_two_way_factors=self._latent_dim, **self.train_params) 270 | 271 | model_file = os.path.join(self.modelpath, 'fm_model.npz') 272 | res = load_latent_factors(model_file) 273 | if res is None or self._force_retrain: 274 | print('Training model from scratch, either due to force_retrain flag or') 275 | print('\tdid not find model file at {}'.format(model_file)) 276 | res = generate_latent_factors_from_data(self.dataset_name, model_file, 277 | full_model_params) 278 | user_factors, user_bias, item_factors, item_bias, offset = res 279 | else: 280 | user_factors, user_bias, item_factors, item_bias, offset = res 281 | 282 | if self._num_users < self._full_num_users or self._num_items < self._full_num_items: 283 | num_users, num_items = (min(self._num_users, self._full_num_users), 284 | min(self._num_items, self._full_num_items)) 285 | # TODO: may want to reduce the number in some other way 286 | # e.g. 
related to popularity 287 | user_indices = self._init_random.choice(user_factors.shape[0], size=num_users, 288 | replace=False) 289 | item_indices = self._init_random.choice(item_factors.shape[0], size=num_items, 290 | replace=False) 291 | user_factors = user_factors[user_indices] 292 | user_bias = user_bias[user_indices] 293 | item_factors = item_factors[item_indices] 294 | item_bias = item_bias[item_indices] 295 | return user_factors, user_bias, item_factors, item_bias, offset 296 | 297 | 298 | def load_latent_factors(model_file): 299 | """Load pretrained latent factor model.""" 300 | if not os.path.isfile(model_file): 301 | return None 302 | model = np.load(model_file) 303 | print('Loading model from {} trained via:\n{}.'.format(model_file, model['params'])) 304 | 305 | user_factors = model['user_factors'] 306 | user_bias = model['user_bias'] 307 | item_factors = model['item_factors'] 308 | item_bias = model['item_bias'] 309 | offset = model['offset'] 310 | 311 | return user_factors, user_bias, item_factors, item_bias, offset 312 | 313 | 314 | def generate_latent_factors_from_data(dataset_name, model_file, params): 315 | """Create latent factors based on a dataset.""" 316 | from ..recommenders import LibFM 317 | 318 | users, items, ratings = data_utils.read_dataset(dataset_name) 319 | print('Initializing latent factor model') 320 | recommender = LibFM(**params) 321 | recommender.reset(users, items, ratings) 322 | print('Training latent factor model with parameters: {}'.format(params)) 323 | 324 | global_bias, weights, pairwise_interactions = recommender.model_parameters() 325 | if len(weights) == 0: 326 | weights = np.zeros(pairwise_interactions.shape[0]) 327 | 328 | # TODO: this logic is only correct if there are no additional user/item/rating features 329 | # Note that we discard the original data's user_ids and item_ids at this step 330 | user_indices = np.arange(params['max_num_users']) 331 | item_indices = np.arange(params['max_num_users'], 332 | params['max_num_users'] + params['max_num_items']) 333 | 334 | user_factors = pairwise_interactions[user_indices] 335 | user_bias = weights[user_indices] 336 | item_factors = pairwise_interactions[item_indices] 337 | item_bias = weights[item_indices] 338 | offset = global_bias 339 | params = json.dumps(recommender.hyperparameters) 340 | 341 | np.savez(model_file, user_factors=user_factors, user_bias=user_bias, 342 | item_factors=item_factors, item_bias=item_bias, offset=offset, 343 | params=params) 344 | 345 | return user_factors, user_bias, item_factors, item_bias, offset 346 | -------------------------------------------------------------------------------- /.pylintrc: -------------------------------------------------------------------------------- 1 | [MASTER] 2 | 3 | # A comma-separated list of package or module names from where C extensions may 4 | # be loaded. Extensions are loading into the active Python interpreter and may 5 | # run arbitrary code. 6 | extension-pkg-whitelist=numpy 7 | 8 | # Add files or directories to the blacklist. They should be base names, not 9 | # paths. 10 | ignore=autorec_lib,cfnade_lib,llorma_lib,experiment_scripts 11 | 12 | # Add files or directories matching the regex patterns to the blacklist. The 13 | # regex matches against base names, not paths. 14 | ignore-patterns= 15 | 16 | # Python code to execute, usually for sys.path manipulation such as 17 | # pygtk.require(). 18 | #init-hook= 19 | 20 | # Use multiple processes to speed up Pylint. 
Specifying 0 will auto-detect the 21 | # number of processors available to use. 22 | jobs=0 23 | 24 | # Control the amount of potential inferred values when inferring a single 25 | # object. This can help the performance when dealing with large functions or 26 | # complex, nested conditions. 27 | limit-inference-results=100 28 | # List of plugins (as comma separated values of python modules names) to load, 29 | # usually to register additional checkers. 30 | load-plugins= 31 | 32 | # Pickle collected data for later comparisons. 33 | persistent=yes 34 | 35 | # Specify a configuration file. 36 | #rcfile= 37 | 38 | # When enabled, pylint would attempt to guess common misconfiguration and emit 39 | # user-friendly hints instead of false-positive error messages. 40 | suggestion-mode=yes 41 | 42 | # Allow loading of arbitrary C extensions. Extensions are imported into the 43 | # active Python interpreter and may run arbitrary code. 44 | unsafe-load-any-extension=no 45 | 46 | 47 | [MESSAGES CONTROL] 48 | 49 | # Only show warnings with the listed confidence levels. Leave empty to show 50 | # all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED. 51 | confidence= 52 | 53 | # Disable the message, report, category or checker with the given id(s). You 54 | # can either give multiple identifiers separated by comma (,) or put this 55 | # option multiple times (only on the command line, not in the configuration 56 | # file where it should appear only once). You can also use "--disable=all" to 57 | # disable everything first and then reenable specific checks. For example, if 58 | # you want to run only the similarities checker, you can use "--disable=all 59 | # --enable=similarities". If you want to run only the classes checker, but have 60 | # no Warning level messages displayed, use "--disable=all --enable=classes 61 | # --disable=W". 62 | disable= 63 | 64 | # Enable the message, report, category or checker with the given id(s). You can 65 | # either give multiple identifier separated by comma (,) or put this option 66 | # multiple time (only on the command line, not in the configuration file where 67 | # it should appear only once). See also the "--disable" option for examples. 68 | enable=c-extension-no-member 69 | 70 | 71 | [REPORTS] 72 | 73 | # Python expression which should return a note less than 10 (10 is the highest 74 | # note). You have access to the variables errors warning, statement which 75 | # respectively contain the number of errors / warnings messages and the total 76 | # number of statements analyzed. This is used by the global evaluation report 77 | # (RP0004). 78 | evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10) 79 | 80 | # Template used to display messages. This is a python new-style format string 81 | # used to format the message information. See doc for all details. 82 | #msg-template= 83 | 84 | # Set the output format. Available formats are text, parseable, colorized, json 85 | # and msvs (visual studio). You can also give a reporter class, e.g. 86 | # mypackage.mymodule.MyReporterClass. 87 | output-format=text 88 | 89 | # Tells whether to display a full report or only the messages. 90 | reports=no 91 | 92 | # Activate the evaluation score. 93 | score=yes 94 | 95 | 96 | [REFACTORING] 97 | 98 | # Maximum number of nested blocks for function / method body 99 | max-nested-blocks=5 100 | 101 | # Complete name of functions that never returns. 
When checking for 102 | # inconsistent-return-statements if a never returning function is called then 103 | # it will be considered as an explicit return statement and no message will be 104 | # printed. 105 | never-returning-functions=sys.exit 106 | 107 | 108 | [SPELLING] 109 | 110 | # Limits count of emitted suggestions for spelling mistakes. 111 | max-spelling-suggestions=4 112 | 113 | # Spelling dictionary name. Available dictionaries: none. To make it working 114 | # install python-enchant package.. 115 | spelling-dict= 116 | 117 | # List of comma separated words that should not be checked. 118 | spelling-ignore-words= 119 | 120 | # A path to a file that contains private dictionary; one word per line. 121 | spelling-private-dict-file= 122 | 123 | # Tells whether to store unknown words to indicated private dictionary in 124 | # --spelling-private-dict-file option instead of raising a message. 125 | spelling-store-unknown-words=no 126 | 127 | 128 | [MISCELLANEOUS] 129 | 130 | # List of note tags to take in consideration, separated by a comma. 131 | notes= 132 | 133 | 134 | [TYPECHECK] 135 | 136 | # List of decorators that produce context managers, such as 137 | # contextlib.contextmanager. Add to this list to register other decorators that 138 | # produce valid context managers. 139 | contextmanager-decorators=contextlib.contextmanager 140 | 141 | # List of members which are set dynamically and missed by pylint inference 142 | # system, and so shouldn't trigger E1101 when accessed. Python regular 143 | # expressions are accepted. 144 | generated-members= 145 | 146 | # Tells whether missing members accessed in mixin class should be ignored. A 147 | # mixin class is detected if its name ends with "mixin" (case insensitive). 148 | ignore-mixin-members=yes 149 | 150 | # Tells whether to warn about missing members when the owner of the attribute 151 | # is inferred to be None. 152 | ignore-none=yes 153 | 154 | # This flag controls whether pylint should warn about no-member and similar 155 | # checks whenever an opaque object is returned when inferring. The inference 156 | # can return multiple potential results while evaluating a Python object, but 157 | # some branches might not be evaluated, which results in partial inference. In 158 | # that case, it might be useful to still emit no-member and other checks for 159 | # the rest of the inferred objects. 160 | ignore-on-opaque-inference=yes 161 | 162 | # List of class names for which member attributes should not be checked (useful 163 | # for classes with dynamically set attributes). This supports the use of 164 | # qualified names. 165 | ignored-classes=optparse.Values,thread._local,_thread._local 166 | 167 | # List of module names for which member attributes should not be checked 168 | # (useful for modules/projects where namespaces are manipulated during runtime 169 | # and thus existing member attributes cannot be deduced by static analysis. It 170 | # supports qualified module names, as well as Unix pattern matching. 171 | ignored-modules= numpy 172 | 173 | # Show a hint with possible names when a member name was not found. The aspect 174 | # of finding the hint is based on edit distance. 175 | missing-member-hint=yes 176 | 177 | # The minimum edit distance a name should have in order to be considered a 178 | # similar match for a missing member name. 179 | missing-member-hint-distance=1 180 | 181 | # The total number of similar names that should be taken in consideration when 182 | # showing a hint for a missing member. 
183 | missing-member-max-choices=1 184 | 185 | 186 | [BASIC] 187 | 188 | # Naming style matching correct argument names. 189 | argument-naming-style=snake_case 190 | 191 | # Regular expression matching correct argument names. Overrides argument- 192 | # naming-style. 193 | #argument-rgx= 194 | 195 | # Naming style matching correct attribute names. 196 | attr-naming-style=snake_case 197 | 198 | # Regular expression matching correct attribute names. Overrides attr-naming- 199 | # style. 200 | #attr-rgx= 201 | 202 | # Bad variable names which should always be refused, separated by a comma. 203 | bad-names=foo, 204 | bar, 205 | baz, 206 | toto, 207 | tutu, 208 | tata 209 | 210 | # Naming style matching correct class attribute names. 211 | class-attribute-naming-style=any 212 | 213 | # Regular expression matching correct class attribute names. Overrides class- 214 | # attribute-naming-style. 215 | #class-attribute-rgx= 216 | 217 | # Naming style matching correct class names. 218 | class-naming-style=PascalCase 219 | 220 | # Regular expression matching correct class names. Overrides class-naming- 221 | # style. 222 | #class-rgx= 223 | 224 | # Naming style matching correct constant names. 225 | const-naming-style=UPPER_CASE 226 | 227 | # Regular expression matching correct constant names. Overrides const-naming- 228 | # style. 229 | #const-rgx= 230 | 231 | # Minimum line length for functions/classes that require docstrings, shorter 232 | # ones are exempt. 233 | docstring-min-length=-1 234 | 235 | # Naming style matching correct function names. 236 | function-naming-style=snake_case 237 | 238 | # Regular expression matching correct function names. Overrides function- 239 | # naming-style. 240 | #function-rgx= 241 | 242 | # Good variable names which should always be accepted, separated by a comma. 243 | good-names=i, 244 | j, 245 | k, 246 | ex, 247 | Run, 248 | _, 249 | X, 250 | Y, 251 | Z, 252 | n, 253 | x, 254 | y, 255 | z 256 | 257 | # Include a hint for the correct naming format with invalid-name. 258 | include-naming-hint=no 259 | 260 | # Naming style matching correct inline iteration names. 261 | inlinevar-naming-style=any 262 | 263 | # Regular expression matching correct inline iteration names. Overrides 264 | # inlinevar-naming-style. 265 | #inlinevar-rgx= 266 | 267 | # Naming style matching correct method names. 268 | method-naming-style=snake_case 269 | 270 | # Regular expression matching correct method names. Overrides method-naming- 271 | # style. 272 | #method-rgx= 273 | 274 | # Naming style matching correct module names. 275 | module-naming-style=snake_case 276 | 277 | # Regular expression matching correct module names. Overrides module-naming- 278 | # style. 279 | #module-rgx= 280 | 281 | # Colon-delimited sets of names that determine each other's naming style when 282 | # the name regexes allow several styles. 283 | name-group= 284 | 285 | # Regular expression which should only match function or class names that do 286 | # not require a docstring. 287 | no-docstring-rgx=^_ 288 | 289 | # List of decorators that produce properties, such as abc.abstractproperty. Add 290 | # to this list to register other decorators that produce valid properties. 291 | # These decorators are taken in consideration only for invalid-name. 292 | property-classes=abc.abstractproperty 293 | 294 | # Naming style matching correct variable names. 295 | variable-naming-style=snake_case 296 | 297 | # Regular expression matching correct variable names. Overrides variable- 298 | # naming-style. 
299 | #variable-rgx= 300 | 301 | 302 | [VARIABLES] 303 | 304 | # List of additional names supposed to be defined in builtins. Remember that 305 | # you should avoid defining new builtins when possible. 306 | additional-builtins= 307 | 308 | # Tells whether unused global variables should be treated as a violation. 309 | allow-global-unused-variables=yes 310 | 311 | # List of strings which can identify a callback function by name. A callback 312 | # name must start or end with one of those strings. 313 | callbacks=cb_, 314 | _cb 315 | 316 | # A regular expression matching the name of dummy variables (i.e. expected to 317 | # not be used). 318 | dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_ 319 | 320 | # Argument names that match this expression will be ignored. Default to name 321 | # with leading underscore. 322 | ignored-argument-names=_.*|^ignored_|^unused_ 323 | 324 | # Tells whether we should check for unused import in __init__ files. 325 | init-import=no 326 | 327 | # List of qualified module names which can have objects that can redefine 328 | # builtins. 329 | redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io 330 | 331 | 332 | [SIMILARITIES] 333 | 334 | # Ignore comments when computing similarities. 335 | ignore-comments=yes 336 | 337 | # Ignore docstrings when computing similarities. 338 | ignore-docstrings=yes 339 | 340 | # Ignore imports when computing similarities. 341 | ignore-imports=no 342 | 343 | # Minimum lines number of a similarity. 344 | min-similarity-lines=4 345 | 346 | 347 | [FORMAT] 348 | 349 | # Expected format of line ending, e.g. empty (any line ending), LF or CRLF. 350 | expected-line-ending-format= 351 | 352 | # Regexp for a line that is allowed to be longer than the limit. 353 | ignore-long-lines=^\s*(# )?<?https?://\S+>?$ 354 | 355 | # Number of spaces of indent required inside a hanging or continued line. 356 | indent-after-paren=4 357 | 358 | # String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 359 | # tab). 360 | indent-string=' ' 361 | 362 | # Maximum number of characters on a single line. 363 | max-line-length=100 364 | 365 | # Maximum number of lines in a module. 366 | max-module-lines=1500 367 | 368 | # List of optional constructs for which whitespace checking is disabled. `dict- 369 | # separator` is used to allow tabulation in dicts, etc.: {1 : 1,\n222: 2}. 370 | # `trailing-comma` allows a space between comma and closing bracket: (a, ). 371 | # `empty-line` allows space-only lines. 372 | no-space-check=trailing-comma, 373 | dict-separator 374 | 375 | # Allow the body of a class to be on the same line as the declaration if body 376 | # contains single statement. 377 | single-line-class-stmt=no 378 | 379 | # Allow the body of an if to be on the same line as the test if there is no 380 | # else. 381 | single-line-if-stmt=no 382 | 383 | 384 | [LOGGING] 385 | 386 | # Format style used to check logging format string. `old` means using % 387 | # formatting, while `new` is for `{}` formatting. 388 | logging-format-style=old 389 | 390 | # Logging modules to check that the string format arguments are in logging 391 | # function parameter format. 392 | logging-modules=logging 393 | 394 | 395 | [IMPORTS] 396 | 397 | # Allow wildcard imports from modules that define __all__. 398 | allow-wildcard-with-all=no 399 | 400 | # Analyse import fallback blocks.
This can be used to support both Python 2 and 401 | # 3 compatible code, which means that the block might have code that exists 402 | # only in one or another interpreter, leading to false positives when analysed. 403 | analyse-fallback-blocks=no 404 | 405 | # Deprecated modules which should not be used, separated by a comma. 406 | deprecated-modules=optparse,tkinter.tix 407 | 408 | # Create a graph of external dependencies in the given file (report RP0402 must 409 | # not be disabled). 410 | ext-import-graph= 411 | 412 | # Create a graph of every (i.e. internal and external) dependencies in the 413 | # given file (report RP0402 must not be disabled). 414 | import-graph= 415 | 416 | # Create a graph of internal dependencies in the given file (report RP0402 must 417 | # not be disabled). 418 | int-import-graph= 419 | 420 | # Force import order to recognize a module as part of the standard 421 | # compatibility libraries. 422 | known-standard-library= 423 | 424 | # Force import order to recognize a module as part of a third party library. 425 | known-third-party=enchant 426 | 427 | 428 | [DESIGN] 429 | 430 | # Maximum number of arguments for function / method. 431 | max-args=20 432 | 433 | # Maximum number of attributes for a class (see R0902). 434 | max-attributes=20 435 | 436 | # Maximum number of boolean expressions in an if statement. 437 | max-bool-expr=5 438 | 439 | # Maximum number of branch for function / method body. 440 | max-branches=12 441 | 442 | # Maximum number of locals for function / method body. 443 | max-locals=20 444 | 445 | # Maximum number of parents for a class (see R0901). 446 | max-parents=7 447 | 448 | # Maximum number of public methods for a class (see R0904). 449 | max-public-methods=20 450 | 451 | # Maximum number of return / yield for function / method body. 452 | max-returns=6 453 | 454 | # Maximum number of statements in function / method body. 455 | max-statements=50 456 | 457 | # Minimum number of public methods for a class (see R0903). 458 | min-public-methods=2 459 | 460 | 461 | [CLASSES] 462 | 463 | # List of method names used to declare (i.e. assign) instance attributes. 464 | defining-attr-methods=__init__, 465 | __new__, 466 | setUp 467 | 468 | # List of member names, which should be excluded from the protected access 469 | # warning. 470 | exclude-protected=_asdict, 471 | _fields, 472 | _replace, 473 | _source, 474 | _make 475 | 476 | # List of valid names for the first argument in a class method. 477 | valid-classmethod-first-arg=cls 478 | 479 | # List of valid names for the first argument in a metaclass class method. 480 | valid-metaclass-classmethod-first-arg=cls 481 | 482 | 483 | [EXCEPTIONS] 484 | 485 | # Exceptions that will emit a warning when being caught. Defaults to 486 | # "Exception". 
487 | overgeneral-exceptions=Exception 488 | 489 | # Set the linting for string quotes 490 | string-quote=single 491 | triple-quote=double 492 | docstring-quote=double 493 | -------------------------------------------------------------------------------- /reclab/data_utils.py: -------------------------------------------------------------------------------- 1 | """A utility module for loading and manipulating various datasets.""" 2 | import collections 3 | import os 4 | import urllib.request 5 | import zipfile 6 | 7 | import numpy as np 8 | import pandas as pd 9 | import scipy.sparse 10 | 11 | DATA_DIR = os.environ.get('RECLAB_DATA_PATH') 12 | if DATA_DIR is None: 13 | DATA_DIR = os.path.dirname(__file__) 14 | 15 | def read_dataset(name, shuffle=True, seed=0): 16 | """Read a dataset as specified by name. 17 | 18 | Parameters 19 | ---------- 20 | name : str 21 | The name of the dataset. Must be one of: 'ml-100k', 'ml-10m', 'citeulike-a', 22 | 'pinterest', or 'lastfm'. 23 | shuffle : bool, optional 24 | A flag to indicate whether the dataset should be shuffled after loading, 25 | true by default. 26 | 27 | Returns 28 | ------- 29 | users : dict 30 | The dict of all users where the key is the user-id and the value is the user's features. 31 | items : dict 32 | The dict of all items where the key is the item-id and the value is the item's features. 33 | ratings : dict 34 | The dict of all ratings where the key is a tuple whose first element is the user-id 35 | and whose second element is the item id. The value is a tuple whose first element is the 36 | rating value and whose second element is the rating context (in this case an empty array). 37 | 38 | """ 39 | data = get_data(name) 40 | 41 | return dataset_from_dataframe(data, shuffle=shuffle, seed=seed) 42 | 43 | def dataset_from_dataframe(data, shuffle=True, seed=0): 44 | """Read a dataset as specified by name. 45 | 46 | Parameters 47 | ---------- 48 | data : dataframe 49 | The dataset, with columns user_id, item_id, and rating 50 | shuffle : bool, optional 51 | A flag to indicate whether the dataset should be shuffled after loading, 52 | true by default. 53 | 54 | Returns 55 | ------- 56 | users : dict 57 | The dict of all users where the key is the user-id and the value is the user's features. 58 | items : dict 59 | The dict of all items where the key is the item-id and the value is the item's features. 60 | ratings : dict 61 | The dict of all ratings where the key is a tuple whose first element is the user-id 62 | and whose second element is the item id. The value is a tuple whose first element is the 63 | rating value and whose second element is the rating context (in this case an empty array). 64 | 65 | """ 66 | 67 | if shuffle: 68 | data = data.sample(frac=1, random_state=seed).reset_index(drop=True) 69 | 70 | users = {user_id: np.zeros(0) for user_id in np.unique(data['user_id'])} 71 | items = {item_id: np.zeros(0) for item_id in np.unique(data['item_id'])} 72 | 73 | # Fill the rating array with initial data. 74 | ratings = {} 75 | for user_id, item_id, rating in zip(data['user_id'], data['item_id'], data['rating']): 76 | # TODO: may want to eventually a rating context depending on dataset (e.g. time) 77 | ratings[user_id, item_id] = (rating, np.zeros(0)) 78 | 79 | return users, items, ratings 80 | 81 | 82 | def read_bandit_dataset(name): 83 | """Read a bandit dataset as specified by name. 84 | 85 | Parameters 86 | ---------- 87 | name : str 88 | The name of the dataset. Must be one of: 'wiki10-31k'. 
89 | 90 | Returns 91 | ------- 92 | features : scipy.sparse.dok_matrix 93 | The features at each timestep. 94 | ratings : scipy.sparse.dok_matrix 95 | The ratings at each timestep. 96 | 97 | """ 98 | if name == 'wiki10-31k': 99 | with open_zipped(zipped_dir_name='wiki10-31k', 100 | data_name='features.npz', 101 | data_url='https://kkrauth.s3-us-west-2.amazonaws.com/wiki10-31k.zip', 102 | mode='rb') as feature_file: 103 | features = scipy.sparse.load_npz(feature_file).tocsr() 104 | 105 | with open_zipped(zipped_dir_name='wiki10-31k', 106 | data_name='ratings.npz', 107 | data_url='https://kkrauth.s3-us-west-2.amazonaws.com/wiki10-31k.zip', 108 | mode='rb') as ratings_file: 109 | ratings = scipy.sparse.load_npz(ratings_file).tocsr() 110 | else: 111 | raise ValueError('Dataset name not recognized.') 112 | 113 | return features, ratings 114 | 115 | 116 | def split_ratings(ratings, proportion, shuffle=False, seed=None): 117 | """Split a group of ratings into two groups. 118 | 119 | Parameters 120 | ---------- 121 | ratings : dict 122 | The ratings to split. 123 | proportion : float 124 | The proportion of ratings that will be in the first group. Must be between 0 and 1. 125 | shuffle : bool 126 | Whether to shuffle the rating data. 127 | 128 | Returns 129 | ------- 130 | ratings_1 : OrderedDict 131 | The first set of ratings. 132 | ratings_2 : OrderedDict 133 | The second set of ratings. 134 | 135 | """ 136 | split_1 = collections.OrderedDict() 137 | split_2 = collections.OrderedDict() 138 | split_1_end = int(proportion * len(ratings)) 139 | iterator = list(ratings.items()) 140 | 141 | if shuffle: 142 | if seed is not None: 143 | np.random.seed(seed) 144 | np.random.shuffle(iterator) 145 | 146 | for i, (key, val) in enumerate(iterator): 147 | if i < split_1_end: 148 | split_1[key] = val 149 | else: 150 | split_2[key] = val 151 | 152 | return split_1, split_2 153 | 154 | 155 | def read_zipped_csv(zipped_dir_name, data_name, data_url, csv_params): 156 | """Locate or download zipped file and load csv into DataFrame. 157 | 158 | Parameters 159 | ---------- 160 | zipped_dir_name : str 161 | The directory within the downloaded zip. 162 | data_name : str 163 | The name of the data file to be loaded from the directory. 164 | data_url : str 165 | The location of the download. 166 | csv_params : str 167 | Parameters for loading csv into DataFrame. 168 | 169 | Returns 170 | ------- 171 | data : DataFrame 172 | Dataset of interest. 173 | 174 | """ 175 | data_file = os.path.join(DATA_DIR, zipped_dir_name, data_name) 176 | fetch_zip(zipped_dir_name, data_url) 177 | return pd.read_csv(data_file, **csv_params) 178 | 179 | 180 | def open_zipped(zipped_dir_name, data_name, data_url, mode): 181 | """Download a zipped file and open it. 182 | 183 | Parameters 184 | ---------- 185 | zipped_dir_name : str 186 | The directory within the downloaded zip. 187 | data_name : str 188 | The name of the data file to be loaded from the directory. 189 | data_url : str 190 | The location of the download. 191 | mode: str 192 | The mode to open the file in. 193 | 194 | Returns 195 | ------- 196 | file : file 197 | The file of interest. 198 | 199 | """ 200 | data_file = os.path.join(DATA_DIR, zipped_dir_name, data_name) 201 | fetch_zip(zipped_dir_name, data_url) 202 | return open(data_file, mode) 203 | 204 | 205 | def fetch_zip(zipped_dir_name, data_url): 206 | """Download a zipped directory and extract it. 207 | 208 | Parameters 209 | ---------- 210 | zipped_dir_name : str 211 | The directory within the downloaded zip. 
212 | data_url : str 213 | The location of the download. 214 | 215 | """ 216 | data_dir = os.path.join(DATA_DIR, zipped_dir_name) 217 | if not os.path.isdir(data_dir): 218 | os.makedirs(DATA_DIR, exist_ok=True) 219 | 220 | download_location = os.path.join('{}.zip'.format(data_dir)) 221 | urllib.request.urlretrieve(data_url, 222 | filename=download_location) 223 | with zipfile.ZipFile(download_location, 'r') as zip_ref: 224 | zip_ref.extractall(DATA_DIR) 225 | os.remove(download_location) 226 | 227 | 228 | def find_npz(dir_name, data_name, data_url, np_params): 229 | """Locate or download npz file and load into DataFrame. 230 | 231 | Parameters 232 | ---------- 233 | dir_name : str 234 | The directory to put the .npz file. 235 | data_name : str 236 | The name of the .npz file. 237 | data_url : str 238 | The location of the download. 239 | csv_params : str 240 | Parameters for loading the numpy array into DataFrame. 241 | 242 | Returns 243 | ------- 244 | data : DataFrame 245 | Dataset of interest. 246 | 247 | """ 248 | download_dir = os.path.join(DATA_DIR, dir_name) 249 | datafile = os.path.join(download_dir, data_name) 250 | if not os.path.isfile(datafile): 251 | os.makedirs(download_dir, exist_ok=True) 252 | urllib.request.urlretrieve(data_url, filename=datafile) 253 | data_np = np.load(datafile, allow_pickle=True)['train_data'] 254 | data = pd.DataFrame(data_np, **np_params) 255 | # TODO: deal better with implicit ratings 256 | data['rating'] = 1 257 | return data 258 | 259 | def find_txt(dir_name, data_name, data_url, csv_params): 260 | """Locate or download txt file and load into DataFrame. 261 | 262 | Parameters 263 | ---------- 264 | dir_name : str 265 | The directory to put the .txt file. 266 | data_name : str 267 | The name of the .txt file. 268 | data_url : str 269 | The location of the download. 270 | csv_params : str 271 | Parameters for loading the csv into DataFrame. 272 | 273 | Returns 274 | ------- 275 | data : DataFrame 276 | Dataset of interest. 277 | 278 | """ 279 | download_dir = os.path.join(DATA_DIR, dir_name) 280 | datafile = os.path.join(download_dir, data_name) 281 | if not os.path.isfile(datafile): 282 | os.makedirs(download_dir, exist_ok=True) 283 | urllib.request.urlretrieve(data_url, filename=datafile) 284 | data = pd.read_csv(datafile, **csv_params) 285 | return data 286 | 287 | 288 | def get_data(name, load_attributes=False): 289 | """Read a dataset specified by name into pandas dataframe. 290 | 291 | Parameters 292 | ---------- 293 | name : str 294 | The name of the dataset. Must be one of: 'ml-100k', 'ml-10m', 'ml-1m', 295 | 'citeulike-a', 'pinterest', or 'lastfm'. 296 | 297 | Returns 298 | ------- 299 | data : DataFrame 300 | Dataset of interest. 
301 | 302 | """ 303 | if name == 'ml-100k': 304 | zipped_dir_name = 'ml-100k' 305 | data_name = 'u.data' 306 | data_url = 'http://files.grouplens.org/datasets/movielens/ml-100k.zip' 307 | csv_params = dict(sep='\t', header=None, usecols=[0, 1, 2, 3], 308 | names=['user_id', 'item_id', 'rating', 'timestamp']) 309 | data = read_zipped_csv(zipped_dir_name, data_name, data_url, csv_params) 310 | if load_attributes: 311 | user_attributes = read_zipped_csv(zipped_dir_name, 'u.user', data_url, 312 | dict(sep='|', header=None, usecols=[0, 1, 2, 3, 4], 313 | names=['user_id', 'age', 'gender', 'occupation', 'zip_code'])) 314 | item_attributes = read_zipped_csv(zipped_dir_name, 'u.item', data_url, 315 | dict(sep='|', header=None, usecols=[0, 1, 2, 3, 4], encoding='latin-1', 316 | names=['item_id', 'title', 'release', 'video release', 'IMDb URL'])) 317 | data = (data, user_attributes, item_attributes) 318 | elif name == 'ml-10m': 319 | zipped_dir_name = 'ml-10M100K' 320 | data_name = 'ratings.dat' 321 | data_url = 'http://files.grouplens.org/datasets/movielens/ml-10m.zip' 322 | csv_params = dict(sep='::', header=None, usecols=[0, 1, 2, 3], 323 | names=['user_id', 'item_id', 'rating', 'timestamp'], engine='python') 324 | data = read_zipped_csv(zipped_dir_name, data_name, data_url, csv_params) 325 | if load_attributes: 326 | item_attributes = read_zipped_csv(zipped_dir_name, 'movies.dat', data_url, 327 | dict(sep='::', header=None, usecols=[0, 1, 2], 328 | names=['item_id', 'title', 'genre'])) 329 | data = (data, None, item_attributes) 330 | elif name == 'ml-1m': 331 | zipped_dir_name = 'ml-1m' 332 | data_name = 'ratings.dat' 333 | data_url = 'http://files.grouplens.org/datasets/movielens/ml-1m.zip' 334 | csv_params = dict(sep='::', header=None, usecols=[0, 1, 2, 3], 335 | names=['user_id', 'item_id', 'rating', 'timestamp'], engine='python') 336 | data = read_zipped_csv(zipped_dir_name, data_name, data_url, csv_params) 337 | if load_attributes: 338 | user_attributes = read_zipped_csv(zipped_dir_name, 'users.dat', data_url, 339 | dict(sep='::', header=None, usecols=[0, 1, 2, 3, 4], 340 | names=['user_id', 'gender', 'age', 'occupation', 'zip code'])) 341 | item_attributes = read_zipped_csv(zipped_dir_name, 'movies.dat', data_url, 342 | dict(sep='::', header=None, usecols=[0, 1, 2], 343 | names=['item_id', 'title', 'genre'])) 344 | data = (data,user_attributes, item_attributes) 345 | elif name == 'citeulike-a': 346 | dir_name = 'citeulike-a' 347 | data_name = 'data.npz' 348 | data_url = ('https://raw.githubusercontent.com/tebesu/CollaborativeMemoryNetwork/' 349 | 'master/data/citeulike-a.npz') 350 | np_params = dict(columns=['user_id', 'item_id']) 351 | data = find_npz(dir_name, data_name, data_url, np_params) 352 | # TODO: additional info on users or items? 353 | elif name == 'pinterest': 354 | dir_name = 'pinterest' 355 | data_name = 'data.npz' 356 | data_url = ('https://raw.githubusercontent.com/tebesu/CollaborativeMemoryNetwork/' 357 | 'master/data/pinterest.npz') 358 | np_params = dict(columns=['user_id', 'item_id']) 359 | data = find_npz(dir_name, data_name, data_url, np_params) 360 | # TODO: additional info on users or items? 
361 | elif name == 'lastfm-360k': 362 | dir_name = 'lastfm-360k' 363 | data_name = 'LastFM360k-Le75.txt' 364 | data_url = ('https://zenodo.org/record/3964506/files/LastFM360k-Le75.txt?download=1') 365 | csv_params = dict(sep=',', header=0, usecols=[0, 1, 2], 366 | names=['user_id', 'item_id', 'rating']) 367 | data = find_txt(dir_name, data_name, data_url, csv_params) 368 | # log transform for better scaling 369 | data['rating'] = np.log(1 + data['rating']) 370 | if load_attributes: 371 | item_attributes = find_txt(dir_name, 'LastFM360k-MB-artists.txt', 372 | 'https://zenodo.org/record/3964506/files/LastFM360k-MB-artists.txt?download=1', 373 | dict(sep='\t', header=0, usecols=[0, 1, 2], 374 | names=['item_id', 'artist_name', 'gender'])) 375 | data = (data, None, item_attributes) 376 | elif name == 'lastfm': 377 | data_name = 'lastfm-dataset-1K/lfm1k-play-counts.csv' 378 | csv_params = dict(header=0, usecols=[0, 1, 2], 379 | names=['user_id', 'item_id', 'rating']) 380 | datafile = os.path.join(DATA_DIR, data_name) 381 | try: 382 | data = pd.read_csv(datafile, **csv_params) 383 | # log transform for better scaling 384 | data['rating'] = np.log(1 + data['rating']) 385 | # TODO: remove artists with less than 50 total listens? 386 | # otherwise should probably retrain for hyperparameter tuning... 387 | except FileNotFoundError as error: 388 | print(('LastFM data must be downloaded and preprocessed locally, ' 389 | 'get files from https://drive.google.com/open?id=1qxmsQHe' 390 | 'D8O-81CbHxvaFP8omMvMxgEh0')) 391 | raise error 392 | else: 393 | raise ValueError('dataset name not recognized') 394 | return data 395 | 396 | 397 | def get_time_split_dataset(name, shuffle=True, binarize=False): 398 | """Get a time-based test/train split of a dataset as specified by name. 399 | 400 | Parameters 401 | ---------- 402 | name : str 403 | The name of the dataset. Must be one of: 'ml-100k', 'ml-10m', 'citeulike-a', 404 | 'pinterest', or 'lastfm'. 405 | shuffle : bool, optional 406 | A flag to indicate whether the dataset should be shuffled after loading, 407 | true by default. 408 | binarize : bool, optional 409 | A flag to indicate whether to binarize the ratings to be 0 or 1, 410 | false by default. 411 | 412 | Returns 413 | ------- 414 | users : dict 415 | The dict of all users where the key is the user-id and the value is the user's features. 416 | items : dict 417 | The dict of all items where the key is the item-id and the value is the item's features. 418 | train_ratings : dict 419 | The dict of all training ratings. 420 | test_ratings : dict 421 | The dict of all testing ratings. 422 | 423 | """ 424 | data = get_data(name) 425 | if binarize: 426 | data['rating'] = 1 427 | 428 | users = {user_id: np.zeros(0) for user_id in np.unique(data['user_id'])} 429 | items = {item_id: np.zeros(0) for item_id in np.unique(data['item_id'])} 430 | 431 | # Add final rating to test set 432 | test_idx = [] 433 | for uid in np.unique(data['user_id']): 434 | last_rating_idx = data[data['user_id'] == uid]['timestamp'].idxmax() 435 | test_idx.append(last_rating_idx) 436 | data_test = data.loc[test_idx] 437 | data_train = data.drop(test_idx) 438 | 439 | # Shuffle remaining data 440 | if shuffle: 441 | data_train = data_train.sample(frac=1).reset_index(drop=True) 442 | 443 | # Fill the rating array with initial data.
444 | train_ratings = {} 445 | for user_id, item_id, rating in zip(data_train['user_id'], data_train['item_id'], 446 | data_train['rating']): 447 | # TODO: may want to eventually add a rating context depending on dataset (e.g. time) 448 | train_ratings[user_id, item_id] = (rating, np.zeros(0)) 449 | 450 | # Fill the rating array with initial data. 451 | test_ratings = {} 452 | for user_id, item_id, rating in zip(data_test['user_id'], data_test['item_id'], 453 | data_test['rating']): 454 | # TODO: may want to eventually add a rating context depending on dataset (e.g. time) 455 | test_ratings[user_id, item_id] = (rating, np.zeros(0)) 456 | 457 | return users, items, train_ratings, test_ratings 458 | --------------------------------------------------------------------------------
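A minimal usage sketch (not a file in the repository): the loaders above return plain dicts, so a basic offline experiment with MovieLens 100K only needs a download plus a split. This assumes reclab is installed so that reclab.data_utils is importable and that the ml-100k download succeeds; the 80/20 proportion and the seeds are illustrative.

from reclab.data_utils import read_dataset, split_ratings

# users/items map ids to (currently empty) feature arrays; ratings maps
# (user_id, item_id) pairs to (rating_value, rating_context) tuples.
users, items, ratings = read_dataset('ml-100k', shuffle=True, seed=0)

# Hold out 20% of the ratings for evaluation; both halves keep the same
# key/value structure, so they can be fed to a recommender unchanged.
train_ratings, test_ratings = split_ratings(ratings, 0.8, shuffle=True, seed=0)
print(len(users), len(items), len(train_ratings), len(test_ratings))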
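For a time-aware evaluation, get_time_split_dataset instead holds out each user's final rating (by timestamp) as the test set. A sketch under the same assumptions; it only applies to datasets whose ratings carry a timestamp column, such as the MovieLens variants.

from reclab.data_utils import get_time_split_dataset

# Each user's last-timestamped rating goes to test_ratings; the remaining
# ratings (optionally shuffled) go to train_ratings.
users, items, train_ratings, test_ratings = get_time_split_dataset('ml-100k', shuffle=True)
print(len(train_ratings), len(test_ratings))  # expect one held-out rating per user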