├── configs └── .gitignore ├── references └── .gitignore ├── reports └── figures │ └── .gitignore ├── docs ├── _static │ └── .gitignore ├── authors.rst ├── changelog.rst ├── license.rst ├── index.rst ├── Makefile └── conf.py ├── postBuild ├── data ├── raw │ ├── ml-100k │ │ ├── u.info │ │ ├── u.item │ │ ├── u.genre │ │ └── u.occupation │ └── .gitignore ├── external │ └── .gitignore ├── interim │ └── .gitignore ├── preprocessed │ └── .gitignore └── .gitignore ├── notebooks ├── fm.png ├── parrot.png ├── template.ipynb ├── 1_e_explore_movielens.ipynb ├── 3_e_demographic_recs.ipynb ├── 4_e_cf_knn_rating_pred.ipynb ├── 2_e_popularity_recs.ipynb ├── 9_e_ligthfm.ipynb ├── 6_e_cf_mf_ranking_pred.ipynb ├── extra_sport_recommender.ipynb ├── solutions │ ├── 9_s_ligthfm.ipynb │ └── 4_s_cf_knn_rating_pred.ipynb └── 8_e_hybrid_fm.ipynb ├── models └── .gitignore ├── AUTHORS.rst ├── CHANGELOG.rst ├── tests └── conftest.py ├── .isort.cfg ├── docker ├── docker-compose.yaml └── Dockerfile ├── src └── recsys_training │ ├── __init__.py │ ├── evaluation.py │ ├── utils.py │ └── data.py ├── environment.yml ├── setup.py ├── .pre-commit-config.yaml ├── .coveragerc ├── .gitignore ├── LICENSE.txt ├── scripts └── train_model.py ├── README.md └── setup.cfg /configs/.gitignore: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /references/.gitignore: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /reports/figures/.gitignore: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/_static/.gitignore: -------------------------------------------------------------------------------- 1 | # Empty directory 2 | 
-------------------------------------------------------------------------------- /postBuild: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -ex 3 | pip install . 4 | -------------------------------------------------------------------------------- /docs/authors.rst: -------------------------------------------------------------------------------- 1 | .. _authors: 2 | .. include:: ../AUTHORS.rst 3 | -------------------------------------------------------------------------------- /data/raw/ml-100k/u.info: -------------------------------------------------------------------------------- 1 | 943 users 2 | 1682 items 3 | 100000 ratings 4 | -------------------------------------------------------------------------------- /docs/changelog.rst: -------------------------------------------------------------------------------- 1 | .. _changes: 2 | .. include:: ../CHANGELOG.rst 3 | -------------------------------------------------------------------------------- /notebooks/fm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mkurovski/recsys_training/HEAD/notebooks/fm.png -------------------------------------------------------------------------------- /notebooks/parrot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mkurovski/recsys_training/HEAD/notebooks/parrot.png -------------------------------------------------------------------------------- /data/raw/.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore everything in this directory 2 | * 3 | # Except this file 4 | !.gitignore 5 | -------------------------------------------------------------------------------- /data/raw/ml-100k/u.item: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/mkurovski/recsys_training/HEAD/data/raw/ml-100k/u.item -------------------------------------------------------------------------------- /models/.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore everything in this directory 2 | * 3 | # Except this file 4 | !.gitignore 5 | -------------------------------------------------------------------------------- /data/external/.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore everything in this directory 2 | * 3 | # Except this file 4 | !.gitignore 5 | -------------------------------------------------------------------------------- /data/interim/.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore everything in this directory 2 | * 3 | # Except this file 4 | !.gitignore 5 | -------------------------------------------------------------------------------- /docs/license.rst: -------------------------------------------------------------------------------- 1 | .. _license: 2 | 3 | ======= 4 | License 5 | ======= 6 | 7 | .. 
include:: ../LICENSE.txt 8 | -------------------------------------------------------------------------------- /AUTHORS.rst: -------------------------------------------------------------------------------- 1 | ============ 2 | Contributors 3 | ============ 4 | 5 | * squall-1002 6 | -------------------------------------------------------------------------------- /data/preprocessed/.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore everything in this directory 2 | * 3 | # Except this file 4 | !.gitignore 5 | -------------------------------------------------------------------------------- /CHANGELOG.rst: -------------------------------------------------------------------------------- 1 | ========= 2 | Changelog 3 | ========= 4 | 5 | Version 0.1 6 | =========== 7 | 8 | - Feature A added 9 | - FIX: nasty bug #1729 fixed 10 | - add your changes here! 11 | -------------------------------------------------------------------------------- /data/.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore everything in this directory 2 | * 3 | # Except this file and .gitignore in sub directories 4 | !.gitignore 5 | !raw 6 | !external 7 | !preprocessed 8 | !interim 9 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Dummy conftest.py for recsys_training. 4 | 5 | If you don't know what this is for, just leave it empty. 
6 | Read more about conftest.py under: 7 | https://pytest.org/latest/plugins.html 8 | """ 9 | 10 | # import pytest 11 | -------------------------------------------------------------------------------- /data/raw/ml-100k/u.genre: -------------------------------------------------------------------------------- 1 | unknown|0 2 | Action|1 3 | Adventure|2 4 | Animation|3 5 | Children's|4 6 | Comedy|5 7 | Crime|6 8 | Documentary|7 9 | Drama|8 10 | Fantasy|9 11 | Film-Noir|10 12 | Horror|11 13 | Musical|12 14 | Mystery|13 15 | Romance|14 16 | Sci-Fi|15 17 | Thriller|16 18 | War|17 19 | Western|18 20 | 21 | -------------------------------------------------------------------------------- /data/raw/ml-100k/u.occupation: -------------------------------------------------------------------------------- 1 | administrator 2 | artist 3 | doctor 4 | educator 5 | engineer 6 | entertainment 7 | executive 8 | healthcare 9 | homemaker 10 | lawyer 11 | librarian 12 | marketing 13 | none 14 | other 15 | programmer 16 | retired 17 | salesman 18 | scientist 19 | student 20 | technician 21 | writer 22 | -------------------------------------------------------------------------------- /.isort.cfg: -------------------------------------------------------------------------------- 1 | [settings] 2 | line_length=88 3 | indent=' ' 4 | skip=.tox,.venv,build,dist 5 | known_standard_library=setuptools,pkg_resources 6 | known_test=pytest 7 | known_first_party=recsys_training 8 | sections=FUTURE,STDLIB,COMPAT,TEST,THIRDPARTY,FIRSTPARTY,LOCALFOLDER 9 | default_section=THIRDPARTY 10 | multi_line_output=3 11 | -------------------------------------------------------------------------------- /docker/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: '2' 2 | 3 | services: 4 | recsys-training-mle: 5 | image: recsys-training:mle 6 | container_name: recsys-training-mle 7 | command: "bash -c 'conda init bash && source /root/.bashrc && conda activate 
recsys_training && jupyter lab --no-browser --ip=* --port=8888 --allow-root --notebook-dir=/root/recsys_training/notebooks --NotebookApp.token=\"\"'" 8 | ports: 9 | - 8888:8888 10 | -------------------------------------------------------------------------------- /src/recsys_training/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from pkg_resources import get_distribution, DistributionNotFound 3 | 4 | try: 5 | # Change here if project is renamed and does not equal the package name 6 | dist_name = __name__ 7 | __version__ = get_distribution(dist_name).version 8 | except DistributionNotFound: 9 | __version__ = 'unknown' 10 | finally: 11 | del get_distribution, DistributionNotFound 12 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: recsys_training 2 | channels: 3 | - anaconda 4 | - conda-forge 5 | - defaults 6 | dependencies: 7 | - cython=0.29.23 8 | - ipython=7.22.0 9 | - jupyterlab=3.0.14 10 | - lightfm=1.16 11 | - matplotlib=3.3.4 12 | - notebook=6.3.0 13 | - numpy=1.20.1 14 | - pandas=1.2.4 15 | - pip=21.1 16 | - python=3.9.4 17 | - scikit-learn=0.24.1 18 | - scipy=1.6.2 19 | - seaborn=0.11.1 20 | - setuptools=49.6.0 21 | - statsmodels=0.12.2 22 | - pip: 23 | - "--editable=git+https://github.com/coreylynch/pyFM#egg=pyfm" 24 | - tqdm==4.60.0 25 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Setup file for recsys_training. 4 | Use setup.cfg to configure your project. 5 | 6 | This file was generated with PyScaffold 3.2.3. 7 | PyScaffold helps you to put up the scaffold of your new Python project. 
8 | Learn more under: https://pyscaffold.org/ 9 | """ 10 | import sys 11 | 12 | from pkg_resources import VersionConflict, require 13 | from setuptools import setup 14 | 15 | try: 16 | require('setuptools>=38.3') 17 | except VersionConflict: 18 | print("Error: version of setuptools is too old (<38.3)!") 19 | sys.exit(1) 20 | 21 | 22 | if __name__ == "__main__": 23 | setup(use_pyscaffold=False) 24 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | exclude: '^docs/conf.py' 2 | 3 | repos: 4 | - repo: git://github.com/pre-commit/pre-commit-hooks 5 | rev: v2.2.3 6 | hooks: 7 | - id: trailing-whitespace 8 | - id: check-added-large-files 9 | - id: check-ast 10 | - id: check-json 11 | - id: check-merge-conflict 12 | - id: check-xml 13 | - id: check-yaml 14 | - id: debug-statements 15 | - id: end-of-file-fixer 16 | - id: requirements-txt-fixer 17 | - id: mixed-line-ending 18 | args: ['--fix=no'] 19 | - id: flake8 20 | args: ['--max-line-length=88'] # default of Black 21 | 22 | - repo: https://github.com/pre-commit/mirrors-isort 23 | rev: v4.3.4 24 | hooks: 25 | - id: isort 26 | -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | # .coveragerc to control coverage.py 2 | [run] 3 | branch = True 4 | source = recsys_training 5 | # omit = bad_file.py 6 | 7 | [paths] 8 | source = 9 | src/ 10 | */site-packages/ 11 | 12 | [report] 13 | # Regexes for lines to exclude from consideration 14 | exclude_lines = 15 | # Have to re-enable the standard pragma 16 | pragma: no cover 17 | 18 | # Don't complain about missing debug-only code: 19 | def __repr__ 20 | if self\.debug 21 | 22 | # Don't complain if tests don't hit defensive assertion code: 23 | raise AssertionError 24 | raise NotImplementedError 25 | 26 | # Don't 
complain if non-runnable code isn't run: 27 | if 0: 28 | if __name__ == .__main__.: 29 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Temporary and binary files 2 | *~ 3 | *.py[cod] 4 | *.so 5 | *.cfg 6 | !.isort.cfg 7 | !setup.cfg 8 | *.orig 9 | *.log 10 | *.pot 11 | __pycache__/* 12 | .cache/* 13 | .*.swp 14 | */.ipynb_checkpoints/* 15 | .DS_Store 16 | 17 | # Project files 18 | .ropeproject 19 | .project 20 | .pydevproject 21 | .settings 22 | .idea 23 | tags 24 | 25 | # Package files 26 | *.egg 27 | *.eggs/ 28 | .installed.cfg 29 | *.egg-info 30 | 31 | # Unittest and coverage 32 | htmlcov/* 33 | .coverage 34 | .tox 35 | junit.xml 36 | coverage.xml 37 | .pytest_cache/ 38 | 39 | # Build and docs folder/files 40 | build/* 41 | dist/* 42 | sdist/* 43 | docs/api/* 44 | docs/_rst/* 45 | docs/_build/* 46 | cover/* 47 | MANIFEST 48 | 49 | # Per-project virtualenvs 50 | .venv*/ 51 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2019 squall-1002 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /scripts/train_model.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import logging 4 | from pathlib import Path 5 | import sys 6 | 7 | import click 8 | from IPython.core import ultratb 9 | 10 | import recsys_training 11 | 12 | # fallback to debugger on error 13 | sys.excepthook = ultratb.FormattedTB(mode='Verbose', color_scheme='Linux', call_pdb=1) 14 | 15 | _logger = logging.getLogger(__name__) 16 | 17 | 18 | @click.command() 19 | @click.option('-c', '--config', 'cfg_path', required=True, 20 | type=click.Path(exists=True), help='path to config file') 21 | @click.option('--quiet', 'log_level', flag_value=logging.WARNING, default=True) 22 | @click.option('-v', '--verbose', 'log_level', flag_value=logging.INFO) 23 | @click.option('-vv', '--very-verbose', 'log_level', flag_value=logging.DEBUG) 24 | @click.version_option(recsys_training.__version__) 25 | def main(cfg_path: Path, log_level: int): 26 | logging.basicConfig(stream=sys.stdout, 27 | level=log_level, 28 | datefmt='%Y-%m-%d %H:%M', 29 | format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') 30 | # YOUR CODE GOES HERE! 
Keep the main functionality in src/recsys_training 31 | # est = recsys_training.models.Estimator() 32 | 33 | 34 | if __name__ == '__main__': 35 | main() 36 | -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM debian:latest 2 | 3 | # $ docker build . -t continuumio/miniconda:latest -t continuumio/miniconda:4.5.11 -t continuumio/miniconda2:latest -t continuumio/miniconda2:4.5.11 4 | # $ docker run --rm -it continuumio/miniconda2:latest /bin/bash 5 | # $ docker push continuumio/miniconda:latest 6 | # $ docker push continuumio/miniconda:4.5.11 7 | # $ docker push continuumio/miniconda2:latest 8 | # $ docker push continuumio/miniconda2:4.5.11 9 | 10 | ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 11 | ENV PATH /opt/conda/bin:$PATH 12 | 13 | RUN apt-get update --fix-missing && apt-get install -y wget bzip2 ca-certificates \ 14 | libglib2.0-0 libxext6 libsm6 libxrender1 \ 15 | git mercurial subversion 16 | 17 | RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh && \ 18 | /bin/bash ~/miniconda.sh -b -p /opt/conda && \ 19 | rm ~/miniconda.sh && \ 20 | ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \ 21 | echo ". 
/opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc && \ 22 | echo "conda activate base" >> ~/.bashrc 23 | 24 | RUN apt-get install -y gcc unzip git curl grep sed dpkg && \ 25 | TINI_VERSION=`curl https://github.com/krallin/tini/releases/latest | grep -o "/v.*\"" | sed 's:^..\(.*\).$:\1:'` && \ 26 | curl -L "https://github.com/krallin/tini/releases/download/v${TINI_VERSION}/tini_${TINI_VERSION}.deb" > tini.deb && \ 27 | dpkg -i tini.deb && \ 28 | rm tini.deb && \ 29 | apt-get clean 30 | 31 | RUN cd ~ && \ 32 | git clone https://github.com/mkurovski/recsys_training.git && \ 33 | cd recsys_training && \ 34 | conda env create -f environment.yml 35 | 36 | RUN /bin/bash -c "source activate recsys_training && \ 37 | cd ~/recsys_training && \ 38 | python setup.py install" 39 | 40 | # Download and unzip data 41 | RUN wget http://files.grouplens.org/datasets/movielens/ml-100k.zip -O ~/recsys_training/data/raw/ml-100k.zip && \ 42 | cd ~/recsys_training/data/raw && \ 43 | unzip ml-100k.zip 44 | 45 | ENTRYPOINT [ "/usr/bin/tini", "--" ] 46 | CMD [ "/bin/bash" ] 47 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # recsys_training 2 | 3 | Recommender System Training Package 4 | 5 | [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/mkurovski/recsys_training/master) 6 | 7 | ## Description 8 | 9 | Hands-on Training for Recommender Systems developed for Machine Learning Essentials 2020. 10 | 11 | ## Installation 12 | 13 | In order to set up the necessary environment: 14 | 15 | 1. create an environment `recsys_training` with the help of [conda], 16 | 17 | ``` 18 | conda env create -f environment.yml 19 | ``` 20 | 21 | 2. activate the new environment with 22 | 23 | ``` 24 | conda activate recsys_training 25 | ``` 26 | 27 | 3. 
install `recsys_training` with: 28 | 29 | ``` 30 | python setup.py install # or develop 31 | ``` 32 | 33 | ### Docker 34 | 35 | Make sure you have `docker` and `docker-compose` installed. 36 | 37 | 1. Build the image using the `Dockerfile` in `docker` 38 | ``` 39 | docker build -t recsys-training:mle -f Dockerfile . 40 | ``` 41 | 42 | 2. Start the container with `docker-compose` pointing to the yaml-file 43 | ``` 44 | docker-compose -f docker/docker-compose.yaml up 45 | ``` 46 | 47 | The jupyter lab port `8888` will be mapped to the same port on your host machine, simply go to your preferred browser and enter via 48 | ``` 49 | http://localhost:8888/ 50 | ``` 51 | 52 | ## Usage 53 | 54 | There are 9 notebooks within `notebooks/` each starting with a number followed by `_e_` for exercise. Within `notebooks/solutions/` you will find all notebooks with a solution proposal implemented. It is strongly advised to go through the notebooks in numerically ascending order. 55 | 56 | We use MovieLens 100k as example dataset for the lessons. You can find the data in `data/raw/`. 57 | 58 | ## Note 59 | 60 | This project has been set up using PyScaffold 3.2.3 and the [dsproject extension] 0.4. 61 | For details and usage information on PyScaffold see https://pyscaffold.org/. 62 | 63 | [conda]: https://docs.conda.io/ 64 | [pre-commit]: https://pre-commit.com/ 65 | [Jupyter]: https://jupyter.org/ 66 | [nbstripout]: https://github.com/kynan/nbstripout 67 | [Google style]: http://google.github.io/styleguide/pyguide.html#38-comments-and-docstrings 68 | [dsproject extension]: https://github.com/pyscaffold/pyscaffoldext-dsproject 69 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | =============== 2 | recsys_training 3 | =============== 4 | 5 | This is the documentation of **recsys_training**. 6 | 7 | .. 
note:: 8 | 9 | This is the main page of your project's `Sphinx`_ documentation. 10 | It is formatted in `reStructuredText`_. Add additional pages 11 | by creating rst-files in ``docs`` and adding them to the `toctree`_ below. 12 | Use then `references`_ in order to link them from this page, e.g. 13 | :ref:`authors` and :ref:`changes`. 14 | 15 | It is also possible to refer to the documentation of other Python packages 16 | with the `Python domain syntax`_. By default you can reference the 17 | documentation of `Sphinx`_, `Python`_, `NumPy`_, `SciPy`_, `matplotlib`_, 18 | `Pandas`_, `Scikit-Learn`_. You can add more by extending the 19 | ``intersphinx_mapping`` in your Sphinx's ``conf.py``. 20 | 21 | The pretty useful extension `autodoc`_ is activated by default and lets 22 | you include documentation from docstrings. Docstrings can be written in 23 | `Google style`_ (recommended!), `NumPy style`_ and `classical style`_. 24 | 25 | 26 | Contents 27 | ======== 28 | 29 | .. toctree:: 30 | :maxdepth: 2 31 | 32 | License 33 | Authors 34 | Changelog 35 | Module Reference 36 | 37 | 38 | Indices and tables 39 | ================== 40 | 41 | * :ref:`genindex` 42 | * :ref:`modindex` 43 | * :ref:`search` 44 | 45 | .. _toctree: http://www.sphinx-doc.org/en/master/usage/restructuredtext/directives.html 46 | .. _reStructuredText: http://www.sphinx-doc.org/en/master/usage/restructuredtext/basics.html 47 | .. _references: http://www.sphinx-doc.org/en/stable/markup/inline.html 48 | .. _Python domain syntax: http://sphinx-doc.org/domains.html#the-python-domain 49 | .. _Sphinx: http://www.sphinx-doc.org/ 50 | .. _Python: http://docs.python.org/ 51 | .. _Numpy: http://docs.scipy.org/doc/numpy 52 | .. _SciPy: http://docs.scipy.org/doc/scipy/reference/ 53 | .. _matplotlib: https://matplotlib.org/contents.html# 54 | .. _Pandas: http://pandas.pydata.org/pandas-docs/stable 55 | .. _Scikit-Learn: http://scikit-learn.org/stable 56 | .. 
_autodoc: http://www.sphinx-doc.org/en/stable/ext/autodoc.html 57 | .. _Google style: https://github.com/google/styleguide/blob/gh-pages/pyguide.md#38-comments-and-docstrings 58 | .. _NumPy style: https://numpydoc.readthedocs.io/en/latest/format.html 59 | .. _classical style: http://www.sphinx-doc.org/en/stable/domains.html#info-field-lists 60 | -------------------------------------------------------------------------------- /notebooks/template.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import os\n", 10 | "import sys\n", 11 | "import math\n", 12 | "import logging\n", 13 | "from pathlib import Path\n", 14 | "\n", 15 | "import numpy as np\n", 16 | "import scipy as sp\n", 17 | "import sklearn\n", 18 | "import statsmodels.api as sm\n", 19 | "from statsmodels.formula.api import ols\n", 20 | "\n", 21 | "%load_ext autoreload\n", 22 | "%autoreload 2\n", 23 | "\n", 24 | "import matplotlib as mpl\n", 25 | "import matplotlib.pyplot as plt\n", 26 | "%matplotlib inline\n", 27 | "%config InlineBackend.figure_format = 'retina'\n", 28 | "\n", 29 | "import seaborn as sns\n", 30 | "sns.set_context(\"poster\")\n", 31 | "sns.set(rc={'figure.figsize': (16, 9.)})\n", 32 | "sns.set_style(\"whitegrid\")\n", 33 | "\n", 34 | "import pandas as pd\n", 35 | "pd.set_option(\"display.max_rows\", 120)\n", 36 | "pd.set_option(\"display.max_columns\", 120)\n", 37 | "\n", 38 | "logging.basicConfig(level=logging.INFO, stream=sys.stdout)" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "from recsys_training import *" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": {}, 53 | "source": [ 54 | "**PLEASE** save this file right now using the following naming convention: 
`NUMBER_FOR_SORTING-YOUR_INITIALS-SHORT_DESCRIPTION`, e.g. `1.0-fw-initial-data-exploration`. Use the number to order the file within the directory according to its usage." 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [] 63 | } 64 | ], 65 | "metadata": { 66 | "kernelspec": { 67 | "display_name": "Python 3", 68 | "language": "python", 69 | "name": "python3" 70 | }, 71 | "language_info": { 72 | "codemirror_mode": { 73 | "name": "ipython", 74 | "version": 3 75 | }, 76 | "file_extension": ".py", 77 | "mimetype": "text/x-python", 78 | "name": "python", 79 | "nbconvert_exporter": "python", 80 | "pygments_lexer": "ipython3", 81 | "version": "3.7.3" 82 | }, 83 | "pycharm": { 84 | "stem_cell": { 85 | "cell_type": "raw", 86 | "metadata": { 87 | "collapsed": false 88 | }, 89 | "source": [] 90 | } 91 | } 92 | }, 93 | "nbformat": 4, 94 | "nbformat_minor": 2 95 | } 96 | -------------------------------------------------------------------------------- /src/recsys_training/evaluation.py: -------------------------------------------------------------------------------- 1 | """ 2 | Ranking and Rating Evaluation Metrics 3 | """ 4 | __author__ = "Marcel Kurovski" 5 | __copyright__ = "Marcel Kurovski" 6 | __license__ = "mit" 7 | 8 | from typing import Dict, List, Tuple 9 | 10 | import numpy as np 11 | import pandas as pd 12 | 13 | 14 | def compute_mae(test_ratings: pd.DataFrame, recommender) -> Tuple[float, float]: 15 | pred = test_ratings.apply(lambda row: 16 | recommender.get_prediction(row['user'], row['item']), 17 | axis=1) 18 | 19 | pred = pred.apply(lambda val: list(val.values())[0]['pred']) 20 | notnulls = pred.notnull() 21 | mae = np.mean(np.abs(test_ratings.rating[notnulls] - pred[notnulls])) 22 | coverage = notnulls.sum()/len(test_ratings) 23 | 24 | return {'mae': mae, 'coverage': coverage} 25 | 26 | 27 | # TODO: Remove min_rating logic from here (should be done before on data through 
binarize) 28 | def retrieval_score(test_ratings: pd.DataFrame, 29 | recommender, 30 | remove_known_pos: bool = False, 31 | metric: str = 'mrr') -> float: 32 | """ 33 | Mean Average Precision / Mean Reciprocal Rank of first relevant item @ N 34 | """ 35 | N = recommender.N 36 | user_scores = [] 37 | relevant_items = get_relevant_items(test_ratings) 38 | 39 | for user in recommender.users: 40 | if user in relevant_items.keys(): 41 | predicted_items = recommender.get_recommendations(user, remove_known_pos) 42 | predicted_items = [item for item, _ in predicted_items] 43 | if metric == 'map': 44 | true_positives = np.intersect1d(relevant_items[user], 45 | predicted_items) 46 | score = len(true_positives) / N 47 | elif metric == 'mrr': 48 | score = np.mean([reciprocal_rank(item, predicted_items) 49 | for item in relevant_items[user]]) 50 | else: 51 | raise ValueError(f"Unknown value {metric} for Argument `metric`") 52 | 53 | user_scores.append(score) 54 | 55 | return np.mean(user_scores) 56 | 57 | 58 | def reciprocal_rank(item: int, ranking: List[int]) -> float: 59 | rr = 0 60 | if item in ranking: 61 | rr = 1/(ranking.index(item)+1) 62 | 63 | return rr 64 | 65 | 66 | def get_relevant_items(test_ratings: pd.DataFrame) -> Dict[int, List[int]]: 67 | """ 68 | returns {user: [items]} as a list of relevant items per user 69 | for all users found in the test dataset 70 | """ 71 | relevant_items = test_ratings[['user', 'item']] 72 | relevant_items = relevant_items.groupby('user') 73 | relevant_items = {user: relevant_items.get_group(user)['item'].values 74 | for user in relevant_items.groups.keys()} 75 | 76 | return relevant_items 77 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | # This file is used to configure your project. 
2 | # Read more about the various options under: 3 | # http://setuptools.readthedocs.io/en/latest/setuptools.html#configuring-setup-using-setup-cfg-files 4 | 5 | [metadata] 6 | name = recsys_training 7 | description = Add a short description here! 8 | author = squall-1002 9 | author-email = marcel.kurovski@googlemail.com 10 | license = mit 11 | long-description = file: README.rst 12 | long-description-content-type = text/markdown 13 | # long-description-content-type = text/x-rst; charset=UTF-8 14 | url = https://github.com/pyscaffold/pyscaffold/ 15 | project-urls = 16 | Documentation = https://pyscaffold.org/ 17 | # Change if running only on Windows, Mac or Linux (comma-separated) 18 | platforms = any 19 | # Add here all kinds of additional classifiers as defined under 20 | # https://pypi.python.org/pypi?%3Aaction=list_classifiers 21 | classifiers = 22 | Development Status :: 4 - Beta 23 | Programming Language :: Python 24 | 25 | [options] 26 | zip_safe = False 27 | packages = find: 28 | include_package_data = True 29 | package_dir = 30 | =src 31 | # DON'T CHANGE THE FOLLOWING LINE! IT WILL BE UPDATED BY PYSCAFFOLD! 32 | setup_requires = pyscaffold>=3.2a0,<3.3a0 33 | # Add here dependencies of your project (semicolon/line-separated), e.g. 34 | # install_requires = numpy; scipy 35 | # The usage of test_requires is discouraged, see `Dependency Management` docs 36 | # tests_require = pytest; pytest-cov 37 | # Require a specific Python version, e.g. 
Python 2.7 or >= 3.4 38 | # python_requires = >=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.* 39 | 40 | [options.packages.find] 41 | where = src 42 | exclude = 43 | tests 44 | 45 | [options.extras_require] 46 | # Add here additional requirements for extra features, to install with: 47 | # `pip install recsys_training[PDF]` like: 48 | # PDF = ReportLab; RXP 49 | # Add here test requirements (semicolon/line-separated) 50 | testing = 51 | pytest 52 | pytest-cov 53 | 54 | [options.entry_points] 55 | # Add here console scripts like: 56 | # console_scripts = 57 | # script_name = recsys_training.module:function 58 | # For example: 59 | # console_scripts = 60 | # fibonacci = recsys_training.skeleton:run 61 | # And any other entry points, for example: 62 | # pyscaffold.cli = 63 | # awesome = pyscaffoldext.awesome.extension:AwesomeExtension 64 | 65 | [test] 66 | # py.test options when running `python setup.py test` 67 | # addopts = --verbose 68 | extras = True 69 | 70 | [tool:pytest] 71 | # Options for py.test: 72 | # Specify command line options as you would do when invoking py.test directly. 73 | # e.g. --cov-report html (or xml) for html/xml output or --junitxml junit.xml 74 | # in order to write a coverage file that can be read by Jenkins. 
"""
Utility functions
"""
__author__ = "Marcel Kurovski"
__copyright__ = "Marcel Kurovski"
__license__ = "mit"

import logging
import sys
from typing import Dict, Optional, Tuple

import pandas as pd
import numpy as np
import scipy as sp
import scipy.sparse  # guarantee `sp.sparse` is available on all scipy versions


def setup_logging(loglevel: int):
    """Setup basic logging to stdout.

    Args:
        loglevel (int): minimum loglevel for emitting messages
    """
    logformat = "[%(asctime)s] %(levelname)s:%(name)s:%(message)s"
    logging.basicConfig(level=loglevel, stream=sys.stdout,
                        format=logformat, datefmt="%Y-%m-%d %H:%M:%S")


def get_entity_sim(a: int, b: int,
                   entity_ratings: Dict[int, Dict[int, float]],
                   metric: str = 'pearson') -> Tuple[Optional[float], int]:
    """Compute the similarity of two entities (e.g. two items or two users)
    from the ratings they share with common counterparts.

    Supported metrics:
    * ``pearson``: Pearson correlation
    * ``cosine``: cosine similarity
    * ``euclidean``: normalized euclidean similarity
    * ``adj_cosine``: adjusted cosine similarity (placeholder, not implemented)

    Jaccard similarity (intersection over union) is deliberately not offered:
    it ignores the rating values, so even if exactly the same users rated two
    items (maximal Jaccard similarity as evidence for high item similarity),
    their judgements on the two items may differ strongly, which would
    actually justify dissimilarity.

    Args:
        a: id of the first entity
        b: id of the second entity
        entity_ratings: mapping from entity id to a dict of
            counterpart id -> rating value
        metric: one of 'pearson', 'cosine', 'euclidean', 'adj_cosine'

    Returns:
        tuple of (similarity or None, number of joint ratings); the
        similarity is None when fewer than two joint ratings exist or for
        the unimplemented 'adj_cosine' metric

    Raises:
        ValueError: if `metric` is not supported
    """
    # 1. isolate the counterparts that have rated both entities a and b
    key_intersection = set(entity_ratings[a].keys()).intersection(entity_ratings[b].keys())
    ratings = np.array([(entity_ratings[a][key], entity_ratings[b][key]) for key in key_intersection])
    n_joint_ratings = len(ratings)

    if n_joint_ratings > 1:
        # 2. apply the requested similarity computation technique
        if metric == 'pearson':
            # emits a warning and yields nan if one entity's ratings have zero variance
            sim = np.corrcoef(ratings, rowvar=False)[0, 1]
        elif metric == 'cosine':
            nom = ratings[:, 0].dot(ratings[:, 1])
            denom = np.linalg.norm(ratings[:, 0]) * np.linalg.norm(ratings[:, 1])
            sim = nom / denom
        elif metric == 'euclidean':
            sim = normalized_euclidean_sim(ratings[:, 0], ratings[:, 1])
        elif metric == 'adj_cosine':
            # adjusted cosine similarity is not implemented yet
            sim = None
        else:
            # bug fix: the message previously referred to a non-existent
            # argument named 'mode' - the parameter is called 'metric'
            raise ValueError(f"Value {metric} for argument 'metric' not supported.")
    else:
        # not enough overlap to judge similarity
        sim = None

    return sim, n_joint_ratings


def normalized_euclidean_sim(a: np.ndarray, b: np.ndarray) -> float:
    """Map the euclidean distance of the unit-scaled vectors onto a
    similarity in [-1, 1] (distance 0 -> 1, maximal distance 2 -> -1)."""
    # scale to unit vectors
    a_norm = a / np.linalg.norm(a)
    b_norm = b / np.linalg.norm(b)

    dist = np.linalg.norm(a_norm - b_norm)
    # equivalent to the former `2 - dist - 1`, just written clearly
    sim = 1 - dist
    return sim


def min_max_scale(val, bounds: Dict[str, float]) -> float:
    """Linearly scale `val` into [0, 1] given `bounds` with keys 'min'/'max'."""
    min_max_range = bounds['max'] - bounds['min']
    return (val - bounds['min']) / min_max_range


def sigmoid(x):
    """Logistic function, mapping any real number into (0, 1)."""
    return 1 / (1 + np.exp(-x))


def df_to_coo(df: pd.DataFrame, n_users: int, n_items: int) -> sp.sparse.coo_matrix:
    """Turn a dataframe with 1-indexed `user` and `item` columns into a
    binary (n_users, n_items) interaction matrix in COO format (0-indexed)."""
    coo = sp.sparse.coo_matrix(([1] * len(df), (df.user.values - 1, df.item.values - 1)),
                               shape=(n_users, n_items), dtype=np.int32)
    return coo


def coo_to_df(coo) -> pd.DataFrame:
    """Inverse of `df_to_coo`: recover the 1-indexed (user, item) pairs."""
    mat = np.concatenate((coo.row.reshape(-1, 1) + 1,
                          coo.col.reshape(-1, 1) + 1),
                         axis=1)
    return pd.DataFrame(mat, columns=['user', 'item'])


def get_sparsity(sparse_arr) -> float:
    """Return the fraction of zero entries in a scipy sparse matrix."""
    num_elements = sparse_arr.shape[0] * sparse_arr.shape[1]
    num_nonzero_elements = sparse_arr.nnz
    density = num_nonzero_elements / num_elements
    return 1 - density


def one_hot_encode_ids(ids: np.ndarray, length: int) -> np.ndarray:
    """One-hot encode an array of 0-indexed ids into a (len(ids), length) matrix."""
    one_hot_enc = np.zeros((len(ids), length))
    one_hot_enc[np.arange(len(ids)), ids] = 1
    return one_hot_enc
MovieLens comes in different sizes regarding the number of movie ratings, users, and items. Take a look at the GroupLens website and explore them yourself.
28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 1, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "import numpy as np\n", 37 | "import pandas as pd\n", 38 | "import seaborn as sns" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 2, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "from recsys_training.data import genres" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 3, 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "ml100k_ratings_filepath = '../data/raw/ml-100k/u.data'\n", 57 | "ml100k_item_filepath = '../data/raw/ml-100k/u.item'\n", 58 | "ml100k_user_filepath = '../data/raw/ml-100k/u.user'" 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": {}, 64 | "source": [ 65 | "## Load Data" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 4, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "ratings = pd.read_csv(ml100k_ratings_filepath,\n", 75 | " sep='\\t',\n", 76 | " header=None,\n", 77 | " names=['user', 'item', 'rating', 'timestamp'],\n", 78 | " engine='python')" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 5, 84 | "metadata": {}, 85 | "outputs": [], 86 | "source": [ 87 | "items = pd.read_csv(ml100k_item_filepath, sep='|', header=None,\n", 88 | " names=['item', 'title', 'release', 'video_release', 'imdb_url']+genres,\n", 89 | " engine='python')" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 6, 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "users = pd.read_csv(ml100k_user_filepath, sep='|', header=None,\n", 99 | " names=['user', 'age', 'gender', 'occupation', 'zip'])" 100 | ] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "metadata": {}, 105 | "source": [ 106 | "## Data Exploration" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": {}, 112 | "source": [ 113 | "In this unit, we like to get a better 
Therefore, let's have a look at some statistics to get familiar with the data and algorithms.
_logger = logging.getLogger(__name__)


# Genre columns of the MovieLens 100k `u.item` file, in file order
genres = [
    'unknown',
    'Action',
    'Adventure',
    'Animation',
    'Children',
    'Comedy',
    'Crime',
    'Documentary',
    'Drama',
    'Fantasy',
    'Film-Noir',
    'Horror',
    'Musical',
    'Mystery',
    'Romance',
    'Sci-Fi',
    'Thriller',
    'War',
    'Western'
]


# TODO: Generalize initialization into from dataframes and from file
class Dataset(object):
    """Loads the MovieLens 100k ratings and offers rating splitting,
    implicit-feedback filtering, and per-user rating lookups."""

    def __init__(self, filepath: str):
        """
        Args:
            filepath: path to the tab-separated `u.data` ratings file
        """
        self.filepath = filepath
        self._load()

    def _load(self):
        """Read the ratings file and derive user/item/rating counts."""
        self.ratings = pd.read_csv(self.filepath,
                                   sep='\t',
                                   header=None,
                                   names=['user', 'item', 'rating', 'timestamp'],
                                   engine='python')
        self.users = sorted(self.ratings['user'].unique())
        self.items = sorted(self.ratings['item'].unique())
        self.n_users = len(self.users)
        self.n_items = len(self.items)
        self.n_ratings = len(self.ratings)

    def rating_split(self, train_size: float = 0.8, seed: int = 42):
        """Randomly split the ratings into `train_ratings` and `test_ratings`.

        This is a split over individual ratings instead of a user/item split,
        so a user or item may end up in only one of the two sets.

        Args:
            train_size: fraction of ratings assigned to the training set
                (bug fix: annotated as float, was mistyped as int)
            seed: seed for numpy's random number generator
        """
        np.random.seed(seed)
        idxs = np.random.choice(self.n_ratings, size=self.n_ratings, replace=False)
        split_point = int(self.n_ratings * train_size)
        train_idxs, test_idxs = idxs[:split_point], idxs[split_point:]
        self.train_ratings = self.ratings.loc[train_idxs]
        self.test_ratings = self.ratings.loc[test_idxs]

    def filter(self, min_rating: float = 4.0):
        """Only keep ratings above threshold as positive implicit feedback,
        dropping the timestamp column and reindexing."""
        idxs = self.ratings[self.ratings['rating'] >= min_rating].index.values
        self.ratings = self.ratings.loc[idxs, ['user', 'item', 'rating']]
        self.ratings.reset_index(drop=True, inplace=True)
        self.n_ratings = len(self.ratings)

    def get_user_ratings(self, dataset: str = 'train') -> Dict[int, Dict[int, float]]:
        """Return a mapping user -> {item: rating} for the chosen split.

        Args:
            dataset: 'train' for the training split, anything else for test

        Returns:
            dict with an entry for every known user; users without ratings
            in the chosen split map to an empty dict (bug fix: previously
            `grouped.get_group` raised a KeyError for such users, which is
            guaranteed to occur for the smaller split of a rating split)
        """
        if dataset == 'train':
            ratings = self.train_ratings
        else:
            ratings = self.test_ratings
        grouped = ratings[['user', 'item', 'rating']].groupby('user')

        user_ratings = {}
        for user in self.users:
            if user in grouped.groups:
                vals = grouped.get_group(user)[['item', 'rating']].values
                user_ratings[user] = dict(zip(vals[:, 0].astype(int),
                                              vals[:, 1].astype(float)))
            else:
                # user has no ratings in this split
                user_ratings[user] = {}

        return user_ratings


def preprocess_users(users: pd.DataFrame, zip_digits_to_cut: int = 3) -> pd.DataFrame:
    """Turn the raw MovieLens user frame into numeric features.

    Scales `age` into [0, 1], maps `occupation` and `gender` to integer ids,
    and truncates `zip` codes by `zip_digits_to_cut` trailing digits
    (non-numeric zips are replaced by '00000' first).

    Note: mutates and returns the passed-in frame.
    """
    user_age_bounds = {'min': users['age'].min(),
                       'max': users['age'].max()}
    occupations = sorted(users['occupation'].unique())
    user_occupation_map = dict(zip(occupations, range(len(occupations))))
    genders = sorted(users['gender'].unique())
    user_gender_map = dict(zip(genders, range(len(genders))))
    idxs = users[~users['zip'].str.isnumeric()].index
    users.loc[idxs, 'zip'] = '00000'

    users['age'] = users['age'].apply(lambda age: min_max_scale(age, user_age_bounds))
    users['occupation'] = users['occupation'].map(user_occupation_map)
    users['gender'] = users['gender'].map(user_gender_map)
    users['zip'] = users['zip'].apply(lambda val: int(val) // 10 ** zip_digits_to_cut)

    return users


def preprocess_items(items: pd.DataFrame) -> pd.DataFrame:
    """Turn the raw MovieLens item frame into numeric features.

    Parses `release` ('DD-Mon-YYYY') into numeric `release_month` and
    `release_year`, imputes missing release dates, scales the year into
    [0, 1], and drops the text columns.

    Note: mutates and returns the passed-in frame.
    """
    idxs = items[items['release'].notnull()].index
    items.loc[idxs, 'release_month'] = items.loc[idxs, 'release'].str.split('-')
    items.loc[idxs, 'release_month'] = \
        items.loc[idxs, 'release_month'].apply(lambda val: val[1])
    items.loc[idxs, 'release_year'] = items.loc[idxs, 'release'].str.split('-')
    items.loc[idxs, 'release_year'] = \
        items.loc[idxs, 'release_year'].apply(lambda val: val[2]).astype(int)

    # map three-letter month abbreviations ('Jan', ...) to 1..12
    release_month_map = dict((v, k) for k, v in enumerate(calendar.month_abbr))
    items.loc[idxs, 'release_month'] = items.loc[idxs, 'release_month'].map(
        release_month_map)

    # impute missing release dates with the most frequent month and the
    # median year (the `describe()['50%']` entry)
    top_month = items['release_month'].value_counts().index[0]
    top_year = items.loc[idxs, 'release_year'].astype(int).describe()['50%']
    idx = items[items['release'].isnull()].index
    items.loc[idx, 'release_month'] = top_month
    items.loc[idx, 'release_year'] = top_year

    item_year_bounds = {'min': items['release_year'].min(),
                        'max': items['release_year'].max()}
    items['release_year'] = items['release_year'].apply(
        lambda year: min_max_scale(year, item_year_bounds))
    items.drop(['title', 'release', 'video_release', 'imdb_url'], axis=1, inplace=True)

    return items


def get_user_profiles(ratings: pd.DataFrame, prep_items: pd.DataFrame) -> pd.DataFrame:
    """Aggregate each user's positively rated items (rating >= 4) into a
    profile of mean genre shares and release-year statistics.

    Args:
        ratings: raw ratings with 'user', 'item', 'rating', 'timestamp'
        prep_items: output of `preprocess_items`

    Returns:
        one row per user with genre distribution plus release-year
        mean/std/median
    """
    min_rating = 4
    # bug fix: work on a copy so the in-place drops below do not operate on
    # (and warn about) a slice of the caller's frame
    ratings = ratings[ratings.rating >= min_rating].copy()
    ratings.drop(['rating', 'timestamp'], axis=1, inplace=True)
    ratings = ratings.merge(prep_items, on='item', how='left')
    ratings.drop(['item', 'release_month'], axis=1, inplace=True)
    grouped = ratings.groupby('user')
    profiles = grouped.apply(user_profiler).reset_index()
    profiles.rename(columns={'50%': 'median'}, inplace=True)

    return profiles


def user_profiler(group: pd.DataFrame) -> pd.Series:
    """Reduce one user's item rows to mean genre shares plus the
    mean/std/median of the scaled release year."""
    genre_dist = group[genres].mean()
    year_dist = group['release_year'].describe()[['mean', 'std', '50%']]

    return pd.concat((genre_dist, year_dist), axis=0)
5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = ../build/sphinx/ 9 | AUTODOCDIR = api 10 | AUTODOCBUILD = sphinx-apidoc 11 | PROJECT = recsys_training 12 | MODULEDIR = ../src/recsys_training 13 | 14 | # User-friendly check for sphinx-build 15 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $?), 1) 16 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) 17 | endif 18 | 19 | # Internal variables. 20 | PAPEROPT_a4 = -D latex_paper_size=a4 21 | PAPEROPT_letter = -D latex_paper_size=letter 22 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 23 | # the i18n builder cannot share the environment and doctrees with the others 24 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
25 | 26 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext doc-requirements 27 | 28 | help: 29 | @echo "Please use \`make ' where is one of" 30 | @echo " html to make standalone HTML files" 31 | @echo " dirhtml to make HTML files named index.html in directories" 32 | @echo " singlehtml to make a single large HTML file" 33 | @echo " pickle to make pickle files" 34 | @echo " json to make JSON files" 35 | @echo " htmlhelp to make HTML files and a HTML help project" 36 | @echo " qthelp to make HTML files and a qthelp project" 37 | @echo " devhelp to make HTML files and a Devhelp project" 38 | @echo " epub to make an epub" 39 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 40 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 41 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 42 | @echo " text to make text files" 43 | @echo " man to make manual pages" 44 | @echo " texinfo to make Texinfo files" 45 | @echo " info to make Texinfo files and run them through makeinfo" 46 | @echo " gettext to make PO message catalogs" 47 | @echo " changes to make an overview of all changed/added/deprecated items" 48 | @echo " xml to make Docutils-native XML files" 49 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 50 | @echo " linkcheck to check all external links for integrity" 51 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 52 | 53 | clean: 54 | rm -rf $(BUILDDIR)/* $(AUTODOCDIR) 55 | 56 | $(AUTODOCDIR): $(MODULEDIR) 57 | mkdir -p $@ 58 | $(AUTODOCBUILD) -f -o $@ $^ 59 | 60 | doc-requirements: $(AUTODOCDIR) 61 | 62 | html: doc-requirements 63 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 64 | @echo 65 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 
66 | 67 | dirhtml: doc-requirements 68 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 69 | @echo 70 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 71 | 72 | singlehtml: doc-requirements 73 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 74 | @echo 75 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 76 | 77 | pickle: doc-requirements 78 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 79 | @echo 80 | @echo "Build finished; now you can process the pickle files." 81 | 82 | json: doc-requirements 83 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 84 | @echo 85 | @echo "Build finished; now you can process the JSON files." 86 | 87 | htmlhelp: doc-requirements 88 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 89 | @echo 90 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 91 | ".hhp project file in $(BUILDDIR)/htmlhelp." 92 | 93 | qthelp: doc-requirements 94 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 95 | @echo 96 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 97 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 98 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/$(PROJECT).qhcp" 99 | @echo "To view the help file:" 100 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/$(PROJECT).qhc" 101 | 102 | devhelp: doc-requirements 103 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 104 | @echo 105 | @echo "Build finished." 106 | @echo "To view the help file:" 107 | @echo "# mkdir -p $HOME/.local/share/devhelp/$(PROJECT)" 108 | @echo "# ln -s $(BUILDDIR)/devhelp $HOME/.local/share/devhelp/$(PROJEC)" 109 | @echo "# devhelp" 110 | 111 | epub: doc-requirements 112 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 113 | @echo 114 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 
115 | 116 | patch-latex: 117 | find _build/latex -iname "*.tex" | xargs -- \ 118 | sed -i'' 's~includegraphics{~includegraphics\[keepaspectratio,max size={\\textwidth}{\\textheight}\]{~g' 119 | 120 | latex: doc-requirements 121 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 122 | $(MAKE) patch-latex 123 | @echo 124 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 125 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 126 | "(use \`make latexpdf' here to do that automatically)." 127 | 128 | latexpdf: doc-requirements 129 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 130 | $(MAKE) patch-latex 131 | @echo "Running LaTeX files through pdflatex..." 132 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 133 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 134 | 135 | latexpdfja: doc-requirements 136 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 137 | @echo "Running LaTeX files through platex and dvipdfmx..." 138 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 139 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 140 | 141 | text: doc-requirements 142 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 143 | @echo 144 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 145 | 146 | man: doc-requirements 147 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 148 | @echo 149 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 150 | 151 | texinfo: doc-requirements 152 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 153 | @echo 154 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 155 | @echo "Run \`make' in that directory to run these through makeinfo" \ 156 | "(use \`make info' here to do that automatically)." 157 | 158 | info: doc-requirements 159 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 160 | @echo "Running Texinfo files through makeinfo..." 
161 | make -C $(BUILDDIR)/texinfo info 162 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 163 | 164 | gettext: doc-requirements 165 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 166 | @echo 167 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 168 | 169 | changes: doc-requirements 170 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 171 | @echo 172 | @echo "The overview file is in $(BUILDDIR)/changes." 173 | 174 | linkcheck: doc-requirements 175 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 176 | @echo 177 | @echo "Link check complete; look for any errors in the above output " \ 178 | "or in $(BUILDDIR)/linkcheck/output.txt." 179 | 180 | doctest: doc-requirements 181 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 182 | @echo "Testing of doctests in the sources finished, look at the " \ 183 | "results in $(BUILDDIR)/doctest/output.txt." 184 | 185 | xml: doc-requirements 186 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 187 | @echo 188 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 189 | 190 | pseudoxml: doc-requirements 191 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 192 | @echo 193 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 194 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is execfile()d with the current directory set to its containing dir. 4 | # 5 | # Note that not all possible configuration values are present in this 6 | # autogenerated file. 7 | # 8 | # All configuration values have a default; values that are commented out 9 | # serve to show the default. 
10 | 11 | import os 12 | import sys 13 | import inspect 14 | import shutil 15 | 16 | __location__ = os.path.join(os.getcwd(), os.path.dirname( 17 | inspect.getfile(inspect.currentframe()))) 18 | 19 | # If extensions (or modules to document with autodoc) are in another directory, 20 | # add these directories to sys.path here. If the directory is relative to the 21 | # documentation root, use os.path.abspath to make it absolute, like shown here. 22 | sys.path.insert(0, os.path.join(__location__, '../src')) 23 | 24 | # -- Run sphinx-apidoc ------------------------------------------------------ 25 | # This hack is necessary since RTD does not issue `sphinx-apidoc` before running 26 | # `sphinx-build -b html . _build/html`. See Issue: 27 | # https://github.com/rtfd/readthedocs.org/issues/1139 28 | # DON'T FORGET: Check the box "Install your project inside a virtualenv using 29 | # setup.py install" in the RTD Advanced Settings. 30 | # Additionally it helps us to avoid running apidoc manually 31 | 32 | try: # for Sphinx >= 1.7 33 | from sphinx.ext import apidoc 34 | except ImportError: 35 | from sphinx import apidoc 36 | 37 | output_dir = os.path.join(__location__, "api") 38 | module_dir = os.path.join(__location__, "../src/recsys_training") 39 | try: 40 | shutil.rmtree(output_dir) 41 | except FileNotFoundError: 42 | pass 43 | 44 | try: 45 | import sphinx 46 | from pkg_resources import parse_version 47 | 48 | cmd_line_template = "sphinx-apidoc -f -o {outputdir} {moduledir}" 49 | cmd_line = cmd_line_template.format(outputdir=output_dir, moduledir=module_dir) 50 | 51 | args = cmd_line.split(" ") 52 | if parse_version(sphinx.__version__) >= parse_version('1.7'): 53 | args = args[1:] 54 | 55 | apidoc.main(args) 56 | except Exception as e: 57 | print("Running `sphinx-apidoc` failed!\n{}".format(e)) 58 | 59 | # -- General configuration ----------------------------------------------------- 60 | 61 | # If your documentation needs a minimal Sphinx version, state it here. 
62 | # needs_sphinx = '1.0' 63 | 64 | # Add any Sphinx extension module names here, as strings. They can be extensions 65 | # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 66 | extensions = ['sphinx.ext.autodoc', 'sphinx.ext.intersphinx', 'sphinx.ext.todo', 67 | 'sphinx.ext.autosummary', 'sphinx.ext.viewcode', 'sphinx.ext.coverage', 68 | 'sphinx.ext.doctest', 'sphinx.ext.ifconfig', 'sphinx.ext.mathjax', 69 | 'sphinx.ext.napoleon'] 70 | 71 | # Add any paths that contain templates here, relative to this directory. 72 | templates_path = ['_templates'] 73 | 74 | 75 | # To configure AutoStructify 76 | def setup(app): 77 | from recommonmark.transform import AutoStructify 78 | app.add_config_value('recommonmark_config', { 79 | 'auto_toc_tree_section': 'Contents', 80 | 'enable_eval_rst': True, 81 | 'enable_auto_doc_ref': True, 82 | 'enable_math': True, 83 | 'enable_inline_math': True 84 | }, True) 85 | app.add_transform(AutoStructify) 86 | 87 | # Additional parsers besides rst 88 | source_parsers = { 89 | '.md': 'recommonmark.parser.CommonMarkParser', 90 | } 91 | 92 | # The suffix of source filenames. 93 | source_suffix = ['.rst', '.md'] 94 | 95 | # The encoding of source files. 96 | # source_encoding = 'utf-8-sig' 97 | 98 | # The master toctree document. 99 | master_doc = 'index' 100 | 101 | # General information about the project. 102 | project = u'recsys_training' 103 | copyright = u'2019, squall-1002' 104 | 105 | # The version info for the project you're documenting, acts as replacement for 106 | # |version| and |release|, also used in various other places throughout the 107 | # built documents. 108 | # 109 | # The short X.Y version. 110 | version = '' # Is set by calling `setup.py docs` 111 | # The full version, including alpha/beta/rc tags. 112 | release = '' # Is set by calling `setup.py docs` 113 | 114 | # The language for content autogenerated by Sphinx. Refer to documentation 115 | # for a list of supported languages. 
116 | # language = None 117 | 118 | # There are two options for replacing |today|: either, you set today to some 119 | # non-false value, then it is used: 120 | # today = '' 121 | # Else, today_fmt is used as the format for a strftime call. 122 | # today_fmt = '%B %d, %Y' 123 | 124 | # List of patterns, relative to source directory, that match files and 125 | # directories to ignore when looking for source files. 126 | exclude_patterns = ['_build'] 127 | 128 | # The reST default role (used for this markup: `text`) to use for all documents. 129 | # default_role = None 130 | 131 | # If true, '()' will be appended to :func: etc. cross-reference text. 132 | # add_function_parentheses = True 133 | 134 | # If true, the current module name will be prepended to all description 135 | # unit titles (such as .. function::). 136 | # add_module_names = True 137 | 138 | # If true, sectionauthor and moduleauthor directives will be shown in the 139 | # output. They are ignored by default. 140 | # show_authors = False 141 | 142 | # The name of the Pygments (syntax highlighting) style to use. 143 | pygments_style = 'sphinx' 144 | 145 | # A list of ignored prefixes for module index sorting. 146 | # modindex_common_prefix = [] 147 | 148 | # If true, keep warnings as "system message" paragraphs in the built documents. 149 | # keep_warnings = False 150 | 151 | 152 | # -- Options for HTML output --------------------------------------------------- 153 | 154 | # The theme to use for HTML and HTML Help pages. See the documentation for 155 | # a list of builtin themes. 156 | html_theme = 'alabaster' 157 | 158 | # Theme options are theme-specific and customize the look and feel of a theme 159 | # further. For a list of options available for each theme, see the 160 | # documentation. 161 | html_theme_options = { 162 | 'sidebar_width': '300px', 163 | 'page_width': '1200px' 164 | } 165 | 166 | # Add any paths that contain custom themes here, relative to this directory. 
167 | # html_theme_path = [] 168 | 169 | # The name for this set of Sphinx documents. If None, it defaults to 170 | # " v documentation". 171 | try: 172 | from recsys_training import __version__ as version 173 | except ImportError: 174 | pass 175 | else: 176 | release = version 177 | 178 | # A shorter title for the navigation bar. Default is the same as html_title. 179 | # html_short_title = None 180 | 181 | # The name of an image file (relative to this directory) to place at the top 182 | # of the sidebar. 183 | # html_logo = "" 184 | 185 | # The name of an image file (within the static path) to use as favicon of the 186 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 187 | # pixels large. 188 | # html_favicon = None 189 | 190 | # Add any paths that contain custom static files (such as style sheets) here, 191 | # relative to this directory. They are copied after the builtin static files, 192 | # so a file named "default.css" will overwrite the builtin "default.css". 193 | html_static_path = ['_static'] 194 | 195 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 196 | # using the given strftime format. 197 | # html_last_updated_fmt = '%b %d, %Y' 198 | 199 | # If true, SmartyPants will be used to convert quotes and dashes to 200 | # typographically correct entities. 201 | # html_use_smartypants = True 202 | 203 | # Custom sidebar templates, maps document names to template names. 204 | # html_sidebars = {} 205 | 206 | # Additional templates that should be rendered to pages, maps page names to 207 | # template names. 208 | # html_additional_pages = {} 209 | 210 | # If false, no module index is generated. 211 | # html_domain_indices = True 212 | 213 | # If false, no index is generated. 214 | # html_use_index = True 215 | 216 | # If true, the index is split into individual pages for each letter. 217 | # html_split_index = False 218 | 219 | # If true, links to the reST sources are added to the pages. 
220 | # html_show_sourcelink = True 221 | 222 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 223 | # html_show_sphinx = True 224 | 225 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 226 | # html_show_copyright = True 227 | 228 | # If true, an OpenSearch description file will be output, and all pages will 229 | # contain a tag referring to it. The value of this option must be the 230 | # base URL from which the finished HTML is served. 231 | # html_use_opensearch = '' 232 | 233 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 234 | # html_file_suffix = None 235 | 236 | # Output file base name for HTML help builder. 237 | htmlhelp_basename = 'recsys_training-doc' 238 | 239 | 240 | # -- Options for LaTeX output -------------------------------------------------- 241 | 242 | latex_elements = { 243 | # The paper size ('letterpaper' or 'a4paper'). 244 | # 'papersize': 'letterpaper', 245 | 246 | # The font size ('10pt', '11pt' or '12pt'). 247 | # 'pointsize': '10pt', 248 | 249 | # Additional stuff for the LaTeX preamble. 250 | # 'preamble': '', 251 | } 252 | 253 | # Grouping the document tree into LaTeX files. List of tuples 254 | # (source start file, target name, title, author, documentclass [howto/manual]). 255 | latex_documents = [ 256 | ('index', 'user_guide.tex', u'recsys_training Documentation', 257 | u'squall-1002', 'manual'), 258 | ] 259 | 260 | # The name of an image file (relative to this directory) to place at the top of 261 | # the title page. 262 | # latex_logo = "" 263 | 264 | # For "manual" documents, if this is true, then toplevel headings are parts, 265 | # not chapters. 266 | # latex_use_parts = False 267 | 268 | # If true, show page references after internal links. 269 | # latex_show_pagerefs = False 270 | 271 | # If true, show URL addresses after external links. 272 | # latex_show_urls = False 273 | 274 | # Documents to append as an appendix to all manuals. 
275 | # latex_appendices = [] 276 | 277 | # If false, no module index is generated. 278 | # latex_domain_indices = True 279 | 280 | # -- External mapping ------------------------------------------------------------ 281 | python_version = '.'.join(map(str, sys.version_info[0:2])) 282 | intersphinx_mapping = { 283 | 'sphinx': ('http://www.sphinx-doc.org/en/stable', None), 284 | 'python': ('https://docs.python.org/' + python_version, None), 285 | 'matplotlib': ('https://matplotlib.org', None), 286 | 'numpy': ('https://docs.scipy.org/doc/numpy', None), 287 | 'sklearn': ('http://scikit-learn.org/stable', None), 288 | 'pandas': ('http://pandas.pydata.org/pandas-docs/stable', None), 289 | 'scipy': ('https://docs.scipy.org/doc/scipy/reference', None), 290 | } 291 | -------------------------------------------------------------------------------- /notebooks/3_e_demographic_recs.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Unit 3: Demographic Recommendations" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "In this section we leave the boring field of unpersonalized content and do our first steps for more personalization. But, before tailoring content to individuals we first tailor content to groups of individuals that by some criteria seem to be similar and therefore - assumed to - consume similar content.\n", 15 | "\n", 16 | "We distinguish individuals into groups by using demographic information we have on these individuals. This can be any of\n", 17 | "* age\n", 18 | "* gender\n", 19 | "* citizenship\n", 20 | "* income\n", 21 | "* etc." 
22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "import itertools\n", 31 | "from typing import List\n", 32 | "\n", 33 | "import numpy as np\n", 34 | "import pandas as pd" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "from recsys_training.data import Dataset\n", 44 | "from recsys_training.evaluation import get_relevant_items" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "ml100k_ratings_filepath = '../data/raw/ml-100k/u.data'\n", 54 | "ml100k_user_filepath = '../data/raw/ml-100k/u.user'" 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": {}, 60 | "source": [ 61 | "## Load Data" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "data = Dataset(ml100k_ratings_filepath)\n", 71 | "data.rating_split(seed=42)\n", 72 | "user_ratings = data.get_user_ratings()" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "MovieLens also provides some demographic data on users along with the datasets. We will user _age_ and _gender_ in this tutorial to create different groups." 
80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": [ 88 | "users = pd.read_csv(ml100k_user_filepath, sep='|', header=None,\n", 89 | " names=['user', 'age', 'gender', 'occupation', 'zip'])" 90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": {}, 95 | "source": [ 96 | "## Explore Data" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "users.head()" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [ 114 | "users.age.hist()" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "metadata": {}, 120 | "source": [ 121 | "Let's define 2 x 6 user groups by splitting by gender and age class (see advice [here](https://support.google.com/analytics/answer/2799357?hl=de))" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "gender_groups = ['M', 'F']\n", 131 | "age_groups = [(18, 24),\n", 132 | " (25, 34),\n", 133 | " (35, 44),\n", 134 | " (45, 54),\n", 135 | " (55, 65),\n", 136 | " (65, 73)]\n", 137 | "\n", 138 | "user_groups = list(itertools.product(gender_groups, age_groups))\n", 139 | "user_group_indices = range(len(user_groups))\n", 140 | "user_groups = dict(zip(user_group_indices, user_groups))" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "user_groups" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 158 | "def assign_group(row, age_groups=age_groups):\n", 159 | " for age_group in age_groups:\n", 160 | " if row['age'] >= age_group[0] and row['age'] <= age_group[1]:\n", 161 | " 
break\n", 162 | " return (row['gender'], age_group)" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "metadata": {}, 169 | "outputs": [], 170 | "source": [ 171 | "users['group'] = users.apply(lambda row: assign_group(row, age_groups), axis=1)" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": null, 177 | "metadata": {}, 178 | "outputs": [], 179 | "source": [ 180 | "users['group'] = users['group'].map(lambda val: list(user_groups.values()).index(val))" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": null, 186 | "metadata": {}, 187 | "outputs": [], 188 | "source": [ 189 | "users['group'].value_counts()" 190 | ] 191 | }, 192 | { 193 | "cell_type": "markdown", 194 | "metadata": {}, 195 | "source": [ 196 | "![](../parrot.png)\n", 197 | "\n", 198 | "For each group we use popularity recommendations based on the groups historical viewing popularity.\n", 199 | "\n", 200 | "**Task**: Infer the `group_popularities` as a mapping from group index to the item ordering array." 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": null, 206 | "metadata": {}, 207 | "outputs": [], 208 | "source": [ 209 | "group_popularities = dict.fromkeys(user_group_indices)" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": null, 215 | "metadata": {}, 216 | "outputs": [], 217 | "source": [ 218 | "for group_idx in user_group_indices:\n", 219 | " pass" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": null, 225 | "metadata": {}, 226 | "outputs": [], 227 | "source": [ 228 | "group_popularities" 229 | ] 230 | }, 231 | { 232 | "cell_type": "markdown", 233 | "metadata": {}, 234 | "source": [ 235 | "![](parrot.png)\n", 236 | "\n", 237 | "**Task:** Adapt `get_recommendations` from the previous notebook and compute the $MAP@10$ for demographic recommendations." 
238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": null, 243 | "metadata": {}, 244 | "outputs": [], 245 | "source": [ 246 | "user_group_map = dict(zip(users['user'].values,users['group'].values))" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": null, 252 | "metadata": {}, 253 | "outputs": [], 254 | "source": [ 255 | "def get_recommendations(user: int,\n", 256 | " user_ratings: dict,\n", 257 | " item_popularity_order: np.array,\n", 258 | " N: int) -> List[int]:\n", 259 | " known_positives = None\n", 260 | " recommendations = None\n", 261 | " \n", 262 | " return recommendations" 263 | ] 264 | }, 265 | { 266 | "cell_type": "markdown", 267 | "metadata": {}, 268 | "source": [ 269 | "## Evaluation Evaluating the Relevance of Recommendations" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": 19, 275 | "metadata": {}, 276 | "outputs": [], 277 | "source": [ 278 | "relevant_items = get_relevant_items(data.test_ratings)" 279 | ] 280 | }, 281 | { 282 | "cell_type": "markdown", 283 | "metadata": {}, 284 | "source": [ 285 | "Computing $MAP@10$" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": 20, 291 | "metadata": {}, 292 | "outputs": [], 293 | "source": [ 294 | "N = 10" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": 21, 300 | "metadata": {}, 301 | "outputs": [], 302 | "source": [ 303 | "users = relevant_items.keys()\n", 304 | "prec_at_N = dict.fromkeys(users)\n", 305 | "\n", 306 | "for user in users:\n", 307 | " recommendations = get_recommendations(user,\n", 308 | " user_ratings,\n", 309 | " user_group_map,\n", 310 | " group_popularities,\n", 311 | " N=N)\n", 312 | " hits = np.intersect1d(recommendations,\n", 313 | " relevant_items[user])\n", 314 | " prec_at_N[user] = len(hits)/N" 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "execution_count": 22, 320 | "metadata": {}, 321 | "outputs": [ 322 | { 323 | "data": { 324 | 
"text/plain": [ 325 | "0.06404255319148937" 326 | ] 327 | }, 328 | "execution_count": 22, 329 | "metadata": {}, 330 | "output_type": "execute_result" 331 | } 332 | ], 333 | "source": [ 334 | "np.mean(list(prec_at_N.values()))" 335 | ] 336 | }, 337 | { 338 | "cell_type": "markdown", 339 | "metadata": {}, 340 | "source": [ 341 | "What is the $MAP@10$ for ea. specific group?" 342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "execution_count": 23, 347 | "metadata": {}, 348 | "outputs": [], 349 | "source": [ 350 | "group_maps = dict.fromkeys(user_group_indices, list())\n", 351 | "for user in users:\n", 352 | " group_maps[user_group_map[user]].append(prec_at_N[user])\n", 353 | "for group in user_group_indices:\n", 354 | " group_maps[group] = np.mean(group_maps[group])" 355 | ] 356 | }, 357 | { 358 | "cell_type": "code", 359 | "execution_count": 24, 360 | "metadata": {}, 361 | "outputs": [ 362 | { 363 | "data": { 364 | "text/plain": [ 365 | "{0: 0.06404255319148937,\n", 366 | " 1: 0.06404255319148937,\n", 367 | " 2: 0.06404255319148937,\n", 368 | " 3: 0.06404255319148937,\n", 369 | " 4: 0.06404255319148937,\n", 370 | " 5: 0.06404255319148937,\n", 371 | " 6: 0.06404255319148937,\n", 372 | " 7: 0.06404255319148937,\n", 373 | " 8: 0.06404255319148937,\n", 374 | " 9: 0.06404255319148937,\n", 375 | " 10: 0.06404255319148937,\n", 376 | " 11: 0.06404255319148937}" 377 | ] 378 | }, 379 | "execution_count": 24, 380 | "metadata": {}, 381 | "output_type": "execute_result" 382 | } 383 | ], 384 | "source": [ 385 | "group_maps" 386 | ] 387 | } 388 | ], 389 | "metadata": { 390 | "kernelspec": { 391 | "display_name": "Python 3", 392 | "language": "python", 393 | "name": "python3" 394 | }, 395 | "language_info": { 396 | "codemirror_mode": { 397 | "name": "ipython", 398 | "version": 3 399 | }, 400 | "file_extension": ".py", 401 | "mimetype": "text/x-python", 402 | "name": "python", 403 | "nbconvert_exporter": "python", 404 | "pygments_lexer": "ipython3", 405 | "version": "3.7.5" 
406 | } 407 | }, 408 | "nbformat": 4, 409 | "nbformat_minor": 4 410 | } 411 | -------------------------------------------------------------------------------- /notebooks/4_e_cf_knn_rating_pred.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Unit 4: Neighborhood-based Collaborative Filtering for Rating Prediction" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "In this section we generate personalized recommendations for the first time. We exploit rating similarities among users and items to identify similar users and items that assist in finding the relevant items to recommend for each user.\n", 15 | "\n", 16 | "This describes the fundamental idea behind Collaborative Filtering (CF) and using kNN is a neighborhood-based approach towards CF. In a later unit we will also have a look at model-based approaches.\n", 17 | "\n", 18 | "This is also the first time we try to predict user ratings for unknown items using rating predictions to take the top-$N$ items with the highest rating predictions and recommend those to the user." 
19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 3, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "from collections import OrderedDict\n", 28 | "import itertools\n", 29 | "from typing import Dict, List, Tuple\n", 30 | "\n", 31 | "import numpy as np\n", 32 | "import pandas as pd" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 1, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "from recsys_training.data import Dataset\n", 42 | "from recsys_training.evaluation import get_relevant_items\n", 43 | "from recsys_training.utils import get_entity_sim" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "ml100k_ratings_filepath = '../data/raw/ml-100k/u.data'" 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "metadata": {}, 58 | "source": [ 59 | "## Load Data" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "data = Dataset(ml100k_ratings_filepath)\n", 69 | "data.rating_split(seed=42)\n", 70 | "user_ratings = data.get_user_ratings()" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": [ 77 | "The idea behind this recommender is to use item ratings of the $k$ most similar users (neighbors). We identify those _nearest neighbors_ with a similarity metric which we apply to the ratings both, root user and possible neighbor, have in common. Similarity thereby means having a similar opinion on movies.\n", 78 | "\n", 79 | "The steps are as follows:\n", 80 | "\n", 81 | "1. Compute user-user similarities (we use the Pearson Correlation Coefficient here, but feel free to try other similarity metrics)\n", 82 | "\n", 83 | "2. For each user:\n", 84 | "\n", 85 | " 1. Get the k nearest neighbors along with their similarities\n", 86 | " 2. 
Collect the neighborhood item ratings and ignore those already rated by the root user\n", 87 | " 3. Item Rating Prediction: Compute the similarity-weighted sum of neighborhood item ratings\n", 88 | " 4. Recommendations: Get the $N$ items with the highest ratings that have a minimum rating count" 89 | ] 90 | }, 91 | { 92 | "cell_type": "markdown", 93 | "metadata": {}, 94 | "source": [ 95 | "### 1. User-User Similarities" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "sim_metric = 'pearson'\n", 105 | "user_user_sims = {}\n", 106 | "user_pairs = itertools.combinations(data.users, 2)" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": {}, 112 | "source": [ 113 | "The following takes a few seconds to finish ..." 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "metadata": {}, 120 | "outputs": [], 121 | "source": [ 122 | "for pair in user_pairs:\n", 123 | " user_user_sims[pair] = get_entity_sim(pair[0], pair[1],\n", 124 | " user_ratings,\n", 125 | " sim_metric)" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "metadata": {}, 132 | "outputs": [], 133 | "source": [ 134 | "user_user_sims[(1,4)]" 135 | ] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "metadata": {}, 140 | "source": [ 141 | "## 2. Computing Recommendations" 142 | ] 143 | }, 144 | { 145 | "cell_type": "markdown", 146 | "metadata": {}, 147 | "source": [ 148 | "### A. Implement Nearest Neighbors for a given user\n", 149 | "\n", 150 | "![](Parrot.png)\n", 151 | "\n", 152 | "**Task:** It's your turn again. Complete `get_k_nearest_neighbors` to return a sorted list of the $k$ nearest neighbors - identified by their id - for a given user, each along with its similarity." 
153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": 4, 158 | "metadata": {}, 159 | "outputs": [], 160 | "source": [ 161 | "def get_k_nearest_neighbors(user: int, k: int, user_user_sims: dict) -> List[Tuple[int, float]]:\n", 162 | " neighbors = set(data.users)\n", 163 | " neighbors.remove(user)\n", 164 | "\n", 165 | " nearest_neighbors = dict()\n", 166 | " \n", 167 | " pass\n", 168 | " \n", 169 | " return nearest_neighbors[:k]" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": null, 175 | "metadata": {}, 176 | "outputs": [], 177 | "source": [ 178 | "user_neighbors = get_k_nearest_neighbors(1, k=10, user_user_sims=user_user_sims)" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": null, 184 | "metadata": {}, 185 | "outputs": [], 186 | "source": [ 187 | "user_neighbors" 188 | ] 189 | }, 190 | { 191 | "cell_type": "markdown", 192 | "metadata": {}, 193 | "source": [ 194 | "### B. Obtain the Neighborhood Ratings\n", 195 | "\n", 196 | "![](Parrot.png)\n", 197 | "\n", 198 | "**Task:** Now, use the nearest neighbors and get their ratings, but leave out the items our root user has already rated (known positives). Return a mapping from unknown item to a list of dicts with neighbor similarity and item rating." 
199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": null, 204 | "metadata": {}, 205 | "outputs": [], 206 | "source": [ 207 | "def get_neighborhood_ratings(user, user_neighbors: List[Tuple[int, float]]) -> Dict[int, List[Dict[str, float]]]:\n", 208 | " neighborhood_ratings = dict()\n", 209 | " \n", 210 | " pass\n", 211 | " \n", 212 | " return neighborhood_ratings" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": null, 218 | "metadata": {}, 219 | "outputs": [], 220 | "source": [ 221 | "neighborhood_ratings = get_neighborhood_ratings(1, user_neighbors)" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": null, 227 | "metadata": {}, 228 | "outputs": [], 229 | "source": [ 230 | "neighborhood_ratings" 231 | ] 232 | }, 233 | { 234 | "cell_type": "markdown", 235 | "metadata": {}, 236 | "source": [ 237 | "### C. Compute Rating Predictions from Neighborhood Ratings\n", 238 | "\n", 239 | "![](Parrot.png)\n", 240 | "\n", 241 | "**Task:** In this step, we estimate ratings for the seed user based on the neighborhood ratings. We implement a similarity weighted average of neighbor ratings for that. Return a mapping from item to its prediction and the count of neighbor ratings received." 
242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": null, 247 | "metadata": {}, 248 | "outputs": [], 249 | "source": [ 250 | "def compute_rating_pred(neighborhood_ratings: dict) -> dict:\n", 251 | " rating_preds = dict()\n", 252 | " \n", 253 | " pass\n", 254 | "\n", 255 | " return rating_preds" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": null, 261 | "metadata": {}, 262 | "outputs": [], 263 | "source": [ 264 | "rating_preds = compute_rating_pred(neighborhood_ratings)" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": null, 270 | "metadata": {}, 271 | "outputs": [], 272 | "source": [ 273 | "list(rating_preds.items())[:20]" 274 | ] 275 | }, 276 | { 277 | "cell_type": "markdown", 278 | "metadata": {}, 279 | "source": [ 280 | "### D. Compute the Top-$N$ Recommendation Items\n", 281 | "\n", 282 | "![](Parrot.png)\n", 283 | "\n", 284 | "**Task:** The last step takes the rating predictions and returns the $N$ highest predictions which have a minimum rating count, i.e. the number of neighbors from the neighborhood that rated this item." 
285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": null, 290 | "metadata": {}, 291 | "outputs": [], 292 | "source": [ 293 | "def compute_top_n(rating_preds: dict, min_count: int, N: int) -> OrderedDict:\n", 294 | " pass\n", 295 | " \n", 296 | " return OrderedDict(sorted_rating_preds[:N])" 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": null, 302 | "metadata": {}, 303 | "outputs": [], 304 | "source": [ 305 | "top_n_recs = compute_top_n(rating_preds, min_count=2, N=10)" 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": null, 311 | "metadata": {}, 312 | "outputs": [], 313 | "source": [ 314 | "top_n_recs" 315 | ] 316 | }, 317 | { 318 | "cell_type": "markdown", 319 | "metadata": {}, 320 | "source": [ 321 | "### Combine all steps in `get_recommendations`" 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": null, 327 | "metadata": {}, 328 | "outputs": [], 329 | "source": [ 330 | "def get_recommendations(user: int,\n", 331 | " user_user_sims: dict,\n", 332 | " k: int,\n", 333 | " C: int,\n", 334 | " N: int):\n", 335 | " user_neighbors = get_k_nearest_neighbors(user, k=k, user_user_sims=user_user_sims)\n", 336 | " neighborhood_ratings = get_neighborhood_ratings(user, user_neighbors)\n", 337 | " rating_preds = compute_rating_pred(neighborhood_ratings)\n", 338 | " top_n_recs = compute_top_n(rating_preds, min_count=C, N=N)\n", 339 | " return top_n_recs" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": null, 345 | "metadata": {}, 346 | "outputs": [], 347 | "source": [ 348 | "get_recommendations(1, user_user_sims, 10, 2, 10)" 349 | ] 350 | }, 351 | { 352 | "cell_type": "markdown", 353 | "metadata": {}, 354 | "source": [ 355 | "### Evaluation" 356 | ] 357 | }, 358 | { 359 | "cell_type": "markdown", 360 | "metadata": {}, 361 | "source": [ 362 | "Let's check the performance of the neighborhood- and user-based recommender for a neighborhood size of 
$k = 60$, minimum rating count of $C = 10$ and stay with $N = 10$ recommendations." 363 | ] 364 | }, 365 | { 366 | "cell_type": "code", 367 | "execution_count": null, 368 | "metadata": {}, 369 | "outputs": [], 370 | "source": [ 371 | "k = 60\n", 372 | "C = 10\n", 373 | "N = 10" 374 | ] 375 | }, 376 | { 377 | "cell_type": "code", 378 | "execution_count": null, 379 | "metadata": {}, 380 | "outputs": [], 381 | "source": [ 382 | "relevant_items = get_relevant_items(data.test_ratings)" 383 | ] 384 | }, 385 | { 386 | "cell_type": "code", 387 | "execution_count": null, 388 | "metadata": {}, 389 | "outputs": [], 390 | "source": [ 391 | "users = relevant_items.keys()\n", 392 | "prec_at_N = dict.fromkeys(data.users)\n", 393 | "\n", 394 | "for user in users:\n", 395 | " recommendations = get_recommendations(user, user_user_sims, k, C, N)\n", 396 | " recommendations = list(recommendations.keys())\n", 397 | " hits = np.intersect1d(recommendations,\n", 398 | " relevant_items[user])\n", 399 | " prec_at_N[user] = len(hits)/N" 400 | ] 401 | }, 402 | { 403 | "cell_type": "code", 404 | "execution_count": null, 405 | "metadata": {}, 406 | "outputs": [], 407 | "source": [ 408 | "np.mean([val for val in prec_at_N.values() if val is not None])" 409 | ] 410 | } 411 | ], 412 | "metadata": { 413 | "kernelspec": { 414 | "display_name": "Python 3", 415 | "language": "python", 416 | "name": "python3" 417 | }, 418 | "language_info": { 419 | "codemirror_mode": { 420 | "name": "ipython", 421 | "version": 3 422 | }, 423 | "file_extension": ".py", 424 | "mimetype": "text/x-python", 425 | "name": "python", 426 | "nbconvert_exporter": "python", 427 | "pygments_lexer": "ipython3", 428 | "version": "3.9.4" 429 | }, 430 | "pycharm": { 431 | "stem_cell": { 432 | "cell_type": "raw", 433 | "metadata": { 434 | "collapsed": false 435 | }, 436 | "source": [] 437 | } 438 | } 439 | }, 440 | "nbformat": 4, 441 | "nbformat_minor": 4 442 | } 443 | 
-------------------------------------------------------------------------------- /notebooks/2_e_popularity_recs.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Unit 2: Popularity Recommendations" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "In this section we build a recommender that sorts items by popularity as of the number of ratings they received. As a result we return the $N$ most popular items as recommendations." 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "from typing import Dict, List\n", 24 | "\n", 25 | "import numpy as np\n", 26 | "import pandas as pd\n", 27 | "from scipy.stats import spearmanr" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "# `Dataset` is just a wrapper for the MovieLens training data\n", 37 | "from recsys_training.data import Dataset, genres" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": null, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "ml100k_ratings_filepath = '../data/raw/ml-100k/u.data'\n", 47 | "ml100k_item_filepath = '../data/raw/ml-100k/u.item'" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": {}, 53 | "source": [ 54 | "## Load Data" 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": {}, 60 | "source": [ 61 | "We load the dataset with 100,000 ratings and split it $4:1$ into train and test set.\n", 62 | "\n", 63 | "(**Remark**: We do not focus on proper hyperparameter search within this tutorial and therefore do not generate a separate validation dataset)" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": {}, 70 | "outputs": [], 
71 | "source": [ 72 | "data = Dataset(ml100k_ratings_filepath)\n", 73 | "data.rating_split(train_size=0.8, seed=42)" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": null, 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [ 82 | "items = pd.read_csv(ml100k_item_filepath, sep='|', header=None,\n", 83 | " names=['item', 'title', 'release', 'video_release', 'imdb_url']+genres,\n", 84 | " engine='python')" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | "data.train_ratings" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "data.test_ratings" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": {}, 108 | "source": [ 109 | "Build a Mapping from user id to its item ratings. We will need this later." 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [ 118 | "user_ratings = data.get_user_ratings()" 119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "metadata": {}, 124 | "source": [ 125 | "Show up to 20 user ratings for the first user" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "metadata": {}, 132 | "outputs": [], 133 | "source": [ 134 | "user = 1\n", 135 | "list(user_ratings[user].items())[:20]" 136 | ] 137 | }, 138 | { 139 | "cell_type": "markdown", 140 | "metadata": {}, 141 | "source": [ 142 | "## Popularity Ranking" 143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "metadata": {}, 148 | "source": [ 149 | "How do we define _popularity_? 
It turns out that there can be different things justifying the popularity of content:\n", 150 | "- **pure count**: simply count the number of ratings or interactions an item received regardless of their quality\n", 151 | "- **positive count**: only count the number of ratings or interactions that we assume reflect preference towards items, e.g. ratings above user mean ratings\n", 152 | "- **time-dependency**: despite evergreen stars items may also be popular for a limited time only - how can we account for this?\n", 153 | "\n", 154 | "**Remark**: Popularity ranking entails no personalization. We obtain a single popularity ranking of items which is independent from the user and serve the same top-$N$ items to every user." 155 | ] 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | "metadata": {}, 160 | "source": [ 161 | "### Popularity based on simple Interaction Counts" 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "metadata": {}, 167 | "source": [ 168 | "![](parrot.png)\n", 169 | "\n", 170 | "**Task**: Infer the item popularity order from training ratings as an array with items in descending order of popularity." 
171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": null, 176 | "metadata": {}, 177 | "outputs": [], 178 | "source": [ 179 | "item_popularity = data.train_ratings.item.value_counts()" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": null, 185 | "metadata": {}, 186 | "outputs": [], 187 | "source": [ 188 | "item_popularity" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": null, 194 | "metadata": {}, 195 | "outputs": [], 196 | "source": [ 197 | "item_order = item_popularity.values" 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": null, 203 | "metadata": {}, 204 | "outputs": [], 205 | "source": [ 206 | "item_order" 207 | ] 208 | }, 209 | { 210 | "cell_type": "markdown", 211 | "metadata": {}, 212 | "source": [ 213 | "What are the most popular movies?" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": null, 219 | "metadata": {}, 220 | "outputs": [], 221 | "source": [ 222 | "top_movie_ids = item_order[:5]\n", 223 | "items[items['item'].isin(top_movie_ids)][['item', 'title']]" 224 | ] 225 | }, 226 | { 227 | "cell_type": "markdown", 228 | "metadata": {}, 229 | "source": [ 230 | "### Popularity based on positive Interaction Counts\n", 231 | "\n", 232 | "We assume that the the mean rating for each user is the threshold above which movies are regarded as favorable and below which movies are deemed as bad.\n", 233 | "\n", 234 | "1. compute that user mean rating for each user.\n", 235 | "2. remove all ratings that fall below this threshold.\n", 236 | "3. apply the process above to the remaining ratings." 
237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": null, 242 | "metadata": {}, 243 | "outputs": [], 244 | "source": [ 245 | "user_mean_ratings = data.train_ratings[['user', 'rating']].groupby('user')\n", 246 | "user_mean_ratings = user_mean_ratings.mean().reset_index()\n", 247 | "user_mean_ratings.rename(columns={'rating': 'user_mean_rating'},\n", 248 | " inplace=True)" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": null, 254 | "metadata": {}, 255 | "outputs": [], 256 | "source": [ 257 | "user_mean_ratings" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": null, 263 | "metadata": {}, 264 | "outputs": [], 265 | "source": [ 266 | "positive_train_ratings = data.train_ratings.merge(user_mean_ratings,\n", 267 | " on='user',\n", 268 | " how='left')" 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": null, 274 | "metadata": {}, 275 | "outputs": [], 276 | "source": [ 277 | "keep_ratings = (positive_train_ratings['rating'] >= positive_train_ratings['user_mean_rating'])" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": null, 283 | "metadata": {}, 284 | "outputs": [], 285 | "source": [ 286 | "positive_train_ratings = positive_train_ratings[keep_ratings]\n", 287 | "positive_train_ratings.drop(columns='user_mean_rating', inplace=True)" 288 | ] 289 | }, 290 | { 291 | "cell_type": "code", 292 | "execution_count": null, 293 | "metadata": {}, 294 | "outputs": [], 295 | "source": [ 296 | "positive_train_ratings" 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": null, 302 | "metadata": {}, 303 | "outputs": [], 304 | "source": [ 305 | "item_popularity_positive = positive_train_ratings.item.value_counts()" 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": null, 311 | "metadata": {}, 312 | "outputs": [], 313 | "source": [ 314 | "item_popularity_positive" 315 | ] 316 | }, 317 | { 318 | "cell_type": 
"code", 319 | "execution_count": null, 320 | "metadata": {}, 321 | "outputs": [], 322 | "source": [ 323 | "item_order_positive = item_popularity_positive.index.values" 324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": null, 329 | "metadata": {}, 330 | "outputs": [], 331 | "source": [ 332 | "items[items['item'].isin(item_order_positive[:5])][['item', 'title']]" 333 | ] 334 | }, 335 | { 336 | "cell_type": "markdown", 337 | "metadata": {}, 338 | "source": [ 339 | "#### How strongly do both orderings correlate with each other?" 340 | ] 341 | }, 342 | { 343 | "cell_type": "markdown", 344 | "metadata": {}, 345 | "source": [ 346 | "Check the Spearman rank correlation between both orderings to quantify the distortion in ordering." 347 | ] 348 | }, 349 | { 350 | "cell_type": "code", 351 | "execution_count": null, 352 | "metadata": {}, 353 | "outputs": [], 354 | "source": [ 355 | "joint_counts = [[item_popularity.loc[item], item_popularity_positive[item]]\n", 356 | " for item in np.intersect1d(item_popularity_positive.index.values,\n", 357 | " item_popularity.index.values)]\n", 358 | "joint_counts = np.array(joint_counts)" 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": null, 364 | "metadata": {}, 365 | "outputs": [], 366 | "source": [ 367 | "joint_counts" 368 | ] 369 | }, 370 | { 371 | "cell_type": "code", 372 | "execution_count": null, 373 | "metadata": {}, 374 | "outputs": [], 375 | "source": [ 376 | "spearmanr(joint_counts)" 377 | ] 378 | }, 379 | { 380 | "cell_type": "markdown", 381 | "metadata": {}, 382 | "source": [ 383 | "### Using Popularity Ordering for top-$N$ Recommendations\n", 384 | "\n", 385 | "Now, we can produce recommendations from our popularity ordering." 386 | ] 387 | }, 388 | { 389 | "cell_type": "markdown", 390 | "metadata": {}, 391 | "source": [ 392 | "![](parrot.png)\n", 393 | "\n", 394 | "**Task**: Write a method `get_recommendations` that returns the top-$N$ items without any known positives, i.e. 
items the user has already viewed." 395 | ] 396 | }, 397 | { 398 | "cell_type": "code", 399 | "execution_count": null, 400 | "metadata": {}, 401 | "outputs": [], 402 | "source": [ 403 | "def get_recommendations(user: int,\n", 404 | " user_ratings: dict,\n", 405 | " item_popularity_order: np.array,\n", 406 | " N: int) -> List[int]:\n", 407 | " known_positives = None\n", 408 | " recommendations = None\n", 409 | " \n", 410 | " return recommendations" 411 | ] 412 | }, 413 | { 414 | "cell_type": "markdown", 415 | "metadata": {}, 416 | "source": [ 417 | "Try it ..." 418 | ] 419 | }, 420 | { 421 | "cell_type": "code", 422 | "execution_count": null, 423 | "metadata": {}, 424 | "outputs": [], 425 | "source": [ 426 | "get_recommendations(1, user_ratings, item_order, 10)" 427 | ] 428 | }, 429 | { 430 | "cell_type": "markdown", 431 | "metadata": {}, 432 | "source": [ 433 | "## Evaluating the Relevance of Recommendations" 434 | ] 435 | }, 436 | { 437 | "cell_type": "code", 438 | "execution_count": null, 439 | "metadata": {}, 440 | "outputs": [], 441 | "source": [ 442 | "def get_relevant_items(test_ratings: pd.DataFrame) -> Dict[int, List[int]]:\n", 443 | " \"\"\"\n", 444 | " returns {user: [items]} as a list of relevant items per user\n", 445 | " for all users found in the test dataset\n", 446 | " \"\"\"\n", 447 | " relevant_items = test_ratings[['user', 'item']]\n", 448 | " relevant_items = relevant_items.groupby('user')\n", 449 | " relevant_items = {user: relevant_items.get_group(user)['item'].values\n", 450 | " for user in relevant_items.groups.keys()}\n", 451 | "\n", 452 | " return relevant_items" 453 | ] 454 | }, 455 | { 456 | "cell_type": "code", 457 | "execution_count": null, 458 | "metadata": {}, 459 | "outputs": [], 460 | "source": [ 461 | "relevant_items = get_relevant_items(data.test_ratings)" 462 | ] 463 | }, 464 | { 465 | "cell_type": "code", 466 | "execution_count": null, 467 | "metadata": {}, 468 | "outputs": [], 469 | "source": [ 470 | "relevant_items[1]" 471 | 
] 472 | }, 473 | { 474 | "cell_type": "markdown", 475 | "metadata": {}, 476 | "source": [ 477 | "### $Precision@10$" 478 | ] 479 | }, 480 | { 481 | "cell_type": "markdown", 482 | "metadata": {}, 483 | "source": [ 484 | "Now, we can compute the intersection between the top-$N$ recommended items and the items each user interacted with. Ideally, we want every recommendation to be a hit, i.e. an item the user consumed. In this case the size of intersections is $N$ given $N$ recommendations which is a precision of 100% = $\\frac{N}{N}$.\n", 485 | "\n", 486 | "We compute the so called $Precision@N$ for every user and take the mean over all. The resulting metric is called _mean average precision at N_ or short $MAP@N$." 487 | ] 488 | }, 489 | { 490 | "cell_type": "markdown", 491 | "metadata": {}, 492 | "source": [ 493 | "![](parrot.png)\n", 494 | "\n", 495 | "**Task:** Compute the $MAP@N$ for popularity recommendations" 496 | ] 497 | }, 498 | { 499 | "cell_type": "code", 500 | "execution_count": null, 501 | "metadata": {}, 502 | "outputs": [], 503 | "source": [ 504 | "def get_precision(users: List[int], user_ratings: Dict[int, Dict[int, float]],\n", 505 | " item_order: np.array, N: int) -> Dict[int, float]:\n", 506 | " \n", 507 | " pass\n", 508 | " \n", 509 | " return prec_at_N" 510 | ] 511 | }, 512 | { 513 | "cell_type": "markdown", 514 | "metadata": {}, 515 | "source": [ 516 | "Try it ..." 
517 | ] 518 | }, 519 | { 520 | "cell_type": "code", 521 | "execution_count": null, 522 | "metadata": {}, 523 | "outputs": [], 524 | "source": [ 525 | "N = 10\n", 526 | "users = relevant_items.keys()" 527 | ] 528 | }, 529 | { 530 | "cell_type": "code", 531 | "execution_count": null, 532 | "metadata": {}, 533 | "outputs": [], 534 | "source": [ 535 | "prec_at_N = get_precision(users, user_ratings, item_order, N)" 536 | ] 537 | }, 538 | { 539 | "cell_type": "code", 540 | "execution_count": null, 541 | "metadata": {}, 542 | "outputs": [], 543 | "source": [ 544 | "np.mean(list(prec_at_N.values()))" 545 | ] 546 | } 547 | ], 548 | "metadata": { 549 | "kernelspec": { 550 | "display_name": "Python 3", 551 | "language": "python", 552 | "name": "python3" 553 | }, 554 | "language_info": { 555 | "codemirror_mode": { 556 | "name": "ipython", 557 | "version": 3 558 | }, 559 | "file_extension": ".py", 560 | "mimetype": "text/x-python", 561 | "name": "python", 562 | "nbconvert_exporter": "python", 563 | "pygments_lexer": "ipython3", 564 | "version": "3.9.4" 565 | }, 566 | "pycharm": { 567 | "stem_cell": { 568 | "cell_type": "raw", 569 | "metadata": { 570 | "collapsed": false 571 | }, 572 | "source": [] 573 | } 574 | } 575 | }, 576 | "nbformat": 4, 577 | "nbformat_minor": 4 578 | } 579 | -------------------------------------------------------------------------------- /notebooks/9_e_ligthfm.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Unit 9: LightFM" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "You almost made it - this is the final lesson and it is also going to be the easiest one.\n", 15 | "\n", 16 | "As you may already assume - there are a lot of recommender packages in Python out there. 
In this lesson we will look at LightFM - an easy to use and lightweight implementation of different approaches and algorithms (FM, BPR, WARP, ...) to perform CF, CBF and hybrid recommenders.\n", 17 | "\n", 18 | "Within a few lines of code we set-up, train and use a recommender for recommendations.\n", 19 | "\n", 20 | "* [LightFM on GitHub](https://github.com/lyst/lightfm)\n", 21 | "* [LightFM documentation](https://making.lyst.com/lightfm/docs/home.html)" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "import matplotlib.pyplot as plt\n", 31 | "import numpy as np\n", 32 | "import pandas as pd\n", 33 | "from scipy.sparse import coo_matrix\n", 34 | "\n", 35 | "from recsys_training.data import Dataset, genres" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "from lightfm.datasets import fetch_movielens\n", 45 | "from lightfm.evaluation import precision_at_k\n", 46 | "from lightfm import LightFM" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "ml100k_ratings_filepath = '../data/raw/ml-100k/u.data'\n", 56 | "ml100k_item_filepath = '../data/raw/ml-100k/u.item'\n", 57 | "ml100k_user_filepath = '../data/raw/ml-100k/u.user'" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "## Load Data" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": {}, 70 | "source": [ 71 | "### You may easily load Movielens Data ..."
72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "data = fetch_movielens(min_rating=4.0, genre_features=True)" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "data" 90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": {}, 95 | "source": [ 96 | "### But, we want to use the exact same data and split that we used in the lessons before" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "data = Dataset(ml100k_ratings_filepath)\n", 106 | "data.filter(min_rating=4.0)\n", 107 | "data.rating_split(seed=42)" 108 | ] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "metadata": {}, 113 | "source": [ 114 | "#### Transform our training and testing data into sparse matrices" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "# Train DataFrame to Train COO Matrix\n", 124 | "ratings = data.train_ratings[\"rating\"].values\n", 125 | "# We subtract 1 to make user/item ids 0-index-based\n", 126 | "rows = data.train_ratings[\"user\"].values - 1\n", 127 | "cols = data.train_ratings[\"item\"].values - 1\n", 128 | "\n", 129 | "train_mat = coo_matrix((ratings, (rows, cols)),\n", 130 | " shape=(data.n_users, data.n_items))\n", 131 | "\n", 132 | "\n", 133 | "# Test DataFrame to Test COO Matrix\n", 134 | "ratings = data.test_ratings[\"rating\"].values\n", 135 | "# We subtract 1 to make user/item ids 0-index-based\n", 136 | "rows = data.test_ratings[\"user\"].values - 1\n", 137 | "cols = data.test_ratings[\"item\"].values - 1\n", 138 | "\n", 139 | "test_mat = coo_matrix((ratings, (rows, cols)),\n", 140 | " shape=(data.n_users, data.n_items))" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 
| "execution_count": null, 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "train_mat" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 158 | "test_mat" 159 | ] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "metadata": {}, 164 | "source": [ 165 | "## Collaborative Filtering" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": null, 171 | "metadata": {}, 172 | "outputs": [], 173 | "source": [ 174 | "params = {\n", 175 | " 'no_components': 10,\n", 176 | " 'loss': 'bpr',\n", 177 | " 'learning_rate': 0.07,\n", 178 | " 'random_state': 42,\n", 179 | " 'user_alpha': 0.0002,\n", 180 | " 'item_alpha': 0.0002\n", 181 | "}\n", 182 | "\n", 183 | "epochs = 10\n", 184 | "\n", 185 | "N = 10" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": null, 191 | "metadata": {}, 192 | "outputs": [], 193 | "source": [ 194 | "cf_model = LightFM(**params)" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": null, 200 | "metadata": {}, 201 | "outputs": [], 202 | "source": [ 203 | "cf_model.fit(train_mat, epochs=epochs, verbose=True)" 204 | ] 205 | }, 206 | { 207 | "cell_type": "markdown", 208 | "metadata": {}, 209 | "source": [ 210 | "### Evaluate the `MAP@10` on test data\n", 211 | "\n", 212 | "If we provide training data with evaluation, known positives will be removed." 
213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": null, 218 | "metadata": {}, 219 | "outputs": [], 220 | "source": [ 221 | "prec_at_N = precision_at_k(cf_model, test_mat, train_mat, k=N)" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": null, 227 | "metadata": {}, 228 | "outputs": [], 229 | "source": [ 230 | "prec_at_N.mean()" 231 | ] 232 | }, 233 | { 234 | "cell_type": "markdown", 235 | "metadata": {}, 236 | "source": [ 237 | "### Evaluate the `MAP@10` on train data" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": null, 243 | "metadata": {}, 244 | "outputs": [], 245 | "source": [ 246 | "prec_at_N = precision_at_k(cf_model, train_mat, k=N)" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": null, 252 | "metadata": {}, 253 | "outputs": [], 254 | "source": [ 255 | "prec_at_N.mean()" 256 | ] 257 | }, 258 | { 259 | "cell_type": "markdown", 260 | "metadata": {}, 261 | "source": [ 262 | "Maybe try tuning the regularization to improve the recommendation relevancy - the `params` dictionary already contains `user_alpha` and `item_alpha`, so experiment with their values to find more appropriate ones."
263 | ] 264 | }, 265 | { 266 | "cell_type": "markdown", 267 | "metadata": {}, 268 | "source": [ 269 | "## Hybrid (CF + CBF)" 270 | ] 271 | }, 272 | { 273 | "cell_type": "markdown", 274 | "metadata": {}, 275 | "source": [ 276 | "### Load user and item features" 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": null, 282 | "metadata": {}, 283 | "outputs": [], 284 | "source": [ 285 | "def min_max_scale(val, bounds):\n", 286 | " min_max_range = bounds['max']-bounds['min']\n", 287 | " return (val-bounds['min'])/min_max_range\n", 288 | "\n", 289 | "\n", 290 | "def user_profiler(group):\n", 291 | " genre_dist = group[genres].mean()\n", 292 | " year_dist = group['release_year'].describe()[['mean', 'std', '50%']]\n", 293 | "\n", 294 | " return pd.concat((genre_dist, year_dist), axis=0)\n", 295 | "\n", 296 | "\n", 297 | "def get_user_profiles(ratings: pd.DataFrame,\n", 298 | " item_feat: pd.DataFrame,\n", 299 | " min_rating: float = 4.0) -> pd.DataFrame:\n", 300 | " ratings = ratings[ratings.rating >= min_rating]\n", 301 | " ratings = ratings[['user', 'item']]\n", 302 | " ratings = ratings.merge(item_feat, on='item', how='left')\n", 303 | " ratings.drop(['item'], axis=1, inplace=True)\n", 304 | "\n", 305 | " grouped = ratings.groupby('user')\n", 306 | " profiles = grouped.apply(user_profiler).reset_index()\n", 307 | " profiles.rename(columns={'50%': 'median'}, inplace=True)\n", 308 | " \n", 309 | " return profiles\n", 310 | "\n", 311 | "\n", 312 | "item_feat = pd.read_csv(ml100k_item_filepath, sep='|', header=None,\n", 313 | " names=['item', 'title', 'release', 'video_release', 'imdb_url']+genres,\n", 314 | " engine='python')\n", 315 | "\n", 316 | "user_feat = pd.read_csv(ml100k_user_filepath, sep='|', header=None,\n", 317 | " names=['user', 'age', 'gender', 'occupation', 'zip'])\n", 318 | "\n", 319 | "# Infer the release year\n", 320 | "idxs = item_feat[item_feat['release'].notnull()].index\n", 321 | "item_feat.loc[idxs, 'release_year'] = 
item_feat.loc[idxs, 'release'].str.split('-')\n", 322 | "item_feat.loc[idxs, 'release_year'] = item_feat.loc[idxs, 'release_year'].apply(lambda val: val[2]).astype(int)\n", 323 | "\n", 324 | "# Impute median release year value for the items with missing release year\n", 325 | "top_year = item_feat.loc[idxs, 'release_year'].astype(int).describe()['50%']\n", 326 | "idx = item_feat[item_feat['release'].isnull()].index\n", 327 | "item_feat.loc[idx, 'release_year'] = top_year\n", 328 | "\n", 329 | "# Min-max scale the release year\n", 330 | "item_year_bounds = {'min': item_feat['release_year'].min(),\n", 331 | " 'max': item_feat['release_year'].max()}\n", 332 | "item_feat['release_year'] = item_feat['release_year'].apply(\n", 333 | " lambda year: min_max_scale(year, item_year_bounds))\n", 334 | "\n", 335 | "# Drop other columns\n", 336 | "item_feat.drop(['title', 'release', 'video_release', 'imdb_url'], axis=1, inplace=True)\n", 337 | "\n", 338 | "# Min-max scale the age\n", 339 | "user_age_bounds = {'min': user_feat['age'].min(),\n", 340 | " 'max': user_feat['age'].max()}\n", 341 | "user_feat['age'] = user_feat['age'].apply(lambda age: min_max_scale(age, user_age_bounds))\n", 342 | "\n", 343 | "# Transform gender characters to numerical values (categories)\n", 344 | "genders = sorted(user_feat['gender'].unique())\n", 345 | "user_gender_map = dict(zip(genders, range(len(genders))))\n", 346 | "user_feat['gender'] = user_feat['gender'].map(user_gender_map)\n", 347 | "\n", 348 | "# Transform occupation strings to numerical values (categories)\n", 349 | "occupations = sorted(user_feat['occupation'].unique())\n", 350 | "user_occupation_map = dict(zip(occupations, range(len(occupations))))\n", 351 | "user_feat['occupation'] = user_feat['occupation'].map(user_occupation_map)\n", 352 | "\n", 353 | "# Transform the zip codes to categories keeping the first three digits and impute for missing\n", 354 | "idxs = user_feat[~user_feat['zip'].str.isnumeric()].index\n", 355 | 
"user_feat.loc[idxs, 'zip'] = '00000'\n", 356 | "zip_digits_to_cut = 3\n", 357 | "user_feat['zip'] = user_feat['zip'].apply(lambda val: int(val) // 10 ** zip_digits_to_cut)\n", 358 | "\n", 359 | "\n", 360 | "profiles = get_user_profiles(data.train_ratings, item_feat)\n", 361 | "user_feat = user_feat.merge(profiles, on='user', how='left')\n", 362 | "\n", 363 | "occupation_1H = pd.get_dummies(user_feat['occupation'], prefix='occupation')\n", 364 | "zip_1H = pd.get_dummies(user_feat['zip'], prefix='zip')\n", 365 | "\n", 366 | "user_feat.drop(['occupation', 'zip', ], axis=1, inplace=True)\n", 367 | "user_feat = pd.concat([user_feat, occupation_1H, zip_1H], axis=1)\n", 368 | "\n", 369 | "user_feat.fillna(0, inplace=True)\n", 370 | "\n", 371 | "\n", 372 | "user_feat.index = user_feat['user'].values\n", 373 | "user_feat.drop('user', axis=1, inplace=True)\n", 374 | "\n", 375 | "item_feat.index = item_feat['item'].values\n", 376 | "item_feat.drop('item', axis=1, inplace=True)" 377 | ] 378 | }, 379 | { 380 | "cell_type": "code", 381 | "execution_count": null, 382 | "metadata": {}, 383 | "outputs": [], 384 | "source": [ 385 | "(user_feat==0).sum().sum()/user_feat.size" 386 | ] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "execution_count": null, 391 | "metadata": {}, 392 | "outputs": [], 393 | "source": [ 394 | "(item_feat==0).sum().sum()/item_feat.size" 395 | ] 396 | }, 397 | { 398 | "cell_type": "code", 399 | "execution_count": null, 400 | "metadata": {}, 401 | "outputs": [], 402 | "source": [ 403 | "# Create User Feature COO Matrix\n", 404 | "# user_feat_mat = coo_matrix(np.eye(data.n_users))\n", 405 | "user_feat_mat = coo_matrix(np.concatenate((user_feat.values, np.eye(data.n_users)), axis=1))\n", 406 | "\n", 407 | "# Create Item Feature COO Matrix\n", 408 | "# item_feat_mat = coo_matrix(np.eye(data.n_items))\n", 409 | "item_feat_mat = coo_matrix(np.concatenate((item_feat.values, np.eye(data.n_items)), axis=1))" 410 | ] 411 | }, 412 | { 413 | "cell_type": "code", 414 
| "execution_count": null, 415 | "metadata": {}, 416 | "outputs": [], 417 | "source": [ 418 | "user_feat_mat" 419 | ] 420 | }, 421 | { 422 | "cell_type": "code", 423 | "execution_count": null, 424 | "metadata": {}, 425 | "outputs": [], 426 | "source": [ 427 | "item_feat_mat" 428 | ] 429 | }, 430 | { 431 | "cell_type": "markdown", 432 | "metadata": {}, 433 | "source": [ 434 | "### Model Training" 435 | ] 436 | }, 437 | { 438 | "cell_type": "markdown", 439 | "metadata": {}, 440 | "source": [ 441 | "![](parrot.png)\n", 442 | "\n", 443 | "**Task:** Check the [lightFM API](https://making.lyst.com/lightfm/docs/home.html) to see how you can incorporate proper data - can you tweak the algorithm to beat pure Collaborative Filtering?" 444 | ] 445 | }, 446 | { 447 | "cell_type": "code", 448 | "execution_count": null, 449 | "metadata": {}, 450 | "outputs": [], 451 | "source": [ 452 | "params = {\n", 453 | " 'no_components': 10,\n", 454 | " 'loss': 'warp',\n", 455 | " 'learning_rate': 0.03,\n", 456 | " 'random_state': 42,\n", 457 | " 'user_alpha': 0.0001,\n", 458 | " 'item_alpha': 0.0001\n", 459 | "}\n", 460 | "\n", 461 | "epochs = 10\n", 462 | "\n", 463 | "N = 10" 464 | ] 465 | }, 466 | { 467 | "cell_type": "code", 468 | "execution_count": null, 469 | "metadata": {}, 470 | "outputs": [], 471 | "source": [ 472 | "hybrid_model = None\n", 473 | "\n", 474 | "#\n", 475 | "# Up to you ;)\n", 476 | "#" 477 | ] 478 | }, 479 | { 480 | "cell_type": "code", 481 | "execution_count": null, 482 | "metadata": {}, 483 | "outputs": [], 484 | "source": [ 485 | "prec_at_N = precision_at_k(hybrid_model,\n", 486 | " test_mat,\n", 487 | " train_mat,\n", 488 | " k=N,\n", 489 | " user_features=user_feat_mat,\n", 490 | " item_features=item_feat_mat)" 491 | ] 492 | }, 493 | { 494 | "cell_type": "code", 495 | "execution_count": null, 496 | "metadata": {}, 497 | "outputs": [], 498 | "source": [ 499 | "prec_at_N.mean()" 500 | ] 501 | } 502 | ], 503 | "metadata": { 504 | "kernelspec": { 505 | 
"display_name": "Python 3", 506 | "language": "python", 507 | "name": "python3" 508 | }, 509 | "language_info": { 510 | "codemirror_mode": { 511 | "name": "ipython", 512 | "version": 3 513 | }, 514 | "file_extension": ".py", 515 | "mimetype": "text/x-python", 516 | "name": "python", 517 | "nbconvert_exporter": "python", 518 | "pygments_lexer": "ipython3", 519 | "version": "3.9.4" 520 | }, 521 | "pycharm": { 522 | "stem_cell": { 523 | "cell_type": "raw", 524 | "metadata": { 525 | "collapsed": false 526 | }, 527 | "source": [] 528 | } 529 | } 530 | }, 531 | "nbformat": 4, 532 | "nbformat_minor": 4 533 | } 534 | -------------------------------------------------------------------------------- /notebooks/6_e_cf_mf_ranking_pred.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Unit 6: Model-based Collaborative Filtering for **Ranking** Prediction" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "However, we still do Collaborative Filtering and Matrix Factorization in this unit, we do something fundamentally different: we change from rating prediction to **ranking prediction**.\n", 15 | "\n", 16 | "We achieve this by changing the optimization criterion. Instead of minimizing the deviation between true and predicted ratings we push positive and negative user-item combinationa as much as possible apart. We transform explicit user feedback into implicit feedback. Implicit feedback refers to user interaction without the purpose to reflect preference or disregard and is much more common in pactice. Ranking prediction algorithms tackle to learn from implicit feedback data.\n", 17 | "\n", 18 | "In addition, ranking-based algorithms yield a much more intuitive prediction result. Our goal is to present to the user a very limited amount of items in the correct ordering. 
Therefore, ordering is much more important than rating prediction. Ranking-based algorithms like BPR work pair-wise, i.e. for a user and two items they yield the correct order of both items for the user. Generalizing from this, we can impose an ordering on our item corpus and pick the top-$N$ to present to the user." 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 1, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "from collections import OrderedDict\n", 28 | "import itertools\n", 29 | "from typing import Dict, List, Tuple\n", 30 | "\n", 31 | "import matplotlib.pyplot as plt\n", 32 | "import numpy as np\n", 33 | "import pandas as pd" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 2, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "from recsys_training.data import Dataset\n", 43 | "from recsys_training.evaluation import get_relevant_items" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 3, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "ml100k_ratings_filepath = '../data/raw/ml-100k/u.data'" 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "metadata": {}, 58 | "source": [ 59 | "## Load Data" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [ 66 | "In contrast to previous units, we work with implicit feedback data now. Although MovieLens is an explicit feedback dataset, we can argue that everything above the user's mean rating is positive and everything below is negative. Bayesian Personalized Ranking learns from implicit positive feedback data and randomly samples negative feedback data during training. Thus, we keep all ratings at or above a threshold of $4.0$ and remove all other ratings."
67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 5, 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "data = Dataset(ml100k_ratings_filepath)\n", 76 | "data.filter(min_rating=4.0)\n", 77 | "data.rating_split(seed=42)" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "metadata": {}, 83 | "source": [ 84 | "As we want to learn the user/item latent factors from rating data, we first randomly initialize them" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 6, 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | "seed = 42\n", 94 | "m = data.n_users\n", 95 | "n = data.n_items\n", 96 | "d = 10" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 7, 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "# Latent Factor initialization\n", 106 | "random_state = np.random.RandomState(seed)\n", 107 | "user_factors = (random_state.rand(m, d) - 0.5) / d\n", 108 | "item_factors = (random_state.rand(n, d) - 0.5) / d\n", 109 | " \n", 110 | "ratings = data.train_ratings.sample(frac=1, random_state=seed)" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": 8, 116 | "metadata": {}, 117 | "outputs": [], 118 | "source": [ 119 | "# positive implicit feedback items\n", 120 | "user_pos_items = dict()\n", 121 | "# corpus of all remaining items for every user\n", 122 | "# Ask me about the \"Non missing at random hypothesis\" ;)\n", 123 | "user_neg_items = dict()" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 11, 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "grouped = ratings[['user', 'item']].groupby('user')\n", 133 | "groups = grouped.groups.keys()\n", 134 | "for user in data.users:\n", 135 | " pos_items = []\n", 136 | " if user in groups:\n", 137 | " pos_items = grouped.get_group(user).item.values\n", 138 | " neg_items = np.setdiff1d(data.items, pos_items)\n", 139 | " user_pos_items[user] = 
pos_items\n", 140 | " user_neg_items[user] = neg_items" 141 | ] 142 | }, 143 | { 144 | "cell_type": "markdown", 145 | "metadata": {}, 146 | "source": [ 147 | "## Training" 148 | ] 149 | }, 150 | { 151 | "cell_type": "markdown", 152 | "metadata": {}, 153 | "source": [ 154 | "Yes, there is some math involved:\n", 155 | "\n", 156 | "\\begin{equation*}\n", 157 | "\\hat{x}_{uij} = \\hat{x}_{ui} - \\hat{x}_{uj} \\\\\n", 158 | "\\hat{x}_{ui} = \\sum_{f=1}^{d} w_{uf} \\cdot h_{if}, i \\in I_u^+ \\\\\n", 159 | "\\hat{x}_{uj} = \\sum_{f=1}^{d} w_{uf} \\cdot h_{jf}, j \\in I_u^- \\\\\n", 160 | "\\end{equation*}\n", 161 | "\n", 162 | "\\begin{equation*}\n", 163 | "\\text{BPR-Opt} := \\sum_{(u,i,j) \\in D_S} \\ln\\sigma(\\hat{x}_{uij}) - \\lambda_{\\Theta} \\cdot ||\\Theta||^2\n", 164 | "\\end{equation*}\n", 165 | "\n", 166 | "\\begin{equation*}\n", 167 | "\\frac{\\partial \\text{BPR-Opt}}{\\partial \\Theta} = \\frac{-e^{-\\hat{x}_{uij}}}{1+e^{-\\hat{x}_{uij}}} \\cdot \\frac{\\partial \\hat{x}_{uij}}{\\partial \\Theta} - \\lambda_{\\Theta} \\cdot \\Theta\n", 168 | "\\end{equation*}\n", 169 | "\n", 170 | "\\begin{equation*}\n", 171 | "\\frac{\\partial \\hat{x}_{uij}}{\\partial \\Theta} =\n", 172 | "\\begin{cases}\n", 173 | "(h_{if}-h_{jf}) & \\text{for } \\Theta = w_{uf} \\\\\n", 174 | "w_{uf} & \\text{for } \\Theta = h_{if} \\\\\n", 175 | "-w_{uf} & \\text{for } \\Theta = h_{jf}\n", 176 | "\\end{cases}\n", 177 | "\\end{equation*}" 178 | ] 179 | }, 180 | { 181 | "cell_type": "markdown", 182 | "metadata": {}, 183 | "source": [ 184 | "Let's talk about regularization!"
185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "metadata": {}, 191 | "outputs": [], 192 | "source": [ 193 | "def sigmoid(x):\n", 194 | " return 1/(1+np.exp(-x))" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": null, 200 | "metadata": {}, 201 | "outputs": [], 202 | "source": [ 203 | "def negative_sampling(user: int, user_neg_items: Dict[int, np.array]) -> int:\n", 204 | " \"\"\"\n", 205 | " Return the item ids for negative samples\n", 206 | " \"\"\"\n", 207 | " negative_item = np.random.choice(user_neg_items[user])\n", 208 | " \n", 209 | " return negative_item" 210 | ] 211 | }, 212 | { 213 | "cell_type": "markdown", 214 | "metadata": {}, 215 | "source": [ 216 | "![](Parrot.png)\n", 217 | "\n", 218 | "**Task:** Adapt the `compute_gradients` method from the unit before to realize stochastic gradient descent (SGD) for Bayesian Personalized Ranking." 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": null, 224 | "metadata": {}, 225 | "outputs": [], 226 | "source": [ 227 | "def compute_gradients(user_embed: np.array,\n", 228 | " pos_item_embed: np.array,\n", 229 | " neg_item_embed: np.array,\n", 230 | " l2_decay: Dict[str, float]) -> Tuple[np.array, np.array, np.array]:\n", 231 | " \n", 232 | " pos_pred = np.sum(user_embed * pos_item_embed)\n", 233 | " neg_pred = np.sum(user_embed * neg_item_embed)\n", 234 | " pred = pos_pred - neg_pred\n", 235 | "\n", 236 | " generic_grad = None\n", 237 | " \n", 238 | " # Gradients\n", 239 | " user_grad = None\n", 240 | " pos_item_grad = None\n", 241 | " neg_item_grad = None\n", 242 | " \n", 243 | " # Add L2-Decay\n", 244 | " user_grad += None\n", 245 | " pos_item_grad += None\n", 246 | " neg_item_grad += None\n", 247 | "\n", 248 | " return user_grad, pos_item_grad, neg_item_grad" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": null, 254 | "metadata": {}, 255 | "outputs": [], 256 | "source": [ 257 | "def 
print_update(epoch: int, samples: np.array) -> float:\n", 258 | " # take the 1000 most recent ratings and compute the mean ranking loss\n", 259 | " users = samples[:, 0]\n", 260 | " pos_items = samples[:, 1]\n", 261 | " neg_items = np.array([negative_sampling(user, user_neg_items)\n", 262 | " for user in users])\n", 263 | "\n", 264 | " user_embeds = user_factors[users - 1]\n", 265 | " pos_item_embeds = item_factors[pos_items - 1]\n", 266 | " neg_item_embeds = item_factors[neg_items - 1]\n", 267 | "\n", 268 | " pos_preds = np.sum(user_embeds * pos_item_embeds, axis=1)\n", 269 | " neg_preds = np.sum(user_embeds * neg_item_embeds, axis=1)\n", 270 | " preds = pos_preds - neg_preds\n", 271 | "\n", 272 | " loss = -np.log(sigmoid(preds)).mean()\n", 273 | " print(f\"Epoch {epoch+1:02d}: Mean Ranking Loss: {loss:.4f}\")\n", 274 | " \n", 275 | " return loss" 276 | ] 277 | }, 278 | { 279 | "cell_type": "markdown", 280 | "metadata": {}, 281 | "source": [ 282 | "Instead of minibatch gradient descent we do **stochastic gradient descent** (SGD) here. It just shrinks the batch size down to 1 instance." 
283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": null, 288 | "metadata": {}, 289 | "outputs": [], 290 | "source": [ 291 | "epochs = 30\n", 292 | "learning_rate = 0.05\n", 293 | "l2_decay = {'user': 0.002, 'pos': 0.0, 'neg': 0.002}\n", 294 | "verbose = True" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": null, 300 | "metadata": {}, 301 | "outputs": [], 302 | "source": [ 303 | "ratings_arr = ratings[['user', 'item']].values\n", 304 | "n_ratings = len(ratings_arr)\n", 305 | "loss_trace = []\n", 306 | "\n", 307 | "for epoch in range(epochs):\n", 308 | "\n", 309 | " for _ in range(len(ratings)):\n", 310 | " random_index = np.random.randint(n_ratings)\n", 311 | " user, pos_item = tuple(ratings_arr[random_index])\n", 312 | " neg_item = negative_sampling(user, user_neg_items)\n", 313 | "\n", 314 | " # Deduct 1 as user ids are 1-indexed, but array is 0-indexed\n", 315 | " user_embed = user_factors[user - 1]\n", 316 | " pos_item_embed = item_factors[pos_item - 1]\n", 317 | " neg_item_embed = item_factors[neg_item - 1]\n", 318 | "\n", 319 | " user_grad, pos_item_grad, neg_item_grad = \\\n", 320 | " compute_gradients(user_embed,\n", 321 | " pos_item_embed,\n", 322 | " neg_item_embed,\n", 323 | " l2_decay)\n", 324 | "\n", 325 | " user_factors[user - 1] -= learning_rate * user_grad\n", 326 | " item_factors[pos_item - 1] -= learning_rate * pos_item_grad\n", 327 | " item_factors[neg_item - 1] -= learning_rate * neg_item_grad\n", 328 | "\n", 329 | " if verbose:\n", 330 | " samples = ratings_arr[-1000:]\n", 331 | " loss = print_update(epoch, samples)\n", 332 | " loss_trace.append(loss)" 333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": null, 338 | "metadata": {}, 339 | "outputs": [], 340 | "source": [ 341 | "plt.figure(figsize=(12,8))\n", 342 | "plt.plot(range(epochs), loss_trace, 'b--', label='Train')\n", 344 | 
"plt.grid(True)\n", 345 | "plt.legend()\n", 346 | "plt.show()" 347 | ] 348 | }, 349 | { 350 | "cell_type": "markdown", 351 | "metadata": {}, 352 | "source": [ 353 | "### Using the model for Recommendations" 354 | ] 355 | }, 356 | { 357 | "cell_type": "markdown", 358 | "metadata": {}, 359 | "source": [ 360 | "We have now created a model to describe users and items in terms of latent vectors. But this time we fitted them to get the rankings correctly. So for obtaining recommendations we simply multiply user-item latent vectors we are interested in and achieve an estimate that can be used to order items for a given user. This time it is not a rating prediction, but still a prediction.\n", 361 | "\n", 362 | "For that, we can reuse the `get_prediction` method from previous units.\n", 363 | "\n", 364 | "Thus, before writing the `get_recommendations` again we first implement `get_prediction`." 365 | ] 366 | }, 367 | { 368 | "cell_type": "code", 369 | "execution_count": null, 370 | "metadata": {}, 371 | "outputs": [], 372 | "source": [ 373 | "def get_prediction(user: int, items: np.array = None, remove_known_pos: bool = True) -> Dict[int, Dict[str, float]]:\n", 374 | " if items is None:\n", 375 | " if remove_known_pos:\n", 376 | " # Predict from unobserved items\n", 377 | " # We simplified this compared to the unit before\n", 378 | " items = user_neg_items[user]\n", 379 | " else:\n", 380 | " items = np.array(data.items)\n", 381 | " if type(items) == np.int64:\n", 382 | " items = np.array([items])\n", 383 | " \n", 384 | " user_embed = user_factors[user - 1].reshape(1, -1)\n", 385 | " item_embeds = item_factors[items - 1].reshape(len(items), -1)\n", 386 | "\n", 387 | " # use array-broadcasting\n", 388 | " preds = np.sum(user_embed * item_embeds, axis=1)\n", 389 | " sorting = np.argsort(preds)[::-1]\n", 390 | " preds = {item: {'pred': pred} for item, pred in\n", 391 | " zip(items[sorting], preds[sorting])}\n", 392 | "\n", 393 | " return preds" 394 | ] 395 | }, 396 | { 397 | 
"cell_type": "code", 398 | "execution_count": null, 399 | "metadata": {}, 400 | "outputs": [], 401 | "source": [ 402 | "item_predictions = get_prediction(1)" 403 | ] 404 | }, 405 | { 406 | "cell_type": "code", 407 | "execution_count": null, 408 | "metadata": {}, 409 | "outputs": [], 410 | "source": [ 411 | "list(item_predictions.items())[:20]" 412 | ] 413 | }, 414 | { 415 | "cell_type": "code", 416 | "execution_count": null, 417 | "metadata": {}, 418 | "outputs": [], 419 | "source": [ 420 | "def get_recommendations(user: int, N: int, remove_known_pos: bool = False) -> List[Tuple[int, Dict[str, float]]]:\n", 421 | " predictions = get_prediction(user, remove_known_pos=remove_known_pos)\n", 422 | " recommendations = []\n", 423 | " for item, pred in predictions.items():\n", 424 | " add_item = (item, pred)\n", 425 | " recommendations.append(add_item)\n", 426 | " if len(recommendations) == N:\n", 427 | " break\n", 428 | "\n", 429 | " return recommendations" 430 | ] 431 | }, 432 | { 433 | "cell_type": "code", 434 | "execution_count": null, 435 | "metadata": {}, 436 | "outputs": [], 437 | "source": [ 438 | "recommendations = get_recommendations(1, 10)" 439 | ] 440 | }, 441 | { 442 | "cell_type": "code", 443 | "execution_count": null, 444 | "metadata": {}, 445 | "outputs": [], 446 | "source": [ 447 | "recommendations" 448 | ] 449 | }, 450 | { 451 | "cell_type": "markdown", 452 | "metadata": {}, 453 | "source": [ 454 | "## Evaluation" 455 | ] 456 | }, 457 | { 458 | "cell_type": "code", 459 | "execution_count": null, 460 | "metadata": {}, 461 | "outputs": [], 462 | "source": [ 463 | "N = 10" 464 | ] 465 | }, 466 | { 467 | "cell_type": "code", 468 | "execution_count": null, 469 | "metadata": {}, 470 | "outputs": [], 471 | "source": [ 472 | "relevant_items = get_relevant_items(data.test_ratings)" 473 | ] 474 | }, 475 | { 476 | "cell_type": "code", 477 | "execution_count": null, 478 | "metadata": {}, 479 | "outputs": [], 480 | "source": [ 481 | "users = relevant_items.keys()\n", 
482 | "prec_at_N = dict.fromkeys(data.users)\n", 483 | "\n", 484 | "for user in users:\n", 485 | " recommendations = get_recommendations(user, N, remove_known_pos=True)\n", 486 | " recommendations = [val[0] for val in recommendations]\n", 487 | " hits = np.intersect1d(recommendations,\n", 488 | " relevant_items[user])\n", 489 | " prec_at_N[user] = len(hits)/N" 490 | ] 491 | }, 492 | { 493 | "cell_type": "code", 494 | "execution_count": null, 495 | "metadata": {}, 496 | "outputs": [], 497 | "source": [ 498 | "recommendations" 499 | ] 500 | }, 501 | { 502 | "cell_type": "code", 503 | "execution_count": null, 504 | "metadata": {}, 505 | "outputs": [], 506 | "source": [ 507 | "np.mean([val for val in prec_at_N.values() if val is not None])" 508 | ] 509 | } 510 | ], 511 | "metadata": { 512 | "kernelspec": { 513 | "display_name": "Python 3", 514 | "language": "python", 515 | "name": "python3" 516 | }, 517 | "language_info": { 518 | "codemirror_mode": { 519 | "name": "ipython", 520 | "version": 3 521 | }, 522 | "file_extension": ".py", 523 | "mimetype": "text/x-python", 524 | "name": "python", 525 | "nbconvert_exporter": "python", 526 | "pygments_lexer": "ipython3", 527 | "version": "3.9.4" 528 | }, 529 | "pycharm": { 530 | "stem_cell": { 531 | "cell_type": "raw", 532 | "metadata": { 533 | "collapsed": false 534 | }, 535 | "source": [] 536 | } 537 | } 538 | }, 539 | "nbformat": 4, 540 | "nbformat_minor": 4 541 | } 542 | -------------------------------------------------------------------------------- /notebooks/extra_sport_recommender.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 5, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import itertools\n", 10 | "from typing import Dict, List, Tuple\n", 11 | "\n", 12 | "import matplotlib.pyplot as plt\n", 13 | "import numpy as np\n", 14 | "import pandas as pd" 15 | ] 16 | }, 17 | { 18 | "cell_type": 
"code", 19 | "execution_count": 6, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "sports = [\n", 24 | " 'badminton',\n", 25 | " 'basketball',\n", 26 | " 'biking',\n", 27 | " 'boxing',\n", 28 | " 'fighting',\n", 29 | " 'fishing',\n", 30 | " 'football',\n", 31 | " 'hockey',\n", 32 | " 'running',\n", 33 | " 'swimming',\n", 34 | " 'tabletennis',\n", 35 | " 'tennis',\n", 36 | " 'volleyball'\n", 37 | "]" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 7, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "people = ['Barbara', 'Birol', 'Guido', 'Lisa', 'Rudi', 'Suna', 'Sven', 'Yvonne']" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 8, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "ratings = {\n", 56 | " \"Barbara\": {\"football\": 3, \"basketball\": 5, \"boxing\": 4, \"biking\": 2, \"fighting\": 4},\n", 57 | " \"Birol\": {\"boxing\": 4, \"hockey\": 2, \"biking\": 4, \"fighting\": 5, \"swimming\": 5, \"tennis\": 5},\n", 58 | " \"Guido\": {\"basketball\": 2, \"tennis\": 4, \"boxing\": 2, \"biking\": 2, \"volleyball\": 4, \"football\": 5},\n", 59 | " \"Lisa\": {\"football\": 4, \"tabletennis\": 3, \"running\": 4, \"volleyball\": 5, \"swimming\": 1},\n", 60 | " \"Rudi\": {\"football\": 1, \"badminton\": 4, \"biking\": 5, \"running\": 5, \"tabletennis\": 1},\n", 61 | " \"Suna\": {\"swimming\": 4, \"volleyball\": 5, \"running\": 3, \"tennis\": 5, \"tabletennis\": 4},\n", 62 | " \"Sven\": {\"swimming\": 5, \"biking\": 4, \"running\": 4, \"fishing\": 1, \"badminton\": 5},\n", 63 | " \"Yvonne\": {\"basketball\": 1, \"badminton\": 3, \"tennis\": 5, \"fighting\": 2, \"football\": 5, \"running\": 5}\n", 64 | "}\n", 65 | "ratings = {k: ratings[k] for k in sorted(ratings.keys())}" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 9, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "rows = []\n", 75 | "for person, individual_ratings in ratings.items():\n", 
76 | " for sport, rating in individual_ratings.items():\n", 77 | " rows.append([person, sport, rating])\n", 78 | "\n", 79 | "ratings_df = pd.DataFrame(rows, columns=[\"person\", \"sport\", \"rating\"])" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 10, 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": [ 88 | "m = len(people)\n", 89 | "n = len(sports)" 90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": {}, 95 | "source": [ 96 | "## Fill the Matrix" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 11, 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "# initialize\n", 106 | "rating_matrix = np.zeros((m, n))\n", 107 | "# fill with ratings\n", 108 | "for person_idx, person in enumerate(people):\n", 109 | " individual_ratings = ratings[person]\n", 110 | " for sport, rating in individual_ratings.items():\n", 111 | " sport_idx = sports.index(sport)\n", 112 | " rating_matrix[person_idx, sport_idx] = rating" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 12, 118 | "metadata": {}, 119 | "outputs": [ 120 | { 121 | "data": { 122 | "text/plain": [ 123 | "array([[0., 5., 2., 4., 4., 0., 3., 0., 0., 0., 0., 0., 0.],\n", 124 | " [0., 0., 4., 4., 5., 0., 0., 2., 0., 5., 0., 5., 0.],\n", 125 | " [0., 2., 2., 2., 0., 0., 5., 0., 0., 0., 0., 4., 4.],\n", 126 | " [0., 0., 0., 0., 0., 0., 4., 0., 4., 1., 3., 0., 5.],\n", 127 | " [4., 0., 5., 0., 0., 0., 1., 0., 5., 0., 1., 0., 0.],\n", 128 | " [0., 0., 0., 0., 0., 0., 0., 0., 3., 4., 4., 5., 5.],\n", 129 | " [5., 0., 4., 0., 0., 1., 0., 0., 4., 5., 0., 0., 0.],\n", 130 | " [3., 1., 0., 0., 2., 0., 5., 0., 5., 0., 0., 5., 0.]])" 131 | ] 132 | }, 133 | "execution_count": 12, 134 | "metadata": {}, 135 | "output_type": "execute_result" 136 | } 137 | ], 138 | "source": [ 139 | "rating_matrix" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 13, 145 | "metadata": {}, 146 | "outputs": [ 147 | { 
148 | "name": "stdout", 149 | "output_type": "stream", 150 | "text": [ 151 | "Sparsity: 58.65%\n" 152 | ] 153 | } 154 | ], 155 | "source": [ 156 | "sparsity = (rating_matrix == 0).sum() / rating_matrix.size\n", 157 | "print(f\"Sparsity: {sparsity: .2%}\")" 158 | ] 159 | }, 160 | { 161 | "cell_type": "markdown", 162 | "metadata": {}, 163 | "source": [ 164 | "## Nearest Neighborhood Collaborative Filtering (user-based)\n", 165 | "* compute similarities among the users\n", 166 | "* perform neighborhood-based collaborative filtering" 167 | ] 168 | }, 169 | { 170 | "cell_type": "markdown", 171 | "metadata": {}, 172 | "source": [ 173 | "### User-User Similarities" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": 15, 179 | "metadata": {}, 180 | "outputs": [], 181 | "source": [ 182 | "def get_cosine_sim(a: int, b: int, entity_ratings: dict) -> tuple:\n", 183 | " # 1. isolate e.g. users that have rated both items (a and b)\n", 184 | " key_intersection = set(entity_ratings[a].keys()).intersection(entity_ratings[b].keys())\n", 185 | " ratings = np.array([(entity_ratings[a][key], entity_ratings[b][key]) for key in key_intersection])\n", 186 | " n_joint_ratings = len(ratings)\n", 187 | " \n", 188 | " sim = None\n", 189 | " if n_joint_ratings > 1:\n", 190 | " nom = ratings[:, 0].dot(ratings[:, 1])\n", 191 | " denom = np.linalg.norm(ratings[:, 0]) * np.linalg.norm(ratings[:, 1])\n", 192 | " sim = nom / denom\n", 193 | " \n", 194 | " return sim, n_joint_ratings" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": 16, 200 | "metadata": {}, 201 | "outputs": [], 202 | "source": [ 203 | "user_user_sims = {}\n", 204 | "user_pairs = itertools.combinations(people, 2)" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": 17, 210 | "metadata": {}, 211 | "outputs": [], 212 | "source": [ 213 | "for pair in user_pairs:\n", 214 | " user_user_sims[pair] = get_cosine_sim(pair[0], pair[1], ratings)" 215 | ] 216 | }, 217 
| { 218 | "cell_type": "code", 219 | "execution_count": 18, 220 | "metadata": {}, 221 | "outputs": [ 222 | { 223 | "data": { 224 | "text/plain": [ 225 | "(0.7071067811865476, 2)" 226 | ] 227 | }, 228 | "execution_count": 18, 229 | "metadata": {}, 230 | "output_type": "execute_result" 231 | } 232 | ], 233 | "source": [ 234 | "user_user_sims[(\"Barbara\", \"Rudi\")]" 235 | ] 236 | }, 237 | { 238 | "cell_type": "markdown", 239 | "metadata": {}, 240 | "source": [ 241 | "### 1. Nearest Neighbors for a given user" 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": 19, 247 | "metadata": {}, 248 | "outputs": [], 249 | "source": [ 250 | "def get_k_nearest_neighbors(user: int, k: int, users: list, user_user_sims: dict) -> list:\n", 251 | " neighbors = set(users)\n", 252 | " neighbors.remove(user)\n", 253 | "\n", 254 | " nearest_neighbors = dict()\n", 255 | " for neighbor in neighbors:\n", 256 | " sim = user_user_sims[tuple(sorted((user, neighbor)))][0]\n", 257 | " if pd.notnull(sim):\n", 258 | " nearest_neighbors[neighbor] = sim\n", 259 | "\n", 260 | " nearest_neighbors = sorted(nearest_neighbors.items(),\n", 261 | " key=lambda kv: kv[1],\n", 262 | " reverse=True)\n", 263 | " \n", 264 | " return nearest_neighbors[:k]" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": 20, 270 | "metadata": {}, 271 | "outputs": [], 272 | "source": [ 273 | "user_neighbors = get_k_nearest_neighbors(\"Barbara\", 2, people, user_user_sims)" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": 21, 279 | "metadata": {}, 280 | "outputs": [ 281 | { 282 | "data": { 283 | "text/plain": [ 284 | "[('Birol', 0.9713237285143654), ('Guido', 0.8277591347639633)]" 285 | ] 286 | }, 287 | "execution_count": 21, 288 | "metadata": {}, 289 | "output_type": "execute_result" 290 | } 291 | ], 292 | "source": [ 293 | "user_neighbors" 294 | ] 295 | }, 296 | { 297 | "cell_type": "markdown", 298 | "metadata": {}, 299 | "source": [ 300 | "### 2. 
Obtain the Neighborhood Ratings" 301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": 22, 306 | "metadata": {}, 307 | "outputs": [], 308 | "source": [ 309 | "def get_neighborhood_ratings(user, user_neighbors: list, ratings: dict) -> dict:\n", 310 | " neighborhood_ratings = {}\n", 311 | " for neighbor, sim in user_neighbors:\n", 312 | " neighbor_ratings = ratings[neighbor].copy()\n", 313 | " \n", 314 | " # collect neighbor ratings and items\n", 315 | " for item, rating in neighbor_ratings.items():\n", 316 | " add_item = {'sim': sim, 'rating': rating}\n", 317 | " if item not in neighborhood_ratings.keys():\n", 318 | " neighborhood_ratings[item] = [add_item]\n", 319 | " else:\n", 320 | " neighborhood_ratings[item].append(add_item)\n", 321 | " \n", 322 | " # remove known items\n", 323 | " known_items = list(ratings[user].keys())\n", 324 | " for known_item in known_items:\n", 325 | " neighborhood_ratings.pop(known_item, None)\n", 326 | " \n", 327 | " return neighborhood_ratings" 328 | ] 329 | }, 330 | { 331 | "cell_type": "code", 332 | "execution_count": 23, 333 | "metadata": {}, 334 | "outputs": [], 335 | "source": [ 336 | "neighborhood_ratings = get_neighborhood_ratings(\"Barbara\", user_neighbors, ratings)" 337 | ] 338 | }, 339 | { 340 | "cell_type": "code", 341 | "execution_count": 24, 342 | "metadata": {}, 343 | "outputs": [ 344 | { 345 | "data": { 346 | "text/plain": [ 347 | "{'hockey': [{'sim': 0.9713237285143654, 'rating': 2}],\n", 348 | " 'swimming': [{'sim': 0.9713237285143654, 'rating': 5}],\n", 349 | " 'tennis': [{'sim': 0.9713237285143654, 'rating': 5},\n", 350 | " {'sim': 0.8277591347639633, 'rating': 4}],\n", 351 | " 'volleyball': [{'sim': 0.8277591347639633, 'rating': 4}]}" 352 | ] 353 | }, 354 | "execution_count": 24, 355 | "metadata": {}, 356 | "output_type": "execute_result" 357 | } 358 | ], 359 | "source": [ 360 | "neighborhood_ratings" 361 | ] 362 | }, 363 | { 364 | "cell_type": "markdown", 365 | "metadata": {}, 366 | 
"source": [ 367 | "### 3. Compute Rating Predictions from Neighborhood Ratings" 368 | ] 369 | }, 370 | { 371 | "cell_type": "code", 372 | "execution_count": 25, 373 | "metadata": {}, 374 | "outputs": [], 375 | "source": [ 376 | "def compute_rating_pred(neighborhood_ratings: dict) -> dict:\n", 377 | " rating_preds = dict()\n", 378 | " for item, ratings in neighborhood_ratings.items():\n", 379 | " if len(ratings) > 0:\n", 380 | " sims = np.array([rating['sim'] for rating in ratings])\n", 381 | " ratings = np.array([rating['rating'] for rating in ratings])\n", 382 | " pred_rating = (sims * ratings).sum() / sims.sum()\n", 383 | " count = len(sims)\n", 384 | " rating_preds[item] = {'pred': pred_rating,\n", 385 | " 'count': count}\n", 386 | " else:\n", 387 | " rating_preds[item] = {'pred': None, 'count': 0}\n", 388 | "\n", 389 | " return rating_preds" 390 | ] 391 | }, 392 | { 393 | "cell_type": "code", 394 | "execution_count": 26, 395 | "metadata": {}, 396 | "outputs": [], 397 | "source": [ 398 | "rating_preds = compute_rating_pred(neighborhood_ratings)" 399 | ] 400 | }, 401 | { 402 | "cell_type": "code", 403 | "execution_count": 27, 404 | "metadata": {}, 405 | "outputs": [ 406 | { 407 | "data": { 408 | "text/plain": [ 409 | "{'hockey': {'pred': 2.0, 'count': 1},\n", 410 | " 'swimming': {'pred': 5.0, 'count': 1},\n", 411 | " 'tennis': {'pred': 4.5398993833693675, 'count': 2},\n", 412 | " 'volleyball': {'pred': 4.0, 'count': 1}}" 413 | ] 414 | }, 415 | "execution_count": 27, 416 | "metadata": {}, 417 | "output_type": "execute_result" 418 | } 419 | ], 420 | "source": [ 421 | "rating_preds" 422 | ] 423 | }, 424 | { 425 | "cell_type": "markdown", 426 | "metadata": {}, 427 | "source": [ 428 | "### 4. 
Compute the Top-$N$ Recommendation Items" 429 | ] 430 | }, 431 | { 432 | "cell_type": "code", 433 | "execution_count": 28, 434 | "metadata": {}, 435 | "outputs": [], 436 | "source": [ 437 | "from collections import OrderedDict\n", 438 | "\n", 439 | "def compute_top_n(rating_preds: dict, min_count: int, N: int) -> OrderedDict:\n", 440 | " rating_preds = {key: val for (key, val) in rating_preds.items()\n", 441 | " if val['count'] >= min_count}\n", 442 | " # assuming more ratings mean higher confidence in the prediction\n", 443 | " sorted_rating_preds = sorted(rating_preds.items(),\n", 444 | " key=lambda kv: (kv[1]['pred'], kv[1]['count']),\n", 445 | " reverse=True)\n", 446 | "\n", 447 | " return OrderedDict(sorted_rating_preds[:N])" 448 | ] 449 | }, 450 | { 451 | "cell_type": "code", 452 | "execution_count": 29, 453 | "metadata": {}, 454 | "outputs": [], 455 | "source": [ 456 | "top_n_recs = compute_top_n(rating_preds, min_count=2, N=1)" 457 | ] 458 | }, 459 | { 460 | "cell_type": "code", 461 | "execution_count": 30, 462 | "metadata": {}, 463 | "outputs": [ 464 | { 465 | "data": { 466 | "text/plain": [ 467 | "OrderedDict([('tennis', {'pred': 4.5398993833693675, 'count': 2})])" 468 | ] 469 | }, 470 | "execution_count": 30, 471 | "metadata": {}, 472 | "output_type": "execute_result" 473 | } 474 | ], 475 | "source": [ 476 | "top_n_recs" 477 | ] 478 | }, 479 | { 480 | "cell_type": "markdown", 481 | "metadata": {}, 482 | "source": [ 483 | "### Combined all steps" 484 | ] 485 | }, 486 | { 487 | "cell_type": "code", 488 | "execution_count": 31, 489 | "metadata": {}, 490 | "outputs": [], 491 | "source": [ 492 | "def get_recommendations(user: int,\n", 493 | " users: list,\n", 494 | " user_user_sims: dict,\n", 495 | " ratings: dict,\n", 496 | " k: int,\n", 497 | " C: int,\n", 498 | " N: int):\n", 499 | " user_neighbors = get_k_nearest_neighbors(user, k=k, users=users, user_user_sims=user_user_sims)\n", 500 | " neighborhood_ratings = get_neighborhood_ratings(user, 
user_neighbors, ratings)\n", 501 | " rating_preds = compute_rating_pred(neighborhood_ratings)\n", 502 | " top_n_recs = compute_top_n(rating_preds, min_count=C, N=N)\n", 503 | " return top_n_recs" 504 | ] 505 | }, 506 | { 507 | "cell_type": "code", 508 | "execution_count": 32, 509 | "metadata": {}, 510 | "outputs": [], 511 | "source": [ 512 | "rec = get_recommendations(\"Barbara\", people, user_user_sims, ratings, k=2, C=2, N=1)" 513 | ] 514 | }, 515 | { 516 | "cell_type": "code", 517 | "execution_count": 33, 518 | "metadata": {}, 519 | "outputs": [ 520 | { 521 | "data": { 522 | "text/plain": [ 523 | "OrderedDict([('tennis', {'pred': 4.5398993833693675, 'count': 2})])" 524 | ] 525 | }, 526 | "execution_count": 33, 527 | "metadata": {}, 528 | "output_type": "execute_result" 529 | } 530 | ], 531 | "source": [ 532 | "rec" 533 | ] 534 | }, 535 | { 536 | "cell_type": "code", 537 | "execution_count": 34, 538 | "metadata": {}, 539 | "outputs": [ 540 | { 541 | "name": "stdout", 542 | "output_type": "stream", 543 | "text": [ 544 | "Barbara --> tennis @ 4.5 - 2 neighbor ratings\n", 545 | "Birol --> running @ 3.5 - 2 neighbor ratings\n", 546 | "Guido --> running @ 4.0 - 2 neighbor ratings\n", 547 | "Lisa --> tennis @ 4.5 - 2 neighbor ratings\n", 548 | "Rudi --> Nothing for you :(\n", 549 | "Suna --> biking @ 3.0 - 2 neighbor ratings\n", 550 | "Sven --> tennis @ 5.0 - 2 neighbor ratings\n", 551 | "Yvonne --> volleyball @ 4.5 - 2 neighbor ratings\n" 552 | ] 553 | } 554 | ], 555 | "source": [ 556 | "for person in people:\n", 557 | " recs = get_recommendations(person, people, user_user_sims, ratings, k=2, C=2, N=1)\n", 558 | " person = person.ljust(7)\n", 559 | " if len(recs) > 0:\n", 560 | " sport = list(recs)[0]\n", 561 | " pred, count = recs.pop(sport).values()\n", 562 | " print(f\"{person} --> {sport.ljust(10)} @ {round(pred, 1)} - {count} neighbor ratings\")\n", 563 | " else:\n", 564 | " print(f\"{person} --> Nothing for you :(\")" 565 | ] 566 | } 567 | ], 568 | "metadata": { 
569 | "kernelspec": { 570 | "display_name": "Python 3", 571 | "language": "python", 572 | "name": "python3" 573 | }, 574 | "language_info": { 575 | "codemirror_mode": { 576 | "name": "ipython", 577 | "version": 3 578 | }, 579 | "file_extension": ".py", 580 | "mimetype": "text/x-python", 581 | "name": "python", 582 | "nbconvert_exporter": "python", 583 | "pygments_lexer": "ipython3", 584 | "version": "3.7.5" 585 | } 586 | }, 587 | "nbformat": 4, 588 | "nbformat_minor": 4 589 | } 590 | -------------------------------------------------------------------------------- /notebooks/solutions/9_s_ligthfm.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Unit 9: LightFM" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "You almost made it - this is the final lesson and it is also going to be the easiest one.\n", 15 | "\n", 16 | "As you may already assume - there are a lot of recommender packages in Python out there. In this lesson we will look at LightFM - an easy to use and lightweight implementation of different approaches and algorithms (FM, BPR, WARP, ...) 
to perform CF, CBF and hybrid recommenders.\n", 17 | "\n", 18 | "Within a few lines of code we set-up, train and use a recommender for recommendations.\n", 19 | "\n", 20 | "* [LightFM on GitHub](https://github.com/lyst/lightfm)\n", 21 | "* [LightFM documentation](https://making.lyst.com/lightfm/docs/home.html)" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 240, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "import matplotlib.pyplot as plt\n", 31 | "import numpy as np\n", 32 | "import pandas as pd\n", 33 | "from scipy.sparse import coo_matrix\n", 34 | "\n", 35 | "from recsys_training.data import Dataset, genres" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 4, 41 | "metadata": {}, 42 | "outputs": [ 43 | { 44 | "name": "stderr", 45 | "output_type": "stream", 46 | "text": [ 47 | "/Users/mkurovski/anaconda3/envs/recsys_training/lib/python3.9/site-packages/lightfm/_lightfm_fast.py:9: UserWarning: LightFM was compiled without OpenMP support. Only a single thread will be used.\n", 48 | " warnings.warn(\n" 49 | ] 50 | } 51 | ], 52 | "source": [ 53 | "from lightfm.datasets import fetch_movielens\n", 54 | "from lightfm.evaluation import precision_at_k\n", 55 | "from lightfm import LightFM" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 7, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "ml100k_ratings_filepath = '../../data/raw/ml-100k/u.data'\n", 65 | "ml100k_item_filepath = '../../data/raw/ml-100k/u.item'\n", 66 | "ml100k_user_filepath = '../../data/raw/ml-100k/u.user'" 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "metadata": {}, 72 | "source": [ 73 | "## Load Data" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": {}, 79 | "source": [ 80 | "### You may easily load Movielens Data ..." 
81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 306, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "data = fetch_movielens(min_rating=4.0, genre_features=True)" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 307, 95 | "metadata": {}, 96 | "outputs": [ 97 | { 98 | "data": { 99 | "text/plain": [ 100 | "{'train': <943x1682 sparse matrix of type ''\n", 101 | " \twith 49906 stored elements in COOrdinate format>,\n", 102 | " 'test': <943x1682 sparse matrix of type ''\n", 103 | " \twith 5469 stored elements in COOrdinate format>,\n", 104 | " 'item_features': <1682x1701 sparse matrix of type ''\n", 105 | " \twith 4575 stored elements in Compressed Sparse Row format>,\n", 106 | " 'item_feature_labels': array(['Toy Story (1995)', 'GoldenEye (1995)', 'Four Rooms (1995)', ...,\n", 107 | " 'genre:Thriller', 'genre:War', 'genre:Western'], dtype=object),\n", 108 | " 'item_labels': array(['Toy Story (1995)', 'GoldenEye (1995)', 'Four Rooms (1995)', ...,\n", 109 | " 'Sliding Doors (1998)', 'You So Crazy (1994)',\n", 110 | " 'Scream of Stone (Schrei aus Stein) (1991)'], dtype=object)}" 111 | ] 112 | }, 113 | "execution_count": 307, 114 | "metadata": {}, 115 | "output_type": "execute_result" 116 | } 117 | ], 118 | "source": [ 119 | "data" 120 | ] 121 | }, 122 | { 123 | "cell_type": "markdown", 124 | "metadata": {}, 125 | "source": [ 126 | "### But, we want to use the exact same data and split that we used in the lessons before" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 321, 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [ 135 | "data = Dataset(ml100k_ratings_filepath)\n", 136 | "data.filter(min_rating=4.0)\n", 137 | "data.rating_split(seed=42)" 138 | ] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "metadata": {}, 143 | "source": [ 144 | "#### Transform our training and testing data into sparse matrices" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | 
"execution_count": 322, 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "# Train DataFrame to Train COO Matrix\n", 154 | "ratings = data.train_ratings[\"rating\"].values\n", 155 | "# We subtract 1 to make user/item ids 0-index-based\n", 156 | "rows = data.train_ratings[\"user\"].values - 1\n", 157 | "cols = data.train_ratings[\"item\"].values - 1\n", 158 | "\n", 159 | "train_mat = coo_matrix((ratings, (rows, cols)),\n", 160 | " shape=(data.n_users, data.n_items))\n", 161 | "\n", 162 | "\n", 163 | "# Test DataFrame to Test COO Matrix\n", 164 | "ratings = data.test_ratings[\"rating\"].values\n", 165 | "# We subtract 1 to make user/item ids 0-index-based\n", 166 | "rows = data.test_ratings[\"user\"].values - 1\n", 167 | "cols = data.test_ratings[\"item\"].values - 1\n", 168 | "\n", 169 | "test_mat = coo_matrix((ratings, (rows, cols)),\n", 170 | " shape=(data.n_users, data.n_items))" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": 323, 176 | "metadata": {}, 177 | "outputs": [ 178 | { 179 | "data": { 180 | "text/plain": [ 181 | "<943x1682 sparse matrix of type ''\n", 182 | "\twith 44300 stored elements in COOrdinate format>" 183 | ] 184 | }, 185 | "execution_count": 323, 186 | "metadata": {}, 187 | "output_type": "execute_result" 188 | } 189 | ], 190 | "source": [ 191 | "train_mat" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": 324, 197 | "metadata": {}, 198 | "outputs": [ 199 | { 200 | "data": { 201 | "text/plain": [ 202 | "<943x1682 sparse matrix of type ''\n", 203 | "\twith 11075 stored elements in COOrdinate format>" 204 | ] 205 | }, 206 | "execution_count": 324, 207 | "metadata": {}, 208 | "output_type": "execute_result" 209 | } 210 | ], 211 | "source": [ 212 | "test_mat" 213 | ] 214 | }, 215 | { 216 | "cell_type": "markdown", 217 | "metadata": {}, 218 | "source": [ 219 | "## Collaborative Filtering" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": 433, 225 | 
"metadata": {}, 226 | "outputs": [], 227 | "source": [ 228 | "params = {\n", 229 | "    'no_components': 10,\n", 230 | "    'loss': 'bpr',\n", 231 | "    'learning_rate': 0.07,\n", 232 | "    'random_state': 42,\n", 233 | "    'user_alpha': 0.0002,\n", 234 | "    'item_alpha': 0.0002\n", 235 | "}\n", 236 | "\n", 237 | "epochs = 10\n", 238 | "\n", 239 | "N = 10" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": 434, 245 | "metadata": {}, 246 | "outputs": [], 247 | "source": [ 248 | "cf_model = LightFM(**params)" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": 435, 254 | "metadata": {}, 255 | "outputs": [ 256 | { 257 | "name": "stderr", 258 | "output_type": "stream", 259 | "text": [ 260 | "Epoch: 100%|██████████| 10/10 [00:00<00:00, 48.66it/s]\n" 261 | ] 262 | }, 263 | { 264 | "data": { 265 | "text/plain": [ 266 | "" 267 | ] 268 | }, 269 | "execution_count": 435, 270 | "metadata": {}, 271 | "output_type": "execute_result" 272 | } 273 | ], 274 | "source": [ 275 | "cf_model.fit(train_mat, epochs=epochs, verbose=True)" 276 | ] 277 | }, 278 | { 279 | "cell_type": "markdown", 280 | "metadata": {}, 281 | "source": [ 282 | "### Evaluate the `Precision@10` on test data\n", 283 | "\n", 284 | "If we provide training data with evaluation, known positives will be removed."
285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": 436, 290 | "metadata": {}, 291 | "outputs": [], 292 | "source": [ 293 | "prec_at_N = precision_at_k(cf_model, test_mat, train_mat, k=N)" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": 437, 299 | "metadata": {}, 300 | "outputs": [ 301 | { 302 | "data": { 303 | "text/plain": [ 304 | "0.17415851" 305 | ] 306 | }, 307 | "execution_count": 437, 308 | "metadata": {}, 309 | "output_type": "execute_result" 310 | } 311 | ], 312 | "source": [ 313 | "prec_at_N.mean()" 314 | ] 315 | }, 316 | { 317 | "cell_type": "markdown", 318 | "metadata": {}, 319 | "source": [ 320 | "### Evaluate the `Precision@10` on train data" 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": 438, 326 | "metadata": {}, 327 | "outputs": [], 328 | "source": [ 329 | "prec_at_N = precision_at_k(cf_model, train_mat, k=N)" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": 439, 335 | "metadata": {}, 336 | "outputs": [ 337 | { 338 | "data": { 339 | "text/plain": [ 340 | "0.4393843" 341 | ] 342 | }, 343 | "execution_count": 439, 344 | "metadata": {}, 345 | "output_type": "execute_result" 346 | } 347 | ], 348 | "source": [ 349 | "prec_at_N.mean()" 350 | ] 351 | }, 352 | { 353 | "cell_type": "markdown", 354 | "metadata": {}, 355 | "source": [ 356 | "We already added some regularization to improve the recommendation relevancy - see `user_alpha` and `item_alpha` in the `params` dictionary above; feel free to tune these values further."
357 | ] 358 | }, 359 | { 360 | "cell_type": "markdown", 361 | "metadata": {}, 362 | "source": [ 363 | "## Hybrid (CF + CBF)" 364 | ] 365 | }, 366 | { 367 | "cell_type": "markdown", 368 | "metadata": {}, 369 | "source": [ 370 | "### Load user and item features" 371 | ] 372 | }, 373 | { 374 | "cell_type": "code", 375 | "execution_count": 440, 376 | "metadata": {}, 377 | "outputs": [], 378 | "source": [ 379 | "def min_max_scale(val, bounds):\n", 380 | " min_max_range = bounds['max']-bounds['min']\n", 381 | " return (val-bounds['min'])/min_max_range\n", 382 | "\n", 383 | "\n", 384 | "def user_profiler(group):\n", 385 | " genre_dist = group[genres].mean()\n", 386 | " year_dist = group['release_year'].describe()[['mean', 'std', '50%']]\n", 387 | "\n", 388 | " return pd.concat((genre_dist, year_dist), axis=0)\n", 389 | "\n", 390 | "\n", 391 | "def get_user_profiles(ratings: pd.DataFrame,\n", 392 | " item_feat: pd.DataFrame,\n", 393 | " min_rating: float = 4.0) -> pd.DataFrame:\n", 394 | " ratings = ratings[ratings.rating >= min_rating]\n", 395 | " ratings = ratings[['user', 'item']]\n", 396 | " ratings = ratings.merge(item_feat, on='item', how='left')\n", 397 | " ratings.drop(['item'], axis=1, inplace=True)\n", 398 | "\n", 399 | " grouped = ratings.groupby('user')\n", 400 | " profiles = grouped.apply(user_profiler).reset_index()\n", 401 | " profiles.rename(columns={'50%': 'median'}, inplace=True)\n", 402 | " \n", 403 | " return profiles\n", 404 | "\n", 405 | "\n", 406 | "item_feat = pd.read_csv(ml100k_item_filepath, sep='|', header=None,\n", 407 | " names=['item', 'title', 'release', 'video_release', 'imdb_url']+genres,\n", 408 | " engine='python')\n", 409 | "\n", 410 | "user_feat = pd.read_csv(ml100k_user_filepath, sep='|', header=None,\n", 411 | " names=['user', 'age', 'gender', 'occupation', 'zip'])\n", 412 | "\n", 413 | "# Infer the release year\n", 414 | "idxs = item_feat[item_feat['release'].notnull()].index\n", 415 | "item_feat.loc[idxs, 'release_year'] = 
item_feat.loc[idxs, 'release'].str.split('-')\n", 416 | "item_feat.loc[idxs, 'release_year'] = item_feat.loc[idxs, 'release_year'].apply(lambda val: val[2]).astype(int)\n", 417 | "\n", 418 | "# Impute median release year value for the items with missing release year\n", 419 | "top_year = item_feat.loc[idxs, 'release_year'].astype(int).describe()['50%']\n", 420 | "idx = item_feat[item_feat['release'].isnull()].index\n", 421 | "item_feat.loc[idx, 'release_year'] = top_year\n", 422 | "\n", 423 | "# Min-max scale the release year\n", 424 | "item_year_bounds = {'min': item_feat['release_year'].min(),\n", 425 | " 'max': item_feat['release_year'].max()}\n", 426 | "item_feat['release_year'] = item_feat['release_year'].apply(\n", 427 | " lambda year: min_max_scale(year, item_year_bounds))\n", 428 | "\n", 429 | "# Drop other columns\n", 430 | "item_feat.drop(['title', 'release', 'video_release', 'imdb_url'], axis=1, inplace=True)\n", 431 | "\n", 432 | "# Min-max scale the age\n", 433 | "user_age_bounds = {'min': user_feat['age'].min(),\n", 434 | " 'max': user_feat['age'].max()}\n", 435 | "user_feat['age'] = user_feat['age'].apply(lambda age: min_max_scale(age, user_age_bounds))\n", 436 | "\n", 437 | "# Transform gender characters to numerical values (categories)\n", 438 | "genders = sorted(user_feat['gender'].unique())\n", 439 | "user_gender_map = dict(zip(genders, range(len(genders))))\n", 440 | "user_feat['gender'] = user_feat['gender'].map(user_gender_map)\n", 441 | "\n", 442 | "# Transform occupation strings to numerical values (categories)\n", 443 | "occupations = sorted(user_feat['occupation'].unique())\n", 444 | "user_occupation_map = dict(zip(occupations, range(len(occupations))))\n", 445 | "user_feat['occupation'] = user_feat['occupation'].map(user_occupation_map)\n", 446 | "\n", 447 | "# Transform the zip codes to categories keeping the first three digits and impute for missing\n", 448 | "idxs = user_feat[~user_feat['zip'].str.isnumeric()].index\n", 449 | 
"user_feat.loc[idxs, 'zip'] = '00000'\n", 450 | "zip_digits_to_cut = 3\n", 451 | "user_feat['zip'] = user_feat['zip'].apply(lambda val: int(val) // 10 ** zip_digits_to_cut)\n", 452 | "\n", 453 | "\n", 454 | "profiles = get_user_profiles(data.train_ratings, item_feat)\n", 455 | "user_feat = user_feat.merge(profiles, on='user', how='left')\n", 456 | "\n", 457 | "occupation_1H = pd.get_dummies(user_feat['occupation'], prefix='occupation')\n", 458 | "zip_1H = pd.get_dummies(user_feat['zip'], prefix='zip')\n", 459 | "\n", 460 | "user_feat.drop(['occupation', 'zip', ], axis=1, inplace=True)\n", 461 | "user_feat = pd.concat([user_feat, occupation_1H, zip_1H], axis=1)\n", 462 | "\n", 463 | "user_feat.fillna(0, inplace=True)\n", 464 | "\n", 465 | "\n", 466 | "user_feat.index = user_feat['user'].values\n", 467 | "user_feat.drop('user', axis=1, inplace=True)\n", 468 | "\n", 469 | "item_feat.index = item_feat['item'].values\n", 470 | "item_feat.drop('item', axis=1, inplace=True)" 471 | ] 472 | }, 473 | { 474 | "cell_type": "code", 475 | "execution_count": 441, 476 | "metadata": {}, 477 | "outputs": [ 478 | { 479 | "data": { 480 | "text/plain": [ 481 | "0.8608033813918158" 482 | ] 483 | }, 484 | "execution_count": 441, 485 | "metadata": {}, 486 | "output_type": "execute_result" 487 | } 488 | ], 489 | "source": [ 490 | "(user_feat==0).sum().sum()/user_feat.size" 491 | ] 492 | }, 493 | { 494 | "cell_type": "code", 495 | "execution_count": 442, 496 | "metadata": {}, 497 | "outputs": [ 498 | { 499 | "data": { 500 | "text/plain": [ 501 | "0.8640309155766944" 502 | ] 503 | }, 504 | "execution_count": 442, 505 | "metadata": {}, 506 | "output_type": "execute_result" 507 | } 508 | ], 509 | "source": [ 510 | "(item_feat==0).sum().sum()/item_feat.size" 511 | ] 512 | }, 513 | { 514 | "cell_type": "code", 515 | "execution_count": 443, 516 | "metadata": {}, 517 | "outputs": [], 518 | "source": [ 519 | "# Create User Feature COO Matrix\n", 520 | "# user_feat_mat = 
coo_matrix(np.eye(data.n_users))\n", 521 | "user_feat_mat = coo_matrix(np.concatenate((user_feat.values, np.eye(data.n_users)), axis=1))\n", 522 | "\n", 523 | "# Create Item Feature COO Matrix\n", 524 | "# item_feat_mat = coo_matrix(np.eye(data.n_items))\n", 525 | "item_feat_mat = coo_matrix(np.concatenate((item_feat.values, np.eye(data.n_items)), axis=1))" 526 | ] 527 | }, 528 | { 529 | "cell_type": "code", 530 | "execution_count": 444, 531 | "metadata": {}, 532 | "outputs": [ 533 | { 534 | "data": { 535 | "text/plain": [ 536 | "<943x1084 sparse matrix of type ''\n", 537 | "\twith 19451 stored elements in COOrdinate format>" 538 | ] 539 | }, 540 | "execution_count": 444, 541 | "metadata": {}, 542 | "output_type": "execute_result" 543 | } 544 | ], 545 | "source": [ 546 | "user_feat_mat" 547 | ] 548 | }, 549 | { 550 | "cell_type": "code", 551 | "execution_count": 445, 552 | "metadata": {}, 553 | "outputs": [ 554 | { 555 | "data": { 556 | "text/plain": [ 557 | "<1682x1702 sparse matrix of type ''\n", 558 | "\twith 6256 stored elements in COOrdinate format>" 559 | ] 560 | }, 561 | "execution_count": 445, 562 | "metadata": {}, 563 | "output_type": "execute_result" 564 | } 565 | ], 566 | "source": [ 567 | "item_feat_mat" 568 | ] 569 | }, 570 | { 571 | "cell_type": "markdown", 572 | "metadata": {}, 573 | "source": [ 574 | "### Model Training" 575 | ] 576 | }, 577 | { 578 | "cell_type": "code", 579 | "execution_count": 454, 580 | "metadata": {}, 581 | "outputs": [], 582 | "source": [ 583 | "params = {\n", 584 | " 'no_components': 10,\n", 585 | " 'loss': 'warp',\n", 586 | " 'learning_rate': 0.07,\n", 587 | " 'random_state': 42,\n", 588 | " 'user_alpha': 0.0002,\n", 589 | " 'item_alpha': 0.0002\n", 590 | "}\n", 591 | "\n", 592 | "epochs = 10\n", 593 | "\n", 594 | "N = 10" 595 | ] 596 | }, 597 | { 598 | "cell_type": "code", 599 | "execution_count": 455, 600 | "metadata": {}, 601 | "outputs": [ 602 | { 603 | "name": "stderr", 604 | "output_type": "stream", 605 | "text": [ 606 
| "Epoch: 100%|██████████| 10/10 [00:00<00:00, 19.44it/s]\n" 607 | ] 608 | }, 609 | { 610 | "data": { 611 | "text/plain": [ 612 | "" 613 | ] 614 | }, 615 | "execution_count": 455, 616 | "metadata": {}, 617 | "output_type": "execute_result" 618 | } 619 | ], 620 | "source": [ 621 | "hybrid_model = LightFM(**params)\n", 622 | "\n", 623 | "hybrid_model.fit(train_mat,\n", 624 | " user_features=user_feat_mat,\n", 625 | " item_features=item_feat_mat,\n", 626 | " epochs=epochs,\n", 627 | " verbose=True)" 628 | ] 629 | }, 630 | { 631 | "cell_type": "code", 632 | "execution_count": 456, 633 | "metadata": {}, 634 | "outputs": [], 635 | "source": [ 636 | "prec_at_N = precision_at_k(hybrid_model,\n", 637 | " test_mat,\n", 638 | " train_mat,\n", 639 | " k=N,\n", 640 | " user_features=user_feat_mat,\n", 641 | " item_features=item_feat_mat)" 642 | ] 643 | }, 644 | { 645 | "cell_type": "code", 646 | "execution_count": 457, 647 | "metadata": {}, 648 | "outputs": [ 649 | { 650 | "data": { 651 | "text/plain": [ 652 | "0.19381107" 653 | ] 654 | }, 655 | "execution_count": 457, 656 | "metadata": {}, 657 | "output_type": "execute_result" 658 | } 659 | ], 660 | "source": [ 661 | "prec_at_N.mean()" 662 | ] 663 | } 664 | ], 665 | "metadata": { 666 | "kernelspec": { 667 | "display_name": "Python 3", 668 | "language": "python", 669 | "name": "python3" 670 | }, 671 | "language_info": { 672 | "codemirror_mode": { 673 | "name": "ipython", 674 | "version": 3 675 | }, 676 | "file_extension": ".py", 677 | "mimetype": "text/x-python", 678 | "name": "python", 679 | "nbconvert_exporter": "python", 680 | "pygments_lexer": "ipython3", 681 | "version": "3.9.4" 682 | }, 683 | "pycharm": { 684 | "stem_cell": { 685 | "cell_type": "raw", 686 | "metadata": { 687 | "collapsed": false 688 | }, 689 | "source": [] 690 | } 691 | } 692 | }, 693 | "nbformat": 4, 694 | "nbformat_minor": 4 695 | } 696 | -------------------------------------------------------------------------------- /notebooks/8_e_hybrid_fm.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Unit 8: Hybrid Recommender Model using both Collaborative Filtering and Content-based Filtering using a Factorization Machine" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "In this section, we combine CF and CBF.\n", 15 | "\n", 16 | "Therefore, we simply add the one-hot-encoded user and item IDs to the data. Thus, the model is capable of factorizing the similarities in rating and features for rating prediction. This combination is called hybrid as it combines two recommenders." 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": null, 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "from collections import OrderedDict\n", 26 | "import itertools\n", 27 | "from typing import Dict, List, Tuple\n", 28 | "\n", 29 | "import matplotlib.pyplot as plt\n", 30 | "import numpy as np\n", 31 | "import pandas as pd\n", 32 | "from pyfm import pylibfm\n", 33 | "from scipy import sparse\n", 34 | "from sklearn.metrics import mean_squared_error, mean_absolute_error" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "from recsys_training.data import Dataset, genres\n", 44 | "from recsys_training.evaluation import get_relevant_items\n", 45 | "from recsys_training.utils import get_sparsity" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "ml100k_ratings_filepath = '../data/raw/ml-100k/u.data'\n", 55 | "ml100k_item_filepath = '../data/raw/ml-100k/u.item'\n", 56 | "ml100k_user_filepath = '../data/raw/ml-100k/u.user'" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "## Load Data" 64 | ] 65 | }, 66 | { 67 | 
"cell_type": "code", 68 | "execution_count": null, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "data = Dataset(ml100k_ratings_filepath)\n", 73 | "data.rating_split(seed=42)\n", 74 | "user_ratings = data.get_user_ratings()" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "item_feat = pd.read_csv(ml100k_item_filepath, sep='|', header=None,\n", 84 | " names=['item', 'title', 'release', 'video_release', 'imdb_url']+genres,\n", 85 | " engine='python')" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "user_feat = pd.read_csv(ml100k_user_filepath, sep='|', header=None,\n", 95 | " names=['user', 'age', 'gender', 'occupation', 'zip'])" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "## User and Item Content (Features)" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": {}, 108 | "source": [ 109 | "### Preprocessing" 110 | ] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "metadata": {}, 115 | "source": [ 116 | "#### Items" 117 | ] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "metadata": {}, 122 | "source": [ 123 | "We keep the following information for items:\n", 124 | "* release year\n", 125 | "* genres" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "metadata": {}, 132 | "outputs": [], 133 | "source": [ 134 | "def min_max_scale(val, bounds):\n", 135 | " min_max_range = bounds['max']-bounds['min']\n", 136 | " return (val-bounds['min'])/min_max_range" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "# Infer the release year\n", 146 | "idxs = item_feat[item_feat['release'].notnull()].index\n", 147 | "item_feat.loc[idxs, 'release_year'] = 
item_feat.loc[idxs, 'release'].str.split('-')\n", 148 | "item_feat.loc[idxs, 'release_year'] = item_feat.loc[idxs, 'release_year'].apply(lambda val: val[2]).astype(int)\n", 149 | "\n", 150 | "# Impute median release year value for the items with missing release year\n", 151 | "top_year = item_feat.loc[idxs, 'release_year'].astype(int).describe()['50%']\n", 152 | "idx = item_feat[item_feat['release'].isnull()].index\n", 153 | "item_feat.loc[idx, 'release_year'] = top_year\n", 154 | "\n", 155 | "# Min-max scale the release year\n", 156 | "item_year_bounds = {'min': item_feat['release_year'].min(),\n", 157 | " 'max': item_feat['release_year'].max()}\n", 158 | "item_feat['release_year'] = item_feat['release_year'].apply(\n", 159 | " lambda year: min_max_scale(year, item_year_bounds))\n", 160 | "\n", 161 | "# Drop other columns\n", 162 | "item_feat.drop(['title', 'release', 'video_release', 'imdb_url'], axis=1, inplace=True)" 163 | ] 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "metadata": {}, 168 | "source": [ 169 | "#### users" 170 | ] 171 | }, 172 | { 173 | "cell_type": "markdown", 174 | "metadata": {}, 175 | "source": [ 176 | "We keep the following information for users:\n", 177 | "* age\n", 178 | "* gender\n", 179 | "* occupation\n", 180 | "* zip-code" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": null, 186 | "metadata": {}, 187 | "outputs": [], 188 | "source": [ 189 | "# Min-max scale the age\n", 190 | "user_age_bounds = {'min': user_feat['age'].min(),\n", 191 | " 'max': user_feat['age'].max()}\n", 192 | "user_feat['age'] = user_feat['age'].apply(lambda age: min_max_scale(age, user_age_bounds))\n", 193 | "\n", 194 | "# Transform gender characters to numerical values (categories)\n", 195 | "genders = sorted(user_feat['gender'].unique())\n", 196 | "user_gender_map = dict(zip(genders, range(len(genders))))\n", 197 | "user_feat['gender'] = user_feat['gender'].map(user_gender_map)\n", 198 | "\n", 199 | "# Transform occupation 
strings to numerical values (categories)\n", 200 | "occupations = sorted(user_feat['occupation'].unique())\n", 201 | "user_occupation_map = dict(zip(occupations, range(len(occupations))))\n", 202 | "user_feat['occupation'] = user_feat['occupation'].map(user_occupation_map)\n", 203 | "\n", 204 | "# Transform the zip codes to categories keeping the first three digits and impute for missing\n", 205 | "idxs = user_feat[~user_feat['zip'].str.isnumeric()].index\n", 206 | "user_feat.loc[idxs, 'zip'] = '00000'\n", 207 | "zip_digits_to_cut = 3\n", 208 | "user_feat['zip'] = user_feat['zip'].apply(lambda val: int(val) // 10 ** zip_digits_to_cut)" 209 | ] 210 | }, 211 | { 212 | "cell_type": "markdown", 213 | "metadata": {}, 214 | "source": [ 215 | "In addition, we infer profiles by combining item information with rating data for each user to get features that represent the users' preferred genres and film age" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": null, 221 | "metadata": {}, 222 | "outputs": [], 223 | "source": [ 224 | "def user_profiler(group):\n", 225 | " genre_dist = group[genres].mean()\n", 226 | " year_dist = group['release_year'].describe()[['mean', 'std', '50%']]\n", 227 | "\n", 228 | " return pd.concat((genre_dist, year_dist), axis=0)" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": null, 234 | "metadata": {}, 235 | "outputs": [], 236 | "source": [ 237 | "def get_user_profiles(ratings: pd.DataFrame,\n", 238 | " item_feat: pd.DataFrame,\n", 239 | " min_rating: float = 4.0) -> pd.DataFrame:\n", 240 | " ratings = ratings[ratings.rating >= min_rating]\n", 241 | " ratings = ratings[['user', 'item']]\n", 242 | " ratings = ratings.merge(item_feat, on='item', how='left')\n", 243 | " ratings.drop(['item'], axis=1, inplace=True)\n", 244 | "\n", 245 | " grouped = ratings.groupby('user')\n", 246 | " profiles = grouped.apply(user_profiler).reset_index()\n", 247 | " profiles.rename(columns={'50%': 'median'}, 
inplace=True)\n", 248 | " \n", 249 | " return profiles" 250 | ] 251 | }, 252 | { 253 | "cell_type": "markdown", 254 | "metadata": {}, 255 | "source": [ 256 | "Finally, we join the original user information with their profiles' information and one-hot-encode categorical information" 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": null, 262 | "metadata": {}, 263 | "outputs": [], 264 | "source": [ 265 | "profiles = get_user_profiles(data.train_ratings, item_feat)\n", 266 | "user_feat = user_feat.merge(profiles, on='user', how='left')\n", 267 | "\n", 268 | "occupation_1H = pd.get_dummies(user_feat['occupation'], prefix='occupation')\n", 269 | "zip_1H = pd.get_dummies(user_feat['zip'], prefix='zip')\n", 270 | "\n", 271 | "user_feat.drop(['occupation', 'zip', ], axis=1, inplace=True)\n", 272 | "user_feat = pd.concat([user_feat, occupation_1H, zip_1H], axis=1)\n", 273 | "\n", 274 | "user_feat.fillna(0, inplace=True)" 275 | ] 276 | }, 277 | { 278 | "cell_type": "markdown", 279 | "metadata": {}, 280 | "source": [ 281 | "We remove the user/item id columns and replace the current dataframe indices with their values" 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": null, 287 | "metadata": {}, 288 | "outputs": [], 289 | "source": [ 290 | "user_feat.index = user_feat['user'].values\n", 291 | "user_feat.drop('user', axis=1, inplace=True)\n", 292 | "\n", 293 | "item_feat.index = item_feat['item'].values\n", 294 | "item_feat.drop('item', axis=1, inplace=True)" 295 | ] 296 | }, 297 | { 298 | "cell_type": "markdown", 299 | "metadata": {}, 300 | "source": [ 301 | "## Factorization Machine for a Hybrid Recommender" 302 | ] 303 | }, 304 | { 305 | "cell_type": "markdown", 306 | "metadata": {}, 307 | "source": [ 308 | "[Steffen Rendle: Factorization Machines](https://www.csie.ntu.edu.tw/~b97053/paper/Rendle2010FM.pdf)\n", 309 | "\n", 310 | "[pyFM - Factorization Machines in Python](https://github.com/coreylynch/pyFM)" 311 | ] 312 | 
}, 313 | { 314 | "cell_type": "markdown", 315 | "metadata": {}, 316 | "source": [ 317 | "#### Create Feature Matrices" 318 | ] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "execution_count": null, 323 | "metadata": {}, 324 | "outputs": [], 325 | "source": [ 326 | "# fetch content information for all observed user-item rating combinations\n", 327 | "user_cb_feat_train = user_feat.loc[data.train_ratings.user.values].values\n", 328 | "user_cb_feat_test = user_feat.loc[data.test_ratings.user.values].values\n", 329 | "item_cb_feat_train = item_feat.loc[data.train_ratings.item.values].values\n", 330 | "item_cb_feat_test = item_feat.loc[data.test_ratings.item.values].values" 331 | ] 332 | }, 333 | { 334 | "cell_type": "markdown", 335 | "metadata": {}, 336 | "source": [ 337 | "![](parrot.png)\n", 338 | "\n", 339 | "**Task:** Implement additional arrays for user and item IDs and adjust the design matrices `X_train` and `X_test` accordingly." 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": null, 345 | "metadata": {}, 346 | "outputs": [], 347 | "source": [ 348 | "def one_hot_encode_ids(ids: np.array, length):\n", 349 | "    pass\n", 350 | "    return one_hot_enc" 351 | ] 352 | }, 353 | { 354 | "cell_type": "code", 355 | "execution_count": null, 356 | "metadata": {}, 357 | "outputs": [], 358 | "source": [ 359 | "# Subtract 1 to turn 1-base-indexed into 0-base-indexed IDs for 0-base-indexed array\n", 360 | "pass" 361 | ] 362 | }, 363 | { 364 | "cell_type": "code", 365 | "execution_count": null, 366 | "metadata": {}, 367 | "outputs": [], 368 | "source": [ 369 | "# concatenate user and item content information to form design matrices\n", 370 | "# and convert to sparse matrix in Compressed Sparse Row (CSR) format\n", 371 | "X_train = pass\n", 372 | "X_train = pass\n", 373 | "X_test = pass\n", 374 | "X_test = pass" 375 | ] 376 | }, 377 | { 378 | "cell_type": "code", 379 | "execution_count": null, 380 | "metadata": {}, 381 | "outputs": [], 382 | "source": 
[ 383 | "X_train" 384 | ] 385 | }, 386 | { 387 | "cell_type": "code", 388 | "execution_count": null, 389 | "metadata": {}, 390 | "outputs": [], 391 | "source": [ 392 | "# Sparsity of Training Data\n", 393 | "get_sparsity(X_train)" 394 | ] 395 | }, 396 | { 397 | "cell_type": "code", 398 | "execution_count": null, 399 | "metadata": {}, 400 | "outputs": [], 401 | "source": [ 402 | "X_test" 403 | ] 404 | }, 405 | { 406 | "cell_type": "code", 407 | "execution_count": null, 408 | "metadata": {}, 409 | "outputs": [], 410 | "source": [ 411 | "# Sparsity of Test Data\n", 412 | "get_sparsity(X_test)" 413 | ] 414 | }, 415 | { 416 | "cell_type": "markdown", 417 | "metadata": {}, 418 | "source": [ 419 | "#### Create Target Matrices for Rating Predictions" 420 | ] 421 | }, 422 | { 423 | "cell_type": "code", 424 | "execution_count": null, 425 | "metadata": {}, 426 | "outputs": [], 427 | "source": [ 428 | "y_train = data.train_ratings.rating.values.astype(float)\n", 429 | "y_test = data.test_ratings.rating.values" 430 | ] 431 | }, 432 | { 433 | "cell_type": "markdown", 434 | "metadata": {}, 435 | "source": [ 436 | "#### Train Factorization Machine for Rating Prediction as Regressor using pyFM" 437 | ] 438 | }, 439 | { 440 | "cell_type": "code", 441 | "execution_count": null, 442 | "metadata": {}, 443 | "outputs": [], 444 | "source": [ 445 | "n_epochs = 50 # number of full stochastic passes through the training data\n", 446 | "k = 16\n", 447 | "random_seed = 28" 448 | ] 449 | }, 450 | { 451 | "cell_type": "code", 452 | "execution_count": null, 453 | "metadata": {}, 454 | "outputs": [], 455 | "source": [ 456 | "fm_hybrid = pylibfm.FM(num_factors=k,\n", 457 | " num_iter=n_epochs,\n", 458 | " verbose=True,\n", 459 | " task=\"regression\",\n", 460 | " initial_learning_rate=0.001,\n", 461 | " learning_rate_schedule=\"optimal\",\n", 462 | " seed=random_seed)\n", 463 | "fm_hybrid.fit(X_train, y_train)" 464 | ] 465 | }, 466 | { 467 | "cell_type": "markdown", 468 | "metadata": {}, 469 | 
"source": [ 470 | "## Evaluation on Test Set" 471 | ] 472 | }, 473 | { 474 | "cell_type": "code", 475 | "execution_count": null, 476 | "metadata": {}, 477 | "outputs": [], 478 | "source": [ 479 | "y_pred = fm_hybrid.predict(X_test)" 480 | ] 481 | }, 482 | { 483 | "cell_type": "markdown", 484 | "metadata": {}, 485 | "source": [ 486 | "$MSE$" 487 | ] 488 | }, 489 | { 490 | "cell_type": "code", 491 | "execution_count": null, 492 | "metadata": {}, 493 | "outputs": [], 494 | "source": [ 495 | "mean_squared_error(y_test, y_pred)" 496 | ] 497 | }, 498 | { 499 | "cell_type": "markdown", 500 | "metadata": {}, 501 | "source": [ 502 | "$MAE$" 503 | ] 504 | }, 505 | { 506 | "cell_type": "code", 507 | "execution_count": null, 508 | "metadata": {}, 509 | "outputs": [], 510 | "source": [ 511 | "mean_absolute_error(y_test, y_pred)" 512 | ] 513 | }, 514 | { 515 | "cell_type": "code", 516 | "execution_count": null, 517 | "metadata": {}, 518 | "outputs": [], 519 | "source": [ 520 | "def get_prediction(fm: object, user: int, user_feat: pd.DataFrame, item_feat: pd.DataFrame,\n", 521 | " items: np.array = None, remove_known_pos: bool = True) -> Dict[int, Dict[str, float]]:\n", 522 | " \n", 523 | " if items is None:\n", 524 | " if remove_known_pos:\n", 525 | " # Predict from unobserved items\n", 526 | " known_items = np.array(list(user_ratings[user].keys()))\n", 527 | " items = np.setdiff1d(data.items, known_items)\n", 528 | " else:\n", 529 | " items = np.array(data.items)\n", 530 | " if type(items) == np.int64:\n", 531 | " items = np.array([items])\n", 532 | " \n", 533 | " n_items = len(items)\n", 534 | " \n", 535 | " single_user_cb_feat = user_feat.loc[user].values.reshape(1, -1).repeat(n_items, axis=0)\n", 536 | " all_items_cb_feat = item_feat.loc[items].values\n", 537 | " \n", 538 | " input_data = np.concatenate((single_user_cb_feat, all_items_cb_feat), axis=1)\n", 539 | " input_data = sparse.csr_matrix(input_data)\n", 540 | " \n", 541 | " preds = fm.predict(input_data)\n", 542 | " 
sorting = np.argsort(preds)[::-1]\n", 543 | " \n", 544 | " preds = {item: {'pred': pred} for item, pred in\n", 545 | " zip(items[sorting], preds[sorting])}\n", 546 | " \n", 547 | " return preds" 548 | ] 549 | }, 550 | { 551 | "cell_type": "code", 552 | "execution_count": null, 553 | "metadata": {}, 554 | "outputs": [], 555 | "source": [ 556 | "predictions = get_prediction(fm_hybrid, 1, user_feat, item_feat)\n", 557 | "list(predictions.items())[:10]" 558 | ] 559 | }, 560 | { 561 | "cell_type": "code", 562 | "execution_count": null, 563 | "metadata": {}, 564 | "outputs": [], 565 | "source": [ 566 | "def get_recommendations(fm_cb: object,\n", 567 | " user: int,\n", 568 | " N: int,\n", 569 | " user_feat: pd.DataFrame,\n", 570 | " item_feat: pd.DataFrame,\n", 571 | " remove_known_pos: bool = True) -> List[Tuple[int, Dict[str, float]]]:\n", 572 | " \n", 573 | " recommendations = []\n", 574 | " \n", 575 | " predictions = get_prediction(fm_cb, user, user_feat, item_feat,\n", 576 | " remove_known_pos=remove_known_pos)\n", 577 | "\n", 578 | " for item, pred in predictions.items():\n", 579 | " add_item = (item, pred)\n", 580 | " recommendations.append(add_item)\n", 581 | " if len(recommendations) == N:\n", 582 | " break\n", 583 | "\n", 584 | " return recommendations" 585 | ] 586 | }, 587 | { 588 | "cell_type": "code", 589 | "execution_count": null, 590 | "metadata": {}, 591 | "outputs": [], 592 | "source": [ 593 | "get_recommendations(fm_hybrid, 1, N=10, user_feat=user_feat, item_feat=item_feat)" 594 | ] 595 | }, 596 | { 597 | "cell_type": "markdown", 598 | "metadata": {}, 599 | "source": [ 600 | "## Evaluation" 601 | ] 602 | }, 603 | { 604 | "cell_type": "code", 605 | "execution_count": null, 606 | "metadata": {}, 607 | "outputs": [], 608 | "source": [ 609 | "N = 10" 610 | ] 611 | }, 612 | { 613 | "cell_type": "code", 614 | "execution_count": null, 615 | "metadata": {}, 616 | "outputs": [], 617 | "source": [ 618 | "relevant_items = get_relevant_items(data.test_ratings)" 619 
| ] 620 | }, 621 | { 622 | "cell_type": "code", 623 | "execution_count": null, 624 | "metadata": {}, 625 | "outputs": [], 626 | "source": [ 627 | "users = relevant_items.keys()\n", 628 | "prec_at_N = dict.fromkeys(data.users)\n", 629 | "\n", 630 | "for user in users:\n", 631 | " recommendations = get_recommendations(fm_hybrid, user, N,\n", 632 | " user_feat=user_feat, item_feat=item_feat)\n", 633 | " recommendations = [val[0] for val in recommendations]\n", 634 | " hits = np.intersect1d(recommendations,\n", 635 | " relevant_items[user])\n", 636 | " prec_at_N[user] = len(hits)/N" 637 | ] 638 | }, 639 | { 640 | "cell_type": "code", 641 | "execution_count": null, 642 | "metadata": {}, 643 | "outputs": [], 644 | "source": [ 645 | "recommendations" 646 | ] 647 | }, 648 | { 649 | "cell_type": "code", 650 | "execution_count": null, 651 | "metadata": {}, 652 | "outputs": [], 653 | "source": [ 654 | "np.mean([val for val in prec_at_N.values() if val is not None])" 655 | ] 656 | } 657 | ], 658 | "metadata": { 659 | "kernelspec": { 660 | "display_name": "Python 3", 661 | "language": "python", 662 | "name": "python3" 663 | }, 664 | "language_info": { 665 | "codemirror_mode": { 666 | "name": "ipython", 667 | "version": 3 668 | }, 669 | "file_extension": ".py", 670 | "mimetype": "text/x-python", 671 | "name": "python", 672 | "nbconvert_exporter": "python", 673 | "pygments_lexer": "ipython3", 674 | "version": "3.9.4" 675 | }, 676 | "pycharm": { 677 | "stem_cell": { 678 | "cell_type": "raw", 679 | "metadata": { 680 | "collapsed": false 681 | }, 682 | "source": [] 683 | } 684 | } 685 | }, 686 | "nbformat": 4, 687 | "nbformat_minor": 4 688 | } 689 | -------------------------------------------------------------------------------- /notebooks/solutions/4_s_cf_knn_rating_pred.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Unit 4: Neighborhood-based 
Collaborative Filtering for Rating Prediction" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "In this section we generate personalized recommendations for the first time. We exploit rating similarities among users and items to identify similar users and items that assist in finding the relevant items to recommend for each user.\n", 15 | "\n", 16 | "This describes the fundamental idea behind Collaborative Filtering (CF) and using kNN is a neighborhood-based approach towards CF. In a later unit we will also have a look at model-based approaches.\n", 17 | "\n", 18 | "This is also the first time we try to predict user ratings for unknown items using rating predictions to take the top-$N$ items with the highest rating predictions and recommend those to the user." 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 1, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "from collections import OrderedDict\n", 28 | "import itertools\n", 29 | "from typing import Dict, List, Tuple\n", 30 | "\n", 31 | "import numpy as np\n", 32 | "import pandas as pd" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 2, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "from recsys_training.data import Dataset\n", 42 | "from recsys_training.evaluation import get_relevant_items\n", 43 | "from recsys_training.utils import get_entity_sim" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 3, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "ml100k_ratings_filepath = '../../data/raw/ml-100k/u.data'" 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "metadata": {}, 58 | "source": [ 59 | "## Load Data" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 4, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "data = Dataset(ml100k_ratings_filepath)\n", 69 | "data.rating_split(seed=42)\n", 70 | "user_ratings = 
data.get_user_ratings()" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": [ 77 | "The idea behind this recommender is to use item ratings of the $k$ most similar users (neighbors). We identify those _nearest neighbors_ with a similarity metric which we apply to the ratings both, root user and possible neighbor, have in common. Similarity thereby means having a similar opinion on movies.\n", 78 | "\n", 79 | "The steps are as follows:\n", 80 | "\n", 81 | "1. Compute user-user similarities (we use the Pearson Correlation Coefficient here, but feel free to try other similarity metrics)\n", 82 | "\n", 83 | "2. For each user:\n", 84 | "\n", 85 | " 1. Get the k nearest neighbors along with their similarities\n", 86 | " 2. Collect the neighborhood item ratings and ignore those already rated by the root user\n", 87 | " 3. Item Rating Prediction: Compute the similarity-weighted sum of neighborhood item ratings\n", 88 | " 4. Recommendations: Get the $N$ items with the highest ratings that have a minimum rating count" 89 | ] 90 | }, 91 | { 92 | "cell_type": "markdown", 93 | "metadata": {}, 94 | "source": [ 95 | "### 1. User-User Similarities" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 5, 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "sim_metric = 'pearson'\n", 105 | "user_user_sims = {}\n", 106 | "user_pairs = itertools.combinations(data.users, 2)" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": {}, 112 | "source": [ 113 | "The following takes a few seconds to finish ..." 
114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 6, 119 | "metadata": {}, 120 | "outputs": [ 121 | { 122 | "name": "stderr", 123 | "output_type": "stream", 124 | "text": [ 125 | "/anaconda3/envs/recsys_training/lib/python3.7/site-packages/numpy/lib/function_base.py:2534: RuntimeWarning: invalid value encountered in true_divide\n", 126 | "  c /= stddev[:, None]\n", 127 | "/anaconda3/envs/recsys_training/lib/python3.7/site-packages/numpy/lib/function_base.py:2535: RuntimeWarning: invalid value encountered in true_divide\n", 128 | "  c /= stddev[None, :]\n" 129 | ] 130 | } 131 | ], 132 | "source": [ 133 | "for pair in user_pairs:\n", 134 | "    user_user_sims[pair] = get_entity_sim(pair[0], pair[1],\n", 135 | "                                          user_ratings,\n", 136 | "                                          sim_metric)" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 7, 142 | "metadata": {}, 143 | "outputs": [ 144 | { 145 | "data": { 146 | "text/plain": [ 147 | "(0.9759000729485333, 5)" 148 | ] 149 | }, 150 | "execution_count": 7, 151 | "metadata": {}, 152 | "output_type": "execute_result" 153 | } 154 | ], 155 | "source": [ 156 | "user_user_sims[(1,4)]" 157 | ] 158 | }, 159 | { 160 | "cell_type": "markdown", 161 | "metadata": {}, 162 | "source": [ 163 | "## 2. Computing Recommendations" 164 | ] 165 | }, 166 | { 167 | "cell_type": "markdown", 168 | "metadata": {}, 169 | "source": [ 170 | "### A. Implement Nearest Neighbors for a given user\n", 171 | "\n", 172 | "![](../parrot.png)\n", 173 | "\n", 174 | "**Task:** It's your turn again. Complete `get_k_nearest_neighbors` to return a sorted list of the $k$ nearest neighbors - identified by their id - for a given user, each along with its similarity." 
def get_k_nearest_neighbors(user: int, k: int, user_user_sims: dict,
                            users=None) -> List[Tuple[int, float]]:
    """Return the k users most similar to `user` as (neighbor_id, similarity) pairs.

    Args:
        user: id of the root user whose neighborhood we build.
        k: neighborhood size.
        user_user_sims: mapping from sorted (user_a, user_b) tuples to
            (similarity, support) tuples, as computed above with `get_entity_sim`.
        users: candidate user ids; defaults to all users in the dataset
            (`data.users`), so existing calls keep working unchanged.

    Returns:
        At most k (neighbor, similarity) pairs sorted by descending similarity.
        Pairs with a NaN similarity (no rating overlap) are dropped.
    """
    if users is None:
        users = data.users
    neighbors = set(users)
    neighbors.remove(user)

    nearest_neighbors = {}
    for neighbor in neighbors:
        sim = user_user_sims[tuple(sorted((user, neighbor)))][0]
        # NaN marks user pairs without enough common ratings - skip them
        if pd.notnull(sim):
            nearest_neighbors[neighbor] = sim

    ranked = sorted(nearest_neighbors.items(),
                    key=lambda kv: kv[1],
                    reverse=True)
    return ranked[:k]
def get_neighborhood_ratings(user, user_neighbors: List[Tuple[int, float]],
                             ratings=None) -> Dict[int, List[Dict[str, float]]]:
    """Collect neighborhood ratings for items the root user has not rated yet.

    Args:
        user: id of the root user.
        user_neighbors: (neighbor_id, similarity) pairs, e.g. the output of
            `get_k_nearest_neighbors`.
        ratings: mapping user id -> {item id: rating}; defaults to the
            global `user_ratings`, so existing calls keep working unchanged.

    Returns:
        Mapping from each item unknown to the root user to a list of
        {'sim': neighbor_similarity, 'rating': neighbor_rating} dicts.
    """
    if ratings is None:
        ratings = user_ratings
    # known positives of the root user - computed once up front so we can
    # filter during collection instead of building and deleting afterwards
    known_items = set(ratings[user].keys())

    neighborhood_ratings = {}
    for neighbor, sim in user_neighbors:
        for item, rating in ratings[neighbor].items():
            if item in known_items:
                continue
            neighborhood_ratings.setdefault(item, []).append(
                {'sim': sim, 'rating': rating})

    return neighborhood_ratings
| " (312,\n", 304 | " [{'sim': 1.0, 'rating': 4.0}, {'sim': 0.9999999999999999, 'rating': 4.0}]),\n", 305 | " (313,\n", 306 | " [{'sim': 1.0, 'rating': 2.0},\n", 307 | " {'sim': 1.0, 'rating': 4.0},\n", 308 | " {'sim': 1.0, 'rating': 5.0},\n", 309 | " {'sim': 1.0, 'rating': 5.0},\n", 310 | " {'sim': 0.9999999999999999, 'rating': 5.0},\n", 311 | " {'sim': 0.9999999999999999, 'rating': 5.0}]),\n", 312 | " (300,\n", 313 | " [{'sim': 1.0, 'rating': 1.0},\n", 314 | " {'sim': 0.9999999999999999, 'rating': 3.0},\n", 315 | " {'sim': 0.9999999999999999, 'rating': 4.0},\n", 316 | " {'sim': 0.9999999999999999, 'rating': 4.0}]),\n", 317 | " (264,\n", 318 | " [{'sim': 1.0, 'rating': 3.0},\n", 319 | " {'sim': 1.0, 'rating': 3.0},\n", 320 | " {'sim': 1.0, 'rating': 3.0}]),\n", 321 | " (333,\n", 322 | " [{'sim': 1.0, 'rating': 3.0},\n", 323 | " {'sim': 1.0, 'rating': 5.0},\n", 324 | " {'sim': 1.0, 'rating': 5.0},\n", 325 | " {'sim': 0.9999999999999999, 'rating': 3.0},\n", 326 | " {'sim': 0.9999999999999999, 'rating': 4.0}]),\n", 327 | " (1243, [{'sim': 1.0, 'rating': 3.0}]),\n", 328 | " (322,\n", 329 | " [{'sim': 1.0, 'rating': 1.0}, {'sim': 0.9999999999999999, 'rating': 4.0}])]" 330 | ] 331 | }, 332 | "execution_count": 13, 333 | "metadata": {}, 334 | "output_type": "execute_result" 335 | } 336 | ], 337 | "source": [ 338 | "list(neighborhood_ratings.items())[:10]" 339 | ] 340 | }, 341 | { 342 | "cell_type": "markdown", 343 | "metadata": {}, 344 | "source": [ 345 | "### C. Compute Rating Predictions from Neighborhood Ratings\n", 346 | "\n", 347 | "![](parrot.png)\n", 348 | "\n", 349 | "**Task:** In this step, we estimate ratings for the seed user based on the neighborhood ratings. We implement a similarity weighted average of neighbor ratings for that. Return a mapping from item to its prediction and the count of neighbor ratings received."
def compute_rating_pred(neighborhood_ratings: dict) -> dict:
    """Predict one rating per item as the similarity-weighted mean of its
    neighborhood ratings.

    Returns a mapping item -> {'pred': predicted rating, 'count': number of
    neighbor ratings used}; items with an empty rating list get pred None.
    """
    rating_preds = {}
    for item, entries in neighborhood_ratings.items():
        if not entries:
            rating_preds[item] = {'pred': None, 'count': 0}
            continue
        sims = np.array([entry['sim'] for entry in entries])
        values = np.array([entry['rating'] for entry in entries])
        # weighted average: neighbors with higher similarity count more
        weighted_mean = (sims * values).sum() / sims.sum()
        rating_preds[item] = {'pred': weighted_mean,
                              'count': len(entries)}
    return rating_preds
def compute_top_n(rating_preds: dict, min_count: int, N: int) -> OrderedDict:
    """Keep predictions backed by at least `min_count` neighbor ratings and
    return the N best ones, ordered by (prediction, count) descending.

    Ties in the predicted rating are broken by the rating count, assuming
    more ratings mean higher confidence in the prediction.
    """
    eligible = [(item, stats) for item, stats in rating_preds.items()
                if stats['count'] >= min_count]
    eligible.sort(key=lambda pair: (pair[1]['pred'], pair[1]['count']),
                  reverse=True)
    return OrderedDict(eligible[:N])
def get_recommendations(user: int,
                        user_user_sims: dict,
                        k: int,
                        C: int,
                        N: int):
    """Run the full user-based kNN pipeline for one user.

    Chains neighbor search (k nearest neighbors), neighborhood rating
    collection, similarity-weighted rating prediction, and top-N selection
    with a minimum rating count of C. Returns the top-N OrderedDict.
    """
    neighbors = get_k_nearest_neighbors(user, k=k, user_user_sims=user_user_sims)
    candidate_ratings = get_neighborhood_ratings(user, neighbors)
    predictions = compute_rating_pred(candidate_ratings)
    return compute_top_n(predictions, min_count=C, N=N)
], 538 | "source": [ 539 | "get_recommendations(1, user_user_sims, 10, 2, 10)" 540 | ] 541 | }, 542 | { 543 | "cell_type": "markdown", 544 | "metadata": {}, 545 | "source": [ 546 | "## Evaluation" 547 | ] 548 | }, 549 | { 550 | "cell_type": "markdown", 551 | "metadata": {}, 552 | "source": [ 553 | "Let's check the performance of the neighborhood- and user-based recommender for a neighborhood size of $k = 60$, minimum rating count of $C = 10$ and stay with $N = 10$ recommendations." 554 | ] 555 | }, 556 | { 557 | "cell_type": "code", 558 | "execution_count": 22, 559 | "metadata": {}, 560 | "outputs": [], 561 | "source": [ 562 | "k = 60\n", 563 | "C = 10\n", 564 | "N = 10" 565 | ] 566 | }, 567 | { 568 | "cell_type": "code", 569 | "execution_count": 23, 570 | "metadata": {}, 571 | "outputs": [], 572 | "source": [ 573 | "relevant_items = get_relevant_items(data.test_ratings)" 574 | ] 575 | }, 576 | { 577 | "cell_type": "code", 578 | "execution_count": 24, 579 | "metadata": {}, 580 | "outputs": [], 581 | "source": [ 582 | "users = relevant_items.keys()\n", 583 | "prec_at_N = dict.fromkeys(data.users)\n", 584 | "\n", 585 | "for user in users:\n", 586 | " recommendations = get_recommendations(user, user_user_sims, k, C, N)\n", 587 | " recommendations = list(recommendations.keys())\n", 588 | " hits = np.intersect1d(recommendations,\n", 589 | " relevant_items[user])\n", 590 | " prec_at_N[user] = len(hits)/N" 591 | ] 592 | }, 593 | { 594 | "cell_type": "code", 595 | "execution_count": 25, 596 | "metadata": {}, 597 | "outputs": [ 598 | { 599 | "data": { 600 | "text/plain": [ 601 | "0.08106382978723406" 602 | ] 603 | }, 604 | "execution_count": 25, 605 | "metadata": {}, 606 | "output_type": "execute_result" 607 | } 608 | ], 609 | "source": [ 610 | "np.mean([val for val in prec_at_N.values() if val is not None])" 611 | ] 612 | } 613 | ], 614 | "metadata": { 615 | "kernelspec": { 616 | "display_name": "Python 3", 617 | "language": "python", 618 | "name": "python3" 619 | }, 620 | 
"language_info": { 621 | "codemirror_mode": { 622 | "name": "ipython", 623 | "version": 3 624 | }, 625 | "file_extension": ".py", 626 | "mimetype": "text/x-python", 627 | "name": "python", 628 | "nbconvert_exporter": "python", 629 | "pygments_lexer": "ipython3", 630 | "version": "3.9.4" 631 | }, 632 | "pycharm": { 633 | "stem_cell": { 634 | "cell_type": "raw", 635 | "metadata": { 636 | "collapsed": false 637 | }, 638 | "source": [] 639 | } 640 | } 641 | }, 642 | "nbformat": 4, 643 | "nbformat_minor": 4 644 | } 645 | --------------------------------------------------------------------------------