├── configs └── .gitignore ├── references └── .gitignore ├── reports └── figures │ └── .gitignore ├── docs ├── _static │ └── .gitignore ├── authors.rst ├── changelog.rst ├── license.rst ├── index.rst ├── Makefile └── conf.py ├── postBuild ├── data ├── raw │ ├── ml-100k │ │ ├── u.info │ │ ├── u.item │ │ ├── u.genre │ │ └── u.occupation │ └── .gitignore ├── external │ └── .gitignore ├── interim │ └── .gitignore ├── preprocessed │ └── .gitignore └── .gitignore ├── notebooks ├── fm.png ├── parrot.png ├── template.ipynb ├── 1_e_explore_movielens.ipynb ├── 3_e_demographic_recs.ipynb ├── 4_e_cf_knn_rating_pred.ipynb ├── 2_e_popularity_recs.ipynb ├── 9_e_ligthfm.ipynb ├── 6_e_cf_mf_ranking_pred.ipynb ├── extra_sport_recommender.ipynb ├── solutions │ ├── 9_s_ligthfm.ipynb │ └── 4_s_cf_knn_rating_pred.ipynb └── 8_e_hybrid_fm.ipynb ├── models └── .gitignore ├── AUTHORS.rst ├── CHANGELOG.rst ├── tests └── conftest.py ├── .isort.cfg ├── docker ├── docker-compose.yaml └── Dockerfile ├── src └── recsys_training │ ├── __init__.py │ ├── evaluation.py │ ├── utils.py │ └── data.py ├── environment.yml ├── setup.py ├── .pre-commit-config.yaml ├── .coveragerc ├── .gitignore ├── LICENSE.txt ├── scripts └── train_model.py ├── README.md └── setup.cfg /configs/.gitignore: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /references/.gitignore: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /reports/figures/.gitignore: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/_static/.gitignore: -------------------------------------------------------------------------------- 1 | # Empty directory 2 | 
-------------------------------------------------------------------------------- /postBuild: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -ex 3 | pip install . 4 | -------------------------------------------------------------------------------- /docs/authors.rst: -------------------------------------------------------------------------------- 1 | .. _authors: 2 | .. include:: ../AUTHORS.rst 3 | -------------------------------------------------------------------------------- /data/raw/ml-100k/u.info: -------------------------------------------------------------------------------- 1 | 943 users 2 | 1682 items 3 | 100000 ratings 4 | -------------------------------------------------------------------------------- /docs/changelog.rst: -------------------------------------------------------------------------------- 1 | .. _changes: 2 | .. include:: ../CHANGELOG.rst 3 | -------------------------------------------------------------------------------- /notebooks/fm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mkurovski/recsys_training/HEAD/notebooks/fm.png -------------------------------------------------------------------------------- /notebooks/parrot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mkurovski/recsys_training/HEAD/notebooks/parrot.png -------------------------------------------------------------------------------- /data/raw/.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore everything in this directory 2 | * 3 | # Except this file 4 | !.gitignore 5 | -------------------------------------------------------------------------------- /data/raw/ml-100k/u.item: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/mkurovski/recsys_training/HEAD/data/raw/ml-100k/u.item -------------------------------------------------------------------------------- /models/.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore everything in this directory 2 | * 3 | # Except this file 4 | !.gitignore 5 | -------------------------------------------------------------------------------- /data/external/.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore everything in this directory 2 | * 3 | # Except this file 4 | !.gitignore 5 | -------------------------------------------------------------------------------- /data/interim/.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore everything in this directory 2 | * 3 | # Except this file 4 | !.gitignore 5 | -------------------------------------------------------------------------------- /docs/license.rst: -------------------------------------------------------------------------------- 1 | .. _license: 2 | 3 | ======= 4 | License 5 | ======= 6 | 7 | .. 
include:: ../LICENSE.txt 8 | -------------------------------------------------------------------------------- /AUTHORS.rst: -------------------------------------------------------------------------------- 1 | ============ 2 | Contributors 3 | ============ 4 | 5 | * squall-1002 6 | -------------------------------------------------------------------------------- /data/preprocessed/.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore everything in this directory 2 | * 3 | # Except this file 4 | !.gitignore 5 | -------------------------------------------------------------------------------- /CHANGELOG.rst: -------------------------------------------------------------------------------- 1 | ========= 2 | Changelog 3 | ========= 4 | 5 | Version 0.1 6 | =========== 7 | 8 | - Feature A added 9 | - FIX: nasty bug #1729 fixed 10 | - add your changes here! 11 | -------------------------------------------------------------------------------- /data/.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore everything in this directory 2 | * 3 | # Except this file and .gitignore in sub directories 4 | !.gitignore 5 | !raw 6 | !external 7 | !preprocessed 8 | !interim 9 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Dummy conftest.py for recsys_training. 4 | 5 | If you don't know what this is for, just leave it empty. 
6 | Read more about conftest.py under: 7 | https://pytest.org/latest/plugins.html 8 | """ 9 | 10 | # import pytest 11 | -------------------------------------------------------------------------------- /data/raw/ml-100k/u.genre: -------------------------------------------------------------------------------- 1 | unknown|0 2 | Action|1 3 | Adventure|2 4 | Animation|3 5 | Children's|4 6 | Comedy|5 7 | Crime|6 8 | Documentary|7 9 | Drama|8 10 | Fantasy|9 11 | Film-Noir|10 12 | Horror|11 13 | Musical|12 14 | Mystery|13 15 | Romance|14 16 | Sci-Fi|15 17 | Thriller|16 18 | War|17 19 | Western|18 20 | 21 | -------------------------------------------------------------------------------- /data/raw/ml-100k/u.occupation: -------------------------------------------------------------------------------- 1 | administrator 2 | artist 3 | doctor 4 | educator 5 | engineer 6 | entertainment 7 | executive 8 | healthcare 9 | homemaker 10 | lawyer 11 | librarian 12 | marketing 13 | none 14 | other 15 | programmer 16 | retired 17 | salesman 18 | scientist 19 | student 20 | technician 21 | writer 22 | -------------------------------------------------------------------------------- /.isort.cfg: -------------------------------------------------------------------------------- 1 | [settings] 2 | line_length=88 3 | indent=' ' 4 | skip=.tox,.venv,build,dist 5 | known_standard_library=setuptools,pkg_resources 6 | known_test=pytest 7 | known_first_party=recsys_training 8 | sections=FUTURE,STDLIB,COMPAT,TEST,THIRDPARTY,FIRSTPARTY,LOCALFOLDER 9 | default_section=THIRDPARTY 10 | multi_line_output=3 11 | -------------------------------------------------------------------------------- /docker/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: '2' 2 | 3 | services: 4 | recsys-training-mle: 5 | image: recsys-training:mle 6 | container_name: recsys-training-mle 7 | command: "bash -c 'conda init bash && source /root/.bashrc && conda activate 
recsys_training && jupyter lab --no-browser --ip=* --port=8888 --allow-root --notebook-dir=/root/recsys_training/notebooks --NotebookApp.token=\"\"'" 8 | ports: 9 | - 8888:8888 10 | -------------------------------------------------------------------------------- /src/recsys_training/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from pkg_resources import get_distribution, DistributionNotFound 3 | 4 | try: 5 | # Change here if project is renamed and does not equal the package name 6 | dist_name = __name__ 7 | __version__ = get_distribution(dist_name).version 8 | except DistributionNotFound: 9 | __version__ = 'unknown' 10 | finally: 11 | del get_distribution, DistributionNotFound 12 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: recsys_training 2 | channels: 3 | - anaconda 4 | - conda-forge 5 | - defaults 6 | dependencies: 7 | - cython=0.29.23 8 | - ipython=7.22.0 9 | - jupyterlab=3.0.14 10 | - lightfm=1.16 11 | - matplotlib=3.3.4 12 | - notebook=6.3.0 13 | - numpy=1.20.1 14 | - pandas=1.2.4 15 | - pip=21.1 16 | - python=3.9.4 17 | - scikit-learn=0.24.1 18 | - scipy=1.6.2 19 | - seaborn=0.11.1 20 | - setuptools=49.6.0 21 | - statsmodels=0.12.2 22 | - pip: 23 | - "--editable=git+https://github.com/coreylynch/pyFM#egg=pyfm" 24 | - tqdm==4.60.0 25 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Setup file for recsys_training. 4 | Use setup.cfg to configure your project. 5 | 6 | This file was generated with PyScaffold 3.2.3. 7 | PyScaffold helps you to put up the scaffold of your new Python project. 
8 | Learn more under: https://pyscaffold.org/ 9 | """ 10 | import sys 11 | 12 | from pkg_resources import VersionConflict, require 13 | from setuptools import setup 14 | 15 | try: 16 | require('setuptools>=38.3') 17 | except VersionConflict: 18 | print("Error: version of setuptools is too old (<38.3)!") 19 | sys.exit(1) 20 | 21 | 22 | if __name__ == "__main__": 23 | setup(use_pyscaffold=False) 24 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | exclude: '^docs/conf.py' 2 | 3 | repos: 4 | - repo: git://github.com/pre-commit/pre-commit-hooks 5 | rev: v2.2.3 6 | hooks: 7 | - id: trailing-whitespace 8 | - id: check-added-large-files 9 | - id: check-ast 10 | - id: check-json 11 | - id: check-merge-conflict 12 | - id: check-xml 13 | - id: check-yaml 14 | - id: debug-statements 15 | - id: end-of-file-fixer 16 | - id: requirements-txt-fixer 17 | - id: mixed-line-ending 18 | args: ['--fix=no'] 19 | - id: flake8 20 | args: ['--max-line-length=88'] # default of Black 21 | 22 | - repo: https://github.com/pre-commit/mirrors-isort 23 | rev: v4.3.4 24 | hooks: 25 | - id: isort 26 | -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | # .coveragerc to control coverage.py 2 | [run] 3 | branch = True 4 | source = recsys_training 5 | # omit = bad_file.py 6 | 7 | [paths] 8 | source = 9 | src/ 10 | */site-packages/ 11 | 12 | [report] 13 | # Regexes for lines to exclude from consideration 14 | exclude_lines = 15 | # Have to re-enable the standard pragma 16 | pragma: no cover 17 | 18 | # Don't complain about missing debug-only code: 19 | def __repr__ 20 | if self\.debug 21 | 22 | # Don't complain if tests don't hit defensive assertion code: 23 | raise AssertionError 24 | raise NotImplementedError 25 | 26 | # Don't 
complain if non-runnable code isn't run: 27 | if 0: 28 | if __name__ == .__main__.: 29 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Temporary and binary files 2 | *~ 3 | *.py[cod] 4 | *.so 5 | *.cfg 6 | !.isort.cfg 7 | !setup.cfg 8 | *.orig 9 | *.log 10 | *.pot 11 | __pycache__/* 12 | .cache/* 13 | .*.swp 14 | */.ipynb_checkpoints/* 15 | .DS_Store 16 | 17 | # Project files 18 | .ropeproject 19 | .project 20 | .pydevproject 21 | .settings 22 | .idea 23 | tags 24 | 25 | # Package files 26 | *.egg 27 | *.eggs/ 28 | .installed.cfg 29 | *.egg-info 30 | 31 | # Unittest and coverage 32 | htmlcov/* 33 | .coverage 34 | .tox 35 | junit.xml 36 | coverage.xml 37 | .pytest_cache/ 38 | 39 | # Build and docs folder/files 40 | build/* 41 | dist/* 42 | sdist/* 43 | docs/api/* 44 | docs/_rst/* 45 | docs/_build/* 46 | cover/* 47 | MANIFEST 48 | 49 | # Per-project virtualenvs 50 | .venv*/ 51 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2019 squall-1002 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /scripts/train_model.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import logging 4 | from pathlib import Path 5 | import sys 6 | 7 | import click 8 | from IPython.core import ultratb 9 | 10 | import recsys_training 11 | 12 | # fallback to debugger on error 13 | sys.excepthook = ultratb.FormattedTB(mode='Verbose', color_scheme='Linux', call_pdb=1) 14 | 15 | _logger = logging.getLogger(__name__) 16 | 17 | 18 | @click.command() 19 | @click.option('-c', '--config', 'cfg_path', required=True, 20 | type=click.Path(exists=True), help='path to config file') 21 | @click.option('--quiet', 'log_level', flag_value=logging.WARNING, default=True) 22 | @click.option('-v', '--verbose', 'log_level', flag_value=logging.INFO) 23 | @click.option('-vv', '--very-verbose', 'log_level', flag_value=logging.DEBUG) 24 | @click.version_option(recsys_training.__version__) 25 | def main(cfg_path: Path, log_level: int): 26 | logging.basicConfig(stream=sys.stdout, 27 | level=log_level, 28 | datefmt='%Y-%m-%d %H:%M', 29 | format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') 30 | # YOUR CODE GOES HERE! 
Keep the main functionality in src/recsys_training 31 | # est = recsys_training.models.Estimator() 32 | 33 | 34 | if __name__ == '__main__': 35 | main() 36 | -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM debian:latest 2 | 3 | # $ docker build . -t continuumio/miniconda:latest -t continuumio/miniconda:4.5.11 -t continuumio/miniconda2:latest -t continuumio/miniconda2:4.5.11 4 | # $ docker run --rm -it continuumio/miniconda2:latest /bin/bash 5 | # $ docker push continuumio/miniconda:latest 6 | # $ docker push continuumio/miniconda:4.5.11 7 | # $ docker push continuumio/miniconda2:latest 8 | # $ docker push continuumio/miniconda2:4.5.11 9 | 10 | ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 11 | ENV PATH /opt/conda/bin:$PATH 12 | 13 | RUN apt-get update --fix-missing && apt-get install -y wget bzip2 ca-certificates \ 14 | libglib2.0-0 libxext6 libsm6 libxrender1 \ 15 | git mercurial subversion 16 | 17 | RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh && \ 18 | /bin/bash ~/miniconda.sh -b -p /opt/conda && \ 19 | rm ~/miniconda.sh && \ 20 | ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \ 21 | echo ". 
/opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc && \ 22 | echo "conda activate base" >> ~/.bashrc 23 | 24 | RUN apt-get install -y gcc unzip git curl grep sed dpkg && \ 25 | TINI_VERSION=`curl https://github.com/krallin/tini/releases/latest | grep -o "/v.*\"" | sed 's:^..\(.*\).$:\1:'` && \ 26 | curl -L "https://github.com/krallin/tini/releases/download/v${TINI_VERSION}/tini_${TINI_VERSION}.deb" > tini.deb && \ 27 | dpkg -i tini.deb && \ 28 | rm tini.deb && \ 29 | apt-get clean 30 | 31 | RUN cd ~ && \ 32 | git clone https://github.com/mkurovski/recsys_training.git && \ 33 | cd recsys_training && \ 34 | conda env create -f environment.yml 35 | 36 | RUN /bin/bash -c "source activate recsys_training && \ 37 | cd ~/recsys_training && \ 38 | python setup.py install" 39 | 40 | # Download and unzip data 41 | RUN wget http://files.grouplens.org/datasets/movielens/ml-100k.zip -O ~/recsys_training/data/raw/ml-100k.zip && \ 42 | cd ~/recsys_training/data/raw && \ 43 | unzip ml-100k.zip 44 | 45 | ENTRYPOINT [ "/usr/bin/tini", "--" ] 46 | CMD [ "/bin/bash" ] 47 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # recsys_training 2 | 3 | Recommender System Training Package 4 | 5 | [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/mkurovski/recsys_training/master) 6 | 7 | ## Description 8 | 9 | Hands-on Training for Recommender Systems developed for Machine Learning Essentials 2020. 10 | 11 | ## Installation 12 | 13 | In order to set up the necessary environment: 14 | 15 | 1. create an environment `recsys_training` with the help of [conda], 16 | 17 | ``` 18 | conda env create -f environment.yml 19 | ``` 20 | 21 | 2. activate the new environment with 22 | 23 | ``` 24 | conda activate recsys_training 25 | ``` 26 | 27 | 3. 
install `recsys_training` with: 28 | 29 | ``` 30 | python setup.py install # or develop 31 | ``` 32 | 33 | ### Docker 34 | 35 | Make sure you have `docker` and `docker-compose` installed. 36 | 37 | 1. Build the image using the `Dockerfile` in `docker` 38 | ``` 39 | docker build -t recsys-training:mle -f Dockerfile . 40 | ``` 41 | 42 | 2. Start the container with `docker-compose` pointing to the yaml-file 43 | ``` 44 | docker-compose -f docker/docker-compose.yaml up 45 | ``` 46 | 47 | The jupyter lab port `8888` will be mapped to the same port on your host machine, simply go to your preferred browser and enter via 48 | ``` 49 | http://localhost:8888/ 50 | ``` 51 | 52 | ## Usage 53 | 54 | There are 9 notebooks within `notebooks/` each starting with a number followed by `_e_` for exercise. Within `notebooks/solutions/` you will find all notebooks with a solution proposal implemented. It is strongly advised to go through the notebooks in numerically ascending order. 55 | 56 | We use MovieLens 100k as example dataset for the lessons. You can find the data in `data/raw/`. 57 | 58 | ## Note 59 | 60 | This project has been set up using PyScaffold 3.2.3 and the [dsproject extension] 0.4. 61 | For details and usage information on PyScaffold see https://pyscaffold.org/. 62 | 63 | [conda]: https://docs.conda.io/ 64 | [pre-commit]: https://pre-commit.com/ 65 | [Jupyter]: https://jupyter.org/ 66 | [nbstripout]: https://github.com/kynan/nbstripout 67 | [Google style]: http://google.github.io/styleguide/pyguide.html#38-comments-and-docstrings 68 | [dsproject extension]: https://github.com/pyscaffold/pyscaffoldext-dsproject 69 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | =============== 2 | recsys_training 3 | =============== 4 | 5 | This is the documentation of **recsys_training**. 6 | 7 | .. 
note:: 8 | 9 | This is the main page of your project's `Sphinx`_ documentation. 10 | It is formatted in `reStructuredText`_. Add additional pages 11 | by creating rst-files in ``docs`` and adding them to the `toctree`_ below. 12 | Use then `references`_ in order to link them from this page, e.g. 13 | :ref:`authors` and :ref:`changes`. 14 | 15 | It is also possible to refer to the documentation of other Python packages 16 | with the `Python domain syntax`_. By default you can reference the 17 | documentation of `Sphinx`_, `Python`_, `NumPy`_, `SciPy`_, `matplotlib`_, 18 | `Pandas`_, `Scikit-Learn`_. You can add more by extending the 19 | ``intersphinx_mapping`` in your Sphinx's ``conf.py``. 20 | 21 | The pretty useful extension `autodoc`_ is activated by default and lets 22 | you include documentation from docstrings. Docstrings can be written in 23 | `Google style`_ (recommended!), `NumPy style`_ and `classical style`_. 24 | 25 | 26 | Contents 27 | ======== 28 | 29 | .. toctree:: 30 | :maxdepth: 2 31 | 32 | License 33 | Authors 34 | Changelog 35 | Module Reference 36 | 37 | 38 | Indices and tables 39 | ================== 40 | 41 | * :ref:`genindex` 42 | * :ref:`modindex` 43 | * :ref:`search` 44 | 45 | .. _toctree: http://www.sphinx-doc.org/en/master/usage/restructuredtext/directives.html 46 | .. _reStructuredText: http://www.sphinx-doc.org/en/master/usage/restructuredtext/basics.html 47 | .. _references: http://www.sphinx-doc.org/en/stable/markup/inline.html 48 | .. _Python domain syntax: http://sphinx-doc.org/domains.html#the-python-domain 49 | .. _Sphinx: http://www.sphinx-doc.org/ 50 | .. _Python: http://docs.python.org/ 51 | .. _Numpy: http://docs.scipy.org/doc/numpy 52 | .. _SciPy: http://docs.scipy.org/doc/scipy/reference/ 53 | .. _matplotlib: https://matplotlib.org/contents.html# 54 | .. _Pandas: http://pandas.pydata.org/pandas-docs/stable 55 | .. _Scikit-Learn: http://scikit-learn.org/stable 56 | .. 
_autodoc: http://www.sphinx-doc.org/en/stable/ext/autodoc.html 57 | .. _Google style: https://github.com/google/styleguide/blob/gh-pages/pyguide.md#38-comments-and-docstrings 58 | .. _NumPy style: https://numpydoc.readthedocs.io/en/latest/format.html 59 | .. _classical style: http://www.sphinx-doc.org/en/stable/domains.html#info-field-lists 60 | -------------------------------------------------------------------------------- /notebooks/template.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import os\n", 10 | "import sys\n", 11 | "import math\n", 12 | "import logging\n", 13 | "from pathlib import Path\n", 14 | "\n", 15 | "import numpy as np\n", 16 | "import scipy as sp\n", 17 | "import sklearn\n", 18 | "import statsmodels.api as sm\n", 19 | "from statsmodels.formula.api import ols\n", 20 | "\n", 21 | "%load_ext autoreload\n", 22 | "%autoreload 2\n", 23 | "\n", 24 | "import matplotlib as mpl\n", 25 | "import matplotlib.pyplot as plt\n", 26 | "%matplotlib inline\n", 27 | "%config InlineBackend.figure_format = 'retina'\n", 28 | "\n", 29 | "import seaborn as sns\n", 30 | "sns.set_context(\"poster\")\n", 31 | "sns.set(rc={'figure.figsize': (16, 9.)})\n", 32 | "sns.set_style(\"whitegrid\")\n", 33 | "\n", 34 | "import pandas as pd\n", 35 | "pd.set_option(\"display.max_rows\", 120)\n", 36 | "pd.set_option(\"display.max_columns\", 120)\n", 37 | "\n", 38 | "logging.basicConfig(level=logging.INFO, stream=sys.stdout)" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "from recsys_training import *" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": {}, 53 | "source": [ 54 | "**PLEASE** save this file right now using the following naming convention: 
`NUMBER_FOR_SORTING-YOUR_INITIALS-SHORT_DESCRIPTION`, e.g. `1.0-fw-initial-data-exploration`. Use the number to order the file within the directory according to its usage." 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [] 63 | } 64 | ], 65 | "metadata": { 66 | "kernelspec": { 67 | "display_name": "Python 3", 68 | "language": "python", 69 | "name": "python3" 70 | }, 71 | "language_info": { 72 | "codemirror_mode": { 73 | "name": "ipython", 74 | "version": 3 75 | }, 76 | "file_extension": ".py", 77 | "mimetype": "text/x-python", 78 | "name": "python", 79 | "nbconvert_exporter": "python", 80 | "pygments_lexer": "ipython3", 81 | "version": "3.7.3" 82 | }, 83 | "pycharm": { 84 | "stem_cell": { 85 | "cell_type": "raw", 86 | "metadata": { 87 | "collapsed": false 88 | }, 89 | "source": [] 90 | } 91 | } 92 | }, 93 | "nbformat": 4, 94 | "nbformat_minor": 2 95 | } 96 | -------------------------------------------------------------------------------- /src/recsys_training/evaluation.py: -------------------------------------------------------------------------------- 1 | """ 2 | Ranking and Rating Evaluation Metrics 3 | """ 4 | __author__ = "Marcel Kurovski" 5 | __copyright__ = "Marcel Kurovski" 6 | __license__ = "mit" 7 | 8 | from typing import Dict, List, Tuple 9 | 10 | import numpy as np 11 | import pandas as pd 12 | 13 | 14 | def compute_mae(test_ratings: pd.DataFrame, recommender) -> Tuple[float, float]: 15 | pred = test_ratings.apply(lambda row: 16 | recommender.get_prediction(row['user'], row['item']), 17 | axis=1) 18 | 19 | pred = pred.apply(lambda val: list(val.values())[0]['pred']) 20 | notnulls = pred.notnull() 21 | mae = np.mean(np.abs(test_ratings.rating[notnulls] - pred[notnulls])) 22 | coverage = notnulls.sum()/len(test_ratings) 23 | 24 | return {'mae': mae, 'coverage': coverage} 25 | 26 | 27 | # TODO: Remove min_rating logic from here (should be done before on data through 
binarize) 28 | def retrieval_score(test_ratings: pd.DataFrame, 29 | recommender, 30 | remove_known_pos: bool = False, 31 | metric: str = 'mrr') -> float: 32 | """ 33 | Mean Average Precision / Mean Reciprocal Rank of first relevant item @ N 34 | """ 35 | N = recommender.N 36 | user_scores = [] 37 | relevant_items = get_relevant_items(test_ratings) 38 | 39 | for user in recommender.users: 40 | if user in relevant_items.keys(): 41 | predicted_items = recommender.get_recommendations(user, remove_known_pos) 42 | predicted_items = [item for item, _ in predicted_items] 43 | if metric == 'map': 44 | true_positives = np.intersect1d(relevant_items[user], 45 | predicted_items) 46 | score = len(true_positives) / N 47 | elif metric == 'mrr': 48 | score = np.mean([reciprocal_rank(item, predicted_items) 49 | for item in relevant_items[user]]) 50 | else: 51 | raise ValueError(f"Unknown value {metric} for Argument `metric`") 52 | 53 | user_scores.append(score) 54 | 55 | return np.mean(user_scores) 56 | 57 | 58 | def reciprocal_rank(item: int, ranking: List[int]) -> float: 59 | rr = 0 60 | if item in ranking: 61 | rr = 1/(ranking.index(item)+1) 62 | 63 | return rr 64 | 65 | 66 | def get_relevant_items(test_ratings: pd.DataFrame) -> Dict[int, List[int]]: 67 | """ 68 | returns {user: [items]} as a list of relevant items per user 69 | for all users found in the test dataset 70 | """ 71 | relevant_items = test_ratings[['user', 'item']] 72 | relevant_items = relevant_items.groupby('user') 73 | relevant_items = {user: relevant_items.get_group(user)['item'].values 74 | for user in relevant_items.groups.keys()} 75 | 76 | return relevant_items 77 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | # This file is used to configure your project. 
2 | # Read more about the various options under: 3 | # http://setuptools.readthedocs.io/en/latest/setuptools.html#configuring-setup-using-setup-cfg-files 4 | 5 | [metadata] 6 | name = recsys_training 7 | description = Add a short description here! 8 | author = squall-1002 9 | author-email = marcel.kurovski@googlemail.com 10 | license = mit 11 | long-description = file: README.rst 12 | long-description-content-type = text/markdown 13 | # long-description-content-type = text/x-rst; charset=UTF-8 14 | url = https://github.com/pyscaffold/pyscaffold/ 15 | project-urls = 16 | Documentation = https://pyscaffold.org/ 17 | # Change if running only on Windows, Mac or Linux (comma-separated) 18 | platforms = any 19 | # Add here all kinds of additional classifiers as defined under 20 | # https://pypi.python.org/pypi?%3Aaction=list_classifiers 21 | classifiers = 22 | Development Status :: 4 - Beta 23 | Programming Language :: Python 24 | 25 | [options] 26 | zip_safe = False 27 | packages = find: 28 | include_package_data = True 29 | package_dir = 30 | =src 31 | # DON'T CHANGE THE FOLLOWING LINE! IT WILL BE UPDATED BY PYSCAFFOLD! 32 | setup_requires = pyscaffold>=3.2a0,<3.3a0 33 | # Add here dependencies of your project (semicolon/line-separated), e.g. 34 | # install_requires = numpy; scipy 35 | # The usage of test_requires is discouraged, see `Dependency Management` docs 36 | # tests_require = pytest; pytest-cov 37 | # Require a specific Python version, e.g. 
Python 2.7 or >= 3.4 38 | # python_requires = >=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.* 39 | 40 | [options.packages.find] 41 | where = src 42 | exclude = 43 | tests 44 | 45 | [options.extras_require] 46 | # Add here additional requirements for extra features, to install with: 47 | # `pip install recsys_training[PDF]` like: 48 | # PDF = ReportLab; RXP 49 | # Add here test requirements (semicolon/line-separated) 50 | testing = 51 | pytest 52 | pytest-cov 53 | 54 | [options.entry_points] 55 | # Add here console scripts like: 56 | # console_scripts = 57 | # script_name = recsys_training.module:function 58 | # For example: 59 | # console_scripts = 60 | # fibonacci = recsys_training.skeleton:run 61 | # And any other entry points, for example: 62 | # pyscaffold.cli = 63 | # awesome = pyscaffoldext.awesome.extension:AwesomeExtension 64 | 65 | [test] 66 | # py.test options when running `python setup.py test` 67 | # addopts = --verbose 68 | extras = True 69 | 70 | [tool:pytest] 71 | # Options for py.test: 72 | # Specify command line options as you would do when invoking py.test directly. 73 | # e.g. --cov-report html (or xml) for html/xml output or --junitxml junit.xml 74 | # in order to write a coverage file that can be read by Jenkins. 
"""
Utility functions
"""
__author__ = "Marcel Kurovski"
__copyright__ = "Marcel Kurovski"
__license__ = "mit"

import logging
import sys
from typing import Dict, Optional, Tuple

import pandas as pd
import numpy as np
import scipy as sp
import scipy.sparse  # guarantee `sp.sparse` is available on all scipy versions


def setup_logging(loglevel: int):
    """Setup basic logging to stdout.

    Args:
        loglevel (int): minimum loglevel for emitting messages
    """
    logformat = "[%(asctime)s] %(levelname)s:%(name)s:%(message)s"
    logging.basicConfig(level=loglevel, stream=sys.stdout,
                        format=logformat, datefmt="%Y-%m-%d %H:%M:%S")


def get_entity_sim(a: int, b: int,
                   entity_ratings: Dict[int, Dict[int, float]],
                   metric: str = 'pearson') -> Tuple[Optional[float], int]:
    """Compute the similarity of two entities (e.g. two items or two users)
    from the ratings they share with common counterparts.

    Supported metrics:
    * ``pearson``: Pearson correlation
    * ``cosine``: cosine similarity
    * ``euclidean``: normalized euclidean similarity
    * ``adj_cosine``: adjusted cosine similarity (placeholder, not implemented)

    Jaccard similarity (intersection over union) is deliberately not offered:
    it ignores the rating values, so even if exactly the same users rated two
    items (maximal Jaccard similarity as evidence for high item similarity),
    their judgements on the two items may differ strongly, which would
    actually justify dissimilarity.

    Args:
        a: id of the first entity
        b: id of the second entity
        entity_ratings: mapping from entity id to a dict of
            counterpart id -> rating value
        metric: one of 'pearson', 'cosine', 'euclidean', 'adj_cosine'

    Returns:
        tuple of (similarity or None, number of joint ratings); the
        similarity is None when fewer than two joint ratings exist or for
        the unimplemented 'adj_cosine' metric

    Raises:
        ValueError: if `metric` is not supported
    """
    # 1. isolate the counterparts that have rated both entities a and b
    key_intersection = set(entity_ratings[a].keys()).intersection(entity_ratings[b].keys())
    ratings = np.array([(entity_ratings[a][key], entity_ratings[b][key]) for key in key_intersection])
    n_joint_ratings = len(ratings)

    if n_joint_ratings > 1:
        # 2. apply the requested similarity computation technique
        if metric == 'pearson':
            # emits a warning and yields nan if one entity's ratings have zero variance
            sim = np.corrcoef(ratings, rowvar=False)[0, 1]
        elif metric == 'cosine':
            nom = ratings[:, 0].dot(ratings[:, 1])
            denom = np.linalg.norm(ratings[:, 0]) * np.linalg.norm(ratings[:, 1])
            sim = nom / denom
        elif metric == 'euclidean':
            sim = normalized_euclidean_sim(ratings[:, 0], ratings[:, 1])
        elif metric == 'adj_cosine':
            # adjusted cosine similarity is not implemented yet
            sim = None
        else:
            # bug fix: the message previously referred to a non-existent
            # argument named 'mode' - the parameter is called 'metric'
            raise ValueError(f"Value {metric} for argument 'metric' not supported.")
    else:
        # not enough overlap to judge similarity
        sim = None

    return sim, n_joint_ratings


def normalized_euclidean_sim(a: np.ndarray, b: np.ndarray) -> float:
    """Map the euclidean distance of the unit-scaled vectors onto a
    similarity in [-1, 1] (distance 0 -> 1, maximal distance 2 -> -1)."""
    # scale to unit vectors
    a_norm = a / np.linalg.norm(a)
    b_norm = b / np.linalg.norm(b)

    dist = np.linalg.norm(a_norm - b_norm)
    # equivalent to the former `2 - dist - 1`, just written clearly
    sim = 1 - dist
    return sim


def min_max_scale(val, bounds: Dict[str, float]) -> float:
    """Linearly scale `val` into [0, 1] given `bounds` with keys 'min'/'max'."""
    min_max_range = bounds['max'] - bounds['min']
    return (val - bounds['min']) / min_max_range


def sigmoid(x):
    """Logistic function, mapping any real number into (0, 1)."""
    return 1 / (1 + np.exp(-x))


def df_to_coo(df: pd.DataFrame, n_users: int, n_items: int) -> sp.sparse.coo_matrix:
    """Turn a dataframe with 1-indexed `user` and `item` columns into a
    binary (n_users, n_items) interaction matrix in COO format (0-indexed)."""
    coo = sp.sparse.coo_matrix(([1] * len(df), (df.user.values - 1, df.item.values - 1)),
                               shape=(n_users, n_items), dtype=np.int32)
    return coo


def coo_to_df(coo) -> pd.DataFrame:
    """Inverse of `df_to_coo`: recover the 1-indexed (user, item) pairs."""
    mat = np.concatenate((coo.row.reshape(-1, 1) + 1,
                          coo.col.reshape(-1, 1) + 1),
                         axis=1)
    return pd.DataFrame(mat, columns=['user', 'item'])


def get_sparsity(sparse_arr) -> float:
    """Return the fraction of zero entries in a scipy sparse matrix."""
    num_elements = sparse_arr.shape[0] * sparse_arr.shape[1]
    num_nonzero_elements = sparse_arr.nnz
    density = num_nonzero_elements / num_elements
    return 1 - density


def one_hot_encode_ids(ids: np.ndarray, length: int) -> np.ndarray:
    """One-hot encode an array of 0-indexed ids into a (len(ids), length) matrix."""
    one_hot_enc = np.zeros((len(ids), length))
    one_hot_enc[np.arange(len(ids)), ids] = 1
    return one_hot_enc
MovieLens comes in different sizes regarding the number of movie ratings, users, and items. Take a look at the GroupLens website and explore them yourself.
28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 1, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "import numpy as np\n", 37 | "import pandas as pd\n", 38 | "import seaborn as sns" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 2, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "from recsys_training.data import genres" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 3, 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "ml100k_ratings_filepath = '../data/raw/ml-100k/u.data'\n", 57 | "ml100k_item_filepath = '../data/raw/ml-100k/u.item'\n", 58 | "ml100k_user_filepath = '../data/raw/ml-100k/u.user'" 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": {}, 64 | "source": [ 65 | "## Load Data" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 4, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "ratings = pd.read_csv(ml100k_ratings_filepath,\n", 75 | " sep='\\t',\n", 76 | " header=None,\n", 77 | " names=['user', 'item', 'rating', 'timestamp'],\n", 78 | " engine='python')" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 5, 84 | "metadata": {}, 85 | "outputs": [], 86 | "source": [ 87 | "items = pd.read_csv(ml100k_item_filepath, sep='|', header=None,\n", 88 | " names=['item', 'title', 'release', 'video_release', 'imdb_url']+genres,\n", 89 | " engine='python')" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 6, 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "users = pd.read_csv(ml100k_user_filepath, sep='|', header=None,\n", 99 | " names=['user', 'age', 'gender', 'occupation', 'zip'])" 100 | ] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "metadata": {}, 105 | "source": [ 106 | "## Data Exploration" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": {}, 112 | "source": [ 113 | "In this unit, we like to get a better 
Therefore, let's have a look at some statistics to get familiar with the data and algorithms.
_logger = logging.getLogger(__name__)


# Genre columns of the MovieLens 100k `u.item` file, in file order
genres = [
    'unknown',
    'Action',
    'Adventure',
    'Animation',
    'Children',
    'Comedy',
    'Crime',
    'Documentary',
    'Drama',
    'Fantasy',
    'Film-Noir',
    'Horror',
    'Musical',
    'Mystery',
    'Romance',
    'Sci-Fi',
    'Thriller',
    'War',
    'Western'
]


# TODO: Generalize initialization into from dataframes and from file
class Dataset(object):
    """Loads the MovieLens 100k ratings and offers rating splitting,
    implicit-feedback filtering, and per-user rating lookups."""

    def __init__(self, filepath: str):
        """
        Args:
            filepath: path to the tab-separated `u.data` ratings file
        """
        self.filepath = filepath
        self._load()

    def _load(self):
        """Read the ratings file and derive user/item/rating counts."""
        self.ratings = pd.read_csv(self.filepath,
                                   sep='\t',
                                   header=None,
                                   names=['user', 'item', 'rating', 'timestamp'],
                                   engine='python')
        self.users = sorted(self.ratings['user'].unique())
        self.items = sorted(self.ratings['item'].unique())
        self.n_users = len(self.users)
        self.n_items = len(self.items)
        self.n_ratings = len(self.ratings)

    def rating_split(self, train_size: float = 0.8, seed: int = 42):
        """Randomly split the ratings into `train_ratings` and `test_ratings`.

        This is a split over individual ratings instead of a user/item split,
        so a user or item may end up in only one of the two sets.

        Args:
            train_size: fraction of ratings assigned to the training set
                (bug fix: annotated as float, was mistyped as int)
            seed: seed for numpy's random number generator
        """
        np.random.seed(seed)
        idxs = np.random.choice(self.n_ratings, size=self.n_ratings, replace=False)
        split_point = int(self.n_ratings * train_size)
        train_idxs, test_idxs = idxs[:split_point], idxs[split_point:]
        self.train_ratings = self.ratings.loc[train_idxs]
        self.test_ratings = self.ratings.loc[test_idxs]

    def filter(self, min_rating: float = 4.0):
        """Only keep ratings above threshold as positive implicit feedback,
        dropping the timestamp column and reindexing."""
        idxs = self.ratings[self.ratings['rating'] >= min_rating].index.values
        self.ratings = self.ratings.loc[idxs, ['user', 'item', 'rating']]
        self.ratings.reset_index(drop=True, inplace=True)
        self.n_ratings = len(self.ratings)

    def get_user_ratings(self, dataset: str = 'train') -> Dict[int, Dict[int, float]]:
        """Return a mapping user -> {item: rating} for the chosen split.

        Args:
            dataset: 'train' for the training split, anything else for test

        Returns:
            dict with an entry for every known user; users without ratings
            in the chosen split map to an empty dict (bug fix: previously
            `grouped.get_group` raised a KeyError for such users, which is
            guaranteed to occur for the smaller split of a rating split)
        """
        if dataset == 'train':
            ratings = self.train_ratings
        else:
            ratings = self.test_ratings
        grouped = ratings[['user', 'item', 'rating']].groupby('user')

        user_ratings = {}
        for user in self.users:
            if user in grouped.groups:
                vals = grouped.get_group(user)[['item', 'rating']].values
                user_ratings[user] = dict(zip(vals[:, 0].astype(int),
                                              vals[:, 1].astype(float)))
            else:
                # user has no ratings in this split
                user_ratings[user] = {}

        return user_ratings


def preprocess_users(users: pd.DataFrame, zip_digits_to_cut: int = 3) -> pd.DataFrame:
    """Turn the raw MovieLens user frame into numeric features.

    Scales `age` into [0, 1], maps `occupation` and `gender` to integer ids,
    and truncates `zip` codes by `zip_digits_to_cut` trailing digits
    (non-numeric zips are replaced by '00000' first).

    Note: mutates and returns the passed-in frame.
    """
    user_age_bounds = {'min': users['age'].min(),
                       'max': users['age'].max()}
    occupations = sorted(users['occupation'].unique())
    user_occupation_map = dict(zip(occupations, range(len(occupations))))
    genders = sorted(users['gender'].unique())
    user_gender_map = dict(zip(genders, range(len(genders))))
    idxs = users[~users['zip'].str.isnumeric()].index
    users.loc[idxs, 'zip'] = '00000'

    users['age'] = users['age'].apply(lambda age: min_max_scale(age, user_age_bounds))
    users['occupation'] = users['occupation'].map(user_occupation_map)
    users['gender'] = users['gender'].map(user_gender_map)
    users['zip'] = users['zip'].apply(lambda val: int(val) // 10 ** zip_digits_to_cut)

    return users


def preprocess_items(items: pd.DataFrame) -> pd.DataFrame:
    """Turn the raw MovieLens item frame into numeric features.

    Parses `release` ('DD-Mon-YYYY') into numeric `release_month` and
    `release_year`, imputes missing release dates, scales the year into
    [0, 1], and drops the text columns.

    Note: mutates and returns the passed-in frame.
    """
    idxs = items[items['release'].notnull()].index
    items.loc[idxs, 'release_month'] = items.loc[idxs, 'release'].str.split('-')
    items.loc[idxs, 'release_month'] = \
        items.loc[idxs, 'release_month'].apply(lambda val: val[1])
    items.loc[idxs, 'release_year'] = items.loc[idxs, 'release'].str.split('-')
    items.loc[idxs, 'release_year'] = \
        items.loc[idxs, 'release_year'].apply(lambda val: val[2]).astype(int)

    # map three-letter month abbreviations ('Jan', ...) to 1..12
    release_month_map = dict((v, k) for k, v in enumerate(calendar.month_abbr))
    items.loc[idxs, 'release_month'] = items.loc[idxs, 'release_month'].map(
        release_month_map)

    # impute missing release dates with the most frequent month and the
    # median year (the `describe()['50%']` entry)
    top_month = items['release_month'].value_counts().index[0]
    top_year = items.loc[idxs, 'release_year'].astype(int).describe()['50%']
    idx = items[items['release'].isnull()].index
    items.loc[idx, 'release_month'] = top_month
    items.loc[idx, 'release_year'] = top_year

    item_year_bounds = {'min': items['release_year'].min(),
                        'max': items['release_year'].max()}
    items['release_year'] = items['release_year'].apply(
        lambda year: min_max_scale(year, item_year_bounds))
    items.drop(['title', 'release', 'video_release', 'imdb_url'], axis=1, inplace=True)

    return items


def get_user_profiles(ratings: pd.DataFrame, prep_items: pd.DataFrame) -> pd.DataFrame:
    """Aggregate each user's positively rated items (rating >= 4) into a
    profile of mean genre shares and release-year statistics.

    Args:
        ratings: raw ratings with 'user', 'item', 'rating', 'timestamp'
        prep_items: output of `preprocess_items`

    Returns:
        one row per user with genre distribution plus release-year
        mean/std/median
    """
    min_rating = 4
    # bug fix: work on a copy so the in-place drops below do not operate on
    # (and warn about) a slice of the caller's frame
    ratings = ratings[ratings.rating >= min_rating].copy()
    ratings.drop(['rating', 'timestamp'], axis=1, inplace=True)
    ratings = ratings.merge(prep_items, on='item', how='left')
    ratings.drop(['item', 'release_month'], axis=1, inplace=True)
    grouped = ratings.groupby('user')
    profiles = grouped.apply(user_profiler).reset_index()
    profiles.rename(columns={'50%': 'median'}, inplace=True)

    return profiles


def user_profiler(group: pd.DataFrame) -> pd.Series:
    """Reduce one user's item rows to mean genre shares plus the
    mean/std/median of the scaled release year."""
    genre_dist = group[genres].mean()
    year_dist = group['release_year'].describe()[['mean', 'std', '50%']]

    return pd.concat((genre_dist, year_dist), axis=0)
5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = ../build/sphinx/ 9 | AUTODOCDIR = api 10 | AUTODOCBUILD = sphinx-apidoc 11 | PROJECT = recsys_training 12 | MODULEDIR = ../src/recsys_training 13 | 14 | # User-friendly check for sphinx-build 15 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $?), 1) 16 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) 17 | endif 18 | 19 | # Internal variables. 20 | PAPEROPT_a4 = -D latex_paper_size=a4 21 | PAPEROPT_letter = -D latex_paper_size=letter 22 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 23 | # the i18n builder cannot share the environment and doctrees with the others 24 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
25 | 26 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext doc-requirements 27 | 28 | help: 29 | @echo "Please use \`make ' where is one of" 30 | @echo " html to make standalone HTML files" 31 | @echo " dirhtml to make HTML files named index.html in directories" 32 | @echo " singlehtml to make a single large HTML file" 33 | @echo " pickle to make pickle files" 34 | @echo " json to make JSON files" 35 | @echo " htmlhelp to make HTML files and a HTML help project" 36 | @echo " qthelp to make HTML files and a qthelp project" 37 | @echo " devhelp to make HTML files and a Devhelp project" 38 | @echo " epub to make an epub" 39 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 40 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 41 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 42 | @echo " text to make text files" 43 | @echo " man to make manual pages" 44 | @echo " texinfo to make Texinfo files" 45 | @echo " info to make Texinfo files and run them through makeinfo" 46 | @echo " gettext to make PO message catalogs" 47 | @echo " changes to make an overview of all changed/added/deprecated items" 48 | @echo " xml to make Docutils-native XML files" 49 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 50 | @echo " linkcheck to check all external links for integrity" 51 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 52 | 53 | clean: 54 | rm -rf $(BUILDDIR)/* $(AUTODOCDIR) 55 | 56 | $(AUTODOCDIR): $(MODULEDIR) 57 | mkdir -p $@ 58 | $(AUTODOCBUILD) -f -o $@ $^ 59 | 60 | doc-requirements: $(AUTODOCDIR) 61 | 62 | html: doc-requirements 63 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 64 | @echo 65 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 
66 | 67 | dirhtml: doc-requirements 68 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 69 | @echo 70 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 71 | 72 | singlehtml: doc-requirements 73 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 74 | @echo 75 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 76 | 77 | pickle: doc-requirements 78 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 79 | @echo 80 | @echo "Build finished; now you can process the pickle files." 81 | 82 | json: doc-requirements 83 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 84 | @echo 85 | @echo "Build finished; now you can process the JSON files." 86 | 87 | htmlhelp: doc-requirements 88 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 89 | @echo 90 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 91 | ".hhp project file in $(BUILDDIR)/htmlhelp." 92 | 93 | qthelp: doc-requirements 94 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 95 | @echo 96 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 97 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 98 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/$(PROJECT).qhcp" 99 | @echo "To view the help file:" 100 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/$(PROJECT).qhc" 101 | 102 | devhelp: doc-requirements 103 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 104 | @echo 105 | @echo "Build finished." 106 | @echo "To view the help file:" 107 | @echo "# mkdir -p $HOME/.local/share/devhelp/$(PROJECT)" 108 | @echo "# ln -s $(BUILDDIR)/devhelp $HOME/.local/share/devhelp/$(PROJEC)" 109 | @echo "# devhelp" 110 | 111 | epub: doc-requirements 112 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 113 | @echo 114 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 
115 | 116 | patch-latex: 117 | find _build/latex -iname "*.tex" | xargs -- \ 118 | sed -i'' 's~includegraphics{~includegraphics\[keepaspectratio,max size={\\textwidth}{\\textheight}\]{~g' 119 | 120 | latex: doc-requirements 121 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 122 | $(MAKE) patch-latex 123 | @echo 124 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 125 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 126 | "(use \`make latexpdf' here to do that automatically)." 127 | 128 | latexpdf: doc-requirements 129 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 130 | $(MAKE) patch-latex 131 | @echo "Running LaTeX files through pdflatex..." 132 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 133 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 134 | 135 | latexpdfja: doc-requirements 136 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 137 | @echo "Running LaTeX files through platex and dvipdfmx..." 138 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 139 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 140 | 141 | text: doc-requirements 142 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 143 | @echo 144 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 145 | 146 | man: doc-requirements 147 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 148 | @echo 149 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 150 | 151 | texinfo: doc-requirements 152 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 153 | @echo 154 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 155 | @echo "Run \`make' in that directory to run these through makeinfo" \ 156 | "(use \`make info' here to do that automatically)." 157 | 158 | info: doc-requirements 159 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 160 | @echo "Running Texinfo files through makeinfo..." 
161 | make -C $(BUILDDIR)/texinfo info 162 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 163 | 164 | gettext: doc-requirements 165 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 166 | @echo 167 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 168 | 169 | changes: doc-requirements 170 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 171 | @echo 172 | @echo "The overview file is in $(BUILDDIR)/changes." 173 | 174 | linkcheck: doc-requirements 175 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 176 | @echo 177 | @echo "Link check complete; look for any errors in the above output " \ 178 | "or in $(BUILDDIR)/linkcheck/output.txt." 179 | 180 | doctest: doc-requirements 181 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 182 | @echo "Testing of doctests in the sources finished, look at the " \ 183 | "results in $(BUILDDIR)/doctest/output.txt." 184 | 185 | xml: doc-requirements 186 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 187 | @echo 188 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 189 | 190 | pseudoxml: doc-requirements 191 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 192 | @echo 193 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 194 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is execfile()d with the current directory set to its containing dir. 4 | # 5 | # Note that not all possible configuration values are present in this 6 | # autogenerated file. 7 | # 8 | # All configuration values have a default; values that are commented out 9 | # serve to show the default. 
10 | 11 | import os 12 | import sys 13 | import inspect 14 | import shutil 15 | 16 | __location__ = os.path.join(os.getcwd(), os.path.dirname( 17 | inspect.getfile(inspect.currentframe()))) 18 | 19 | # If extensions (or modules to document with autodoc) are in another directory, 20 | # add these directories to sys.path here. If the directory is relative to the 21 | # documentation root, use os.path.abspath to make it absolute, like shown here. 22 | sys.path.insert(0, os.path.join(__location__, '../src')) 23 | 24 | # -- Run sphinx-apidoc ------------------------------------------------------ 25 | # This hack is necessary since RTD does not issue `sphinx-apidoc` before running 26 | # `sphinx-build -b html . _build/html`. See Issue: 27 | # https://github.com/rtfd/readthedocs.org/issues/1139 28 | # DON'T FORGET: Check the box "Install your project inside a virtualenv using 29 | # setup.py install" in the RTD Advanced Settings. 30 | # Additionally it helps us to avoid running apidoc manually 31 | 32 | try: # for Sphinx >= 1.7 33 | from sphinx.ext import apidoc 34 | except ImportError: 35 | from sphinx import apidoc 36 | 37 | output_dir = os.path.join(__location__, "api") 38 | module_dir = os.path.join(__location__, "../src/recsys_training") 39 | try: 40 | shutil.rmtree(output_dir) 41 | except FileNotFoundError: 42 | pass 43 | 44 | try: 45 | import sphinx 46 | from pkg_resources import parse_version 47 | 48 | cmd_line_template = "sphinx-apidoc -f -o {outputdir} {moduledir}" 49 | cmd_line = cmd_line_template.format(outputdir=output_dir, moduledir=module_dir) 50 | 51 | args = cmd_line.split(" ") 52 | if parse_version(sphinx.__version__) >= parse_version('1.7'): 53 | args = args[1:] 54 | 55 | apidoc.main(args) 56 | except Exception as e: 57 | print("Running `sphinx-apidoc` failed!\n{}".format(e)) 58 | 59 | # -- General configuration ----------------------------------------------------- 60 | 61 | # If your documentation needs a minimal Sphinx version, state it here. 
62 | # needs_sphinx = '1.0' 63 | 64 | # Add any Sphinx extension module names here, as strings. They can be extensions 65 | # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 66 | extensions = ['sphinx.ext.autodoc', 'sphinx.ext.intersphinx', 'sphinx.ext.todo', 67 | 'sphinx.ext.autosummary', 'sphinx.ext.viewcode', 'sphinx.ext.coverage', 68 | 'sphinx.ext.doctest', 'sphinx.ext.ifconfig', 'sphinx.ext.mathjax', 69 | 'sphinx.ext.napoleon'] 70 | 71 | # Add any paths that contain templates here, relative to this directory. 72 | templates_path = ['_templates'] 73 | 74 | 75 | # To configure AutoStructify 76 | def setup(app): 77 | from recommonmark.transform import AutoStructify 78 | app.add_config_value('recommonmark_config', { 79 | 'auto_toc_tree_section': 'Contents', 80 | 'enable_eval_rst': True, 81 | 'enable_auto_doc_ref': True, 82 | 'enable_math': True, 83 | 'enable_inline_math': True 84 | }, True) 85 | app.add_transform(AutoStructify) 86 | 87 | # Additional parsers besides rst 88 | source_parsers = { 89 | '.md': 'recommonmark.parser.CommonMarkParser', 90 | } 91 | 92 | # The suffix of source filenames. 93 | source_suffix = ['.rst', '.md'] 94 | 95 | # The encoding of source files. 96 | # source_encoding = 'utf-8-sig' 97 | 98 | # The master toctree document. 99 | master_doc = 'index' 100 | 101 | # General information about the project. 102 | project = u'recsys_training' 103 | copyright = u'2019, squall-1002' 104 | 105 | # The version info for the project you're documenting, acts as replacement for 106 | # |version| and |release|, also used in various other places throughout the 107 | # built documents. 108 | # 109 | # The short X.Y version. 110 | version = '' # Is set by calling `setup.py docs` 111 | # The full version, including alpha/beta/rc tags. 112 | release = '' # Is set by calling `setup.py docs` 113 | 114 | # The language for content autogenerated by Sphinx. Refer to documentation 115 | # for a list of supported languages. 
116 | # language = None 117 | 118 | # There are two options for replacing |today|: either, you set today to some 119 | # non-false value, then it is used: 120 | # today = '' 121 | # Else, today_fmt is used as the format for a strftime call. 122 | # today_fmt = '%B %d, %Y' 123 | 124 | # List of patterns, relative to source directory, that match files and 125 | # directories to ignore when looking for source files. 126 | exclude_patterns = ['_build'] 127 | 128 | # The reST default role (used for this markup: `text`) to use for all documents. 129 | # default_role = None 130 | 131 | # If true, '()' will be appended to :func: etc. cross-reference text. 132 | # add_function_parentheses = True 133 | 134 | # If true, the current module name will be prepended to all description 135 | # unit titles (such as .. function::). 136 | # add_module_names = True 137 | 138 | # If true, sectionauthor and moduleauthor directives will be shown in the 139 | # output. They are ignored by default. 140 | # show_authors = False 141 | 142 | # The name of the Pygments (syntax highlighting) style to use. 143 | pygments_style = 'sphinx' 144 | 145 | # A list of ignored prefixes for module index sorting. 146 | # modindex_common_prefix = [] 147 | 148 | # If true, keep warnings as "system message" paragraphs in the built documents. 149 | # keep_warnings = False 150 | 151 | 152 | # -- Options for HTML output --------------------------------------------------- 153 | 154 | # The theme to use for HTML and HTML Help pages. See the documentation for 155 | # a list of builtin themes. 156 | html_theme = 'alabaster' 157 | 158 | # Theme options are theme-specific and customize the look and feel of a theme 159 | # further. For a list of options available for each theme, see the 160 | # documentation. 161 | html_theme_options = { 162 | 'sidebar_width': '300px', 163 | 'page_width': '1200px' 164 | } 165 | 166 | # Add any paths that contain custom themes here, relative to this directory. 
167 | # html_theme_path = [] 168 | 169 | # The name for this set of Sphinx documents. If None, it defaults to 170 | # " v documentation". 171 | try: 172 | from recsys_training import __version__ as version 173 | except ImportError: 174 | pass 175 | else: 176 | release = version 177 | 178 | # A shorter title for the navigation bar. Default is the same as html_title. 179 | # html_short_title = None 180 | 181 | # The name of an image file (relative to this directory) to place at the top 182 | # of the sidebar. 183 | # html_logo = "" 184 | 185 | # The name of an image file (within the static path) to use as favicon of the 186 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 187 | # pixels large. 188 | # html_favicon = None 189 | 190 | # Add any paths that contain custom static files (such as style sheets) here, 191 | # relative to this directory. They are copied after the builtin static files, 192 | # so a file named "default.css" will overwrite the builtin "default.css". 193 | html_static_path = ['_static'] 194 | 195 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 196 | # using the given strftime format. 197 | # html_last_updated_fmt = '%b %d, %Y' 198 | 199 | # If true, SmartyPants will be used to convert quotes and dashes to 200 | # typographically correct entities. 201 | # html_use_smartypants = True 202 | 203 | # Custom sidebar templates, maps document names to template names. 204 | # html_sidebars = {} 205 | 206 | # Additional templates that should be rendered to pages, maps page names to 207 | # template names. 208 | # html_additional_pages = {} 209 | 210 | # If false, no module index is generated. 211 | # html_domain_indices = True 212 | 213 | # If false, no index is generated. 214 | # html_use_index = True 215 | 216 | # If true, the index is split into individual pages for each letter. 217 | # html_split_index = False 218 | 219 | # If true, links to the reST sources are added to the pages. 
220 | # html_show_sourcelink = True 221 | 222 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 223 | # html_show_sphinx = True 224 | 225 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 226 | # html_show_copyright = True 227 | 228 | # If true, an OpenSearch description file will be output, and all pages will 229 | # contain a tag referring to it. The value of this option must be the 230 | # base URL from which the finished HTML is served. 231 | # html_use_opensearch = '' 232 | 233 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 234 | # html_file_suffix = None 235 | 236 | # Output file base name for HTML help builder. 237 | htmlhelp_basename = 'recsys_training-doc' 238 | 239 | 240 | # -- Options for LaTeX output -------------------------------------------------- 241 | 242 | latex_elements = { 243 | # The paper size ('letterpaper' or 'a4paper'). 244 | # 'papersize': 'letterpaper', 245 | 246 | # The font size ('10pt', '11pt' or '12pt'). 247 | # 'pointsize': '10pt', 248 | 249 | # Additional stuff for the LaTeX preamble. 250 | # 'preamble': '', 251 | } 252 | 253 | # Grouping the document tree into LaTeX files. List of tuples 254 | # (source start file, target name, title, author, documentclass [howto/manual]). 255 | latex_documents = [ 256 | ('index', 'user_guide.tex', u'recsys_training Documentation', 257 | u'squall-1002', 'manual'), 258 | ] 259 | 260 | # The name of an image file (relative to this directory) to place at the top of 261 | # the title page. 262 | # latex_logo = "" 263 | 264 | # For "manual" documents, if this is true, then toplevel headings are parts, 265 | # not chapters. 266 | # latex_use_parts = False 267 | 268 | # If true, show page references after internal links. 269 | # latex_show_pagerefs = False 270 | 271 | # If true, show URL addresses after external links. 272 | # latex_show_urls = False 273 | 274 | # Documents to append as an appendix to all manuals. 
275 | # latex_appendices = [] 276 | 277 | # If false, no module index is generated. 278 | # latex_domain_indices = True 279 | 280 | # -- External mapping ------------------------------------------------------------ 281 | python_version = '.'.join(map(str, sys.version_info[0:2])) 282 | intersphinx_mapping = { 283 | 'sphinx': ('http://www.sphinx-doc.org/en/stable', None), 284 | 'python': ('https://docs.python.org/' + python_version, None), 285 | 'matplotlib': ('https://matplotlib.org', None), 286 | 'numpy': ('https://docs.scipy.org/doc/numpy', None), 287 | 'sklearn': ('http://scikit-learn.org/stable', None), 288 | 'pandas': ('http://pandas.pydata.org/pandas-docs/stable', None), 289 | 'scipy': ('https://docs.scipy.org/doc/scipy/reference', None), 290 | } 291 | -------------------------------------------------------------------------------- /notebooks/3_e_demographic_recs.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Unit 3: Demographic Recommendations" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "In this section we leave the boring field of unpersonalized content and do our first steps for more personalization. But, before tailoring content to individuals we first tailor content to groups of individuals that by some criteria seem to be similar and therefore - assumed to - consume similar content.\n", 15 | "\n", 16 | "We distinguish individuals into groups by using demographic information we have on these individuals. This can be any of\n", 17 | "* age\n", 18 | "* gender\n", 19 | "* citizenship\n", 20 | "* income\n", 21 | "* etc." 
22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "import itertools\n", 31 | "from typing import List\n", 32 | "\n", 33 | "import numpy as np\n", 34 | "import pandas as pd" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "from recsys_training.data import Dataset\n", 44 | "from recsys_training.evaluation import get_relevant_items" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "ml100k_ratings_filepath = '../data/raw/ml-100k/u.data'\n", 54 | "ml100k_user_filepath = '../data/raw/ml-100k/u.user'" 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": {}, 60 | "source": [ 61 | "## Load Data" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "data = Dataset(ml100k_ratings_filepath)\n", 71 | "data.rating_split(seed=42)\n", 72 | "user_ratings = data.get_user_ratings()" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "MovieLens also provides some demographic data on users along with the datasets. We will user _age_ and _gender_ in this tutorial to create different groups." 
80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": [ 88 | "users = pd.read_csv(ml100k_user_filepath, sep='|', header=None,\n", 89 | " names=['user', 'age', 'gender', 'occupation', 'zip'])" 90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": {}, 95 | "source": [ 96 | "## Explore Data" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "users.head()" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [ 114 | "users.age.hist()" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "metadata": {}, 120 | "source": [ 121 | "Let's define 2 x 6 user groups by splitting by gender and age class (see advice [here](https://support.google.com/analytics/answer/2799357?hl=de))" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "gender_groups = ['M', 'F']\n", 131 | "age_groups = [(18, 24),\n", 132 | " (25, 34),\n", 133 | " (35, 44),\n", 134 | " (45, 54),\n", 135 | " (55, 65),\n", 136 | " (65, 73)]\n", 137 | "\n", 138 | "user_groups = list(itertools.product(gender_groups, age_groups))\n", 139 | "user_group_indices = range(len(user_groups))\n", 140 | "user_groups = dict(zip(user_group_indices, user_groups))" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "user_groups" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 158 | "def assign_group(row, age_groups=age_groups):\n", 159 | " for age_group in age_groups:\n", 160 | " if row['age'] >= age_group[0] and row['age'] <= age_group[1]:\n", 161 | " 
break\n", 162 | " return (row['gender'], age_group)" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "metadata": {}, 169 | "outputs": [], 170 | "source": [ 171 | "users['group'] = users.apply(lambda row: assign_group(row, age_groups), axis=1)" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": null, 177 | "metadata": {}, 178 | "outputs": [], 179 | "source": [ 180 | "users['group'] = users['group'].map(lambda val: list(user_groups.values()).index(val))" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": null, 186 | "metadata": {}, 187 | "outputs": [], 188 | "source": [ 189 | "users['group'].value_counts()" 190 | ] 191 | }, 192 | { 193 | "cell_type": "markdown", 194 | "metadata": {}, 195 | "source": [ 196 | "![](../parrot.png)\n", 197 | "\n", 198 | "For each group we use popularity recommendations based on the groups historical viewing popularity.\n", 199 | "\n", 200 | "**Task**: Infer the `group_popularities` as a mapping from group index to the item ordering array." 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": null, 206 | "metadata": {}, 207 | "outputs": [], 208 | "source": [ 209 | "group_popularities = dict.fromkeys(user_group_indices)" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": null, 215 | "metadata": {}, 216 | "outputs": [], 217 | "source": [ 218 | "for group_idx in user_group_indices:\n", 219 | " pass" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": null, 225 | "metadata": {}, 226 | "outputs": [], 227 | "source": [ 228 | "group_popularities" 229 | ] 230 | }, 231 | { 232 | "cell_type": "markdown", 233 | "metadata": {}, 234 | "source": [ 235 | "![](parrot.png)\n", 236 | "\n", 237 | "**Task:** Adapt `get_recommendations` from the previous notebook and compute the $MAP@10$ for demographic recommendations." 
238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": null, 243 | "metadata": {}, 244 | "outputs": [], 245 | "source": [ 246 | "user_group_map = dict(zip(users['user'].values,users['group'].values))" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": null, 252 | "metadata": {}, 253 | "outputs": [], 254 | "source": [ 255 | "def get_recommendations(user: int,\n", 256 | " user_ratings: dict,\n", 257 | " item_popularity_order: np.array,\n", 258 | " N: int) -> List[int]:\n", 259 | " known_positives = None\n", 260 | " recommendations = None\n", 261 | " \n", 262 | " return recommendations" 263 | ] 264 | }, 265 | { 266 | "cell_type": "markdown", 267 | "metadata": {}, 268 | "source": [ 269 | "## Evaluation Evaluating the Relevance of Recommendations" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": 19, 275 | "metadata": {}, 276 | "outputs": [], 277 | "source": [ 278 | "relevant_items = get_relevant_items(data.test_ratings)" 279 | ] 280 | }, 281 | { 282 | "cell_type": "markdown", 283 | "metadata": {}, 284 | "source": [ 285 | "Computing $MAP@10$" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": 20, 291 | "metadata": {}, 292 | "outputs": [], 293 | "source": [ 294 | "N = 10" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": 21, 300 | "metadata": {}, 301 | "outputs": [], 302 | "source": [ 303 | "users = relevant_items.keys()\n", 304 | "prec_at_N = dict.fromkeys(users)\n", 305 | "\n", 306 | "for user in users:\n", 307 | " recommendations = get_recommendations(user,\n", 308 | " user_ratings,\n", 309 | " user_group_map,\n", 310 | " group_popularities,\n", 311 | " N=N)\n", 312 | " hits = np.intersect1d(recommendations,\n", 313 | " relevant_items[user])\n", 314 | " prec_at_N[user] = len(hits)/N" 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "execution_count": 22, 320 | "metadata": {}, 321 | "outputs": [ 322 | { 323 | "data": { 324 | 
"text/plain": [ 325 | "0.06404255319148937" 326 | ] 327 | }, 328 | "execution_count": 22, 329 | "metadata": {}, 330 | "output_type": "execute_result" 331 | } 332 | ], 333 | "source": [ 334 | "np.mean(list(prec_at_N.values()))" 335 | ] 336 | }, 337 | { 338 | "cell_type": "markdown", 339 | "metadata": {}, 340 | "source": [ 341 | "What is the $MAP@10$ for ea. specific group?" 342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "execution_count": 23, 347 | "metadata": {}, 348 | "outputs": [], 349 | "source": [ 350 | "group_maps = dict.fromkeys(user_group_indices, list())\n", 351 | "for user in users:\n", 352 | " group_maps[user_group_map[user]].append(prec_at_N[user])\n", 353 | "for group in user_group_indices:\n", 354 | " group_maps[group] = np.mean(group_maps[group])" 355 | ] 356 | }, 357 | { 358 | "cell_type": "code", 359 | "execution_count": 24, 360 | "metadata": {}, 361 | "outputs": [ 362 | { 363 | "data": { 364 | "text/plain": [ 365 | "{0: 0.06404255319148937,\n", 366 | " 1: 0.06404255319148937,\n", 367 | " 2: 0.06404255319148937,\n", 368 | " 3: 0.06404255319148937,\n", 369 | " 4: 0.06404255319148937,\n", 370 | " 5: 0.06404255319148937,\n", 371 | " 6: 0.06404255319148937,\n", 372 | " 7: 0.06404255319148937,\n", 373 | " 8: 0.06404255319148937,\n", 374 | " 9: 0.06404255319148937,\n", 375 | " 10: 0.06404255319148937,\n", 376 | " 11: 0.06404255319148937}" 377 | ] 378 | }, 379 | "execution_count": 24, 380 | "metadata": {}, 381 | "output_type": "execute_result" 382 | } 383 | ], 384 | "source": [ 385 | "group_maps" 386 | ] 387 | } 388 | ], 389 | "metadata": { 390 | "kernelspec": { 391 | "display_name": "Python 3", 392 | "language": "python", 393 | "name": "python3" 394 | }, 395 | "language_info": { 396 | "codemirror_mode": { 397 | "name": "ipython", 398 | "version": 3 399 | }, 400 | "file_extension": ".py", 401 | "mimetype": "text/x-python", 402 | "name": "python", 403 | "nbconvert_exporter": "python", 404 | "pygments_lexer": "ipython3", 405 | "version": "3.7.5" 
406 | } 407 | }, 408 | "nbformat": 4, 409 | "nbformat_minor": 4 410 | } 411 | -------------------------------------------------------------------------------- /notebooks/4_e_cf_knn_rating_pred.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Unit 4: Neighborhood-based Collaborative Filtering for Rating Prediction" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "In this section we generate personalized recommendations for the first time. We exploit rating similarities among users and items to identify similar users and items that assist in finding the relevant items to recommend for each user.\n", 15 | "\n", 16 | "This describes the fundamental idea behind Collaborative Filtering (CF) and using kNN is a neighborhood-based approach towards CF. In a later unit we will also have a look at model-based approaches.\n", 17 | "\n", 18 | "This is also the first time we try to predict user ratings for unknown items using rating predictions to take the top-$N$ items with the highest rating predictions and recommend those to the user." 
19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 3, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "from collections import OrderedDict\n", 28 | "import itertools\n", 29 | "from typing import Dict, List, Tuple\n", 30 | "\n", 31 | "import numpy as np\n", 32 | "import pandas as pd" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 1, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "from recsys_training.data import Dataset\n", 42 | "from recsys_training.evaluation import get_relevant_items\n", 43 | "from recsys_training.utils import get_entity_sim" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "ml100k_ratings_filepath = '../data/raw/ml-100k/u.data'" 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "metadata": {}, 58 | "source": [ 59 | "## Load Data" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "data = Dataset(ml100k_ratings_filepath)\n", 69 | "data.rating_split(seed=42)\n", 70 | "user_ratings = data.get_user_ratings()" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": [ 77 | "The idea behind this recommender is to use item ratings of the $k$ most similar users (neighbors). We identify those _nearest neighbors_ with a similarity metric which we apply to the ratings both, root user and possible neighbor, have in common. Similarity thereby means having a similar opinion on movies.\n", 78 | "\n", 79 | "The steps are as follows:\n", 80 | "\n", 81 | "1. Compute user-user similarities (we use the Pearson Correlation Coefficient here, but feel free to try other similarity metrics)\n", 82 | "\n", 83 | "2. For each user:\n", 84 | "\n", 85 | " 1. Get the k nearest neighbors along with their similarities\n", 86 | " 2. 
Collect the neighborhood item ratings and ignore those already rated by the root user\n", 87 | " 3. Item Rating Prediction: Compute the similarity-weighted sum of neighborhood item ratings\n", 88 | " 4. Recommendations: Get the $N$ items with the highest ratings that have a minimum rating count" 89 | ] 90 | }, 91 | { 92 | "cell_type": "markdown", 93 | "metadata": {}, 94 | "source": [ 95 | "### 1. User-User Similarities" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "sim_metric = 'pearson'\n", 105 | "user_user_sims = {}\n", 106 | "user_pairs = itertools.combinations(data.users, 2)" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": {}, 112 | "source": [ 113 | "The following takes a few seconds to finish ..." 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "metadata": {}, 120 | "outputs": [], 121 | "source": [ 122 | "for pair in user_pairs:\n", 123 | " user_user_sims[pair] = get_entity_sim(pair[0], pair[1],\n", 124 | " user_ratings,\n", 125 | " sim_metric)" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "metadata": {}, 132 | "outputs": [], 133 | "source": [ 134 | "user_user_sims[(1,4)]" 135 | ] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "metadata": {}, 140 | "source": [ 141 | "## 2. Computing Recommendations" 142 | ] 143 | }, 144 | { 145 | "cell_type": "markdown", 146 | "metadata": {}, 147 | "source": [ 148 | "### A. Implement Nearest Neighbors for a given user\n", 149 | "\n", 150 | "![](Parrot.png)\n", 151 | "\n", 152 | "**Task:** It's your turn again. Complete `get_k_nearest_neighbors` to return a sorted list of the $k$ nearest neighbors - identified by their id - for a given user, each along with its similarity." 
153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": 4, 158 | "metadata": {}, 159 | "outputs": [], 160 | "source": [ 161 | "def get_k_nearest_neighbors(user: int, k: int, user_user_sims: dict) -> List[Tuple[int, float]]:\n", 162 | " neighbors = set(data.users)\n", 163 | " neighbors.remove(user)\n", 164 | "\n", 165 | " nearest_neighbors = dict()\n", 166 | " \n", 167 | " pass\n", 168 | " \n", 169 | " return nearest_neighbors[:k]" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": null, 175 | "metadata": {}, 176 | "outputs": [], 177 | "source": [ 178 | "user_neighbors = get_k_nearest_neighbors(1, k=10, user_user_sims=user_user_sims)" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": null, 184 | "metadata": {}, 185 | "outputs": [], 186 | "source": [ 187 | "user_neighbors" 188 | ] 189 | }, 190 | { 191 | "cell_type": "markdown", 192 | "metadata": {}, 193 | "source": [ 194 | "### B. Obtain the Neighborhood Ratings\n", 195 | "\n", 196 | "![](Parrot.png)\n", 197 | "\n", 198 | "**Task:** Now, use the nearest neighbors and get their ratings, but leave out the items our root user has already rated (known positives). Return a mapping from unknown item to a list of dicts with neighbor similarity and item rating." 
199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": null, 204 | "metadata": {}, 205 | "outputs": [], 206 | "source": [ 207 | "def get_neighborhood_ratings(user, user_neighbors: List[Tuple[int, float]]) -> Dict[int, List[Dict[str, float]]]:\n", 208 | " neighborhood_ratings = dict()\n", 209 | " \n", 210 | " pass\n", 211 | " \n", 212 | " return neighborhood_ratings" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": null, 218 | "metadata": {}, 219 | "outputs": [], 220 | "source": [ 221 | "neighborhood_ratings = get_neighborhood_ratings(1, user_neighbors)" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": null, 227 | "metadata": {}, 228 | "outputs": [], 229 | "source": [ 230 | "neighborhood_ratings" 231 | ] 232 | }, 233 | { 234 | "cell_type": "markdown", 235 | "metadata": {}, 236 | "source": [ 237 | "### C. Compute Rating Predictions from Neighborhood Ratings\n", 238 | "\n", 239 | "![](Parrot.png)\n", 240 | "\n", 241 | "**Task:** In this step, we estimate ratings for the seed user based on the neighborhood ratings. We implement a similarity weighted average of neighbor ratings for that. Return a mapping from item to its prediction and the count of neighbor ratings received." 
242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": null, 247 | "metadata": {}, 248 | "outputs": [], 249 | "source": [ 250 | "def compute_rating_pred(neighborhood_ratings: dict) -> dict:\n", 251 | " rating_preds = dict()\n", 252 | " \n", 253 | " pass\n", 254 | "\n", 255 | " return rating_preds" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": null, 261 | "metadata": {}, 262 | "outputs": [], 263 | "source": [ 264 | "rating_preds = compute_rating_pred(neighborhood_ratings)" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": null, 270 | "metadata": {}, 271 | "outputs": [], 272 | "source": [ 273 | "list(rating_preds.items())[:20]" 274 | ] 275 | }, 276 | { 277 | "cell_type": "markdown", 278 | "metadata": {}, 279 | "source": [ 280 | "### D. Compute the Top-$N$ Recommendation Items\n", 281 | "\n", 282 | "![](Parrot.png)\n", 283 | "\n", 284 | "**Task:** The last step takes the rating predictions and returns the $N$ highest predictions which have a minimum rating count, i.e. the number of neighbors from the neighborhood that rated this item." 
285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": null, 290 | "metadata": {}, 291 | "outputs": [], 292 | "source": [ 293 | "def compute_top_n(rating_preds: dict, min_count: int, N: int) -> OrderedDict:\n", 294 | " pass\n", 295 | " \n", 296 | " return OrderedDict(sorted_rating_preds[:N])" 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": null, 302 | "metadata": {}, 303 | "outputs": [], 304 | "source": [ 305 | "top_n_recs = compute_top_n(rating_preds, min_count=2, N=10)" 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": null, 311 | "metadata": {}, 312 | "outputs": [], 313 | "source": [ 314 | "top_n_recs" 315 | ] 316 | }, 317 | { 318 | "cell_type": "markdown", 319 | "metadata": {}, 320 | "source": [ 321 | "### Combine all steps in `get_recommendations`" 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": null, 327 | "metadata": {}, 328 | "outputs": [], 329 | "source": [ 330 | "def get_recommendations(user: int,\n", 331 | " user_user_sims: dict,\n", 332 | " k: int,\n", 333 | " C: int,\n", 334 | " N: int):\n", 335 | " user_neighbors = get_k_nearest_neighbors(user, k=k, user_user_sims=user_user_sims)\n", 336 | " neighborhood_ratings = get_neighborhood_ratings(user, user_neighbors)\n", 337 | " rating_preds = compute_rating_pred(neighborhood_ratings)\n", 338 | " top_n_recs = compute_top_n(rating_preds, min_count=C, N=N)\n", 339 | " return top_n_recs" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": null, 345 | "metadata": {}, 346 | "outputs": [], 347 | "source": [ 348 | "get_recommendations(1, user_user_sims, 10, 2, 10)" 349 | ] 350 | }, 351 | { 352 | "cell_type": "markdown", 353 | "metadata": {}, 354 | "source": [ 355 | "### Evaluation" 356 | ] 357 | }, 358 | { 359 | "cell_type": "markdown", 360 | "metadata": {}, 361 | "source": [ 362 | "Let's check the performance of the neighborhood- and user-based recommender for a neighborhood size of 
$k = 60$, minimum rating count of $C = 10$ and stay with $N = 10$ recommendations." 363 | ] 364 | }, 365 | { 366 | "cell_type": "code", 367 | "execution_count": null, 368 | "metadata": {}, 369 | "outputs": [], 370 | "source": [ 371 | "k = 60\n", 372 | "C = 10\n", 373 | "N = 10" 374 | ] 375 | }, 376 | { 377 | "cell_type": "code", 378 | "execution_count": null, 379 | "metadata": {}, 380 | "outputs": [], 381 | "source": [ 382 | "relevant_items = get_relevant_items(data.test_ratings)" 383 | ] 384 | }, 385 | { 386 | "cell_type": "code", 387 | "execution_count": null, 388 | "metadata": {}, 389 | "outputs": [], 390 | "source": [ 391 | "users = relevant_items.keys()\n", 392 | "prec_at_N = dict.fromkeys(data.users)\n", 393 | "\n", 394 | "for user in users:\n", 395 | " recommendations = get_recommendations(user, user_user_sims, k, C, N)\n", 396 | " recommendations = list(recommendations.keys())\n", 397 | " hits = np.intersect1d(recommendations,\n", 398 | " relevant_items[user])\n", 399 | " prec_at_N[user] = len(hits)/N" 400 | ] 401 | }, 402 | { 403 | "cell_type": "code", 404 | "execution_count": null, 405 | "metadata": {}, 406 | "outputs": [], 407 | "source": [ 408 | "np.mean([val for val in prec_at_N.values() if val is not None])" 409 | ] 410 | } 411 | ], 412 | "metadata": { 413 | "kernelspec": { 414 | "display_name": "Python 3", 415 | "language": "python", 416 | "name": "python3" 417 | }, 418 | "language_info": { 419 | "codemirror_mode": { 420 | "name": "ipython", 421 | "version": 3 422 | }, 423 | "file_extension": ".py", 424 | "mimetype": "text/x-python", 425 | "name": "python", 426 | "nbconvert_exporter": "python", 427 | "pygments_lexer": "ipython3", 428 | "version": "3.9.4" 429 | }, 430 | "pycharm": { 431 | "stem_cell": { 432 | "cell_type": "raw", 433 | "metadata": { 434 | "collapsed": false 435 | }, 436 | "source": [] 437 | } 438 | } 439 | }, 440 | "nbformat": 4, 441 | "nbformat_minor": 4 442 | } 443 | 
-------------------------------------------------------------------------------- /notebooks/2_e_popularity_recs.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Unit 2: Popularity Recommendations" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "In this section we build a recommender that sorts items by popularity as of the number of ratings they received. As a result we return the $N$ most popular items as recommendations." 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "from typing import Dict, List\n", 24 | "\n", 25 | "import numpy as np\n", 26 | "import pandas as pd\n", 27 | "from scipy.stats import spearmanr" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "# `Dataset` is just a wrapper for the MovieLens training data\n", 37 | "from recsys_training.data import Dataset, genres" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": null, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "ml100k_ratings_filepath = '../data/raw/ml-100k/u.data'\n", 47 | "ml100k_item_filepath = '../data/raw/ml-100k/u.item'" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": {}, 53 | "source": [ 54 | "## Load Data" 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": {}, 60 | "source": [ 61 | "We load the dataset with 100,000 ratings and split it $4:1$ into train and test set.\n", 62 | "\n", 63 | "(**Remark**: We do not focus on proper hyperparameter search within this tutorial and therefore do not generate a separate validation dataset)" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": {}, 70 | "outputs": [], 
71 | "source": [ 72 | "data = Dataset(ml100k_ratings_filepath)\n", 73 | "data.rating_split(train_size=0.8, seed=42)" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": null, 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [ 82 | "items = pd.read_csv(ml100k_item_filepath, sep='|', header=None,\n", 83 | " names=['item', 'title', 'release', 'video_release', 'imdb_url']+genres,\n", 84 | " engine='python')" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | "data.train_ratings" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "data.test_ratings" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": {}, 108 | "source": [ 109 | "Build a Mapping from user id to its item ratings. We will need this later." 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [ 118 | "user_ratings = data.get_user_ratings()" 119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "metadata": {}, 124 | "source": [ 125 | "Show up to 20 user ratings for the first user" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "metadata": {}, 132 | "outputs": [], 133 | "source": [ 134 | "user = 1\n", 135 | "list(user_ratings[user].items())[:20]" 136 | ] 137 | }, 138 | { 139 | "cell_type": "markdown", 140 | "metadata": {}, 141 | "source": [ 142 | "## Popularity Ranking" 143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "metadata": {}, 148 | "source": [ 149 | "How do we define _popularity_? 
It turns out that there can be different things justifying the popularity of content:\n", 150 | "- **pure count**: simply count the number of ratings or interactions an item received regardless of their quality\n", 151 | "- **positive count**: only count the number of ratings or interactions that we assume reflect preference towards items, e.g. ratings above user mean ratings\n", 152 | "- **time-dependency**: despite evergreen stars items may also be popular for a limited time only - how can we account for this?\n", 153 | "\n", 154 | "**Remark**: Popularity ranking entails no personalization. We obtain a single popularity ranking of items which is independent from the user and serve the same top-$N$ items to every user." 155 | ] 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | "metadata": {}, 160 | "source": [ 161 | "### Popularity based on simple Interaction Counts" 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "metadata": {}, 167 | "source": [ 168 | "![](parrot.png)\n", 169 | "\n", 170 | "**Task**: Infer the item popularity order from training ratings as an array with items in descending order of popularity." 
171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": null, 176 | "metadata": {}, 177 | "outputs": [], 178 | "source": [ 179 | "item_popularity = data.train_ratings.item.value_counts()" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": null, 185 | "metadata": {}, 186 | "outputs": [], 187 | "source": [ 188 | "item_popularity" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": null, 194 | "metadata": {}, 195 | "outputs": [], 196 | "source": [ 197 | "item_order = item_popularity.values" 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": null, 203 | "metadata": {}, 204 | "outputs": [], 205 | "source": [ 206 | "item_order" 207 | ] 208 | }, 209 | { 210 | "cell_type": "markdown", 211 | "metadata": {}, 212 | "source": [ 213 | "What are the most popular movies?" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": null, 219 | "metadata": {}, 220 | "outputs": [], 221 | "source": [ 222 | "top_movie_ids = item_order[:5]\n", 223 | "items[items['item'].isin(top_movie_ids)][['item', 'title']]" 224 | ] 225 | }, 226 | { 227 | "cell_type": "markdown", 228 | "metadata": {}, 229 | "source": [ 230 | "### Popularity based on positive Interaction Counts\n", 231 | "\n", 232 | "We assume that the the mean rating for each user is the threshold above which movies are regarded as favorable and below which movies are deemed as bad.\n", 233 | "\n", 234 | "1. compute that user mean rating for each user.\n", 235 | "2. remove all ratings that fall below this threshold.\n", 236 | "3. apply the process above to the remaining ratings." 
237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": null, 242 | "metadata": {}, 243 | "outputs": [], 244 | "source": [ 245 | "user_mean_ratings = data.train_ratings[['user', 'rating']].groupby('user')\n", 246 | "user_mean_ratings = user_mean_ratings.mean().reset_index()\n", 247 | "user_mean_ratings.rename(columns={'rating': 'user_mean_rating'},\n", 248 | " inplace=True)" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": null, 254 | "metadata": {}, 255 | "outputs": [], 256 | "source": [ 257 | "user_mean_ratings" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": null, 263 | "metadata": {}, 264 | "outputs": [], 265 | "source": [ 266 | "positive_train_ratings = data.train_ratings.merge(user_mean_ratings,\n", 267 | " on='user',\n", 268 | " how='left')" 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": null, 274 | "metadata": {}, 275 | "outputs": [], 276 | "source": [ 277 | "keep_ratings = (positive_train_ratings['rating'] >= positive_train_ratings['user_mean_rating'])" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": null, 283 | "metadata": {}, 284 | "outputs": [], 285 | "source": [ 286 | "positive_train_ratings = positive_train_ratings[keep_ratings]\n", 287 | "positive_train_ratings.drop(columns='user_mean_rating', inplace=True)" 288 | ] 289 | }, 290 | { 291 | "cell_type": "code", 292 | "execution_count": null, 293 | "metadata": {}, 294 | "outputs": [], 295 | "source": [ 296 | "positive_train_ratings" 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": null, 302 | "metadata": {}, 303 | "outputs": [], 304 | "source": [ 305 | "item_popularity_positive = positive_train_ratings.item.value_counts()" 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": null, 311 | "metadata": {}, 312 | "outputs": [], 313 | "source": [ 314 | "item_popularity_positive" 315 | ] 316 | }, 317 | { 318 | "cell_type": 
"code", 319 | "execution_count": null, 320 | "metadata": {}, 321 | "outputs": [], 322 | "source": [ 323 | "item_order_positive = item_popularity_positive.index.values" 324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": null, 329 | "metadata": {}, 330 | "outputs": [], 331 | "source": [ 332 | "items[items['item'].isin(item_order_positive[:5])][['item', 'title']]" 333 | ] 334 | }, 335 | { 336 | "cell_type": "markdown", 337 | "metadata": {}, 338 | "source": [ 339 | "#### How strongly do both orderings correlate with each other?" 340 | ] 341 | }, 342 | { 343 | "cell_type": "markdown", 344 | "metadata": {}, 345 | "source": [ 346 | "Check the Spearman rank correlation between both orderings to quantify the distortion in ordering." 347 | ] 348 | }, 349 | { 350 | "cell_type": "code", 351 | "execution_count": null, 352 | "metadata": {}, 353 | "outputs": [], 354 | "source": [ 355 | "joint_counts = [[item_popularity.loc[item], item_popularity_positive[item]]\n", 356 | " for item in np.intersect1d(item_popularity_positive.index.values,\n", 357 | " item_popularity.index.values)]\n", 358 | "joint_counts = np.array(joint_counts)" 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": null, 364 | "metadata": {}, 365 | "outputs": [], 366 | "source": [ 367 | "joint_counts" 368 | ] 369 | }, 370 | { 371 | "cell_type": "code", 372 | "execution_count": null, 373 | "metadata": {}, 374 | "outputs": [], 375 | "source": [ 376 | "spearmanr(joint_counts)" 377 | ] 378 | }, 379 | { 380 | "cell_type": "markdown", 381 | "metadata": {}, 382 | "source": [ 383 | "### Using Popularity Ordering for top-$N$ Recommendations\n", 384 | "\n", 385 | "Now, we can produce recommendations from our popularity ordering." 386 | ] 387 | }, 388 | { 389 | "cell_type": "markdown", 390 | "metadata": {}, 391 | "source": [ 392 | "![](parrot.png)\n", 393 | "\n", 394 | "**Task**: Write a method `get_recommendations` that returns the top-$N$ items without any known positives, i.e. 
items the user has already viewed." 395 | ] 396 | }, 397 | { 398 | "cell_type": "code", 399 | "execution_count": null, 400 | "metadata": {}, 401 | "outputs": [], 402 | "source": [ 403 | "def get_recommendations(user: int,\n", 404 | " user_ratings: dict,\n", 405 | " item_popularity_order: np.array,\n", 406 | " N: int) -> List[int]:\n", 407 | " known_positives = None\n", 408 | " recommendations = None\n", 409 | " \n", 410 | " return recommendations" 411 | ] 412 | }, 413 | { 414 | "cell_type": "markdown", 415 | "metadata": {}, 416 | "source": [ 417 | "Try it ..." 418 | ] 419 | }, 420 | { 421 | "cell_type": "code", 422 | "execution_count": null, 423 | "metadata": {}, 424 | "outputs": [], 425 | "source": [ 426 | "get_recommendations(1, user_ratings, item_order, 10)" 427 | ] 428 | }, 429 | { 430 | "cell_type": "markdown", 431 | "metadata": {}, 432 | "source": [ 433 | "## Evaluating the Relevance of Recommendations" 434 | ] 435 | }, 436 | { 437 | "cell_type": "code", 438 | "execution_count": null, 439 | "metadata": {}, 440 | "outputs": [], 441 | "source": [ 442 | "def get_relevant_items(test_ratings: pd.DataFrame) -> Dict[int, List[int]]:\n", 443 | " \"\"\"\n", 444 | " returns {user: [items]} as a list of relevant items per user\n", 445 | " for all users found in the test dataset\n", 446 | " \"\"\"\n", 447 | " relevant_items = test_ratings[['user', 'item']]\n", 448 | " relevant_items = relevant_items.groupby('user')\n", 449 | " relevant_items = {user: relevant_items.get_group(user)['item'].values\n", 450 | " for user in relevant_items.groups.keys()}\n", 451 | "\n", 452 | " return relevant_items" 453 | ] 454 | }, 455 | { 456 | "cell_type": "code", 457 | "execution_count": null, 458 | "metadata": {}, 459 | "outputs": [], 460 | "source": [ 461 | "relevant_items = get_relevant_items(data.test_ratings)" 462 | ] 463 | }, 464 | { 465 | "cell_type": "code", 466 | "execution_count": null, 467 | "metadata": {}, 468 | "outputs": [], 469 | "source": [ 470 | "relevant_items[1]" 471 | 
] 472 | }, 473 | { 474 | "cell_type": "markdown", 475 | "metadata": {}, 476 | "source": [ 477 | "### $Precision@10$" 478 | ] 479 | }, 480 | { 481 | "cell_type": "markdown", 482 | "metadata": {}, 483 | "source": [ 484 | "Now, we can compute the intersection between the top-$N$ recommended items and the items each user interacted with. Ideally, we want every recommendation to be a hit, i.e. an item the user consumed. In this case the size of intersections is $N$ given $N$ recommendations which is a precision of 100% = $\\frac{N}{N}$.\n", 485 | "\n", 486 | "We compute the so called $Precision@N$ for every user and take the mean over all. The resulting metric is called _mean average precision at N_ or short $MAP@N$." 487 | ] 488 | }, 489 | { 490 | "cell_type": "markdown", 491 | "metadata": {}, 492 | "source": [ 493 | "![](parrot.png)\n", 494 | "\n", 495 | "**Task:** Compute the $MAP@N$ for popularity recommendations" 496 | ] 497 | }, 498 | { 499 | "cell_type": "code", 500 | "execution_count": null, 501 | "metadata": {}, 502 | "outputs": [], 503 | "source": [ 504 | "def get_precision(users: List[int], user_ratings: Dict[int, Dict[int, float]],\n", 505 | " item_order: np.array, N: int) -> Dict[int, float]:\n", 506 | " \n", 507 | " pass\n", 508 | " \n", 509 | " return prec_at_N" 510 | ] 511 | }, 512 | { 513 | "cell_type": "markdown", 514 | "metadata": {}, 515 | "source": [ 516 | "Try it ..." 
517 | ] 518 | }, 519 | { 520 | "cell_type": "code", 521 | "execution_count": null, 522 | "metadata": {}, 523 | "outputs": [], 524 | "source": [ 525 | "N = 10\n", 526 | "users = relevant_items.keys()" 527 | ] 528 | }, 529 | { 530 | "cell_type": "code", 531 | "execution_count": null, 532 | "metadata": {}, 533 | "outputs": [], 534 | "source": [ 535 | "prec_at_N = get_precision(users, user_ratings, item_order, N)" 536 | ] 537 | }, 538 | { 539 | "cell_type": "code", 540 | "execution_count": null, 541 | "metadata": {}, 542 | "outputs": [], 543 | "source": [ 544 | "np.mean(list(prec_at_N.values()))" 545 | ] 546 | } 547 | ], 548 | "metadata": { 549 | "kernelspec": { 550 | "display_name": "Python 3", 551 | "language": "python", 552 | "name": "python3" 553 | }, 554 | "language_info": { 555 | "codemirror_mode": { 556 | "name": "ipython", 557 | "version": 3 558 | }, 559 | "file_extension": ".py", 560 | "mimetype": "text/x-python", 561 | "name": "python", 562 | "nbconvert_exporter": "python", 563 | "pygments_lexer": "ipython3", 564 | "version": "3.9.4" 565 | }, 566 | "pycharm": { 567 | "stem_cell": { 568 | "cell_type": "raw", 569 | "metadata": { 570 | "collapsed": false 571 | }, 572 | "source": [] 573 | } 574 | } 575 | }, 576 | "nbformat": 4, 577 | "nbformat_minor": 4 578 | } 579 | -------------------------------------------------------------------------------- /notebooks/9_e_ligthfm.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Unit 9: LightFM" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "You almost made it - this is the final lesson and it is also going to be the easiest one.\n", 15 | "\n", 16 | "As you may already assume - there are a lot of recommender packages in Python out there. 
In this lesson we will look at LightFM - an easy to use and lightweight implementation of different approaches and algorithms (FM, BPR, WARP, ...) to perform CF, CBF and hybrid recommenders.\n", 17 | "\n", 18 | "Within a few lines of code we set-up, train and use a recommender for recommendations.\n", 19 | "\n", 20 | "* [LightFM on GitHub](https://github.com/lyst/lightfm)\n", 21 | "* [LightFM documentation](https://making.lyst.com/lightfm/docs/home.html)" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "import matplotlib.pyplot as plt\n", 31 | "import numpy as np\n", 32 | "import pandas as pd\n", 33 | "from scipy.sparse import coo_matrix\n", 34 | "\n", 35 | "from recsys_training.data import Dataset, genres" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "from lightfm.datasets import fetch_movielens\n", 45 | "from lightfm.evaluation import precision_at_k\n", 46 | "from lightfm import LightFM" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "ml100k_ratings_filepath = '../data/raw/ml-100k/u.data'\n", 56 | "ml100k_item_filepath = '../data/raw/ml-100k/u.item'\n", 57 | "ml100k_user_filepath = '../data/raw/ml-100k/u.user'" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "## Load Data" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": {}, 70 | "source": [ 71 | "### You may easily load Movielens Data ..."
72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "data = fetch_movielens(min_rating=4.0, genre_features=True)" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "data" 90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": {}, 95 | "source": [ 96 | "### But, we want to use the exact same data and split that we used in the lessons before" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "data = Dataset(ml100k_ratings_filepath)\n", 106 | "data.filter(min_rating=4.0)\n", 107 | "data.rating_split(seed=42)" 108 | ] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "metadata": {}, 113 | "source": [ 114 | "#### Transform our training and testing data into sparse matrices" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "# Train DataFrame to Train COO Matrix\n", 124 | "ratings = data.train_ratings[\"rating\"].values\n", 125 | "# We subtract 1 to make user/item ids 0-index-based\n", 126 | "rows = data.train_ratings[\"user\"].values - 1\n", 127 | "cols = data.train_ratings[\"item\"].values - 1\n", 128 | "\n", 129 | "train_mat = coo_matrix((ratings, (rows, cols)),\n", 130 | " shape=(data.n_users, data.n_items))\n", 131 | "\n", 132 | "\n", 133 | "# Test DataFrame to Test COO Matrix\n", 134 | "ratings = data.test_ratings[\"rating\"].values\n", 135 | "# We subtract 1 to make user/item ids 0-index-based\n", 136 | "rows = data.test_ratings[\"user\"].values - 1\n", 137 | "cols = data.test_ratings[\"item\"].values - 1\n", 138 | "\n", 139 | "test_mat = coo_matrix((ratings, (rows, cols)),\n", 140 | " shape=(data.n_users, data.n_items))" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 
| "execution_count": null, 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "train_mat" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 158 | "test_mat" 159 | ] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "metadata": {}, 164 | "source": [ 165 | "## Collaborative Filtering" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": null, 171 | "metadata": {}, 172 | "outputs": [], 173 | "source": [ 174 | "params = {\n", 175 | " 'no_components': 10,\n", 176 | " 'loss': 'bpr',\n", 177 | " 'learning_rate': 0.07,\n", 178 | " 'random_state': 42,\n", 179 | " 'user_alpha': 0.0002,\n", 180 | " 'item_alpha': 0.0002\n", 181 | "}\n", 182 | "\n", 183 | "epochs = 10\n", 184 | "\n", 185 | "N = 10" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": null, 191 | "metadata": {}, 192 | "outputs": [], 193 | "source": [ 194 | "cf_model = LightFM(**params)" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": null, 200 | "metadata": {}, 201 | "outputs": [], 202 | "source": [ 203 | "cf_model.fit(train_mat, epochs=epochs, verbose=True)" 204 | ] 205 | }, 206 | { 207 | "cell_type": "markdown", 208 | "metadata": {}, 209 | "source": [ 210 | "### Evaluate the `MAP@10` on test data\n", 211 | "\n", 212 | "If we provide training data with evaluation, known positives will be removed." 
213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": null, 218 | "metadata": {}, 219 | "outputs": [], 220 | "source": [ 221 | "prec_at_N = precision_at_k(cf_model, test_mat, train_mat, k=N)" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": null, 227 | "metadata": {}, 228 | "outputs": [], 229 | "source": [ 230 | "prec_at_N.mean()" 231 | ] 232 | }, 233 | { 234 | "cell_type": "markdown", 235 | "metadata": {}, 236 | "source": [ 237 | "### Evaluate the `MAP@10` on train data" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": null, 243 | "metadata": {}, 244 | "outputs": [], 245 | "source": [ 246 | "prec_at_N = precision_at_k(cf_model, train_mat, k=N)" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": null, 252 | "metadata": {}, 253 | "outputs": [], 254 | "source": [ 255 | "prec_at_N.mean()" 256 | ] 257 | }, 258 | { 259 | "cell_type": "markdown", 260 | "metadata": {}, 261 | "source": [ 262 | "Maybe try tuning the regularization to improve the recommendation relevancy - the `params` dictionary already contains `user_alpha` and `item_alpha`, so experiment with their values to find more appropriate ones."
263 | ] 264 | }, 265 | { 266 | "cell_type": "markdown", 267 | "metadata": {}, 268 | "source": [ 269 | "## Hybrid (CF + CBF)" 270 | ] 271 | }, 272 | { 273 | "cell_type": "markdown", 274 | "metadata": {}, 275 | "source": [ 276 | "### Load user and item features" 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": null, 282 | "metadata": {}, 283 | "outputs": [], 284 | "source": [ 285 | "def min_max_scale(val, bounds):\n", 286 | " min_max_range = bounds['max']-bounds['min']\n", 287 | " return (val-bounds['min'])/min_max_range\n", 288 | "\n", 289 | "\n", 290 | "def user_profiler(group):\n", 291 | " genre_dist = group[genres].mean()\n", 292 | " year_dist = group['release_year'].describe()[['mean', 'std', '50%']]\n", 293 | "\n", 294 | " return pd.concat((genre_dist, year_dist), axis=0)\n", 295 | "\n", 296 | "\n", 297 | "def get_user_profiles(ratings: pd.DataFrame,\n", 298 | " item_feat: pd.DataFrame,\n", 299 | " min_rating: float = 4.0) -> pd.DataFrame:\n", 300 | " ratings = ratings[ratings.rating >= min_rating]\n", 301 | " ratings = ratings[['user', 'item']]\n", 302 | " ratings = ratings.merge(item_feat, on='item', how='left')\n", 303 | " ratings.drop(['item'], axis=1, inplace=True)\n", 304 | "\n", 305 | " grouped = ratings.groupby('user')\n", 306 | " profiles = grouped.apply(user_profiler).reset_index()\n", 307 | " profiles.rename(columns={'50%': 'median'}, inplace=True)\n", 308 | " \n", 309 | " return profiles\n", 310 | "\n", 311 | "\n", 312 | "item_feat = pd.read_csv(ml100k_item_filepath, sep='|', header=None,\n", 313 | " names=['item', 'title', 'release', 'video_release', 'imdb_url']+genres,\n", 314 | " engine='python')\n", 315 | "\n", 316 | "user_feat = pd.read_csv(ml100k_user_filepath, sep='|', header=None,\n", 317 | " names=['user', 'age', 'gender', 'occupation', 'zip'])\n", 318 | "\n", 319 | "# Infer the release year\n", 320 | "idxs = item_feat[item_feat['release'].notnull()].index\n", 321 | "item_feat.loc[idxs, 'release_year'] = 
item_feat.loc[idxs, 'release'].str.split('-')\n", 322 | "item_feat.loc[idxs, 'release_year'] = item_feat.loc[idxs, 'release_year'].apply(lambda val: val[2]).astype(int)\n", 323 | "\n", 324 | "# Impute median release year value for the items with missing release year\n", 325 | "top_year = item_feat.loc[idxs, 'release_year'].astype(int).describe()['50%']\n", 326 | "idx = item_feat[item_feat['release'].isnull()].index\n", 327 | "item_feat.loc[idx, 'release_year'] = top_year\n", 328 | "\n", 329 | "# Min-max scale the release year\n", 330 | "item_year_bounds = {'min': item_feat['release_year'].min(),\n", 331 | " 'max': item_feat['release_year'].max()}\n", 332 | "item_feat['release_year'] = item_feat['release_year'].apply(\n", 333 | " lambda year: min_max_scale(year, item_year_bounds))\n", 334 | "\n", 335 | "# Drop other columns\n", 336 | "item_feat.drop(['title', 'release', 'video_release', 'imdb_url'], axis=1, inplace=True)\n", 337 | "\n", 338 | "# Min-max scale the age\n", 339 | "user_age_bounds = {'min': user_feat['age'].min(),\n", 340 | " 'max': user_feat['age'].max()}\n", 341 | "user_feat['age'] = user_feat['age'].apply(lambda age: min_max_scale(age, user_age_bounds))\n", 342 | "\n", 343 | "# Transform gender characters to numerical values (categories)\n", 344 | "genders = sorted(user_feat['gender'].unique())\n", 345 | "user_gender_map = dict(zip(genders, range(len(genders))))\n", 346 | "user_feat['gender'] = user_feat['gender'].map(user_gender_map)\n", 347 | "\n", 348 | "# Transform occupation strings to numerical values (categories)\n", 349 | "occupations = sorted(user_feat['occupation'].unique())\n", 350 | "user_occupation_map = dict(zip(occupations, range(len(occupations))))\n", 351 | "user_feat['occupation'] = user_feat['occupation'].map(user_occupation_map)\n", 352 | "\n", 353 | "# Transform the zip codes to categories keeping the first three digits and impute for missing\n", 354 | "idxs = user_feat[~user_feat['zip'].str.isnumeric()].index\n", 355 | 
"user_feat.loc[idxs, 'zip'] = '00000'\n", 356 | "zip_digits_to_cut = 3\n", 357 | "user_feat['zip'] = user_feat['zip'].apply(lambda val: int(val) // 10 ** zip_digits_to_cut)\n", 358 | "\n", 359 | "\n", 360 | "profiles = get_user_profiles(data.train_ratings, item_feat)\n", 361 | "user_feat = user_feat.merge(profiles, on='user', how='left')\n", 362 | "\n", 363 | "occupation_1H = pd.get_dummies(user_feat['occupation'], prefix='occupation')\n", 364 | "zip_1H = pd.get_dummies(user_feat['zip'], prefix='zip')\n", 365 | "\n", 366 | "user_feat.drop(['occupation', 'zip', ], axis=1, inplace=True)\n", 367 | "user_feat = pd.concat([user_feat, occupation_1H, zip_1H], axis=1)\n", 368 | "\n", 369 | "user_feat.fillna(0, inplace=True)\n", 370 | "\n", 371 | "\n", 372 | "user_feat.index = user_feat['user'].values\n", 373 | "user_feat.drop('user', axis=1, inplace=True)\n", 374 | "\n", 375 | "item_feat.index = item_feat['item'].values\n", 376 | "item_feat.drop('item', axis=1, inplace=True)" 377 | ] 378 | }, 379 | { 380 | "cell_type": "code", 381 | "execution_count": null, 382 | "metadata": {}, 383 | "outputs": [], 384 | "source": [ 385 | "(user_feat==0).sum().sum()/user_feat.size" 386 | ] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "execution_count": null, 391 | "metadata": {}, 392 | "outputs": [], 393 | "source": [ 394 | "(item_feat==0).sum().sum()/item_feat.size" 395 | ] 396 | }, 397 | { 398 | "cell_type": "code", 399 | "execution_count": null, 400 | "metadata": {}, 401 | "outputs": [], 402 | "source": [ 403 | "# Create User Feature COO Matrix\n", 404 | "# user_feat_mat = coo_matrix(np.eye(data.n_users))\n", 405 | "user_feat_mat = coo_matrix(np.concatenate((user_feat.values, np.eye(data.n_users)), axis=1))\n", 406 | "\n", 407 | "# Create Item Feature COO Matrix\n", 408 | "# item_feat_mat = coo_matrix(np.eye(data.n_items))\n", 409 | "item_feat_mat = coo_matrix(np.concatenate((item_feat.values, np.eye(data.n_items)), axis=1))" 410 | ] 411 | }, 412 | { 413 | "cell_type": "code", 414 
| "execution_count": null, 415 | "metadata": {}, 416 | "outputs": [], 417 | "source": [ 418 | "user_feat_mat" 419 | ] 420 | }, 421 | { 422 | "cell_type": "code", 423 | "execution_count": null, 424 | "metadata": {}, 425 | "outputs": [], 426 | "source": [ 427 | "item_feat_mat" 428 | ] 429 | }, 430 | { 431 | "cell_type": "markdown", 432 | "metadata": {}, 433 | "source": [ 434 | "### Model Training" 435 | ] 436 | }, 437 | { 438 | "cell_type": "markdown", 439 | "metadata": {}, 440 | "source": [ 441 | "![](parrot.png)\n", 442 | "\n", 443 | "**Task:** Check the [lightFM API](https://making.lyst.com/lightfm/docs/home.html) to see how you can incorporate proper data - can you tweak the algorithm to beat pure Collaborative Filtering?" 444 | ] 445 | }, 446 | { 447 | "cell_type": "code", 448 | "execution_count": null, 449 | "metadata": {}, 450 | "outputs": [], 451 | "source": [ 452 | "params = {\n", 453 | " 'no_components': 10,\n", 454 | " 'loss': 'warp',\n", 455 | " 'learning_rate': 0.03,\n", 456 | " 'random_state': 42,\n", 457 | " 'user_alpha': 0.0001,\n", 458 | " 'item_alpha': 0.0001\n", 459 | "}\n", 460 | "\n", 461 | "epochs = 10\n", 462 | "\n", 463 | "N = 10" 464 | ] 465 | }, 466 | { 467 | "cell_type": "code", 468 | "execution_count": null, 469 | "metadata": {}, 470 | "outputs": [], 471 | "source": [ 472 | "hybrid_model = None\n", 473 | "\n", 474 | "#\n", 475 | "# Up to you ;)\n", 476 | "#" 477 | ] 478 | }, 479 | { 480 | "cell_type": "code", 481 | "execution_count": null, 482 | "metadata": {}, 483 | "outputs": [], 484 | "source": [ 485 | "prec_at_N = precision_at_k(hybrid_model,\n", 486 | " test_mat,\n", 487 | " train_mat,\n", 488 | " k=N,\n", 489 | " user_features=user_feat_mat,\n", 490 | " item_features=item_feat_mat)" 491 | ] 492 | }, 493 | { 494 | "cell_type": "code", 495 | "execution_count": null, 496 | "metadata": {}, 497 | "outputs": [], 498 | "source": [ 499 | "prec_at_N.mean()" 500 | ] 501 | } 502 | ], 503 | "metadata": { 504 | "kernelspec": { 505 | 
"display_name": "Python 3", 506 | "language": "python", 507 | "name": "python3" 508 | }, 509 | "language_info": { 510 | "codemirror_mode": { 511 | "name": "ipython", 512 | "version": 3 513 | }, 514 | "file_extension": ".py", 515 | "mimetype": "text/x-python", 516 | "name": "python", 517 | "nbconvert_exporter": "python", 518 | "pygments_lexer": "ipython3", 519 | "version": "3.9.4" 520 | }, 521 | "pycharm": { 522 | "stem_cell": { 523 | "cell_type": "raw", 524 | "metadata": { 525 | "collapsed": false 526 | }, 527 | "source": [] 528 | } 529 | } 530 | }, 531 | "nbformat": 4, 532 | "nbformat_minor": 4 533 | } 534 | -------------------------------------------------------------------------------- /notebooks/6_e_cf_mf_ranking_pred.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Unit 6: Model-based Collaborative Filtering for **Ranking** Prediction" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "However, we still do Collaborative Filtering and Matrix Factorization in this unit, we do something fundamentally different: we change from rating prediction to **ranking prediction**.\n", 15 | "\n", 16 | "We achieve this by changing the optimization criterion. Instead of minimizing the deviation between true and predicted ratings we push positive and negative user-item combinationa as much as possible apart. We transform explicit user feedback into implicit feedback. Implicit feedback refers to user interaction without the purpose to reflect preference or disregard and is much more common in pactice. Ranking prediction algorithms tackle to learn from implicit feedback data.\n", 17 | "\n", 18 | "In addition, ranking-based algorithms yield a much more intuitive prediction result. Our goal is to present to the user a very limited amount of items in the correct ordering. 
Therefore, ordering is much more important than rating prediction. Ranking-based algorithms like BPR work pair-wise, i.e. for a user and two items they yield the correct order of both items for the user. Generalizing from this, we can impose an ordering on our item corpus and pick the top-$N$ to present to the user." 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 1, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "from collections import OrderedDict\n", 28 | "import itertools\n", 29 | "from typing import Dict, List, Tuple\n", 30 | "\n", 31 | "import matplotlib.pyplot as plt\n", 32 | "import numpy as np\n", 33 | "import pandas as pd" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 2, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "from recsys_training.data import Dataset\n", 43 | "from recsys_training.evaluation import get_relevant_items" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 3, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "ml100k_ratings_filepath = '../data/raw/ml-100k/u.data'" 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "metadata": {}, 58 | "source": [ 59 | "## Load Data" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [ 66 | "In contrast to previous units, we work with implicit feedback data now. Although MovieLens is an explicit feedback dataset, we can argue that everything above the user's mean rating is positive and everything below is negative. Bayesian Personalized Ranking learns from implicit positive feedback data and randomly samples negative feedback data during training. Thus, we keep all ratings at or above a threshold of $4.0$ and remove all other ratings."
67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 5, 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "data = Dataset(ml100k_ratings_filepath)\n", 76 | "data.filter(min_rating=4.0)\n", 77 | "data.rating_split(seed=42)" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "metadata": {}, 83 | "source": [ 84 | "As we want to learn the user/item latent factors from rating data, we first randomly initialize them" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 6, 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | "seed = 42\n", 94 | "m = data.n_users\n", 95 | "n = data.n_items\n", 96 | "d = 10" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 7, 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "# Latent Factor initialization\n", 106 | "random_state = np.random.RandomState(seed)\n", 107 | "user_factors = (random_state.rand(m, d) - 0.5) / d\n", 108 | "item_factors = (random_state.rand(n, d) - 0.5) / d\n", 109 | " \n", 110 | "ratings = data.train_ratings.sample(frac=1, random_state=seed)" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": 8, 116 | "metadata": {}, 117 | "outputs": [], 118 | "source": [ 119 | "# positive implicit feedback items\n", 120 | "user_pos_items = dict()\n", 121 | "# corpus of all remaining items for every user\n", 122 | "# Ask me about the \"Non missing at random hypothesis\" ;)\n", 123 | "user_neg_items = dict()" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 11, 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "grouped = ratings[['user', 'item']].groupby('user')\n", 133 | "groups = grouped.groups.keys()\n", 134 | "for user in data.users:\n", 135 | " pos_items = []\n", 136 | " if user in groups:\n", 137 | " pos_items = grouped.get_group(user).item.values\n", 138 | " neg_items = np.setdiff1d(data.items, pos_items)\n", 139 | " user_pos_items[user] = 
pos_items\n", 140 | " user_neg_items[user] = neg_items" 141 | ] 142 | }, 143 | { 144 | "cell_type": "markdown", 145 | "metadata": {}, 146 | "source": [ 147 | "## Training" 148 | ] 149 | }, 150 | { 151 | "cell_type": "markdown", 152 | "metadata": {}, 153 | "source": [ 154 | "Yes, there is some math involved:\n", 155 | "\n", 156 | "\\begin{equation*}\n", 157 | "\\hat{x}_{uij} = \\hat{x}_{ui} - \\hat{x}_{uj} \\\\\n", 158 | "\\hat{x}_{ui} = \\sum_{f=1}^{d} w_{uf} \\cdot h_{if}, i \\in I_u^+ \\\\\n", 159 | "\\hat{x}_{uj} = \\sum_{f=1}^{d} w_{uf} \\cdot h_{jf}, j \\in I_u^- \\\\\n", 160 | "\\end{equation*}\n", 161 | "\n", 162 | "\\begin{equation*}\n", 163 | "\\text{BPR-Opt} := \\sum_{(u,i,j) \\in D_S} \\ln\\sigma(\\hat{x}_{uij}) - \\lambda_{\\Theta} \\cdot ||\\Theta||^2\n", 164 | "\\end{equation*}\n", 165 | "\n", 166 | "\\begin{equation*}\n", 167 | "\\frac{\\partial \\text{BPR-Opt}}{\\partial \\Theta} = \\frac{-e^{-\\hat{x}_{uij}}}{1+e^{-\\hat{x}_{uij}}} \\cdot \\frac{\\partial \\hat{x}_{uij}}{\\partial \\Theta} - \\lambda_{\\Theta} \\cdot \\Theta\n", 168 | "\\end{equation*}\n", 169 | "\n", 170 | "\\begin{equation*}\n", 171 | "\\frac{\\partial \\hat{x}_{uij}}{\\partial \\Theta} =\n", 172 | "\\begin{cases}\n", 173 | "(h_{if}-h_{jf}) & \\text{for } \\Theta = w_{uf} \\\\\n", 174 | "w_{uf} & \\text{for } \\Theta = h_{if} \\\\\n", 175 | "-w_{uf} & \\text{for } \\Theta = h_{jf}\n", 176 | "\\end{cases}\n", 177 | "\\end{equation*}" 178 | ] 179 | }, 180 | { 181 | "cell_type": "markdown", 182 | "metadata": {}, 183 | "source": [ 184 | "Let's talk about regularization!"
185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "metadata": {}, 191 | "outputs": [], 192 | "source": [ 193 | "def sigmoid(x):\n", 194 | " return 1/(1+np.exp(-x))" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": null, 200 | "metadata": {}, 201 | "outputs": [], 202 | "source": [ 203 | "def negative_sampling(user: int, user_neg_items: Dict[int, np.array]) -> int:\n", 204 | " \"\"\"\n", 205 | " Return the item ids for negative samples\n", 206 | " \"\"\"\n", 207 | " negative_item = np.random.choice(user_neg_items[user])\n", 208 | " \n", 209 | " return negative_item" 210 | ] 211 | }, 212 | { 213 | "cell_type": "markdown", 214 | "metadata": {}, 215 | "source": [ 216 | "![](Parrot.png)\n", 217 | "\n", 218 | "**Task:** Adapt the `compute_gradients` method from the unit before to realize stochastic gradient descent (SGD) for Bayesian Personalized Ranking." 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": null, 224 | "metadata": {}, 225 | "outputs": [], 226 | "source": [ 227 | "def compute_gradients(user_embed: np.array,\n", 228 | " pos_item_embed: np.array,\n", 229 | " neg_item_embed: np.array,\n", 230 | " l2_decay: Dict[str, float]) -> Tuple[np.array, np.array, np.array]:\n", 231 | " \n", 232 | " pos_pred = np.sum(user_embed * pos_item_embed)\n", 233 | " neg_pred = np.sum(user_embed * neg_item_embed)\n", 234 | " pred = pos_pred - neg_pred\n", 235 | "\n", 236 | " generic_grad = None\n", 237 | " \n", 238 | " # Gradients\n", 239 | " user_grad = None\n", 240 | " pos_item_grad = None\n", 241 | " neg_item_grad = None\n", 242 | " \n", 243 | " # Add L2-Decay\n", 244 | " user_grad += None\n", 245 | " pos_item_grad += None\n", 246 | " neg_item_grad += None\n", 247 | "\n", 248 | " return user_grad, pos_item_grad, neg_item_grad" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": null, 254 | "metadata": {}, 255 | "outputs": [], 256 | "source": [ 257 | "def 
print_update(epoch: int, samples: np.array) -> float:\n", 258 | " # take the 1000 most recent ratings and compute the mean ranking loss\n", 259 | " users = samples[:, 0]\n", 260 | " pos_items = samples[:, 1]\n", 261 | " neg_items = np.array([negative_sampling(user, user_neg_items)\n", 262 | " for user in users])\n", 263 | "\n", 264 | " user_embeds = user_factors[users - 1]\n", 265 | " pos_item_embeds = item_factors[pos_items - 1]\n", 266 | " neg_item_embeds = item_factors[neg_items - 1]\n", 267 | "\n", 268 | " pos_preds = np.sum(user_embeds * pos_item_embeds, axis=1)\n", 269 | " neg_preds = np.sum(user_embeds * neg_item_embeds, axis=1)\n", 270 | " preds = pos_preds - neg_preds\n", 271 | "\n", 272 | " loss = -np.log(sigmoid(preds)).mean()\n", 273 | " print(f\"Epoch {epoch+1:02d}: Mean Ranking Loss: {loss:.4f}\")\n", 274 | " \n", 275 | " return loss" 276 | ] 277 | }, 278 | { 279 | "cell_type": "markdown", 280 | "metadata": {}, 281 | "source": [ 282 | "Instead of minibatch gradient descent we do **stochastic gradient descent** (SGD) here. It just shrinks the batch size down to 1 instance." 
283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": null, 288 | "metadata": {}, 289 | "outputs": [], 290 | "source": [ 291 | "epochs = 30\n", 292 | "learning_rate = 0.05\n", 293 | "l2_decay = {'user': 0.002, 'pos': 0.0, 'neg': 0.002}\n", 294 | "verbose = True" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": null, 300 | "metadata": {}, 301 | "outputs": [], 302 | "source": [ 303 | "ratings_arr = ratings[['user', 'item']].values\n", 304 | "n_ratings = len(ratings_arr)\n", 305 | "loss_trace = []\n", 306 | "\n", 307 | "for epoch in range(epochs):\n", 308 | "\n", 309 | " for _ in range(len(ratings)):\n", 310 | " random_index = np.random.randint(n_ratings)\n", 311 | " user, pos_item = tuple(ratings_arr[random_index])\n", 312 | " neg_item = negative_sampling(user, user_neg_items)\n", 313 | "\n", 314 | " # Deduct 1 as user ids are 1-indexed, but array is 0-indexed\n", 315 | " user_embed = user_factors[user - 1]\n", 316 | " pos_item_embed = item_factors[pos_item - 1]\n", 317 | " neg_item_embed = item_factors[neg_item - 1]\n", 318 | "\n", 319 | " user_grad, pos_item_grad, neg_item_grad = \\\n", 320 | " compute_gradients(user_embed,\n", 321 | " pos_item_embed,\n", 322 | " neg_item_embed,\n", 323 | " l2_decay)\n", 324 | "\n", 325 | " user_factors[user - 1] -= learning_rate * user_grad\n", 326 | " item_factors[pos_item - 1] -= learning_rate * pos_item_grad\n", 327 | " item_factors[neg_item - 1] -= learning_rate * neg_item_grad\n", 328 | "\n", 329 | " if verbose:\n", 330 | " samples = ratings_arr[-1000:]\n", 331 | " loss = print_update(epoch, samples)\n", 332 | " loss_trace.append(loss)" 333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": null, 338 | "metadata": {}, 339 | "outputs": [], 340 | "source": [ 341 | "plt.figure(figsize=(12,8))\n", 342 | "plt.plot(range(epochs), loss_trace, 'b--', label='Train')\n", 344 | 
"plt.grid(True)\n", 345 | "plt.legend()\n", 346 | "plt.show()" 347 | ] 348 | }, 349 | { 350 | "cell_type": "markdown", 351 | "metadata": {}, 352 | "source": [ 353 | "### Using the model for Recommendations" 354 | ] 355 | }, 356 | { 357 | "cell_type": "markdown", 358 | "metadata": {}, 359 | "source": [ 360 | "We have now created a model to describe users and items in terms of latent vectors. But this time we fitted them to get the rankings correctly. So for obtaining recommendations we simply multiply user-item latent vectors we are interested in and achieve an estimate that can be used to order items for a given user. This time it is not a rating prediction, but still a prediction.\n", 361 | "\n", 362 | "For that, we can reuse the `get_prediction` method from previous units.\n", 363 | "\n", 364 | "Thus, before writing the `get_recommendations` again we first implement `get_prediction`." 365 | ] 366 | }, 367 | { 368 | "cell_type": "code", 369 | "execution_count": null, 370 | "metadata": {}, 371 | "outputs": [], 372 | "source": [ 373 | "def get_prediction(user: int, items: np.array = None, remove_known_pos: bool = True) -> Dict[int, Dict[str, float]]:\n", 374 | " if items is None:\n", 375 | " if remove_known_pos:\n", 376 | " # Predict from unobserved items\n", 377 | " # We simplified this compared to the unit before\n", 378 | " items = user_neg_items[user]\n", 379 | " else:\n", 380 | " items = np.array(data.items)\n", 381 | " if type(items) == np.int64:\n", 382 | " items = np.array([items])\n", 383 | " \n", 384 | " user_embed = user_factors[user - 1].reshape(1, -1)\n", 385 | " item_embeds = item_factors[items - 1].reshape(len(items), -1)\n", 386 | "\n", 387 | " # use array-broadcasting\n", 388 | " preds = np.sum(user_embed * item_embeds, axis=1)\n", 389 | " sorting = np.argsort(preds)[::-1]\n", 390 | " preds = {item: {'pred': pred} for item, pred in\n", 391 | " zip(items[sorting], preds[sorting])}\n", 392 | "\n", 393 | " return preds" 394 | ] 395 | }, 396 | { 397 | 
"cell_type": "code", 398 | "execution_count": null, 399 | "metadata": {}, 400 | "outputs": [], 401 | "source": [ 402 | "item_predictions = get_prediction(1)" 403 | ] 404 | }, 405 | { 406 | "cell_type": "code", 407 | "execution_count": null, 408 | "metadata": {}, 409 | "outputs": [], 410 | "source": [ 411 | "list(item_predictions.items())[:20]" 412 | ] 413 | }, 414 | { 415 | "cell_type": "code", 416 | "execution_count": null, 417 | "metadata": {}, 418 | "outputs": [], 419 | "source": [ 420 | "def get_recommendations(user: int, N: int, remove_known_pos: bool = False) -> List[Tuple[int, Dict[str, float]]]:\n", 421 | " predictions = get_prediction(user, remove_known_pos=remove_known_pos)\n", 422 | " recommendations = []\n", 423 | " for item, pred in predictions.items():\n", 424 | " add_item = (item, pred)\n", 425 | " recommendations.append(add_item)\n", 426 | " if len(recommendations) == N:\n", 427 | " break\n", 428 | "\n", 429 | " return recommendations" 430 | ] 431 | }, 432 | { 433 | "cell_type": "code", 434 | "execution_count": null, 435 | "metadata": {}, 436 | "outputs": [], 437 | "source": [ 438 | "recommendations = get_recommendations(1, 10)" 439 | ] 440 | }, 441 | { 442 | "cell_type": "code", 443 | "execution_count": null, 444 | "metadata": {}, 445 | "outputs": [], 446 | "source": [ 447 | "recommendations" 448 | ] 449 | }, 450 | { 451 | "cell_type": "markdown", 452 | "metadata": {}, 453 | "source": [ 454 | "## Evaluation" 455 | ] 456 | }, 457 | { 458 | "cell_type": "code", 459 | "execution_count": null, 460 | "metadata": {}, 461 | "outputs": [], 462 | "source": [ 463 | "N = 10" 464 | ] 465 | }, 466 | { 467 | "cell_type": "code", 468 | "execution_count": null, 469 | "metadata": {}, 470 | "outputs": [], 471 | "source": [ 472 | "relevant_items = get_relevant_items(data.test_ratings)" 473 | ] 474 | }, 475 | { 476 | "cell_type": "code", 477 | "execution_count": null, 478 | "metadata": {}, 479 | "outputs": [], 480 | "source": [ 481 | "users = relevant_items.keys()\n", 
482 | "prec_at_N = dict.fromkeys(data.users)\n", 483 | "\n", 484 | "for user in users:\n", 485 | " recommendations = get_recommendations(user, N, remove_known_pos=True)\n", 486 | " recommendations = [val[0] for val in recommendations]\n", 487 | " hits = np.intersect1d(recommendations,\n", 488 | " relevant_items[user])\n", 489 | " prec_at_N[user] = len(hits)/N" 490 | ] 491 | }, 492 | { 493 | "cell_type": "code", 494 | "execution_count": null, 495 | "metadata": {}, 496 | "outputs": [], 497 | "source": [ 498 | "recommendations" 499 | ] 500 | }, 501 | { 502 | "cell_type": "code", 503 | "execution_count": null, 504 | "metadata": {}, 505 | "outputs": [], 506 | "source": [ 507 | "np.mean([val for val in prec_at_N.values() if val is not None])" 508 | ] 509 | } 510 | ], 511 | "metadata": { 512 | "kernelspec": { 513 | "display_name": "Python 3", 514 | "language": "python", 515 | "name": "python3" 516 | }, 517 | "language_info": { 518 | "codemirror_mode": { 519 | "name": "ipython", 520 | "version": 3 521 | }, 522 | "file_extension": ".py", 523 | "mimetype": "text/x-python", 524 | "name": "python", 525 | "nbconvert_exporter": "python", 526 | "pygments_lexer": "ipython3", 527 | "version": "3.9.4" 528 | }, 529 | "pycharm": { 530 | "stem_cell": { 531 | "cell_type": "raw", 532 | "metadata": { 533 | "collapsed": false 534 | }, 535 | "source": [] 536 | } 537 | } 538 | }, 539 | "nbformat": 4, 540 | "nbformat_minor": 4 541 | } 542 | -------------------------------------------------------------------------------- /notebooks/extra_sport_recommender.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 5, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import itertools\n", 10 | "from typing import Dict, List, Tuple\n", 11 | "\n", 12 | "import matplotlib.pyplot as plt\n", 13 | "import numpy as np\n", 14 | "import pandas as pd" 15 | ] 16 | }, 17 | { 18 | "cell_type": 
"code", 19 | "execution_count": 6, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "sports = [\n", 24 | " 'badminton',\n", 25 | " 'basketball',\n", 26 | " 'biking',\n", 27 | " 'boxing',\n", 28 | " 'fighting',\n", 29 | " 'fishing',\n", 30 | " 'football',\n", 31 | " 'hockey',\n", 32 | " 'running',\n", 33 | " 'swimming',\n", 34 | " 'tabletennis',\n", 35 | " 'tennis',\n", 36 | " 'volleyball'\n", 37 | "]" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 7, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "people = ['Barbara', 'Birol', 'Guido', 'Lisa', 'Rudi', 'Suna', 'Sven', 'Yvonne']" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 8, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "ratings = {\n", 56 | " \"Barbara\": {\"football\": 3, \"basketball\": 5, \"boxing\": 4, \"biking\": 2, \"fighting\": 4},\n", 57 | " \"Birol\": {\"boxing\": 4, \"hockey\": 2, \"biking\": 4, \"fighting\": 5, \"swimming\": 5, \"tennis\": 5},\n", 58 | " \"Guido\": {\"basketball\": 2, \"tennis\": 4, \"boxing\": 2, \"biking\": 2, \"volleyball\": 4, \"football\": 5},\n", 59 | " \"Lisa\": {\"football\": 4, \"tabletennis\": 3, \"running\": 4, \"volleyball\": 5, \"swimming\": 1},\n", 60 | " \"Rudi\": {\"football\": 1, \"badminton\": 4, \"biking\": 5, \"running\": 5, \"tabletennis\": 1},\n", 61 | " \"Suna\": {\"swimming\": 4, \"volleyball\": 5, \"running\": 3, \"tennis\": 5, \"tabletennis\": 4},\n", 62 | " \"Sven\": {\"swimming\": 5, \"biking\": 4, \"running\": 4, \"fishing\": 1, \"badminton\": 5},\n", 63 | " \"Yvonne\": {\"basketball\": 1, \"badminton\": 3, \"tennis\": 5, \"fighting\": 2, \"football\": 5, \"running\": 5}\n", 64 | "}\n", 65 | "ratings = {k: ratings[k] for k in sorted(ratings.keys())}" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 9, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "rows = []\n", 75 | "for person, individual_ratings in ratings.items():\n", 
76 | " for sport, rating in individual_ratings.items():\n", 77 | " rows.append([person, sport, rating])\n", 78 | "\n", 79 | "ratings_df = pd.DataFrame(rows, columns=[\"person\", \"sport\", \"rating\"])" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 10, 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": [ 88 | "m = len(people)\n", 89 | "n = len(sports)" 90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": {}, 95 | "source": [ 96 | "## Fill the Matrix" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 11, 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "# initialize\n", 106 | "rating_matrix = np.zeros((m, n))\n", 107 | "# fill with ratings\n", 108 | "for person_idx, person in enumerate(people):\n", 109 | " individual_ratings = ratings[person]\n", 110 | " for sport, rating in individual_ratings.items():\n", 111 | " sport_idx = sports.index(sport)\n", 112 | " rating_matrix[person_idx, sport_idx] = rating" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 12, 118 | "metadata": {}, 119 | "outputs": [ 120 | { 121 | "data": { 122 | "text/plain": [ 123 | "array([[0., 5., 2., 4., 4., 0., 3., 0., 0., 0., 0., 0., 0.],\n", 124 | " [0., 0., 4., 4., 5., 0., 0., 2., 0., 5., 0., 5., 0.],\n", 125 | " [0., 2., 2., 2., 0., 0., 5., 0., 0., 0., 0., 4., 4.],\n", 126 | " [0., 0., 0., 0., 0., 0., 4., 0., 4., 1., 3., 0., 5.],\n", 127 | " [4., 0., 5., 0., 0., 0., 1., 0., 5., 0., 1., 0., 0.],\n", 128 | " [0., 0., 0., 0., 0., 0., 0., 0., 3., 4., 4., 5., 5.],\n", 129 | " [5., 0., 4., 0., 0., 1., 0., 0., 4., 5., 0., 0., 0.],\n", 130 | " [3., 1., 0., 0., 2., 0., 5., 0., 5., 0., 0., 5., 0.]])" 131 | ] 132 | }, 133 | "execution_count": 12, 134 | "metadata": {}, 135 | "output_type": "execute_result" 136 | } 137 | ], 138 | "source": [ 139 | "rating_matrix" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 13, 145 | "metadata": {}, 146 | "outputs": [ 147 | { 
148 | "name": "stdout", 149 | "output_type": "stream", 150 | "text": [ 151 | "Sparsity: 58.65%\n" 152 | ] 153 | } 154 | ], 155 | "source": [ 156 | "sparsity = (rating_matrix == 0).sum() / rating_matrix.size\n", 157 | "print(f\"Sparsity: {sparsity: .2%}\")" 158 | ] 159 | }, 160 | { 161 | "cell_type": "markdown", 162 | "metadata": {}, 163 | "source": [ 164 | "## Nearest Neighborhood Collaborative Filtering (user-based)\n", 165 | "* compute similarities among the users\n", 166 | "* perform neighborhood-based collaborative filtering" 167 | ] 168 | }, 169 | { 170 | "cell_type": "markdown", 171 | "metadata": {}, 172 | "source": [ 173 | "### User-User Similarities" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": 15, 179 | "metadata": {}, 180 | "outputs": [], 181 | "source": [ 182 | "def get_cosine_sim(a: int, b: int, entity_ratings: dict) -> tuple:\n", 183 | " # 1. isolate e.g. users that have rated both items (a and b)\n", 184 | " key_intersection = set(entity_ratings[a].keys()).intersection(entity_ratings[b].keys())\n", 185 | " ratings = np.array([(entity_ratings[a][key], entity_ratings[b][key]) for key in key_intersection])\n", 186 | " n_joint_ratings = len(ratings)\n", 187 | " \n", 188 | " sim = None\n", 189 | " if n_joint_ratings > 1:\n", 190 | " nom = ratings[:, 0].dot(ratings[:, 1])\n", 191 | " denom = np.linalg.norm(ratings[:, 0]) * np.linalg.norm(ratings[:, 1])\n", 192 | " sim = nom / denom\n", 193 | " \n", 194 | " return sim, n_joint_ratings" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": 16, 200 | "metadata": {}, 201 | "outputs": [], 202 | "source": [ 203 | "user_user_sims = {}\n", 204 | "user_pairs = itertools.combinations(people, 2)" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": 17, 210 | "metadata": {}, 211 | "outputs": [], 212 | "source": [ 213 | "for pair in user_pairs:\n", 214 | " user_user_sims[pair] = get_cosine_sim(pair[0], pair[1], ratings)" 215 | ] 216 | }, 217 
| { 218 | "cell_type": "code", 219 | "execution_count": 18, 220 | "metadata": {}, 221 | "outputs": [ 222 | { 223 | "data": { 224 | "text/plain": [ 225 | "(0.7071067811865476, 2)" 226 | ] 227 | }, 228 | "execution_count": 18, 229 | "metadata": {}, 230 | "output_type": "execute_result" 231 | } 232 | ], 233 | "source": [ 234 | "user_user_sims[(\"Barbara\", \"Rudi\")]" 235 | ] 236 | }, 237 | { 238 | "cell_type": "markdown", 239 | "metadata": {}, 240 | "source": [ 241 | "### 1. Nearest Neighbors for a given user" 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": 19, 247 | "metadata": {}, 248 | "outputs": [], 249 | "source": [ 250 | "def get_k_nearest_neighbors(user: int, k: int, users: list, user_user_sims: dict) -> list:\n", 251 | " neighbors = set(users)\n", 252 | " neighbors.remove(user)\n", 253 | "\n", 254 | " nearest_neighbors = dict()\n", 255 | " for neighbor in neighbors:\n", 256 | " sim = user_user_sims[tuple(sorted((user, neighbor)))][0]\n", 257 | " if pd.notnull(sim):\n", 258 | " nearest_neighbors[neighbor] = sim\n", 259 | "\n", 260 | " nearest_neighbors = sorted(nearest_neighbors.items(),\n", 261 | " key=lambda kv: kv[1],\n", 262 | " reverse=True)\n", 263 | " \n", 264 | " return nearest_neighbors[:k]" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": 20, 270 | "metadata": {}, 271 | "outputs": [], 272 | "source": [ 273 | "user_neighbors = get_k_nearest_neighbors(\"Barbara\", 2, people, user_user_sims)" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": 21, 279 | "metadata": {}, 280 | "outputs": [ 281 | { 282 | "data": { 283 | "text/plain": [ 284 | "[('Birol', 0.9713237285143654), ('Guido', 0.8277591347639633)]" 285 | ] 286 | }, 287 | "execution_count": 21, 288 | "metadata": {}, 289 | "output_type": "execute_result" 290 | } 291 | ], 292 | "source": [ 293 | "user_neighbors" 294 | ] 295 | }, 296 | { 297 | "cell_type": "markdown", 298 | "metadata": {}, 299 | "source": [ 300 | "### 2. 
Obtain the Neighborhood Ratings" 301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": 22, 306 | "metadata": {}, 307 | "outputs": [], 308 | "source": [ 309 | "def get_neighborhood_ratings(user, user_neighbors: list, ratings: dict) -> dict:\n", 310 | " neighborhood_ratings = {}\n", 311 | " for neighbor, sim in user_neighbors:\n", 312 | " neighbor_ratings = ratings[neighbor].copy()\n", 313 | " \n", 314 | " # collect neighbor ratings and items\n", 315 | " for item, rating in neighbor_ratings.items():\n", 316 | " add_item = {'sim': sim, 'rating': rating}\n", 317 | " if item not in neighborhood_ratings.keys():\n", 318 | " neighborhood_ratings[item] = [add_item]\n", 319 | " else:\n", 320 | " neighborhood_ratings[item].append(add_item)\n", 321 | " \n", 322 | " # remove known items\n", 323 | " known_items = list(ratings[user].keys())\n", 324 | " for known_item in known_items:\n", 325 | " neighborhood_ratings.pop(known_item, None)\n", 326 | " \n", 327 | " return neighborhood_ratings" 328 | ] 329 | }, 330 | { 331 | "cell_type": "code", 332 | "execution_count": 23, 333 | "metadata": {}, 334 | "outputs": [], 335 | "source": [ 336 | "neighborhood_ratings = get_neighborhood_ratings(\"Barbara\", user_neighbors, ratings)" 337 | ] 338 | }, 339 | { 340 | "cell_type": "code", 341 | "execution_count": 24, 342 | "metadata": {}, 343 | "outputs": [ 344 | { 345 | "data": { 346 | "text/plain": [ 347 | "{'hockey': [{'sim': 0.9713237285143654, 'rating': 2}],\n", 348 | " 'swimming': [{'sim': 0.9713237285143654, 'rating': 5}],\n", 349 | " 'tennis': [{'sim': 0.9713237285143654, 'rating': 5},\n", 350 | " {'sim': 0.8277591347639633, 'rating': 4}],\n", 351 | " 'volleyball': [{'sim': 0.8277591347639633, 'rating': 4}]}" 352 | ] 353 | }, 354 | "execution_count": 24, 355 | "metadata": {}, 356 | "output_type": "execute_result" 357 | } 358 | ], 359 | "source": [ 360 | "neighborhood_ratings" 361 | ] 362 | }, 363 | { 364 | "cell_type": "markdown", 365 | "metadata": {}, 366 | 
"source": [ 367 | "### 3. Compute Rating Predictions from Neighborhood Ratings" 368 | ] 369 | }, 370 | { 371 | "cell_type": "code", 372 | "execution_count": 25, 373 | "metadata": {}, 374 | "outputs": [], 375 | "source": [ 376 | "def compute_rating_pred(neighborhood_ratings: dict) -> dict:\n", 377 | " rating_preds = dict()\n", 378 | " for item, ratings in neighborhood_ratings.items():\n", 379 | " if len(ratings) > 0:\n", 380 | " sims = np.array([rating['sim'] for rating in ratings])\n", 381 | " ratings = np.array([rating['rating'] for rating in ratings])\n", 382 | " pred_rating = (sims * ratings).sum() / sims.sum()\n", 383 | " count = len(sims)\n", 384 | " rating_preds[item] = {'pred': pred_rating,\n", 385 | " 'count': count}\n", 386 | " else:\n", 387 | " rating_preds[item] = {'pred': None, 'count': 0}\n", 388 | "\n", 389 | " return rating_preds" 390 | ] 391 | }, 392 | { 393 | "cell_type": "code", 394 | "execution_count": 26, 395 | "metadata": {}, 396 | "outputs": [], 397 | "source": [ 398 | "rating_preds = compute_rating_pred(neighborhood_ratings)" 399 | ] 400 | }, 401 | { 402 | "cell_type": "code", 403 | "execution_count": 27, 404 | "metadata": {}, 405 | "outputs": [ 406 | { 407 | "data": { 408 | "text/plain": [ 409 | "{'hockey': {'pred': 2.0, 'count': 1},\n", 410 | " 'swimming': {'pred': 5.0, 'count': 1},\n", 411 | " 'tennis': {'pred': 4.5398993833693675, 'count': 2},\n", 412 | " 'volleyball': {'pred': 4.0, 'count': 1}}" 413 | ] 414 | }, 415 | "execution_count": 27, 416 | "metadata": {}, 417 | "output_type": "execute_result" 418 | } 419 | ], 420 | "source": [ 421 | "rating_preds" 422 | ] 423 | }, 424 | { 425 | "cell_type": "markdown", 426 | "metadata": {}, 427 | "source": [ 428 | "### 4. 
Compute the Top-$N$ Recommendation Items" 429 | ] 430 | }, 431 | { 432 | "cell_type": "code", 433 | "execution_count": 28, 434 | "metadata": {}, 435 | "outputs": [], 436 | "source": [ 437 | "from collections import OrderedDict\n", 438 | "\n", 439 | "def compute_top_n(rating_preds: dict, min_count: int, N: int) -> OrderedDict:\n", 440 | " rating_preds = {key: val for (key, val) in rating_preds.items()\n", 441 | " if val['count'] >= min_count}\n", 442 | " # assuming more ratings mean higher confidence in the prediction\n", 443 | " sorted_rating_preds = sorted(rating_preds.items(),\n", 444 | " key=lambda kv: (kv[1]['pred'], kv[1]['count']),\n", 445 | " reverse=True)\n", 446 | "\n", 447 | " return OrderedDict(sorted_rating_preds[:N])" 448 | ] 449 | }, 450 | { 451 | "cell_type": "code", 452 | "execution_count": 29, 453 | "metadata": {}, 454 | "outputs": [], 455 | "source": [ 456 | "top_n_recs = compute_top_n(rating_preds, min_count=2, N=1)" 457 | ] 458 | }, 459 | { 460 | "cell_type": "code", 461 | "execution_count": 30, 462 | "metadata": {}, 463 | "outputs": [ 464 | { 465 | "data": { 466 | "text/plain": [ 467 | "OrderedDict([('tennis', {'pred': 4.5398993833693675, 'count': 2})])" 468 | ] 469 | }, 470 | "execution_count": 30, 471 | "metadata": {}, 472 | "output_type": "execute_result" 473 | } 474 | ], 475 | "source": [ 476 | "top_n_recs" 477 | ] 478 | }, 479 | { 480 | "cell_type": "markdown", 481 | "metadata": {}, 482 | "source": [ 483 | "### Combined all steps" 484 | ] 485 | }, 486 | { 487 | "cell_type": "code", 488 | "execution_count": 31, 489 | "metadata": {}, 490 | "outputs": [], 491 | "source": [ 492 | "def get_recommendations(user: int,\n", 493 | " users: list,\n", 494 | " user_user_sims: dict,\n", 495 | " ratings: dict,\n", 496 | " k: int,\n", 497 | " C: int,\n", 498 | " N: int):\n", 499 | " user_neighbors = get_k_nearest_neighbors(user, k=k, users=users, user_user_sims=user_user_sims)\n", 500 | " neighborhood_ratings = get_neighborhood_ratings(user, 
user_neighbors, ratings)\n", 501 | " rating_preds = compute_rating_pred(neighborhood_ratings)\n", 502 | " top_n_recs = compute_top_n(rating_preds, min_count=C, N=N)\n", 503 | " return top_n_recs" 504 | ] 505 | }, 506 | { 507 | "cell_type": "code", 508 | "execution_count": 32, 509 | "metadata": {}, 510 | "outputs": [], 511 | "source": [ 512 | "rec = get_recommendations(\"Barbara\", people, user_user_sims, ratings, k=2, C=2, N=1)" 513 | ] 514 | }, 515 | { 516 | "cell_type": "code", 517 | "execution_count": 33, 518 | "metadata": {}, 519 | "outputs": [ 520 | { 521 | "data": { 522 | "text/plain": [ 523 | "OrderedDict([('tennis', {'pred': 4.5398993833693675, 'count': 2})])" 524 | ] 525 | }, 526 | "execution_count": 33, 527 | "metadata": {}, 528 | "output_type": "execute_result" 529 | } 530 | ], 531 | "source": [ 532 | "rec" 533 | ] 534 | }, 535 | { 536 | "cell_type": "code", 537 | "execution_count": 34, 538 | "metadata": {}, 539 | "outputs": [ 540 | { 541 | "name": "stdout", 542 | "output_type": "stream", 543 | "text": [ 544 | "Barbara --> tennis @ 4.5 - 2 neighbor ratings\n", 545 | "Birol --> running @ 3.5 - 2 neighbor ratings\n", 546 | "Guido --> running @ 4.0 - 2 neighbor ratings\n", 547 | "Lisa --> tennis @ 4.5 - 2 neighbor ratings\n", 548 | "Rudi --> Nothing for you :(\n", 549 | "Suna --> biking @ 3.0 - 2 neighbor ratings\n", 550 | "Sven --> tennis @ 5.0 - 2 neighbor ratings\n", 551 | "Yvonne --> volleyball @ 4.5 - 2 neighbor ratings\n" 552 | ] 553 | } 554 | ], 555 | "source": [ 556 | "for person in people:\n", 557 | " recs = get_recommendations(person, people, user_user_sims, ratings, k=2, C=2, N=1)\n", 558 | " person = person.ljust(7)\n", 559 | " if len(recs) > 0:\n", 560 | " sport = list(recs)[0]\n", 561 | " pred, count = recs.pop(sport).values()\n", 562 | " print(f\"{person} --> {sport.ljust(10)} @ {round(pred, 1)} - {count} neighbor ratings\")\n", 563 | " else:\n", 564 | " print(f\"{person} --> Nothing for you :(\")" 565 | ] 566 | } 567 | ], 568 | "metadata": { 
569 | "kernelspec": { 570 | "display_name": "Python 3", 571 | "language": "python", 572 | "name": "python3" 573 | }, 574 | "language_info": { 575 | "codemirror_mode": { 576 | "name": "ipython", 577 | "version": 3 578 | }, 579 | "file_extension": ".py", 580 | "mimetype": "text/x-python", 581 | "name": "python", 582 | "nbconvert_exporter": "python", 583 | "pygments_lexer": "ipython3", 584 | "version": "3.7.5" 585 | } 586 | }, 587 | "nbformat": 4, 588 | "nbformat_minor": 4 589 | } 590 | -------------------------------------------------------------------------------- /notebooks/solutions/9_s_ligthfm.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Unit 9: LightFM" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "You almost made it - this is the final lesson and it is also going to be the easiest one.\n", 15 | "\n", 16 | "As you may already assume - there are a lot of recommender packages in Python out there. In this lesson we will look at LightFM - an easy to use and lightweight implementation of different approaches and algorithms (FM, BPR, WARP, ...) 
to perform CF, CBF and hybrid recommenders.\n", 17 | "\n", 18 | "Within a few lines of code we set-up, train and use a recommender for recommendations.\n", 19 | "\n", 20 | "* [LightFM on GitHub](https://github.com/lyst/lightfm)\n", 21 | "* [LightFM documentation](https://making.lyst.com/lightfm/docs/home.html)" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 240, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "import matplotlib.pyplot as plt\n", 31 | "import numpy as np\n", 32 | "import pandas as pd\n", 33 | "from scipy.sparse import coo_matrix\n", 34 | "\n", 35 | "from recsys_training.data import Dataset, genres" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 4, 41 | "metadata": {}, 42 | "outputs": [ 43 | { 44 | "name": "stderr", 45 | "output_type": "stream", 46 | "text": [ 47 | "/Users/mkurovski/anaconda3/envs/recsys_training/lib/python3.9/site-packages/lightfm/_lightfm_fast.py:9: UserWarning: LightFM was compiled without OpenMP support. Only a single thread will be used.\n", 48 | " warnings.warn(\n" 49 | ] 50 | } 51 | ], 52 | "source": [ 53 | "from lightfm.datasets import fetch_movielens\n", 54 | "from lightfm.evaluation import precision_at_k\n", 55 | "from lightfm import LightFM" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 7, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "ml100k_ratings_filepath = '../../data/raw/ml-100k/u.data'\n", 65 | "ml100k_item_filepath = '../../data/raw/ml-100k/u.item'\n", 66 | "ml100k_user_filepath = '../../data/raw/ml-100k/u.user'" 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "metadata": {}, 72 | "source": [ 73 | "## Load Data" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": {}, 79 | "source": [ 80 | "### You may easily load Movielens Data ..." 
81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 306, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "data = fetch_movielens(min_rating=4.0, genre_features=True)" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 307, 95 | "metadata": {}, 96 | "outputs": [ 97 | { 98 | "data": { 99 | "text/plain": [ 100 | "{'train': <943x1682 sparse matrix of type ''\n", 101 | " \twith 49906 stored elements in COOrdinate format>,\n", 102 | " 'test': <943x1682 sparse matrix of type ''\n", 103 | " \twith 5469 stored elements in COOrdinate format>,\n", 104 | " 'item_features': <1682x1701 sparse matrix of type ''\n", 105 | " \twith 4575 stored elements in Compressed Sparse Row format>,\n", 106 | " 'item_feature_labels': array(['Toy Story (1995)', 'GoldenEye (1995)', 'Four Rooms (1995)', ...,\n", 107 | " 'genre:Thriller', 'genre:War', 'genre:Western'], dtype=object),\n", 108 | " 'item_labels': array(['Toy Story (1995)', 'GoldenEye (1995)', 'Four Rooms (1995)', ...,\n", 109 | " 'Sliding Doors (1998)', 'You So Crazy (1994)',\n", 110 | " 'Scream of Stone (Schrei aus Stein) (1991)'], dtype=object)}" 111 | ] 112 | }, 113 | "execution_count": 307, 114 | "metadata": {}, 115 | "output_type": "execute_result" 116 | } 117 | ], 118 | "source": [ 119 | "data" 120 | ] 121 | }, 122 | { 123 | "cell_type": "markdown", 124 | "metadata": {}, 125 | "source": [ 126 | "### But, we want to use the exact same data and split that we used in the lessons before" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 321, 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [ 135 | "data = Dataset(ml100k_ratings_filepath)\n", 136 | "data.filter(min_rating=4.0)\n", 137 | "data.rating_split(seed=42)" 138 | ] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "metadata": {}, 143 | "source": [ 144 | "#### Transform our training and testing data into sparse matrices" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | 
"execution_count": 322, 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "# Train DataFrame to Train COO Matrix\n", 154 | "ratings = data.train_ratings[\"rating\"].values\n", 155 | "# We subtract 1 to make user/item ids 0-index-based\n", 156 | "rows = data.train_ratings[\"user\"].values - 1\n", 157 | "cols = data.train_ratings[\"item\"].values - 1\n", 158 | "\n", 159 | "train_mat = coo_matrix((ratings, (rows, cols)),\n", 160 | " shape=(data.n_users, data.n_items))\n", 161 | "\n", 162 | "\n", 163 | "# Test DataFrame to Test COO Matrix\n", 164 | "ratings = data.test_ratings[\"rating\"].values\n", 165 | "# We subtract 1 to make user/item ids 0-index-based\n", 166 | "rows = data.test_ratings[\"user\"].values - 1\n", 167 | "cols = data.test_ratings[\"item\"].values - 1\n", 168 | "\n", 169 | "test_mat = coo_matrix((ratings, (rows, cols)),\n", 170 | " shape=(data.n_users, data.n_items))" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": 323, 176 | "metadata": {}, 177 | "outputs": [ 178 | { 179 | "data": { 180 | "text/plain": [ 181 | "<943x1682 sparse matrix of type ''\n", 182 | "\twith 44300 stored elements in COOrdinate format>" 183 | ] 184 | }, 185 | "execution_count": 323, 186 | "metadata": {}, 187 | "output_type": "execute_result" 188 | } 189 | ], 190 | "source": [ 191 | "train_mat" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": 324, 197 | "metadata": {}, 198 | "outputs": [ 199 | { 200 | "data": { 201 | "text/plain": [ 202 | "<943x1682 sparse matrix of type ''\n", 203 | "\twith 11075 stored elements in COOrdinate format>" 204 | ] 205 | }, 206 | "execution_count": 324, 207 | "metadata": {}, 208 | "output_type": "execute_result" 209 | } 210 | ], 211 | "source": [ 212 | "test_mat" 213 | ] 214 | }, 215 | { 216 | "cell_type": "markdown", 217 | "metadata": {}, 218 | "source": [ 219 | "## Collaborative Filtering" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": 433, 225 | 
"metadata": {}, 226 | "outputs": [], 227 | "source": [ 228 | "params = {\n", 229 | "    'no_components': 10,\n", 230 | "    'loss': 'bpr',\n", 231 | "    'learning_rate': 0.07,\n", 232 | "    'random_state': 42,\n", 233 | "    'user_alpha': 0.0002,\n", 234 | "    'item_alpha': 0.0002\n", 235 | "}\n", 236 | "\n", 237 | "epochs = 10\n", 238 | "\n", 239 | "N = 10" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": 434, 245 | "metadata": {}, 246 | "outputs": [], 247 | "source": [ 248 | "cf_model = LightFM(**params)" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": 435, 254 | "metadata": {}, 255 | "outputs": [ 256 | { 257 | "name": "stderr", 258 | "output_type": "stream", 259 | "text": [ 260 | "Epoch: 100%|██████████| 10/10 [00:00<00:00, 48.66it/s]\n" 261 | ] 262 | }, 263 | { 264 | "data": { 265 | "text/plain": [ 266 | "" 267 | ] 268 | }, 269 | "execution_count": 435, 270 | "metadata": {}, 271 | "output_type": "execute_result" 272 | } 273 | ], 274 | "source": [ 275 | "cf_model.fit(train_mat, epochs=epochs, verbose=True)" 276 | ] 277 | }, 278 | { 279 | "cell_type": "markdown", 280 | "metadata": {}, 281 | "source": [ 282 | "### Evaluate the `Precision@10` on test data\n", 283 | "\n", 284 | "If we provide training data with evaluation, known positives will be removed."
285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": 436, 290 | "metadata": {}, 291 | "outputs": [], 292 | "source": [ 293 | "prec_at_N = precision_at_k(cf_model, test_mat, train_mat, k=N)" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": 437, 299 | "metadata": {}, 300 | "outputs": [ 301 | { 302 | "data": { 303 | "text/plain": [ 304 | "0.17415851" 305 | ] 306 | }, 307 | "execution_count": 437, 308 | "metadata": {}, 309 | "output_type": "execute_result" 310 | } 311 | ], 312 | "source": [ 313 | "prec_at_N.mean()" 314 | ] 315 | }, 316 | { 317 | "cell_type": "markdown", 318 | "metadata": {}, 319 | "source": [ 320 | "### Evaluate the `Precision@10` on train data" 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": 438, 326 | "metadata": {}, 327 | "outputs": [], 328 | "source": [ 329 | "prec_at_N = precision_at_k(cf_model, train_mat, k=N)" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": 439, 335 | "metadata": {}, 336 | "outputs": [ 337 | { 338 | "data": { 339 | "text/plain": [ 340 | "0.4393843" 341 | ] 342 | }, 343 | "execution_count": 439, 344 | "metadata": {}, 345 | "output_type": "execute_result" 346 | } 347 | ], 348 | "source": [ 349 | "prec_at_N.mean()" 350 | ] 351 | }, 352 | { 353 | "cell_type": "markdown", 354 | "metadata": {}, 355 | "source": [ 356 | "We already added some regularization to improve the recommendation relevancy - see `user_alpha` and `item_alpha` in the `params` dictionary above; feel free to tune these values further."
357 | ] 358 | }, 359 | { 360 | "cell_type": "markdown", 361 | "metadata": {}, 362 | "source": [ 363 | "## Hybrid (CF + CBF)" 364 | ] 365 | }, 366 | { 367 | "cell_type": "markdown", 368 | "metadata": {}, 369 | "source": [ 370 | "### Load user and item features" 371 | ] 372 | }, 373 | { 374 | "cell_type": "code", 375 | "execution_count": 440, 376 | "metadata": {}, 377 | "outputs": [], 378 | "source": [ 379 | "def min_max_scale(val, bounds):\n", 380 | " min_max_range = bounds['max']-bounds['min']\n", 381 | " return (val-bounds['min'])/min_max_range\n", 382 | "\n", 383 | "\n", 384 | "def user_profiler(group):\n", 385 | " genre_dist = group[genres].mean()\n", 386 | " year_dist = group['release_year'].describe()[['mean', 'std', '50%']]\n", 387 | "\n", 388 | " return pd.concat((genre_dist, year_dist), axis=0)\n", 389 | "\n", 390 | "\n", 391 | "def get_user_profiles(ratings: pd.DataFrame,\n", 392 | " item_feat: pd.DataFrame,\n", 393 | " min_rating: float = 4.0) -> pd.DataFrame:\n", 394 | " ratings = ratings[ratings.rating >= min_rating]\n", 395 | " ratings = ratings[['user', 'item']]\n", 396 | " ratings = ratings.merge(item_feat, on='item', how='left')\n", 397 | " ratings.drop(['item'], axis=1, inplace=True)\n", 398 | "\n", 399 | " grouped = ratings.groupby('user')\n", 400 | " profiles = grouped.apply(user_profiler).reset_index()\n", 401 | " profiles.rename(columns={'50%': 'median'}, inplace=True)\n", 402 | " \n", 403 | " return profiles\n", 404 | "\n", 405 | "\n", 406 | "item_feat = pd.read_csv(ml100k_item_filepath, sep='|', header=None,\n", 407 | " names=['item', 'title', 'release', 'video_release', 'imdb_url']+genres,\n", 408 | " engine='python')\n", 409 | "\n", 410 | "user_feat = pd.read_csv(ml100k_user_filepath, sep='|', header=None,\n", 411 | " names=['user', 'age', 'gender', 'occupation', 'zip'])\n", 412 | "\n", 413 | "# Infer the release year\n", 414 | "idxs = item_feat[item_feat['release'].notnull()].index\n", 415 | "item_feat.loc[idxs, 'release_year'] = 
item_feat.loc[idxs, 'release'].str.split('-')\n", 416 | "item_feat.loc[idxs, 'release_year'] = item_feat.loc[idxs, 'release_year'].apply(lambda val: val[2]).astype(int)\n", 417 | "\n", 418 | "# Impute median release year value for the items with missing release year\n", 419 | "top_year = item_feat.loc[idxs, 'release_year'].astype(int).describe()['50%']\n", 420 | "idx = item_feat[item_feat['release'].isnull()].index\n", 421 | "item_feat.loc[idx, 'release_year'] = top_year\n", 422 | "\n", 423 | "# Min-max scale the release year\n", 424 | "item_year_bounds = {'min': item_feat['release_year'].min(),\n", 425 | " 'max': item_feat['release_year'].max()}\n", 426 | "item_feat['release_year'] = item_feat['release_year'].apply(\n", 427 | " lambda year: min_max_scale(year, item_year_bounds))\n", 428 | "\n", 429 | "# Drop other columns\n", 430 | "item_feat.drop(['title', 'release', 'video_release', 'imdb_url'], axis=1, inplace=True)\n", 431 | "\n", 432 | "# Min-max scale the age\n", 433 | "user_age_bounds = {'min': user_feat['age'].min(),\n", 434 | " 'max': user_feat['age'].max()}\n", 435 | "user_feat['age'] = user_feat['age'].apply(lambda age: min_max_scale(age, user_age_bounds))\n", 436 | "\n", 437 | "# Transform gender characters to numerical values (categories)\n", 438 | "genders = sorted(user_feat['gender'].unique())\n", 439 | "user_gender_map = dict(zip(genders, range(len(genders))))\n", 440 | "user_feat['gender'] = user_feat['gender'].map(user_gender_map)\n", 441 | "\n", 442 | "# Transform occupation strings to numerical values (categories)\n", 443 | "occupations = sorted(user_feat['occupation'].unique())\n", 444 | "user_occupation_map = dict(zip(occupations, range(len(occupations))))\n", 445 | "user_feat['occupation'] = user_feat['occupation'].map(user_occupation_map)\n", 446 | "\n", 447 | "# Transform the zip codes to categories keeping the first three digits and impute for missing\n", 448 | "idxs = user_feat[~user_feat['zip'].str.isnumeric()].index\n", 449 | 
"user_feat.loc[idxs, 'zip'] = '00000'\n", 450 | "zip_digits_to_cut = 3\n", 451 | "user_feat['zip'] = user_feat['zip'].apply(lambda val: int(val) // 10 ** zip_digits_to_cut)\n", 452 | "\n", 453 | "\n", 454 | "profiles = get_user_profiles(data.train_ratings, item_feat)\n", 455 | "user_feat = user_feat.merge(profiles, on='user', how='left')\n", 456 | "\n", 457 | "occupation_1H = pd.get_dummies(user_feat['occupation'], prefix='occupation')\n", 458 | "zip_1H = pd.get_dummies(user_feat['zip'], prefix='zip')\n", 459 | "\n", 460 | "user_feat.drop(['occupation', 'zip', ], axis=1, inplace=True)\n", 461 | "user_feat = pd.concat([user_feat, occupation_1H, zip_1H], axis=1)\n", 462 | "\n", 463 | "user_feat.fillna(0, inplace=True)\n", 464 | "\n", 465 | "\n", 466 | "user_feat.index = user_feat['user'].values\n", 467 | "user_feat.drop('user', axis=1, inplace=True)\n", 468 | "\n", 469 | "item_feat.index = item_feat['item'].values\n", 470 | "item_feat.drop('item', axis=1, inplace=True)" 471 | ] 472 | }, 473 | { 474 | "cell_type": "code", 475 | "execution_count": 441, 476 | "metadata": {}, 477 | "outputs": [ 478 | { 479 | "data": { 480 | "text/plain": [ 481 | "0.8608033813918158" 482 | ] 483 | }, 484 | "execution_count": 441, 485 | "metadata": {}, 486 | "output_type": "execute_result" 487 | } 488 | ], 489 | "source": [ 490 | "(user_feat==0).sum().sum()/user_feat.size" 491 | ] 492 | }, 493 | { 494 | "cell_type": "code", 495 | "execution_count": 442, 496 | "metadata": {}, 497 | "outputs": [ 498 | { 499 | "data": { 500 | "text/plain": [ 501 | "0.8640309155766944" 502 | ] 503 | }, 504 | "execution_count": 442, 505 | "metadata": {}, 506 | "output_type": "execute_result" 507 | } 508 | ], 509 | "source": [ 510 | "(item_feat==0).sum().sum()/item_feat.size" 511 | ] 512 | }, 513 | { 514 | "cell_type": "code", 515 | "execution_count": 443, 516 | "metadata": {}, 517 | "outputs": [], 518 | "source": [ 519 | "# Create User Feature COO Matrix\n", 520 | "# user_feat_mat = 
coo_matrix(np.eye(data.n_users))\n", 521 | "user_feat_mat = coo_matrix(np.concatenate((user_feat.values, np.eye(data.n_users)), axis=1))\n", 522 | "\n", 523 | "# Create Item Feature COO Matrix\n", 524 | "# item_feat_mat = coo_matrix(np.eye(data.n_items))\n", 525 | "item_feat_mat = coo_matrix(np.concatenate((item_feat.values, np.eye(data.n_items)), axis=1))" 526 | ] 527 | }, 528 | { 529 | "cell_type": "code", 530 | "execution_count": 444, 531 | "metadata": {}, 532 | "outputs": [ 533 | { 534 | "data": { 535 | "text/plain": [ 536 | "<943x1084 sparse matrix of type ''\n", 537 | "\twith 19451 stored elements in COOrdinate format>" 538 | ] 539 | }, 540 | "execution_count": 444, 541 | "metadata": {}, 542 | "output_type": "execute_result" 543 | } 544 | ], 545 | "source": [ 546 | "user_feat_mat" 547 | ] 548 | }, 549 | { 550 | "cell_type": "code", 551 | "execution_count": 445, 552 | "metadata": {}, 553 | "outputs": [ 554 | { 555 | "data": { 556 | "text/plain": [ 557 | "<1682x1702 sparse matrix of type ''\n", 558 | "\twith 6256 stored elements in COOrdinate format>" 559 | ] 560 | }, 561 | "execution_count": 445, 562 | "metadata": {}, 563 | "output_type": "execute_result" 564 | } 565 | ], 566 | "source": [ 567 | "item_feat_mat" 568 | ] 569 | }, 570 | { 571 | "cell_type": "markdown", 572 | "metadata": {}, 573 | "source": [ 574 | "### Model Training" 575 | ] 576 | }, 577 | { 578 | "cell_type": "code", 579 | "execution_count": 454, 580 | "metadata": {}, 581 | "outputs": [], 582 | "source": [ 583 | "params = {\n", 584 | " 'no_components': 10,\n", 585 | " 'loss': 'warp',\n", 586 | " 'learning_rate': 0.07,\n", 587 | " 'random_state': 42,\n", 588 | " 'user_alpha': 0.0002,\n", 589 | " 'item_alpha': 0.0002\n", 590 | "}\n", 591 | "\n", 592 | "epochs = 10\n", 593 | "\n", 594 | "N = 10" 595 | ] 596 | }, 597 | { 598 | "cell_type": "code", 599 | "execution_count": 455, 600 | "metadata": {}, 601 | "outputs": [ 602 | { 603 | "name": "stderr", 604 | "output_type": "stream", 605 | "text": [ 606 
| "Epoch: 100%|██████████| 10/10 [00:00<00:00, 19.44it/s]\n" 607 | ] 608 | }, 609 | { 610 | "data": { 611 | "text/plain": [ 612 | "" 613 | ] 614 | }, 615 | "execution_count": 455, 616 | "metadata": {}, 617 | "output_type": "execute_result" 618 | } 619 | ], 620 | "source": [ 621 | "hybrid_model = LightFM(**params)\n", 622 | "\n", 623 | "hybrid_model.fit(train_mat,\n", 624 | " user_features=user_feat_mat,\n", 625 | " item_features=item_feat_mat,\n", 626 | " epochs=epochs,\n", 627 | " verbose=True)" 628 | ] 629 | }, 630 | { 631 | "cell_type": "code", 632 | "execution_count": 456, 633 | "metadata": {}, 634 | "outputs": [], 635 | "source": [ 636 | "prec_at_N = precision_at_k(hybrid_model,\n", 637 | " test_mat,\n", 638 | " train_mat,\n", 639 | " k=N,\n", 640 | " user_features=user_feat_mat,\n", 641 | " item_features=item_feat_mat)" 642 | ] 643 | }, 644 | { 645 | "cell_type": "code", 646 | "execution_count": 457, 647 | "metadata": {}, 648 | "outputs": [ 649 | { 650 | "data": { 651 | "text/plain": [ 652 | "0.19381107" 653 | ] 654 | }, 655 | "execution_count": 457, 656 | "metadata": {}, 657 | "output_type": "execute_result" 658 | } 659 | ], 660 | "source": [ 661 | "prec_at_N.mean()" 662 | ] 663 | } 664 | ], 665 | "metadata": { 666 | "kernelspec": { 667 | "display_name": "Python 3", 668 | "language": "python", 669 | "name": "python3" 670 | }, 671 | "language_info": { 672 | "codemirror_mode": { 673 | "name": "ipython", 674 | "version": 3 675 | }, 676 | "file_extension": ".py", 677 | "mimetype": "text/x-python", 678 | "name": "python", 679 | "nbconvert_exporter": "python", 680 | "pygments_lexer": "ipython3", 681 | "version": "3.9.4" 682 | }, 683 | "pycharm": { 684 | "stem_cell": { 685 | "cell_type": "raw", 686 | "metadata": { 687 | "collapsed": false 688 | }, 689 | "source": [] 690 | } 691 | } 692 | }, 693 | "nbformat": 4, 694 | "nbformat_minor": 4 695 | } 696 | -------------------------------------------------------------------------------- /notebooks/8_e_hybrid_fm.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Unit 8: Hybrid Recommender Model using both Collaborative Filtering and Content-based Filtering using a Factorization Machine" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "In this section, we combine CF and CBF.\n", 15 | "\n", 16 | "Therefore, we simply add the one-hot-encoded user and item IDs to the data. Thus, the model is capable of factorizing the similarities in rating and features for rating prediction. This combination is called hybrid as it combines two recommenders." 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": null, 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "from collections import OrderedDict\n", 26 | "import itertools\n", 27 | "from typing import Dict, List, Tuple\n", 28 | "\n", 29 | "import matplotlib.pyplot as plt\n", 30 | "import numpy as np\n", 31 | "import pandas as pd\n", 32 | "from pyfm import pylibfm\n", 33 | "from scipy import sparse\n", 34 | "from sklearn.metrics import mean_squared_error, mean_absolute_error" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "from recsys_training.data import Dataset, genres\n", 44 | "from recsys_training.evaluation import get_relevant_items\n", 45 | "from recsys_training.utils import get_sparsity" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "ml100k_ratings_filepath = '../data/raw/ml-100k/u.data'\n", 55 | "ml100k_item_filepath = '../data/raw/ml-100k/u.item'\n", 56 | "ml100k_user_filepath = '../data/raw/ml-100k/u.user'" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "## Load Data" 64 | ] 65 | }, 66 | { 67 | 
"cell_type": "code", 68 | "execution_count": null, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "data = Dataset(ml100k_ratings_filepath)\n", 73 | "data.rating_split(seed=42)\n", 74 | "user_ratings = data.get_user_ratings()" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "item_feat = pd.read_csv(ml100k_item_filepath, sep='|', header=None,\n", 84 | " names=['item', 'title', 'release', 'video_release', 'imdb_url']+genres,\n", 85 | " engine='python')" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "user_feat = pd.read_csv(ml100k_user_filepath, sep='|', header=None,\n", 95 | " names=['user', 'age', 'gender', 'occupation', 'zip'])" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "## User and Item Content (Features)" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": {}, 108 | "source": [ 109 | "### Preprocessing" 110 | ] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "metadata": {}, 115 | "source": [ 116 | "#### Items" 117 | ] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "metadata": {}, 122 | "source": [ 123 | "We keep the following information for items:\n", 124 | "* release year\n", 125 | "* genres" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "metadata": {}, 132 | "outputs": [], 133 | "source": [ 134 | "def min_max_scale(val, bounds):\n", 135 | " min_max_range = bounds['max']-bounds['min']\n", 136 | " return (val-bounds['min'])/min_max_range" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "# Infer the release year\n", 146 | "idxs = item_feat[item_feat['release'].notnull()].index\n", 147 | "item_feat.loc[idxs, 'release_year'] = 
item_feat.loc[idxs, 'release'].str.split('-')\n", 148 | "item_feat.loc[idxs, 'release_year'] = item_feat.loc[idxs, 'release_year'].apply(lambda val: val[2]).astype(int)\n", 149 | "\n", 150 | "# Impute median release year value for the items with missing release year\n", 151 | "top_year = item_feat.loc[idxs, 'release_year'].astype(int).describe()['50%']\n", 152 | "idx = item_feat[item_feat['release'].isnull()].index\n", 153 | "item_feat.loc[idx, 'release_year'] = top_year\n", 154 | "\n", 155 | "# Min-max scale the release year\n", 156 | "item_year_bounds = {'min': item_feat['release_year'].min(),\n", 157 | " 'max': item_feat['release_year'].max()}\n", 158 | "item_feat['release_year'] = item_feat['release_year'].apply(\n", 159 | " lambda year: min_max_scale(year, item_year_bounds))\n", 160 | "\n", 161 | "# Drop other columns\n", 162 | "item_feat.drop(['title', 'release', 'video_release', 'imdb_url'], axis=1, inplace=True)" 163 | ] 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "metadata": {}, 168 | "source": [ 169 | "#### users" 170 | ] 171 | }, 172 | { 173 | "cell_type": "markdown", 174 | "metadata": {}, 175 | "source": [ 176 | "We keep the following information for users:\n", 177 | "* age\n", 178 | "* gender\n", 179 | "* occupation\n", 180 | "* zip-code" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": null, 186 | "metadata": {}, 187 | "outputs": [], 188 | "source": [ 189 | "# Min-max scale the age\n", 190 | "user_age_bounds = {'min': user_feat['age'].min(),\n", 191 | " 'max': user_feat['age'].max()}\n", 192 | "user_feat['age'] = user_feat['age'].apply(lambda age: min_max_scale(age, user_age_bounds))\n", 193 | "\n", 194 | "# Transform gender characters to numerical values (categories)\n", 195 | "genders = sorted(user_feat['gender'].unique())\n", 196 | "user_gender_map = dict(zip(genders, range(len(genders))))\n", 197 | "user_feat['gender'] = user_feat['gender'].map(user_gender_map)\n", 198 | "\n", 199 | "# Transform occupation 
strings to numerical values (categories)\n", 200 | "occupations = sorted(user_feat['occupation'].unique())\n", 201 | "user_occupation_map = dict(zip(occupations, range(len(occupations))))\n", 202 | "user_feat['occupation'] = user_feat['occupation'].map(user_occupation_map)\n", 203 | "\n", 204 | "# Transform the zip codes to categories keeping the first three digits and impute for missing\n", 205 | "idxs = user_feat[~user_feat['zip'].str.isnumeric()].index\n", 206 | "user_feat.loc[idxs, 'zip'] = '00000'\n", 207 | "zip_digits_to_cut = 3\n", 208 | "user_feat['zip'] = user_feat['zip'].apply(lambda val: int(val) // 10 ** zip_digits_to_cut)" 209 | ] 210 | }, 211 | { 212 | "cell_type": "markdown", 213 | "metadata": {}, 214 | "source": [ 215 | "In addition, we infer profiles by combining item information with rating data for each user to get features that represent the users' preferred genres and film age" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": null, 221 | "metadata": {}, 222 | "outputs": [], 223 | "source": [ 224 | "def user_profiler(group):\n", 225 | " genre_dist = group[genres].mean()\n", 226 | " year_dist = group['release_year'].describe()[['mean', 'std', '50%']]\n", 227 | "\n", 228 | " return pd.concat((genre_dist, year_dist), axis=0)" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": null, 234 | "metadata": {}, 235 | "outputs": [], 236 | "source": [ 237 | "def get_user_profiles(ratings: pd.DataFrame,\n", 238 | " item_feat: pd.DataFrame,\n", 239 | " min_rating: float = 4.0) -> pd.DataFrame:\n", 240 | " ratings = ratings[ratings.rating >= min_rating]\n", 241 | " ratings = ratings[['user', 'item']]\n", 242 | " ratings = ratings.merge(item_feat, on='item', how='left')\n", 243 | " ratings.drop(['item'], axis=1, inplace=True)\n", 244 | "\n", 245 | " grouped = ratings.groupby('user')\n", 246 | " profiles = grouped.apply(user_profiler).reset_index()\n", 247 | " profiles.rename(columns={'50%': 'median'}, 
inplace=True)\n", 248 | " \n", 249 | " return profiles" 250 | ] 251 | }, 252 | { 253 | "cell_type": "markdown", 254 | "metadata": {}, 255 | "source": [ 256 | "Finally, we join the original user information with their profiles' information and one-hot-encode categorical information" 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": null, 262 | "metadata": {}, 263 | "outputs": [], 264 | "source": [ 265 | "profiles = get_user_profiles(data.train_ratings, item_feat)\n", 266 | "user_feat = user_feat.merge(profiles, on='user', how='left')\n", 267 | "\n", 268 | "occupation_1H = pd.get_dummies(user_feat['occupation'], prefix='occupation')\n", 269 | "zip_1H = pd.get_dummies(user_feat['zip'], prefix='zip')\n", 270 | "\n", 271 | "user_feat.drop(['occupation', 'zip', ], axis=1, inplace=True)\n", 272 | "user_feat = pd.concat([user_feat, occupation_1H, zip_1H], axis=1)\n", 273 | "\n", 274 | "user_feat.fillna(0, inplace=True)" 275 | ] 276 | }, 277 | { 278 | "cell_type": "markdown", 279 | "metadata": {}, 280 | "source": [ 281 | "We remove the user/item id columns and replace the current dataframe indices with their values" 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": null, 287 | "metadata": {}, 288 | "outputs": [], 289 | "source": [ 290 | "user_feat.index = user_feat['user'].values\n", 291 | "user_feat.drop('user', axis=1, inplace=True)\n", 292 | "\n", 293 | "item_feat.index = item_feat['item'].values\n", 294 | "item_feat.drop('item', axis=1, inplace=True)" 295 | ] 296 | }, 297 | { 298 | "cell_type": "markdown", 299 | "metadata": {}, 300 | "source": [ 301 | "## Factorization Machine for a Hybrid Recommender" 302 | ] 303 | }, 304 | { 305 | "cell_type": "markdown", 306 | "metadata": {}, 307 | "source": [ 308 | "[Steffen Rendle: Factorization Machines](https://www.csie.ntu.edu.tw/~b97053/paper/Rendle2010FM.pdf)\n", 309 | "\n", 310 | "[pyFM - Factorization Machines in Python](https://github.com/coreylynch/pyFM)" 311 | ] 312 | 
}, 313 | { 314 | "cell_type": "markdown", 315 | "metadata": {}, 316 | "source": [ 317 | "#### Create Feature Matrices" 318 | ] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "execution_count": null, 323 | "metadata": {}, 324 | "outputs": [], 325 | "source": [ 326 | "# fetch content information for all observed user-item rating combinations\n", 327 | "user_cb_feat_train = user_feat.loc[data.train_ratings.user.values].values\n", 328 | "user_cb_feat_test = user_feat.loc[data.test_ratings.user.values].values\n", 329 | "item_cb_feat_train = item_feat.loc[data.train_ratings.item.values].values\n", 330 | "item_cb_feat_test = item_feat.loc[data.test_ratings.item.values].values" 331 | ] 332 | }, 333 | { 334 | "cell_type": "markdown", 335 | "metadata": {}, 336 | "source": [ 337 | "![](parrot.png)\n", 338 | "\n", 339 | "**Task:** Implement additional arrays for user and item IDs and adjust the design matrices `X_train` and `X_test` accordingly." 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": null, 345 | "metadata": {}, 346 | "outputs": [], 347 | "source": [ 348 | "def one_hot_encode_ids(ids: np.array, length):\n", 349 | "    pass\n", 350 | "    return one_hot_enc" 351 | ] 352 | }, 353 | { 354 | "cell_type": "code", 355 | "execution_count": null, 356 | "metadata": {}, 357 | "outputs": [], 358 | "source": [ 359 | "# Subtract 1 to turn 1-base-indexed into 0-base-indexed IDs for 0-base-indexed array\n", 360 | "pass" 361 | ] 362 | }, 363 | { 364 | "cell_type": "code", 365 | "execution_count": null, 366 | "metadata": {}, 367 | "outputs": [], 368 | "source": [ 369 | "# concatenate user and item content information to form design matrices\n", 370 | "# and convert to sparse matrix in Compressed Sparse Row (CSR) format\n", 371 | "X_train = pass\n", 372 | "X_train = pass\n", 373 | "X_test = pass\n", 374 | "X_test = pass" 375 | ] 376 | }, 377 | { 378 | "cell_type": "code", 379 | "execution_count": null, 380 | "metadata": {}, 381 | "outputs": [], 382 | "source": 
[ 383 | "X_train" 384 | ] 385 | }, 386 | { 387 | "cell_type": "code", 388 | "execution_count": null, 389 | "metadata": {}, 390 | "outputs": [], 391 | "source": [ 392 | "# Sparsity of Training Data\n", 393 | "get_sparsity(X_train)" 394 | ] 395 | }, 396 | { 397 | "cell_type": "code", 398 | "execution_count": null, 399 | "metadata": {}, 400 | "outputs": [], 401 | "source": [ 402 | "X_test" 403 | ] 404 | }, 405 | { 406 | "cell_type": "code", 407 | "execution_count": null, 408 | "metadata": {}, 409 | "outputs": [], 410 | "source": [ 411 | "# Sparsity of Test Data\n", 412 | "get_sparsity(X_test)" 413 | ] 414 | }, 415 | { 416 | "cell_type": "markdown", 417 | "metadata": {}, 418 | "source": [ 419 | "#### Create Target Matrices for Rating Predictions" 420 | ] 421 | }, 422 | { 423 | "cell_type": "code", 424 | "execution_count": null, 425 | "metadata": {}, 426 | "outputs": [], 427 | "source": [ 428 | "y_train = data.train_ratings.rating.values.astype(float)\n", 429 | "y_test = data.test_ratings.rating.values" 430 | ] 431 | }, 432 | { 433 | "cell_type": "markdown", 434 | "metadata": {}, 435 | "source": [ 436 | "#### Train Factorization Machine for Rating Prediction as Regressor using pyFM" 437 | ] 438 | }, 439 | { 440 | "cell_type": "code", 441 | "execution_count": null, 442 | "metadata": {}, 443 | "outputs": [], 444 | "source": [ 445 | "n_epochs = 50 # number of full stochastic passes through the training data\n", 446 | "k = 16\n", 447 | "random_seed = 28" 448 | ] 449 | }, 450 | { 451 | "cell_type": "code", 452 | "execution_count": null, 453 | "metadata": {}, 454 | "outputs": [], 455 | "source": [ 456 | "fm_hybrid = pylibfm.FM(num_factors=k,\n", 457 | " num_iter=n_epochs,\n", 458 | " verbose=True,\n", 459 | " task=\"regression\",\n", 460 | " initial_learning_rate=0.001,\n", 461 | " learning_rate_schedule=\"optimal\",\n", 462 | " seed=random_seed)\n", 463 | "fm_hybrid.fit(X_train, y_train)" 464 | ] 465 | }, 466 | { 467 | "cell_type": "markdown", 468 | "metadata": {}, 469 | 
"source": [ 470 | "## Evaluation on Test Set" 471 | ] 472 | }, 473 | { 474 | "cell_type": "code", 475 | "execution_count": null, 476 | "metadata": {}, 477 | "outputs": [], 478 | "source": [ 479 | "y_pred = fm_hybrid.predict(X_test)" 480 | ] 481 | }, 482 | { 483 | "cell_type": "markdown", 484 | "metadata": {}, 485 | "source": [ 486 | "$MSE$" 487 | ] 488 | }, 489 | { 490 | "cell_type": "code", 491 | "execution_count": null, 492 | "metadata": {}, 493 | "outputs": [], 494 | "source": [ 495 | "mean_squared_error(y_test, y_pred)" 496 | ] 497 | }, 498 | { 499 | "cell_type": "markdown", 500 | "metadata": {}, 501 | "source": [ 502 | "$MAE$" 503 | ] 504 | }, 505 | { 506 | "cell_type": "code", 507 | "execution_count": null, 508 | "metadata": {}, 509 | "outputs": [], 510 | "source": [ 511 | "mean_absolute_error(y_test, y_pred)" 512 | ] 513 | }, 514 | { 515 | "cell_type": "code", 516 | "execution_count": null, 517 | "metadata": {}, 518 | "outputs": [], 519 | "source": [ 520 | "def get_prediction(fm: object, user: int, user_feat: pd.DataFrame, item_feat: pd.DataFrame,\n", 521 | " items: np.array = None, remove_known_pos: bool = True) -> Dict[int, Dict[str, float]]:\n", 522 | " \n", 523 | " if items is None:\n", 524 | " if remove_known_pos:\n", 525 | " # Predict from unobserved items\n", 526 | " known_items = np.array(list(user_ratings[user].keys()))\n", 527 | " items = np.setdiff1d(data.items, known_items)\n", 528 | " else:\n", 529 | " items = np.array(data.items)\n", 530 | " if type(items) == np.int64:\n", 531 | " items = np.array([items])\n", 532 | " \n", 533 | " n_items = len(items)\n", 534 | " \n", 535 | " single_user_cb_feat = user_feat.loc[user].values.reshape(1, -1).repeat(n_items, axis=0)\n", 536 | " all_items_cb_feat = item_feat.loc[items].values\n", 537 | " \n", 538 | " input_data = np.concatenate((single_user_cb_feat, all_items_cb_feat), axis=1)\n", 539 | " input_data = sparse.csr_matrix(input_data)\n", 540 | " \n", 541 | " preds = fm.predict(input_data)\n", 542 | " 
sorting = np.argsort(preds)[::-1]\n", 543 | " \n", 544 | " preds = {item: {'pred': pred} for item, pred in\n", 545 | " zip(items[sorting], preds[sorting])}\n", 546 | " \n", 547 | " return preds" 548 | ] 549 | }, 550 | { 551 | "cell_type": "code", 552 | "execution_count": null, 553 | "metadata": {}, 554 | "outputs": [], 555 | "source": [ 556 | "predictions = get_prediction(fm_hybrid, 1, user_feat, item_feat)\n", 557 | "list(predictions.items())[:10]" 558 | ] 559 | }, 560 | { 561 | "cell_type": "code", 562 | "execution_count": null, 563 | "metadata": {}, 564 | "outputs": [], 565 | "source": [ 566 | "def get_recommendations(fm_cb: object,\n", 567 | " user: int,\n", 568 | " N: int,\n", 569 | " user_feat: pd.DataFrame,\n", 570 | " item_feat: pd.DataFrame,\n", 571 | " remove_known_pos: bool = True) -> List[Tuple[int, Dict[str, float]]]:\n", 572 | " \n", 573 | " recommendations = []\n", 574 | " \n", 575 | " predictions = get_prediction(fm_cb, user, user_feat, item_feat,\n", 576 | " remove_known_pos=remove_known_pos)\n", 577 | "\n", 578 | " for item, pred in predictions.items():\n", 579 | " add_item = (item, pred)\n", 580 | " recommendations.append(add_item)\n", 581 | " if len(recommendations) == N:\n", 582 | " break\n", 583 | "\n", 584 | " return recommendations" 585 | ] 586 | }, 587 | { 588 | "cell_type": "code", 589 | "execution_count": null, 590 | "metadata": {}, 591 | "outputs": [], 592 | "source": [ 593 | "get_recommendations(fm_hybrid, 1, N=10, user_feat=user_feat, item_feat=item_feat)" 594 | ] 595 | }, 596 | { 597 | "cell_type": "markdown", 598 | "metadata": {}, 599 | "source": [ 600 | "## Evaluation" 601 | ] 602 | }, 603 | { 604 | "cell_type": "code", 605 | "execution_count": null, 606 | "metadata": {}, 607 | "outputs": [], 608 | "source": [ 609 | "N = 10" 610 | ] 611 | }, 612 | { 613 | "cell_type": "code", 614 | "execution_count": null, 615 | "metadata": {}, 616 | "outputs": [], 617 | "source": [ 618 | "relevant_items = get_relevant_items(data.test_ratings)" 619 
| ] 620 | }, 621 | { 622 | "cell_type": "code", 623 | "execution_count": null, 624 | "metadata": {}, 625 | "outputs": [], 626 | "source": [ 627 | "users = relevant_items.keys()\n", 628 | "prec_at_N = dict.fromkeys(data.users)\n", 629 | "\n", 630 | "for user in users:\n", 631 | " recommendations = get_recommendations(fm_hybrid, user, N,\n", 632 | " user_feat=user_feat, item_feat=item_feat)\n", 633 | " recommendations = [val[0] for val in recommendations]\n", 634 | " hits = np.intersect1d(recommendations,\n", 635 | " relevant_items[user])\n", 636 | " prec_at_N[user] = len(hits)/N" 637 | ] 638 | }, 639 | { 640 | "cell_type": "code", 641 | "execution_count": null, 642 | "metadata": {}, 643 | "outputs": [], 644 | "source": [ 645 | "recommendations" 646 | ] 647 | }, 648 | { 649 | "cell_type": "code", 650 | "execution_count": null, 651 | "metadata": {}, 652 | "outputs": [], 653 | "source": [ 654 | "np.mean([val for val in prec_at_N.values() if val is not None])" 655 | ] 656 | } 657 | ], 658 | "metadata": { 659 | "kernelspec": { 660 | "display_name": "Python 3", 661 | "language": "python", 662 | "name": "python3" 663 | }, 664 | "language_info": { 665 | "codemirror_mode": { 666 | "name": "ipython", 667 | "version": 3 668 | }, 669 | "file_extension": ".py", 670 | "mimetype": "text/x-python", 671 | "name": "python", 672 | "nbconvert_exporter": "python", 673 | "pygments_lexer": "ipython3", 674 | "version": "3.9.4" 675 | }, 676 | "pycharm": { 677 | "stem_cell": { 678 | "cell_type": "raw", 679 | "metadata": { 680 | "collapsed": false 681 | }, 682 | "source": [] 683 | } 684 | } 685 | }, 686 | "nbformat": 4, 687 | "nbformat_minor": 4 688 | } 689 | -------------------------------------------------------------------------------- /notebooks/solutions/4_s_cf_knn_rating_pred.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Unit 4: Neighborhood-based 
Collaborative Filtering for Rating Prediction" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "In this section we generate personalized recommendations for the first time. We exploit rating similarities among users and items to identify similar users and items that assist in finding the relevant items to recommend for each user.\n", 15 | "\n", 16 | "This describes the fundamental idea behind Collaborative Filtering (CF) and using kNN is a neighborhood-based approach towards CF. In a later unit we will also have a look at model-based approaches.\n", 17 | "\n", 18 | "This is also the first time we try to predict user ratings for unknown items using rating predictions to take the top-$N$ items with the highest rating predictions and recommend those to the user." 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 1, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "from collections import OrderedDict\n", 28 | "import itertools\n", 29 | "from typing import Dict, List, Tuple\n", 30 | "\n", 31 | "import numpy as np\n", 32 | "import pandas as pd" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 2, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "from recsys_training.data import Dataset\n", 42 | "from recsys_training.evaluation import get_relevant_items\n", 43 | "from recsys_training.utils import get_entity_sim" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 3, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "ml100k_ratings_filepath = '../../data/raw/ml-100k/u.data'" 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "metadata": {}, 58 | "source": [ 59 | "## Load Data" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 4, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "data = Dataset(ml100k_ratings_filepath)\n", 69 | "data.rating_split(seed=42)\n", 70 | "user_ratings = 
data.get_user_ratings()" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": [ 77 | "The idea behind this recommender is to use item ratings of the $k$ most similar users (neighbors). We identify those _nearest neighbors_ with a similarity metric which we apply to the ratings both, root user and possible neighbor, have in common. Similarity thereby means having a similar opinion on movies.\n", 78 | "\n", 79 | "The steps are as follows:\n", 80 | "\n", 81 | "1. Compute user-user similarities (we use the Pearson Correlation Coefficient here, but feel free to try other similarity metrics)\n", 82 | "\n", 83 | "2. For each user:\n", 84 | "\n", 85 | " 1. Get the k nearest neighbors along with their similarities\n", 86 | " 2. Collect the neighborhood item ratings and ignore those already rated by the root user\n", 87 | " 3. Item Rating Prediction: Compute the similarity-weighted sum of neighborhood item ratings\n", 88 | " 4. Recommendations: Get the $N$ items with the highest ratings that have a minimum rating count" 89 | ] 90 | }, 91 | { 92 | "cell_type": "markdown", 93 | "metadata": {}, 94 | "source": [ 95 | "### 1. User-User Similarities" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 5, 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "sim_metric = 'pearson'\n", 105 | "user_user_sims = {}\n", 106 | "user_pairs = itertools.combinations(data.users, 2)" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": {}, 112 | "source": [ 113 | "The following takes a few seconds to finish ..." 
114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 6, 119 | "metadata": {}, 120 | "outputs": [ 121 | { 122 | "name": "stderr", 123 | "output_type": "stream", 124 | "text": [ 125 | "/anaconda3/envs/recsys_training/lib/python3.7/site-packages/numpy/lib/function_base.py:2534: RuntimeWarning: invalid value encountered in true_divide\n", 126 | "  c /= stddev[:, None]\n", 127 | "/anaconda3/envs/recsys_training/lib/python3.7/site-packages/numpy/lib/function_base.py:2535: RuntimeWarning: invalid value encountered in true_divide\n", 128 | "  c /= stddev[None, :]\n" 129 | ] 130 | } 131 | ], 132 | "source": [ 133 | "for pair in user_pairs:\n", 134 | "    user_user_sims[pair] = get_entity_sim(pair[0], pair[1],\n", 135 | "                                          user_ratings,\n", 136 | "                                          sim_metric)" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 7, 142 | "metadata": {}, 143 | "outputs": [ 144 | { 145 | "data": { 146 | "text/plain": [ 147 | "(0.9759000729485333, 5)" 148 | ] 149 | }, 150 | "execution_count": 7, 151 | "metadata": {}, 152 | "output_type": "execute_result" 153 | } 154 | ], 155 | "source": [ 156 | "user_user_sims[(1,4)]" 157 | ] 158 | }, 159 | { 160 | "cell_type": "markdown", 161 | "metadata": {}, 162 | "source": [ 163 | "## 2. Computing Recommendations" 164 | ] 165 | }, 166 | { 167 | "cell_type": "markdown", 168 | "metadata": {}, 169 | "source": [ 170 | "### A. Implement Nearest Neighbors for a given user\n", 171 | "\n", 172 | "![](../parrot.png)\n", 173 | "\n", 174 | "**Task:** It's your turn again. Complete `get_k_nearest_neighbors` to return a sorted list of the $k$ nearest neighbors - identified by their id - for a given user, each along with its similarity." 
def get_k_nearest_neighbors(user: int, k: int, user_user_sims: dict,
                            users=None) -> List[Tuple[int, float]]:
    """Return the k users most similar to `user` as (neighbor_id, similarity) pairs.

    Args:
        user: id of the root user whose neighborhood we build.
        k: neighborhood size.
        user_user_sims: mapping from sorted (user_a, user_b) tuples to
            (similarity, support) tuples, as computed above with `get_entity_sim`.
        users: candidate user ids; defaults to all users in the dataset
            (`data.users`), so existing calls keep working unchanged.

    Returns:
        At most k (neighbor, similarity) pairs sorted by descending similarity.
        Pairs with a NaN similarity (no rating overlap) are dropped.
    """
    if users is None:
        users = data.users
    neighbors = set(users)
    neighbors.remove(user)

    nearest_neighbors = {}
    for neighbor in neighbors:
        sim = user_user_sims[tuple(sorted((user, neighbor)))][0]
        # NaN marks user pairs without enough common ratings - skip them
        if pd.notnull(sim):
            nearest_neighbors[neighbor] = sim

    ranked = sorted(nearest_neighbors.items(),
                    key=lambda kv: kv[1],
                    reverse=True)
    return ranked[:k]
def get_neighborhood_ratings(user, user_neighbors: List[Tuple[int, float]],
                             ratings=None) -> Dict[int, List[Dict[str, float]]]:
    """Collect neighborhood ratings for items the root user has not rated yet.

    Args:
        user: id of the root user.
        user_neighbors: (neighbor_id, similarity) pairs, e.g. the output of
            `get_k_nearest_neighbors`.
        ratings: mapping user id -> {item id: rating}; defaults to the
            global `user_ratings`, so existing calls keep working unchanged.

    Returns:
        Mapping from each item unknown to the root user to a list of
        {'sim': neighbor_similarity, 'rating': neighbor_rating} dicts.
    """
    if ratings is None:
        ratings = user_ratings
    # known positives of the root user - computed once up front so we can
    # filter during collection instead of building and deleting afterwards
    known_items = set(ratings[user].keys())

    neighborhood_ratings = {}
    for neighbor, sim in user_neighbors:
        for item, rating in ratings[neighbor].items():
            if item in known_items:
                continue
            neighborhood_ratings.setdefault(item, []).append(
                {'sim': sim, 'rating': rating})

    return neighborhood_ratings
| " (312,\n", 304 | " [{'sim': 1.0, 'rating': 4.0}, {'sim': 0.9999999999999999, 'rating': 4.0}]),\n", 305 | " (313,\n", 306 | " [{'sim': 1.0, 'rating': 2.0},\n", 307 | " {'sim': 1.0, 'rating': 4.0},\n", 308 | " {'sim': 1.0, 'rating': 5.0},\n", 309 | " {'sim': 1.0, 'rating': 5.0},\n", 310 | " {'sim': 0.9999999999999999, 'rating': 5.0},\n", 311 | " {'sim': 0.9999999999999999, 'rating': 5.0}]),\n", 312 | " (300,\n", 313 | " [{'sim': 1.0, 'rating': 1.0},\n", 314 | " {'sim': 0.9999999999999999, 'rating': 3.0},\n", 315 | " {'sim': 0.9999999999999999, 'rating': 4.0},\n", 316 | " {'sim': 0.9999999999999999, 'rating': 4.0}]),\n", 317 | " (264,\n", 318 | " [{'sim': 1.0, 'rating': 3.0},\n", 319 | " {'sim': 1.0, 'rating': 3.0},\n", 320 | " {'sim': 1.0, 'rating': 3.0}]),\n", 321 | " (333,\n", 322 | " [{'sim': 1.0, 'rating': 3.0},\n", 323 | " {'sim': 1.0, 'rating': 5.0},\n", 324 | " {'sim': 1.0, 'rating': 5.0},\n", 325 | " {'sim': 0.9999999999999999, 'rating': 3.0},\n", 326 | " {'sim': 0.9999999999999999, 'rating': 4.0}]),\n", 327 | " (1243, [{'sim': 1.0, 'rating': 3.0}]),\n", 328 | " (322,\n", 329 | " [{'sim': 1.0, 'rating': 1.0}, {'sim': 0.9999999999999999, 'rating': 4.0}])]" 330 | ] 331 | }, 332 | "execution_count": 13, 333 | "metadata": {}, 334 | "output_type": "execute_result" 335 | } 336 | ], 337 | "source": [ 338 | "list(neighborhood_ratings.items())[:10]" 339 | ] 340 | }, 341 | { 342 | "cell_type": "markdown", 343 | "metadata": {}, 344 | "source": [ 345 | "### C. Compute Rating Predictions from Neighborhood Ratings\n", 346 | "\n", 347 | "![](parrot.png)\n", 348 | "\n", 349 | "**Task:** In this step, we estimate ratings for the seed user based on the neighborhood ratings. We implement a similarity weighted average of neighbor ratings for that. Return a mapping from item to its prediction and the count of neighbor ratings received."
def compute_rating_pred(neighborhood_ratings: dict) -> dict:
    """Predict one rating per item as the similarity-weighted mean of its
    neighborhood ratings.

    Returns a mapping item -> {'pred': predicted rating, 'count': number of
    neighbor ratings used}; items with an empty rating list get pred None.
    """
    rating_preds = {}
    for item, entries in neighborhood_ratings.items():
        if not entries:
            rating_preds[item] = {'pred': None, 'count': 0}
            continue
        sims = np.array([entry['sim'] for entry in entries])
        values = np.array([entry['rating'] for entry in entries])
        # weighted average: neighbors with higher similarity count more
        weighted_mean = (sims * values).sum() / sims.sum()
        rating_preds[item] = {'pred': weighted_mean,
                              'count': len(entries)}
    return rating_preds
def compute_top_n(rating_preds: dict, min_count: int, N: int) -> OrderedDict:
    """Keep predictions backed by at least `min_count` neighbor ratings and
    return the N best ones, ordered by (prediction, count) descending.

    Ties in the predicted rating are broken by the rating count, assuming
    more ratings mean higher confidence in the prediction.
    """
    eligible = [(item, stats) for item, stats in rating_preds.items()
                if stats['count'] >= min_count]
    eligible.sort(key=lambda pair: (pair[1]['pred'], pair[1]['count']),
                  reverse=True)
    return OrderedDict(eligible[:N])
def get_recommendations(user: int,
                        user_user_sims: dict,
                        k: int,
                        C: int,
                        N: int):
    """Run the full user-based kNN pipeline for one user.

    Chains neighbor search (k nearest neighbors), neighborhood rating
    collection, similarity-weighted rating prediction, and top-N selection
    with a minimum rating count of C. Returns the top-N OrderedDict.
    """
    neighbors = get_k_nearest_neighbors(user, k=k, user_user_sims=user_user_sims)
    candidate_ratings = get_neighborhood_ratings(user, neighbors)
    predictions = compute_rating_pred(candidate_ratings)
    return compute_top_n(predictions, min_count=C, N=N)
], 538 | "source": [ 539 | "get_recommendations(1, user_user_sims, 10, 2, 10)" 540 | ] 541 | }, 542 | { 543 | "cell_type": "markdown", 544 | "metadata": {}, 545 | "source": [ 546 | "## Evaluation" 547 | ] 548 | }, 549 | { 550 | "cell_type": "markdown", 551 | "metadata": {}, 552 | "source": [ 553 | "Let's check the performance of the neighborhood- and user-based recommender for a neighborhood size of $k = 60$, minimum rating count of $C = 10$ and stay with $N = 10$ recommendations." 554 | ] 555 | }, 556 | { 557 | "cell_type": "code", 558 | "execution_count": 22, 559 | "metadata": {}, 560 | "outputs": [], 561 | "source": [ 562 | "k = 60\n", 563 | "C = 10\n", 564 | "N = 10" 565 | ] 566 | }, 567 | { 568 | "cell_type": "code", 569 | "execution_count": 23, 570 | "metadata": {}, 571 | "outputs": [], 572 | "source": [ 573 | "relevant_items = get_relevant_items(data.test_ratings)" 574 | ] 575 | }, 576 | { 577 | "cell_type": "code", 578 | "execution_count": 24, 579 | "metadata": {}, 580 | "outputs": [], 581 | "source": [ 582 | "users = relevant_items.keys()\n", 583 | "prec_at_N = dict.fromkeys(data.users)\n", 584 | "\n", 585 | "for user in users:\n", 586 | " recommendations = get_recommendations(user, user_user_sims, k, C, N)\n", 587 | " recommendations = list(recommendations.keys())\n", 588 | " hits = np.intersect1d(recommendations,\n", 589 | " relevant_items[user])\n", 590 | " prec_at_N[user] = len(hits)/N" 591 | ] 592 | }, 593 | { 594 | "cell_type": "code", 595 | "execution_count": 25, 596 | "metadata": {}, 597 | "outputs": [ 598 | { 599 | "data": { 600 | "text/plain": [ 601 | "0.08106382978723406" 602 | ] 603 | }, 604 | "execution_count": 25, 605 | "metadata": {}, 606 | "output_type": "execute_result" 607 | } 608 | ], 609 | "source": [ 610 | "np.mean([val for val in prec_at_N.values() if val is not None])" 611 | ] 612 | } 613 | ], 614 | "metadata": { 615 | "kernelspec": { 616 | "display_name": "Python 3", 617 | "language": "python", 618 | "name": "python3" 619 | }, 620 | 
"language_info": { 621 | "codemirror_mode": { 622 | "name": "ipython", 623 | "version": 3 624 | }, 625 | "file_extension": ".py", 626 | "mimetype": "text/x-python", 627 | "name": "python", 628 | "nbconvert_exporter": "python", 629 | "pygments_lexer": "ipython3", 630 | "version": "3.9.4" 631 | }, 632 | "pycharm": { 633 | "stem_cell": { 634 | "cell_type": "raw", 635 | "metadata": { 636 | "collapsed": false 637 | }, 638 | "source": [] 639 | } 640 | } 641 | }, 642 | "nbformat": 4, 643 | "nbformat_minor": 4 644 | } 645 | --------------------------------------------------------------------------------