├── .dockerignore
├── .github
│   ├── ISSUE_TEMPLATE.md
│   └── workflows
│       └── test.yaml
├── .gitignore
├── .pre-commit-config.yaml
├── .travis.yml
├── CONTRIBUTORS
├── Dockerfile
├── LICENSE
├── Makefile
├── README.md
├── changelog.md
├── doc
│   ├── Makefile
│   ├── conf.py
│   ├── cross_validation.rst
│   ├── datasets.rst
│   ├── examples.rst
│   ├── examples
│   │   ├── dataset.rst
│   │   ├── hybrid_crossvalidated.rst
│   │   ├── learning_schedules.rst
│   │   ├── learning_schedules_files
│   │   │   ├── learning_schedules_5_0.png
│   │   │   └── learning_schedules_8_0.png
│   │   ├── movielens_implicit.rst
│   │   ├── warp_loss.rst
│   │   └── warp_loss_files
│   │       ├── warp_loss_5_0.png
│   │       ├── warp_loss_7_0.png
│   │       ├── warp_loss_9_0.png
│   │       └── warp_loss_9_1.png
│   ├── faq.rst
│   ├── home.rst
│   ├── index.rst
│   ├── lightfm.data.rst
│   ├── lightfm.evaluation.rst
│   ├── lightfm.rst
│   └── quickstart.rst
├── docker-compose.yml
├── docs-requirements.txt
├── examples
│   ├── ann
│   │   └── annoy_nsmlib_example.ipynb
│   ├── dataset
│   │   ├── Makefile
│   │   ├── dataset.pmd
│   │   ├── download.py
│   │   └── readme.rst
│   ├── movielens
│   │   ├── data.py
│   │   ├── example.ipynb
│   │   ├── learning_schedules.ipynb
│   │   ├── readme.md
│   │   └── warp_loss.ipynb
│   ├── quickstart
│   │   ├── quickstart.ipynb
│   │   └── short_quickstart.ipynb
│   └── stackexchange
│       └── hybrid_crossvalidated.ipynb
├── lightfm.png
├── lightfm
│   ├── __init__.py
│   ├── _lightfm_fast.py
│   ├── _lightfm_fast.pyx.template
│   ├── _lightfm_fast_no_openmp.c
│   ├── _lightfm_fast_openmp.c
│   ├── cross_validation.py
│   ├── data.py
│   ├── datasets
│   │   ├── __init__.py
│   │   ├── _common.py
│   │   ├── movielens.py
│   │   └── stackexchange.py
│   ├── evaluation.py
│   ├── lightfm.py
│   └── version.py
├── lint-requirements.txt
├── setup.cfg
├── setup.py
├── test-requirements.txt
└── tests
    ├── __init__.py
    ├── test_api.py
    ├── test_cross_validation.py
    ├── test_data.py
    ├── test_datasets.py
    ├── test_evaluation.py
    ├── test_fast_functions.py
    └── test_movielens.py

/.dockerignore:
--------------------------------------------------------------------------------
1 | *.zip
2 | *.7z
3 | *.xml
4 | examples/crossvalidated/*.7z
5 | examples/crossvalidated/*.xml
6 | examples/movielens/*.zip
7 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE.md:
--------------------------------------------------------------------------------
1 | Thanks for opening an issue!
2 |
3 | Please include as much detail as possible: what does your dataset look like, and which hyperparameters are you using (have you tried other ones)?
4 |
5 | When including code snippets, make sure you format them using backticks. Have a look at the markdown reference for details: https://guides.github.com/features/mastering-markdown/.
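For example, a fenced code block renders cleanly (the snippet below is purely illustrative and not tied to any particular issue):

```python
from lightfm import LightFM

model = LightFM(loss='warp', no_components=30)
print(model.loss, model.no_components)
```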
6 | -------------------------------------------------------------------------------- /.github/workflows/test.yaml: -------------------------------------------------------------------------------- 1 | name: LightFM test 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | pull_request: 8 | branches: 9 | - master 10 | 11 | jobs: 12 | build: 13 | 14 | runs-on: ${{ matrix.os }} 15 | strategy: 16 | matrix: 17 | os: [ubuntu-latest, macos-latest, windows-latest] 18 | python-version: ["3.7", "3.11"] 19 | exclude: 20 | - os: macos-latest 21 | python-version: "3.7" 22 | - os: windows-latest 23 | python-version: "3.7" 24 | 25 | steps: 26 | - uses: actions/checkout@v3 27 | - name: Set up Python ${{ matrix.python-version }} 28 | uses: actions/setup-python@v4 29 | with: 30 | python-version: ${{ matrix.python-version }} 31 | - name: Install dependencies 32 | run: | 33 | python -m pip install --upgrade pip 34 | pip install flake8 pytest 35 | - name: Lint with flake8 36 | run: | 37 | # stop the build if there are Python syntax errors or undefined names 38 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 39 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 40 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 41 | - name: Install 42 | run: | 43 | pip install -e . 44 | - name: Test with pytest 45 | run: | 46 | pytest 47 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | venv/ 2 | *.pyc 3 | *.egg* 4 | *~ 5 | *.zip 6 | *.so 7 | examples/movielens/.ipynb_checkpoints/ 8 | examples/quickstart/.ipynb_checkpoints/ 9 | examples/stackexchange/.ipynb_checkpoints/ 10 | build/ 11 | dist/ 12 | bench/ 13 | *#* 14 | *.7z 15 | *.xml 16 | doc/_build/* 17 | lightfm/_lightfm_fast_openmp.pyx 18 | lightfm/_lightfm_fast_no_openmp.pyx 19 | *.*-checkpoint 20 | 21 | # Editor specific 22 | .vscode/ 23 | .idea/ 24 | .devcontainer/ -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # See https://pre-commit.com for more information 2 | # See https://pre-commit.com/hooks.html for more hooks 3 | repos: 4 | - repo: https://github.com/pre-commit/pre-commit-hooks 5 | rev: v3.2.0 6 | hooks: 7 | - id: trailing-whitespace 8 | - id: end-of-file-fixer 9 | - id: check-yaml 10 | - repo: "https://github.com/psf/black" 11 | rev: "22.1.0" 12 | hooks: 13 | - id: black 14 | - repo: https://gitlab.com/pycqa/flake8 15 | rev: "4.0.1" 16 | hooks: 17 | - id: flake8 18 | types: [file, python] 19 | exclude: doc/ 20 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | os: 2 | - osx 3 | before_install: 4 | - python3 -m venv venv 5 | - venv/bin/pip install -r test-requirements.txt 6 | install: 7 | - venv/bin/pip install -e . 
8 | script: venv/bin/py.test -xv tests/ 9 | -------------------------------------------------------------------------------- /CONTRIBUTORS: -------------------------------------------------------------------------------- 1 | Oliver Grisel 2 | Jong Wook Kim 3 | Maciej Kula 4 | Paolo Rais 5 | Kent Shikama 6 | Mice Pápai 7 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:16.04 2 | 3 | RUN apt-get update 4 | RUN apt-get install -y libxml2 libxslt-dev wget bzip2 gcc 5 | 6 | RUN echo 'export PATH=/opt/conda/bin:$PATH' > /etc/profile.d/conda.sh && \ 7 | wget --quiet https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh && \ 8 | /bin/bash ~/miniconda.sh -b -p /opt/conda && \ 9 | rm ~/miniconda.sh 10 | 11 | ENV PATH /opt/conda/bin:$PATH 12 | 13 | RUN conda install pytest jupyter scikit-learn 14 | 15 | ENV PYTHONDONTWRITEBYTECODE 1 16 | 17 | ADD . /home/lightfm/ 18 | WORKDIR /home/ 19 | 20 | RUN cd lightfm && pip install -e . 21 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2015 Lyst 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: examples 2 | examples: 3 | jupyter nbconvert --to rst examples/quickstart/quickstart.ipynb 4 | mv examples/quickstart/quickstart.rst doc/ 5 | jupyter nbconvert --to rst examples/movielens/example.ipynb 6 | mv examples/movielens/example.rst doc/examples/movielens_implicit.rst 7 | jupyter nbconvert --to rst examples/movielens/learning_schedules.ipynb 8 | mv examples/movielens/learning_schedules.rst doc/examples/ 9 | cp -r examples/movielens/learning_schedules_files doc/examples/ 10 | rm -rf examples/movielens/learning_schedules_files 11 | jupyter nbconvert --to rst examples/stackexchange/hybrid_crossvalidated.ipynb 12 | mv examples/stackexchange/hybrid_crossvalidated.rst doc/examples/ 13 | jupyter nbconvert --to rst examples/movielens/warp_loss.ipynb 14 | mv examples/movielens/warp_loss.rst doc/examples/ 15 | cp -r examples/movielens/warp_loss_files doc/examples/ 16 | rm -rf examples/movielens/warp_loss_files 17 | .PHONY: update-docs 18 | update-docs: 19 | pip install -e . \ 20 | && cd doc && make html && cd .. \ 21 | && git fetch origin gh-pages && git checkout gh-pages \ 22 | && rm -rf ./docs/ \ 23 | && mkdir ./docs/ \ 24 | && cp -r ./doc/_build/html/* ./docs/ \ 25 | && git add -A ./docs/* \ 26 | && git commit -m 'Update docs.' && git push origin gh-pages 27 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # LightFM 2 | 3 | ![LightFM logo](lightfm.png) 4 | 5 | | Build status | | 6 | |---|---| 7 | | Linux |[![Circle CI](https://circleci.com/gh/lyst/lightfm.svg?style=svg)](https://circleci.com/gh/lyst/lightfm)| 8 | | OSX (OpenMP disabled)|[![Travis CI](https://travis-ci.org/lyst/lightfm.svg?branch=master)](https://travis-ci.org/lyst/lightfm)| 9 | | Windows (OpenMP disabled) |[![Appveyor](https://ci.appveyor.com/api/projects/status/6cqpqb6969i1h4p7/branch/master?svg=true)](https://ci.appveyor.com/project/maciejkula/lightfm/branch/master)| 10 | 11 | [![Gitter chat](https://badges.gitter.im/gitterHQ/gitter.png)](https://gitter.im/lightfm-rec/Lobby) [![PyPI](https://img.shields.io/pypi/v/lightfm.svg)](https://pypi.python.org/pypi/lightfm/) 12 | [![Anaconda-Server Badge](https://anaconda.org/conda-forge/lightfm/badges/version.svg)](https://anaconda.org/conda-forge/lightfm) 13 | 14 | LightFM is a Python implementation of a number of popular recommendation algorithms for both implicit and explicit feedback, including efficient implementation of BPR and WARP ranking losses. It's easy to use, fast (via multithreaded model estimation), and produces high quality results. 15 | 16 | It also makes it possible to incorporate both item and user metadata into the traditional matrix factorization algorithms. It represents each user and item as the sum of the latent representations of their features, thus allowing recommendations to generalise to new items (via item features) and to new users (via user features). 17 | 18 | For more details, see the [Documentation](http://lyst.github.io/lightfm/docs/home.html). 19 | 20 | Need help? Contact me via [email](mailto:lightfm@zoho.com), [Twitter](https://twitter.com/Maciej_Kula), or [Gitter](https://gitter.im/lightfm-rec/Lobby). 
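As a quick illustration of the hybrid representation described above (a minimal sketch with toy matrices invented for this example; only the `fit` and `get_item_representations` calls are LightFM API), each item's latent vector is the sum of its feature embeddings:

```python
import numpy as np
import scipy.sparse as sp

from lightfm import LightFM

# Toy data, made up purely for illustration: 4 users, 3 items, 2 item features.
interactions = sp.coo_matrix(
    np.array([[1, 0, 1],
              [0, 1, 0],
              [1, 1, 0],
              [0, 0, 1]], dtype=np.float32)
)
item_features = sp.csr_matrix(
    np.array([[1, 0],
              [0, 1],
              [1, 1]], dtype=np.float32)
)

model = LightFM(loss='warp', no_components=8)
model.fit(interactions, item_features=item_features, epochs=5)

# Each item's representation is the (feature-weighted) sum of its features' embeddings.
item_biases, item_embeddings = model.get_item_representations(features=item_features)
print(item_embeddings.shape)  # (3, 8)
```

With real data, the `interactions` and `item_features` matrices would normally be built with `lightfm.data.Dataset`; see the "Building datasets" example in the documentation.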
21 |
22 | ## Installation
23 | Install from `pip`:
24 | ```
25 | pip install lightfm
26 | ```
27 | or Conda:
28 | ```
29 | conda install -c conda-forge lightfm
30 | ```
31 |
32 | ## Quickstart
33 | Fitting an implicit feedback model on the MovieLens 100k dataset is very easy:
34 | ```python
35 | from lightfm import LightFM
36 | from lightfm.datasets import fetch_movielens
37 | from lightfm.evaluation import precision_at_k
38 |
39 | # Load the MovieLens 100k dataset. Only five
40 | # star ratings are treated as positive.
41 | data = fetch_movielens(min_rating=5.0)
42 |
43 | # Instantiate and train the model
44 | model = LightFM(loss='warp')
45 | model.fit(data['train'], epochs=30, num_threads=2)
46 |
47 | # Evaluate the trained model
48 | test_precision = precision_at_k(model, data['test'], k=5).mean()
49 | ```
50 |
51 | ## Articles and tutorials on using LightFM
52 | 1. [Learning to Rank Sketchfab Models with LightFM](http://blog.ethanrosenthal.com/2016/11/07/implicit-mf-part-2/)
53 | 2. [Metadata Embeddings for User and Item Cold-start Recommendations](http://building-babylon.net/2016/01/26/metadata-embeddings-for-user-and-item-cold-start-recommendations/)
54 | 3. [Recommendation Systems - Learn Python for Data Science](https://www.youtube.com/watch?v=9gBC9R-msAk)
55 | 4. [Using LightFM to Recommend Projects to Consultants](https://medium.com/product-at-catalant-technologies/using-lightfm-to-recommend-projects-to-consultants-44084df7321c#.gu887ky51)
56 |
57 | ## How to cite
58 | Please cite LightFM if it helps your research. You can use the following BibTeX entry:
59 | ```
60 | @inproceedings{DBLP:conf/recsys/Kula15,
61 |   author    = {Maciej Kula},
62 |   editor    = {Toine Bogers and
63 |                Marijn Koolen},
64 |   title     = {Metadata Embeddings for User and Item Cold-start Recommendations},
65 |   booktitle = {Proceedings of the 2nd Workshop on New Trends on Content-Based Recommender
66 |                Systems co-located with 9th {ACM} Conference on Recommender Systems
67 |                (RecSys 2015), Vienna, Austria, September 16-20, 2015.},
68 |   series    = {{CEUR} Workshop Proceedings},
69 |   volume    = {1448},
70 |   pages     = {14--21},
71 |   publisher = {CEUR-WS.org},
72 |   year      = {2015},
73 |   url       = {http://ceur-ws.org/Vol-1448/paper4.pdf},
74 | }
75 | ```
76 |
77 | ## Development
78 | Pull requests are welcome. To install for development:
79 |
80 | 1. Clone the repository: `git clone git@github.com:lyst/lightfm.git`
81 | 2. Set up a virtual environment: `cd lightfm && python3 -m venv venv && source ./venv/bin/activate`
82 | 3. Install it for development using pip: `pip install -e . && pip install -r test-requirements.txt`
83 | 4. You can run tests by running `./venv/bin/py.test tests`.
84 | 5. LightFM uses [black](https://github.com/ambv/black) to enforce code formatting and flake8 for linting, see `lint-requirements.txt`.
85 | 6. [Optional]: You can install pre-commit to enforce formatting and linting locally. Install with:
86 | ```bash
87 | pip install pre-commit
88 | pre-commit install
89 | ```
90 |
91 | When making changes to the `.pyx` extension files, you'll need to run `python setup.py cythonize` in order to produce the extension `.c` files before running `pip install -e .`.
92 |
--------------------------------------------------------------------------------
/changelog.md:
--------------------------------------------------------------------------------
1 | # Changelog
2 |
3 | ## [1.17][2023-03-19]
4 |
5 | ### Fixed
6 |
7 | - Re-Cythonized cython files to fix compilation errors with newer compilers.
8 | - Fixed `np.object` usage in tests.
9 |
10 | ## [1.16][2020-11-27]
11 |
12 | ### Added
13 | - Set the `LIGHTFM_NO_CFLAGS` environment variable when building LightFM to prevent it from setting
14 | `-ffast-math` or `-march=native` compiler flags.
15 |
16 | ### Changed
17 | - `predict` now returns float32 predictions.
18 |
19 | ## [1.15][2018-05-26]
20 | ### Added
21 | - Added a check that there is no overlap between test and train in `predict_ranks` (thanks to [@artdgn](https://github.com/artdgn)).
22 | - Added dataset builder functionality.
23 | ### Fixed
24 | - Fixed error message when item features have the wrong dimensions.
25 | - Predict now checks for overflow in inputs to predict.
26 | - WARP fitting is now numerically stable when there are very few items to
27 | draw negative samples from (< max_sampled).
28 |
29 | ## [1.14][2017-11-18]
30 | ### Added
31 | - added additional input checks for non-normal inputs (NaNs, infinities) for features
32 | - added additional input checks for non-normal inputs (NaNs, infinities) for interactions
33 | - cross validation module with dataset splitting utilities
34 | ### Changed
35 | - LightFM model now raises a ValueError (instead of assertion) when the number of supplied
36 | features exceeds the number of estimated feature embeddings.
37 | - Warn and delete downloaded file when the Movielens download is corrupted. This happens in the wild
38 | and confuses users terribly.
39 |
40 | ## [1.13][2017-05-20]
41 | ### Added
42 | - added get_{user/item}_representations functions to facilitate extracting the latent representations out of the model.
43 | ### Fixed
44 | - recall_at_k and precision_at_k now work correctly at k=1 (thanks to Zank Bennett).
45 | - Moved Movielens data to data release to prevent grouplens server flakiness from affecting users.
46 | - Fix segfault when trying to predict from a model that has not been fitted.
47 |
48 | ## [1.12][2017-01-26]
49 | ### Changed
50 | - Ranks are now computed pessimistically: when two items are tied, the positive item is assumed to have higher rank. This will lead to zero precision scores for models that predict all zeros, for example.
51 | - The model will raise a ValueError if, during fitting, any of the parameters become non-finite (NaN or +/- infinity).
52 | - Added mid-epoch regularization when a lot of regularization is used. This reduces the likelihood of numerical instability at high regularization rates.
53 |
54 |
55 | ## [1.11][2016-12-26]
56 | ### Changed
57 | - negative samples in BPR are now drawn from the empirical distributions of positives. This improves accuracy slightly on the Movielens 100k dataset.
58 |
59 | ### Fixed
60 | - incorrect calculation of BPR loss (thanks to @TimonVS for reporting this).
61 |
62 |
63 | ## [1.10][2016-11-25]
64 | ### Added
65 | - added recall@k evaluation function
66 | ### Fixed
67 | - added >=0.17.0 scipy dependency to setup.py
68 | - fixed segfaults when duplicate entries are present in input COO matrices (thanks to Florian
69 | Wilhelm for the bug report).
70 |
71 | ## [1.9][2016-05-25]
72 | ### Fixed
73 | - fixed gradient accumulation in adagrad (the feature value is now correctly used when accumulating gradient).
74 | Thanks to Benjamin Wilson for the bug report.
75 | - all interaction values greater than 0.0 are now treated as positives for ranking losses.
76 | ### Added
77 | - max_sampled hyperparameter for WARP losses.
This allows trading off accuracy for WARP training time: a smaller value 78 | will mean less negative sampling and faster training when the model is near the optimum. 79 | - Added a sample_weight argument to fit and fit_partial functions. A high value will now increase the size of the SGD step taken for that interaction. 80 | - Added an evaluation module for more efficient evaluation of learning-to-rank models. 81 | - Added a random_state keyword argument to LightFM to allow repeatable model runs. 82 | ### Changed 83 | - By default, an OpenMP-less version will be built on OSX. This allows much easier installation at the expense of 84 | performance. 85 | - The default value of the max_sampled argument is now 10. This represents a decent default value that allows fast training. 86 | 87 | ## [1.8][2016-01-14] 88 | ### Changed 89 | - fix scipy missing from requirements in setup.py 90 | - remove dependency on glibc by including a translation of the musl rand_r implementation 91 | 92 | ## [1.7][2015-10-14] 93 | ### Changed 94 | - fixed bug where item momentum would be incorrectly used in adadelta training for user features (thanks to Jong Wook Kim @jongwook for the bug report). 95 | - user and item features are now floats (instead of ints), allowing fractional feature weights to be used when fitting models. 96 | 97 | ## [1.6][2015-09-29] 98 | ### Changed 99 | - when installing into an Anaconda distribution, drop -march=native compiler flag 100 | due to assembler issues. 101 | - when installing on OSX, search macports and homebrew install location for gcc 102 | version 5.x 103 | 104 | ## [1.5][2015-09-24] 105 | ### Changed 106 | - when installing on OSX, search macports install location for gcc 107 | 108 | ## [1.4][2015-09-18] 109 | ### Changed 110 | - input matrices automatically converted to correct dtype if necessary 111 | -------------------------------------------------------------------------------- /doc/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # User-friendly check for sphinx-build 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don\'t have Sphinx installed, grab it from http://sphinx-doc.org/) 13 | endif 14 | 15 | # Internal variables. 16 | PAPEROPT_a4 = -D latex_paper_size=a4 17 | PAPEROPT_letter = -D latex_paper_size=letter 18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 19 | # the i18n builder cannot share the environment and doctrees with the others 20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
21 | 22 | .PHONY: help 23 | help: 24 | @echo "Please use \`make ' where is one of" 25 | @echo " html to make standalone HTML files" 26 | @echo " dirhtml to make HTML files named index.html in directories" 27 | @echo " singlehtml to make a single large HTML file" 28 | @echo " pickle to make pickle files" 29 | @echo " json to make JSON files" 30 | @echo " htmlhelp to make HTML files and a HTML help project" 31 | @echo " qthelp to make HTML files and a qthelp project" 32 | @echo " applehelp to make an Apple Help Book" 33 | @echo " devhelp to make HTML files and a Devhelp project" 34 | @echo " epub to make an epub" 35 | @echo " epub3 to make an epub3" 36 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 37 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 38 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 39 | @echo " text to make text files" 40 | @echo " man to make manual pages" 41 | @echo " texinfo to make Texinfo files" 42 | @echo " info to make Texinfo files and run them through makeinfo" 43 | @echo " gettext to make PO message catalogs" 44 | @echo " changes to make an overview of all changed/added/deprecated items" 45 | @echo " xml to make Docutils-native XML files" 46 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 47 | @echo " linkcheck to check all external links for integrity" 48 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 49 | @echo " coverage to run coverage check of the documentation (if enabled)" 50 | @echo " dummy to check syntax errors of document sources" 51 | 52 | .PHONY: apidoc 53 | apidoc: 54 | sphinx-apidoc -o . lightfm 55 | 56 | .PHONY: clean 57 | clean: 58 | rm -rf $(BUILDDIR)/* 59 | 60 | .PHONY: html 61 | html: 62 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 63 | @echo 64 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 65 | 66 | .PHONY: dirhtml 67 | dirhtml: 68 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 69 | @echo 70 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 71 | 72 | .PHONY: singlehtml 73 | singlehtml: 74 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 75 | @echo 76 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 77 | 78 | .PHONY: pickle 79 | pickle: 80 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 81 | @echo 82 | @echo "Build finished; now you can process the pickle files." 83 | 84 | .PHONY: json 85 | json: 86 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 87 | @echo 88 | @echo "Build finished; now you can process the JSON files." 89 | 90 | .PHONY: htmlhelp 91 | htmlhelp: 92 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 93 | @echo 94 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 95 | ".hhp project file in $(BUILDDIR)/htmlhelp." 96 | 97 | .PHONY: qthelp 98 | qthelp: 99 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 100 | @echo 101 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 102 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 103 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/LightFM.qhcp" 104 | @echo "To view the help file:" 105 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/LightFM.qhc" 106 | 107 | .PHONY: applehelp 108 | applehelp: 109 | $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp 110 | @echo 111 | @echo "Build finished. 
The help book is in $(BUILDDIR)/applehelp." 112 | @echo "N.B. You won't be able to view it unless you put it in" \ 113 | "~/Library/Documentation/Help or install it in your application" \ 114 | "bundle." 115 | 116 | .PHONY: devhelp 117 | devhelp: 118 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 119 | @echo 120 | @echo "Build finished." 121 | @echo "To view the help file:" 122 | @echo "# mkdir -p $$HOME/.local/share/devhelp/LightFM" 123 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/LightFM" 124 | @echo "# devhelp" 125 | 126 | .PHONY: epub 127 | epub: 128 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 129 | @echo 130 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 131 | 132 | .PHONY: epub3 133 | epub3: 134 | $(SPHINXBUILD) -b epub3 $(ALLSPHINXOPTS) $(BUILDDIR)/epub3 135 | @echo 136 | @echo "Build finished. The epub3 file is in $(BUILDDIR)/epub3." 137 | 138 | .PHONY: latex 139 | latex: 140 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 141 | @echo 142 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 143 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 144 | "(use \`make latexpdf' here to do that automatically)." 145 | 146 | .PHONY: latexpdf 147 | latexpdf: 148 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 149 | @echo "Running LaTeX files through pdflatex..." 150 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 151 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 152 | 153 | .PHONY: latexpdfja 154 | latexpdfja: 155 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 156 | @echo "Running LaTeX files through platex and dvipdfmx..." 157 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 158 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 159 | 160 | .PHONY: text 161 | text: 162 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 163 | @echo 164 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 165 | 166 | .PHONY: man 167 | man: 168 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 169 | @echo 170 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 171 | 172 | .PHONY: texinfo 173 | texinfo: 174 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 175 | @echo 176 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 177 | @echo "Run \`make' in that directory to run these through makeinfo" \ 178 | "(use \`make info' here to do that automatically)." 179 | 180 | .PHONY: info 181 | info: 182 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 183 | @echo "Running Texinfo files through makeinfo..." 184 | make -C $(BUILDDIR)/texinfo info 185 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 186 | 187 | .PHONY: gettext 188 | gettext: 189 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 190 | @echo 191 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 192 | 193 | .PHONY: changes 194 | changes: 195 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 196 | @echo 197 | @echo "The overview file is in $(BUILDDIR)/changes." 198 | 199 | .PHONY: linkcheck 200 | linkcheck: 201 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 202 | @echo 203 | @echo "Link check complete; look for any errors in the above output " \ 204 | "or in $(BUILDDIR)/linkcheck/output.txt." 
205 | 206 | .PHONY: doctest 207 | doctest: 208 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 209 | @echo "Testing of doctests in the sources finished, look at the " \ 210 | "results in $(BUILDDIR)/doctest/output.txt." 211 | 212 | .PHONY: coverage 213 | coverage: 214 | $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage 215 | @echo "Testing of coverage in the sources finished, look at the " \ 216 | "results in $(BUILDDIR)/coverage/python.txt." 217 | 218 | .PHONY: xml 219 | xml: 220 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 221 | @echo 222 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 223 | 224 | .PHONY: pseudoxml 225 | pseudoxml: 226 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 227 | @echo 228 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 229 | 230 | .PHONY: dummy 231 | dummy: 232 | $(SPHINXBUILD) -b dummy $(ALLSPHINXOPTS) $(BUILDDIR)/dummy 233 | @echo 234 | @echo "Build finished. Dummy builder generates no files." 235 | -------------------------------------------------------------------------------- /doc/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # LightFM documentation build configuration file, created by 4 | # sphinx-quickstart on Thu Apr 21 12:26:52 2016. 5 | # 6 | # This file is execfile()d with the current directory set to its 7 | # containing dir. 8 | # 9 | # Note that not all possible configuration values are present in this 10 | # autogenerated file. 11 | # 12 | # All configuration values have a default; values that are commented out 13 | # serve to show the default. 14 | 15 | import sys 16 | import os 17 | 18 | import lightfm 19 | import sphinx_rtd_theme 20 | 21 | 22 | # If extensions (or modules to document with autodoc) are in another directory, 23 | # add these directories to sys.path here. If the directory is relative to the 24 | # documentation root, use os.path.abspath to make it absolute, like shown here. 25 | sys.path.insert(0, os.path.abspath("..")) 26 | 27 | # -- General configuration ------------------------------------------------ 28 | 29 | # If your documentation needs a minimal Sphinx version, state it here. 30 | # needs_sphinx = '1.0' 31 | 32 | # Add any Sphinx extension module names here, as strings. They can be 33 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 34 | # ones. 35 | extensions = [ 36 | "sphinx.ext.autodoc", 37 | "sphinx.ext.doctest", 38 | "sphinx.ext.githubpages", 39 | "sphinx.ext.napoleon", 40 | "sphinx.ext.viewcode", 41 | 'sphinx_rtd_theme', 42 | ] 43 | 44 | # Add any paths that contain templates here, relative to this directory. 45 | templates_path = ["_templates"] 46 | 47 | # The suffix(es) of source filenames. 48 | # You can specify multiple suffix as a list of string: 49 | # source_suffix = ['.rst', '.md'] 50 | source_suffix = ".rst" 51 | 52 | # The encoding of source files. 53 | # source_encoding = 'utf-8-sig' 54 | 55 | # The master toctree document. 56 | master_doc = "index" 57 | 58 | # General information about the project. 59 | project = "LightFM" 60 | copyright = "2016, Lyst (Maciej Kula)" 61 | author = "Lyst (Maciej Kula)" 62 | 63 | # The version info for the project you're documenting, acts as replacement for 64 | # |version| and |release|, also used in various other places throughout the 65 | # built documents. 66 | # 67 | # The short X.Y version. 
68 | version = lightfm.__version__ 69 | # The full version, including alpha/beta/rc tags. 70 | release = lightfm.__version__ 71 | 72 | # The language for content autogenerated by Sphinx. Refer to documentation 73 | # for a list of supported languages. 74 | # 75 | # This is also used if you do content translation via gettext catalogs. 76 | # Usually you set "language" from the command line for these cases. 77 | language = None 78 | 79 | # There are two options for replacing |today|: either, you set today to some 80 | # non-false value, then it is used: 81 | # today = '' 82 | # Else, today_fmt is used as the format for a strftime call. 83 | # today_fmt = '%B %d, %Y' 84 | 85 | # List of patterns, relative to source directory, that match files and 86 | # directories to ignore when looking for source files. 87 | # This patterns also effect to html_static_path and html_extra_path 88 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] 89 | 90 | # The reST default role (used for this markup: `text`) to use for all 91 | # documents. 92 | # default_role = None 93 | 94 | # If true, '()' will be appended to :func: etc. cross-reference text. 95 | # add_function_parentheses = True 96 | 97 | # If true, the current module name will be prepended to all description 98 | # unit titles (such as .. function::). 99 | # add_module_names = True 100 | 101 | # If true, sectionauthor and moduleauthor directives will be shown in the 102 | # output. They are ignored by default. 103 | # show_authors = False 104 | 105 | # The name of the Pygments (syntax highlighting) style to use. 106 | pygments_style = "sphinx" 107 | 108 | # A list of ignored prefixes for module index sorting. 109 | # modindex_common_prefix = [] 110 | 111 | # If true, keep warnings as "system message" paragraphs in the built documents. 112 | # keep_warnings = False 113 | 114 | # If true, `todo` and `todoList` produce output, else they produce nothing. 115 | todo_include_todos = False 116 | 117 | 118 | # -- Options for HTML output ---------------------------------------------- 119 | 120 | # The theme to use for HTML and HTML Help pages. See the documentation for 121 | # a list of builtin themes. 122 | html_theme = "sphinx_rtd_theme" 123 | 124 | # Theme options are theme-specific and customize the look and feel of a theme 125 | # further. For a list of options available for each theme, see the 126 | # documentation. 127 | # html_theme_options = {} 128 | 129 | # Add any paths that contain custom themes here, relative to this directory. 130 | # html_theme_path = [] 131 | 132 | # The name for this set of Sphinx documents. 133 | # " v documentation" by default. 134 | # html_title = u'LightFM v1.8' 135 | 136 | # A shorter title for the navigation bar. Default is the same as html_title. 137 | # html_short_title = None 138 | 139 | # The name of an image file (relative to this directory) to place at the top 140 | # of the sidebar. 141 | # html_logo = None 142 | 143 | # The name of an image file (relative to this directory) to use as a favicon of 144 | # the docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 145 | # pixels large. 146 | # html_favicon = None 147 | 148 | # Add any paths that contain custom static files (such as style sheets) here, 149 | # relative to this directory. They are copied after the builtin static files, 150 | # so a file named "default.css" will overwrite the builtin "default.css". 
151 | html_static_path = [] 152 | 153 | # Add any extra paths that contain custom files (such as robots.txt or 154 | # .htaccess) here, relative to this directory. These files are copied 155 | # directly to the root of the documentation. 156 | # html_extra_path = [] 157 | 158 | # If not None, a 'Last updated on:' timestamp is inserted at every page 159 | # bottom, using the given strftime format. 160 | # The empty string is equivalent to '%b %d, %Y'. 161 | # html_last_updated_fmt = None 162 | 163 | # If true, SmartyPants will be used to convert quotes and dashes to 164 | # typographically correct entities. 165 | # html_use_smartypants = True 166 | 167 | # Custom sidebar templates, maps document names to template names. 168 | # html_sidebars = {} 169 | 170 | # Additional templates that should be rendered to pages, maps page names to 171 | # template names. 172 | # html_additional_pages = {} 173 | 174 | # If false, no module index is generated. 175 | # html_domain_indices = True 176 | 177 | # If false, no index is generated. 178 | # html_use_index = True 179 | 180 | # If true, the index is split into individual pages for each letter. 181 | # html_split_index = False 182 | 183 | # If true, links to the reST sources are added to the pages. 184 | # html_show_sourcelink = True 185 | 186 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 187 | # html_show_sphinx = True 188 | 189 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 190 | # html_show_copyright = True 191 | 192 | # If true, an OpenSearch description file will be output, and all pages will 193 | # contain a tag referring to it. The value of this option must be the 194 | # base URL from which the finished HTML is served. 195 | # html_use_opensearch = '' 196 | 197 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 198 | # html_file_suffix = None 199 | 200 | # Language to be used for generating the HTML full-text search index. 201 | # Sphinx supports the following languages: 202 | # 'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja' 203 | # 'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr', 'zh' 204 | # html_search_language = 'en' 205 | 206 | # A dictionary with options for the search language support, empty by default. 207 | # 'ja' uses this config value. 208 | # 'zh' user can custom change `jieba` dictionary path. 209 | # html_search_options = {'type': 'default'} 210 | 211 | # The name of a javascript file (relative to the configuration directory) that 212 | # implements a search results scorer. If empty, the default will be used. 213 | # html_search_scorer = 'scorer.js' 214 | 215 | # Output file base name for HTML help builder. 216 | htmlhelp_basename = "LightFMdoc" 217 | 218 | # -- Options for LaTeX output --------------------------------------------- 219 | 220 | latex_elements = { 221 | # The paper size ('letterpaper' or 'a4paper'). 222 | #'papersize': 'letterpaper', 223 | # The font size ('10pt', '11pt' or '12pt'). 224 | #'pointsize': '10pt', 225 | # Additional stuff for the LaTeX preamble. 226 | #'preamble': '', 227 | # Latex figure (float) alignment 228 | #'figure_align': 'htbp', 229 | } 230 | 231 | # Grouping the document tree into LaTeX files. List of tuples 232 | # (source start file, target name, title, 233 | # author, documentclass [howto, manual, or own class]). 
234 | latex_documents = [ 235 | ( 236 | master_doc, 237 | "LightFM.tex", 238 | "LightFM Documentation", 239 | "Lyst (Maciej Kula)", 240 | "manual", 241 | ), 242 | ] 243 | 244 | # The name of an image file (relative to this directory) to place at the top of 245 | # the title page. 246 | # latex_logo = None 247 | 248 | # For "manual" documents, if this is true, then toplevel headings are parts, 249 | # not chapters. 250 | # latex_use_parts = False 251 | 252 | # If true, show page references after internal links. 253 | # latex_show_pagerefs = False 254 | 255 | # If true, show URL addresses after external links. 256 | # latex_show_urls = False 257 | 258 | # Documents to append as an appendix to all manuals. 259 | # latex_appendices = [] 260 | 261 | # If false, no module index is generated. 262 | # latex_domain_indices = True 263 | 264 | 265 | # -- Options for manual page output --------------------------------------- 266 | 267 | # One entry per manual page. List of tuples 268 | # (source start file, name, description, authors, manual section). 269 | man_pages = [(master_doc, "lightfm", "LightFM Documentation", [author], 1)] 270 | 271 | # If true, show URL addresses after external links. 272 | # man_show_urls = False 273 | 274 | 275 | # -- Options for Texinfo output ------------------------------------------- 276 | 277 | # Grouping the document tree into Texinfo files. List of tuples 278 | # (source start file, target name, title, author, 279 | # dir menu entry, description, category) 280 | texinfo_documents = [ 281 | ( 282 | master_doc, 283 | "LightFM", 284 | "LightFM Documentation", 285 | author, 286 | "LightFM", 287 | "One line description of project.", 288 | "Miscellaneous", 289 | ), 290 | ] 291 | 292 | # Documents to append as an appendix to all manuals. 293 | # texinfo_appendices = [] 294 | 295 | # If false, no module index is generated. 296 | # texinfo_domain_indices = True 297 | 298 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 299 | # texinfo_show_urls = 'footnote' 300 | 301 | # If true, do not generate a @detailmenu in the "Top" node's menu. 302 | # texinfo_no_detailmenu = False 303 | 304 | # Compact attribute lists 305 | napoleon_use_ivar = True 306 | -------------------------------------------------------------------------------- /doc/cross_validation.rst: -------------------------------------------------------------------------------- 1 | Cross-validation 2 | ================ 3 | 4 | .. automodule:: lightfm.cross_validation 5 | :members: 6 | :undoc-members: 7 | -------------------------------------------------------------------------------- /doc/datasets.rst: -------------------------------------------------------------------------------- 1 | Datasets 2 | =============== 3 | 4 | .. autofunction:: lightfm.datasets.movielens.fetch_movielens 5 | .. autofunction:: lightfm.datasets.stackexchange.fetch_stackexchange 6 | -------------------------------------------------------------------------------- /doc/examples.rst: -------------------------------------------------------------------------------- 1 | Examples 2 | ========================= 3 | 4 | Many of the examples can be viewed (and run) as Jupyter notebooks in the `examples directory `_ of the LightFM repository. 5 | 6 | .. 
toctree::
7 |    :maxdepth: 2
8 |
9 |    Movielens implicit feedback recommender <examples/movielens_implicit>
10 |    Learning rate schedules <examples/learning_schedules>
11 |    Cold-start hybrid recommender <examples/hybrid_crossvalidated>
12 |    Learning-to-rank using WARP loss <examples/warp_loss>
13 |    Building datasets <examples/dataset>
14 |
--------------------------------------------------------------------------------
/doc/examples/dataset.rst:
--------------------------------------------------------------------------------
1 | Building datasets
2 | =================
3 |
4 | In this example, we'll use LightFM's built-in ``Dataset`` class to build
5 | an interaction dataset from raw data. The goal is to demonstrate how to
6 | go from raw data (lists of interactions and perhaps item and user
7 | features) to ``scipy.sparse`` matrices that can be used to fit a LightFM
8 | model.
9 |
10 | Getting the data
11 | ----------------
12 |
13 | We're going to use the
14 | `Book-Crossing dataset <http://www2.informatik.uni-freiburg.de/~cziegler/BX/>`__ as our
15 | example data. Let's download it first.
16 |
17 | .. code:: python
18 |
19 |     import os
20 |     import zipfile
21 |     import csv
22 |
23 |     import requests
24 |
25 |
26 |     def _download(url: str, dest_path: str):
27 |
28 |         req = requests.get(url, stream=True)
29 |         req.raise_for_status()
30 |
31 |         with open(dest_path, "wb") as fd:
32 |             for chunk in req.iter_content(chunk_size=2 ** 20):
33 |                 fd.write(chunk)
34 |
35 |
36 |     def get_data():
37 |
38 |         ratings_url = ("http://www2.informatik.uni-freiburg.de/" "~cziegler/BX/BX-CSV-Dump.zip")
39 |
40 |         if not os.path.exists("data"):
41 |             os.makedirs("data")
42 |
43 |             _download(ratings_url, "data/data.zip")
44 |
45 |         with zipfile.ZipFile("data/data.zip") as archive:
46 |             return (
47 |                 csv.DictReader(
48 |                     (x.decode("utf-8", "ignore") for x in archive.open("BX-Book-Ratings.csv")),
49 |                     delimiter=";",
50 |                 ),
51 |                 csv.DictReader(
52 |                     (x.decode("utf-8", "ignore") for x in archive.open("BX-Books.csv")), delimiter=";"
53 |                 ),
54 |             )
55 |
56 |
57 |     def get_ratings():
58 |
59 |         return get_data()[0]
60 |
61 |
62 |     def get_book_features():
63 |
64 |         return get_data()[1]
65 |
66 | The data consists of book ratings and book details:
67 |
68 | .. code:: python
69 |
70 |     import json
71 |     from itertools import islice
72 |
73 |     ratings, book_features = get_data()
74 |
75 | Ratings look like this:
76 |
77 | .. code:: python
78 |
79 |     for line in islice(ratings, 2):
80 |         print(json.dumps(line, indent=4))
81 |
82 | ::
83 |
84 |     {
85 |         "User-ID": "276725",
86 |         "ISBN": "034545104X",
87 |         "Book-Rating": "0"
88 |     }
89 |     {
90 |         "User-ID": "276726",
91 |         "ISBN": "0155061224",
92 |         "Book-Rating": "5"
93 |     }
94 |
95 | and book features look like this:
96 |
97 | .. code:: python
98 |
99 |     for line in islice(book_features, 1):
100 |         print(json.dumps(line, indent=4))
101 |
102 | ::
103 |
104 |     {
105 |         "ISBN": "0195153448",
106 |         "Book-Title": "Classical Mythology",
107 |         "Book-Author": "Mark P. O. Morford",
108 |         "Year-Of-Publication": "2002",
109 |         "Publisher": "Oxford University Press",
110 |         "Image-URL-S":
111 |     "http://images.amazon.com/images/P/0195153448.01.THUMBZZZ.jpg",
112 |         "Image-URL-M":
113 |     "http://images.amazon.com/images/P/0195153448.01.MZZZZZZZ.jpg",
114 |         "Image-URL-L":
115 |     "http://images.amazon.com/images/P/0195153448.01.LZZZZZZZ.jpg"
116 |     }
117 |
118 | Building the ID mappings
119 | ------------------------
120 |
121 | The first thing we need to do is to create a mapping between the user
122 | and item ids from our input data to indices that will be used internally
123 | by our model.
124 |
125 | We do this because LightFM works with user and item ids that are
126 | consecutive non-negative integers. The ``Dataset`` class allows us to
127 | create a mapping between the IDs we use in our systems and the
128 | consecutive indices preferred by the model.
129 |
130 | To do this, we create a dataset and call its ``fit`` method. The first
131 | argument is an iterable of all user ids in our data, and the second is
132 | an iterable of all item ids. In this case, we use generator expressions
133 | to lazily iterate over our data and yield user and item ids:
134 |
135 | .. code:: python
136 |
137 |     from lightfm.data import Dataset
138 |
139 |     dataset = Dataset()
140 |     dataset.fit((x['User-ID'] for x in get_ratings()),
141 |                 (x['ISBN'] for x in get_ratings()))
142 |
143 | This call will assign an internal numerical id to every user and item id
144 | we pass in. These will be contiguous (from 0 to however many users and
145 | items we have), and will also determine the dimensions of the resulting
146 | LightFM model.
147 |
148 | We can check that the mappings have been created by querying the dataset
149 | on how many users and books it knows about:
150 |
151 | .. code:: python
152 |
153 |     num_users, num_items = dataset.interactions_shape()
154 |     print('Num users: {}, num_items {}.'.format(num_users, num_items))
155 |
156 | ::
157 |
158 |     Num users: 105283, num_items 340553.
159 |
160 | Note that if we don't have all user and item ids at once, we can
161 | repeatedly call ``fit_partial`` to supply additional ids. In this case,
162 | we will use this capability to add some item feature mappings:
163 |
164 | .. code:: python
165 |
166 |     dataset.fit_partial(items=(x['ISBN'] for x in get_book_features()),
167 |                         item_features=(x['Book-Author'] for x in get_book_features()))
168 |
169 | This will create a feature for every unique author name in the dataset.
170 |
171 | (Note that we fit some more item ids: this is to make sure our mappings
172 | are complete even if there are items in the features dataset that are
173 | not in the interactions set.)
174 |
175 | Building the interactions matrix
176 | --------------------------------
177 |
178 | Having created the mapping, we build the interaction matrix:
179 |
180 | .. code:: python
181 |
182 |     (interactions, weights) = dataset.build_interactions(((x['User-ID'], x['ISBN'])
183 |                                                           for x in get_ratings()))
184 |
185 |     print(repr(interactions))
186 |
187 | ::
188 |
189 |     <105283x341762 sparse matrix of type '<class 'numpy.int32'>'
190 |         with 1149780 stored elements in COOrdinate format>
191 |
192 | This is the main input into a LightFM model: it encodes the interactions
193 | between users and items.
194 |
195 | Since we have item features, we can also create the item features
196 | matrix:
197 |
198 | .. code:: python
199 |
200 |     item_features = dataset.build_item_features(((x['ISBN'], [x['Book-Author']])
201 |                                                  for x in get_book_features()))
202 |     print(repr(item_features))
203 |
204 | ::
205 |
206 |     <341762x443805 sparse matrix of type '<class 'numpy.float32'>'
207 |         with 613141 stored elements in Compressed Sparse Row format>
208 |
209 | Building a model
210 | ----------------
211 |
212 | This is all we need to build a LightFM model:
213 |
214 | .. code:: python
code:: python 215 | 216 | from lightfm import LightFM 217 | 218 | model = LightFM(loss='bpr') 219 | model.fit(interactions, item_features=item_features) 220 | 221 | :: 222 | 223 | 224 | -------------------------------------------------------------------------------- /doc/examples/hybrid_crossvalidated.rst: -------------------------------------------------------------------------------- 1 | 2 | Item cold-start: recommending StackExchange questions 3 | ===================================================== 4 | 5 | In this example we'll use the StackExchange dataset to explore 6 | recommendations under item-cold start. Data dumps from the StackExchange 7 | network are available at https://archive.org/details/stackexchange, and 8 | we'll use one of them --- for stats.stackexchange.com --- here. 9 | 10 | The consists of users answering questions: in the user-item interaction 11 | matrix, each user is a row, and each question is a column. Based on 12 | which users answered which questions in the training set, we'll try to 13 | recommend new questions in the training set. 14 | 15 | Let's start by loading the data. We'll use the ``datasets`` module. 16 | 17 | .. code:: python 18 | 19 | import numpy as np 20 | 21 | from lightfm.datasets import fetch_stackexchange 22 | 23 | data = fetch_stackexchange('crossvalidated', 24 | test_set_fraction=0.1, 25 | indicator_features=False, 26 | tag_features=True) 27 | 28 | train = data['train'] 29 | test = data['test'] 30 | 31 | Let's examine the data: 32 | 33 | .. code:: python 34 | 35 | print('The dataset has %s users and %s items, ' 36 | 'with %s interactions in the test and %s interactions in the training set.' 37 | % (train.shape[0], train.shape[1], test.getnnz(), train.getnnz())) 38 | 39 | 40 | .. parsed-literal:: 41 | 42 | The dataset has 3221 users and 72360 items, with 4307 interactions in the test and 57830 interactions in the training set. 43 | 44 | 45 | The training and test set are divided chronologically: the test set 46 | contains the 10% of interactions that happened after the 90% in the 47 | training set. This means that many of the questions in the test set have 48 | no interactions. This is an accurate description of a questions 49 | answering system: it is most important to recommend questions that have 50 | not yet been answered to the expert users who can answer them. 51 | 52 | A pure collaborative filtering model 53 | ------------------------------------ 54 | 55 | This is clearly a cold-start scenario, and so we can expect a 56 | traditional collaborative filtering model to do very poorly. Let's check 57 | if that's the case: 58 | 59 | .. code:: python 60 | 61 | # Import the model 62 | from lightfm import LightFM 63 | 64 | # Set the number of threads; you can increase this 65 | # if you have more physical cores available. 66 | NUM_THREADS = 2 67 | NUM_COMPONENTS = 30 68 | NUM_EPOCHS = 3 69 | ITEM_ALPHA = 1e-6 70 | 71 | # Let's fit a WARP model: these generally have the best performance. 72 | model = LightFM(loss='warp', 73 | item_alpha=ITEM_ALPHA, 74 | no_components=NUM_COMPONENTS) 75 | 76 | # Run 3 epochs and time it. 77 | %time model = model.fit(train, epochs=NUM_EPOCHS, num_threads=NUM_THREADS) 78 | 79 | 80 | .. parsed-literal:: 81 | 82 | CPU times: user 12.9 s, sys: 8 ms, total: 12.9 s 83 | Wall time: 6.52 s 84 | 85 | 86 | As a means of sanity checking, let's calculate the model's AUC on the 87 | training set first. 
If it's reasonably high, we can be sure that the 88 | model is not doing anything stupid and is fitting the training data 89 | well. 90 | 91 | .. code:: python 92 | 93 | # Import the evaluation routines 94 | from lightfm.evaluation import auc_score 95 | 96 | # Compute and print the AUC score 97 | train_auc = auc_score(model, train, num_threads=NUM_THREADS).mean() 98 | print('Collaborative filtering train AUC: %s' % train_auc) 99 | 100 | 101 | .. parsed-literal:: 102 | 103 | Collaborative filtering train AUC: 0.887519 104 | 105 | 106 | Fantastic, the model is fitting the training set well. But what about 107 | the test set? 108 | 109 | .. code:: python 110 | 111 | # We pass in the train interactions to exclude them from predictions. 112 | # This is to simulate a recommender system where we do not 113 | # re-recommend things the user has already interacted with in the train 114 | # set. 115 | test_auc = auc_score(model, test, train_interactions=train, num_threads=NUM_THREADS).mean() 116 | print('Collaborative filtering test AUC: %s' % test_auc) 117 | 118 | 119 | .. parsed-literal:: 120 | 121 | Collaborative filtering test AUC: 0.34728 122 | 123 | 124 | This is terrible: we do worse than random! This is not very surprising: 125 | as there is no training data for the majority of the test questions, the 126 | model cannot compute reasonable representations of the test set items. 127 | 128 | The fact that we score them lower than other items (AUC < 0.5) is due to 129 | estimated per-item biases, which can be confirmed by setting them to 130 | zero and re-evaluating the model. 131 | 132 | .. code:: python 133 | 134 | # Set biases to zero 135 | model.item_biases *= 0.0 136 | 137 | test_auc = auc_score(model, test, train_interactions=train, num_threads=NUM_THREADS, check_intersections=False).mean() 138 | print('Collaborative filtering test AUC: %s' % test_auc) 139 | 140 | 141 | .. parsed-literal:: 142 | 143 | Collaborative filtering test AUC: 0.496266 144 | 145 | 146 | A hybrid model 147 | -------------- 148 | 149 | We can do much better by employing LightFM's hybrid model capabilities. 150 | The StackExchange data comes with content information in the form of 151 | tags users apply to their questions: 152 | 153 | .. code:: python 154 | 155 | item_features = data['item_features'] 156 | tag_labels = data['item_feature_labels'] 157 | 158 | print('There are %s distinct tags, with values like %s.' % (item_features.shape[1], tag_labels[:3].tolist())) 159 | 160 | 161 | .. parsed-literal:: 162 | 163 | There are 1246 distinct tags, with values like [u'bayesian', u'prior', u'elicitation']. 164 | 165 | 166 | We can use these features (instead of an identity feature matrix like in 167 | a pure CF model) to estimate a model which will generalize better to 168 | unseen examples: it will simply use its representations of item features 169 | to infer representations of previously unseen questions. 170 | 171 | Let's go ahead and fit a model of this type. 172 | 173 | .. code:: python 174 | 175 | # Define a new model instance 176 | model = LightFM(loss='warp', 177 | item_alpha=ITEM_ALPHA, 178 | no_components=NUM_COMPONENTS) 179 | 180 | # Fit the hybrid model. Note that this time, we pass 181 | # in the item features matrix. 182 | model = model.fit(train, 183 | item_features=item_features, 184 | epochs=NUM_EPOCHS, 185 | num_threads=NUM_THREADS) 186 | 187 | As before, let's sanity check the model on the training set. 188 | 189 | .. code:: python 190 | 191 | # Don't forget the pass in the item features again! 
192 | train_auc = auc_score(model, 193 | train, 194 | item_features=item_features, 195 | num_threads=NUM_THREADS).mean() 196 | print('Hybrid training set AUC: %s' % train_auc) 197 | 198 | 199 | .. parsed-literal:: 200 | 201 | Hybrid training set AUC: 0.86049 202 | 203 | 204 | Note that the training set AUC is lower than in a pure CF model. This is 205 | fine: by using a lower-rank item feature matrix, we have effectively 206 | regularized the model, giving it less freedom to fit the training data. 207 | 208 | Despite this the model does much better on the test set: 209 | 210 | .. code:: python 211 | 212 | test_auc = auc_score(model, 213 | test, 214 | train_interactions=train, 215 | item_features=item_features, 216 | num_threads=NUM_THREADS, 217 | check_intersections=False).mean() 218 | print('Hybrid test set AUC: %s' % test_auc) 219 | 220 | 221 | .. parsed-literal:: 222 | 223 | Hybrid test set AUC: 0.703039 224 | 225 | 226 | This is as expected: because items in the test set share tags with items 227 | in the training set, we can provide better test set recommendations by 228 | using the tag representations learned from training. 229 | 230 | Bonus: tag embeddings 231 | --------------------- 232 | 233 | One of the nice properties of the hybrid model is that the estimated tag 234 | embeddings capture semantic characteristics of the tags. Like the 235 | word2vec model, we can use this property to explore semantic tag 236 | similarity: 237 | 238 | .. code:: python 239 | 240 | def get_similar_tags(model, tag_id): 241 | # Define similarity as the cosine of the angle 242 | # between the tag latent vectors 243 | 244 | # Normalize the vectors to unit length 245 | tag_embeddings = (model.item_embeddings.T 246 | / np.linalg.norm(model.item_embeddings, axis=1)).T 247 | 248 | query_embedding = tag_embeddings[tag_id] 249 | similarity = np.dot(tag_embeddings, query_embedding) 250 | most_similar = np.argsort(-similarity)[1:4] 251 | 252 | return most_similar 253 | 254 | 255 | for tag in (u'bayesian', u'regression', u'survival'): 256 | tag_id = tag_labels.tolist().index(tag) 257 | print('Most similar tags for %s: %s' % (tag_labels[tag_id], 258 | tag_labels[get_similar_tags(model, tag_id)])) 259 | 260 | 261 | .. parsed-literal:: 262 | 263 | Most similar tags for bayesian: [u'posterior' u'mcmc' u'bayes'] 264 | Most similar tags for regression: [u'multicollinearity' u'stepwise-regression' u'multiple-regression'] 265 | Most similar tags for survival: [u'cox-model' u'kaplan-meier' u'odds-ratio'] 266 | -------------------------------------------------------------------------------- /doc/examples/learning_schedules.rst: -------------------------------------------------------------------------------- 1 | 2 | Using different learning schedules 3 | ================================== 4 | 5 | ``lightfm`` implements two learning schedules: adagrad and adadelta. 6 | Neither is clearly superior, and, like other hyperparameter choices, the 7 | best learning schedule will differ based on the problem at hand. 8 | 9 | This example tries both at the Movielens 100k dataset. 10 | 11 | Preliminaries 12 | ------------- 13 | 14 | Let's first get the data and define the evaluations functions. 15 | 16 | .. 
code:: python 17 | 18 | import numpy as np 19 | import data 20 | 21 | %matplotlib inline 22 | 23 | import matplotlib 24 | import numpy as np 25 | import matplotlib.pyplot as plt 26 | 27 | from lightfm import LightFM 28 | from lightfm.datasets import fetch_movielens 29 | from lightfm.evaluation import auc_score 30 | 31 | movielens = fetch_movielens() 32 | 33 | train, test = movielens['train'], movielens['test'] 34 | 35 | Experiment 36 | ---------- 37 | 38 | To evaluate the performance of both learning schedules, let's create two 39 | models and run each for a number of epochs, measuring the ROC AUC on the 40 | test set at the end of each epoch. 41 | 42 | .. code:: python 43 | 44 | alpha = 1e-3 45 | epochs = 70 46 | 47 | adagrad_model = LightFM(no_components=30, 48 | loss='warp', 49 | learning_schedule='adagrad', 50 | user_alpha=alpha, 51 | item_alpha=alpha) 52 | adadelta_model = LightFM(no_components=30, 53 | loss='warp', 54 | learning_schedule='adadelta', 55 | user_alpha=alpha, 56 | item_alpha=alpha) 57 | 58 | adagrad_auc = [] 59 | 60 | for epoch in range(epochs): 61 | adagrad_model.fit_partial(train, epochs=1) 62 | adagrad_auc.append(auc_score(adagrad_model, test).mean()) 63 | 64 | 65 | adadelta_auc = [] 66 | 67 | for epoch in range(epochs): 68 | adadelta_model.fit_partial(train, epochs=1) 69 | adadelta_auc.append(auc_score(adadelta_model, test).mean()) 70 | 71 | It looks like the adadelta gets to a better result at the beginning of 72 | training. However, as we keep running more epochs adagrad wins out, 73 | converging to a better final solution. 74 | 75 | .. code:: python 76 | 77 | x = np.arange(len(adagrad_auc)) 78 | plt.plot(x, np.array(adagrad_auc)) 79 | plt.plot(x, np.array(adadelta_auc)) 80 | plt.legend(['adagrad', 'adadelta'], loc='lower right') 81 | plt.show() 82 | 83 | 84 | 85 | .. image:: learning_schedules_files/learning_schedules_5_0.png 86 | 87 | 88 | We can try the same for the k-OS loss. 89 | 90 | .. code:: python 91 | 92 | alpha = 1e-3 93 | epochs = 70 94 | 95 | adagrad_model = LightFM(no_components=30, 96 | loss='warp-kos', 97 | learning_schedule='adagrad', 98 | user_alpha=alpha, item_alpha=alpha) 99 | adadelta_model = LightFM(no_components=30, 100 | loss='warp-kos', 101 | learning_schedule='adadelta', 102 | user_alpha=alpha, item_alpha=alpha) 103 | 104 | adagrad_auc = [] 105 | 106 | for epoch in range(epochs): 107 | adagrad_model.fit_partial(train, epochs=1) 108 | adagrad_auc.append(auc_score(adagrad_model, test).mean()) 109 | 110 | 111 | adadelta_auc = [] 112 | 113 | for epoch in range(epochs): 114 | adadelta_model.fit_partial(train, epochs=1) 115 | adadelta_auc.append(auc_score(adadelta_model, test).mean()) 116 | 117 | .. code:: python 118 | 119 | x = np.arange(len(adagrad_auc)) 120 | plt.plot(x, np.array(adagrad_auc)) 121 | plt.plot(x, np.array(adadelta_auc)) 122 | plt.legend(['adagrad', 'adadelta'], loc='lower right') 123 | plt.show() 124 | 125 | 126 | 127 | .. 
image:: learning_schedules_files/learning_schedules_8_0.png 128 | -------------------------------------------------------------------------------- /doc/examples/learning_schedules_files/learning_schedules_5_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lyst/lightfm/0c9c31e027b976beab2385e268b58010fff46096/doc/examples/learning_schedules_files/learning_schedules_5_0.png -------------------------------------------------------------------------------- /doc/examples/learning_schedules_files/learning_schedules_8_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lyst/lightfm/0c9c31e027b976beab2385e268b58010fff46096/doc/examples/learning_schedules_files/learning_schedules_8_0.png -------------------------------------------------------------------------------- /doc/examples/movielens_implicit.rst: -------------------------------------------------------------------------------- 1 | 2 | An implicit feedback recommender for the Movielens dataset 3 | ========================================================== 4 | 5 | Implicit feedback 6 | ----------------- 7 | 8 | For some time, the recommender system literature focused on explicit 9 | feedback: the Netflix prize focused on accurately reproducing the 10 | ratings users have given to movies they watched. 11 | 12 | Focusing on ratings in this way ignored the importance of taking into 13 | account which movies the users chose to watch in the first place, and 14 | treating the absence of ratings as absence of information. 15 | 16 | But the things that we don't have ratings for aren't unknowns: we know 17 | the user didn't pick them. This reflects a user's conscious choice, and 18 | is a good source of information on what she thinks she might like. 19 | 20 | This sort of phenomenon is described as data which is 21 | missing-not-at-random in the literature: the ratings that are missing 22 | are more likely to be negative precisely because the user chooses which 23 | items to rate. When choosing a restaurant, you only go to places which 24 | you think you'll enjoy, and never go to places that you think you'll 25 | hate. What this leads to is that you're only going to be submitting 26 | ratings for things which, a priori, you expected to like; the things 27 | that you expect you will not like you will never rate. 28 | 29 | This observation has led to the development of models that are suitable 30 | for implicit feedback. LightFM implements two that have proven 31 | particular successful: 32 | 33 | - BPR: Bayesian Personalised Ranking [1] pairwise loss. Maximises the 34 | prediction difference between a positive example and a randomly 35 | chosen negative example. Useful when only positive interactions are 36 | present and optimising ROC AUC is desired. 37 | - WARP: Weighted Approximate-Rank Pairwise [2] loss. Maximises the rank 38 | of positive examples by repeatedly sampling negative examples until 39 | rank violating one is found. Useful when only positive interactions 40 | are present and optimising the top of the recommendation list 41 | (precision@k) is desired. 42 | 43 | This example shows how to estimate these models on the Movielens 44 | dataset. 45 | 46 | [1] Rendle, Steffen, et al. "BPR: Bayesian personalized ranking from 47 | implicit feedback." Proceedings of the Twenty-Fifth Conference on 48 | Uncertainty in Artificial Intelligence. AUAI Press, 2009. 
49 | 50 | [2] Weston, Jason, Samy Bengio, and Nicolas Usunier. "Wsabie: Scaling up 51 | to large vocabulary image annotation." IJCAI. Vol. 11. 2011. 52 | 53 | Getting the data 54 | ---------------- 55 | 56 | The first step is to get the `Movielens 57 | data `__. This is a 58 | classic small recommender dataset, consisting of around 950 users, 1700 59 | movies, and 100,000 ratings. The ratings are on a scale from 1 to 5, but 60 | we'll all treat them as implicit positive feedback in this example. 61 | 62 | Fortunately, this is one of the functions provided by LightFM itself. 63 | 64 | .. code:: python 65 | 66 | import numpy as np 67 | 68 | from lightfm.datasets import fetch_movielens 69 | 70 | movielens = fetch_movielens() 71 | 72 | This gives us a dictionary with the following fields: 73 | 74 | .. code:: python 75 | 76 | for key, value in movielens.items(): 77 | print(key, type(value), value.shape) 78 | 79 | 80 | .. parsed-literal:: 81 | 82 | ('test', , (943, 1682)) 83 | ('item_features', , (1682, 1682)) 84 | ('train', , (943, 1682)) 85 | ('item_labels', , (1682,)) 86 | ('item_feature_labels', , (1682,)) 87 | 88 | 89 | .. code:: python 90 | 91 | train = movielens['train'] 92 | test = movielens['test'] 93 | 94 | The ``train`` and ``test`` elements are the most important: they contain 95 | the raw rating data, split into a train and a test set. Each row 96 | represents a user, and each column an item. Entries are ratings from 1 97 | to 5. 98 | 99 | Fitting models 100 | -------------- 101 | 102 | Now let's train a BPR model and look at its accuracy. 103 | 104 | We'll use two metrics of accuracy: precision@k and ROC AUC. Both are 105 | ranking metrics: to compute them, we'll be constructing recommendation 106 | lists for all of our users, and checking the ranking of known positive 107 | movies. For precision at k we'll be looking at whether they are within 108 | the first k results on the list; for AUC, we'll be calculating the 109 | probability that any known positive is higher on the list than a random 110 | negative example. 111 | 112 | .. code:: python 113 | 114 | from lightfm import LightFM 115 | from lightfm.evaluation import precision_at_k 116 | from lightfm.evaluation import auc_score 117 | 118 | model = LightFM(learning_rate=0.05, loss='bpr') 119 | model.fit(train, epochs=10) 120 | 121 | train_precision = precision_at_k(model, train, k=10).mean() 122 | test_precision = precision_at_k(model, test, k=10).mean() 123 | 124 | train_auc = auc_score(model, train).mean() 125 | test_auc = auc_score(model, test).mean() 126 | 127 | print('Precision: train %.2f, test %.2f.' % (train_precision, test_precision)) 128 | print('AUC: train %.2f, test %.2f.' % (train_auc, test_auc)) 129 | 130 | 131 | .. parsed-literal:: 132 | 133 | Precision: train 0.59, test 0.10. 134 | AUC: train 0.90, test 0.86. 135 | 136 | 137 | The WARP model, on the other hand, optimises for precision@k---we should 138 | expect its performance to be better on precision. 139 | 140 | .. code:: python 141 | 142 | model = LightFM(learning_rate=0.05, loss='warp') 143 | 144 | model.fit_partial(train, epochs=10) 145 | 146 | train_precision = precision_at_k(model, train, k=10).mean() 147 | test_precision = precision_at_k(model, test, k=10).mean() 148 | 149 | train_auc = auc_score(model, train).mean() 150 | test_auc = auc_score(model, test).mean() 151 | 152 | print('Precision: train %.2f, test %.2f.' % (train_precision, test_precision)) 153 | print('AUC: train %.2f, test %.2f.' % (train_auc, test_auc)) 154 | 155 | 156 | .. 
parsed-literal:: 157 | 158 | Precision: train 0.61, test 0.11. 159 | AUC: train 0.93, test 0.90. 160 | 161 | 162 | And that is exactly what we see: we get slightly higher precision@10 163 | (but the AUC metric is also improved). 164 | -------------------------------------------------------------------------------- /doc/examples/warp_loss.rst: -------------------------------------------------------------------------------- 1 | 2 | Learning-to-rank using the WARP loss 3 | ==================================== 4 | 5 | LightFM is probably the only recommender package implementing the WARP 6 | (Weighted Approximate-Rank Pairwise) loss for implicit feedback 7 | learning-to-rank. Generally, it perfoms better than the more popular BPR 8 | (Bayesian Personalised Ranking) loss --- often by a large margin. 9 | 10 | It was originally applied to image annotations in the Weston et al. 11 | `WSABIE 12 | paper `__, 13 | but has been extended to apply to recommendation settings in the `2013 14 | k-order statistic loss 15 | paper `__ in 16 | the form of the k-OS WARP loss, also implemented in LightFM. 17 | 18 | Like the BPR model, WARP deals with (user, positive item, negative item) 19 | triplets. Unlike BPR, the negative items in the triplet are not chosen 20 | by random sampling: they are chosen from among those negatie items which 21 | would violate the desired item ranking given the state of the model. 22 | This approximates a form of active learning where the model selects 23 | those triplets that it cannot currently rank correctly. 24 | 25 | This procedure yields roughly the following algorithm: 26 | 27 | 1. For a given (user, positive item pair), sample a negative item at 28 | random from all the remaining items. Compute predictions for both 29 | items; if the negative item's prediction exceeds that of the positive 30 | item plus a margin, perform a gradient update to rank the positive 31 | item higher and the negative item lower. If there is no rank 32 | violation, continue sampling negative items until a violation is 33 | found. 34 | 2. If you found a violating negative example at the first try, make a 35 | large gradient update: this indicates that a lot of negative items 36 | are ranked higher than positives items given the current state of the 37 | model, and the model must be updated by a large amount. If it took a 38 | lot of sampling to find a violating example, perform a small update: 39 | the model is likely close to the optimum and should be updated at a 40 | low rate. 41 | 42 | While this is fairly hand-wavy, it should give the correct intuition. 43 | For more details, read the paper itself or a more in-depth blog post 44 | `here `__. 45 | A similar approach for BPR is described in Rendle's 2014 `WSDM 2014 46 | paper `__. 47 | 48 | Having covered the theory, the rest of this example looks at the 49 | practical implications of using WARP in LightFM. 50 | 51 | Preliminaries 52 | ------------- 53 | 54 | Let's first get the data. We'll use the MovieLens 100K dataset. 55 | 56 | .. 
code:: python 57 | 58 | import time 59 | 60 | import numpy as np 61 | 62 | %matplotlib inline 63 | 64 | import matplotlib 65 | import numpy as np 66 | import matplotlib.pyplot as plt 67 | 68 | from lightfm import LightFM 69 | from lightfm.datasets import fetch_movielens 70 | from lightfm.evaluation import auc_score 71 | 72 | movielens = fetch_movielens() 73 | 74 | train, test = movielens['train'], movielens['test'] 75 | 76 | Accuracy 77 | -------- 78 | 79 | The first interesting experiment is to compare the accuracy between the 80 | WARP and BPR losses. Let's fit two models with equivalent 81 | hyperparameters and compare their accuracy across epochs. Whilst we're 82 | fitting them, let's also measure how much time each epoch takes. 83 | 84 | .. code:: python 85 | 86 | alpha = 1e-05 87 | epochs = 70 88 | num_components = 32 89 | 90 | warp_model = LightFM(no_components=num_components, 91 | loss='warp', 92 | learning_schedule='adagrad', 93 | max_sampled=100, 94 | user_alpha=alpha, 95 | item_alpha=alpha) 96 | 97 | bpr_model = LightFM(no_components=num_components, 98 | loss='bpr', 99 | learning_schedule='adagrad', 100 | user_alpha=alpha, 101 | item_alpha=alpha) 102 | 103 | warp_duration = [] 104 | bpr_duration = [] 105 | warp_auc = [] 106 | bpr_auc = [] 107 | 108 | for epoch in range(epochs): 109 | start = time.time() 110 | warp_model.fit_partial(train, epochs=1) 111 | warp_duration.append(time.time() - start) 112 | warp_auc.append(auc_score(warp_model, test, train_interactions=train).mean()) 113 | 114 | for epoch in range(epochs): 115 | start = time.time() 116 | bpr_model.fit_partial(train, epochs=1) 117 | bpr_duration.append(time.time() - start) 118 | bpr_auc.append(auc_score(bpr_model, test, train_interactions=train).mean()) 119 | 120 | Plotting the results immediately reveals that WARP produces superior 121 | results: a smarter way of selecting negative examples leads to higher 122 | quality rankings. Test accuracy decreases after the first 10 epochs, 123 | suggesting WARP starts overfitting and would benefit from regularization 124 | or early stopping. 125 | 126 | .. code:: python 127 | 128 | x = np.arange(epochs) 129 | plt.plot(x, np.array(warp_auc)) 130 | plt.plot(x, np.array(bpr_auc)) 131 | plt.legend(['WARP AUC', 'BPR AUC'], loc='upper right') 132 | plt.show() 133 | 134 | 135 | 136 | .. image:: warp_loss_files/warp_loss_5_0.png 137 | 138 | 139 | Fitting speed 140 | ------------- 141 | 142 | What about model fitting speed? 143 | 144 | .. code:: python 145 | 146 | x = np.arange(epochs) 147 | plt.plot(x, np.array(warp_duration)) 148 | plt.plot(x, np.array(bpr_duration)) 149 | plt.legend(['WARP duration', 'BPR duration'], loc='upper right') 150 | plt.show() 151 | 152 | 153 | 154 | .. image:: warp_loss_files/warp_loss_7_0.png 155 | 156 | 157 | WARP is slower than BPR for all epochs. Interestingly, however, it gets 158 | slower with additional epochs; every subsequent epoch takes more time. 159 | This is because of WARP's adaptive samling of negatives: the closer the 160 | model fits the training data, the more times it needs to sample in order 161 | to find rank-violating examples, leading to longer fitting times. 162 | 163 | For this reason, LightFM exposes the ``max_sampled`` hyperparameter that 164 | limits the number of attemps WARP will carry out to find a negative. 
165 | Setting it to a low value and repeating the run shows that the run time 166 | actually decreases with every epoch: this is because no updates happen 167 | when a violating example cannot be found in the specified number of 168 | attempts. 169 | 170 | .. code:: python 171 | 172 | warp_model = LightFM(no_components=num_components, 173 | max_sampled=3, 174 | loss='warp', 175 | learning_schedule='adagrad', 176 | user_alpha=alpha, 177 | item_alpha=alpha) 178 | 179 | warp_duration = [] 180 | warp_auc = [] 181 | 182 | for epoch in range(epochs): 183 | start = time.time() 184 | warp_model.fit_partial(train, epochs=1) 185 | warp_duration.append(time.time() - start) 186 | warp_auc.append(auc_score(warp_model, test, train_interactions=train).mean()) 187 | 188 | x = np.arange(epochs) 189 | plt.plot(x, np.array(warp_duration)) 190 | plt.legend(['WARP duration'], loc='upper right') 191 | plt.title('Duration') 192 | plt.show() 193 | 194 | x = np.arange(epochs) 195 | plt.plot(x, np.array(warp_auc)) 196 | plt.legend(['WARP AUC'], loc='upper right') 197 | plt.title('AUC') 198 | plt.show() 199 | 200 | 201 | 202 | .. image:: warp_loss_files/warp_loss_9_0.png 203 | 204 | 205 | 206 | .. image:: warp_loss_files/warp_loss_9_1.png 207 | -------------------------------------------------------------------------------- /doc/examples/warp_loss_files/warp_loss_5_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lyst/lightfm/0c9c31e027b976beab2385e268b58010fff46096/doc/examples/warp_loss_files/warp_loss_5_0.png -------------------------------------------------------------------------------- /doc/examples/warp_loss_files/warp_loss_7_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lyst/lightfm/0c9c31e027b976beab2385e268b58010fff46096/doc/examples/warp_loss_files/warp_loss_7_0.png -------------------------------------------------------------------------------- /doc/examples/warp_loss_files/warp_loss_9_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lyst/lightfm/0c9c31e027b976beab2385e268b58010fff46096/doc/examples/warp_loss_files/warp_loss_9_0.png -------------------------------------------------------------------------------- /doc/examples/warp_loss_files/warp_loss_9_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lyst/lightfm/0c9c31e027b976beab2385e268b58010fff46096/doc/examples/warp_loss_files/warp_loss_9_1.png -------------------------------------------------------------------------------- /doc/faq.rst: -------------------------------------------------------------------------------- 1 | === 2 | FAQ 3 | === 4 | 5 | Does LightFM have a GPU-based implementation? 6 | ============================================= 7 | No, there is no option to run training or inference on the GPU with LightFM. There are 8 | currently no plans to change this. 9 | See https://github.com/lyst/lightfm/issues/429 10 | 11 | What are the "learning to rank" and "hybrid" aspects of LightFM and how do they relate? 12 | ======================================================================================= 13 | *Learning to rank* and *hybrid* recommendation models are independent concepts. 14 | *Learning to rank* just means that you are optimizing a ranking loss such as `WARP` or 15 | `BPR`. 
*Hybrid* refers to the fact that you incorporate user or item meta-data as additional features. 16 | See: https://github.com/lyst/lightfm/issues/442 17 | 18 | Adding user/item features makes my model perform worse than without features, what can I do? 19 | ============================================================================================ 20 | That's not unusual and might have various reasons. For one, make sure you 21 | don't drop per-user/item features, see the notes in :doc:`LightFM`. If that 22 | doesn't help, your features might be simply uninformative and worsen the 23 | signal to noise ratio. You can experiment with different features and try 24 | discretization strategies for continuous features. More strategies and ideas 25 | can be found here: 26 | 27 | - https://github.com/lyst/lightfm/issues/551 28 | - https://github.com/lyst/lightfm/issues/486 29 | - https://github.com/lyst/lightfm/issues/176 30 | - https://github.com/lyst/lightfm/issues/430 31 | 32 | My model is recommending the same popular items to all users, what can I do? 33 | ============================================================================ 34 | You can try to set your item bias vectors to all zeros. Another strategy is 35 | to apply inverse propensity weights to your features. 36 | See these issues for more information: 37 | 38 | - https://github.com/lyst/lightfm/issues/395 39 | - https://github.com/lyst/lightfm/issues/176 40 | 41 | How can I re-train my model on partial data and/or new users (user cold-start)? 42 | =============================================================================== 43 | This depends a lot on your specific use case. Here are some helpful discussions: 44 | 45 | - https://github.com/lyst/lightfm/issues/194 46 | - https://github.com/lyst/lightfm/issues/347 47 | - https://github.com/lyst/lightfm/issues/210 48 | - https://github.com/lyst/lightfm/issues/371 49 | - https://stackoverflow.com/questions/46924119/lightfm-handling-user-and-item-cold-start 50 | -------------------------------------------------------------------------------- /doc/home.rst: -------------------------------------------------------------------------------- 1 | Welcome to LightFM's documentation! 2 | =================================== 3 | 4 | LightFM is a Python implementation of a number of popular recommendation algorithms for both implicit and explicit feedback. 5 | 6 | It also makes it possible to incorporate both item and user metadata into the traditional matrix factorization algorithms. It represents each user and item as the sum of the latent representations of their features, thus allowing recommendations to generalise to new items (via item features) and to new users (via user features). 7 | 8 | The details of the approach are described in the LightFM paper, available on `arXiv `_. 9 | 10 | Quickstart 11 | ---------- 12 | 13 | Jump straight to the :doc:`Movielens quickstart ` if you're impatient. 14 | 15 | 16 | Installation 17 | ------------ 18 | 19 | PyPI 20 | ~~~~ 21 | 22 | Install from pypi using pip: ``pip install lightfm``. Everything should work out-of-the box on Linux, OSX using Homebrew Python, and Windows using Miniconda. 23 | 24 | Note for OSX and Windows users: LightFM will by default not use OpenMP on OSX and Windows, and so all model fitting will be single-threaded. This is due to the fact that Clang (and Miniconda) does not support OpenMP, and installing an OpenMP-enabled version of gcc is complicated and labour-intensive. 
If you'd like to use the multi-threading capabilities of LightFM on these platforms, you should try using it via Docker as described in the next section. 25 | 26 | Building with the default Python distribution included in OSX is also not supported; please try the version from Homebrew or Anaconda. 27 | 28 | Using with Docker 29 | ~~~~~~~~~~~~~~~~~ 30 | 31 | On many systems it may be more convenient to try LightFM out in a Docker container. This repository provides a small Dockerfile sufficient to run LightFM and its examples. To run it: 32 | 33 | 1. `Install Docker `_ and start the docker deamon/virtual machine. 34 | 2. Clone this repository and navigate to it: ``git clone git@github.com:lyst/lightfm.git && cd lightfm``. 35 | 3. Run ``docker-compose build lightfm`` to build the container. 36 | 37 | The container should now be ready for use. You can then: 38 | 39 | 1. Run tests by running ``docker-compose run lightfm py.test -x lightfm/tests/`` 40 | 2. Run the movielens example by running ``docker-compose run --service-ports lightfm jupyter notebook lightfm/examples/movielens/example.ipynb --allow-root --ip="0.0.0.0" --port=8888 --no-browser``. The notebook will be accessible at port 8888 of your container's IP address. 41 | 42 | Usage 43 | ----- 44 | 45 | Model fitting is very straightforward using the main :doc:`LightFM class `. 46 | 47 | Create a model instance with the desired latent dimensionality:: 48 | 49 | from lightfm import LightFM 50 | 51 | model = LightFM(no_components=30) 52 | 53 | Assuming ``train`` is a (no_users, no_items) sparse matrix (with 1s denoting positive, and -1s negative interactions), you can fit a traditional matrix factorization model by calling:: 54 | 55 | model.fit(train, epochs=20) 56 | 57 | This will train a traditional MF model, as no user or item features have been supplied. 58 | 59 | To get predictions, call ``model.predict``:: 60 | 61 | predictions = model.predict(test_user_ids, test_item_ids) 62 | 63 | 64 | User and item features can be incorporated into training by passing them into the ``fit`` method. Assuming ``user_features`` is a (no_users, no_user_features) sparse matrix (and similarly for ``item_features``), you can call:: 65 | 66 | model.fit(train, 67 | user_features=user_features, 68 | item_features=item_features, 69 | epochs=20) 70 | predictions = model.predict(test_user_ids, 71 | test_item_ids, 72 | user_features=user_features, 73 | item_features=item_features) 74 | 75 | to train the model and obtain predictions. 76 | 77 | Both training and prediction can employ multiple cores for speed:: 78 | 79 | model.fit(train, epochs=20, num_threads=4) 80 | predictions = model.predict(test_user_ids, test_item_ids, num_threads=4) 81 | 82 | This implementation uses asynchronous stochastic gradient descent [6] for training. This can lead to lower accuracy when the interaction matrix (or the feature matrices) are very dense and a large number of threads is used. In practice, however, training on a sparse dataset with 20 threads does not lead to a measurable loss of accuracy. 83 | 84 | In an implicit feedback setting, the BPR, WARP, or k-OS WARP loss functions can be used. If ``train`` is a sparse matrix with positive entries representing positive interactions, the model can be trained as follows:: 85 | 86 | model = LightFM(no_components=30, loss='warp') 87 | model.fit(train, epochs=20) 88 | 89 | 90 | Examples 91 | -------- 92 | 93 | Check the ``examples`` directory for more examples. 
94 | 95 | The `Movielens example `_ shows how to use LightFM on the Movielens dataset, both with and without using movie metadata. `Another example `_ compares the performance of the adagrad and adadelta learning schedules. 96 | 97 | The `Kaggle coupon purchase prediction `_ example applies LightFM to predicting coupon purchases. 98 | 99 | Articles and tutorials on using LightFM 100 | --------------------------------------- 101 | 102 | 1. `Learning to Rank Sketchfab Models with LightFM `_ 103 | 2. `Metadata Embeddings for User and Item Cold-start Recommendations `_ 104 | 3. `Recommendation Systems - Learn Python for Data Science `_ 105 | 106 | 107 | How to cite 108 | ----------- 109 | 110 | Please cite LightFM if it helps your research. You can use the following BibTeX entry.:: 111 | 112 | @inproceedings{DBLP:conf/recsys/Kula15, 113 | author = {Maciej Kula}, 114 | editor = {Toine Bogers and 115 | Marijn Koolen}, 116 | title = {Metadata Embeddings for User and Item Cold-start Recommendations}, 117 | booktitle = {Proceedings of the 2nd Workshop on New Trends on Content-Based Recommender 118 | Systems co-located with 9th {ACM} Conference on Recommender Systems 119 | (RecSys 2015), Vienna, Austria, September 16-20, 2015.}, 120 | series = {{CEUR} Workshop Proceedings}, 121 | volume = {1448}, 122 | pages = {14--21}, 123 | publisher = {CEUR-WS.org}, 124 | year = {2015}, 125 | url = {http://ceur-ws.org/Vol-1448/paper4.pdf}, 126 | } 127 | 128 | 129 | Development 130 | ----------- 131 | 132 | Pull requests are welcome. To install for development: 133 | 134 | 1. Clone the repository: ``git clone git@github.com:lyst/lightfm.git`` 135 | 2. Install it for development using pip: ``cd lightfm && pip install -e .`` 136 | 3. You can run tests by running ``python setup.py test``. 137 | 138 | When making changes to the ``.pyx`` extension files, you'll need to run ``python setup.py cythonize`` in order to produce the extension ``.c`` files before running ``pip install -e .``. 139 | -------------------------------------------------------------------------------- /doc/index.rst: -------------------------------------------------------------------------------- 1 | .. include:: home.rst 2 | 3 | 4 | Contents 5 | ======== 6 | 7 | .. toctree:: 8 | :maxdepth: 2 9 | 10 | Home 11 | Quickstart 12 | The LightFM model class 13 | Model evaluation 14 | Cross validation 15 | Constructing datasets 16 | Built-in datasets 17 | Examples 18 | FAQ 19 | 20 | 21 | Indices and tables 22 | ================== 23 | 24 | * :ref:`genindex` 25 | * :ref:`modindex` 26 | * :ref:`search` 27 | -------------------------------------------------------------------------------- /doc/lightfm.data.rst: -------------------------------------------------------------------------------- 1 | Dataset construction 2 | ==================== 3 | 4 | .. automodule:: lightfm.data 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /doc/lightfm.evaluation.rst: -------------------------------------------------------------------------------- 1 | Model evaluation 2 | ========================= 3 | 4 | .. automodule:: lightfm.evaluation 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /doc/lightfm.rst: -------------------------------------------------------------------------------- 1 | LightFM 2 | =============== 3 | 4 | .. 
autoclass:: lightfm.LightFM 5 | :members: 6 | :undoc-members: 7 | -------------------------------------------------------------------------------- /doc/quickstart.rst: -------------------------------------------------------------------------------- 1 | 2 | Quickstart 3 | ========== 4 | 5 | In this example, we'll build an implicit feedback recommender using the 6 | Movielens 100k dataset (http://grouplens.org/datasets/movielens/100k/). 7 | 8 | The code behind this example is available as a `Jupyter 9 | notebook `__ 10 | 11 | LightFM includes functions for getting and processing this dataset, so 12 | obtaining it is quite easy. 13 | 14 | .. code:: python 15 | 16 | import numpy as np 17 | 18 | from lightfm.datasets import fetch_movielens 19 | 20 | data = fetch_movielens(min_rating=5.0) 21 | 22 | This downloads the dataset and automatically pre-processes it into 23 | sparse matrices suitable for further calculation. In particular, it 24 | prepares the sparse user-item matrices, containing positive entries 25 | where a user interacted with a product, and zeros otherwise. 26 | 27 | We have two such matrices, a training and a testing set. Both have 28 | around 1000 users and 1700 items. We'll train the model on the train 29 | matrix but test it on the test matrix. 30 | 31 | .. code:: python 32 | 33 | print(repr(data['train'])) 34 | print(repr(data['test'])) 35 | 36 | 37 | .. parsed-literal:: 38 | 39 | <943x1682 sparse matrix of type '' 40 | with 19048 stored elements in COOrdinate format> 41 | <943x1682 sparse matrix of type '' 42 | with 2153 stored elements in COOrdinate format> 43 | 44 | 45 | We need to import the model class to fit the model: 46 | 47 | .. code:: python 48 | 49 | from lightfm import LightFM 50 | 51 | We're going to use the WARP (Weighted Approximate-Rank Pairwise) model. 52 | WARP is an implicit feedback model: all interactions in the training 53 | matrix are treated as positive signals, and products that users did not 54 | interact with they implicitly do not like. The goal of the model is to 55 | score these implicit positives highly while assigining low scores to 56 | implicit negatives. 57 | 58 | Model training is accomplished via SGD (stochastic gradient descent). 59 | This means that for every pass through the data --- an epoch --- the 60 | model learns to fit the data more and more closely. We'll run it for 30 61 | epochs in this example. We can also run it on multiple cores, so we'll 62 | set that to 2. (The dataset in this example is too small for that to 63 | make a difference, but it will matter on bigger datasets.) 64 | 65 | .. code:: python 66 | 67 | model = LightFM(loss='warp') 68 | %time model.fit(data['train'], epochs=30, num_threads=2) 69 | 70 | 71 | .. parsed-literal:: 72 | 73 | CPU times: user 1.55 s, sys: 4 ms, total: 1.56 s 74 | Wall time: 838 ms 75 | 76 | 77 | 78 | 79 | .. parsed-literal:: 80 | 81 | 82 | 83 | 84 | 85 | Done! We should now evaluate the model to see how well it's doing. We're 86 | most interested in how good the ranking produced by the model is. 87 | Precision@k is one suitable metric, expressing the percentage of top k 88 | items in the ranking the user has actually interacted with. ``lightfm`` 89 | implements a number of metrics in the ``evaluation`` module. 90 | 91 | .. code:: python 92 | 93 | from lightfm.evaluation import precision_at_k 94 | 95 | We'll measure precision in both the train and the test set. 96 | 97 | .. 
code:: python 98 | 99 | print("Train precision: %.2f" % precision_at_k(model, data['train'], k=5).mean()) 100 | print("Test precision: %.2f" % precision_at_k(model, data['test'], k=5).mean()) 101 | 102 | 103 | .. parsed-literal:: 104 | 105 | Train precision: 0.43 106 | Test precision: 0.04 107 | 108 | 109 | Unsurprisingly, the model fits the train set better than the test set. 110 | 111 | For an alternative way of judging the model, we can sample a couple of 112 | users and get their recommendations. To make predictions for given user, 113 | we pass the id of that user and the ids of all products we want 114 | predictions for into the ``predict`` method. 115 | 116 | .. code:: python 117 | 118 | def sample_recommendation(model, data, user_ids): 119 | 120 | n_users, n_items = data['train'].shape 121 | 122 | for user_id in user_ids: 123 | known_positives = data['item_labels'][data['train'].tocsr()[user_id].indices] 124 | 125 | scores = model.predict(user_id, np.arange(n_items)) 126 | top_items = data['item_labels'][np.argsort(-scores)] 127 | 128 | print("User %s" % user_id) 129 | print(" Known positives:") 130 | 131 | for x in known_positives[:3]: 132 | print(" %s" % x) 133 | 134 | print(" Recommended:") 135 | 136 | for x in top_items[:3]: 137 | print(" %s" % x) 138 | 139 | sample_recommendation(model, data, [3, 25, 450]) 140 | 141 | 142 | .. parsed-literal:: 143 | 144 | User 3 145 | Known positives: 146 | Contact (1997) 147 | Air Force One (1997) 148 | In & Out (1997) 149 | Recommended: 150 | Air Force One (1997) 151 | Assignment, The (1997) 152 | Kiss the Girls (1997) 153 | User 25 154 | Known positives: 155 | Fargo (1996) 156 | Godfather, The (1972) 157 | L.A. Confidential (1997) 158 | Recommended: 159 | L.A. Confidential (1997) 160 | Titanic (1997) 161 | Fargo (1996) 162 | User 450 163 | Known positives: 164 | Event Horizon (1997) 165 | Scream (1996) 166 | Conspiracy Theory (1997) 167 | Recommended: 168 | Independence Day (ID4) (1996) 169 | Scream (1996) 170 | Ransom (1996) 171 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | services: 2 | lightfm: 3 | build: . 4 | # Uncomment this to mount your local version 5 | # of the LightFM code. 6 | # volumes: 7 | # - .:/home/lightfm/ 8 | ports: 9 | - "8888:8888" 10 | -------------------------------------------------------------------------------- /docs-requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx>=4.0 2 | sphinx_rtd_theme>=1.0 -------------------------------------------------------------------------------- /examples/dataset/Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: html 2 | html: 3 | pweave -f pandoc2html dataset.pmd 4 | 5 | .PHONY: rst 6 | rst: 7 | pweave -f markdown dataset.pmd 8 | pandoc -s -t rst dataset.md -o dataset.rst 9 | -------------------------------------------------------------------------------- /examples/dataset/dataset.pmd: -------------------------------------------------------------------------------- 1 | # Building datasets 2 | 3 | In this example, we'll use LightFM's built-in `Dataset` class to build an interaction dataset from raw data. The goal is to demonstrate how to go from raw data (lists of interactions and perhaps item and user features) to `scipy.sparse` matrices that can be used to fit a LightFM model. 
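Before diving in, here is a condensed sketch of the whole workflow on made-up toy IDs, purely to show the shape of the API. Every call is walked through step by step on the real data below; the `interactions_raw` and `features_raw` names are invented for this preview only.

```python
from lightfm import LightFM
from lightfm.data import Dataset

# Toy raw data: (user id, item id) pairs and (item id, [item features]) pairs.
interactions_raw = [('u1', 'i1'), ('u1', 'i2'), ('u2', 'i2')]
features_raw = [('i1', ['author_a']), ('i2', ['author_b'])]

# 1. Build the internal ID mappings.
dataset = Dataset()
dataset.fit(
    users=(user for user, _ in interactions_raw),
    items=(item for _, item in interactions_raw),
    item_features=(feat for _, feats in features_raw for feat in feats),
)

# 2. Build the sparse interaction and item feature matrices.
(interactions, weights) = dataset.build_interactions(interactions_raw)
item_features = dataset.build_item_features(features_raw)

# 3. Fit a LightFM model on them.
model = LightFM(loss='bpr')
model.fit(interactions, item_features=item_features)
```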
4 | 5 | ## Getting the data 6 | We're going to use a sample from [Goodbooks-10k](https://github.com/zygmuntz/goodbooks-10k) as our example dataset. Let's download the data first. 7 | 8 | ```{python, source="download.py", echo=True} 9 | ``` 10 | 11 | The data consists of book ratings and book details: 12 | ```python 13 | import json 14 | from itertools import islice 15 | 16 | ratings, book_features = get_data() 17 | ``` 18 | 19 | Ratings look like this: 20 | ```python 21 | for line in islice(ratings, 2): 22 | print(json.dumps(line, indent=4)) 23 | ``` 24 | and book features look like this: 25 | ```python 26 | for line in islice(book_features, 1): 27 | print(json.dumps(line, indent=4)) 28 | ``` 29 | 30 | ## Building the ID mappings 31 | The first thing we need to do is to create a mapping between the user and item ids from our input data to indices that will be used internally by our model. 32 | 33 | We do this because LightFM works with user and item ids that are consecutive non-negative integers. The `Dataset` class allow us to create a mapping between the IDs we use in our systems and the consecutive indices preferred by the model. 34 | 35 | To do this, we create a dataset and call its `fit` method. The first argument is an iterable of all user ids in our data, and the second is an iterable of all item ids. In this case, we use generator expressions to lazily iterate over our data and yield user and item ids: 36 | ```python 37 | from lightfm.data import Dataset 38 | 39 | dataset = Dataset() 40 | dataset.fit((x['User-ID'] for x in get_ratings()), 41 | (x['ISBN'] for x in get_ratings())) 42 | ``` 43 | 44 | This call will assign an internal numerical id to every user and item id we pass in. These will be contiguous (from 0 to however many users and items we have), and will also determine the dimensions of the resulting LightFM model. 45 | 46 | We can check that the mappings have been created by querying the dataset on how many users and books it knows about: 47 | ```python 48 | num_users, num_items = dataset.interactions_shape() 49 | print('Num users: {}, num_items {}.'.format(num_users, num_items)) 50 | ``` 51 | 52 | Note that if we don't have all user and items ids at once, we can repeatedly call `fit_partial` to supply additional ids. In this case, we will use this capability to add some item feature mappings: 53 | ```python 54 | dataset.fit_partial(items=(x['ISBN'] for x in get_book_features()), 55 | item_features=(x['Book-Author'] for x in get_book_features())) 56 | ``` 57 | This will create a feature for every unique author name in the dataset. 58 | 59 | (Note that we fit some more item ids: this is to make sure our mappings are complete even if there are items in the features dataset that are not in the interactions set.) 60 | 61 | ## Building the interactions matrix 62 | Having created the mapping, we build the interaction matrix: 63 | ```python 64 | (interactions, weights) = dataset.build_interactions(((x['User-ID'], x['ISBN']) 65 | for x in get_ratings())) 66 | 67 | print(repr(interactions)) 68 | ``` 69 | 70 | This is main input into a LightFM model: it encodes the interactions betwee users and items. 
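The second element of the tuple, `weights`, is a sparse matrix with the same shape as `interactions`. Because we only supplied `(user id, item id)` pairs above, it simply contains ones. As a sketch that goes beyond this example, `build_interactions` also accepts `(user id, item id, weight)` triples, so the raw `Book-Rating` values could be used as per-interaction weights and passed to the model via `sample_weight`:

```python
from lightfm import LightFM

# Sketch only: weight each interaction by its raw rating instead of 1.0.
(interactions, weights) = dataset.build_interactions(
    (x['User-ID'], x['ISBN'], float(x['Book-Rating'])) for x in get_ratings()
)

model = LightFM(loss='bpr')
model.fit(interactions, sample_weight=weights)
```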
71 | 72 | Since we have item features, we can also create the item features matrix: 73 | ```python 74 | item_features = dataset.build_item_features(((x['ISBN'], [x['Book-Author']]) 75 | for x in get_book_features())) 76 | print(repr(item_features)) 77 | ``` 78 | 79 | ## Building a model 80 | This is all we need to build a LightFM model: 81 | ```python 82 | from lightfm import LightFM 83 | 84 | model = LightFM(loss='bpr') 85 | model.fit(interactions, item_features=item_features) 86 | ``` 87 | -------------------------------------------------------------------------------- /examples/dataset/download.py: -------------------------------------------------------------------------------- 1 | import os 2 | import zipfile 3 | import csv 4 | 5 | import requests 6 | 7 | 8 | def _download(url: str, dest_path: str): 9 | 10 | req = requests.get(url, stream=True) 11 | req.raise_for_status() 12 | 13 | with open(dest_path, "wb") as fd: 14 | for chunk in req.iter_content(chunk_size=2**20): 15 | fd.write(chunk) 16 | 17 | 18 | def get_data(): 19 | 20 | ratings_url = ( 21 | "http://www2.informatik.uni-freiburg.de/" "~cziegler/BX/BX-CSV-Dump.zip" 22 | ) 23 | 24 | if not os.path.exists("data"): 25 | os.makedirs("data") 26 | 27 | _download(ratings_url, "data/data.zip") 28 | 29 | with zipfile.ZipFile("data/data.zip") as archive: 30 | return ( 31 | csv.DictReader( 32 | ( 33 | x.decode("utf-8", "ignore") 34 | for x in archive.open("BX-Book-Ratings.csv") 35 | ), 36 | delimiter=";", 37 | ), 38 | csv.DictReader( 39 | (x.decode("utf-8", "ignore") for x in archive.open("BX-Books.csv")), 40 | delimiter=";", 41 | ), 42 | ) 43 | 44 | 45 | def get_ratings(): 46 | 47 | return get_data()[0] 48 | 49 | 50 | def get_book_features(): 51 | 52 | return get_data()[1] 53 | -------------------------------------------------------------------------------- /examples/dataset/readme.rst: -------------------------------------------------------------------------------- 1 | Building datasets 2 | ================= 3 | 4 | In this example, we'll use LightFM's built-in ``Dataset`` class to build 5 | an interaction dataset from raw data. The goal is to demonstrate how to 6 | go from raw data (lists of interactions and perhaps item and user 7 | features) to ``scipy.sparse`` matrices that can be used to fit a LightFM 8 | model. 9 | 10 | Getting the data 11 | ---------------- 12 | 13 | We're going to use a sample from 14 | `Goodbooks-10k `__ as our 15 | example dataset. Let's download the data first. 16 | 17 | .. 
code:: python 18 | 19 | import os 20 | import zipfile 21 | import csv 22 | 23 | import requests 24 | 25 | 26 | def _download(url: str, dest_path: str): 27 | 28 | req = requests.get(url, stream=True) 29 | req.raise_for_status() 30 | 31 | with open(dest_path, "wb") as fd: 32 | for chunk in req.iter_content(chunk_size=2 ** 20): 33 | fd.write(chunk) 34 | 35 | 36 | def get_data(): 37 | 38 | ratings_url = ("http://www2.informatik.uni-freiburg.de/" "~cziegler/BX/BX-CSV-Dump.zip") 39 | 40 | if not os.path.exists("data"): 41 | os.makedirs("data") 42 | 43 | _download(ratings_url, "data/data.zip") 44 | 45 | with zipfile.ZipFile("data/data.zip") as archive: 46 | return ( 47 | csv.DictReader( 48 | (x.decode("utf-8", "ignore") for x in archive.open("BX-Book-Ratings.csv")), 49 | delimiter=";", 50 | ), 51 | csv.DictReader( 52 | (x.decode("utf-8", "ignore") for x in archive.open("BX-Books.csv")), delimiter=";" 53 | ), 54 | ) 55 | 56 | 57 | def get_ratings(): 58 | 59 | return get_data()[0] 60 | 61 | 62 | def get_book_features(): 63 | 64 | return get_data()[1] 65 | 66 | The data consists of book ratings and book details: 67 | 68 | .. code:: python 69 | 70 | import json 71 | from itertools import islice 72 | 73 | ratings, book_features = get_data() 74 | 75 | Ratings look like this: 76 | 77 | .. code:: python 78 | 79 | for line in islice(ratings, 2): 80 | print(json.dumps(line, indent=4)) 81 | 82 | :: 83 | 84 | { 85 | "User-ID": "276725", 86 | "ISBN": "034545104X", 87 | "Book-Rating": "0" 88 | } 89 | { 90 | "User-ID": "276726", 91 | "ISBN": "0155061224", 92 | "Book-Rating": "5" 93 | } 94 | 95 | and book features look like this: 96 | 97 | .. code:: python 98 | 99 | for line in islice(book_features, 1): 100 | print(json.dumps(line, indent=4)) 101 | 102 | :: 103 | 104 | { 105 | "ISBN": "0195153448", 106 | "Book-Title": "Classical Mythology", 107 | "Book-Author": "Mark P. O. Morford", 108 | "Year-Of-Publication": "2002", 109 | "Publisher": "Oxford University Press", 110 | "Image-URL-S": 111 | "http://images.amazon.com/images/P/0195153448.01.THUMBZZZ.jpg", 112 | "Image-URL-M": 113 | "http://images.amazon.com/images/P/0195153448.01.MZZZZZZZ.jpg", 114 | "Image-URL-L": 115 | "http://images.amazon.com/images/P/0195153448.01.LZZZZZZZ.jpg" 116 | } 117 | 118 | Building the ID mappings 119 | ------------------------ 120 | 121 | The first thing we need to do is to create a mapping between the user 122 | and item ids from our input data to indices that will be used internally 123 | by our model. 124 | 125 | We do this because LightFM works with user and item ids that are 126 | consecutive non-negative integers. The ``Dataset`` class allow us to 127 | create a mapping between the IDs we use in our systems and the 128 | consecutive indices preferred by the model. 129 | 130 | To do this, we create a dataset and call its ``fit`` method. The first 131 | argument is an iterable of all user ids in our data, and the second is 132 | an iterable of all item ids. In this case, we use generator expressions 133 | to lazily iterate over our data and yield user and item ids: 134 | 135 | .. code:: python 136 | 137 | from lightfm.data import Dataset 138 | 139 | dataset = Dataset() 140 | dataset.fit((x['User-ID'] for x in get_ratings()), 141 | (x['ISBN'] for x in get_ratings())) 142 | 143 | This call will assign an internal numerical id to every user and item id 144 | we pass in. These will be contiguous (from 0 to however many users and 145 | items we have), and will also determine the dimensions of the resulting 146 | LightFM model. 
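If we ever need to translate between the raw IDs and these internal indices
(for instance, to interpret predictions later on), the learned mappings can be
retrieved with the ``mapping`` method. This is not needed for the rest of the
example; it is just a quick sketch of how to inspect them:

.. code:: python

    user_id_map, user_feature_map, item_id_map, item_feature_map = dataset.mapping()

    # Internal index assigned to one of the raw user IDs passed to fit().
    print(user_id_map['276725'])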
147 | 148 | We can check that the mappings have been created by querying the dataset 149 | on how many users and books it knows about: 150 | 151 | .. code:: python 152 | 153 | num_users, num_items = dataset.interactions_shape() 154 | print('Num users: {}, num_items {}.'.format(num_users, num_items)) 155 | 156 | :: 157 | 158 | Num users: 105283, num_items 340553. 159 | 160 | Note that if we don't have all user and items ids at once, we can 161 | repeatedly call ``fit_partial`` to supply additional ids. In this case, 162 | we will use this capability to add some item feature mappings: 163 | 164 | .. code:: python 165 | 166 | dataset.fit_partial(items=(x['ISBN'] for x in get_book_features()), 167 | item_features=(x['Book-Author'] for x in get_book_features())) 168 | 169 | This will create a feature for every unique author name in the dataset. 170 | 171 | (Note that we fit some more item ids: this is to make sure our mappings 172 | are complete even if there are items in the features dataset that are 173 | not in the interactions set.) 174 | 175 | Building the interactions matrix 176 | -------------------------------- 177 | 178 | Having created the mapping, we build the interaction matrix: 179 | 180 | .. code:: python 181 | 182 | (interactions, weights) = dataset.build_interactions(((x['User-ID'], x['ISBN']) 183 | for x in get_ratings())) 184 | 185 | print(repr(interactions)) 186 | 187 | :: 188 | 189 | <105283x341762 sparse matrix of type '' 190 | with 1149780 stored elements in COOrdinate format> 191 | 192 | This is main input into a LightFM model: it encodes the interactions 193 | between users and items. 194 | 195 | Since we have item features, we can also create the item features 196 | matrix: 197 | 198 | .. code:: python 199 | 200 | item_features = dataset.build_item_features(((x['ISBN'], [x['Book-Author']]) 201 | for x in get_book_features())) 202 | print(repr(item_features)) 203 | 204 | :: 205 | 206 | <341762x443805 sparse matrix of type '' 207 | with 613141 stored elements in Compressed Sparse Row format> 208 | 209 | Building a model 210 | ---------------- 211 | 212 | This is all we need to build a LightFM model: 213 | 214 | .. code:: python 215 | 216 | from lightfm import LightFM 217 | 218 | model = LightFM(loss='bpr') 219 | model.fit(interactions, item_features=item_features) 220 | 221 | :: 222 | 223 | 224 | -------------------------------------------------------------------------------- /examples/movielens/data.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import os 3 | import zipfile 4 | 5 | import numpy as np 6 | 7 | import requests 8 | 9 | import scipy.sparse as sp 10 | 11 | 12 | def _get_movielens_path(): 13 | """ 14 | Get path to the movielens dataset file. 15 | """ 16 | 17 | return os.path.join(os.path.dirname(os.path.abspath(__file__)), "movielens.zip") 18 | 19 | 20 | def _download_movielens(dest_path): 21 | """ 22 | Download the dataset. 23 | """ 24 | 25 | url = "http://files.grouplens.org/datasets/movielens/ml-100k.zip" 26 | req = requests.get(url, stream=True) 27 | 28 | with open(dest_path, "wb") as fd: 29 | for chunk in req.iter_content(): 30 | fd.write(chunk) 31 | 32 | 33 | def _get_raw_movielens_data(): 34 | """ 35 | Return the raw lines of the train and test files. 
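
    The result is a (train_lines, test_lines) tuple; each element is a list of
    tab-separated "uid iid rating timestamp" strings read from ml-100k/ua.base
    and ml-100k/ua.test respectively.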
36 | """ 37 | 38 | path = _get_movielens_path() 39 | 40 | if not os.path.isfile(path): 41 | _download_movielens(path) 42 | 43 | with zipfile.ZipFile(path) as datafile: 44 | return ( 45 | datafile.read("ml-100k/ua.base").decode().split("\n"), 46 | datafile.read("ml-100k/ua.test").decode().split("\n"), 47 | ) 48 | 49 | 50 | def _parse(data): 51 | """ 52 | Parse movielens dataset lines. 53 | """ 54 | 55 | for line in data: 56 | 57 | if not line: 58 | continue 59 | 60 | uid, iid, rating, timestamp = [int(x) for x in line.split("\t")] 61 | 62 | yield uid, iid, rating, timestamp 63 | 64 | 65 | def _build_interaction_matrix(rows, cols, data): 66 | """ 67 | Build the training matrix (no_users, no_items), 68 | with ratings >= 4.0 being marked as positive and 69 | the rest as negative. 70 | """ 71 | 72 | mat = sp.lil_matrix((rows, cols), dtype=np.int32) 73 | 74 | for uid, iid, rating, timestamp in data: 75 | if rating >= 4.0: 76 | mat[uid, iid] = 1.0 77 | else: 78 | mat[uid, iid] = -1.0 79 | 80 | return mat.tocoo() 81 | 82 | 83 | def _get_movie_raw_metadata(): 84 | """ 85 | Get raw lines of the genre file. 86 | """ 87 | 88 | path = _get_movielens_path() 89 | 90 | if not os.path.isfile(path): 91 | _download_movielens(path) 92 | 93 | with zipfile.ZipFile(path) as datafile: 94 | return datafile.read("ml-100k/u.item").decode(errors="ignore").split("\n") 95 | 96 | 97 | def get_movielens_item_metadata(use_item_ids): 98 | """ 99 | Build a matrix of genre features (no_items, no_features). 100 | 101 | If use_item_ids is True, per-item features will also be used. 102 | """ 103 | 104 | features = {} 105 | genre_set = set() 106 | 107 | for line in _get_movie_raw_metadata(): 108 | 109 | if not line: 110 | continue 111 | 112 | splt = line.split("|") 113 | item_id = int(splt[0]) 114 | 115 | genres = [ 116 | idx for idx, val in zip(range(len(splt[5:])), splt[5:]) if int(val) > 0 117 | ] 118 | 119 | if use_item_ids: 120 | # Add item-specific features too 121 | genres.append(item_id) 122 | 123 | for genre_id in genres: 124 | genre_set.add(genre_id) 125 | 126 | features[item_id] = genres 127 | 128 | mat = sp.lil_matrix((len(features) + 1, len(genre_set)), dtype=np.int32) 129 | 130 | for item_id, genre_ids in features.items(): 131 | for genre_id in genre_ids: 132 | mat[item_id, genre_id] = 1 133 | 134 | return mat 135 | 136 | 137 | def get_movielens_data(): 138 | """ 139 | Return (train_interactions, test_interactions). 
140 | """ 141 | 142 | train_data, test_data = _get_raw_movielens_data() 143 | 144 | uids = set() 145 | iids = set() 146 | 147 | for uid, iid, rating, timestamp in itertools.chain( 148 | _parse(train_data), _parse(test_data) 149 | ): 150 | uids.add(uid) 151 | iids.add(iid) 152 | 153 | rows = max(uids) + 1 154 | cols = max(iids) + 1 155 | 156 | return ( 157 | _build_interaction_matrix(rows, cols, _parse(train_data)), 158 | _build_interaction_matrix(rows, cols, _parse(test_data)), 159 | ) 160 | -------------------------------------------------------------------------------- /examples/movielens/example.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# An implicit feedback recommender for the Movielens dataset" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": { 13 | "collapsed": true 14 | }, 15 | "source": [ 16 | "## Implicit feedback\n", 17 | "For some time, the recommender system literature focused on explicit feedback: the Netflix prize focused on accurately reproducing the ratings users have given to movies they watched.\n", 18 | "\n", 19 | "Focusing on ratings in this way ignored the importance of taking into account which movies the users chose to watch in the first place, and treating the absence of ratings as absence of information.\n", 20 | "\n", 21 | "But the things that we don't have ratings for aren't unknowns: we know the user didn't pick them. This reflects a user's conscious choice, and is a good source of information on what she thinks she might like. \n", 22 | "\n", 23 | "This sort of phenomenon is described as data which is missing-not-at-random in the literature: the ratings that are missing are more likely to be negative precisely because the user chooses which items to rate. When choosing a restaurant, you only go to places which you think you'll enjoy, and never go to places that you think you'll hate. What this leads to is that you're only going to be submitting ratings for things which, a priori, you expected to like; the things that you expect you will not like you will never rate.\n", 24 | "\n", 25 | "This observation has led to the development of models that are suitable for implicit feedback. LightFM implements two that have proven particular successful:\n", 26 | "\n", 27 | "- BPR: Bayesian Personalised Ranking [1] pairwise loss. Maximises the prediction difference between a positive example and a randomly chosen negative example. Useful when only positive interactions are present and optimising ROC AUC is desired.\n", 28 | "- WARP: Weighted Approximate-Rank Pairwise [2] loss. Maximises the rank of positive examples by repeatedly sampling negative examples until rank violating one is found. Useful when only positive interactions are present and optimising the top of the recommendation list (precision@k) is desired.\n", 29 | "\n", 30 | "This example shows how to estimate these models on the Movielens dataset.\n", 31 | "\n", 32 | "[1] Rendle, Steffen, et al. \"BPR: Bayesian personalized ranking from implicit feedback.\" Proceedings of the Twenty-Fifth Conference on Uncertainty in Artificial Intelligence. AUAI Press, 2009.\n", 33 | "\n", 34 | "[2] Weston, Jason, Samy Bengio, and Nicolas Usunier. \"Wsabie: Scaling up to large vocabulary image annotation.\" IJCAI. Vol. 11. 
2011.\n", 35 | "\n", 36 | "\n", 37 | "## Getting the data\n", 38 | "The first step is to get the [Movielens data](http://grouplens.org/datasets/movielens/100k/). This is a classic small recommender dataset, consisting of around 950 users, 1700 movies, and 100,000 ratings. The ratings are on a scale from 1 to 5, but we'll all treat them as implicit positive feedback in this example.\n", 39 | "\n", 40 | "\n", 41 | "Fortunately, this is one of the functions provided by LightFM itself." 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 1, 47 | "metadata": { 48 | "collapsed": true 49 | }, 50 | "outputs": [], 51 | "source": [ 52 | "import numpy as np\n", 53 | "\n", 54 | "from lightfm.datasets import fetch_movielens\n", 55 | "\n", 56 | "movielens = fetch_movielens()" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "This gives us a dictionary with the following fields:" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 2, 69 | "metadata": { 70 | "collapsed": false 71 | }, 72 | "outputs": [ 73 | { 74 | "name": "stdout", 75 | "output_type": "stream", 76 | "text": [ 77 | "('test', , (943, 1682))\n", 78 | "('item_features', , (1682, 1682))\n", 79 | "('train', , (943, 1682))\n", 80 | "('item_labels', , (1682,))\n", 81 | "('item_feature_labels', , (1682,))\n" 82 | ] 83 | } 84 | ], 85 | "source": [ 86 | "for key, value in movielens.items():\n", 87 | " print(key, type(value), value.shape)" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 3, 93 | "metadata": { 94 | "collapsed": true 95 | }, 96 | "outputs": [], 97 | "source": [ 98 | "train = movielens['train']\n", 99 | "test = movielens['test']" 100 | ] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "metadata": {}, 105 | "source": [ 106 | "The `train` and `test` elements are the most important: they contain the raw rating data, split into a train and a test set. Each row represents a user, and each column an item. Entries are ratings from 1 to 5." 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": {}, 112 | "source": [ 113 | "## Fitting models\n", 114 | "\n", 115 | "Now let's train a BPR model and look at its accuracy.\n", 116 | "\n", 117 | "We'll use two metrics of accuracy: precision@k and ROC AUC. Both are ranking metrics: to compute them, we'll be constructing recommendation lists for all of our users, and checking the ranking of known positive movies. For precision at k we'll be looking at whether they are within the first k results on the list; for AUC, we'll be calculating the probability that any known positive is higher on the list than a random negative example." 
118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 4, 123 | "metadata": { 124 | "collapsed": false 125 | }, 126 | "outputs": [ 127 | { 128 | "name": "stdout", 129 | "output_type": "stream", 130 | "text": [ 131 | "Precision: train 0.59, test 0.10.\n", 132 | "AUC: train 0.90, test 0.86.\n" 133 | ] 134 | } 135 | ], 136 | "source": [ 137 | "from lightfm import LightFM\n", 138 | "from lightfm.evaluation import precision_at_k\n", 139 | "from lightfm.evaluation import auc_score\n", 140 | "\n", 141 | "model = LightFM(learning_rate=0.05, loss='bpr')\n", 142 | "model.fit(train, epochs=10)\n", 143 | "\n", 144 | "train_precision = precision_at_k(model, train, k=10).mean()\n", 145 | "test_precision = precision_at_k(model, test, k=10, train_interactions=train).mean()\n", 146 | "\n", 147 | "train_auc = auc_score(model, train).mean()\n", 148 | "test_auc = auc_score(model, test, train_interactions=train).mean()\n", 149 | "\n", 150 | "print('Precision: train %.2f, test %.2f.' % (train_precision, test_precision))\n", 151 | "print('AUC: train %.2f, test %.2f.' % (train_auc, test_auc))" 152 | ] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "metadata": {}, 157 | "source": [ 158 | "The WARP model, on the other hand, optimises for precision@k---we should expect its performance to be better on precision." 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": 5, 164 | "metadata": { 165 | "collapsed": false 166 | }, 167 | "outputs": [ 168 | { 169 | "name": "stdout", 170 | "output_type": "stream", 171 | "text": [ 172 | "Precision: train 0.61, test 0.11.\n", 173 | "AUC: train 0.93, test 0.90.\n" 174 | ] 175 | } 176 | ], 177 | "source": [ 178 | "model = LightFM(learning_rate=0.05, loss='warp')\n", 179 | "\n", 180 | "model.fit_partial(train, epochs=10)\n", 181 | "\n", 182 | "train_precision = precision_at_k(model, train, k=10).mean()\n", 183 | "test_precision = precision_at_k(model, test, k=10, train_interactions=train).mean()\n", 184 | "\n", 185 | "train_auc = auc_score(model, train).mean()\n", 186 | "test_auc = auc_score(model, test, train_interactions=train).mean()\n", 187 | "\n", 188 | "print('Precision: train %.2f, test %.2f.' % (train_precision, test_precision))\n", 189 | "print('AUC: train %.2f, test %.2f.' % (train_auc, test_auc))" 190 | ] 191 | }, 192 | { 193 | "cell_type": "markdown", 194 | "metadata": {}, 195 | "source": [ 196 | "And that is exactly what we see: we get slightly higher precision@10 (but the AUC metric is also improved)." 
197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": null, 202 | "metadata": { 203 | "collapsed": true 204 | }, 205 | "outputs": [], 206 | "source": [] 207 | } 208 | ], 209 | "metadata": { 210 | "kernelspec": { 211 | "display_name": "Python 2", 212 | "language": "python", 213 | "name": "python2" 214 | }, 215 | "language_info": { 216 | "codemirror_mode": { 217 | "name": "ipython", 218 | "version": 2 219 | }, 220 | "file_extension": ".py", 221 | "mimetype": "text/x-python", 222 | "name": "python", 223 | "nbconvert_exporter": "python", 224 | "pygments_lexer": "ipython2", 225 | "version": "2.7.12" 226 | }, 227 | "widgets": { 228 | "state": {}, 229 | "version": "1.1.2" 230 | } 231 | }, 232 | "nbformat": 4, 233 | "nbformat_minor": 0 234 | } 235 | -------------------------------------------------------------------------------- /examples/movielens/readme.md: -------------------------------------------------------------------------------- 1 | 2 | # An implicit feedback recommender for the Movielens dataset 3 | 4 | ## Implicit feedback 5 | For some time, the recommender system literature focused on explicit feedback: the Netflix prize focused on accurately reproducing the ratings users have given to movies they watched. 6 | 7 | Focusing on ratings in this way ignored the importance of taking into account which movies the users chose to watch in the first place, and treating the absence of ratings as absence of information. 8 | 9 | But the things that we don't have ratings for aren't unknowns: we know the user didn't pick them. This reflects a user's conscious choice, and is a good source of information on what she thinks she might like. 10 | 11 | This sort of phenomenon is described as data which is missing-not-at-random in the literature: the ratings that are missing are more likely to be negative precisely because the user chooses which items to rate. When choosing a restaurant, you only go to places which you think you'll enjoy, and never go to places that you think you'll hate. What this leads to is that you're only going to be submitting ratings for things which, a priori, you expected to like; the things that you expect you will not like you will never rate. 12 | 13 | This observation has led to the development of models that are suitable for implicit feedback. LightFM implements two that have proven particular successful: 14 | 15 | - BPR: Bayesian Personalised Ranking [1] pairwise loss. Maximises the prediction difference between a positive example and a randomly chosen negative example. Useful when only positive interactions are present and optimising ROC AUC is desired. 16 | - WARP: Weighted Approximate-Rank Pairwise [2] loss. Maximises the rank of positive examples by repeatedly sampling negative examples until rank violating one is found. Useful when only positive interactions are present and optimising the top of the recommendation list (precision@k) is desired. 17 | 18 | This example shows how to estimate these models on the Movielens dataset. 19 | 20 | [1] Rendle, Steffen, et al. "BPR: Bayesian personalized ranking from implicit feedback." Proceedings of the Twenty-Fifth Conference on Uncertainty in Artificial Intelligence. AUAI Press, 2009. 21 | 22 | [2] Weston, Jason, Samy Bengio, and Nicolas Usunier. "Wsabie: Scaling up to large vocabulary image annotation." IJCAI. Vol. 11. 2011. 23 | 24 | 25 | ## Getting the data 26 | The first step is to get the [Movielens data](http://grouplens.org/datasets/movielens/100k/). 
This is a classic small recommender dataset, consisting of around 950 users, 1700 movies, and 100,000 ratings. The ratings are on a scale from 1 to 5, but we'll all treat them as implicit positive feedback in this example. 27 | 28 | 29 | Fortunately, this is one of the functions provided by LightFM itself. 30 | 31 | 32 | ```python 33 | import numpy as np 34 | 35 | from lightfm.datasets import fetch_movielens 36 | 37 | movielens = fetch_movielens() 38 | ``` 39 | 40 | This gives us a dictionary with the following fields: 41 | 42 | 43 | ```python 44 | for key, value in movielens.items(): 45 | print(key, type(value), value.shape) 46 | ``` 47 | 48 | ('test', , (943, 1682)) 49 | ('item_features', , (1682, 1682)) 50 | ('train', , (943, 1682)) 51 | ('item_labels', , (1682,)) 52 | ('item_feature_labels', , (1682,)) 53 | 54 | 55 | 56 | ```python 57 | train = movielens['train'] 58 | test = movielens['test'] 59 | ``` 60 | 61 | The `train` and `test` elements are the most important: they contain the raw rating data, split into a train and a test set. Each row represents a user, and each column an item. Entries are ratings from 1 to 5. 62 | 63 | ## Fitting models 64 | 65 | Now let's train a BPR model and look at its accuracy. 66 | 67 | We'll use two metrics of accuracy: precision@k and ROC AUC. Both are ranking metrics: to compute them, we'll be constructing recommendation lists for all of our users, and checking the ranking of known positive movies. For precision at k we'll be looking at whether they are within the first k results on the list; for AUC, we'll be calculating the probability that any known positive is higher on the list than a random negative example. 68 | 69 | 70 | ```python 71 | from lightfm import LightFM 72 | from lightfm.evaluation import precision_at_k 73 | from lightfm.evaluation import auc_score 74 | 75 | model = LightFM(learning_rate=0.05, loss='bpr') 76 | model.fit(train, epochs=10) 77 | 78 | train_precision = precision_at_k(model, train, k=10).mean() 79 | test_precision = precision_at_k(model, test, k=10).mean() 80 | 81 | train_auc = auc_score(model, train).mean() 82 | test_auc = auc_score(model, test).mean() 83 | 84 | print('Precision: train %.2f, test %.2f.' % (train_precision, test_precision)) 85 | print('AUC: train %.2f, test %.2f.' % (train_auc, test_auc)) 86 | ``` 87 | 88 | Precision: train 0.59, test 0.10. 89 | AUC: train 0.90, test 0.86. 90 | 91 | 92 | The WARP model, on the other hand, optimises for precision@k---we should expect its performance to be better on precision. 93 | 94 | 95 | ```python 96 | model = LightFM(learning_rate=0.05, loss='warp') 97 | 98 | model.fit_partial(train, epochs=10) 99 | 100 | train_precision = precision_at_k(model, train, k=10).mean() 101 | test_precision = precision_at_k(model, test, k=10).mean() 102 | 103 | train_auc = auc_score(model, train).mean() 104 | test_auc = auc_score(model, test).mean() 105 | 106 | print('Precision: train %.2f, test %.2f.' % (train_precision, test_precision)) 107 | print('AUC: train %.2f, test %.2f.' % (train_auc, test_auc)) 108 | ``` 109 | 110 | Precision: train 0.61, test 0.11. 111 | AUC: train 0.93, test 0.90. 112 | 113 | 114 | And that is exactly what we see: we get slightly higher precision@10 (but the AUC metric is also improved). 
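As a quick sanity check on what the model has learned, we can also look at similarities between items in the learned embedding space. The sketch below is illustrative rather than part of the original example: it assumes the default indicator item features (so the rows of `model.item_embeddings` line up with item ids) and computes cosine similarities by hand.

```python
def similar_movies(model, item_labels, movie_id, n=5):
    # Normalise the learned item embeddings so that dot products
    # are cosine similarities.
    embeddings = model.item_embeddings
    normed = embeddings / np.linalg.norm(embeddings, axis=1)[:, np.newaxis]

    scores = normed @ normed[movie_id]
    best = np.argsort(-scores)[1:n + 1]  # skip the movie itself

    return item_labels[best]

print(similar_movies(model, movielens['item_labels'], movie_id=0))
```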
115 | -------------------------------------------------------------------------------- /examples/quickstart/quickstart.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Quickstart\n", 8 | "In this example, we'll build an implicit feedback recommender using the Movielens 100k dataset (http://grouplens.org/datasets/movielens/100k/).\n", 9 | "\n", 10 | "The code behind this example is available as a [Jupyter notebook](https://github.com/lyst/lightfm/tree/master/examples/quickstart/quickstart.ipynb)\n", 11 | "\n", 12 | "LightFM includes functions for getting and processing this dataset, so obtaining it is quite easy." 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 1, 18 | "metadata": { 19 | "collapsed": false 20 | }, 21 | "outputs": [], 22 | "source": [ 23 | "import numpy as np\n", 24 | "\n", 25 | "from lightfm.datasets import fetch_movielens\n", 26 | "\n", 27 | "data = fetch_movielens(min_rating=5.0)" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "This downloads the dataset and automatically pre-processes it into sparse matrices suitable for further calculation. In particular, it prepares the sparse user-item matrices, containing positive entries where a user interacted with a product, and zeros otherwise.\n", 35 | "\n", 36 | "We have two such matrices, a training and a testing set. Both have around 1000 users and 1700 items. We'll train the model on the train matrix but test it on the test matrix." 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 2, 42 | "metadata": { 43 | "collapsed": false 44 | }, 45 | "outputs": [ 46 | { 47 | "name": "stdout", 48 | "output_type": "stream", 49 | "text": [ 50 | "<943x1682 sparse matrix of type ''\n", 51 | "\twith 19048 stored elements in COOrdinate format>\n", 52 | "<943x1682 sparse matrix of type ''\n", 53 | "\twith 2153 stored elements in COOrdinate format>\n" 54 | ] 55 | } 56 | ], 57 | "source": [ 58 | "print(repr(data['train']))\n", 59 | "print(repr(data['test']))" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [ 66 | "We need to import the model class to fit the model:" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 3, 72 | "metadata": { 73 | "collapsed": true 74 | }, 75 | "outputs": [], 76 | "source": [ 77 | "from lightfm import LightFM" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "metadata": {}, 83 | "source": [ 84 | "We're going to use the WARP (Weighted Approximate-Rank Pairwise) model. WARP is an implicit feedback model: all interactions in the training matrix are treated as positive signals, and products that users did not interact with they implicitly do not like. The goal of the model is to score these implicit positives highly while assigining low scores to implicit negatives.\n", 85 | "\n", 86 | "Model training is accomplished via SGD (stochastic gradient descent). This means that for every pass through the data --- an epoch --- the model learns to fit the data more and more closely. We'll run it for 30 epochs in this example. We can also run it on multiple cores, so we'll set that to 2. 
(The dataset in this example is too small for that to make a difference, but it will matter on bigger datasets.)" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 57, 92 | "metadata": { 93 | "collapsed": false 94 | }, 95 | "outputs": [ 96 | { 97 | "name": "stdout", 98 | "output_type": "stream", 99 | "text": [ 100 | "CPU times: user 1.55 s, sys: 4 ms, total: 1.56 s\n", 101 | "Wall time: 838 ms\n" 102 | ] 103 | }, 104 | { 105 | "data": { 106 | "text/plain": [ 107 | "" 108 | ] 109 | }, 110 | "execution_count": 57, 111 | "metadata": {}, 112 | "output_type": "execute_result" 113 | } 114 | ], 115 | "source": [ 116 | "model = LightFM(loss='warp')\n", 117 | "%time model.fit(data['train'], epochs=30, num_threads=2)" 118 | ] 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "metadata": {}, 123 | "source": [ 124 | "Done! We should now evaluate the model to see how well it's doing. We're most interested in how good the ranking produced by the model is. Precision@k is one suitable metric, expressing the percentage of top k items in the ranking the user has actually interacted with. `lightfm` implements a number of metrics in the `evaluation` module. " 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": 55, 130 | "metadata": { 131 | "collapsed": true 132 | }, 133 | "outputs": [], 134 | "source": [ 135 | "from lightfm.evaluation import precision_at_k" 136 | ] 137 | }, 138 | { 139 | "cell_type": "markdown", 140 | "metadata": {}, 141 | "source": [ 142 | "We'll measure precision in both the train and the test set." 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 58, 148 | "metadata": { 149 | "collapsed": false 150 | }, 151 | "outputs": [ 152 | { 153 | "name": "stdout", 154 | "output_type": "stream", 155 | "text": [ 156 | "Train precision: 0.43\n", 157 | "Test precision: 0.04\n" 158 | ] 159 | } 160 | ], 161 | "source": [ 162 | "print(\"Train precision: %.2f\" % precision_at_k(model, data['train'], k=5).mean())\n", 163 | "print(\"Test precision: %.2f\" % precision_at_k(model, data['test'], k=5).mean())" 164 | ] 165 | }, 166 | { 167 | "cell_type": "markdown", 168 | "metadata": {}, 169 | "source": [ 170 | "Unsurprisingly, the model fits the train set better than the test set.\n", 171 | "\n", 172 | "For an alternative way of judging the model, we can sample a couple of users and get their recommendations. To make predictions for given user, we pass the id of that user and the ids of all products we want predictions for into the `predict` method." 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": 60, 178 | "metadata": { 179 | "collapsed": false 180 | }, 181 | "outputs": [ 182 | { 183 | "name": "stdout", 184 | "output_type": "stream", 185 | "text": [ 186 | "User 3\n", 187 | " Known positives:\n", 188 | " Contact (1997)\n", 189 | " Air Force One (1997)\n", 190 | " In & Out (1997)\n", 191 | " Recommended:\n", 192 | " Air Force One (1997)\n", 193 | " Assignment, The (1997)\n", 194 | " Kiss the Girls (1997)\n", 195 | "User 25\n", 196 | " Known positives:\n", 197 | " Fargo (1996)\n", 198 | " Godfather, The (1972)\n", 199 | " L.A. Confidential (1997)\n", 200 | " Recommended:\n", 201 | " L.A. 
Confidential (1997)\n", 202 | " Titanic (1997)\n", 203 | " Fargo (1996)\n", 204 | "User 450\n", 205 | " Known positives:\n", 206 | " Event Horizon (1997)\n", 207 | " Scream (1996)\n", 208 | " Conspiracy Theory (1997)\n", 209 | " Recommended:\n", 210 | " Independence Day (ID4) (1996)\n", 211 | " Scream (1996)\n", 212 | " Ransom (1996)\n" 213 | ] 214 | } 215 | ], 216 | "source": [ 217 | "def sample_recommendation(model, data, user_ids):\n", 218 | " \n", 219 | "\n", 220 | " n_users, n_items = data['train'].shape\n", 221 | "\n", 222 | " for user_id in user_ids:\n", 223 | " known_positives = data['item_labels'][data['train'].tocsr()[user_id].indices]\n", 224 | " \n", 225 | " scores = model.predict(user_id, np.arange(n_items))\n", 226 | " top_items = data['item_labels'][np.argsort(-scores)]\n", 227 | " \n", 228 | " print(\"User %s\" % user_id)\n", 229 | " print(\" Known positives:\")\n", 230 | " \n", 231 | " for x in known_positives[:3]:\n", 232 | " print(\" %s\" % x)\n", 233 | "\n", 234 | " print(\" Recommended:\")\n", 235 | " \n", 236 | " for x in top_items[:3]:\n", 237 | " print(\" %s\" % x)\n", 238 | " \n", 239 | "sample_recommendation(model, data, [3, 25, 450]) " 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": null, 245 | "metadata": { 246 | "collapsed": true 247 | }, 248 | "outputs": [], 249 | "source": [] 250 | } 251 | ], 252 | "metadata": { 253 | "kernelspec": { 254 | "display_name": "Python 2", 255 | "language": "python", 256 | "name": "python2" 257 | }, 258 | "language_info": { 259 | "codemirror_mode": { 260 | "name": "ipython", 261 | "version": 2 262 | }, 263 | "file_extension": ".py", 264 | "mimetype": "text/x-python", 265 | "name": "python", 266 | "nbconvert_exporter": "python", 267 | "pygments_lexer": "ipython2", 268 | "version": "2.7.10" 269 | } 270 | }, 271 | "nbformat": 4, 272 | "nbformat_minor": 0 273 | } 274 | -------------------------------------------------------------------------------- /examples/quickstart/short_quickstart.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Short quickstart" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": { 14 | "collapsed": false 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "from lightfm import LightFM\n", 19 | "from lightfm.datasets import fetch_movielens\n", 20 | "from lightfm.evaluation import precision_at_k\n", 21 | "\n", 22 | "# Load the MovieLens 100k dataset. 
Only five\n", 23 | "# star ratings are treated as positive.\n", 24 | "data = fetch_movielens(min_rating=5.0)\n", 25 | "\n", 26 | "# Instantiate and train the model\n", 27 | "model = LightFM(loss='warp')\n", 28 | "model.fit(data['train'], epochs=30, num_threads=2)\n", 29 | "\n", 30 | "# Evaluate the trained model\n", 31 | "test_precision = precision_at_k(model, data['test'], k=5).mean()" 32 | ] 33 | } 34 | ], 35 | "metadata": { 36 | "kernelspec": { 37 | "display_name": "Python 2", 38 | "language": "python", 39 | "name": "python2" 40 | }, 41 | "language_info": { 42 | "codemirror_mode": { 43 | "name": "ipython", 44 | "version": 2 45 | }, 46 | "file_extension": ".py", 47 | "mimetype": "text/x-python", 48 | "name": "python", 49 | "nbconvert_exporter": "python", 50 | "pygments_lexer": "ipython2", 51 | "version": "2.7.8" 52 | } 53 | }, 54 | "nbformat": 4, 55 | "nbformat_minor": 0 56 | } 57 | -------------------------------------------------------------------------------- /lightfm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lyst/lightfm/0c9c31e027b976beab2385e268b58010fff46096/lightfm.png -------------------------------------------------------------------------------- /lightfm/__init__.py: -------------------------------------------------------------------------------- 1 | from .lightfm import LightFM 2 | from .version import __version__ 3 | 4 | __all__ = ["LightFM", "datasets", "evaluation", "__version__"] 5 | -------------------------------------------------------------------------------- /lightfm/_lightfm_fast.py: -------------------------------------------------------------------------------- 1 | try: 2 | # Import OpenMP-enabled extension 3 | from ._lightfm_fast_openmp import * # NOQA 4 | from ._lightfm_fast_openmp import __test_in_positives # NOQA 5 | except ImportError: 6 | # Fall back on OpenMP-less extension 7 | import warnings 8 | 9 | warnings.warn( 10 | "LightFM was compiled without OpenMP support. " 11 | "Only a single thread will be used." 12 | ) 13 | 14 | from ._lightfm_fast_no_openmp import * # NOQA 15 | from ._lightfm_fast_no_openmp import __test_in_positives # NOQA 16 | -------------------------------------------------------------------------------- /lightfm/cross_validation.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """ 3 | Dataset splitting functions. 4 | """ 5 | 6 | import numpy as np 7 | import scipy.sparse as sp 8 | 9 | 10 | def _shuffle(uids, iids, data, random_state): 11 | 12 | shuffle_indices = np.arange(len(uids)) 13 | random_state.shuffle(shuffle_indices) 14 | 15 | return (uids[shuffle_indices], iids[shuffle_indices], data[shuffle_indices]) 16 | 17 | 18 | def random_train_test_split(interactions, test_percentage=0.2, random_state=None): 19 | """ 20 | Randomly split interactions between training and testing. 21 | 22 | This function takes an interaction set and splits it into 23 | two disjoint sets, a training set and a test set. Note that 24 | no effort is made to make sure that all items and users with 25 | interactions in the test set also have interactions in the 26 | training set; this may lead to a partial cold-start problem 27 | in the test set. 28 | To split a sample_weight matrix along the same lines, pass it 29 | into this function with the same random_state seed as was used 30 | for splitting the interactions. 
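    A minimal sketch of this pattern (``sample_weight`` here is assumed to be
    a weight matrix built alongside ``interactions``)::

        seed = 42
        train, test = random_train_test_split(interactions, random_state=seed)
        train_w, test_w = random_train_test_split(sample_weight, random_state=seed)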
31 | 32 | Parameters 33 | ---------- 34 | 35 | interactions: a scipy sparse matrix containing interactions 36 | The interactions to split. 37 | test_percentage: float, optional 38 | The fraction of interactions to place in the test set. 39 | random_state: int or numpy.random.RandomState, optional 40 | Random seed used to initialize the numpy.random.RandomState number generator. 41 | Accepts an instance of numpy.random.RandomState for backwards compatibility. 42 | 43 | Returns 44 | ------- 45 | 46 | (train, test): (scipy.sparse.COOMatrix, 47 | scipy.sparse.COOMatrix) 48 | A tuple of (train data, test data) 49 | """ 50 | 51 | if not sp.issparse(interactions): 52 | raise ValueError("Interactions must be a scipy.sparse matrix.") 53 | 54 | if not isinstance(random_state, np.random.RandomState): 55 | random_state = np.random.RandomState(seed=random_state) 56 | 57 | interactions = interactions.tocoo() 58 | 59 | shape = interactions.shape 60 | uids, iids, data = (interactions.row, interactions.col, interactions.data) 61 | 62 | uids, iids, data = _shuffle(uids, iids, data, random_state) 63 | 64 | cutoff = int((1.0 - test_percentage) * len(uids)) 65 | 66 | train_idx = slice(None, cutoff) 67 | test_idx = slice(cutoff, None) 68 | 69 | train = sp.coo_matrix( 70 | (data[train_idx], (uids[train_idx], iids[train_idx])), 71 | shape=shape, 72 | dtype=interactions.dtype, 73 | ) 74 | test = sp.coo_matrix( 75 | (data[test_idx], (uids[test_idx], iids[test_idx])), 76 | shape=shape, 77 | dtype=interactions.dtype, 78 | ) 79 | 80 | return train, test 81 | -------------------------------------------------------------------------------- /lightfm/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from lightfm.datasets.movielens import fetch_movielens # NOQA 2 | from lightfm.datasets.stackexchange import fetch_stackexchange # NOQA 3 | -------------------------------------------------------------------------------- /lightfm/datasets/_common.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import requests 4 | 5 | 6 | def get_data_dir(): 7 | 8 | return os.path.join(os.path.expanduser("~"), "lightfm_data") 9 | 10 | 11 | def create_data_dir(path): 12 | 13 | if not os.path.isdir(path): 14 | os.makedirs(path) 15 | 16 | 17 | def download(url, dest_path): 18 | 19 | req = requests.get(url, stream=True) 20 | req.raise_for_status() 21 | 22 | with open(dest_path, "wb") as fd: 23 | for chunk in req.iter_content(chunk_size=2**20): 24 | fd.write(chunk) 25 | 26 | 27 | def get_data(data_home, url, dest_subdir, dest_filename, download_if_missing): 28 | 29 | if data_home is None: 30 | data_dir = os.path.join(get_data_dir(), dest_subdir) 31 | else: 32 | data_dir = os.path.join(os.path.abspath(data_home), dest_subdir) 33 | 34 | create_data_dir(data_dir) 35 | 36 | dest_path = os.path.join(data_dir, dest_filename) 37 | 38 | if not os.path.isfile(dest_path): 39 | if download_if_missing: 40 | download(url, dest_path) 41 | else: 42 | raise IOError("Dataset missing.") 43 | 44 | return dest_path 45 | -------------------------------------------------------------------------------- /lightfm/datasets/movielens.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import os 3 | import zipfile 4 | 5 | import numpy as np 6 | 7 | import scipy.sparse as sp 8 | 9 | from lightfm.datasets import _common 10 | 11 | 12 | def _read_raw_data(path): 13 | """ 14 | Return the raw lines of the train 
and test files. 15 | """ 16 | 17 | with zipfile.ZipFile(path) as datafile: 18 | return ( 19 | datafile.read("ml-100k/ua.base").decode().split("\n"), 20 | datafile.read("ml-100k/ua.test").decode().split("\n"), 21 | datafile.read("ml-100k/u.item").decode(errors="ignore").split("\n"), 22 | datafile.read("ml-100k/u.genre").decode(errors="ignore").split("\n"), 23 | ) 24 | 25 | 26 | def _parse(data): 27 | 28 | for line in data: 29 | 30 | if not line: 31 | continue 32 | 33 | uid, iid, rating, timestamp = [int(x) for x in line.split("\t")] 34 | 35 | # Subtract one from ids to shift 36 | # to zero-based indexing 37 | yield uid - 1, iid - 1, rating, timestamp 38 | 39 | 40 | def _get_dimensions(train_data, test_data): 41 | 42 | uids = set() 43 | iids = set() 44 | 45 | for uid, iid, _, _ in itertools.chain(train_data, test_data): 46 | uids.add(uid) 47 | iids.add(iid) 48 | 49 | rows = max(uids) + 1 50 | cols = max(iids) + 1 51 | 52 | return rows, cols 53 | 54 | 55 | def _build_interaction_matrix(rows, cols, data, min_rating): 56 | 57 | mat = sp.lil_matrix((rows, cols), dtype=np.int32) 58 | 59 | for uid, iid, rating, _ in data: 60 | if rating >= min_rating: 61 | mat[uid, iid] = rating 62 | 63 | return mat.tocoo() 64 | 65 | 66 | def _parse_item_metadata(num_items, item_metadata_raw, genres_raw): 67 | 68 | genres = [] 69 | 70 | for line in genres_raw: 71 | if line: 72 | genre, gid = line.split("|") 73 | genres.append("genre:{}".format(genre)) 74 | 75 | id_feature_labels = np.empty(num_items, dtype=str) 76 | genre_feature_labels = np.array(genres) 77 | 78 | id_features = sp.identity(num_items, format="csr", dtype=np.float32) 79 | genre_features = sp.lil_matrix((num_items, len(genres)), dtype=np.float32) 80 | 81 | for line in item_metadata_raw: 82 | 83 | if not line: 84 | continue 85 | 86 | splt = line.split("|") 87 | 88 | # Zero-based indexing 89 | iid = int(splt[0]) - 1 90 | title = splt[1] 91 | 92 | id_feature_labels[iid] = title 93 | 94 | item_genres = [idx for idx, val in enumerate(splt[5:]) if int(val) > 0] 95 | 96 | for gid in item_genres: 97 | genre_features[iid, gid] = 1.0 98 | 99 | return ( 100 | id_features, 101 | id_feature_labels, 102 | genre_features.tocsr(), 103 | genre_feature_labels, 104 | ) 105 | 106 | 107 | def fetch_movielens( 108 | data_home=None, 109 | indicator_features=True, 110 | genre_features=False, 111 | min_rating=0.0, 112 | download_if_missing=True, 113 | ): 114 | """ 115 | Fetch the `Movielens 100k dataset `_. 116 | 117 | The dataset contains 100,000 interactions from 1000 users on 1700 movies, 118 | and is exhaustively described in its 119 | `README `_. 120 | 121 | Parameters 122 | ---------- 123 | 124 | data_home: path, optional 125 | Path to the directory in which the downloaded data should be placed. 126 | Defaults to ``~/lightfm_data/``. 127 | indicator_features: bool, optional 128 | Use an [n_items, n_items] identity matrix for item features. When True with genre_features, 129 | indicator and genre features are concatenated into a single feature matrix of shape 130 | [n_items, n_items + n_genres]. 131 | genre_features: bool, optional 132 | Use a [n_items, n_genres] matrix for item features. When True with item_indicator_features, 133 | indicator and genre features are concatenated into a single feature matrix of shape 134 | [n_items, n_items + n_genres]. 135 | min_rating: float, optional 136 | Minimum rating to include in the interaction matrix. 137 | download_if_missing: bool, optional 138 | Download the data if not present. 
Raises an IOError if False and data is missing. 139 | 140 | Notes 141 | ----- 142 | 143 | The return value is a dictionary containing the following keys: 144 | 145 | Returns 146 | ------- 147 | 148 | train: sp.coo_matrix of shape [n_users, n_items] 149 | Contains training set interactions. 150 | test: sp.coo_matrix of shape [n_users, n_items] 151 | Contains testing set interactions. 152 | item_features: sp.csr_matrix of shape [n_items, n_item_features] 153 | Contains item features. 154 | item_feature_labels: np.array of strings of shape [n_item_features,] 155 | Labels of item features. 156 | item_labels: np.array of strings of shape [n_items,] 157 | Items' titles. 158 | """ 159 | 160 | if not (indicator_features or genre_features): 161 | raise ValueError( 162 | "At least one of item_indicator_features " "or genre_features must be True" 163 | ) 164 | 165 | zip_path = _common.get_data( 166 | data_home, 167 | ( 168 | "https://github.com/maciejkula/" 169 | "lightfm_datasets/releases/" 170 | "download/v0.1.0/movielens.zip" 171 | ), 172 | "movielens100k", 173 | "movielens.zip", 174 | download_if_missing, 175 | ) 176 | 177 | # Load raw data 178 | try: 179 | (train_raw, test_raw, item_metadata_raw, genres_raw) = _read_raw_data(zip_path) 180 | except zipfile.BadZipFile: 181 | # Download was corrupted, get rid of the partially 182 | # downloaded file so that we re-download on the 183 | # next try. 184 | os.unlink(zip_path) 185 | raise ValueError( 186 | "Corrupted Movielens download. Check your " 187 | "internet connection and try again." 188 | ) 189 | 190 | # Figure out the dimensions 191 | num_users, num_items = _get_dimensions(_parse(train_raw), _parse(test_raw)) 192 | 193 | # Load train interactions 194 | train = _build_interaction_matrix( 195 | num_users, num_items, _parse(train_raw), min_rating 196 | ) 197 | # Load test interactions 198 | test = _build_interaction_matrix(num_users, num_items, _parse(test_raw), min_rating) 199 | 200 | assert train.shape == test.shape 201 | 202 | # Load metadata features 203 | ( 204 | id_features, 205 | id_feature_labels, 206 | genre_features_matrix, 207 | genre_feature_labels, 208 | ) = _parse_item_metadata(num_items, item_metadata_raw, genres_raw) 209 | 210 | assert id_features.shape == (num_items, len(id_feature_labels)) 211 | assert genre_features_matrix.shape == (num_items, len(genre_feature_labels)) 212 | 213 | if indicator_features and not genre_features: 214 | features = id_features 215 | feature_labels = id_feature_labels 216 | elif genre_features and not indicator_features: 217 | features = genre_features_matrix 218 | feature_labels = genre_feature_labels 219 | else: 220 | features = sp.hstack([id_features, genre_features_matrix]).tocsr() 221 | feature_labels = np.concatenate((id_feature_labels, genre_feature_labels)) 222 | 223 | data = { 224 | "train": train, 225 | "test": test, 226 | "item_features": features, 227 | "item_feature_labels": feature_labels, 228 | "item_labels": id_feature_labels, 229 | } 230 | 231 | return data 232 | -------------------------------------------------------------------------------- /lightfm/datasets/stackexchange.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | 5 | import scipy.sparse as sp 6 | 7 | from lightfm.datasets import _common 8 | 9 | 10 | def fetch_stackexchange( 11 | dataset, 12 | test_set_fraction=0.2, 13 | min_training_interactions=1, 14 | data_home=None, 15 | indicator_features=True, 16 | tag_features=False, 17 | 
download_if_missing=True, 18 | ): 19 | """ 20 | Fetch a dataset from the `StackExchange network `_. 21 | 22 | The datasets contain users answering questions: an interaction is defined as a user 23 | answering a given question. 24 | 25 | The following datasets from the StackExchange network are available: 26 | 27 | - CrossValidated: From stats.stackexchange.com. Approximately 9000 users, 72000 questions, 28 | and 70000 answers. 29 | - StackOverflow: From stackoverflow.stackexchange.com. Approximately 1.3M users, 11M questions, 30 | and 18M answers. 31 | 32 | Parameters 33 | ---------- 34 | 35 | dataset: string, one of ('crossvalidated', 'stackoverflow') 36 | The part of the StackExchange network for which to fetch the dataset. 37 | test_set_fraction: float, optional 38 | The fraction of the dataset used for testing. Splitting into the train and test set is done 39 | in a time-based fashion: all interactions before a certain time are in the train set and 40 | all interactions after that time are in the test set. 41 | min_training_interactions: int, optional 42 | Only include users with this amount of interactions in the training set. 43 | data_home: path, optional 44 | Path to the directory in which the downloaded data should be placed. 45 | Defaults to ``~/lightfm_data/``. 46 | indicator_features: bool, optional 47 | Use an [n_users, n_users] identity matrix for item features. When True with genre_features, 48 | indicator and genre features are concatenated into a single feature matrix of shape 49 | [n_users, n_users + n_genres]. 50 | download_if_missing: bool, optional 51 | Download the data if not present. Raises an IOError if False and data is missing. 52 | 53 | Notes 54 | ----- 55 | 56 | The return value is a dictionary containing the following keys: 57 | 58 | Returns 59 | ------- 60 | 61 | train: sp.coo_matrix of shape [n_users, n_items] 62 | Contains training set interactions. 63 | test: sp.coo_matrix of shape [n_users, n_items] 64 | Contains testing set interactions. 65 | item_features: sp.csr_matrix of shape [n_items, n_item_features] 66 | Contains item features. 67 | item_feature_labels: np.array of strings of shape [n_item_features,] 68 | Labels of item features. 
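    A minimal usage sketch (fetching the smaller CrossValidated dataset with
    both indicator and tag features; the argument values are illustrative)::

        from lightfm.datasets import fetch_stackexchange

        data = fetch_stackexchange(
            'crossvalidated',
            test_set_fraction=0.1,
            indicator_features=True,
            tag_features=True,
        )
        train, test = data['train'], data['test']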
69 | """ 70 | 71 | if not (indicator_features or tag_features): 72 | raise ValueError( 73 | "At least one of item_indicator_features " "or tag_features must be True" 74 | ) 75 | 76 | if dataset not in ("crossvalidated", "stackoverflow"): 77 | raise ValueError("Unknown dataset") 78 | 79 | if not (0.0 < test_set_fraction < 1.0): 80 | raise ValueError("Test set fraction must be between 0 and 1") 81 | 82 | urls = { 83 | "crossvalidated": ( 84 | "https://github.com/maciejkula/lightfm_datasets/releases/" 85 | "download/v0.1.0/stackexchange_crossvalidated.npz" 86 | ), 87 | "stackoverflow": ( 88 | "https://github.com/maciejkula/lightfm_datasets/releases/" 89 | "download/v0.1.0/stackexchange_stackoverflow.npz" 90 | ), 91 | } 92 | 93 | path = _common.get_data( 94 | data_home, 95 | urls[dataset], 96 | os.path.join("stackexchange", dataset), 97 | "data.npz", 98 | download_if_missing, 99 | ) 100 | 101 | data = np.load(path) 102 | 103 | interactions = sp.coo_matrix( 104 | ( 105 | data["interactions_data"], 106 | (data["interactions_row"], data["interactions_col"]), 107 | ), 108 | shape=data["interactions_shape"].flatten(), 109 | ) 110 | interactions.sum_duplicates() 111 | 112 | tag_features_mat = sp.coo_matrix( 113 | (data["features_data"], (data["features_row"], data["features_col"])), 114 | shape=data["features_shape"].flatten(), 115 | ) 116 | tag_labels = data["labels"] 117 | 118 | test_cutoff_index = int(len(interactions.data) * (1.0 - test_set_fraction)) 119 | test_cutoff_timestamp = np.sort(interactions.data)[test_cutoff_index] 120 | in_train = interactions.data < test_cutoff_timestamp 121 | in_test = np.logical_not(in_train) 122 | 123 | train = sp.coo_matrix( 124 | ( 125 | np.ones(in_train.sum(), dtype=np.float32), 126 | (interactions.row[in_train], interactions.col[in_train]), 127 | ), 128 | shape=interactions.shape, 129 | ) 130 | test = sp.coo_matrix( 131 | ( 132 | np.ones(in_test.sum(), dtype=np.float32), 133 | (interactions.row[in_test], interactions.col[in_test]), 134 | ), 135 | shape=interactions.shape, 136 | ) 137 | 138 | if min_training_interactions > 0: 139 | include = np.squeeze(np.array(train.getnnz(axis=1))) > min_training_interactions 140 | 141 | train = train.tocsr()[include].tocoo() 142 | test = test.tocsr()[include].tocoo() 143 | 144 | if indicator_features and not tag_features: 145 | features = sp.identity(train.shape[1], format="csr", dtype=np.float32) 146 | labels = np.array(["question_id:{}".format(x) for x in range(train.shape[1])]) 147 | elif not indicator_features and tag_features: 148 | features = tag_features_mat.tocsr() 149 | labels = tag_labels 150 | else: 151 | id_features = sp.identity(train.shape[1], format="csr", dtype=np.float32) 152 | features = sp.hstack([id_features, tag_features_mat]).tocsr() 153 | labels = np.concatenate( 154 | [ 155 | np.array(["question_id:{}".format(x) for x in range(train.shape[1])]), 156 | tag_labels, 157 | ] 158 | ) 159 | 160 | return { 161 | "train": train, 162 | "test": test, 163 | "item_features": features, 164 | "item_feature_labels": labels, 165 | } 166 | -------------------------------------------------------------------------------- /lightfm/evaluation.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """ 3 | Module containing evaluation functions suitable for judging the performance of 4 | a fitted LightFM model. 
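A rough usage sketch (``model``, ``train`` and ``test`` are assumed to be a
fitted LightFM model and train/test interaction matrices)::

    from lightfm.evaluation import precision_at_k, auc_score

    prec = precision_at_k(model, test, train_interactions=train, k=10).mean()
    auc = auc_score(model, test, train_interactions=train).mean()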
5 | """ 6 | 7 | import numpy as np 8 | 9 | from ._lightfm_fast import CSRMatrix, calculate_auc_from_rank 10 | 11 | __all__ = ["precision_at_k", "recall_at_k", "auc_score", "reciprocal_rank"] 12 | 13 | 14 | def precision_at_k( 15 | model, 16 | test_interactions, 17 | train_interactions=None, 18 | k=10, 19 | user_features=None, 20 | item_features=None, 21 | preserve_rows=False, 22 | num_threads=1, 23 | check_intersections=True, 24 | ): 25 | """ 26 | Measure the precision at k metric for a model: the fraction of known 27 | positives in the first k positions of the ranked list of results. 28 | A perfect score is 1.0. 29 | 30 | Parameters 31 | ---------- 32 | 33 | model: LightFM instance 34 | the fitted model to be evaluated 35 | test_interactions: np.float32 csr_matrix of shape [n_users, n_items] 36 | Non-zero entries representing known positives in the evaluation set. 37 | train_interactions: np.float32 csr_matrix of shape [n_users, n_items], optional 38 | Non-zero entries representing known positives in the train set. These 39 | will be omitted from the score calculations to avoid re-recommending 40 | known positives. 41 | k: integer, optional 42 | The k parameter. 43 | user_features: np.float32 csr_matrix of shape [n_users, n_user_features], optional 44 | Each row contains that user's weights over features. 45 | item_features: np.float32 csr_matrix of shape [n_items, n_item_features], optional 46 | Each row contains that item's weights over features. 47 | preserve_rows: boolean, optional 48 | When False (default), the number of rows in the output will be equal 49 | to the number of users with interactions in the evaluation set. 50 | When True, the number of rows in the output will be equal to the 51 | number of users. 52 | num_threads: int, optional 53 | Number of parallel computation threads to use. Should 54 | not be higher than the number of physical cores. 55 | check_intersections: bool, optional, True by default, 56 | Only relevant when train_interactions are supplied. 57 | A flag that signals whether the test and train matrices should be checked 58 | for intersections to prevent optimistic ranks / wrong evaluation / bad data split. 59 | 60 | Returns 61 | ------- 62 | 63 | np.array of shape [n_users with interactions or n_users,] 64 | Numpy array containing precision@k scores for each user. If there are 65 | no interactions for a given user the returned precision will be 0. 66 | """ 67 | 68 | if num_threads < 1: 69 | raise ValueError("Number of threads must be 1 or larger.") 70 | 71 | ranks = model.predict_rank( 72 | test_interactions, 73 | train_interactions=train_interactions, 74 | user_features=user_features, 75 | item_features=item_features, 76 | num_threads=num_threads, 77 | check_intersections=check_intersections, 78 | ) 79 | 80 | ranks.data = np.less(ranks.data, k, ranks.data) 81 | 82 | precision = np.squeeze(np.array(ranks.sum(axis=1))) / k 83 | 84 | if not preserve_rows: 85 | precision = precision[test_interactions.getnnz(axis=1) > 0] 86 | 87 | return precision 88 | 89 | 90 | def recall_at_k( 91 | model, 92 | test_interactions, 93 | train_interactions=None, 94 | k=10, 95 | user_features=None, 96 | item_features=None, 97 | preserve_rows=False, 98 | num_threads=1, 99 | check_intersections=True, 100 | ): 101 | """ 102 | Measure the recall at k metric for a model: the number of positive items in 103 | the first k positions of the ranked list of results divided by the number 104 | of positive items in the test period. A perfect score is 1.0. 
105 | 106 | Parameters 107 | ---------- 108 | 109 | model: LightFM instance 110 | the fitted model to be evaluated 111 | test_interactions: np.float32 csr_matrix of shape [n_users, n_items] 112 | Non-zero entries representing known positives in the evaluation set. 113 | train_interactions: np.float32 csr_matrix of shape [n_users, n_items], optional 114 | Non-zero entries representing known positives in the train set. These 115 | will be omitted from the score calculations to avoid re-recommending 116 | known positives. 117 | k: integer, optional 118 | The k parameter. 119 | user_features: np.float32 csr_matrix of shape [n_users, n_user_features], optional 120 | Each row contains that user's weights over features. 121 | item_features: np.float32 csr_matrix of shape [n_items, n_item_features], optional 122 | Each row contains that item's weights over features. 123 | preserve_rows: boolean, optional 124 | When False (default), the number of rows in the output will be equal 125 | to the number of users with interactions in the evaluation set. 126 | When True, the number of rows in the output will be equal to the 127 | number of users. 128 | num_threads: int, optional 129 | Number of parallel computation threads to use. Should 130 | not be higher than the number of physical cores. 131 | check_intersections: bool, optional, True by default, 132 | Only relevant when train_interactions are supplied. 133 | A flag that signals whether the test and train matrices should be checked 134 | for intersections to prevent optimistic ranks / wrong evaluation / bad data split. 135 | 136 | Returns 137 | ------- 138 | 139 | np.array of shape [n_users with interactions or n_users,] 140 | Numpy array containing recall@k scores for each user. If there are no 141 | interactions for a given user having items in the test period, the 142 | returned recall will be 0. 143 | """ 144 | 145 | if num_threads < 1: 146 | raise ValueError("Number of threads must be 1 or larger.") 147 | 148 | ranks = model.predict_rank( 149 | test_interactions, 150 | train_interactions=train_interactions, 151 | user_features=user_features, 152 | item_features=item_features, 153 | num_threads=num_threads, 154 | check_intersections=check_intersections, 155 | ) 156 | 157 | ranks.data = np.less(ranks.data, k, ranks.data) 158 | 159 | retrieved = np.squeeze(test_interactions.getnnz(axis=1)) 160 | hit = np.squeeze(np.array(ranks.sum(axis=1))) 161 | 162 | if not preserve_rows: 163 | hit = hit[test_interactions.getnnz(axis=1) > 0] 164 | retrieved = retrieved[test_interactions.getnnz(axis=1) > 0] 165 | 166 | return hit / retrieved 167 | 168 | 169 | def auc_score( 170 | model, 171 | test_interactions, 172 | train_interactions=None, 173 | user_features=None, 174 | item_features=None, 175 | preserve_rows=False, 176 | num_threads=1, 177 | check_intersections=True, 178 | ): 179 | """ 180 | Measure the ROC AUC metric for a model: the probability that a randomly 181 | chosen positive example has a higher score than a randomly chosen negative 182 | example. 183 | A perfect score is 1.0. 184 | 185 | Parameters 186 | ---------- 187 | 188 | model: LightFM instance 189 | the fitted model to be evaluated 190 | test_interactions: np.float32 csr_matrix of shape [n_users, n_items] 191 | Non-zero entries representing known positives in the evaluation set. 192 | train_interactions: np.float32 csr_matrix of shape [n_users, n_items], optional 193 | Non-zero entries representing known positives in the train set. 
These 194 | will be omitted from the score calculations to avoid re-recommending 195 | known positives. 196 | user_features: np.float32 csr_matrix of shape [n_users, n_user_features], optional 197 | Each row contains that user's weights over features. 198 | item_features: np.float32 csr_matrix of shape [n_items, n_item_features], optional 199 | Each row contains that item's weights over features. 200 | preserve_rows: boolean, optional 201 | When False (default), the number of rows in the output will be equal 202 | to the number of users with interactions in the evaluation set. 203 | When True, the number of rows in the output will be equal to the 204 | number of users. 205 | num_threads: int, optional 206 | Number of parallel computation threads to use. Should 207 | not be higher than the number of physical cores. 208 | check_intersections: bool, optional, True by default, 209 | Only relevant when train_interactions are supplied. 210 | A flag that signals whether the test and train matrices should be checked 211 | for intersections to prevent optimistic ranks / wrong evaluation / bad data split. 212 | 213 | Returns 214 | ------- 215 | 216 | np.array of shape [n_users with interactions or n_users,] 217 | Numpy array containing AUC scores for each user. If there are no 218 | interactions for a given user the returned AUC will be 0.5. 219 | """ 220 | 221 | if num_threads < 1: 222 | raise ValueError("Number of threads must be 1 or larger.") 223 | 224 | ranks = model.predict_rank( 225 | test_interactions, 226 | train_interactions=train_interactions, 227 | user_features=user_features, 228 | item_features=item_features, 229 | num_threads=num_threads, 230 | check_intersections=check_intersections, 231 | ) 232 | 233 | assert np.all(ranks.data >= 0) 234 | 235 | auc = np.zeros(ranks.shape[0], dtype=np.float32) 236 | 237 | if train_interactions is not None: 238 | num_train_positives = np.squeeze( 239 | np.array(train_interactions.getnnz(axis=1)).astype(np.int32) 240 | ) 241 | else: 242 | num_train_positives = np.zeros(test_interactions.shape[0], dtype=np.int32) 243 | 244 | # The second argument is modified in-place, but 245 | # here we don't care about the inconsistency 246 | # introduced into the ranks matrix. 247 | calculate_auc_from_rank( 248 | CSRMatrix(ranks), num_train_positives, ranks.data, auc, num_threads 249 | ) 250 | 251 | if not preserve_rows: 252 | auc = auc[test_interactions.getnnz(axis=1) > 0] 253 | 254 | return auc 255 | 256 | 257 | def reciprocal_rank( 258 | model, 259 | test_interactions, 260 | train_interactions=None, 261 | user_features=None, 262 | item_features=None, 263 | preserve_rows=False, 264 | num_threads=1, 265 | check_intersections=True, 266 | ): 267 | """ 268 | Measure the reciprocal rank metric for a model: 1 / the rank of the highest 269 | ranked positive example. A perfect score is 1.0. 270 | 271 | Parameters 272 | ---------- 273 | 274 | model: LightFM instance 275 | the fitted model to be evaluated 276 | test_interactions: np.float32 csr_matrix of shape [n_users, n_items] 277 | Non-zero entries representing known positives in the evaluation set. 278 | train_interactions: np.float32 csr_matrix of shape [n_users, n_items], optional 279 | Non-zero entries representing known positives in the train set. These 280 | will be omitted from the score calculations to avoid re-recommending 281 | known positives. 282 | user_features: np.float32 csr_matrix of shape [n_users, n_user_features], optional 283 | Each row contains that user's weights over features. 
284 | item_features: np.float32 csr_matrix of shape [n_items, n_item_features], optional 285 | Each row contains that item's weights over features. 286 | preserve_rows: boolean, optional 287 | When False (default), the number of rows in the output will be equal 288 | to the number of users with interactions in the evaluation set. 289 | When True, the number of rows in the output will be equal to the 290 | number of users. 291 | num_threads: int, optional 292 | Number of parallel computation threads to use. Should 293 | not be higher than the number of physical cores. 294 | check_intersections: bool, optional, True by default, 295 | Only relevant when train_interactions are supplied. 296 | A flag that signals whether the test and train matrices should be checked 297 | for intersections to prevent optimistic ranks / wrong evaluation / bad data split. 298 | 299 | Returns 300 | ------- 301 | 302 | np.array of shape [n_users with interactions or n_users,] 303 | Numpy array containing reciprocal rank scores for each user. 304 | If there are no interactions for a given user the returned value will 305 | be 0.0. 306 | """ 307 | 308 | if num_threads < 1: 309 | raise ValueError("Number of threads must be 1 or larger.") 310 | 311 | ranks = model.predict_rank( 312 | test_interactions, 313 | train_interactions=train_interactions, 314 | user_features=user_features, 315 | item_features=item_features, 316 | num_threads=num_threads, 317 | check_intersections=check_intersections, 318 | ) 319 | 320 | ranks.data = 1.0 / (ranks.data + 1.0) 321 | 322 | ranks = np.squeeze(np.array(ranks.max(axis=1).todense())) 323 | 324 | if not preserve_rows: 325 | ranks = ranks[test_interactions.getnnz(axis=1) > 0] 326 | 327 | return ranks 328 | -------------------------------------------------------------------------------- /lightfm/version.py: -------------------------------------------------------------------------------- 1 | __version__ = "1.17" 2 | -------------------------------------------------------------------------------- /lint-requirements.txt: -------------------------------------------------------------------------------- 1 | pre-commit==2.17.0 2 | black==22.1.0 3 | flake8==4.0.1 4 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md 3 | 4 | [flake8] 5 | ignore = I100, W503, E203 6 | max-line-length = 100 7 | exclude = .git,__pycache__,docs/source/conf.py,old,build,dist,docs,doc 8 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import os 3 | import pathlib 4 | import subprocess 5 | import sys 6 | import textwrap 7 | 8 | from setuptools import Command, Extension, setup 9 | 10 | 11 | def define_extensions(use_openmp): 12 | compile_args = [] 13 | if not os.environ.get("LIGHTFM_NO_CFLAGS"): 14 | compile_args += ["-ffast-math"] 15 | 16 | if sys.platform.startswith("darwin"): 17 | compile_args += [] 18 | else: 19 | compile_args += ["-march=native"] 20 | 21 | if not use_openmp: 22 | print("Compiling without OpenMP support.") 23 | return [ 24 | Extension( 25 | "lightfm._lightfm_fast_no_openmp", 26 | ["lightfm/_lightfm_fast_no_openmp.c"], 27 | extra_compile_args=compile_args, 28 | ) 29 | ] 30 | else: 31 | return [ 32 | Extension( 33 | "lightfm._lightfm_fast_openmp", 34 | ["lightfm/_lightfm_fast_openmp.c"], 
35 | extra_link_args=["-fopenmp"], 36 | extra_compile_args=compile_args + ["-fopenmp"], 37 | ) 38 | ] 39 | 40 | 41 | class Cythonize(Command): 42 | """ 43 | Compile the extension .pyx files. 44 | """ 45 | 46 | user_options = [] 47 | 48 | def initialize_options(self): 49 | pass 50 | 51 | def finalize_options(self): 52 | pass 53 | 54 | def generate_pyx(self): 55 | openmp_import = textwrap.dedent( 56 | """ 57 | from cython.parallel import parallel, prange 58 | cimport openmp 59 | """ 60 | ) 61 | 62 | lock_init = textwrap.dedent( 63 | """ 64 | cdef openmp.omp_lock_t THREAD_LOCK 65 | openmp.omp_init_lock(&THREAD_LOCK) 66 | """ 67 | ) 68 | 69 | params = ( 70 | ( 71 | "no_openmp", 72 | dict( 73 | openmp_import="", 74 | nogil_block="with nogil:", 75 | range_block="range", 76 | thread_num="0", 77 | lock_init="", 78 | lock_acquire="", 79 | lock_release="", 80 | ), 81 | ), 82 | ( 83 | "openmp", 84 | dict( 85 | openmp_import=openmp_import, 86 | nogil_block="with nogil, parallel(num_threads=num_threads):", 87 | range_block="prange", 88 | thread_num="openmp.omp_get_thread_num()", 89 | lock_init=lock_init, 90 | lock_acquire="openmp.omp_set_lock(&THREAD_LOCK)", 91 | lock_release="openmp.omp_unset_lock(&THREAD_LOCK)", 92 | ), 93 | ), 94 | ) 95 | 96 | file_dir = os.path.join(os.path.dirname(__file__), "lightfm") 97 | 98 | with open(os.path.join(file_dir, "_lightfm_fast.pyx.template"), "r") as fl: 99 | template = fl.read() 100 | 101 | for variant, template_params in params: 102 | with open( 103 | os.path.join(file_dir, "_lightfm_fast_{}.pyx".format(variant)), "w" 104 | ) as fl: 105 | fl.write(template.format(**template_params)) 106 | 107 | def run(self): 108 | from Cython.Build import cythonize 109 | 110 | self.generate_pyx() 111 | 112 | cythonize( 113 | [ 114 | Extension( 115 | "lightfm._lightfm_fast_no_openmp", 116 | ["lightfm/_lightfm_fast_no_openmp.pyx"], 117 | ), 118 | Extension( 119 | "lightfm._lightfm_fast_openmp", 120 | ["lightfm/_lightfm_fast_openmp.pyx"], 121 | extra_link_args=["-fopenmp"], 122 | ), 123 | ], 124 | compiler_directives={'language_level' : "3"} 125 | ) 126 | 127 | 128 | class Clean(Command): 129 | """ 130 | Clean build files. 
131 | """ 132 | 133 | user_options = [("all", None, "(Compatibility with original clean command)")] 134 | 135 | def initialize_options(self): 136 | self.all = False 137 | 138 | def finalize_options(self): 139 | pass 140 | 141 | def run(self): 142 | pth = os.path.dirname(os.path.abspath(__file__)) 143 | 144 | subprocess.call(["rm", "-rf", os.path.join(pth, "build")]) 145 | subprocess.call(["rm", "-rf", os.path.join(pth, "lightfm.egg-info")]) 146 | subprocess.call(["find", pth, "-name", "lightfm*.pyc", "-type", "f", "-delete"]) 147 | subprocess.call(["rm", os.path.join(pth, "lightfm", "_lightfm_fast.so")]) 148 | 149 | 150 | def read_version(): 151 | mod = {} 152 | path = os.path.join( 153 | os.path.dirname(__file__), 154 | "lightfm", 155 | "version.py", 156 | ) 157 | with open(path) as fd: 158 | exec(fd.read(), mod) 159 | return mod["__version__"] 160 | 161 | 162 | use_openmp = not sys.platform.startswith("darwin") and not sys.platform.startswith( 163 | "win" 164 | ) 165 | 166 | long_description = pathlib.Path(__file__).parent.joinpath("README.md").read_text() 167 | 168 | setup( 169 | name="lightfm", 170 | version=read_version(), 171 | description="LightFM recommendation model", 172 | long_description=long_description, 173 | long_description_content_type="text/markdown", 174 | url="https://github.com/lyst/lightfm", 175 | download_url="https://github.com/lyst/lightfm/tarball/{}".format(read_version()), 176 | packages=["lightfm", "lightfm.datasets"], 177 | package_data={"": ["*.c"]}, 178 | install_requires=["numpy", "scipy>=0.17.0", "requests", "scikit-learn"], 179 | tests_require=["pytest", "requests", "scikit-learn"], 180 | cmdclass={"cythonize": Cythonize, "clean": Clean}, 181 | author="Lyst Ltd (Maciej Kula)", 182 | author_email="data@ly.st", 183 | license="MIT", 184 | classifiers=[ 185 | "Development Status :: 5 - Production/Stable", 186 | "License :: OSI Approved :: MIT License", 187 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 188 | ], 189 | ext_modules=define_extensions(use_openmp), 190 | ) 191 | -------------------------------------------------------------------------------- /test-requirements.txt: -------------------------------------------------------------------------------- 1 | pytest 2 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lyst/lightfm/0c9c31e027b976beab2385e268b58010fff46096/tests/__init__.py -------------------------------------------------------------------------------- /tests/test_api.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import pytest 4 | 5 | import scipy.sparse as sp 6 | 7 | from lightfm.lightfm import LightFM 8 | 9 | 10 | def test_empty_matrix(): 11 | 12 | no_users, no_items = (10, 100) 13 | 14 | train = sp.coo_matrix((no_users, no_items), dtype=np.int32) 15 | 16 | model = LightFM() 17 | model.fit_partial(train) 18 | 19 | 20 | def test_matrix_types(): 21 | 22 | mattypes = (sp.coo_matrix, sp.lil_matrix, sp.csr_matrix, sp.csc_matrix) 23 | 24 | dtypes = (np.int32, np.int64, np.float32, np.float64) 25 | 26 | no_users, no_items = (10, 100) 27 | no_features = 20 28 | 29 | for mattype in mattypes: 30 | for dtype in dtypes: 31 | train = mattype((no_users, no_items), dtype=dtype) 32 | weights = train.tocoo() 33 | 34 | user_features = mattype((no_users, no_features), dtype=dtype) 35 | item_features = 
mattype((no_items, no_features), dtype=dtype) 36 | 37 | model = LightFM() 38 | model.fit_partial( 39 | train, 40 | sample_weight=weights, 41 | user_features=user_features, 42 | item_features=item_features, 43 | ) 44 | 45 | model.predict( 46 | np.random.randint(0, no_users, 10).astype(np.int32), 47 | np.random.randint(0, no_items, 10).astype(np.int32), 48 | user_features=user_features, 49 | item_features=item_features, 50 | ) 51 | 52 | model.predict_rank( 53 | train, user_features=user_features, item_features=item_features 54 | ) 55 | 56 | 57 | def test_coo_with_duplicate_entries(): 58 | # Calling .tocsr on a COO matrix with duplicate entries 59 | # changes its data arrays in-place, leading to out-of-bounds 60 | # array accesses in the WARP code. 61 | # Reported in https://github.com/lyst/lightfm/issues/117. 62 | 63 | rows, cols = (1000, 100) 64 | mat = sp.random(rows, cols) 65 | mat.data[:] = 1 66 | 67 | # Duplicate entries in the COO matrix 68 | mat.data = np.concatenate((mat.data, mat.data[:1000])) 69 | mat.row = np.concatenate((mat.row, mat.row[:1000])) 70 | mat.col = np.concatenate((mat.col, mat.col[:1000])) 71 | 72 | for loss in ("warp", "bpr", "warp-kos"): 73 | model = LightFM(loss=loss) 74 | model.fit(mat) 75 | 76 | 77 | def test_predict(): 78 | 79 | no_users, no_items = (10, 100) 80 | 81 | train = sp.coo_matrix((no_users, no_items), dtype=np.int32) 82 | 83 | model = LightFM() 84 | model.fit_partial(train) 85 | 86 | for uid in range(no_users): 87 | scores_arr = model.predict(np.repeat(uid, no_items), np.arange(no_items)) 88 | scores_int = model.predict(uid, np.arange(no_items)) 89 | assert np.allclose(scores_arr, scores_int) 90 | 91 | with pytest.raises(ValueError): 92 | model.predict("foo", np.arange(no_items)) 93 | 94 | 95 | def test_input_dtypes(): 96 | 97 | dtypes = (np.int32, np.int64, np.float32, np.float64) 98 | 99 | no_users, no_items = (10, 100) 100 | no_features = 20 101 | 102 | for dtype in dtypes: 103 | train = sp.coo_matrix((no_users, no_items), dtype=dtype) 104 | 105 | user_features = sp.coo_matrix((no_users, no_features), dtype=dtype) 106 | item_features = sp.coo_matrix((no_items, no_features), dtype=dtype) 107 | 108 | model = LightFM() 109 | model.fit_partial( 110 | train, user_features=user_features, item_features=item_features 111 | ) 112 | 113 | model.predict( 114 | np.random.randint(0, no_users, 10).astype(np.int32), 115 | np.random.randint(0, no_items, 10).astype(np.int32), 116 | user_features=user_features, 117 | item_features=item_features, 118 | ) 119 | 120 | 121 | def test_not_enough_features_fails(): 122 | 123 | no_users, no_items = (10, 100) 124 | no_features = 20 125 | 126 | train = sp.coo_matrix((no_users, no_items), dtype=np.int32) 127 | 128 | user_features = sp.csr_matrix((no_users - 1, no_features), dtype=np.int32) 129 | item_features = sp.csr_matrix((no_items - 1, no_features), dtype=np.int32) 130 | model = LightFM() 131 | with pytest.raises(Exception): 132 | model.fit_partial( 133 | train, user_features=user_features, item_features=item_features 134 | ) 135 | 136 | 137 | def test_feature_inference_fails(): 138 | 139 | # On predict if we try to use feature inference and supply 140 | # higher ids than the number of features that were supplied to fit 141 | # we should complain 142 | 143 | no_users, no_items = (10, 100) 144 | no_features = 20 145 | 146 | train = sp.coo_matrix((no_users, no_items), dtype=np.int32) 147 | 148 | user_features = sp.csr_matrix((no_users, no_features), dtype=np.int32) 149 | item_features = sp.csr_matrix((no_items, 
no_features), dtype=np.int32) 150 | model = LightFM() 151 | model.fit_partial(train, user_features=user_features, item_features=item_features) 152 | 153 | with pytest.raises(ValueError): 154 | model.predict( 155 | np.array([no_features], dtype=np.int32), 156 | np.array([no_features], dtype=np.int32), 157 | ) 158 | 159 | 160 | def test_return_self(): 161 | 162 | no_users, no_items = (10, 100) 163 | 164 | train = sp.coo_matrix((no_users, no_items), dtype=np.int32) 165 | 166 | model = LightFM() 167 | assert model.fit_partial(train) is model 168 | assert model.fit(train) is model 169 | 170 | 171 | def test_param_sanity(): 172 | 173 | with pytest.raises(AssertionError): 174 | LightFM(no_components=-1) 175 | 176 | with pytest.raises(AssertionError): 177 | LightFM(user_alpha=-1.0) 178 | 179 | with pytest.raises(AssertionError): 180 | LightFM(item_alpha=-1.0) 181 | 182 | with pytest.raises(ValueError): 183 | LightFM(max_sampled=-1.0) 184 | 185 | 186 | def test_sample_weight(): 187 | 188 | model = LightFM() 189 | 190 | train = sp.coo_matrix(np.array([[0, 1], [0, 1]])) 191 | 192 | with pytest.raises(ValueError): 193 | # Wrong number of weights 194 | sample_weight = sp.coo_matrix(np.zeros((2, 2))) 195 | 196 | model.fit(train, sample_weight=sample_weight) 197 | 198 | with pytest.raises(ValueError): 199 | # Wrong shape 200 | sample_weight = sp.coo_matrix(np.zeros(2)) 201 | model.fit(train, sample_weight=np.zeros(3)) 202 | 203 | with pytest.raises(ValueError): 204 | # Wrong order of entries 205 | sample_weight = sp.coo_matrix((train.data, (train.row[::-1], train.col[::-1]))) 206 | model.fit(train, sample_weight=np.zeros(3)) 207 | 208 | sample_weight = sp.coo_matrix((train.data, (train.row, train.col))) 209 | model.fit(train, sample_weight=sample_weight) 210 | 211 | model = LightFM(loss="warp-kos") 212 | 213 | with pytest.raises(NotImplementedError): 214 | model.fit(train, sample_weight=np.ones(1)) 215 | 216 | 217 | def test_predict_ranks(): 218 | 219 | no_users, no_items = (10, 100) 220 | 221 | train = sp.coo_matrix((no_users, no_items), dtype=np.float32) 222 | train = sp.rand(no_users, no_items, format="csr", random_state=42) 223 | 224 | model = LightFM() 225 | model.fit_partial(train) 226 | 227 | # Compute ranks for all items 228 | rank_input = sp.csr_matrix(np.ones((no_users, no_items))) 229 | ranks = model.predict_rank(rank_input, num_threads=2).todense() 230 | 231 | assert np.all(ranks.min(axis=1) == 0) 232 | assert np.all(ranks.max(axis=1) == no_items - 1) 233 | 234 | for row in range(no_users): 235 | assert np.all(np.sort(ranks[row]) == np.arange(no_items)) 236 | 237 | # Train set exclusions. All ranks should be zero 238 | # if train interactions is dense. 
239 | ranks = model.predict_rank( 240 | rank_input, train_interactions=rank_input, check_intersections=False 241 | ).todense() 242 | assert np.all(ranks == 0) 243 | 244 | # Max rank should be num_items - 1 - number of positives 245 | # in train in that row 246 | ranks = model.predict_rank( 247 | rank_input, train_interactions=train, check_intersections=False 248 | ).todense() 249 | assert np.all( 250 | np.squeeze(np.array(ranks.max(axis=1))) 251 | == no_items - 1 - np.squeeze(np.array(train.getnnz(axis=1))) 252 | ) 253 | 254 | # check error is raised when train and test have interactions in common 255 | with pytest.raises(ValueError): 256 | model.predict_rank(train, train_interactions=train, check_intersections=True) 257 | 258 | # check error not raised when flag is False 259 | model.predict_rank(train, train_interactions=train, check_intersections=False) 260 | 261 | # check no errors raised when train and test have no interactions in common 262 | not_train = sp.rand(no_users, no_items, format="csr", random_state=43) - train 263 | not_train.data[not_train.data < 0] = 0 264 | not_train.eliminate_zeros() 265 | model.predict_rank(not_train, train_interactions=train, check_intersections=True) 266 | 267 | # Make sure ranks are computed pessimistically when 268 | # there are ties (that is, equal predictions for every 269 | # item will assign maximum rank to each). 270 | model.user_embeddings = np.zeros_like(model.user_embeddings) 271 | model.item_embeddings = np.zeros_like(model.item_embeddings) 272 | model.user_biases = np.zeros_like(model.user_biases) 273 | model.item_biases = np.zeros_like(model.item_biases) 274 | 275 | ranks = model.predict_rank(rank_input, num_threads=2).todense() 276 | 277 | assert np.all(ranks.min(axis=1) == 99) 278 | assert np.all(ranks.max(axis=1) == 99) 279 | 280 | # Wrong input dimensions 281 | with pytest.raises(ValueError): 282 | model.predict_rank(sp.csr_matrix((5, 5)), num_threads=2) 283 | 284 | 285 | def test_exception_on_divergence(): 286 | 287 | no_users, no_items = (1000, 1000) 288 | 289 | train = sp.rand(no_users, no_items, format="csr", random_state=42) 290 | 291 | model = LightFM(learning_rate=10000000.0, loss="warp") 292 | 293 | with pytest.raises(ValueError): 294 | model.fit(train, epochs=10) 295 | 296 | 297 | def test_sklearn_api(): 298 | model = LightFM() 299 | params = model.get_params() 300 | model2 = LightFM(**params) 301 | params2 = model2.get_params() 302 | assert params == params2 303 | model.set_params(**params) 304 | params["invalid_param"] = 666 305 | with pytest.raises(ValueError): 306 | model.set_params(**params) 307 | 308 | 309 | def test_predict_not_fitted(): 310 | 311 | model = LightFM() 312 | 313 | with pytest.raises(ValueError): 314 | model.predict(np.arange(10), np.arange(10)) 315 | 316 | with pytest.raises(ValueError): 317 | model.predict_rank(1) 318 | 319 | with pytest.raises(ValueError): 320 | model.get_user_representations() 321 | 322 | with pytest.raises(ValueError): 323 | model.get_item_representations() 324 | 325 | 326 | def test_nan_features(): 327 | 328 | no_users, no_items = (1000, 1000) 329 | 330 | train = sp.rand(no_users, no_items, format="csr", random_state=42) 331 | 332 | features = sp.identity(no_items) 333 | features.data *= np.nan 334 | 335 | model = LightFM(loss="warp") 336 | 337 | with pytest.raises(ValueError): 338 | model.fit(train, epochs=10, user_features=features, item_features=features) 339 | 340 | 341 | def test_nan_interactions(): 342 | 343 | no_users, no_items = (1000, 1000) 344 | 345 | train = 
sp.rand(no_users, no_items, format="csr", random_state=42) 346 | train.data *= np.nan 347 | 348 | model = LightFM(loss="warp") 349 | 350 | with pytest.raises(ValueError): 351 | model.fit(train) 352 | 353 | 354 | def test_overflow_predict(): 355 | 356 | no_users, no_items = (1000, 1000) 357 | 358 | train = sp.rand(no_users, no_items, format="csr", random_state=42) 359 | 360 | model = LightFM(loss="warp") 361 | 362 | model.fit(train) 363 | 364 | with pytest.raises((ValueError, OverflowError)): 365 | print( 366 | model.predict( 367 | 1231241241231241414, 368 | np.arange(no_items), 369 | user_features=sp.identity(no_users), 370 | ) 371 | ) 372 | 373 | 374 | def test_warp_few_items(): 375 | 376 | no_users, no_items = (1000, 2) 377 | 378 | train = sp.rand(no_users, no_items, format="csr", random_state=42) 379 | 380 | model = LightFM(loss="warp", max_sampled=10) 381 | 382 | model.fit(train) 383 | -------------------------------------------------------------------------------- /tests/test_cross_validation.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from lightfm.cross_validation import random_train_test_split 4 | from lightfm.datasets import fetch_movielens 5 | 6 | 7 | def _assert_disjoint(x, y): 8 | 9 | x = x.tocsr() 10 | y = y.tocoo() 11 | 12 | for (i, j) in zip(y.row, y.col): 13 | assert x[i, j] == 0.0 14 | 15 | 16 | @pytest.mark.parametrize("test_percentage", [0.2, 0.5, 0.7]) 17 | def test_random_train_test_split(test_percentage): 18 | 19 | data = fetch_movielens()["train"] 20 | 21 | train, test = random_train_test_split(data, test_percentage=test_percentage) 22 | 23 | assert test.nnz / float(data.nnz) == test_percentage 24 | _assert_disjoint(train, test) 25 | -------------------------------------------------------------------------------- /tests/test_data.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | 4 | from lightfm.data import Dataset 5 | 6 | 7 | def test_fitting(): 8 | 9 | users, items = 10, 100 10 | 11 | dataset = Dataset() 12 | dataset.fit(range(users), range(items)) 13 | 14 | assert dataset.interactions_shape() == (users, items) 15 | assert dataset.user_features_shape() == (users, users) 16 | assert dataset.item_features_shape() == (items, items) 17 | 18 | assert dataset.build_interactions([])[0].shape == (users, items) 19 | assert dataset.build_user_features([]).getnnz() == users 20 | assert dataset.build_item_features([]).getnnz() == items 21 | 22 | 23 | def test_fitting_no_identity(): 24 | 25 | users, items = 10, 100 26 | 27 | dataset = Dataset(user_identity_features=False, item_identity_features=False) 28 | dataset.fit(range(users), range(items)) 29 | 30 | assert dataset.interactions_shape() == (users, items) 31 | assert dataset.user_features_shape() == (users, 0) 32 | assert dataset.item_features_shape() == (items, 0) 33 | 34 | assert dataset.build_interactions([])[0].shape == (users, items) 35 | assert dataset.build_user_features([], normalize=False).getnnz() == 0 36 | assert dataset.build_item_features([], normalize=False).getnnz() == 0 37 | 38 | 39 | def test_exceptions(): 40 | 41 | users, items = 10, 100 42 | 43 | dataset = Dataset() 44 | dataset.fit(range(users), range(items)) 45 | 46 | with pytest.raises(ValueError): 47 | dataset.build_interactions([(users + 1, 0)]) 48 | 49 | with pytest.raises(ValueError): 50 | dataset.build_interactions([(0, items + 1)]) 51 | 52 | dataset.fit_partial([users + 1], [items + 1]) 53 | 
dataset.build_interactions([(users + 1, 0)]) 54 | dataset.build_interactions([(0, items + 1)]) 55 | 56 | 57 | def test_build_features(): 58 | 59 | users, items = 10, 100 60 | 61 | dataset = Dataset(user_identity_features=False, item_identity_features=False) 62 | dataset.fit( 63 | range(users), 64 | range(items), 65 | ["user:{}".format(x) for x in range(users)], 66 | ["item:{}".format(x) for x in range(items)], 67 | ) 68 | 69 | # Build from lists 70 | user_features = dataset.build_user_features( 71 | [ 72 | (user_id, ["user:{}".format(x) for x in range(users)]) 73 | for user_id in range(users) 74 | ] 75 | ) 76 | assert user_features.getnnz() == users**2 77 | 78 | item_features = dataset.build_item_features( 79 | [ 80 | (item_id, ["item:{}".format(x) for x in range(items)]) 81 | for item_id in range(items) 82 | ] 83 | ) 84 | assert item_features.getnnz() == items**2 85 | 86 | # Build from dicts 87 | user_features = dataset.build_user_features( 88 | [ 89 | (user_id, {"user:{}".format(x): float(x) for x in range(users)}) 90 | for user_id in range(users) 91 | ], 92 | normalize=False, 93 | ) 94 | 95 | assert np.all(user_features.todense() == np.array([list(range(users))] * users)) 96 | 97 | item_features = dataset.build_item_features( 98 | [ 99 | (item_id, {"item:{}".format(x): float(x) for x in range(items)}) 100 | for item_id in range(items) 101 | ], 102 | normalize=False, 103 | ) 104 | 105 | assert np.all(item_features.todense() == np.array([list(range(items))] * items)) 106 | 107 | # Test normalization 108 | item_features = dataset.build_item_features( 109 | [ 110 | (item_id, {"item:{}".format(x): float(x) for x in range(items)}) 111 | for item_id in range(items) 112 | ] 113 | ) 114 | 115 | assert np.all(item_features.sum(1) == 1.0) 116 | -------------------------------------------------------------------------------- /tests/test_datasets.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import numpy as np 4 | 5 | import scipy.sparse as sp 6 | 7 | from lightfm.datasets import fetch_movielens, fetch_stackexchange 8 | 9 | 10 | def test_basic_fetching_movielens(): 11 | 12 | data = fetch_movielens() 13 | 14 | assert isinstance(data["train"], sp.coo_matrix) 15 | assert isinstance(data["test"], sp.coo_matrix) 16 | 17 | assert data["train"].shape == data["test"].shape 18 | assert data["train"].shape == (943, 1682) 19 | assert (data["train"].getnnz() + data["test"].getnnz()) == 100000 20 | 21 | assert data["item_features"].shape == (1682, 1682) 22 | assert len(data["item_feature_labels"]) == 1682 23 | assert data["item_feature_labels"] is data["item_labels"] 24 | 25 | data = fetch_movielens(genre_features=True) 26 | 27 | assert data["item_features"].shape == (1682, len(data["item_feature_labels"])) 28 | assert data["item_feature_labels"] is not data["item_labels"] 29 | 30 | with pytest.raises(ValueError): 31 | data = fetch_movielens(indicator_features=False, genre_features=False) 32 | 33 | 34 | @pytest.mark.skip(reason="Runs out of memory in CI") 35 | def test_basic_fetching_stackexchange(): 36 | 37 | test_fractions = (0.2, 0.5, 0.6) 38 | 39 | for test_fraction in test_fractions: 40 | data = fetch_stackexchange( 41 | "crossvalidated", 42 | min_training_interactions=0, 43 | test_set_fraction=test_fraction, 44 | ) 45 | 46 | train = data["train"] 47 | test = data["test"] 48 | 49 | assert isinstance(train, sp.coo_matrix) 50 | assert isinstance(test, sp.coo_matrix) 51 | 52 | assert train.shape == test.shape 53 | 54 | frac = float(test.getnnz()) 
/ (train.getnnz() + test.getnnz()) 55 | assert abs(frac - test_fraction) < 0.01 56 | 57 | for dataset in ("crossvalidated", "stackoverflow"): 58 | 59 | data = fetch_stackexchange( 60 | dataset, 61 | min_training_interactions=0, 62 | indicator_features=True, 63 | tag_features=False, 64 | ) 65 | assert isinstance(data["item_features"], sp.csr_matrix) 66 | assert ( 67 | data["item_features"].shape[0] 68 | == data["item_features"].shape[1] 69 | == data["train"].shape[1] 70 | ) 71 | 72 | data = fetch_stackexchange( 73 | dataset, 74 | min_training_interactions=0, 75 | indicator_features=False, 76 | tag_features=True, 77 | ) 78 | assert isinstance(data["item_features"], sp.csr_matrix) 79 | assert data["item_features"].shape[0] > data["item_features"].shape[1] 80 | 81 | data = fetch_stackexchange( 82 | dataset, 83 | min_training_interactions=0, 84 | indicator_features=True, 85 | tag_features=True, 86 | ) 87 | assert isinstance(data["item_features"], sp.csr_matrix) 88 | assert data["item_features"].shape[0] < data["item_features"].shape[1] 89 | 90 | if dataset == "crossvalidated": 91 | assert data["train"].shape == (9431, 72360) 92 | else: 93 | assert data["train"].shape == (1349835, 11280896) 94 | 95 | assert np.all(data["train"].data == 1.0) 96 | assert np.all(data["test"].data == 1.0) 97 | -------------------------------------------------------------------------------- /tests/test_evaluation.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import pytest 4 | 5 | import scipy.sparse as sp 6 | 7 | from sklearn.metrics import roc_auc_score 8 | 9 | from lightfm.lightfm import LightFM 10 | from lightfm import evaluation 11 | 12 | 13 | def _generate_data(num_users, num_items, density=0.1, test_fraction=0.2): 14 | # Generate a dataset where every user has interactions 15 | # in both the train and the test set. 
16 | 17 | train = sp.lil_matrix((num_users, num_items), dtype=np.float32) 18 | test = sp.lil_matrix((num_users, num_items), dtype=np.float32) 19 | 20 | for user_id in range(num_users): 21 | positives = np.random.choice( 22 | num_items, size=int(density * num_items), replace=False 23 | ) 24 | 25 | for item_id in positives[: int(test_fraction * len(positives))]: 26 | test[user_id, item_id] = 1.0 27 | 28 | for item_id in positives[int(test_fraction * len(positives)) :]: 29 | train[user_id, item_id] = 1.0 30 | 31 | return train.tocoo(), test.tocoo() 32 | 33 | 34 | def _precision_at_k( 35 | model, ground_truth, k, train=None, user_features=None, item_features=None 36 | ): 37 | # Alternative test implementation 38 | 39 | ground_truth = ground_truth.tocsr() 40 | 41 | no_users, no_items = ground_truth.shape 42 | 43 | pid_array = np.arange(no_items, dtype=np.int32) 44 | 45 | precisions = [] 46 | 47 | uid_array = np.empty(no_items, dtype=np.int32) 48 | 49 | if train is not None: 50 | train = train.tocsr() 51 | 52 | for user_id, row in enumerate(ground_truth): 53 | uid_array.fill(user_id) 54 | 55 | predictions = model.predict( 56 | uid_array, 57 | pid_array, 58 | user_features=user_features, 59 | item_features=item_features, 60 | num_threads=4, 61 | ) 62 | if train is not None: 63 | train_items = train[user_id].indices 64 | top_k = set( 65 | [x for x in np.argsort(-predictions) if x not in train_items][:k] 66 | ) 67 | else: 68 | top_k = set(np.argsort(-predictions)[:k]) 69 | 70 | true_pids = set(row.indices[row.data == 1]) 71 | 72 | if true_pids: 73 | precisions.append(len(top_k & true_pids) / float(k)) 74 | 75 | return sum(precisions) / len(precisions) 76 | 77 | 78 | def _recall_at_k( 79 | model, ground_truth, k, train=None, user_features=None, item_features=None 80 | ): 81 | # Alternative test implementation 82 | 83 | ground_truth = ground_truth.tocsr() 84 | 85 | no_users, no_items = ground_truth.shape 86 | 87 | pid_array = np.arange(no_items, dtype=np.int32) 88 | 89 | recalls = [] 90 | 91 | uid_array = np.empty(no_items, dtype=np.int32) 92 | 93 | if train is not None: 94 | train = train.tocsr() 95 | 96 | for user_id, row in enumerate(ground_truth): 97 | uid_array.fill(user_id) 98 | 99 | predictions = model.predict( 100 | uid_array, 101 | pid_array, 102 | user_features=user_features, 103 | item_features=item_features, 104 | num_threads=4, 105 | ) 106 | if train is not None: 107 | train_items = train[user_id].indices 108 | top_k = set( 109 | [x for x in np.argsort(-predictions) if x not in train_items][:k] 110 | ) 111 | else: 112 | top_k = set(np.argsort(-predictions)[:k]) 113 | 114 | true_pids = set(row.indices[row.data == 1]) 115 | 116 | if true_pids: 117 | recalls.append(len(top_k & true_pids) / float(len(true_pids))) 118 | 119 | return sum(recalls) / len(recalls) 120 | 121 | 122 | def _auc(model, ground_truth, train=None, user_features=None, item_features=None): 123 | 124 | ground_truth = ground_truth.tocsr() 125 | 126 | no_users, no_items = ground_truth.shape 127 | 128 | pid_array = np.arange(no_items, dtype=np.int32) 129 | 130 | scores = [] 131 | 132 | if train is not None: 133 | train = train.tocsr() 134 | 135 | for user_id, row in enumerate(ground_truth): 136 | uid_array = np.empty(no_items, dtype=np.int32) 137 | uid_array.fill(user_id) 138 | predictions = model.predict( 139 | uid_array, 140 | pid_array, 141 | user_features=user_features, 142 | item_features=item_features, 143 | num_threads=4, 144 | ) 145 | 146 | true_pids = row.indices[row.data == 1] 147 | 148 | grnd = np.zeros(no_items, 
dtype=np.int32) 149 | grnd[true_pids] = 1 150 | 151 | if not len(true_pids): 152 | continue 153 | 154 | if train is not None: 155 | train_indices = train[user_id].indices 156 | not_in_train = np.array([x not in train_indices for x in range(no_items)]) 157 | scores.append(roc_auc_score(grnd[not_in_train], predictions[not_in_train])) 158 | else: 159 | scores.append(roc_auc_score(grnd, predictions)) 160 | 161 | return scores 162 | 163 | 164 | def test_precision_at_k(): 165 | 166 | no_users, no_items = (10, 100) 167 | 168 | train, test = _generate_data(no_users, no_items) 169 | 170 | model = LightFM(loss="bpr") 171 | 172 | # We want a high precision to catch the k=1 case 173 | model.fit_partial(test) 174 | 175 | for k in (10, 5, 1): 176 | 177 | # Without omitting train interactions 178 | precision = evaluation.precision_at_k(model, test, k=k) 179 | expected_mean_precision = _precision_at_k(model, test, k) 180 | 181 | assert np.allclose(precision.mean(), expected_mean_precision) 182 | assert len(precision) == (test.getnnz(axis=1) > 0).sum() 183 | assert ( 184 | len(evaluation.precision_at_k(model, train, preserve_rows=True)) 185 | == test.shape[0] 186 | ) 187 | 188 | # With omitting train interactions 189 | precision = evaluation.precision_at_k( 190 | model, test, k=k, train_interactions=train 191 | ) 192 | expected_mean_precision = _precision_at_k(model, test, k, train=train) 193 | 194 | assert np.allclose(precision.mean(), expected_mean_precision) 195 | 196 | 197 | def test_precision_at_k_with_ties(): 198 | 199 | no_users, no_items = (10, 100) 200 | 201 | train, test = _generate_data(no_users, no_items) 202 | 203 | model = LightFM(loss="bpr") 204 | model.fit_partial(train) 205 | 206 | # Make all predictions zero 207 | model.user_embeddings = np.zeros_like(model.user_embeddings) 208 | model.item_embeddings = np.zeros_like(model.item_embeddings) 209 | model.user_biases = np.zeros_like(model.user_biases) 210 | model.item_biases = np.zeros_like(model.item_biases) 211 | 212 | k = 10 213 | 214 | precision = evaluation.precision_at_k(model, test, k=k) 215 | 216 | # Pessimistic precision with all ties 217 | assert precision.mean() == 0.0 218 | 219 | 220 | def test_recall_at_k(): 221 | 222 | no_users, no_items = (10, 100) 223 | 224 | train, test = _generate_data(no_users, no_items) 225 | 226 | model = LightFM(loss="bpr") 227 | model.fit_partial(test) 228 | 229 | for k in (10, 5, 1): 230 | 231 | # Without omitting train interactions 232 | recall = evaluation.recall_at_k(model, test, k=k) 233 | expected_mean_recall = _recall_at_k(model, test, k) 234 | 235 | assert np.allclose(recall.mean(), expected_mean_recall) 236 | assert len(recall) == (test.getnnz(axis=1) > 0).sum() 237 | assert ( 238 | len(evaluation.recall_at_k(model, train, preserve_rows=True)) 239 | == test.shape[0] 240 | ) 241 | 242 | # With omitting train interactions 243 | recall = evaluation.recall_at_k(model, test, k=k, train_interactions=train) 244 | expected_mean_recall = _recall_at_k(model, test, k, train=train) 245 | 246 | assert np.allclose(recall.mean(), expected_mean_recall) 247 | 248 | 249 | def test_auc_score(): 250 | 251 | no_users, no_items = (10, 100) 252 | 253 | train, test = _generate_data(no_users, no_items) 254 | 255 | model = LightFM(loss="bpr") 256 | model.fit_partial(train) 257 | 258 | auc = evaluation.auc_score(model, test, num_threads=2) 259 | expected_auc = np.array(_auc(model, test)) 260 | 261 | assert auc.shape == expected_auc.shape 262 | assert np.abs(auc.mean() - expected_auc.mean()) < 0.01 263 | assert len(auc) 
== (test.getnnz(axis=1) > 0).sum() 264 | assert len(evaluation.auc_score(model, train, preserve_rows=True)) == test.shape[0] 265 | 266 | # With omitting train interactions 267 | auc = evaluation.auc_score(model, test, train_interactions=train, num_threads=2) 268 | expected_auc = np.array(_auc(model, test, train)) 269 | assert np.abs(auc.mean() - expected_auc.mean()) < 0.01 270 | 271 | 272 | def test_intersections_check(): 273 | 274 | no_users, no_items = (10, 100) 275 | 276 | train, test = _generate_data(no_users, no_items) 277 | 278 | model = LightFM(loss="bpr") 279 | model.fit_partial(train) 280 | 281 | # check error is raised when train and test have interactions in common 282 | with pytest.raises(ValueError): 283 | evaluation.auc_score( 284 | model, train, train_interactions=train, check_intersections=True 285 | ) 286 | 287 | with pytest.raises(ValueError): 288 | evaluation.recall_at_k( 289 | model, train, train_interactions=train, check_intersections=True 290 | ) 291 | 292 | with pytest.raises(ValueError): 293 | evaluation.precision_at_k( 294 | model, train, train_interactions=train, check_intersections=True 295 | ) 296 | 297 | with pytest.raises(ValueError): 298 | evaluation.reciprocal_rank( 299 | model, train, train_interactions=train, check_intersections=True 300 | ) 301 | 302 | # check no errors raised when train and test have no interactions in common 303 | evaluation.auc_score( 304 | model, test, train_interactions=train, check_intersections=True 305 | ) 306 | evaluation.recall_at_k( 307 | model, test, train_interactions=train, check_intersections=True 308 | ) 309 | evaluation.precision_at_k( 310 | model, test, train_interactions=train, check_intersections=True 311 | ) 312 | evaluation.reciprocal_rank( 313 | model, test, train_interactions=train, check_intersections=True 314 | ) 315 | 316 | # check no error is raised when there are intersections but flag is False 317 | evaluation.auc_score( 318 | model, train, train_interactions=train, check_intersections=False 319 | ) 320 | evaluation.recall_at_k( 321 | model, train, train_interactions=train, check_intersections=False 322 | ) 323 | evaluation.precision_at_k( 324 | model, train, train_interactions=train, check_intersections=False 325 | ) 326 | evaluation.reciprocal_rank( 327 | model, train, train_interactions=train, check_intersections=False 328 | ) 329 | -------------------------------------------------------------------------------- /tests/test_fast_functions.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import scipy.sparse as sp 4 | 5 | 6 | from lightfm import _lightfm_fast 7 | 8 | 9 | def test_in_positives(): 10 | 11 | mat = sp.csr_matrix(np.array([[0, 1], [1, 0]])).astype(np.float32) 12 | 13 | assert not _lightfm_fast.__test_in_positives(0, 0, _lightfm_fast.CSRMatrix(mat)) 14 | assert _lightfm_fast.__test_in_positives(0, 1, _lightfm_fast.CSRMatrix(mat)) 15 | 16 | assert _lightfm_fast.__test_in_positives(1, 0, _lightfm_fast.CSRMatrix(mat)) 17 | assert not _lightfm_fast.__test_in_positives(1, 1, _lightfm_fast.CSRMatrix(mat)) 18 | --------------------------------------------------------------------------------
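The dump above already contains everything needed to exercise the evaluation metrics it defines. As a quick orientation, here is a minimal usage sketch that is not part of the repository: it fits a WARP model on the bundled MovieLens data and scores it with precision_at_k and auc_score, excluding training interactions in the way the docstrings above describe. The hyperparameter choices (no_components=30, epochs=10, k=10) are illustrative only, not recommendations taken from the source.

# Minimal usage sketch (not part of the repository): ties together
# lightfm.LightFM, lightfm.datasets.fetch_movielens and lightfm.evaluation.
# Hyperparameter values below are arbitrary illustrative choices.
from lightfm import LightFM
from lightfm.datasets import fetch_movielens
from lightfm.evaluation import precision_at_k, auc_score

# fetch_movielens returns disjoint train/test interaction matrices
# (943 users x 1682 items), as asserted in tests/test_datasets.py above.
data = fetch_movielens()
train, test = data["train"], data["test"]

# Fit a WARP-loss model; epochs and no_components are illustrative.
model = LightFM(loss="warp", no_components=30)
model.fit(train, epochs=10, num_threads=2)

# Evaluate on the held-out split, passing train_interactions so that
# known positives from training are not re-recommended, and keeping
# check_intersections=True to guard against a bad data split.
train_precision = precision_at_k(model, train, k=10, num_threads=2).mean()
test_precision = precision_at_k(
    model, test, train_interactions=train, k=10, num_threads=2
).mean()
test_auc = auc_score(
    model, test, train_interactions=train, check_intersections=True, num_threads=2
).mean()

print("precision@10: train {:.3f}, test {:.3f}".format(train_precision, test_precision))
print("test AUC: {:.3f}".format(test_auc))

Passing train_interactions keeps training positives out of the ranking, so the test-set numbers reflect only held-out items; leaving check_intersections at its default of True makes the call fail loudly if the two matrices overlap, which is exactly the behaviour exercised in tests/test_evaluation.py::test_intersections_check above.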