├── .dockerignore
├── .github
│   ├── ISSUE_TEMPLATE.md
│   └── workflows
│       └── test.yaml
├── .gitignore
├── .pre-commit-config.yaml
├── .travis.yml
├── CONTRIBUTORS
├── Dockerfile
├── LICENSE
├── Makefile
├── README.md
├── changelog.md
├── doc
│   ├── Makefile
│   ├── conf.py
│   ├── cross_validation.rst
│   ├── datasets.rst
│   ├── examples.rst
│   ├── examples
│   │   ├── dataset.rst
│   │   ├── hybrid_crossvalidated.rst
│   │   ├── learning_schedules.rst
│   │   ├── learning_schedules_files
│   │   │   ├── learning_schedules_5_0.png
│   │   │   └── learning_schedules_8_0.png
│   │   ├── movielens_implicit.rst
│   │   ├── warp_loss.rst
│   │   └── warp_loss_files
│   │       ├── warp_loss_5_0.png
│   │       ├── warp_loss_7_0.png
│   │       ├── warp_loss_9_0.png
│   │       └── warp_loss_9_1.png
│   ├── faq.rst
│   ├── home.rst
│   ├── index.rst
│   ├── lightfm.data.rst
│   ├── lightfm.evaluation.rst
│   ├── lightfm.rst
│   └── quickstart.rst
├── docker-compose.yml
├── docs-requirements.txt
├── examples
│   ├── ann
│   │   └── annoy_nsmlib_example.ipynb
│   ├── dataset
│   │   ├── Makefile
│   │   ├── dataset.pmd
│   │   ├── download.py
│   │   └── readme.rst
│   ├── movielens
│   │   ├── data.py
│   │   ├── example.ipynb
│   │   ├── learning_schedules.ipynb
│   │   ├── readme.md
│   │   └── warp_loss.ipynb
│   ├── quickstart
│   │   ├── quickstart.ipynb
│   │   └── short_quickstart.ipynb
│   └── stackexchange
│       └── hybrid_crossvalidated.ipynb
├── lightfm.png
├── lightfm
│   ├── __init__.py
│   ├── _lightfm_fast.py
│   ├── _lightfm_fast.pyx.template
│   ├── _lightfm_fast_no_openmp.c
│   ├── _lightfm_fast_openmp.c
│   ├── cross_validation.py
│   ├── data.py
│   ├── datasets
│   │   ├── __init__.py
│   │   ├── _common.py
│   │   ├── movielens.py
│   │   └── stackexchange.py
│   ├── evaluation.py
│   ├── lightfm.py
│   └── version.py
├── lint-requirements.txt
├── setup.cfg
├── setup.py
├── test-requirements.txt
└── tests
    ├── __init__.py
    ├── test_api.py
    ├── test_cross_validation.py
    ├── test_data.py
    ├── test_datasets.py
    ├── test_evaluation.py
    ├── test_fast_functions.py
    └── test_movielens.py

/.dockerignore:
--------------------------------------------------------------------------------
1 | *.zip
2 | *.7z
3 | *.xml
4 | examples/crossvalidated/*.7z
5 | examples/crossvalidated/*.xml
6 | examples/movielens/*.zip
7 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE.md:
--------------------------------------------------------------------------------
1 | Thanks for opening an issue!
2 |
3 | Please include as much detail as possible: what does your dataset look like, and which hyperparameters are you using (have you tried other ones)?
4 |
5 | When including code snippets, make sure you format them using backticks. Have a look at the markdown reference for details: https://guides.github.com/features/mastering-markdown/.
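For example, a fenced code block renders cleanly (the snippet below is purely illustrative and not tied to any particular issue):

```python
from lightfm import LightFM

model = LightFM(loss='warp', no_components=30)
print(model.loss, model.no_components)
```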
6 | -------------------------------------------------------------------------------- /.github/workflows/test.yaml: -------------------------------------------------------------------------------- 1 | name: LightFM test 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | pull_request: 8 | branches: 9 | - master 10 | 11 | jobs: 12 | build: 13 | 14 | runs-on: ${{ matrix.os }} 15 | strategy: 16 | matrix: 17 | os: [ubuntu-latest, macos-latest, windows-latest] 18 | python-version: ["3.7", "3.11"] 19 | exclude: 20 | - os: macos-latest 21 | python-version: "3.7" 22 | - os: windows-latest 23 | python-version: "3.7" 24 | 25 | steps: 26 | - uses: actions/checkout@v3 27 | - name: Set up Python ${{ matrix.python-version }} 28 | uses: actions/setup-python@v4 29 | with: 30 | python-version: ${{ matrix.python-version }} 31 | - name: Install dependencies 32 | run: | 33 | python -m pip install --upgrade pip 34 | pip install flake8 pytest 35 | - name: Lint with flake8 36 | run: | 37 | # stop the build if there are Python syntax errors or undefined names 38 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 39 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 40 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 41 | - name: Install 42 | run: | 43 | pip install -e . 44 | - name: Test with pytest 45 | run: | 46 | pytest 47 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | venv/ 2 | *.pyc 3 | *.egg* 4 | *~ 5 | *.zip 6 | *.so 7 | examples/movielens/.ipynb_checkpoints/ 8 | examples/quickstart/.ipynb_checkpoints/ 9 | examples/stackexchange/.ipynb_checkpoints/ 10 | build/ 11 | dist/ 12 | bench/ 13 | *#* 14 | *.7z 15 | *.xml 16 | doc/_build/* 17 | lightfm/_lightfm_fast_openmp.pyx 18 | lightfm/_lightfm_fast_no_openmp.pyx 19 | *.*-checkpoint 20 | 21 | # Editor specific 22 | .vscode/ 23 | .idea/ 24 | .devcontainer/ -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # See https://pre-commit.com for more information 2 | # See https://pre-commit.com/hooks.html for more hooks 3 | repos: 4 | - repo: https://github.com/pre-commit/pre-commit-hooks 5 | rev: v3.2.0 6 | hooks: 7 | - id: trailing-whitespace 8 | - id: end-of-file-fixer 9 | - id: check-yaml 10 | - repo: "https://github.com/psf/black" 11 | rev: "22.1.0" 12 | hooks: 13 | - id: black 14 | - repo: https://gitlab.com/pycqa/flake8 15 | rev: "4.0.1" 16 | hooks: 17 | - id: flake8 18 | types: [file, python] 19 | exclude: doc/ 20 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | os: 2 | - osx 3 | before_install: 4 | - python3 -m venv venv 5 | - venv/bin/pip install -r test-requirements.txt 6 | install: 7 | - venv/bin/pip install -e . 
8 | script: venv/bin/py.test -xv tests/ 9 | -------------------------------------------------------------------------------- /CONTRIBUTORS: -------------------------------------------------------------------------------- 1 | Oliver Grisel 2 | Jong Wook Kim 3 | Maciej Kula 4 | Paolo Rais 5 | Kent Shikama 6 | Mice Pápai 7 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:16.04 2 | 3 | RUN apt-get update 4 | RUN apt-get install -y libxml2 libxslt-dev wget bzip2 gcc 5 | 6 | RUN echo 'export PATH=/opt/conda/bin:$PATH' > /etc/profile.d/conda.sh && \ 7 | wget --quiet https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh && \ 8 | /bin/bash ~/miniconda.sh -b -p /opt/conda && \ 9 | rm ~/miniconda.sh 10 | 11 | ENV PATH /opt/conda/bin:$PATH 12 | 13 | RUN conda install pytest jupyter scikit-learn 14 | 15 | ENV PYTHONDONTWRITEBYTECODE 1 16 | 17 | ADD . /home/lightfm/ 18 | WORKDIR /home/ 19 | 20 | RUN cd lightfm && pip install -e . 21 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2015 Lyst 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: examples 2 | examples: 3 | jupyter nbconvert --to rst examples/quickstart/quickstart.ipynb 4 | mv examples/quickstart/quickstart.rst doc/ 5 | jupyter nbconvert --to rst examples/movielens/example.ipynb 6 | mv examples/movielens/example.rst doc/examples/movielens_implicit.rst 7 | jupyter nbconvert --to rst examples/movielens/learning_schedules.ipynb 8 | mv examples/movielens/learning_schedules.rst doc/examples/ 9 | cp -r examples/movielens/learning_schedules_files doc/examples/ 10 | rm -rf examples/movielens/learning_schedules_files 11 | jupyter nbconvert --to rst examples/stackexchange/hybrid_crossvalidated.ipynb 12 | mv examples/stackexchange/hybrid_crossvalidated.rst doc/examples/ 13 | jupyter nbconvert --to rst examples/movielens/warp_loss.ipynb 14 | mv examples/movielens/warp_loss.rst doc/examples/ 15 | cp -r examples/movielens/warp_loss_files doc/examples/ 16 | rm -rf examples/movielens/warp_loss_files 17 | .PHONY: update-docs 18 | update-docs: 19 | pip install -e . \ 20 | && cd doc && make html && cd .. \ 21 | && git fetch origin gh-pages && git checkout gh-pages \ 22 | && rm -rf ./docs/ \ 23 | && mkdir ./docs/ \ 24 | && cp -r ./doc/_build/html/* ./docs/ \ 25 | && git add -A ./docs/* \ 26 | && git commit -m 'Update docs.' && git push origin gh-pages 27 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # LightFM 2 | 3 | ![LightFM logo](lightfm.png) 4 | 5 | | Build status | | 6 | |---|---| 7 | | Linux |[![Circle CI](https://circleci.com/gh/lyst/lightfm.svg?style=svg)](https://circleci.com/gh/lyst/lightfm)| 8 | | OSX (OpenMP disabled)|[![Travis CI](https://travis-ci.org/lyst/lightfm.svg?branch=master)](https://travis-ci.org/lyst/lightfm)| 9 | | Windows (OpenMP disabled) |[![Appveyor](https://ci.appveyor.com/api/projects/status/6cqpqb6969i1h4p7/branch/master?svg=true)](https://ci.appveyor.com/project/maciejkula/lightfm/branch/master)| 10 | 11 | [![Gitter chat](https://badges.gitter.im/gitterHQ/gitter.png)](https://gitter.im/lightfm-rec/Lobby) [![PyPI](https://img.shields.io/pypi/v/lightfm.svg)](https://pypi.python.org/pypi/lightfm/) 12 | [![Anaconda-Server Badge](https://anaconda.org/conda-forge/lightfm/badges/version.svg)](https://anaconda.org/conda-forge/lightfm) 13 | 14 | LightFM is a Python implementation of a number of popular recommendation algorithms for both implicit and explicit feedback, including efficient implementation of BPR and WARP ranking losses. It's easy to use, fast (via multithreaded model estimation), and produces high quality results. 15 | 16 | It also makes it possible to incorporate both item and user metadata into the traditional matrix factorization algorithms. It represents each user and item as the sum of the latent representations of their features, thus allowing recommendations to generalise to new items (via item features) and to new users (via user features). 17 | 18 | For more details, see the [Documentation](http://lyst.github.io/lightfm/docs/home.html). 19 | 20 | Need help? Contact me via [email](mailto:lightfm@zoho.com), [Twitter](https://twitter.com/Maciej_Kula), or [Gitter](https://gitter.im/lightfm-rec/Lobby). 
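As a quick illustration of the hybrid representation described above (a minimal sketch with toy matrices invented for this example; only the `fit` and `get_item_representations` calls are LightFM API), each item's latent vector is the sum of its feature embeddings:

```python
import numpy as np
import scipy.sparse as sp

from lightfm import LightFM

# Toy data, made up purely for illustration: 4 users, 3 items, 2 item features.
interactions = sp.coo_matrix(
    np.array([[1, 0, 1],
              [0, 1, 0],
              [1, 1, 0],
              [0, 0, 1]], dtype=np.float32)
)
item_features = sp.csr_matrix(
    np.array([[1, 0],
              [0, 1],
              [1, 1]], dtype=np.float32)
)

model = LightFM(loss='warp', no_components=8)
model.fit(interactions, item_features=item_features, epochs=5)

# Each item's representation is the (feature-weighted) sum of its features' embeddings.
item_biases, item_embeddings = model.get_item_representations(features=item_features)
print(item_embeddings.shape)  # (3, 8)
```

With real data, the `interactions` and `item_features` matrices would normally be built with `lightfm.data.Dataset`; see the "Building datasets" example in the documentation.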
21 |
22 | ## Installation
23 | Install from `pip`:
24 | ```
25 | pip install lightfm
26 | ```
27 | or Conda:
28 | ```
29 | conda install -c conda-forge lightfm
30 | ```
31 |
32 | ## Quickstart
33 | Fitting an implicit feedback model on the MovieLens 100k dataset is very easy:
34 | ```python
35 | from lightfm import LightFM
36 | from lightfm.datasets import fetch_movielens
37 | from lightfm.evaluation import precision_at_k
38 |
39 | # Load the MovieLens 100k dataset. Only five
40 | # star ratings are treated as positive.
41 | data = fetch_movielens(min_rating=5.0)
42 |
43 | # Instantiate and train the model
44 | model = LightFM(loss='warp')
45 | model.fit(data['train'], epochs=30, num_threads=2)
46 |
47 | # Evaluate the trained model
48 | test_precision = precision_at_k(model, data['test'], k=5).mean()
49 | ```
50 |
51 | ## Articles and tutorials on using LightFM
52 | 1. [Learning to Rank Sketchfab Models with LightFM](http://blog.ethanrosenthal.com/2016/11/07/implicit-mf-part-2/)
53 | 2. [Metadata Embeddings for User and Item Cold-start Recommendations](http://building-babylon.net/2016/01/26/metadata-embeddings-for-user-and-item-cold-start-recommendations/)
54 | 3. [Recommendation Systems - Learn Python for Data Science](https://www.youtube.com/watch?v=9gBC9R-msAk)
55 | 4. [Using LightFM to Recommend Projects to Consultants](https://medium.com/product-at-catalant-technologies/using-lightfm-to-recommend-projects-to-consultants-44084df7321c#.gu887ky51)
56 |
57 | ## How to cite
58 | Please cite LightFM if it helps your research. You can use the following BibTeX entry:
59 | ```
60 | @inproceedings{DBLP:conf/recsys/Kula15,
61 |   author    = {Maciej Kula},
62 |   editor    = {Toine Bogers and
63 |                Marijn Koolen},
64 |   title     = {Metadata Embeddings for User and Item Cold-start Recommendations},
65 |   booktitle = {Proceedings of the 2nd Workshop on New Trends on Content-Based Recommender
66 |                Systems co-located with 9th {ACM} Conference on Recommender Systems
67 |                (RecSys 2015), Vienna, Austria, September 16-20, 2015.},
68 |   series    = {{CEUR} Workshop Proceedings},
69 |   volume    = {1448},
70 |   pages     = {14--21},
71 |   publisher = {CEUR-WS.org},
72 |   year      = {2015},
73 |   url       = {http://ceur-ws.org/Vol-1448/paper4.pdf},
74 | }
75 | ```
76 |
77 | ## Development
78 | Pull requests are welcome. To install for development:
79 |
80 | 1. Clone the repository: `git clone git@github.com:lyst/lightfm.git`
81 | 2. Set up a virtual environment: `cd lightfm && python3 -m venv venv && source ./venv/bin/activate`
82 | 3. Install it for development using pip: `pip install -e . && pip install -r test-requirements.txt`
83 | 4. You can run tests by running `./venv/bin/py.test tests`.
84 | 5. LightFM uses [black](https://github.com/ambv/black) to enforce code formatting and flake8 for linting, see `lint-requirements.txt`.
85 | 6. [Optional]: You can install pre-commit to enforce formatting and linting locally. Install with:
86 | ```bash
87 | pip install pre-commit
88 | pre-commit install
89 | ```
90 |
91 | When making changes to the `.pyx` extension files, you'll need to run `python setup.py cythonize` in order to produce the extension `.c` files before running `pip install -e .`.
92 |
--------------------------------------------------------------------------------
/changelog.md:
--------------------------------------------------------------------------------
1 | # Changelog
2 |
3 | ## [1.17][2023-03-19]
4 |
5 | ### Fixed
6 |
7 | - Re-Cythonized cython files to fix compilation errors with newer compilers.
8 | - Fixed `np.object` usage in tests.
9 |
10 | ## [1.16][2020-11-27]
11 |
12 | ### Added
13 | - Set the `LIGHTFM_NO_CFLAGS` environment variable when building LightFM to prevent it from setting
14 | `-ffast-math` or `-march=native` compiler flags.
15 |
16 | ### Changed
17 | - `predict` now returns float32 predictions.
18 |
19 | ## [1.15][2018-05-26]
20 | ### Added
21 | - Added a check that there is no overlap between test and train in `predict_ranks` (thanks to [@artdgn](https://github.com/artdgn)).
22 | - Added dataset builder functionality.
23 | ### Fixed
24 | - Fixed error message when item features have the wrong dimensions.
25 | - Predict now checks for overflow in inputs to predict.
26 | - WARP fitting is now numerically stable when there are very few items to
27 | draw negative samples from (< max_sampled).
28 |
29 | ## [1.14][2017-11-18]
30 | ### Added
31 | - added additional input checks for non-normal inputs (NaNs, infinities) for features
32 | - added additional input checks for non-normal inputs (NaNs, infinities) for interactions
33 | - cross validation module with dataset splitting utilities
34 | ### Changed
35 | - LightFM model now raises a ValueError (instead of assertion) when the number of supplied
36 | features exceeds the number of estimated feature embeddings.
37 | - Warn and delete downloaded file when the Movielens download is corrupted. This happens in the wild
38 | and confuses users terribly.
39 |
40 | ## [1.13][2017-05-20]
41 | ### Added
42 | - added get_{user/item}_representations functions to facilitate extracting the latent representations out of the model.
43 | ### Fixed
44 | - recall_at_k and precision_at_k now work correctly at k=1 (thanks to Zank Bennett).
45 | - Moved Movielens data to data release to prevent grouplens server flakiness from affecting users.
46 | - Fix segfault when trying to predict from a model that has not been fitted.
47 |
48 | ## [1.12][2017-01-26]
49 | ### Changed
50 | - Ranks are now computed pessimistically: when two items are tied, the positive item is assumed to have higher rank. This will lead to zero precision scores for models that predict all zeros, for example.
51 | - The model will raise a ValueError if, during fitting, any of the parameters become non-finite (NaN or +/- infinity).
52 | - Added mid-epoch regularization when a lot of regularization is used. This reduces the likelihood of numerical instability at high regularization rates.
53 |
54 |
55 | ## [1.11][2016-12-26]
56 | ### Changed
57 | - negative samples in BPR are now drawn from the empirical distributions of positives. This improves accuracy slightly on the Movielens 100k dataset.
58 |
59 | ### Fixed
60 | - incorrect calculation of BPR loss (thanks to @TimonVS for reporting this).
61 |
62 |
63 | ## [1.10][2016-11-25]
64 | ### Added
65 | - added recall@k evaluation function
66 | ### Fixed
67 | - added >=0.17.0 scipy dependency to setup.py
68 | - fixed segfaults when duplicate entries are present in input COO matrices (thanks to Florian
69 | Wilhelm for the bug report).
70 |
71 | ## [1.9][2016-05-25]
72 | ### Fixed
73 | - fixed gradient accumulation in adagrad (the feature value is now correctly used when accumulating gradient).
74 | Thanks to Benjamin Wilson for the bug report.
75 | - all interaction values greater than 0.0 are now treated as positives for ranking losses.
76 | ### Added
77 | - max_sampled hyperparameter for WARP losses.
This allows trading off accuracy for WARP training time: a smaller value 78 | will mean less negative sampling and faster training when the model is near the optimum. 79 | - Added a sample_weight argument to fit and fit_partial functions. A high value will now increase the size of the SGD step taken for that interaction. 80 | - Added an evaluation module for more efficient evaluation of learning-to-rank models. 81 | - Added a random_state keyword argument to LightFM to allow repeatable model runs. 82 | ### Changed 83 | - By default, an OpenMP-less version will be built on OSX. This allows much easier installation at the expense of 84 | performance. 85 | - The default value of the max_sampled argument is now 10. This represents a decent default value that allows fast training. 86 | 87 | ## [1.8][2016-01-14] 88 | ### Changed 89 | - fix scipy missing from requirements in setup.py 90 | - remove dependency on glibc by including a translation of the musl rand_r implementation 91 | 92 | ## [1.7][2015-10-14] 93 | ### Changed 94 | - fixed bug where item momentum would be incorrectly used in adadelta training for user features (thanks to Jong Wook Kim @jongwook for the bug report). 95 | - user and item features are now floats (instead of ints), allowing fractional feature weights to be used when fitting models. 96 | 97 | ## [1.6][2015-09-29] 98 | ### Changed 99 | - when installing into an Anaconda distribution, drop -march=native compiler flag 100 | due to assembler issues. 101 | - when installing on OSX, search macports and homebrew install location for gcc 102 | version 5.x 103 | 104 | ## [1.5][2015-09-24] 105 | ### Changed 106 | - when installing on OSX, search macports install location for gcc 107 | 108 | ## [1.4][2015-09-18] 109 | ### Changed 110 | - input matrices automatically converted to correct dtype if necessary 111 | -------------------------------------------------------------------------------- /doc/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # User-friendly check for sphinx-build 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don\'t have Sphinx installed, grab it from http://sphinx-doc.org/) 13 | endif 14 | 15 | # Internal variables. 16 | PAPEROPT_a4 = -D latex_paper_size=a4 17 | PAPEROPT_letter = -D latex_paper_size=letter 18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 19 | # the i18n builder cannot share the environment and doctrees with the others 20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
21 | 22 | .PHONY: help 23 | help: 24 | @echo "Please use \`make ' where is one of" 25 | @echo " html to make standalone HTML files" 26 | @echo " dirhtml to make HTML files named index.html in directories" 27 | @echo " singlehtml to make a single large HTML file" 28 | @echo " pickle to make pickle files" 29 | @echo " json to make JSON files" 30 | @echo " htmlhelp to make HTML files and a HTML help project" 31 | @echo " qthelp to make HTML files and a qthelp project" 32 | @echo " applehelp to make an Apple Help Book" 33 | @echo " devhelp to make HTML files and a Devhelp project" 34 | @echo " epub to make an epub" 35 | @echo " epub3 to make an epub3" 36 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 37 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 38 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 39 | @echo " text to make text files" 40 | @echo " man to make manual pages" 41 | @echo " texinfo to make Texinfo files" 42 | @echo " info to make Texinfo files and run them through makeinfo" 43 | @echo " gettext to make PO message catalogs" 44 | @echo " changes to make an overview of all changed/added/deprecated items" 45 | @echo " xml to make Docutils-native XML files" 46 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 47 | @echo " linkcheck to check all external links for integrity" 48 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 49 | @echo " coverage to run coverage check of the documentation (if enabled)" 50 | @echo " dummy to check syntax errors of document sources" 51 | 52 | .PHONY: apidoc 53 | apidoc: 54 | sphinx-apidoc -o . lightfm 55 | 56 | .PHONY: clean 57 | clean: 58 | rm -rf $(BUILDDIR)/* 59 | 60 | .PHONY: html 61 | html: 62 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 63 | @echo 64 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 65 | 66 | .PHONY: dirhtml 67 | dirhtml: 68 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 69 | @echo 70 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 71 | 72 | .PHONY: singlehtml 73 | singlehtml: 74 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 75 | @echo 76 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 77 | 78 | .PHONY: pickle 79 | pickle: 80 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 81 | @echo 82 | @echo "Build finished; now you can process the pickle files." 83 | 84 | .PHONY: json 85 | json: 86 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 87 | @echo 88 | @echo "Build finished; now you can process the JSON files." 89 | 90 | .PHONY: htmlhelp 91 | htmlhelp: 92 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 93 | @echo 94 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 95 | ".hhp project file in $(BUILDDIR)/htmlhelp." 96 | 97 | .PHONY: qthelp 98 | qthelp: 99 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 100 | @echo 101 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 102 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 103 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/LightFM.qhcp" 104 | @echo "To view the help file:" 105 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/LightFM.qhc" 106 | 107 | .PHONY: applehelp 108 | applehelp: 109 | $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp 110 | @echo 111 | @echo "Build finished. 
The help book is in $(BUILDDIR)/applehelp." 112 | @echo "N.B. You won't be able to view it unless you put it in" \ 113 | "~/Library/Documentation/Help or install it in your application" \ 114 | "bundle." 115 | 116 | .PHONY: devhelp 117 | devhelp: 118 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 119 | @echo 120 | @echo "Build finished." 121 | @echo "To view the help file:" 122 | @echo "# mkdir -p $$HOME/.local/share/devhelp/LightFM" 123 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/LightFM" 124 | @echo "# devhelp" 125 | 126 | .PHONY: epub 127 | epub: 128 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 129 | @echo 130 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 131 | 132 | .PHONY: epub3 133 | epub3: 134 | $(SPHINXBUILD) -b epub3 $(ALLSPHINXOPTS) $(BUILDDIR)/epub3 135 | @echo 136 | @echo "Build finished. The epub3 file is in $(BUILDDIR)/epub3." 137 | 138 | .PHONY: latex 139 | latex: 140 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 141 | @echo 142 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 143 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 144 | "(use \`make latexpdf' here to do that automatically)." 145 | 146 | .PHONY: latexpdf 147 | latexpdf: 148 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 149 | @echo "Running LaTeX files through pdflatex..." 150 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 151 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 152 | 153 | .PHONY: latexpdfja 154 | latexpdfja: 155 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 156 | @echo "Running LaTeX files through platex and dvipdfmx..." 157 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 158 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 159 | 160 | .PHONY: text 161 | text: 162 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 163 | @echo 164 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 165 | 166 | .PHONY: man 167 | man: 168 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 169 | @echo 170 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 171 | 172 | .PHONY: texinfo 173 | texinfo: 174 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 175 | @echo 176 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 177 | @echo "Run \`make' in that directory to run these through makeinfo" \ 178 | "(use \`make info' here to do that automatically)." 179 | 180 | .PHONY: info 181 | info: 182 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 183 | @echo "Running Texinfo files through makeinfo..." 184 | make -C $(BUILDDIR)/texinfo info 185 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 186 | 187 | .PHONY: gettext 188 | gettext: 189 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 190 | @echo 191 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 192 | 193 | .PHONY: changes 194 | changes: 195 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 196 | @echo 197 | @echo "The overview file is in $(BUILDDIR)/changes." 198 | 199 | .PHONY: linkcheck 200 | linkcheck: 201 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 202 | @echo 203 | @echo "Link check complete; look for any errors in the above output " \ 204 | "or in $(BUILDDIR)/linkcheck/output.txt." 
205 | 206 | .PHONY: doctest 207 | doctest: 208 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 209 | @echo "Testing of doctests in the sources finished, look at the " \ 210 | "results in $(BUILDDIR)/doctest/output.txt." 211 | 212 | .PHONY: coverage 213 | coverage: 214 | $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage 215 | @echo "Testing of coverage in the sources finished, look at the " \ 216 | "results in $(BUILDDIR)/coverage/python.txt." 217 | 218 | .PHONY: xml 219 | xml: 220 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 221 | @echo 222 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 223 | 224 | .PHONY: pseudoxml 225 | pseudoxml: 226 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 227 | @echo 228 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 229 | 230 | .PHONY: dummy 231 | dummy: 232 | $(SPHINXBUILD) -b dummy $(ALLSPHINXOPTS) $(BUILDDIR)/dummy 233 | @echo 234 | @echo "Build finished. Dummy builder generates no files." 235 | -------------------------------------------------------------------------------- /doc/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # LightFM documentation build configuration file, created by 4 | # sphinx-quickstart on Thu Apr 21 12:26:52 2016. 5 | # 6 | # This file is execfile()d with the current directory set to its 7 | # containing dir. 8 | # 9 | # Note that not all possible configuration values are present in this 10 | # autogenerated file. 11 | # 12 | # All configuration values have a default; values that are commented out 13 | # serve to show the default. 14 | 15 | import sys 16 | import os 17 | 18 | import lightfm 19 | import sphinx_rtd_theme 20 | 21 | 22 | # If extensions (or modules to document with autodoc) are in another directory, 23 | # add these directories to sys.path here. If the directory is relative to the 24 | # documentation root, use os.path.abspath to make it absolute, like shown here. 25 | sys.path.insert(0, os.path.abspath("..")) 26 | 27 | # -- General configuration ------------------------------------------------ 28 | 29 | # If your documentation needs a minimal Sphinx version, state it here. 30 | # needs_sphinx = '1.0' 31 | 32 | # Add any Sphinx extension module names here, as strings. They can be 33 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 34 | # ones. 35 | extensions = [ 36 | "sphinx.ext.autodoc", 37 | "sphinx.ext.doctest", 38 | "sphinx.ext.githubpages", 39 | "sphinx.ext.napoleon", 40 | "sphinx.ext.viewcode", 41 | 'sphinx_rtd_theme', 42 | ] 43 | 44 | # Add any paths that contain templates here, relative to this directory. 45 | templates_path = ["_templates"] 46 | 47 | # The suffix(es) of source filenames. 48 | # You can specify multiple suffix as a list of string: 49 | # source_suffix = ['.rst', '.md'] 50 | source_suffix = ".rst" 51 | 52 | # The encoding of source files. 53 | # source_encoding = 'utf-8-sig' 54 | 55 | # The master toctree document. 56 | master_doc = "index" 57 | 58 | # General information about the project. 59 | project = "LightFM" 60 | copyright = "2016, Lyst (Maciej Kula)" 61 | author = "Lyst (Maciej Kula)" 62 | 63 | # The version info for the project you're documenting, acts as replacement for 64 | # |version| and |release|, also used in various other places throughout the 65 | # built documents. 66 | # 67 | # The short X.Y version. 
68 | version = lightfm.__version__ 69 | # The full version, including alpha/beta/rc tags. 70 | release = lightfm.__version__ 71 | 72 | # The language for content autogenerated by Sphinx. Refer to documentation 73 | # for a list of supported languages. 74 | # 75 | # This is also used if you do content translation via gettext catalogs. 76 | # Usually you set "language" from the command line for these cases. 77 | language = None 78 | 79 | # There are two options for replacing |today|: either, you set today to some 80 | # non-false value, then it is used: 81 | # today = '' 82 | # Else, today_fmt is used as the format for a strftime call. 83 | # today_fmt = '%B %d, %Y' 84 | 85 | # List of patterns, relative to source directory, that match files and 86 | # directories to ignore when looking for source files. 87 | # This patterns also effect to html_static_path and html_extra_path 88 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] 89 | 90 | # The reST default role (used for this markup: `text`) to use for all 91 | # documents. 92 | # default_role = None 93 | 94 | # If true, '()' will be appended to :func: etc. cross-reference text. 95 | # add_function_parentheses = True 96 | 97 | # If true, the current module name will be prepended to all description 98 | # unit titles (such as .. function::). 99 | # add_module_names = True 100 | 101 | # If true, sectionauthor and moduleauthor directives will be shown in the 102 | # output. They are ignored by default. 103 | # show_authors = False 104 | 105 | # The name of the Pygments (syntax highlighting) style to use. 106 | pygments_style = "sphinx" 107 | 108 | # A list of ignored prefixes for module index sorting. 109 | # modindex_common_prefix = [] 110 | 111 | # If true, keep warnings as "system message" paragraphs in the built documents. 112 | # keep_warnings = False 113 | 114 | # If true, `todo` and `todoList` produce output, else they produce nothing. 115 | todo_include_todos = False 116 | 117 | 118 | # -- Options for HTML output ---------------------------------------------- 119 | 120 | # The theme to use for HTML and HTML Help pages. See the documentation for 121 | # a list of builtin themes. 122 | html_theme = "sphinx_rtd_theme" 123 | 124 | # Theme options are theme-specific and customize the look and feel of a theme 125 | # further. For a list of options available for each theme, see the 126 | # documentation. 127 | # html_theme_options = {} 128 | 129 | # Add any paths that contain custom themes here, relative to this directory. 130 | # html_theme_path = [] 131 | 132 | # The name for this set of Sphinx documents. 133 | # " v documentation" by default. 134 | # html_title = u'LightFM v1.8' 135 | 136 | # A shorter title for the navigation bar. Default is the same as html_title. 137 | # html_short_title = None 138 | 139 | # The name of an image file (relative to this directory) to place at the top 140 | # of the sidebar. 141 | # html_logo = None 142 | 143 | # The name of an image file (relative to this directory) to use as a favicon of 144 | # the docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 145 | # pixels large. 146 | # html_favicon = None 147 | 148 | # Add any paths that contain custom static files (such as style sheets) here, 149 | # relative to this directory. They are copied after the builtin static files, 150 | # so a file named "default.css" will overwrite the builtin "default.css". 
151 | html_static_path = [] 152 | 153 | # Add any extra paths that contain custom files (such as robots.txt or 154 | # .htaccess) here, relative to this directory. These files are copied 155 | # directly to the root of the documentation. 156 | # html_extra_path = [] 157 | 158 | # If not None, a 'Last updated on:' timestamp is inserted at every page 159 | # bottom, using the given strftime format. 160 | # The empty string is equivalent to '%b %d, %Y'. 161 | # html_last_updated_fmt = None 162 | 163 | # If true, SmartyPants will be used to convert quotes and dashes to 164 | # typographically correct entities. 165 | # html_use_smartypants = True 166 | 167 | # Custom sidebar templates, maps document names to template names. 168 | # html_sidebars = {} 169 | 170 | # Additional templates that should be rendered to pages, maps page names to 171 | # template names. 172 | # html_additional_pages = {} 173 | 174 | # If false, no module index is generated. 175 | # html_domain_indices = True 176 | 177 | # If false, no index is generated. 178 | # html_use_index = True 179 | 180 | # If true, the index is split into individual pages for each letter. 181 | # html_split_index = False 182 | 183 | # If true, links to the reST sources are added to the pages. 184 | # html_show_sourcelink = True 185 | 186 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 187 | # html_show_sphinx = True 188 | 189 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 190 | # html_show_copyright = True 191 | 192 | # If true, an OpenSearch description file will be output, and all pages will 193 | # contain a tag referring to it. The value of this option must be the 194 | # base URL from which the finished HTML is served. 195 | # html_use_opensearch = '' 196 | 197 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 198 | # html_file_suffix = None 199 | 200 | # Language to be used for generating the HTML full-text search index. 201 | # Sphinx supports the following languages: 202 | # 'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja' 203 | # 'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr', 'zh' 204 | # html_search_language = 'en' 205 | 206 | # A dictionary with options for the search language support, empty by default. 207 | # 'ja' uses this config value. 208 | # 'zh' user can custom change `jieba` dictionary path. 209 | # html_search_options = {'type': 'default'} 210 | 211 | # The name of a javascript file (relative to the configuration directory) that 212 | # implements a search results scorer. If empty, the default will be used. 213 | # html_search_scorer = 'scorer.js' 214 | 215 | # Output file base name for HTML help builder. 216 | htmlhelp_basename = "LightFMdoc" 217 | 218 | # -- Options for LaTeX output --------------------------------------------- 219 | 220 | latex_elements = { 221 | # The paper size ('letterpaper' or 'a4paper'). 222 | #'papersize': 'letterpaper', 223 | # The font size ('10pt', '11pt' or '12pt'). 224 | #'pointsize': '10pt', 225 | # Additional stuff for the LaTeX preamble. 226 | #'preamble': '', 227 | # Latex figure (float) alignment 228 | #'figure_align': 'htbp', 229 | } 230 | 231 | # Grouping the document tree into LaTeX files. List of tuples 232 | # (source start file, target name, title, 233 | # author, documentclass [howto, manual, or own class]). 
234 | latex_documents = [ 235 | ( 236 | master_doc, 237 | "LightFM.tex", 238 | "LightFM Documentation", 239 | "Lyst (Maciej Kula)", 240 | "manual", 241 | ), 242 | ] 243 | 244 | # The name of an image file (relative to this directory) to place at the top of 245 | # the title page. 246 | # latex_logo = None 247 | 248 | # For "manual" documents, if this is true, then toplevel headings are parts, 249 | # not chapters. 250 | # latex_use_parts = False 251 | 252 | # If true, show page references after internal links. 253 | # latex_show_pagerefs = False 254 | 255 | # If true, show URL addresses after external links. 256 | # latex_show_urls = False 257 | 258 | # Documents to append as an appendix to all manuals. 259 | # latex_appendices = [] 260 | 261 | # If false, no module index is generated. 262 | # latex_domain_indices = True 263 | 264 | 265 | # -- Options for manual page output --------------------------------------- 266 | 267 | # One entry per manual page. List of tuples 268 | # (source start file, name, description, authors, manual section). 269 | man_pages = [(master_doc, "lightfm", "LightFM Documentation", [author], 1)] 270 | 271 | # If true, show URL addresses after external links. 272 | # man_show_urls = False 273 | 274 | 275 | # -- Options for Texinfo output ------------------------------------------- 276 | 277 | # Grouping the document tree into Texinfo files. List of tuples 278 | # (source start file, target name, title, author, 279 | # dir menu entry, description, category) 280 | texinfo_documents = [ 281 | ( 282 | master_doc, 283 | "LightFM", 284 | "LightFM Documentation", 285 | author, 286 | "LightFM", 287 | "One line description of project.", 288 | "Miscellaneous", 289 | ), 290 | ] 291 | 292 | # Documents to append as an appendix to all manuals. 293 | # texinfo_appendices = [] 294 | 295 | # If false, no module index is generated. 296 | # texinfo_domain_indices = True 297 | 298 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 299 | # texinfo_show_urls = 'footnote' 300 | 301 | # If true, do not generate a @detailmenu in the "Top" node's menu. 302 | # texinfo_no_detailmenu = False 303 | 304 | # Compact attribute lists 305 | napoleon_use_ivar = True 306 | -------------------------------------------------------------------------------- /doc/cross_validation.rst: -------------------------------------------------------------------------------- 1 | Cross-validation 2 | ================ 3 | 4 | .. automodule:: lightfm.cross_validation 5 | :members: 6 | :undoc-members: 7 | -------------------------------------------------------------------------------- /doc/datasets.rst: -------------------------------------------------------------------------------- 1 | Datasets 2 | =============== 3 | 4 | .. autofunction:: lightfm.datasets.movielens.fetch_movielens 5 | .. autofunction:: lightfm.datasets.stackexchange.fetch_stackexchange 6 | -------------------------------------------------------------------------------- /doc/examples.rst: -------------------------------------------------------------------------------- 1 | Examples 2 | ========================= 3 | 4 | Many of the examples can be viewed (and run) as Jupyter notebooks in the `examples directory `_ of the LightFM repository. 5 | 6 | .. 
toctree::
7 |    :maxdepth: 2
8 |
9 |    Movielens implicit feedback recommender <examples/movielens_implicit>
10 |    Learning rate schedules <examples/learning_schedules>
11 |    Cold-start hybrid recommender <examples/hybrid_crossvalidated>
12 |    Learning-to-rank using WARP loss <examples/warp_loss>
13 |    Building datasets <examples/dataset>
14 |
--------------------------------------------------------------------------------
/doc/examples/dataset.rst:
--------------------------------------------------------------------------------
1 | Building datasets
2 | =================
3 |
4 | In this example, we'll use LightFM's built-in ``Dataset`` class to build
5 | an interaction dataset from raw data. The goal is to demonstrate how to
6 | go from raw data (lists of interactions and perhaps item and user
7 | features) to ``scipy.sparse`` matrices that can be used to fit a LightFM
8 | model.
9 |
10 | Getting the data
11 | ----------------
12 |
13 | We're going to use the
14 | `Book-Crossing dataset <http://www2.informatik.uni-freiburg.de/~cziegler/BX/>`__ as our
15 | example data. Let's download it first.
16 |
17 | .. code:: python
18 |
19 |     import os
20 |     import zipfile
21 |     import csv
22 |
23 |     import requests
24 |
25 |
26 |     def _download(url: str, dest_path: str):
27 |
28 |         req = requests.get(url, stream=True)
29 |         req.raise_for_status()
30 |
31 |         with open(dest_path, "wb") as fd:
32 |             for chunk in req.iter_content(chunk_size=2 ** 20):
33 |                 fd.write(chunk)
34 |
35 |
36 |     def get_data():
37 |
38 |         ratings_url = ("http://www2.informatik.uni-freiburg.de/" "~cziegler/BX/BX-CSV-Dump.zip")
39 |
40 |         if not os.path.exists("data"):
41 |             os.makedirs("data")
42 |
43 |             _download(ratings_url, "data/data.zip")
44 |
45 |         with zipfile.ZipFile("data/data.zip") as archive:
46 |             return (
47 |                 csv.DictReader(
48 |                     (x.decode("utf-8", "ignore") for x in archive.open("BX-Book-Ratings.csv")),
49 |                     delimiter=";",
50 |                 ),
51 |                 csv.DictReader(
52 |                     (x.decode("utf-8", "ignore") for x in archive.open("BX-Books.csv")), delimiter=";"
53 |                 ),
54 |             )
55 |
56 |
57 |     def get_ratings():
58 |
59 |         return get_data()[0]
60 |
61 |
62 |     def get_book_features():
63 |
64 |         return get_data()[1]
65 |
66 | The data consists of book ratings and book details:
67 |
68 | .. code:: python
69 |
70 |     import json
71 |     from itertools import islice
72 |
73 |     ratings, book_features = get_data()
74 |
75 | Ratings look like this:
76 |
77 | .. code:: python
78 |
79 |     for line in islice(ratings, 2):
80 |         print(json.dumps(line, indent=4))
81 |
82 | ::
83 |
84 |     {
85 |         "User-ID": "276725",
86 |         "ISBN": "034545104X",
87 |         "Book-Rating": "0"
88 |     }
89 |     {
90 |         "User-ID": "276726",
91 |         "ISBN": "0155061224",
92 |         "Book-Rating": "5"
93 |     }
94 |
95 | and book features look like this:
96 |
97 | .. code:: python
98 |
99 |     for line in islice(book_features, 1):
100 |         print(json.dumps(line, indent=4))
101 |
102 | ::
103 |
104 |     {
105 |         "ISBN": "0195153448",
106 |         "Book-Title": "Classical Mythology",
107 |         "Book-Author": "Mark P. O. Morford",
108 |         "Year-Of-Publication": "2002",
109 |         "Publisher": "Oxford University Press",
110 |         "Image-URL-S":
111 |     "http://images.amazon.com/images/P/0195153448.01.THUMBZZZ.jpg",
112 |         "Image-URL-M":
113 |     "http://images.amazon.com/images/P/0195153448.01.MZZZZZZZ.jpg",
114 |         "Image-URL-L":
115 |     "http://images.amazon.com/images/P/0195153448.01.LZZZZZZZ.jpg"
116 |     }
117 |
118 | Building the ID mappings
119 | ------------------------
120 |
121 | The first thing we need to do is to create a mapping between the user
122 | and item ids from our input data to indices that will be used internally
123 | by our model.
124 |
125 | We do this because LightFM works with user and item ids that are
126 | consecutive non-negative integers. The ``Dataset`` class allows us to
127 | create a mapping between the IDs we use in our systems and the
128 | consecutive indices preferred by the model.
129 |
130 | To do this, we create a dataset and call its ``fit`` method. The first
131 | argument is an iterable of all user ids in our data, and the second is
132 | an iterable of all item ids. In this case, we use generator expressions
133 | to lazily iterate over our data and yield user and item ids:
134 |
135 | .. code:: python
136 |
137 |     from lightfm.data import Dataset
138 |
139 |     dataset = Dataset()
140 |     dataset.fit((x['User-ID'] for x in get_ratings()),
141 |                 (x['ISBN'] for x in get_ratings()))
142 |
143 | This call will assign an internal numerical id to every user and item id
144 | we pass in. These will be contiguous (from 0 to however many users and
145 | items we have), and will also determine the dimensions of the resulting
146 | LightFM model.
147 |
148 | We can check that the mappings have been created by querying the dataset
149 | on how many users and books it knows about:
150 |
151 | .. code:: python
152 |
153 |     num_users, num_items = dataset.interactions_shape()
154 |     print('Num users: {}, num_items {}.'.format(num_users, num_items))
155 |
156 | ::
157 |
158 |     Num users: 105283, num_items 340553.
159 |
160 | Note that if we don't have all user and item ids at once, we can
161 | repeatedly call ``fit_partial`` to supply additional ids. In this case,
162 | we will use this capability to add some item feature mappings:
163 |
164 | .. code:: python
165 |
166 |     dataset.fit_partial(items=(x['ISBN'] for x in get_book_features()),
167 |                         item_features=(x['Book-Author'] for x in get_book_features()))
168 |
169 | This will create a feature for every unique author name in the dataset.
170 |
171 | (Note that we fit some more item ids: this is to make sure our mappings
172 | are complete even if there are items in the features dataset that are
173 | not in the interactions set.)
174 |
175 | Building the interactions matrix
176 | --------------------------------
177 |
178 | Having created the mapping, we build the interaction matrix:
179 |
180 | .. code:: python
181 |
182 |     (interactions, weights) = dataset.build_interactions(((x['User-ID'], x['ISBN'])
183 |                                                           for x in get_ratings()))
184 |
185 |     print(repr(interactions))
186 |
187 | ::
188 |
189 |     <105283x341762 sparse matrix of type '<class 'numpy.int32'>'
190 |         with 1149780 stored elements in COOrdinate format>
191 |
192 | This is the main input into a LightFM model: it encodes the interactions
193 | between users and items.
194 |
195 | Since we have item features, we can also create the item features
196 | matrix:
197 |
198 | .. code:: python
199 |
200 |     item_features = dataset.build_item_features(((x['ISBN'], [x['Book-Author']])
201 |                                                  for x in get_book_features()))
202 |     print(repr(item_features))
203 |
204 | ::
205 |
206 |     <341762x443805 sparse matrix of type '<class 'numpy.float32'>'
207 |         with 613141 stored elements in Compressed Sparse Row format>
208 |
209 | Building a model
210 | ----------------
211 |
212 | This is all we need to build a LightFM model:
213 |
214 | .. code:: python
code:: python 215 | 216 | from lightfm import LightFM 217 | 218 | model = LightFM(loss='bpr') 219 | model.fit(interactions, item_features=item_features) 220 | 221 | :: 222 | 223 | 224 | -------------------------------------------------------------------------------- /doc/examples/hybrid_crossvalidated.rst: -------------------------------------------------------------------------------- 1 | 2 | Item cold-start: recommending StackExchange questions 3 | ===================================================== 4 | 5 | In this example we'll use the StackExchange dataset to explore 6 | recommendations under item-cold start. Data dumps from the StackExchange 7 | network are available at https://archive.org/details/stackexchange, and 8 | we'll use one of them --- for stats.stackexchange.com --- here. 9 | 10 | The consists of users answering questions: in the user-item interaction 11 | matrix, each user is a row, and each question is a column. Based on 12 | which users answered which questions in the training set, we'll try to 13 | recommend new questions in the training set. 14 | 15 | Let's start by loading the data. We'll use the ``datasets`` module. 16 | 17 | .. code:: python 18 | 19 | import numpy as np 20 | 21 | from lightfm.datasets import fetch_stackexchange 22 | 23 | data = fetch_stackexchange('crossvalidated', 24 | test_set_fraction=0.1, 25 | indicator_features=False, 26 | tag_features=True) 27 | 28 | train = data['train'] 29 | test = data['test'] 30 | 31 | Let's examine the data: 32 | 33 | .. code:: python 34 | 35 | print('The dataset has %s users and %s items, ' 36 | 'with %s interactions in the test and %s interactions in the training set.' 37 | % (train.shape[0], train.shape[1], test.getnnz(), train.getnnz())) 38 | 39 | 40 | .. parsed-literal:: 41 | 42 | The dataset has 3221 users and 72360 items, with 4307 interactions in the test and 57830 interactions in the training set. 43 | 44 | 45 | The training and test set are divided chronologically: the test set 46 | contains the 10% of interactions that happened after the 90% in the 47 | training set. This means that many of the questions in the test set have 48 | no interactions. This is an accurate description of a questions 49 | answering system: it is most important to recommend questions that have 50 | not yet been answered to the expert users who can answer them. 51 | 52 | A pure collaborative filtering model 53 | ------------------------------------ 54 | 55 | This is clearly a cold-start scenario, and so we can expect a 56 | traditional collaborative filtering model to do very poorly. Let's check 57 | if that's the case: 58 | 59 | .. code:: python 60 | 61 | # Import the model 62 | from lightfm import LightFM 63 | 64 | # Set the number of threads; you can increase this 65 | # if you have more physical cores available. 66 | NUM_THREADS = 2 67 | NUM_COMPONENTS = 30 68 | NUM_EPOCHS = 3 69 | ITEM_ALPHA = 1e-6 70 | 71 | # Let's fit a WARP model: these generally have the best performance. 72 | model = LightFM(loss='warp', 73 | item_alpha=ITEM_ALPHA, 74 | no_components=NUM_COMPONENTS) 75 | 76 | # Run 3 epochs and time it. 77 | %time model = model.fit(train, epochs=NUM_EPOCHS, num_threads=NUM_THREADS) 78 | 79 | 80 | .. parsed-literal:: 81 | 82 | CPU times: user 12.9 s, sys: 8 ms, total: 12.9 s 83 | Wall time: 6.52 s 84 | 85 | 86 | As a means of sanity checking, let's calculate the model's AUC on the 87 | training set first. 
If it's reasonably high, we can be sure that the 88 | model is not doing anything stupid and is fitting the training data 89 | well. 90 | 91 | .. code:: python 92 | 93 | # Import the evaluation routines 94 | from lightfm.evaluation import auc_score 95 | 96 | # Compute and print the AUC score 97 | train_auc = auc_score(model, train, num_threads=NUM_THREADS).mean() 98 | print('Collaborative filtering train AUC: %s' % train_auc) 99 | 100 | 101 | .. parsed-literal:: 102 | 103 | Collaborative filtering train AUC: 0.887519 104 | 105 | 106 | Fantastic, the model is fitting the training set well. But what about 107 | the test set? 108 | 109 | .. code:: python 110 | 111 | # We pass in the train interactions to exclude them from predictions. 112 | # This is to simulate a recommender system where we do not 113 | # re-recommend things the user has already interacted with in the train 114 | # set. 115 | test_auc = auc_score(model, test, train_interactions=train, num_threads=NUM_THREADS).mean() 116 | print('Collaborative filtering test AUC: %s' % test_auc) 117 | 118 | 119 | .. parsed-literal:: 120 | 121 | Collaborative filtering test AUC: 0.34728 122 | 123 | 124 | This is terrible: we do worse than random! This is not very surprising: 125 | as there is no training data for the majority of the test questions, the 126 | model cannot compute reasonable representations of the test set items. 127 | 128 | The fact that we score them lower than other items (AUC < 0.5) is due to 129 | estimated per-item biases, which can be confirmed by setting them to 130 | zero and re-evaluating the model. 131 | 132 | .. code:: python 133 | 134 | # Set biases to zero 135 | model.item_biases *= 0.0 136 | 137 | test_auc = auc_score(model, test, train_interactions=train, num_threads=NUM_THREADS, check_intersections=False).mean() 138 | print('Collaborative filtering test AUC: %s' % test_auc) 139 | 140 | 141 | .. parsed-literal:: 142 | 143 | Collaborative filtering test AUC: 0.496266 144 | 145 | 146 | A hybrid model 147 | -------------- 148 | 149 | We can do much better by employing LightFM's hybrid model capabilities. 150 | The StackExchange data comes with content information in the form of 151 | tags users apply to their questions: 152 | 153 | .. code:: python 154 | 155 | item_features = data['item_features'] 156 | tag_labels = data['item_feature_labels'] 157 | 158 | print('There are %s distinct tags, with values like %s.' % (item_features.shape[1], tag_labels[:3].tolist())) 159 | 160 | 161 | .. parsed-literal:: 162 | 163 | There are 1246 distinct tags, with values like [u'bayesian', u'prior', u'elicitation']. 164 | 165 | 166 | We can use these features (instead of an identity feature matrix like in 167 | a pure CF model) to estimate a model which will generalize better to 168 | unseen examples: it will simply use its representations of item features 169 | to infer representations of previously unseen questions. 170 | 171 | Let's go ahead and fit a model of this type. 172 | 173 | .. code:: python 174 | 175 | # Define a new model instance 176 | model = LightFM(loss='warp', 177 | item_alpha=ITEM_ALPHA, 178 | no_components=NUM_COMPONENTS) 179 | 180 | # Fit the hybrid model. Note that this time, we pass 181 | # in the item features matrix. 182 | model = model.fit(train, 183 | item_features=item_features, 184 | epochs=NUM_EPOCHS, 185 | num_threads=NUM_THREADS) 186 | 187 | As before, let's sanity check the model on the training set. 188 | 189 | .. code:: python 190 | 191 | # Don't forget the pass in the item features again! 
192 | train_auc = auc_score(model, 193 | train, 194 | item_features=item_features, 195 | num_threads=NUM_THREADS).mean() 196 | print('Hybrid training set AUC: %s' % train_auc) 197 | 198 | 199 | .. parsed-literal:: 200 | 201 | Hybrid training set AUC: 0.86049 202 | 203 | 204 | Note that the training set AUC is lower than in a pure CF model. This is 205 | fine: by using a lower-rank item feature matrix, we have effectively 206 | regularized the model, giving it less freedom to fit the training data. 207 | 208 | Despite this the model does much better on the test set: 209 | 210 | .. code:: python 211 | 212 | test_auc = auc_score(model, 213 | test, 214 | train_interactions=train, 215 | item_features=item_features, 216 | num_threads=NUM_THREADS, 217 | check_intersections=False).mean() 218 | print('Hybrid test set AUC: %s' % test_auc) 219 | 220 | 221 | .. parsed-literal:: 222 | 223 | Hybrid test set AUC: 0.703039 224 | 225 | 226 | This is as expected: because items in the test set share tags with items 227 | in the training set, we can provide better test set recommendations by 228 | using the tag representations learned from training. 229 | 230 | Bonus: tag embeddings 231 | --------------------- 232 | 233 | One of the nice properties of the hybrid model is that the estimated tag 234 | embeddings capture semantic characteristics of the tags. Like the 235 | word2vec model, we can use this property to explore semantic tag 236 | similarity: 237 | 238 | .. code:: python 239 | 240 | def get_similar_tags(model, tag_id): 241 | # Define similarity as the cosine of the angle 242 | # between the tag latent vectors 243 | 244 | # Normalize the vectors to unit length 245 | tag_embeddings = (model.item_embeddings.T 246 | / np.linalg.norm(model.item_embeddings, axis=1)).T 247 | 248 | query_embedding = tag_embeddings[tag_id] 249 | similarity = np.dot(tag_embeddings, query_embedding) 250 | most_similar = np.argsort(-similarity)[1:4] 251 | 252 | return most_similar 253 | 254 | 255 | for tag in (u'bayesian', u'regression', u'survival'): 256 | tag_id = tag_labels.tolist().index(tag) 257 | print('Most similar tags for %s: %s' % (tag_labels[tag_id], 258 | tag_labels[get_similar_tags(model, tag_id)])) 259 | 260 | 261 | .. parsed-literal:: 262 | 263 | Most similar tags for bayesian: [u'posterior' u'mcmc' u'bayes'] 264 | Most similar tags for regression: [u'multicollinearity' u'stepwise-regression' u'multiple-regression'] 265 | Most similar tags for survival: [u'cox-model' u'kaplan-meier' u'odds-ratio'] 266 | -------------------------------------------------------------------------------- /doc/examples/learning_schedules.rst: -------------------------------------------------------------------------------- 1 | 2 | Using different learning schedules 3 | ================================== 4 | 5 | ``lightfm`` implements two learning schedules: adagrad and adadelta. 6 | Neither is clearly superior, and, like other hyperparameter choices, the 7 | best learning schedule will differ based on the problem at hand. 8 | 9 | This example tries both at the Movielens 100k dataset. 10 | 11 | Preliminaries 12 | ------------- 13 | 14 | Let's first get the data and define the evaluations functions. 15 | 16 | .. 
code:: python 17 | 18 | import numpy as np 19 | import data 20 | 21 | %matplotlib inline 22 | 23 | import matplotlib 24 | import numpy as np 25 | import matplotlib.pyplot as plt 26 | 27 | from lightfm import LightFM 28 | from lightfm.datasets import fetch_movielens 29 | from lightfm.evaluation import auc_score 30 | 31 | movielens = fetch_movielens() 32 | 33 | train, test = movielens['train'], movielens['test'] 34 | 35 | Experiment 36 | ---------- 37 | 38 | To evaluate the performance of both learning schedules, let's create two 39 | models and run each for a number of epochs, measuring the ROC AUC on the 40 | test set at the end of each epoch. 41 | 42 | .. code:: python 43 | 44 | alpha = 1e-3 45 | epochs = 70 46 | 47 | adagrad_model = LightFM(no_components=30, 48 | loss='warp', 49 | learning_schedule='adagrad', 50 | user_alpha=alpha, 51 | item_alpha=alpha) 52 | adadelta_model = LightFM(no_components=30, 53 | loss='warp', 54 | learning_schedule='adadelta', 55 | user_alpha=alpha, 56 | item_alpha=alpha) 57 | 58 | adagrad_auc = [] 59 | 60 | for epoch in range(epochs): 61 | adagrad_model.fit_partial(train, epochs=1) 62 | adagrad_auc.append(auc_score(adagrad_model, test).mean()) 63 | 64 | 65 | adadelta_auc = [] 66 | 67 | for epoch in range(epochs): 68 | adadelta_model.fit_partial(train, epochs=1) 69 | adadelta_auc.append(auc_score(adadelta_model, test).mean()) 70 | 71 | It looks like the adadelta gets to a better result at the beginning of 72 | training. However, as we keep running more epochs adagrad wins out, 73 | converging to a better final solution. 74 | 75 | .. code:: python 76 | 77 | x = np.arange(len(adagrad_auc)) 78 | plt.plot(x, np.array(adagrad_auc)) 79 | plt.plot(x, np.array(adadelta_auc)) 80 | plt.legend(['adagrad', 'adadelta'], loc='lower right') 81 | plt.show() 82 | 83 | 84 | 85 | .. image:: learning_schedules_files/learning_schedules_5_0.png 86 | 87 | 88 | We can try the same for the k-OS loss. 89 | 90 | .. code:: python 91 | 92 | alpha = 1e-3 93 | epochs = 70 94 | 95 | adagrad_model = LightFM(no_components=30, 96 | loss='warp-kos', 97 | learning_schedule='adagrad', 98 | user_alpha=alpha, item_alpha=alpha) 99 | adadelta_model = LightFM(no_components=30, 100 | loss='warp-kos', 101 | learning_schedule='adadelta', 102 | user_alpha=alpha, item_alpha=alpha) 103 | 104 | adagrad_auc = [] 105 | 106 | for epoch in range(epochs): 107 | adagrad_model.fit_partial(train, epochs=1) 108 | adagrad_auc.append(auc_score(adagrad_model, test).mean()) 109 | 110 | 111 | adadelta_auc = [] 112 | 113 | for epoch in range(epochs): 114 | adadelta_model.fit_partial(train, epochs=1) 115 | adadelta_auc.append(auc_score(adadelta_model, test).mean()) 116 | 117 | .. code:: python 118 | 119 | x = np.arange(len(adagrad_auc)) 120 | plt.plot(x, np.array(adagrad_auc)) 121 | plt.plot(x, np.array(adadelta_auc)) 122 | plt.legend(['adagrad', 'adadelta'], loc='lower right') 123 | plt.show() 124 | 125 | 126 | 127 | .. 
image:: learning_schedules_files/learning_schedules_8_0.png 128 | -------------------------------------------------------------------------------- /doc/examples/learning_schedules_files/learning_schedules_5_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lyst/lightfm/0c9c31e027b976beab2385e268b58010fff46096/doc/examples/learning_schedules_files/learning_schedules_5_0.png -------------------------------------------------------------------------------- /doc/examples/learning_schedules_files/learning_schedules_8_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lyst/lightfm/0c9c31e027b976beab2385e268b58010fff46096/doc/examples/learning_schedules_files/learning_schedules_8_0.png -------------------------------------------------------------------------------- /doc/examples/movielens_implicit.rst: -------------------------------------------------------------------------------- 1 | 2 | An implicit feedback recommender for the Movielens dataset 3 | ========================================================== 4 | 5 | Implicit feedback 6 | ----------------- 7 | 8 | For some time, the recommender system literature focused on explicit 9 | feedback: the Netflix prize focused on accurately reproducing the 10 | ratings users have given to movies they watched. 11 | 12 | Focusing on ratings in this way ignored the importance of taking into 13 | account which movies the users chose to watch in the first place, and 14 | treating the absence of ratings as absence of information. 15 | 16 | But the things that we don't have ratings for aren't unknowns: we know 17 | the user didn't pick them. This reflects a user's conscious choice, and 18 | is a good source of information on what she thinks she might like. 19 | 20 | This sort of phenomenon is described as data which is 21 | missing-not-at-random in the literature: the ratings that are missing 22 | are more likely to be negative precisely because the user chooses which 23 | items to rate. When choosing a restaurant, you only go to places which 24 | you think you'll enjoy, and never go to places that you think you'll 25 | hate. What this leads to is that you're only going to be submitting 26 | ratings for things which, a priori, you expected to like; the things 27 | that you expect you will not like you will never rate. 28 | 29 | This observation has led to the development of models that are suitable 30 | for implicit feedback. LightFM implements two that have proven 31 | particular successful: 32 | 33 | - BPR: Bayesian Personalised Ranking [1] pairwise loss. Maximises the 34 | prediction difference between a positive example and a randomly 35 | chosen negative example. Useful when only positive interactions are 36 | present and optimising ROC AUC is desired. 37 | - WARP: Weighted Approximate-Rank Pairwise [2] loss. Maximises the rank 38 | of positive examples by repeatedly sampling negative examples until 39 | rank violating one is found. Useful when only positive interactions 40 | are present and optimising the top of the recommendation list 41 | (precision@k) is desired. 42 | 43 | This example shows how to estimate these models on the Movielens 44 | dataset. 45 | 46 | [1] Rendle, Steffen, et al. "BPR: Bayesian personalized ranking from 47 | implicit feedback." Proceedings of the Twenty-Fifth Conference on 48 | Uncertainty in Artificial Intelligence. AUAI Press, 2009. 
49 | 50 | [2] Weston, Jason, Samy Bengio, and Nicolas Usunier. "Wsabie: Scaling up 51 | to large vocabulary image annotation." IJCAI. Vol. 11. 2011. 52 | 53 | Getting the data 54 | ---------------- 55 | 56 | The first step is to get the `Movielens 57 | data `__. This is a 58 | classic small recommender dataset, consisting of around 950 users, 1700 59 | movies, and 100,000 ratings. The ratings are on a scale from 1 to 5, but 60 | we'll all treat them as implicit positive feedback in this example. 61 | 62 | Fortunately, this is one of the functions provided by LightFM itself. 63 | 64 | .. code:: python 65 | 66 | import numpy as np 67 | 68 | from lightfm.datasets import fetch_movielens 69 | 70 | movielens = fetch_movielens() 71 | 72 | This gives us a dictionary with the following fields: 73 | 74 | .. code:: python 75 | 76 | for key, value in movielens.items(): 77 | print(key, type(value), value.shape) 78 | 79 | 80 | .. parsed-literal:: 81 | 82 | ('test', , (943, 1682)) 83 | ('item_features', , (1682, 1682)) 84 | ('train', , (943, 1682)) 85 | ('item_labels', , (1682,)) 86 | ('item_feature_labels', , (1682,)) 87 | 88 | 89 | .. code:: python 90 | 91 | train = movielens['train'] 92 | test = movielens['test'] 93 | 94 | The ``train`` and ``test`` elements are the most important: they contain 95 | the raw rating data, split into a train and a test set. Each row 96 | represents a user, and each column an item. Entries are ratings from 1 97 | to 5. 98 | 99 | Fitting models 100 | -------------- 101 | 102 | Now let's train a BPR model and look at its accuracy. 103 | 104 | We'll use two metrics of accuracy: precision@k and ROC AUC. Both are 105 | ranking metrics: to compute them, we'll be constructing recommendation 106 | lists for all of our users, and checking the ranking of known positive 107 | movies. For precision at k we'll be looking at whether they are within 108 | the first k results on the list; for AUC, we'll be calculating the 109 | probability that any known positive is higher on the list than a random 110 | negative example. 111 | 112 | .. code:: python 113 | 114 | from lightfm import LightFM 115 | from lightfm.evaluation import precision_at_k 116 | from lightfm.evaluation import auc_score 117 | 118 | model = LightFM(learning_rate=0.05, loss='bpr') 119 | model.fit(train, epochs=10) 120 | 121 | train_precision = precision_at_k(model, train, k=10).mean() 122 | test_precision = precision_at_k(model, test, k=10).mean() 123 | 124 | train_auc = auc_score(model, train).mean() 125 | test_auc = auc_score(model, test).mean() 126 | 127 | print('Precision: train %.2f, test %.2f.' % (train_precision, test_precision)) 128 | print('AUC: train %.2f, test %.2f.' % (train_auc, test_auc)) 129 | 130 | 131 | .. parsed-literal:: 132 | 133 | Precision: train 0.59, test 0.10. 134 | AUC: train 0.90, test 0.86. 135 | 136 | 137 | The WARP model, on the other hand, optimises for precision@k---we should 138 | expect its performance to be better on precision. 139 | 140 | .. code:: python 141 | 142 | model = LightFM(learning_rate=0.05, loss='warp') 143 | 144 | model.fit_partial(train, epochs=10) 145 | 146 | train_precision = precision_at_k(model, train, k=10).mean() 147 | test_precision = precision_at_k(model, test, k=10).mean() 148 | 149 | train_auc = auc_score(model, train).mean() 150 | test_auc = auc_score(model, test).mean() 151 | 152 | print('Precision: train %.2f, test %.2f.' % (train_precision, test_precision)) 153 | print('AUC: train %.2f, test %.2f.' % (train_auc, test_auc)) 154 | 155 | 156 | .. 
parsed-literal:: 157 | 158 | Precision: train 0.61, test 0.11. 159 | AUC: train 0.93, test 0.90. 160 | 161 | 162 | And that is exactly what we see: we get slightly higher precision@10 163 | (but the AUC metric is also improved). 164 | -------------------------------------------------------------------------------- /doc/examples/warp_loss.rst: -------------------------------------------------------------------------------- 1 | 2 | Learning-to-rank using the WARP loss 3 | ==================================== 4 | 5 | LightFM is probably the only recommender package implementing the WARP 6 | (Weighted Approximate-Rank Pairwise) loss for implicit feedback 7 | learning-to-rank. Generally, it perfoms better than the more popular BPR 8 | (Bayesian Personalised Ranking) loss --- often by a large margin. 9 | 10 | It was originally applied to image annotations in the Weston et al. 11 | `WSABIE 12 | paper `__, 13 | but has been extended to apply to recommendation settings in the `2013 14 | k-order statistic loss 15 | paper `__ in 16 | the form of the k-OS WARP loss, also implemented in LightFM. 17 | 18 | Like the BPR model, WARP deals with (user, positive item, negative item) 19 | triplets. Unlike BPR, the negative items in the triplet are not chosen 20 | by random sampling: they are chosen from among those negatie items which 21 | would violate the desired item ranking given the state of the model. 22 | This approximates a form of active learning where the model selects 23 | those triplets that it cannot currently rank correctly. 24 | 25 | This procedure yields roughly the following algorithm: 26 | 27 | 1. For a given (user, positive item pair), sample a negative item at 28 | random from all the remaining items. Compute predictions for both 29 | items; if the negative item's prediction exceeds that of the positive 30 | item plus a margin, perform a gradient update to rank the positive 31 | item higher and the negative item lower. If there is no rank 32 | violation, continue sampling negative items until a violation is 33 | found. 34 | 2. If you found a violating negative example at the first try, make a 35 | large gradient update: this indicates that a lot of negative items 36 | are ranked higher than positives items given the current state of the 37 | model, and the model must be updated by a large amount. If it took a 38 | lot of sampling to find a violating example, perform a small update: 39 | the model is likely close to the optimum and should be updated at a 40 | low rate. 41 | 42 | While this is fairly hand-wavy, it should give the correct intuition. 43 | For more details, read the paper itself or a more in-depth blog post 44 | `here `__. 45 | A similar approach for BPR is described in Rendle's 2014 `WSDM 2014 46 | paper `__. 47 | 48 | Having covered the theory, the rest of this example looks at the 49 | practical implications of using WARP in LightFM. 50 | 51 | Preliminaries 52 | ------------- 53 | 54 | Let's first get the data. We'll use the MovieLens 100K dataset. 55 | 56 | .. 
code:: python 57 | 58 | import time 59 | 60 | import numpy as np 61 | 62 | %matplotlib inline 63 | 64 | import matplotlib 65 | import numpy as np 66 | import matplotlib.pyplot as plt 67 | 68 | from lightfm import LightFM 69 | from lightfm.datasets import fetch_movielens 70 | from lightfm.evaluation import auc_score 71 | 72 | movielens = fetch_movielens() 73 | 74 | train, test = movielens['train'], movielens['test'] 75 | 76 | Accuracy 77 | -------- 78 | 79 | The first interesting experiment is to compare the accuracy between the 80 | WARP and BPR losses. Let's fit two models with equivalent 81 | hyperparameters and compare their accuracy across epochs. Whilst we're 82 | fitting them, let's also measure how much time each epoch takes. 83 | 84 | .. code:: python 85 | 86 | alpha = 1e-05 87 | epochs = 70 88 | num_components = 32 89 | 90 | warp_model = LightFM(no_components=num_components, 91 | loss='warp', 92 | learning_schedule='adagrad', 93 | max_sampled=100, 94 | user_alpha=alpha, 95 | item_alpha=alpha) 96 | 97 | bpr_model = LightFM(no_components=num_components, 98 | loss='bpr', 99 | learning_schedule='adagrad', 100 | user_alpha=alpha, 101 | item_alpha=alpha) 102 | 103 | warp_duration = [] 104 | bpr_duration = [] 105 | warp_auc = [] 106 | bpr_auc = [] 107 | 108 | for epoch in range(epochs): 109 | start = time.time() 110 | warp_model.fit_partial(train, epochs=1) 111 | warp_duration.append(time.time() - start) 112 | warp_auc.append(auc_score(warp_model, test, train_interactions=train).mean()) 113 | 114 | for epoch in range(epochs): 115 | start = time.time() 116 | bpr_model.fit_partial(train, epochs=1) 117 | bpr_duration.append(time.time() - start) 118 | bpr_auc.append(auc_score(bpr_model, test, train_interactions=train).mean()) 119 | 120 | Plotting the results immediately reveals that WARP produces superior 121 | results: a smarter way of selecting negative examples leads to higher 122 | quality rankings. Test accuracy decreases after the first 10 epochs, 123 | suggesting WARP starts overfitting and would benefit from regularization 124 | or early stopping. 125 | 126 | .. code:: python 127 | 128 | x = np.arange(epochs) 129 | plt.plot(x, np.array(warp_auc)) 130 | plt.plot(x, np.array(bpr_auc)) 131 | plt.legend(['WARP AUC', 'BPR AUC'], loc='upper right') 132 | plt.show() 133 | 134 | 135 | 136 | .. image:: warp_loss_files/warp_loss_5_0.png 137 | 138 | 139 | Fitting speed 140 | ------------- 141 | 142 | What about model fitting speed? 143 | 144 | .. code:: python 145 | 146 | x = np.arange(epochs) 147 | plt.plot(x, np.array(warp_duration)) 148 | plt.plot(x, np.array(bpr_duration)) 149 | plt.legend(['WARP duration', 'BPR duration'], loc='upper right') 150 | plt.show() 151 | 152 | 153 | 154 | .. image:: warp_loss_files/warp_loss_7_0.png 155 | 156 | 157 | WARP is slower than BPR for all epochs. Interestingly, however, it gets 158 | slower with additional epochs; every subsequent epoch takes more time. 159 | This is because of WARP's adaptive samling of negatives: the closer the 160 | model fits the training data, the more times it needs to sample in order 161 | to find rank-violating examples, leading to longer fitting times. 162 | 163 | For this reason, LightFM exposes the ``max_sampled`` hyperparameter that 164 | limits the number of attemps WARP will carry out to find a negative. 
165 | Setting it to a low value and repeating the run shows that the run time 166 | actually decreases with every epoch: this is because no updates happen 167 | when a violating example cannot be found in the specified number of 168 | attempts. 169 | 170 | .. code:: python 171 | 172 | warp_model = LightFM(no_components=num_components, 173 | max_sampled=3, 174 | loss='warp', 175 | learning_schedule='adagrad', 176 | user_alpha=alpha, 177 | item_alpha=alpha) 178 | 179 | warp_duration = [] 180 | warp_auc = [] 181 | 182 | for epoch in range(epochs): 183 | start = time.time() 184 | warp_model.fit_partial(train, epochs=1) 185 | warp_duration.append(time.time() - start) 186 | warp_auc.append(auc_score(warp_model, test, train_interactions=train).mean()) 187 | 188 | x = np.arange(epochs) 189 | plt.plot(x, np.array(warp_duration)) 190 | plt.legend(['WARP duration'], loc='upper right') 191 | plt.title('Duration') 192 | plt.show() 193 | 194 | x = np.arange(epochs) 195 | plt.plot(x, np.array(warp_auc)) 196 | plt.legend(['WARP AUC'], loc='upper right') 197 | plt.title('AUC') 198 | plt.show() 199 | 200 | 201 | 202 | .. image:: warp_loss_files/warp_loss_9_0.png 203 | 204 | 205 | 206 | .. image:: warp_loss_files/warp_loss_9_1.png 207 | -------------------------------------------------------------------------------- /doc/examples/warp_loss_files/warp_loss_5_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lyst/lightfm/0c9c31e027b976beab2385e268b58010fff46096/doc/examples/warp_loss_files/warp_loss_5_0.png -------------------------------------------------------------------------------- /doc/examples/warp_loss_files/warp_loss_7_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lyst/lightfm/0c9c31e027b976beab2385e268b58010fff46096/doc/examples/warp_loss_files/warp_loss_7_0.png -------------------------------------------------------------------------------- /doc/examples/warp_loss_files/warp_loss_9_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lyst/lightfm/0c9c31e027b976beab2385e268b58010fff46096/doc/examples/warp_loss_files/warp_loss_9_0.png -------------------------------------------------------------------------------- /doc/examples/warp_loss_files/warp_loss_9_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lyst/lightfm/0c9c31e027b976beab2385e268b58010fff46096/doc/examples/warp_loss_files/warp_loss_9_1.png -------------------------------------------------------------------------------- /doc/faq.rst: -------------------------------------------------------------------------------- 1 | === 2 | FAQ 3 | === 4 | 5 | Does LightFM have a GPU-based implementation? 6 | ============================================= 7 | No, there is no option to run training or inference on the GPU with LightFM. There are 8 | currently no plans to change this. 9 | See https://github.com/lyst/lightfm/issues/429 10 | 11 | What are the "learning to rank" and "hybrid" aspects of LightFM and how do they relate? 12 | ======================================================================================= 13 | *Learning to rank* and *hybrid* recommendation models are independent concepts. 14 | *Learning to rank* just means that you are optimizing a ranking loss such as `WARP` or 15 | `BPR`. 
*Hybrid* refers to the fact that you incorporate user or item meta-data as additional features. 16 | See: https://github.com/lyst/lightfm/issues/442 17 | 18 | Adding user/item features makes my model perform worse than without features, what can I do? 19 | ============================================================================================ 20 | That's not unusual and might have various reasons. For one, make sure you 21 | don't drop per-user/item features, see the notes in :doc:`LightFM`. If that 22 | doesn't help, your features might be simply uninformative and worsen the 23 | signal to noise ratio. You can experiment with different features and try 24 | discretization strategies for continuous features. More strategies and ideas 25 | can be found here: 26 | 27 | - https://github.com/lyst/lightfm/issues/551 28 | - https://github.com/lyst/lightfm/issues/486 29 | - https://github.com/lyst/lightfm/issues/176 30 | - https://github.com/lyst/lightfm/issues/430 31 | 32 | My model is recommending the same popular items to all users, what can I do? 33 | ============================================================================ 34 | You can try to set your item bias vectors to all zeros. Another strategy is 35 | to apply inverse propensity weights to your features. 36 | See these issues for more information: 37 | 38 | - https://github.com/lyst/lightfm/issues/395 39 | - https://github.com/lyst/lightfm/issues/176 40 | 41 | How can I re-train my model on partial data and/or new users (user cold-start)? 42 | =============================================================================== 43 | This depends a lot on your specific use case. Here are some helpful discussions: 44 | 45 | - https://github.com/lyst/lightfm/issues/194 46 | - https://github.com/lyst/lightfm/issues/347 47 | - https://github.com/lyst/lightfm/issues/210 48 | - https://github.com/lyst/lightfm/issues/371 49 | - https://stackoverflow.com/questions/46924119/lightfm-handling-user-and-item-cold-start 50 | -------------------------------------------------------------------------------- /doc/home.rst: -------------------------------------------------------------------------------- 1 | Welcome to LightFM's documentation! 2 | =================================== 3 | 4 | LightFM is a Python implementation of a number of popular recommendation algorithms for both implicit and explicit feedback. 5 | 6 | It also makes it possible to incorporate both item and user metadata into the traditional matrix factorization algorithms. It represents each user and item as the sum of the latent representations of their features, thus allowing recommendations to generalise to new items (via item features) and to new users (via user features). 7 | 8 | The details of the approach are described in the LightFM paper, available on `arXiv `_. 9 | 10 | Quickstart 11 | ---------- 12 | 13 | Jump straight to the :doc:`Movielens quickstart ` if you're impatient. 14 | 15 | 16 | Installation 17 | ------------ 18 | 19 | PyPI 20 | ~~~~ 21 | 22 | Install from pypi using pip: ``pip install lightfm``. Everything should work out-of-the box on Linux, OSX using Homebrew Python, and Windows using Miniconda. 23 | 24 | Note for OSX and Windows users: LightFM will by default not use OpenMP on OSX and Windows, and so all model fitting will be single-threaded. This is due to the fact that Clang (and Miniconda) does not support OpenMP, and installing an OpenMP-enabled version of gcc is complicated and labour-intensive. 
If you'd like to use the multi-threading capabilities of LightFM on these platforms, you should try using it via Docker as described in the next section. 25 | 26 | Building with the default Python distribution included in OSX is also not supported; please try the version from Homebrew or Anaconda. 27 | 28 | Using with Docker 29 | ~~~~~~~~~~~~~~~~~ 30 | 31 | On many systems it may be more convenient to try LightFM out in a Docker container. This repository provides a small Dockerfile sufficient to run LightFM and its examples. To run it: 32 | 33 | 1. `Install Docker `_ and start the docker deamon/virtual machine. 34 | 2. Clone this repository and navigate to it: ``git clone git@github.com:lyst/lightfm.git && cd lightfm``. 35 | 3. Run ``docker-compose build lightfm`` to build the container. 36 | 37 | The container should now be ready for use. You can then: 38 | 39 | 1. Run tests by running ``docker-compose run lightfm py.test -x lightfm/tests/`` 40 | 2. Run the movielens example by running ``docker-compose run --service-ports lightfm jupyter notebook lightfm/examples/movielens/example.ipynb --allow-root --ip="0.0.0.0" --port=8888 --no-browser``. The notebook will be accessible at port 8888 of your container's IP address. 41 | 42 | Usage 43 | ----- 44 | 45 | Model fitting is very straightforward using the main :doc:`LightFM class `. 46 | 47 | Create a model instance with the desired latent dimensionality:: 48 | 49 | from lightfm import LightFM 50 | 51 | model = LightFM(no_components=30) 52 | 53 | Assuming ``train`` is a (no_users, no_items) sparse matrix (with 1s denoting positive, and -1s negative interactions), you can fit a traditional matrix factorization model by calling:: 54 | 55 | model.fit(train, epochs=20) 56 | 57 | This will train a traditional MF model, as no user or item features have been supplied. 58 | 59 | To get predictions, call ``model.predict``:: 60 | 61 | predictions = model.predict(test_user_ids, test_item_ids) 62 | 63 | 64 | User and item features can be incorporated into training by passing them into the ``fit`` method. Assuming ``user_features`` is a (no_users, no_user_features) sparse matrix (and similarly for ``item_features``), you can call:: 65 | 66 | model.fit(train, 67 | user_features=user_features, 68 | item_features=item_features, 69 | epochs=20) 70 | predictions = model.predict(test_user_ids, 71 | test_item_ids, 72 | user_features=user_features, 73 | item_features=item_features) 74 | 75 | to train the model and obtain predictions. 76 | 77 | Both training and prediction can employ multiple cores for speed:: 78 | 79 | model.fit(train, epochs=20, num_threads=4) 80 | predictions = model.predict(test_user_ids, test_item_ids, num_threads=4) 81 | 82 | This implementation uses asynchronous stochastic gradient descent [6] for training. This can lead to lower accuracy when the interaction matrix (or the feature matrices) are very dense and a large number of threads is used. In practice, however, training on a sparse dataset with 20 threads does not lead to a measurable loss of accuracy. 83 | 84 | In an implicit feedback setting, the BPR, WARP, or k-OS WARP loss functions can be used. If ``train`` is a sparse matrix with positive entries representing positive interactions, the model can be trained as follows:: 85 | 86 | model = LightFM(no_components=30, loss='warp') 87 | model.fit(train, epochs=20) 88 | 89 | 90 | Examples 91 | -------- 92 | 93 | Check the ``examples`` directory for more examples. 
94 | 95 | The `Movielens example `_ shows how to use LightFM on the Movielens dataset, both with and without using movie metadata. `Another example `_ compares the performance of the adagrad and adadelta learning schedules. 96 | 97 | The `Kaggle coupon purchase prediction `_ example applies LightFM to predicting coupon purchases. 98 | 99 | Articles and tutorials on using LightFM 100 | --------------------------------------- 101 | 102 | 1. `Learning to Rank Sketchfab Models with LightFM `_ 103 | 2. `Metadata Embeddings for User and Item Cold-start Recommendations `_ 104 | 3. `Recommendation Systems - Learn Python for Data Science `_ 105 | 106 | 107 | How to cite 108 | ----------- 109 | 110 | Please cite LightFM if it helps your research. You can use the following BibTeX entry.:: 111 | 112 | @inproceedings{DBLP:conf/recsys/Kula15, 113 | author = {Maciej Kula}, 114 | editor = {Toine Bogers and 115 | Marijn Koolen}, 116 | title = {Metadata Embeddings for User and Item Cold-start Recommendations}, 117 | booktitle = {Proceedings of the 2nd Workshop on New Trends on Content-Based Recommender 118 | Systems co-located with 9th {ACM} Conference on Recommender Systems 119 | (RecSys 2015), Vienna, Austria, September 16-20, 2015.}, 120 | series = {{CEUR} Workshop Proceedings}, 121 | volume = {1448}, 122 | pages = {14--21}, 123 | publisher = {CEUR-WS.org}, 124 | year = {2015}, 125 | url = {http://ceur-ws.org/Vol-1448/paper4.pdf}, 126 | } 127 | 128 | 129 | Development 130 | ----------- 131 | 132 | Pull requests are welcome. To install for development: 133 | 134 | 1. Clone the repository: ``git clone git@github.com:lyst/lightfm.git`` 135 | 2. Install it for development using pip: ``cd lightfm && pip install -e .`` 136 | 3. You can run tests by running ``python setup.py test``. 137 | 138 | When making changes to the ``.pyx`` extension files, you'll need to run ``python setup.py cythonize`` in order to produce the extension ``.c`` files before running ``pip install -e .``. 139 | -------------------------------------------------------------------------------- /doc/index.rst: -------------------------------------------------------------------------------- 1 | .. include:: home.rst 2 | 3 | 4 | Contents 5 | ======== 6 | 7 | .. toctree:: 8 | :maxdepth: 2 9 | 10 | Home 11 | Quickstart 12 | The LightFM model class 13 | Model evaluation 14 | Cross validation 15 | Constructing datasets 16 | Built-in datasets 17 | Examples 18 | FAQ 19 | 20 | 21 | Indices and tables 22 | ================== 23 | 24 | * :ref:`genindex` 25 | * :ref:`modindex` 26 | * :ref:`search` 27 | -------------------------------------------------------------------------------- /doc/lightfm.data.rst: -------------------------------------------------------------------------------- 1 | Dataset construction 2 | ==================== 3 | 4 | .. automodule:: lightfm.data 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /doc/lightfm.evaluation.rst: -------------------------------------------------------------------------------- 1 | Model evaluation 2 | ========================= 3 | 4 | .. automodule:: lightfm.evaluation 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /doc/lightfm.rst: -------------------------------------------------------------------------------- 1 | LightFM 2 | =============== 3 | 4 | .. 
autoclass:: lightfm.LightFM 5 | :members: 6 | :undoc-members: 7 | -------------------------------------------------------------------------------- /doc/quickstart.rst: -------------------------------------------------------------------------------- 1 | 2 | Quickstart 3 | ========== 4 | 5 | In this example, we'll build an implicit feedback recommender using the 6 | Movielens 100k dataset (http://grouplens.org/datasets/movielens/100k/). 7 | 8 | The code behind this example is available as a `Jupyter 9 | notebook `__ 10 | 11 | LightFM includes functions for getting and processing this dataset, so 12 | obtaining it is quite easy. 13 | 14 | .. code:: python 15 | 16 | import numpy as np 17 | 18 | from lightfm.datasets import fetch_movielens 19 | 20 | data = fetch_movielens(min_rating=5.0) 21 | 22 | This downloads the dataset and automatically pre-processes it into 23 | sparse matrices suitable for further calculation. In particular, it 24 | prepares the sparse user-item matrices, containing positive entries 25 | where a user interacted with a product, and zeros otherwise. 26 | 27 | We have two such matrices, a training and a testing set. Both have 28 | around 1000 users and 1700 items. We'll train the model on the train 29 | matrix but test it on the test matrix. 30 | 31 | .. code:: python 32 | 33 | print(repr(data['train'])) 34 | print(repr(data['test'])) 35 | 36 | 37 | .. parsed-literal:: 38 | 39 | <943x1682 sparse matrix of type '' 40 | with 19048 stored elements in COOrdinate format> 41 | <943x1682 sparse matrix of type '' 42 | with 2153 stored elements in COOrdinate format> 43 | 44 | 45 | We need to import the model class to fit the model: 46 | 47 | .. code:: python 48 | 49 | from lightfm import LightFM 50 | 51 | We're going to use the WARP (Weighted Approximate-Rank Pairwise) model. 52 | WARP is an implicit feedback model: all interactions in the training 53 | matrix are treated as positive signals, and products that users did not 54 | interact with they implicitly do not like. The goal of the model is to 55 | score these implicit positives highly while assigining low scores to 56 | implicit negatives. 57 | 58 | Model training is accomplished via SGD (stochastic gradient descent). 59 | This means that for every pass through the data --- an epoch --- the 60 | model learns to fit the data more and more closely. We'll run it for 30 61 | epochs in this example. We can also run it on multiple cores, so we'll 62 | set that to 2. (The dataset in this example is too small for that to 63 | make a difference, but it will matter on bigger datasets.) 64 | 65 | .. code:: python 66 | 67 | model = LightFM(loss='warp') 68 | %time model.fit(data['train'], epochs=30, num_threads=2) 69 | 70 | 71 | .. parsed-literal:: 72 | 73 | CPU times: user 1.55 s, sys: 4 ms, total: 1.56 s 74 | Wall time: 838 ms 75 | 76 | 77 | 78 | 79 | .. parsed-literal:: 80 | 81 | 82 | 83 | 84 | 85 | Done! We should now evaluate the model to see how well it's doing. We're 86 | most interested in how good the ranking produced by the model is. 87 | Precision@k is one suitable metric, expressing the percentage of top k 88 | items in the ranking the user has actually interacted with. ``lightfm`` 89 | implements a number of metrics in the ``evaluation`` module. 90 | 91 | .. code:: python 92 | 93 | from lightfm.evaluation import precision_at_k 94 | 95 | We'll measure precision in both the train and the test set. 96 | 97 | .. 
code:: python 98 | 99 | print("Train precision: %.2f" % precision_at_k(model, data['train'], k=5).mean()) 100 | print("Test precision: %.2f" % precision_at_k(model, data['test'], k=5).mean()) 101 | 102 | 103 | .. parsed-literal:: 104 | 105 | Train precision: 0.43 106 | Test precision: 0.04 107 | 108 | 109 | Unsurprisingly, the model fits the train set better than the test set. 110 | 111 | For an alternative way of judging the model, we can sample a couple of 112 | users and get their recommendations. To make predictions for given user, 113 | we pass the id of that user and the ids of all products we want 114 | predictions for into the ``predict`` method. 115 | 116 | .. code:: python 117 | 118 | def sample_recommendation(model, data, user_ids): 119 | 120 | n_users, n_items = data['train'].shape 121 | 122 | for user_id in user_ids: 123 | known_positives = data['item_labels'][data['train'].tocsr()[user_id].indices] 124 | 125 | scores = model.predict(user_id, np.arange(n_items)) 126 | top_items = data['item_labels'][np.argsort(-scores)] 127 | 128 | print("User %s" % user_id) 129 | print(" Known positives:") 130 | 131 | for x in known_positives[:3]: 132 | print(" %s" % x) 133 | 134 | print(" Recommended:") 135 | 136 | for x in top_items[:3]: 137 | print(" %s" % x) 138 | 139 | sample_recommendation(model, data, [3, 25, 450]) 140 | 141 | 142 | .. parsed-literal:: 143 | 144 | User 3 145 | Known positives: 146 | Contact (1997) 147 | Air Force One (1997) 148 | In & Out (1997) 149 | Recommended: 150 | Air Force One (1997) 151 | Assignment, The (1997) 152 | Kiss the Girls (1997) 153 | User 25 154 | Known positives: 155 | Fargo (1996) 156 | Godfather, The (1972) 157 | L.A. Confidential (1997) 158 | Recommended: 159 | L.A. Confidential (1997) 160 | Titanic (1997) 161 | Fargo (1996) 162 | User 450 163 | Known positives: 164 | Event Horizon (1997) 165 | Scream (1996) 166 | Conspiracy Theory (1997) 167 | Recommended: 168 | Independence Day (ID4) (1996) 169 | Scream (1996) 170 | Ransom (1996) 171 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | services: 2 | lightfm: 3 | build: . 4 | # Uncomment this to mount your local version 5 | # of the LightFM code. 6 | # volumes: 7 | # - .:/home/lightfm/ 8 | ports: 9 | - "8888:8888" 10 | -------------------------------------------------------------------------------- /docs-requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx>=4.0 2 | sphinx_rtd_theme>=1.0 -------------------------------------------------------------------------------- /examples/dataset/Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: html 2 | html: 3 | pweave -f pandoc2html dataset.pmd 4 | 5 | .PHONY: rst 6 | rst: 7 | pweave -f markdown dataset.pmd 8 | pandoc -s -t rst dataset.md -o dataset.rst 9 | -------------------------------------------------------------------------------- /examples/dataset/dataset.pmd: -------------------------------------------------------------------------------- 1 | # Building datasets 2 | 3 | In this example, we'll use LightFM's built-in `Dataset` class to build an interaction dataset from raw data. The goal is to demonstrate how to go from raw data (lists of interactions and perhaps item and user features) to `scipy.sparse` matrices that can be used to fit a LightFM model. 
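Before diving in, here is a condensed sketch of the whole workflow on made-up toy IDs, purely to show the shape of the API. Every call is walked through step by step on the real data below; the `interactions_raw` and `features_raw` names are invented for this preview only.

```python
from lightfm import LightFM
from lightfm.data import Dataset

# Toy raw data: (user id, item id) pairs and (item id, [item features]) pairs.
interactions_raw = [('u1', 'i1'), ('u1', 'i2'), ('u2', 'i2')]
features_raw = [('i1', ['author_a']), ('i2', ['author_b'])]

# 1. Build the internal ID mappings.
dataset = Dataset()
dataset.fit(
    users=(user for user, _ in interactions_raw),
    items=(item for _, item in interactions_raw),
    item_features=(feat for _, feats in features_raw for feat in feats),
)

# 2. Build the sparse interaction and item feature matrices.
(interactions, weights) = dataset.build_interactions(interactions_raw)
item_features = dataset.build_item_features(features_raw)

# 3. Fit a LightFM model on them.
model = LightFM(loss='bpr')
model.fit(interactions, item_features=item_features)
```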
4 | 5 | ## Getting the data 6 | We're going to use a sample from [Goodbooks-10k](https://github.com/zygmuntz/goodbooks-10k) as our example dataset. Let's download the data first. 7 | 8 | ```{python, source="download.py", echo=True} 9 | ``` 10 | 11 | The data consists of book ratings and book details: 12 | ```python 13 | import json 14 | from itertools import islice 15 | 16 | ratings, book_features = get_data() 17 | ``` 18 | 19 | Ratings look like this: 20 | ```python 21 | for line in islice(ratings, 2): 22 | print(json.dumps(line, indent=4)) 23 | ``` 24 | and book features look like this: 25 | ```python 26 | for line in islice(book_features, 1): 27 | print(json.dumps(line, indent=4)) 28 | ``` 29 | 30 | ## Building the ID mappings 31 | The first thing we need to do is to create a mapping between the user and item ids from our input data to indices that will be used internally by our model. 32 | 33 | We do this because LightFM works with user and item ids that are consecutive non-negative integers. The `Dataset` class allow us to create a mapping between the IDs we use in our systems and the consecutive indices preferred by the model. 34 | 35 | To do this, we create a dataset and call its `fit` method. The first argument is an iterable of all user ids in our data, and the second is an iterable of all item ids. In this case, we use generator expressions to lazily iterate over our data and yield user and item ids: 36 | ```python 37 | from lightfm.data import Dataset 38 | 39 | dataset = Dataset() 40 | dataset.fit((x['User-ID'] for x in get_ratings()), 41 | (x['ISBN'] for x in get_ratings())) 42 | ``` 43 | 44 | This call will assign an internal numerical id to every user and item id we pass in. These will be contiguous (from 0 to however many users and items we have), and will also determine the dimensions of the resulting LightFM model. 45 | 46 | We can check that the mappings have been created by querying the dataset on how many users and books it knows about: 47 | ```python 48 | num_users, num_items = dataset.interactions_shape() 49 | print('Num users: {}, num_items {}.'.format(num_users, num_items)) 50 | ``` 51 | 52 | Note that if we don't have all user and items ids at once, we can repeatedly call `fit_partial` to supply additional ids. In this case, we will use this capability to add some item feature mappings: 53 | ```python 54 | dataset.fit_partial(items=(x['ISBN'] for x in get_book_features()), 55 | item_features=(x['Book-Author'] for x in get_book_features())) 56 | ``` 57 | This will create a feature for every unique author name in the dataset. 58 | 59 | (Note that we fit some more item ids: this is to make sure our mappings are complete even if there are items in the features dataset that are not in the interactions set.) 60 | 61 | ## Building the interactions matrix 62 | Having created the mapping, we build the interaction matrix: 63 | ```python 64 | (interactions, weights) = dataset.build_interactions(((x['User-ID'], x['ISBN']) 65 | for x in get_ratings())) 66 | 67 | print(repr(interactions)) 68 | ``` 69 | 70 | This is main input into a LightFM model: it encodes the interactions betwee users and items. 
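The second element of the tuple, `weights`, is a sparse matrix with the same shape as `interactions`. Because we only supplied `(user id, item id)` pairs above, it simply contains ones. As a sketch that goes beyond this example, `build_interactions` also accepts `(user id, item id, weight)` triples, so the raw `Book-Rating` values could be used as per-interaction weights and passed to the model via `sample_weight`:

```python
from lightfm import LightFM

# Sketch only: weight each interaction by its raw rating instead of 1.0.
(interactions, weights) = dataset.build_interactions(
    (x['User-ID'], x['ISBN'], float(x['Book-Rating'])) for x in get_ratings()
)

model = LightFM(loss='bpr')
model.fit(interactions, sample_weight=weights)
```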
71 | 72 | Since we have item features, we can also create the item features matrix: 73 | ```python 74 | item_features = dataset.build_item_features(((x['ISBN'], [x['Book-Author']]) 75 | for x in get_book_features())) 76 | print(repr(item_features)) 77 | ``` 78 | 79 | ## Building a model 80 | This is all we need to build a LightFM model: 81 | ```python 82 | from lightfm import LightFM 83 | 84 | model = LightFM(loss='bpr') 85 | model.fit(interactions, item_features=item_features) 86 | ``` 87 | -------------------------------------------------------------------------------- /examples/dataset/download.py: -------------------------------------------------------------------------------- 1 | import os 2 | import zipfile 3 | import csv 4 | 5 | import requests 6 | 7 | 8 | def _download(url: str, dest_path: str): 9 | 10 | req = requests.get(url, stream=True) 11 | req.raise_for_status() 12 | 13 | with open(dest_path, "wb") as fd: 14 | for chunk in req.iter_content(chunk_size=2**20): 15 | fd.write(chunk) 16 | 17 | 18 | def get_data(): 19 | 20 | ratings_url = ( 21 | "http://www2.informatik.uni-freiburg.de/" "~cziegler/BX/BX-CSV-Dump.zip" 22 | ) 23 | 24 | if not os.path.exists("data"): 25 | os.makedirs("data") 26 | 27 | _download(ratings_url, "data/data.zip") 28 | 29 | with zipfile.ZipFile("data/data.zip") as archive: 30 | return ( 31 | csv.DictReader( 32 | ( 33 | x.decode("utf-8", "ignore") 34 | for x in archive.open("BX-Book-Ratings.csv") 35 | ), 36 | delimiter=";", 37 | ), 38 | csv.DictReader( 39 | (x.decode("utf-8", "ignore") for x in archive.open("BX-Books.csv")), 40 | delimiter=";", 41 | ), 42 | ) 43 | 44 | 45 | def get_ratings(): 46 | 47 | return get_data()[0] 48 | 49 | 50 | def get_book_features(): 51 | 52 | return get_data()[1] 53 | -------------------------------------------------------------------------------- /examples/dataset/readme.rst: -------------------------------------------------------------------------------- 1 | Building datasets 2 | ================= 3 | 4 | In this example, we'll use LightFM's built-in ``Dataset`` class to build 5 | an interaction dataset from raw data. The goal is to demonstrate how to 6 | go from raw data (lists of interactions and perhaps item and user 7 | features) to ``scipy.sparse`` matrices that can be used to fit a LightFM 8 | model. 9 | 10 | Getting the data 11 | ---------------- 12 | 13 | We're going to use a sample from 14 | `Goodbooks-10k `__ as our 15 | example dataset. Let's download the data first. 16 | 17 | .. 
code:: python 18 | 19 | import os 20 | import zipfile 21 | import csv 22 | 23 | import requests 24 | 25 | 26 | def _download(url: str, dest_path: str): 27 | 28 | req = requests.get(url, stream=True) 29 | req.raise_for_status() 30 | 31 | with open(dest_path, "wb") as fd: 32 | for chunk in req.iter_content(chunk_size=2 ** 20): 33 | fd.write(chunk) 34 | 35 | 36 | def get_data(): 37 | 38 | ratings_url = ("http://www2.informatik.uni-freiburg.de/" "~cziegler/BX/BX-CSV-Dump.zip") 39 | 40 | if not os.path.exists("data"): 41 | os.makedirs("data") 42 | 43 | _download(ratings_url, "data/data.zip") 44 | 45 | with zipfile.ZipFile("data/data.zip") as archive: 46 | return ( 47 | csv.DictReader( 48 | (x.decode("utf-8", "ignore") for x in archive.open("BX-Book-Ratings.csv")), 49 | delimiter=";", 50 | ), 51 | csv.DictReader( 52 | (x.decode("utf-8", "ignore") for x in archive.open("BX-Books.csv")), delimiter=";" 53 | ), 54 | ) 55 | 56 | 57 | def get_ratings(): 58 | 59 | return get_data()[0] 60 | 61 | 62 | def get_book_features(): 63 | 64 | return get_data()[1] 65 | 66 | The data consists of book ratings and book details: 67 | 68 | .. code:: python 69 | 70 | import json 71 | from itertools import islice 72 | 73 | ratings, book_features = get_data() 74 | 75 | Ratings look like this: 76 | 77 | .. code:: python 78 | 79 | for line in islice(ratings, 2): 80 | print(json.dumps(line, indent=4)) 81 | 82 | :: 83 | 84 | { 85 | "User-ID": "276725", 86 | "ISBN": "034545104X", 87 | "Book-Rating": "0" 88 | } 89 | { 90 | "User-ID": "276726", 91 | "ISBN": "0155061224", 92 | "Book-Rating": "5" 93 | } 94 | 95 | and book features look like this: 96 | 97 | .. code:: python 98 | 99 | for line in islice(book_features, 1): 100 | print(json.dumps(line, indent=4)) 101 | 102 | :: 103 | 104 | { 105 | "ISBN": "0195153448", 106 | "Book-Title": "Classical Mythology", 107 | "Book-Author": "Mark P. O. Morford", 108 | "Year-Of-Publication": "2002", 109 | "Publisher": "Oxford University Press", 110 | "Image-URL-S": 111 | "http://images.amazon.com/images/P/0195153448.01.THUMBZZZ.jpg", 112 | "Image-URL-M": 113 | "http://images.amazon.com/images/P/0195153448.01.MZZZZZZZ.jpg", 114 | "Image-URL-L": 115 | "http://images.amazon.com/images/P/0195153448.01.LZZZZZZZ.jpg" 116 | } 117 | 118 | Building the ID mappings 119 | ------------------------ 120 | 121 | The first thing we need to do is to create a mapping between the user 122 | and item ids from our input data to indices that will be used internally 123 | by our model. 124 | 125 | We do this because LightFM works with user and item ids that are 126 | consecutive non-negative integers. The ``Dataset`` class allow us to 127 | create a mapping between the IDs we use in our systems and the 128 | consecutive indices preferred by the model. 129 | 130 | To do this, we create a dataset and call its ``fit`` method. The first 131 | argument is an iterable of all user ids in our data, and the second is 132 | an iterable of all item ids. In this case, we use generator expressions 133 | to lazily iterate over our data and yield user and item ids: 134 | 135 | .. code:: python 136 | 137 | from lightfm.data import Dataset 138 | 139 | dataset = Dataset() 140 | dataset.fit((x['User-ID'] for x in get_ratings()), 141 | (x['ISBN'] for x in get_ratings())) 142 | 143 | This call will assign an internal numerical id to every user and item id 144 | we pass in. These will be contiguous (from 0 to however many users and 145 | items we have), and will also determine the dimensions of the resulting 146 | LightFM model. 
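If we ever need to translate between the raw IDs and these internal indices
(for instance, to interpret predictions later on), the learned mappings can be
retrieved with the ``mapping`` method. This is not needed for the rest of the
example; it is just a quick sketch of how to inspect them:

.. code:: python

    user_id_map, user_feature_map, item_id_map, item_feature_map = dataset.mapping()

    # Internal index assigned to one of the raw user IDs passed to fit().
    print(user_id_map['276725'])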
147 | 148 | We can check that the mappings have been created by querying the dataset 149 | on how many users and books it knows about: 150 | 151 | .. code:: python 152 | 153 | num_users, num_items = dataset.interactions_shape() 154 | print('Num users: {}, num_items {}.'.format(num_users, num_items)) 155 | 156 | :: 157 | 158 | Num users: 105283, num_items 340553. 159 | 160 | Note that if we don't have all user and items ids at once, we can 161 | repeatedly call ``fit_partial`` to supply additional ids. In this case, 162 | we will use this capability to add some item feature mappings: 163 | 164 | .. code:: python 165 | 166 | dataset.fit_partial(items=(x['ISBN'] for x in get_book_features()), 167 | item_features=(x['Book-Author'] for x in get_book_features())) 168 | 169 | This will create a feature for every unique author name in the dataset. 170 | 171 | (Note that we fit some more item ids: this is to make sure our mappings 172 | are complete even if there are items in the features dataset that are 173 | not in the interactions set.) 174 | 175 | Building the interactions matrix 176 | -------------------------------- 177 | 178 | Having created the mapping, we build the interaction matrix: 179 | 180 | .. code:: python 181 | 182 | (interactions, weights) = dataset.build_interactions(((x['User-ID'], x['ISBN']) 183 | for x in get_ratings())) 184 | 185 | print(repr(interactions)) 186 | 187 | :: 188 | 189 | <105283x341762 sparse matrix of type '' 190 | with 1149780 stored elements in COOrdinate format> 191 | 192 | This is main input into a LightFM model: it encodes the interactions 193 | between users and items. 194 | 195 | Since we have item features, we can also create the item features 196 | matrix: 197 | 198 | .. code:: python 199 | 200 | item_features = dataset.build_item_features(((x['ISBN'], [x['Book-Author']]) 201 | for x in get_book_features())) 202 | print(repr(item_features)) 203 | 204 | :: 205 | 206 | <341762x443805 sparse matrix of type '' 207 | with 613141 stored elements in Compressed Sparse Row format> 208 | 209 | Building a model 210 | ---------------- 211 | 212 | This is all we need to build a LightFM model: 213 | 214 | .. code:: python 215 | 216 | from lightfm import LightFM 217 | 218 | model = LightFM(loss='bpr') 219 | model.fit(interactions, item_features=item_features) 220 | 221 | :: 222 | 223 | 224 | -------------------------------------------------------------------------------- /examples/movielens/data.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import os 3 | import zipfile 4 | 5 | import numpy as np 6 | 7 | import requests 8 | 9 | import scipy.sparse as sp 10 | 11 | 12 | def _get_movielens_path(): 13 | """ 14 | Get path to the movielens dataset file. 15 | """ 16 | 17 | return os.path.join(os.path.dirname(os.path.abspath(__file__)), "movielens.zip") 18 | 19 | 20 | def _download_movielens(dest_path): 21 | """ 22 | Download the dataset. 23 | """ 24 | 25 | url = "http://files.grouplens.org/datasets/movielens/ml-100k.zip" 26 | req = requests.get(url, stream=True) 27 | 28 | with open(dest_path, "wb") as fd: 29 | for chunk in req.iter_content(): 30 | fd.write(chunk) 31 | 32 | 33 | def _get_raw_movielens_data(): 34 | """ 35 | Return the raw lines of the train and test files. 
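
    The result is a (train_lines, test_lines) tuple; each element is a list of
    tab-separated "uid iid rating timestamp" strings read from ml-100k/ua.base
    and ml-100k/ua.test respectively.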
36 | """ 37 | 38 | path = _get_movielens_path() 39 | 40 | if not os.path.isfile(path): 41 | _download_movielens(path) 42 | 43 | with zipfile.ZipFile(path) as datafile: 44 | return ( 45 | datafile.read("ml-100k/ua.base").decode().split("\n"), 46 | datafile.read("ml-100k/ua.test").decode().split("\n"), 47 | ) 48 | 49 | 50 | def _parse(data): 51 | """ 52 | Parse movielens dataset lines. 53 | """ 54 | 55 | for line in data: 56 | 57 | if not line: 58 | continue 59 | 60 | uid, iid, rating, timestamp = [int(x) for x in line.split("\t")] 61 | 62 | yield uid, iid, rating, timestamp 63 | 64 | 65 | def _build_interaction_matrix(rows, cols, data): 66 | """ 67 | Build the training matrix (no_users, no_items), 68 | with ratings >= 4.0 being marked as positive and 69 | the rest as negative. 70 | """ 71 | 72 | mat = sp.lil_matrix((rows, cols), dtype=np.int32) 73 | 74 | for uid, iid, rating, timestamp in data: 75 | if rating >= 4.0: 76 | mat[uid, iid] = 1.0 77 | else: 78 | mat[uid, iid] = -1.0 79 | 80 | return mat.tocoo() 81 | 82 | 83 | def _get_movie_raw_metadata(): 84 | """ 85 | Get raw lines of the genre file. 86 | """ 87 | 88 | path = _get_movielens_path() 89 | 90 | if not os.path.isfile(path): 91 | _download_movielens(path) 92 | 93 | with zipfile.ZipFile(path) as datafile: 94 | return datafile.read("ml-100k/u.item").decode(errors="ignore").split("\n") 95 | 96 | 97 | def get_movielens_item_metadata(use_item_ids): 98 | """ 99 | Build a matrix of genre features (no_items, no_features). 100 | 101 | If use_item_ids is True, per-item features will also be used. 102 | """ 103 | 104 | features = {} 105 | genre_set = set() 106 | 107 | for line in _get_movie_raw_metadata(): 108 | 109 | if not line: 110 | continue 111 | 112 | splt = line.split("|") 113 | item_id = int(splt[0]) 114 | 115 | genres = [ 116 | idx for idx, val in zip(range(len(splt[5:])), splt[5:]) if int(val) > 0 117 | ] 118 | 119 | if use_item_ids: 120 | # Add item-specific features too 121 | genres.append(item_id) 122 | 123 | for genre_id in genres: 124 | genre_set.add(genre_id) 125 | 126 | features[item_id] = genres 127 | 128 | mat = sp.lil_matrix((len(features) + 1, len(genre_set)), dtype=np.int32) 129 | 130 | for item_id, genre_ids in features.items(): 131 | for genre_id in genre_ids: 132 | mat[item_id, genre_id] = 1 133 | 134 | return mat 135 | 136 | 137 | def get_movielens_data(): 138 | """ 139 | Return (train_interactions, test_interactions). 
140 | """ 141 | 142 | train_data, test_data = _get_raw_movielens_data() 143 | 144 | uids = set() 145 | iids = set() 146 | 147 | for uid, iid, rating, timestamp in itertools.chain( 148 | _parse(train_data), _parse(test_data) 149 | ): 150 | uids.add(uid) 151 | iids.add(iid) 152 | 153 | rows = max(uids) + 1 154 | cols = max(iids) + 1 155 | 156 | return ( 157 | _build_interaction_matrix(rows, cols, _parse(train_data)), 158 | _build_interaction_matrix(rows, cols, _parse(test_data)), 159 | ) 160 | -------------------------------------------------------------------------------- /examples/movielens/example.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# An implicit feedback recommender for the Movielens dataset" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": { 13 | "collapsed": true 14 | }, 15 | "source": [ 16 | "## Implicit feedback\n", 17 | "For some time, the recommender system literature focused on explicit feedback: the Netflix prize focused on accurately reproducing the ratings users have given to movies they watched.\n", 18 | "\n", 19 | "Focusing on ratings in this way ignored the importance of taking into account which movies the users chose to watch in the first place, and treating the absence of ratings as absence of information.\n", 20 | "\n", 21 | "But the things that we don't have ratings for aren't unknowns: we know the user didn't pick them. This reflects a user's conscious choice, and is a good source of information on what she thinks she might like. \n", 22 | "\n", 23 | "This sort of phenomenon is described as data which is missing-not-at-random in the literature: the ratings that are missing are more likely to be negative precisely because the user chooses which items to rate. When choosing a restaurant, you only go to places which you think you'll enjoy, and never go to places that you think you'll hate. What this leads to is that you're only going to be submitting ratings for things which, a priori, you expected to like; the things that you expect you will not like you will never rate.\n", 24 | "\n", 25 | "This observation has led to the development of models that are suitable for implicit feedback. LightFM implements two that have proven particular successful:\n", 26 | "\n", 27 | "- BPR: Bayesian Personalised Ranking [1] pairwise loss. Maximises the prediction difference between a positive example and a randomly chosen negative example. Useful when only positive interactions are present and optimising ROC AUC is desired.\n", 28 | "- WARP: Weighted Approximate-Rank Pairwise [2] loss. Maximises the rank of positive examples by repeatedly sampling negative examples until rank violating one is found. Useful when only positive interactions are present and optimising the top of the recommendation list (precision@k) is desired.\n", 29 | "\n", 30 | "This example shows how to estimate these models on the Movielens dataset.\n", 31 | "\n", 32 | "[1] Rendle, Steffen, et al. \"BPR: Bayesian personalized ranking from implicit feedback.\" Proceedings of the Twenty-Fifth Conference on Uncertainty in Artificial Intelligence. AUAI Press, 2009.\n", 33 | "\n", 34 | "[2] Weston, Jason, Samy Bengio, and Nicolas Usunier. \"Wsabie: Scaling up to large vocabulary image annotation.\" IJCAI. Vol. 11. 
2011.\n", 35 | "\n", 36 | "\n", 37 | "## Getting the data\n", 38 | "The first step is to get the [Movielens data](http://grouplens.org/datasets/movielens/100k/). This is a classic small recommender dataset, consisting of around 950 users, 1700 movies, and 100,000 ratings. The ratings are on a scale from 1 to 5, but we'll all treat them as implicit positive feedback in this example.\n", 39 | "\n", 40 | "\n", 41 | "Fortunately, this is one of the functions provided by LightFM itself." 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 1, 47 | "metadata": { 48 | "collapsed": true 49 | }, 50 | "outputs": [], 51 | "source": [ 52 | "import numpy as np\n", 53 | "\n", 54 | "from lightfm.datasets import fetch_movielens\n", 55 | "\n", 56 | "movielens = fetch_movielens()" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "This gives us a dictionary with the following fields:" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 2, 69 | "metadata": { 70 | "collapsed": false 71 | }, 72 | "outputs": [ 73 | { 74 | "name": "stdout", 75 | "output_type": "stream", 76 | "text": [ 77 | "('test', , (943, 1682))\n", 78 | "('item_features', , (1682, 1682))\n", 79 | "('train', , (943, 1682))\n", 80 | "('item_labels', , (1682,))\n", 81 | "('item_feature_labels', , (1682,))\n" 82 | ] 83 | } 84 | ], 85 | "source": [ 86 | "for key, value in movielens.items():\n", 87 | " print(key, type(value), value.shape)" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 3, 93 | "metadata": { 94 | "collapsed": true 95 | }, 96 | "outputs": [], 97 | "source": [ 98 | "train = movielens['train']\n", 99 | "test = movielens['test']" 100 | ] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "metadata": {}, 105 | "source": [ 106 | "The `train` and `test` elements are the most important: they contain the raw rating data, split into a train and a test set. Each row represents a user, and each column an item. Entries are ratings from 1 to 5." 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": {}, 112 | "source": [ 113 | "## Fitting models\n", 114 | "\n", 115 | "Now let's train a BPR model and look at its accuracy.\n", 116 | "\n", 117 | "We'll use two metrics of accuracy: precision@k and ROC AUC. Both are ranking metrics: to compute them, we'll be constructing recommendation lists for all of our users, and checking the ranking of known positive movies. For precision at k we'll be looking at whether they are within the first k results on the list; for AUC, we'll be calculating the probability that any known positive is higher on the list than a random negative example." 
118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 4, 123 | "metadata": { 124 | "collapsed": false 125 | }, 126 | "outputs": [ 127 | { 128 | "name": "stdout", 129 | "output_type": "stream", 130 | "text": [ 131 | "Precision: train 0.59, test 0.10.\n", 132 | "AUC: train 0.90, test 0.86.\n" 133 | ] 134 | } 135 | ], 136 | "source": [ 137 | "from lightfm import LightFM\n", 138 | "from lightfm.evaluation import precision_at_k\n", 139 | "from lightfm.evaluation import auc_score\n", 140 | "\n", 141 | "model = LightFM(learning_rate=0.05, loss='bpr')\n", 142 | "model.fit(train, epochs=10)\n", 143 | "\n", 144 | "train_precision = precision_at_k(model, train, k=10).mean()\n", 145 | "test_precision = precision_at_k(model, test, k=10, train_interactions=train).mean()\n", 146 | "\n", 147 | "train_auc = auc_score(model, train).mean()\n", 148 | "test_auc = auc_score(model, test, train_interactions=train).mean()\n", 149 | "\n", 150 | "print('Precision: train %.2f, test %.2f.' % (train_precision, test_precision))\n", 151 | "print('AUC: train %.2f, test %.2f.' % (train_auc, test_auc))" 152 | ] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "metadata": {}, 157 | "source": [ 158 | "The WARP model, on the other hand, optimises for precision@k---we should expect its performance to be better on precision." 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": 5, 164 | "metadata": { 165 | "collapsed": false 166 | }, 167 | "outputs": [ 168 | { 169 | "name": "stdout", 170 | "output_type": "stream", 171 | "text": [ 172 | "Precision: train 0.61, test 0.11.\n", 173 | "AUC: train 0.93, test 0.90.\n" 174 | ] 175 | } 176 | ], 177 | "source": [ 178 | "model = LightFM(learning_rate=0.05, loss='warp')\n", 179 | "\n", 180 | "model.fit_partial(train, epochs=10)\n", 181 | "\n", 182 | "train_precision = precision_at_k(model, train, k=10).mean()\n", 183 | "test_precision = precision_at_k(model, test, k=10, train_interactions=train).mean()\n", 184 | "\n", 185 | "train_auc = auc_score(model, train).mean()\n", 186 | "test_auc = auc_score(model, test, train_interactions=train).mean()\n", 187 | "\n", 188 | "print('Precision: train %.2f, test %.2f.' % (train_precision, test_precision))\n", 189 | "print('AUC: train %.2f, test %.2f.' % (train_auc, test_auc))" 190 | ] 191 | }, 192 | { 193 | "cell_type": "markdown", 194 | "metadata": {}, 195 | "source": [ 196 | "And that is exactly what we see: we get slightly higher precision@10 (but the AUC metric is also improved)." 
197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": null, 202 | "metadata": { 203 | "collapsed": true 204 | }, 205 | "outputs": [], 206 | "source": [] 207 | } 208 | ], 209 | "metadata": { 210 | "kernelspec": { 211 | "display_name": "Python 2", 212 | "language": "python", 213 | "name": "python2" 214 | }, 215 | "language_info": { 216 | "codemirror_mode": { 217 | "name": "ipython", 218 | "version": 2 219 | }, 220 | "file_extension": ".py", 221 | "mimetype": "text/x-python", 222 | "name": "python", 223 | "nbconvert_exporter": "python", 224 | "pygments_lexer": "ipython2", 225 | "version": "2.7.12" 226 | }, 227 | "widgets": { 228 | "state": {}, 229 | "version": "1.1.2" 230 | } 231 | }, 232 | "nbformat": 4, 233 | "nbformat_minor": 0 234 | } 235 | -------------------------------------------------------------------------------- /examples/movielens/readme.md: -------------------------------------------------------------------------------- 1 | 2 | # An implicit feedback recommender for the Movielens dataset 3 | 4 | ## Implicit feedback 5 | For some time, the recommender system literature focused on explicit feedback: the Netflix prize focused on accurately reproducing the ratings users have given to movies they watched. 6 | 7 | Focusing on ratings in this way ignored the importance of taking into account which movies the users chose to watch in the first place, and treating the absence of ratings as absence of information. 8 | 9 | But the things that we don't have ratings for aren't unknowns: we know the user didn't pick them. This reflects a user's conscious choice, and is a good source of information on what she thinks she might like. 10 | 11 | This sort of phenomenon is described as data which is missing-not-at-random in the literature: the ratings that are missing are more likely to be negative precisely because the user chooses which items to rate. When choosing a restaurant, you only go to places which you think you'll enjoy, and never go to places that you think you'll hate. What this leads to is that you're only going to be submitting ratings for things which, a priori, you expected to like; the things that you expect you will not like you will never rate. 12 | 13 | This observation has led to the development of models that are suitable for implicit feedback. LightFM implements two that have proven particular successful: 14 | 15 | - BPR: Bayesian Personalised Ranking [1] pairwise loss. Maximises the prediction difference between a positive example and a randomly chosen negative example. Useful when only positive interactions are present and optimising ROC AUC is desired. 16 | - WARP: Weighted Approximate-Rank Pairwise [2] loss. Maximises the rank of positive examples by repeatedly sampling negative examples until rank violating one is found. Useful when only positive interactions are present and optimising the top of the recommendation list (precision@k) is desired. 17 | 18 | This example shows how to estimate these models on the Movielens dataset. 19 | 20 | [1] Rendle, Steffen, et al. "BPR: Bayesian personalized ranking from implicit feedback." Proceedings of the Twenty-Fifth Conference on Uncertainty in Artificial Intelligence. AUAI Press, 2009. 21 | 22 | [2] Weston, Jason, Samy Bengio, and Nicolas Usunier. "Wsabie: Scaling up to large vocabulary image annotation." IJCAI. Vol. 11. 2011. 23 | 24 | 25 | ## Getting the data 26 | The first step is to get the [Movielens data](http://grouplens.org/datasets/movielens/100k/). 
This is a classic small recommender dataset, consisting of around 950 users, 1700 movies, and 100,000 ratings. The ratings are on a scale from 1 to 5, but we'll all treat them as implicit positive feedback in this example. 27 | 28 | 29 | Fortunately, this is one of the functions provided by LightFM itself. 30 | 31 | 32 | ```python 33 | import numpy as np 34 | 35 | from lightfm.datasets import fetch_movielens 36 | 37 | movielens = fetch_movielens() 38 | ``` 39 | 40 | This gives us a dictionary with the following fields: 41 | 42 | 43 | ```python 44 | for key, value in movielens.items(): 45 | print(key, type(value), value.shape) 46 | ``` 47 | 48 | ('test', , (943, 1682)) 49 | ('item_features', , (1682, 1682)) 50 | ('train', , (943, 1682)) 51 | ('item_labels', , (1682,)) 52 | ('item_feature_labels', , (1682,)) 53 | 54 | 55 | 56 | ```python 57 | train = movielens['train'] 58 | test = movielens['test'] 59 | ``` 60 | 61 | The `train` and `test` elements are the most important: they contain the raw rating data, split into a train and a test set. Each row represents a user, and each column an item. Entries are ratings from 1 to 5. 62 | 63 | ## Fitting models 64 | 65 | Now let's train a BPR model and look at its accuracy. 66 | 67 | We'll use two metrics of accuracy: precision@k and ROC AUC. Both are ranking metrics: to compute them, we'll be constructing recommendation lists for all of our users, and checking the ranking of known positive movies. For precision at k we'll be looking at whether they are within the first k results on the list; for AUC, we'll be calculating the probability that any known positive is higher on the list than a random negative example. 68 | 69 | 70 | ```python 71 | from lightfm import LightFM 72 | from lightfm.evaluation import precision_at_k 73 | from lightfm.evaluation import auc_score 74 | 75 | model = LightFM(learning_rate=0.05, loss='bpr') 76 | model.fit(train, epochs=10) 77 | 78 | train_precision = precision_at_k(model, train, k=10).mean() 79 | test_precision = precision_at_k(model, test, k=10).mean() 80 | 81 | train_auc = auc_score(model, train).mean() 82 | test_auc = auc_score(model, test).mean() 83 | 84 | print('Precision: train %.2f, test %.2f.' % (train_precision, test_precision)) 85 | print('AUC: train %.2f, test %.2f.' % (train_auc, test_auc)) 86 | ``` 87 | 88 | Precision: train 0.59, test 0.10. 89 | AUC: train 0.90, test 0.86. 90 | 91 | 92 | The WARP model, on the other hand, optimises for precision@k---we should expect its performance to be better on precision. 93 | 94 | 95 | ```python 96 | model = LightFM(learning_rate=0.05, loss='warp') 97 | 98 | model.fit_partial(train, epochs=10) 99 | 100 | train_precision = precision_at_k(model, train, k=10).mean() 101 | test_precision = precision_at_k(model, test, k=10).mean() 102 | 103 | train_auc = auc_score(model, train).mean() 104 | test_auc = auc_score(model, test).mean() 105 | 106 | print('Precision: train %.2f, test %.2f.' % (train_precision, test_precision)) 107 | print('AUC: train %.2f, test %.2f.' % (train_auc, test_auc)) 108 | ``` 109 | 110 | Precision: train 0.61, test 0.11. 111 | AUC: train 0.93, test 0.90. 112 | 113 | 114 | And that is exactly what we see: we get slightly higher precision@10 (but the AUC metric is also improved). 
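As a quick sanity check on what the model has learned, we can also look at similarities between items in the learned embedding space. The sketch below is illustrative rather than part of the original example: it assumes the default indicator item features (so the rows of `model.item_embeddings` line up with item ids) and computes cosine similarities by hand.

```python
def similar_movies(model, item_labels, movie_id, n=5):
    # Normalise the learned item embeddings so that dot products
    # are cosine similarities.
    embeddings = model.item_embeddings
    normed = embeddings / np.linalg.norm(embeddings, axis=1)[:, np.newaxis]

    scores = normed @ normed[movie_id]
    best = np.argsort(-scores)[1:n + 1]  # skip the movie itself

    return item_labels[best]

print(similar_movies(model, movielens['item_labels'], movie_id=0))
```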
115 | -------------------------------------------------------------------------------- /examples/quickstart/quickstart.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Quickstart\n", 8 | "In this example, we'll build an implicit feedback recommender using the Movielens 100k dataset (http://grouplens.org/datasets/movielens/100k/).\n", 9 | "\n", 10 | "The code behind this example is available as a [Jupyter notebook](https://github.com/lyst/lightfm/tree/master/examples/quickstart/quickstart.ipynb)\n", 11 | "\n", 12 | "LightFM includes functions for getting and processing this dataset, so obtaining it is quite easy." 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 1, 18 | "metadata": { 19 | "collapsed": false 20 | }, 21 | "outputs": [], 22 | "source": [ 23 | "import numpy as np\n", 24 | "\n", 25 | "from lightfm.datasets import fetch_movielens\n", 26 | "\n", 27 | "data = fetch_movielens(min_rating=5.0)" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "This downloads the dataset and automatically pre-processes it into sparse matrices suitable for further calculation. In particular, it prepares the sparse user-item matrices, containing positive entries where a user interacted with a product, and zeros otherwise.\n", 35 | "\n", 36 | "We have two such matrices, a training and a testing set. Both have around 1000 users and 1700 items. We'll train the model on the train matrix but test it on the test matrix." 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 2, 42 | "metadata": { 43 | "collapsed": false 44 | }, 45 | "outputs": [ 46 | { 47 | "name": "stdout", 48 | "output_type": "stream", 49 | "text": [ 50 | "<943x1682 sparse matrix of type ''\n", 51 | "\twith 19048 stored elements in COOrdinate format>\n", 52 | "<943x1682 sparse matrix of type ''\n", 53 | "\twith 2153 stored elements in COOrdinate format>\n" 54 | ] 55 | } 56 | ], 57 | "source": [ 58 | "print(repr(data['train']))\n", 59 | "print(repr(data['test']))" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [ 66 | "We need to import the model class to fit the model:" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 3, 72 | "metadata": { 73 | "collapsed": true 74 | }, 75 | "outputs": [], 76 | "source": [ 77 | "from lightfm import LightFM" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "metadata": {}, 83 | "source": [ 84 | "We're going to use the WARP (Weighted Approximate-Rank Pairwise) model. WARP is an implicit feedback model: all interactions in the training matrix are treated as positive signals, and products that users did not interact with they implicitly do not like. The goal of the model is to score these implicit positives highly while assigining low scores to implicit negatives.\n", 85 | "\n", 86 | "Model training is accomplished via SGD (stochastic gradient descent). This means that for every pass through the data --- an epoch --- the model learns to fit the data more and more closely. We'll run it for 30 epochs in this example. We can also run it on multiple cores, so we'll set that to 2. 
(The dataset in this example is too small for that to make a difference, but it will matter on bigger datasets.)" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 57, 92 | "metadata": { 93 | "collapsed": false 94 | }, 95 | "outputs": [ 96 | { 97 | "name": "stdout", 98 | "output_type": "stream", 99 | "text": [ 100 | "CPU times: user 1.55 s, sys: 4 ms, total: 1.56 s\n", 101 | "Wall time: 838 ms\n" 102 | ] 103 | }, 104 | { 105 | "data": { 106 | "text/plain": [ 107 | "" 108 | ] 109 | }, 110 | "execution_count": 57, 111 | "metadata": {}, 112 | "output_type": "execute_result" 113 | } 114 | ], 115 | "source": [ 116 | "model = LightFM(loss='warp')\n", 117 | "%time model.fit(data['train'], epochs=30, num_threads=2)" 118 | ] 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "metadata": {}, 123 | "source": [ 124 | "Done! We should now evaluate the model to see how well it's doing. We're most interested in how good the ranking produced by the model is. Precision@k is one suitable metric, expressing the percentage of top k items in the ranking the user has actually interacted with. `lightfm` implements a number of metrics in the `evaluation` module. " 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": 55, 130 | "metadata": { 131 | "collapsed": true 132 | }, 133 | "outputs": [], 134 | "source": [ 135 | "from lightfm.evaluation import precision_at_k" 136 | ] 137 | }, 138 | { 139 | "cell_type": "markdown", 140 | "metadata": {}, 141 | "source": [ 142 | "We'll measure precision in both the train and the test set." 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 58, 148 | "metadata": { 149 | "collapsed": false 150 | }, 151 | "outputs": [ 152 | { 153 | "name": "stdout", 154 | "output_type": "stream", 155 | "text": [ 156 | "Train precision: 0.43\n", 157 | "Test precision: 0.04\n" 158 | ] 159 | } 160 | ], 161 | "source": [ 162 | "print(\"Train precision: %.2f\" % precision_at_k(model, data['train'], k=5).mean())\n", 163 | "print(\"Test precision: %.2f\" % precision_at_k(model, data['test'], k=5).mean())" 164 | ] 165 | }, 166 | { 167 | "cell_type": "markdown", 168 | "metadata": {}, 169 | "source": [ 170 | "Unsurprisingly, the model fits the train set better than the test set.\n", 171 | "\n", 172 | "For an alternative way of judging the model, we can sample a couple of users and get their recommendations. To make predictions for given user, we pass the id of that user and the ids of all products we want predictions for into the `predict` method." 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": 60, 178 | "metadata": { 179 | "collapsed": false 180 | }, 181 | "outputs": [ 182 | { 183 | "name": "stdout", 184 | "output_type": "stream", 185 | "text": [ 186 | "User 3\n", 187 | " Known positives:\n", 188 | " Contact (1997)\n", 189 | " Air Force One (1997)\n", 190 | " In & Out (1997)\n", 191 | " Recommended:\n", 192 | " Air Force One (1997)\n", 193 | " Assignment, The (1997)\n", 194 | " Kiss the Girls (1997)\n", 195 | "User 25\n", 196 | " Known positives:\n", 197 | " Fargo (1996)\n", 198 | " Godfather, The (1972)\n", 199 | " L.A. Confidential (1997)\n", 200 | " Recommended:\n", 201 | " L.A. 
Confidential (1997)\n", 202 | " Titanic (1997)\n", 203 | " Fargo (1996)\n", 204 | "User 450\n", 205 | " Known positives:\n", 206 | " Event Horizon (1997)\n", 207 | " Scream (1996)\n", 208 | " Conspiracy Theory (1997)\n", 209 | " Recommended:\n", 210 | " Independence Day (ID4) (1996)\n", 211 | " Scream (1996)\n", 212 | " Ransom (1996)\n" 213 | ] 214 | } 215 | ], 216 | "source": [ 217 | "def sample_recommendation(model, data, user_ids):\n", 218 | " \n", 219 | "\n", 220 | " n_users, n_items = data['train'].shape\n", 221 | "\n", 222 | " for user_id in user_ids:\n", 223 | " known_positives = data['item_labels'][data['train'].tocsr()[user_id].indices]\n", 224 | " \n", 225 | " scores = model.predict(user_id, np.arange(n_items))\n", 226 | " top_items = data['item_labels'][np.argsort(-scores)]\n", 227 | " \n", 228 | " print(\"User %s\" % user_id)\n", 229 | " print(\" Known positives:\")\n", 230 | " \n", 231 | " for x in known_positives[:3]:\n", 232 | " print(\" %s\" % x)\n", 233 | "\n", 234 | " print(\" Recommended:\")\n", 235 | " \n", 236 | " for x in top_items[:3]:\n", 237 | " print(\" %s\" % x)\n", 238 | " \n", 239 | "sample_recommendation(model, data, [3, 25, 450]) " 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": null, 245 | "metadata": { 246 | "collapsed": true 247 | }, 248 | "outputs": [], 249 | "source": [] 250 | } 251 | ], 252 | "metadata": { 253 | "kernelspec": { 254 | "display_name": "Python 2", 255 | "language": "python", 256 | "name": "python2" 257 | }, 258 | "language_info": { 259 | "codemirror_mode": { 260 | "name": "ipython", 261 | "version": 2 262 | }, 263 | "file_extension": ".py", 264 | "mimetype": "text/x-python", 265 | "name": "python", 266 | "nbconvert_exporter": "python", 267 | "pygments_lexer": "ipython2", 268 | "version": "2.7.10" 269 | } 270 | }, 271 | "nbformat": 4, 272 | "nbformat_minor": 0 273 | } 274 | -------------------------------------------------------------------------------- /examples/quickstart/short_quickstart.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Short quickstart" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": { 14 | "collapsed": false 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "from lightfm import LightFM\n", 19 | "from lightfm.datasets import fetch_movielens\n", 20 | "from lightfm.evaluation import precision_at_k\n", 21 | "\n", 22 | "# Load the MovieLens 100k dataset. 
Only five\n", 23 | "# star ratings are treated as positive.\n", 24 | "data = fetch_movielens(min_rating=5.0)\n", 25 | "\n", 26 | "# Instantiate and train the model\n", 27 | "model = LightFM(loss='warp')\n", 28 | "model.fit(data['train'], epochs=30, num_threads=2)\n", 29 | "\n", 30 | "# Evaluate the trained model\n", 31 | "test_precision = precision_at_k(model, data['test'], k=5).mean()" 32 | ] 33 | } 34 | ], 35 | "metadata": { 36 | "kernelspec": { 37 | "display_name": "Python 2", 38 | "language": "python", 39 | "name": "python2" 40 | }, 41 | "language_info": { 42 | "codemirror_mode": { 43 | "name": "ipython", 44 | "version": 2 45 | }, 46 | "file_extension": ".py", 47 | "mimetype": "text/x-python", 48 | "name": "python", 49 | "nbconvert_exporter": "python", 50 | "pygments_lexer": "ipython2", 51 | "version": "2.7.8" 52 | } 53 | }, 54 | "nbformat": 4, 55 | "nbformat_minor": 0 56 | } 57 | -------------------------------------------------------------------------------- /lightfm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lyst/lightfm/0c9c31e027b976beab2385e268b58010fff46096/lightfm.png -------------------------------------------------------------------------------- /lightfm/__init__.py: -------------------------------------------------------------------------------- 1 | from .lightfm import LightFM 2 | from .version import __version__ 3 | 4 | __all__ = ["LightFM", "datasets", "evaluation", "__version__"] 5 | -------------------------------------------------------------------------------- /lightfm/_lightfm_fast.py: -------------------------------------------------------------------------------- 1 | try: 2 | # Import OpenMP-enabled extension 3 | from ._lightfm_fast_openmp import * # NOQA 4 | from ._lightfm_fast_openmp import __test_in_positives # NOQA 5 | except ImportError: 6 | # Fall back on OpenMP-less extension 7 | import warnings 8 | 9 | warnings.warn( 10 | "LightFM was compiled without OpenMP support. " 11 | "Only a single thread will be used." 12 | ) 13 | 14 | from ._lightfm_fast_no_openmp import * # NOQA 15 | from ._lightfm_fast_no_openmp import __test_in_positives # NOQA 16 | -------------------------------------------------------------------------------- /lightfm/cross_validation.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """ 3 | Dataset splitting functions. 4 | """ 5 | 6 | import numpy as np 7 | import scipy.sparse as sp 8 | 9 | 10 | def _shuffle(uids, iids, data, random_state): 11 | 12 | shuffle_indices = np.arange(len(uids)) 13 | random_state.shuffle(shuffle_indices) 14 | 15 | return (uids[shuffle_indices], iids[shuffle_indices], data[shuffle_indices]) 16 | 17 | 18 | def random_train_test_split(interactions, test_percentage=0.2, random_state=None): 19 | """ 20 | Randomly split interactions between training and testing. 21 | 22 | This function takes an interaction set and splits it into 23 | two disjoint sets, a training set and a test set. Note that 24 | no effort is made to make sure that all items and users with 25 | interactions in the test set also have interactions in the 26 | training set; this may lead to a partial cold-start problem 27 | in the test set. 28 | To split a sample_weight matrix along the same lines, pass it 29 | into this function with the same random_state seed as was used 30 | for splitting the interactions. 
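    A minimal sketch of this pattern (``sample_weight`` here is assumed to be
    a weight matrix built alongside ``interactions``)::

        seed = 42
        train, test = random_train_test_split(interactions, random_state=seed)
        train_w, test_w = random_train_test_split(sample_weight, random_state=seed)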
31 | 32 | Parameters 33 | ---------- 34 | 35 | interactions: a scipy sparse matrix containing interactions 36 | The interactions to split. 37 | test_percentage: float, optional 38 | The fraction of interactions to place in the test set. 39 | random_state: int or numpy.random.RandomState, optional 40 | Random seed used to initialize the numpy.random.RandomState number generator. 41 | Accepts an instance of numpy.random.RandomState for backwards compatibility. 42 | 43 | Returns 44 | ------- 45 | 46 | (train, test): (scipy.sparse.COOMatrix, 47 | scipy.sparse.COOMatrix) 48 | A tuple of (train data, test data) 49 | """ 50 | 51 | if not sp.issparse(interactions): 52 | raise ValueError("Interactions must be a scipy.sparse matrix.") 53 | 54 | if not isinstance(random_state, np.random.RandomState): 55 | random_state = np.random.RandomState(seed=random_state) 56 | 57 | interactions = interactions.tocoo() 58 | 59 | shape = interactions.shape 60 | uids, iids, data = (interactions.row, interactions.col, interactions.data) 61 | 62 | uids, iids, data = _shuffle(uids, iids, data, random_state) 63 | 64 | cutoff = int((1.0 - test_percentage) * len(uids)) 65 | 66 | train_idx = slice(None, cutoff) 67 | test_idx = slice(cutoff, None) 68 | 69 | train = sp.coo_matrix( 70 | (data[train_idx], (uids[train_idx], iids[train_idx])), 71 | shape=shape, 72 | dtype=interactions.dtype, 73 | ) 74 | test = sp.coo_matrix( 75 | (data[test_idx], (uids[test_idx], iids[test_idx])), 76 | shape=shape, 77 | dtype=interactions.dtype, 78 | ) 79 | 80 | return train, test 81 | -------------------------------------------------------------------------------- /lightfm/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from lightfm.datasets.movielens import fetch_movielens # NOQA 2 | from lightfm.datasets.stackexchange import fetch_stackexchange # NOQA 3 | -------------------------------------------------------------------------------- /lightfm/datasets/_common.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import requests 4 | 5 | 6 | def get_data_dir(): 7 | 8 | return os.path.join(os.path.expanduser("~"), "lightfm_data") 9 | 10 | 11 | def create_data_dir(path): 12 | 13 | if not os.path.isdir(path): 14 | os.makedirs(path) 15 | 16 | 17 | def download(url, dest_path): 18 | 19 | req = requests.get(url, stream=True) 20 | req.raise_for_status() 21 | 22 | with open(dest_path, "wb") as fd: 23 | for chunk in req.iter_content(chunk_size=2**20): 24 | fd.write(chunk) 25 | 26 | 27 | def get_data(data_home, url, dest_subdir, dest_filename, download_if_missing): 28 | 29 | if data_home is None: 30 | data_dir = os.path.join(get_data_dir(), dest_subdir) 31 | else: 32 | data_dir = os.path.join(os.path.abspath(data_home), dest_subdir) 33 | 34 | create_data_dir(data_dir) 35 | 36 | dest_path = os.path.join(data_dir, dest_filename) 37 | 38 | if not os.path.isfile(dest_path): 39 | if download_if_missing: 40 | download(url, dest_path) 41 | else: 42 | raise IOError("Dataset missing.") 43 | 44 | return dest_path 45 | -------------------------------------------------------------------------------- /lightfm/datasets/movielens.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import os 3 | import zipfile 4 | 5 | import numpy as np 6 | 7 | import scipy.sparse as sp 8 | 9 | from lightfm.datasets import _common 10 | 11 | 12 | def _read_raw_data(path): 13 | """ 14 | Return the raw lines of the train 
and test files. 15 | """ 16 | 17 | with zipfile.ZipFile(path) as datafile: 18 | return ( 19 | datafile.read("ml-100k/ua.base").decode().split("\n"), 20 | datafile.read("ml-100k/ua.test").decode().split("\n"), 21 | datafile.read("ml-100k/u.item").decode(errors="ignore").split("\n"), 22 | datafile.read("ml-100k/u.genre").decode(errors="ignore").split("\n"), 23 | ) 24 | 25 | 26 | def _parse(data): 27 | 28 | for line in data: 29 | 30 | if not line: 31 | continue 32 | 33 | uid, iid, rating, timestamp = [int(x) for x in line.split("\t")] 34 | 35 | # Subtract one from ids to shift 36 | # to zero-based indexing 37 | yield uid - 1, iid - 1, rating, timestamp 38 | 39 | 40 | def _get_dimensions(train_data, test_data): 41 | 42 | uids = set() 43 | iids = set() 44 | 45 | for uid, iid, _, _ in itertools.chain(train_data, test_data): 46 | uids.add(uid) 47 | iids.add(iid) 48 | 49 | rows = max(uids) + 1 50 | cols = max(iids) + 1 51 | 52 | return rows, cols 53 | 54 | 55 | def _build_interaction_matrix(rows, cols, data, min_rating): 56 | 57 | mat = sp.lil_matrix((rows, cols), dtype=np.int32) 58 | 59 | for uid, iid, rating, _ in data: 60 | if rating >= min_rating: 61 | mat[uid, iid] = rating 62 | 63 | return mat.tocoo() 64 | 65 | 66 | def _parse_item_metadata(num_items, item_metadata_raw, genres_raw): 67 | 68 | genres = [] 69 | 70 | for line in genres_raw: 71 | if line: 72 | genre, gid = line.split("|") 73 | genres.append("genre:{}".format(genre)) 74 | 75 | id_feature_labels = np.empty(num_items, dtype=str) 76 | genre_feature_labels = np.array(genres) 77 | 78 | id_features = sp.identity(num_items, format="csr", dtype=np.float32) 79 | genre_features = sp.lil_matrix((num_items, len(genres)), dtype=np.float32) 80 | 81 | for line in item_metadata_raw: 82 | 83 | if not line: 84 | continue 85 | 86 | splt = line.split("|") 87 | 88 | # Zero-based indexing 89 | iid = int(splt[0]) - 1 90 | title = splt[1] 91 | 92 | id_feature_labels[iid] = title 93 | 94 | item_genres = [idx for idx, val in enumerate(splt[5:]) if int(val) > 0] 95 | 96 | for gid in item_genres: 97 | genre_features[iid, gid] = 1.0 98 | 99 | return ( 100 | id_features, 101 | id_feature_labels, 102 | genre_features.tocsr(), 103 | genre_feature_labels, 104 | ) 105 | 106 | 107 | def fetch_movielens( 108 | data_home=None, 109 | indicator_features=True, 110 | genre_features=False, 111 | min_rating=0.0, 112 | download_if_missing=True, 113 | ): 114 | """ 115 | Fetch the `Movielens 100k dataset `_. 116 | 117 | The dataset contains 100,000 interactions from 1000 users on 1700 movies, 118 | and is exhaustively described in its 119 | `README `_. 120 | 121 | Parameters 122 | ---------- 123 | 124 | data_home: path, optional 125 | Path to the directory in which the downloaded data should be placed. 126 | Defaults to ``~/lightfm_data/``. 127 | indicator_features: bool, optional 128 | Use an [n_items, n_items] identity matrix for item features. When True with genre_features, 129 | indicator and genre features are concatenated into a single feature matrix of shape 130 | [n_items, n_items + n_genres]. 131 | genre_features: bool, optional 132 | Use a [n_items, n_genres] matrix for item features. When True with item_indicator_features, 133 | indicator and genre features are concatenated into a single feature matrix of shape 134 | [n_items, n_items + n_genres]. 135 | min_rating: float, optional 136 | Minimum rating to include in the interaction matrix. 137 | download_if_missing: bool, optional 138 | Download the data if not present. 
Raises an IOError if False and data is missing. 139 | 140 | Notes 141 | ----- 142 | 143 | The return value is a dictionary containing the following keys: 144 | 145 | Returns 146 | ------- 147 | 148 | train: sp.coo_matrix of shape [n_users, n_items] 149 | Contains training set interactions. 150 | test: sp.coo_matrix of shape [n_users, n_items] 151 | Contains testing set interactions. 152 | item_features: sp.csr_matrix of shape [n_items, n_item_features] 153 | Contains item features. 154 | item_feature_labels: np.array of strings of shape [n_item_features,] 155 | Labels of item features. 156 | item_labels: np.array of strings of shape [n_items,] 157 | Items' titles. 158 | """ 159 | 160 | if not (indicator_features or genre_features): 161 | raise ValueError( 162 | "At least one of item_indicator_features " "or genre_features must be True" 163 | ) 164 | 165 | zip_path = _common.get_data( 166 | data_home, 167 | ( 168 | "https://github.com/maciejkula/" 169 | "lightfm_datasets/releases/" 170 | "download/v0.1.0/movielens.zip" 171 | ), 172 | "movielens100k", 173 | "movielens.zip", 174 | download_if_missing, 175 | ) 176 | 177 | # Load raw data 178 | try: 179 | (train_raw, test_raw, item_metadata_raw, genres_raw) = _read_raw_data(zip_path) 180 | except zipfile.BadZipFile: 181 | # Download was corrupted, get rid of the partially 182 | # downloaded file so that we re-download on the 183 | # next try. 184 | os.unlink(zip_path) 185 | raise ValueError( 186 | "Corrupted Movielens download. Check your " 187 | "internet connection and try again." 188 | ) 189 | 190 | # Figure out the dimensions 191 | num_users, num_items = _get_dimensions(_parse(train_raw), _parse(test_raw)) 192 | 193 | # Load train interactions 194 | train = _build_interaction_matrix( 195 | num_users, num_items, _parse(train_raw), min_rating 196 | ) 197 | # Load test interactions 198 | test = _build_interaction_matrix(num_users, num_items, _parse(test_raw), min_rating) 199 | 200 | assert train.shape == test.shape 201 | 202 | # Load metadata features 203 | ( 204 | id_features, 205 | id_feature_labels, 206 | genre_features_matrix, 207 | genre_feature_labels, 208 | ) = _parse_item_metadata(num_items, item_metadata_raw, genres_raw) 209 | 210 | assert id_features.shape == (num_items, len(id_feature_labels)) 211 | assert genre_features_matrix.shape == (num_items, len(genre_feature_labels)) 212 | 213 | if indicator_features and not genre_features: 214 | features = id_features 215 | feature_labels = id_feature_labels 216 | elif genre_features and not indicator_features: 217 | features = genre_features_matrix 218 | feature_labels = genre_feature_labels 219 | else: 220 | features = sp.hstack([id_features, genre_features_matrix]).tocsr() 221 | feature_labels = np.concatenate((id_feature_labels, genre_feature_labels)) 222 | 223 | data = { 224 | "train": train, 225 | "test": test, 226 | "item_features": features, 227 | "item_feature_labels": feature_labels, 228 | "item_labels": id_feature_labels, 229 | } 230 | 231 | return data 232 | -------------------------------------------------------------------------------- /lightfm/datasets/stackexchange.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | 5 | import scipy.sparse as sp 6 | 7 | from lightfm.datasets import _common 8 | 9 | 10 | def fetch_stackexchange( 11 | dataset, 12 | test_set_fraction=0.2, 13 | min_training_interactions=1, 14 | data_home=None, 15 | indicator_features=True, 16 | tag_features=False, 17 | 
download_if_missing=True, 18 | ): 19 | """ 20 | Fetch a dataset from the `StackExchange network `_. 21 | 22 | The datasets contain users answering questions: an interaction is defined as a user 23 | answering a given question. 24 | 25 | The following datasets from the StackExchange network are available: 26 | 27 | - CrossValidated: From stats.stackexchange.com. Approximately 9000 users, 72000 questions, 28 | and 70000 answers. 29 | - StackOverflow: From stackoverflow.stackexchange.com. Approximately 1.3M users, 11M questions, 30 | and 18M answers. 31 | 32 | Parameters 33 | ---------- 34 | 35 | dataset: string, one of ('crossvalidated', 'stackoverflow') 36 | The part of the StackExchange network for which to fetch the dataset. 37 | test_set_fraction: float, optional 38 | The fraction of the dataset used for testing. Splitting into the train and test set is done 39 | in a time-based fashion: all interactions before a certain time are in the train set and 40 | all interactions after that time are in the test set. 41 | min_training_interactions: int, optional 42 | Only include users with this amount of interactions in the training set. 43 | data_home: path, optional 44 | Path to the directory in which the downloaded data should be placed. 45 | Defaults to ``~/lightfm_data/``. 46 | indicator_features: bool, optional 47 | Use an [n_users, n_users] identity matrix for item features. When True with genre_features, 48 | indicator and genre features are concatenated into a single feature matrix of shape 49 | [n_users, n_users + n_genres]. 50 | download_if_missing: bool, optional 51 | Download the data if not present. Raises an IOError if False and data is missing. 52 | 53 | Notes 54 | ----- 55 | 56 | The return value is a dictionary containing the following keys: 57 | 58 | Returns 59 | ------- 60 | 61 | train: sp.coo_matrix of shape [n_users, n_items] 62 | Contains training set interactions. 63 | test: sp.coo_matrix of shape [n_users, n_items] 64 | Contains testing set interactions. 65 | item_features: sp.csr_matrix of shape [n_items, n_item_features] 66 | Contains item features. 67 | item_feature_labels: np.array of strings of shape [n_item_features,] 68 | Labels of item features. 
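    A minimal usage sketch (fetching the smaller CrossValidated dataset with
    both indicator and tag features; the argument values are illustrative)::

        from lightfm.datasets import fetch_stackexchange

        data = fetch_stackexchange(
            'crossvalidated',
            test_set_fraction=0.1,
            indicator_features=True,
            tag_features=True,
        )
        train, test = data['train'], data['test']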
69 | """ 70 | 71 | if not (indicator_features or tag_features): 72 | raise ValueError( 73 | "At least one of item_indicator_features " "or tag_features must be True" 74 | ) 75 | 76 | if dataset not in ("crossvalidated", "stackoverflow"): 77 | raise ValueError("Unknown dataset") 78 | 79 | if not (0.0 < test_set_fraction < 1.0): 80 | raise ValueError("Test set fraction must be between 0 and 1") 81 | 82 | urls = { 83 | "crossvalidated": ( 84 | "https://github.com/maciejkula/lightfm_datasets/releases/" 85 | "download/v0.1.0/stackexchange_crossvalidated.npz" 86 | ), 87 | "stackoverflow": ( 88 | "https://github.com/maciejkula/lightfm_datasets/releases/" 89 | "download/v0.1.0/stackexchange_stackoverflow.npz" 90 | ), 91 | } 92 | 93 | path = _common.get_data( 94 | data_home, 95 | urls[dataset], 96 | os.path.join("stackexchange", dataset), 97 | "data.npz", 98 | download_if_missing, 99 | ) 100 | 101 | data = np.load(path) 102 | 103 | interactions = sp.coo_matrix( 104 | ( 105 | data["interactions_data"], 106 | (data["interactions_row"], data["interactions_col"]), 107 | ), 108 | shape=data["interactions_shape"].flatten(), 109 | ) 110 | interactions.sum_duplicates() 111 | 112 | tag_features_mat = sp.coo_matrix( 113 | (data["features_data"], (data["features_row"], data["features_col"])), 114 | shape=data["features_shape"].flatten(), 115 | ) 116 | tag_labels = data["labels"] 117 | 118 | test_cutoff_index = int(len(interactions.data) * (1.0 - test_set_fraction)) 119 | test_cutoff_timestamp = np.sort(interactions.data)[test_cutoff_index] 120 | in_train = interactions.data < test_cutoff_timestamp 121 | in_test = np.logical_not(in_train) 122 | 123 | train = sp.coo_matrix( 124 | ( 125 | np.ones(in_train.sum(), dtype=np.float32), 126 | (interactions.row[in_train], interactions.col[in_train]), 127 | ), 128 | shape=interactions.shape, 129 | ) 130 | test = sp.coo_matrix( 131 | ( 132 | np.ones(in_test.sum(), dtype=np.float32), 133 | (interactions.row[in_test], interactions.col[in_test]), 134 | ), 135 | shape=interactions.shape, 136 | ) 137 | 138 | if min_training_interactions > 0: 139 | include = np.squeeze(np.array(train.getnnz(axis=1))) > min_training_interactions 140 | 141 | train = train.tocsr()[include].tocoo() 142 | test = test.tocsr()[include].tocoo() 143 | 144 | if indicator_features and not tag_features: 145 | features = sp.identity(train.shape[1], format="csr", dtype=np.float32) 146 | labels = np.array(["question_id:{}".format(x) for x in range(train.shape[1])]) 147 | elif not indicator_features and tag_features: 148 | features = tag_features_mat.tocsr() 149 | labels = tag_labels 150 | else: 151 | id_features = sp.identity(train.shape[1], format="csr", dtype=np.float32) 152 | features = sp.hstack([id_features, tag_features_mat]).tocsr() 153 | labels = np.concatenate( 154 | [ 155 | np.array(["question_id:{}".format(x) for x in range(train.shape[1])]), 156 | tag_labels, 157 | ] 158 | ) 159 | 160 | return { 161 | "train": train, 162 | "test": test, 163 | "item_features": features, 164 | "item_feature_labels": labels, 165 | } 166 | -------------------------------------------------------------------------------- /lightfm/evaluation.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """ 3 | Module containing evaluation functions suitable for judging the performance of 4 | a fitted LightFM model. 
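A rough usage sketch (``model``, ``train`` and ``test`` are assumed to be a
fitted LightFM model and train/test interaction matrices)::

    from lightfm.evaluation import precision_at_k, auc_score

    prec = precision_at_k(model, test, train_interactions=train, k=10).mean()
    auc = auc_score(model, test, train_interactions=train).mean()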
5 | """ 6 | 7 | import numpy as np 8 | 9 | from ._lightfm_fast import CSRMatrix, calculate_auc_from_rank 10 | 11 | __all__ = ["precision_at_k", "recall_at_k", "auc_score", "reciprocal_rank"] 12 | 13 | 14 | def precision_at_k( 15 | model, 16 | test_interactions, 17 | train_interactions=None, 18 | k=10, 19 | user_features=None, 20 | item_features=None, 21 | preserve_rows=False, 22 | num_threads=1, 23 | check_intersections=True, 24 | ): 25 | """ 26 | Measure the precision at k metric for a model: the fraction of known 27 | positives in the first k positions of the ranked list of results. 28 | A perfect score is 1.0. 29 | 30 | Parameters 31 | ---------- 32 | 33 | model: LightFM instance 34 | the fitted model to be evaluated 35 | test_interactions: np.float32 csr_matrix of shape [n_users, n_items] 36 | Non-zero entries representing known positives in the evaluation set. 37 | train_interactions: np.float32 csr_matrix of shape [n_users, n_items], optional 38 | Non-zero entries representing known positives in the train set. These 39 | will be omitted from the score calculations to avoid re-recommending 40 | known positives. 41 | k: integer, optional 42 | The k parameter. 43 | user_features: np.float32 csr_matrix of shape [n_users, n_user_features], optional 44 | Each row contains that user's weights over features. 45 | item_features: np.float32 csr_matrix of shape [n_items, n_item_features], optional 46 | Each row contains that item's weights over features. 47 | preserve_rows: boolean, optional 48 | When False (default), the number of rows in the output will be equal 49 | to the number of users with interactions in the evaluation set. 50 | When True, the number of rows in the output will be equal to the 51 | number of users. 52 | num_threads: int, optional 53 | Number of parallel computation threads to use. Should 54 | not be higher than the number of physical cores. 55 | check_intersections: bool, optional, True by default, 56 | Only relevant when train_interactions are supplied. 57 | A flag that signals whether the test and train matrices should be checked 58 | for intersections to prevent optimistic ranks / wrong evaluation / bad data split. 59 | 60 | Returns 61 | ------- 62 | 63 | np.array of shape [n_users with interactions or n_users,] 64 | Numpy array containing precision@k scores for each user. If there are 65 | no interactions for a given user the returned precision will be 0. 66 | """ 67 | 68 | if num_threads < 1: 69 | raise ValueError("Number of threads must be 1 or larger.") 70 | 71 | ranks = model.predict_rank( 72 | test_interactions, 73 | train_interactions=train_interactions, 74 | user_features=user_features, 75 | item_features=item_features, 76 | num_threads=num_threads, 77 | check_intersections=check_intersections, 78 | ) 79 | 80 | ranks.data = np.less(ranks.data, k, ranks.data) 81 | 82 | precision = np.squeeze(np.array(ranks.sum(axis=1))) / k 83 | 84 | if not preserve_rows: 85 | precision = precision[test_interactions.getnnz(axis=1) > 0] 86 | 87 | return precision 88 | 89 | 90 | def recall_at_k( 91 | model, 92 | test_interactions, 93 | train_interactions=None, 94 | k=10, 95 | user_features=None, 96 | item_features=None, 97 | preserve_rows=False, 98 | num_threads=1, 99 | check_intersections=True, 100 | ): 101 | """ 102 | Measure the recall at k metric for a model: the number of positive items in 103 | the first k positions of the ranked list of results divided by the number 104 | of positive items in the test period. A perfect score is 1.0. 
105 | 106 | Parameters 107 | ---------- 108 | 109 | model: LightFM instance 110 | the fitted model to be evaluated 111 | test_interactions: np.float32 csr_matrix of shape [n_users, n_items] 112 | Non-zero entries representing known positives in the evaluation set. 113 | train_interactions: np.float32 csr_matrix of shape [n_users, n_items], optional 114 | Non-zero entries representing known positives in the train set. These 115 | will be omitted from the score calculations to avoid re-recommending 116 | known positives. 117 | k: integer, optional 118 | The k parameter. 119 | user_features: np.float32 csr_matrix of shape [n_users, n_user_features], optional 120 | Each row contains that user's weights over features. 121 | item_features: np.float32 csr_matrix of shape [n_items, n_item_features], optional 122 | Each row contains that item's weights over features. 123 | preserve_rows: boolean, optional 124 | When False (default), the number of rows in the output will be equal 125 | to the number of users with interactions in the evaluation set. 126 | When True, the number of rows in the output will be equal to the 127 | number of users. 128 | num_threads: int, optional 129 | Number of parallel computation threads to use. Should 130 | not be higher than the number of physical cores. 131 | check_intersections: bool, optional, True by default, 132 | Only relevant when train_interactions are supplied. 133 | A flag that signals whether the test and train matrices should be checked 134 | for intersections to prevent optimistic ranks / wrong evaluation / bad data split. 135 | 136 | Returns 137 | ------- 138 | 139 | np.array of shape [n_users with interactions or n_users,] 140 | Numpy array containing recall@k scores for each user. If there are no 141 | interactions for a given user having items in the test period, the 142 | returned recall will be 0. 143 | """ 144 | 145 | if num_threads < 1: 146 | raise ValueError("Number of threads must be 1 or larger.") 147 | 148 | ranks = model.predict_rank( 149 | test_interactions, 150 | train_interactions=train_interactions, 151 | user_features=user_features, 152 | item_features=item_features, 153 | num_threads=num_threads, 154 | check_intersections=check_intersections, 155 | ) 156 | 157 | ranks.data = np.less(ranks.data, k, ranks.data) 158 | 159 | retrieved = np.squeeze(test_interactions.getnnz(axis=1)) 160 | hit = np.squeeze(np.array(ranks.sum(axis=1))) 161 | 162 | if not preserve_rows: 163 | hit = hit[test_interactions.getnnz(axis=1) > 0] 164 | retrieved = retrieved[test_interactions.getnnz(axis=1) > 0] 165 | 166 | return hit / retrieved 167 | 168 | 169 | def auc_score( 170 | model, 171 | test_interactions, 172 | train_interactions=None, 173 | user_features=None, 174 | item_features=None, 175 | preserve_rows=False, 176 | num_threads=1, 177 | check_intersections=True, 178 | ): 179 | """ 180 | Measure the ROC AUC metric for a model: the probability that a randomly 181 | chosen positive example has a higher score than a randomly chosen negative 182 | example. 183 | A perfect score is 1.0. 184 | 185 | Parameters 186 | ---------- 187 | 188 | model: LightFM instance 189 | the fitted model to be evaluated 190 | test_interactions: np.float32 csr_matrix of shape [n_users, n_items] 191 | Non-zero entries representing known positives in the evaluation set. 192 | train_interactions: np.float32 csr_matrix of shape [n_users, n_items], optional 193 | Non-zero entries representing known positives in the train set. 
These 194 | will be omitted from the score calculations to avoid re-recommending 195 | known positives. 196 | user_features: np.float32 csr_matrix of shape [n_users, n_user_features], optional 197 | Each row contains that user's weights over features. 198 | item_features: np.float32 csr_matrix of shape [n_items, n_item_features], optional 199 | Each row contains that item's weights over features. 200 | preserve_rows: boolean, optional 201 | When False (default), the number of rows in the output will be equal 202 | to the number of users with interactions in the evaluation set. 203 | When True, the number of rows in the output will be equal to the 204 | number of users. 205 | num_threads: int, optional 206 | Number of parallel computation threads to use. Should 207 | not be higher than the number of physical cores. 208 | check_intersections: bool, optional, True by default, 209 | Only relevant when train_interactions are supplied. 210 | A flag that signals whether the test and train matrices should be checked 211 | for intersections to prevent optimistic ranks / wrong evaluation / bad data split. 212 | 213 | Returns 214 | ------- 215 | 216 | np.array of shape [n_users with interactions or n_users,] 217 | Numpy array containing AUC scores for each user. If there are no 218 | interactions for a given user the returned AUC will be 0.5. 219 | """ 220 | 221 | if num_threads < 1: 222 | raise ValueError("Number of threads must be 1 or larger.") 223 | 224 | ranks = model.predict_rank( 225 | test_interactions, 226 | train_interactions=train_interactions, 227 | user_features=user_features, 228 | item_features=item_features, 229 | num_threads=num_threads, 230 | check_intersections=check_intersections, 231 | ) 232 | 233 | assert np.all(ranks.data >= 0) 234 | 235 | auc = np.zeros(ranks.shape[0], dtype=np.float32) 236 | 237 | if train_interactions is not None: 238 | num_train_positives = np.squeeze( 239 | np.array(train_interactions.getnnz(axis=1)).astype(np.int32) 240 | ) 241 | else: 242 | num_train_positives = np.zeros(test_interactions.shape[0], dtype=np.int32) 243 | 244 | # The second argument is modified in-place, but 245 | # here we don't care about the inconsistency 246 | # introduced into the ranks matrix. 247 | calculate_auc_from_rank( 248 | CSRMatrix(ranks), num_train_positives, ranks.data, auc, num_threads 249 | ) 250 | 251 | if not preserve_rows: 252 | auc = auc[test_interactions.getnnz(axis=1) > 0] 253 | 254 | return auc 255 | 256 | 257 | def reciprocal_rank( 258 | model, 259 | test_interactions, 260 | train_interactions=None, 261 | user_features=None, 262 | item_features=None, 263 | preserve_rows=False, 264 | num_threads=1, 265 | check_intersections=True, 266 | ): 267 | """ 268 | Measure the reciprocal rank metric for a model: 1 / the rank of the highest 269 | ranked positive example. A perfect score is 1.0. 270 | 271 | Parameters 272 | ---------- 273 | 274 | model: LightFM instance 275 | the fitted model to be evaluated 276 | test_interactions: np.float32 csr_matrix of shape [n_users, n_items] 277 | Non-zero entries representing known positives in the evaluation set. 278 | train_interactions: np.float32 csr_matrix of shape [n_users, n_items], optional 279 | Non-zero entries representing known positives in the train set. These 280 | will be omitted from the score calculations to avoid re-recommending 281 | known positives. 282 | user_features: np.float32 csr_matrix of shape [n_users, n_user_features], optional 283 | Each row contains that user's weights over features. 
284 | item_features: np.float32 csr_matrix of shape [n_items, n_item_features], optional 285 | Each row contains that item's weights over features. 286 | preserve_rows: boolean, optional 287 | When False (default), the number of rows in the output will be equal 288 | to the number of users with interactions in the evaluation set. 289 | When True, the number of rows in the output will be equal to the 290 | number of users. 291 | num_threads: int, optional 292 | Number of parallel computation threads to use. Should 293 | not be higher than the number of physical cores. 294 | check_intersections: bool, optional, True by default, 295 | Only relevant when train_interactions are supplied. 296 | A flag that signals whether the test and train matrices should be checked 297 | for intersections to prevent optimistic ranks / wrong evaluation / bad data split. 298 | 299 | Returns 300 | ------- 301 | 302 | np.array of shape [n_users with interactions or n_users,] 303 | Numpy array containing reciprocal rank scores for each user. 304 | If there are no interactions for a given user the returned value will 305 | be 0.0. 306 | """ 307 | 308 | if num_threads < 1: 309 | raise ValueError("Number of threads must be 1 or larger.") 310 | 311 | ranks = model.predict_rank( 312 | test_interactions, 313 | train_interactions=train_interactions, 314 | user_features=user_features, 315 | item_features=item_features, 316 | num_threads=num_threads, 317 | check_intersections=check_intersections, 318 | ) 319 | 320 | ranks.data = 1.0 / (ranks.data + 1.0) 321 | 322 | ranks = np.squeeze(np.array(ranks.max(axis=1).todense())) 323 | 324 | if not preserve_rows: 325 | ranks = ranks[test_interactions.getnnz(axis=1) > 0] 326 | 327 | return ranks 328 | -------------------------------------------------------------------------------- /lightfm/version.py: -------------------------------------------------------------------------------- 1 | __version__ = "1.17" 2 | -------------------------------------------------------------------------------- /lint-requirements.txt: -------------------------------------------------------------------------------- 1 | pre-commit==2.17.0 2 | black==22.1.0 3 | flake8==4.0.1 4 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md 3 | 4 | [flake8] 5 | ignore = I100, W503, E203 6 | max-line-length = 100 7 | exclude = .git,__pycache__,docs/source/conf.py,old,build,dist,docs,doc 8 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import os 3 | import pathlib 4 | import subprocess 5 | import sys 6 | import textwrap 7 | 8 | from setuptools import Command, Extension, setup 9 | 10 | 11 | def define_extensions(use_openmp): 12 | compile_args = [] 13 | if not os.environ.get("LIGHTFM_NO_CFLAGS"): 14 | compile_args += ["-ffast-math"] 15 | 16 | if sys.platform.startswith("darwin"): 17 | compile_args += [] 18 | else: 19 | compile_args += ["-march=native"] 20 | 21 | if not use_openmp: 22 | print("Compiling without OpenMP support.") 23 | return [ 24 | Extension( 25 | "lightfm._lightfm_fast_no_openmp", 26 | ["lightfm/_lightfm_fast_no_openmp.c"], 27 | extra_compile_args=compile_args, 28 | ) 29 | ] 30 | else: 31 | return [ 32 | Extension( 33 | "lightfm._lightfm_fast_openmp", 34 | ["lightfm/_lightfm_fast_openmp.c"], 
35 | extra_link_args=["-fopenmp"], 36 | extra_compile_args=compile_args + ["-fopenmp"], 37 | ) 38 | ] 39 | 40 | 41 | class Cythonize(Command): 42 | """ 43 | Compile the extension .pyx files. 44 | """ 45 | 46 | user_options = [] 47 | 48 | def initialize_options(self): 49 | pass 50 | 51 | def finalize_options(self): 52 | pass 53 | 54 | def generate_pyx(self): 55 | openmp_import = textwrap.dedent( 56 | """ 57 | from cython.parallel import parallel, prange 58 | cimport openmp 59 | """ 60 | ) 61 | 62 | lock_init = textwrap.dedent( 63 | """ 64 | cdef openmp.omp_lock_t THREAD_LOCK 65 | openmp.omp_init_lock(&THREAD_LOCK) 66 | """ 67 | ) 68 | 69 | params = ( 70 | ( 71 | "no_openmp", 72 | dict( 73 | openmp_import="", 74 | nogil_block="with nogil:", 75 | range_block="range", 76 | thread_num="0", 77 | lock_init="", 78 | lock_acquire="", 79 | lock_release="", 80 | ), 81 | ), 82 | ( 83 | "openmp", 84 | dict( 85 | openmp_import=openmp_import, 86 | nogil_block="with nogil, parallel(num_threads=num_threads):", 87 | range_block="prange", 88 | thread_num="openmp.omp_get_thread_num()", 89 | lock_init=lock_init, 90 | lock_acquire="openmp.omp_set_lock(&THREAD_LOCK)", 91 | lock_release="openmp.omp_unset_lock(&THREAD_LOCK)", 92 | ), 93 | ), 94 | ) 95 | 96 | file_dir = os.path.join(os.path.dirname(__file__), "lightfm") 97 | 98 | with open(os.path.join(file_dir, "_lightfm_fast.pyx.template"), "r") as fl: 99 | template = fl.read() 100 | 101 | for variant, template_params in params: 102 | with open( 103 | os.path.join(file_dir, "_lightfm_fast_{}.pyx".format(variant)), "w" 104 | ) as fl: 105 | fl.write(template.format(**template_params)) 106 | 107 | def run(self): 108 | from Cython.Build import cythonize 109 | 110 | self.generate_pyx() 111 | 112 | cythonize( 113 | [ 114 | Extension( 115 | "lightfm._lightfm_fast_no_openmp", 116 | ["lightfm/_lightfm_fast_no_openmp.pyx"], 117 | ), 118 | Extension( 119 | "lightfm._lightfm_fast_openmp", 120 | ["lightfm/_lightfm_fast_openmp.pyx"], 121 | extra_link_args=["-fopenmp"], 122 | ), 123 | ], 124 | compiler_directives={'language_level' : "3"} 125 | ) 126 | 127 | 128 | class Clean(Command): 129 | """ 130 | Clean build files. 
131 | """ 132 | 133 | user_options = [("all", None, "(Compatibility with original clean command)")] 134 | 135 | def initialize_options(self): 136 | self.all = False 137 | 138 | def finalize_options(self): 139 | pass 140 | 141 | def run(self): 142 | pth = os.path.dirname(os.path.abspath(__file__)) 143 | 144 | subprocess.call(["rm", "-rf", os.path.join(pth, "build")]) 145 | subprocess.call(["rm", "-rf", os.path.join(pth, "lightfm.egg-info")]) 146 | subprocess.call(["find", pth, "-name", "lightfm*.pyc", "-type", "f", "-delete"]) 147 | subprocess.call(["rm", os.path.join(pth, "lightfm", "_lightfm_fast.so")]) 148 | 149 | 150 | def read_version(): 151 | mod = {} 152 | path = os.path.join( 153 | os.path.dirname(__file__), 154 | "lightfm", 155 | "version.py", 156 | ) 157 | with open(path) as fd: 158 | exec(fd.read(), mod) 159 | return mod["__version__"] 160 | 161 | 162 | use_openmp = not sys.platform.startswith("darwin") and not sys.platform.startswith( 163 | "win" 164 | ) 165 | 166 | long_description = pathlib.Path(__file__).parent.joinpath("README.md").read_text() 167 | 168 | setup( 169 | name="lightfm", 170 | version=read_version(), 171 | description="LightFM recommendation model", 172 | long_description=long_description, 173 | long_description_content_type="text/markdown", 174 | url="https://github.com/lyst/lightfm", 175 | download_url="https://github.com/lyst/lightfm/tarball/{}".format(read_version()), 176 | packages=["lightfm", "lightfm.datasets"], 177 | package_data={"": ["*.c"]}, 178 | install_requires=["numpy", "scipy>=0.17.0", "requests", "scikit-learn"], 179 | tests_require=["pytest", "requests", "scikit-learn"], 180 | cmdclass={"cythonize": Cythonize, "clean": Clean}, 181 | author="Lyst Ltd (Maciej Kula)", 182 | author_email="data@ly.st", 183 | license="MIT", 184 | classifiers=[ 185 | "Development Status :: 5 - Production/Stable", 186 | "License :: OSI Approved :: MIT License", 187 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 188 | ], 189 | ext_modules=define_extensions(use_openmp), 190 | ) 191 | -------------------------------------------------------------------------------- /test-requirements.txt: -------------------------------------------------------------------------------- 1 | pytest 2 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lyst/lightfm/0c9c31e027b976beab2385e268b58010fff46096/tests/__init__.py -------------------------------------------------------------------------------- /tests/test_api.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import pytest 4 | 5 | import scipy.sparse as sp 6 | 7 | from lightfm.lightfm import LightFM 8 | 9 | 10 | def test_empty_matrix(): 11 | 12 | no_users, no_items = (10, 100) 13 | 14 | train = sp.coo_matrix((no_users, no_items), dtype=np.int32) 15 | 16 | model = LightFM() 17 | model.fit_partial(train) 18 | 19 | 20 | def test_matrix_types(): 21 | 22 | mattypes = (sp.coo_matrix, sp.lil_matrix, sp.csr_matrix, sp.csc_matrix) 23 | 24 | dtypes = (np.int32, np.int64, np.float32, np.float64) 25 | 26 | no_users, no_items = (10, 100) 27 | no_features = 20 28 | 29 | for mattype in mattypes: 30 | for dtype in dtypes: 31 | train = mattype((no_users, no_items), dtype=dtype) 32 | weights = train.tocoo() 33 | 34 | user_features = mattype((no_users, no_features), dtype=dtype) 35 | item_features = 
mattype((no_items, no_features), dtype=dtype) 36 | 37 | model = LightFM() 38 | model.fit_partial( 39 | train, 40 | sample_weight=weights, 41 | user_features=user_features, 42 | item_features=item_features, 43 | ) 44 | 45 | model.predict( 46 | np.random.randint(0, no_users, 10).astype(np.int32), 47 | np.random.randint(0, no_items, 10).astype(np.int32), 48 | user_features=user_features, 49 | item_features=item_features, 50 | ) 51 | 52 | model.predict_rank( 53 | train, user_features=user_features, item_features=item_features 54 | ) 55 | 56 | 57 | def test_coo_with_duplicate_entries(): 58 | # Calling .tocsr on a COO matrix with duplicate entries 59 | # changes its data arrays in-place, leading to out-of-bounds 60 | # array accesses in the WARP code. 61 | # Reported in https://github.com/lyst/lightfm/issues/117. 62 | 63 | rows, cols = (1000, 100) 64 | mat = sp.random(rows, cols) 65 | mat.data[:] = 1 66 | 67 | # Duplicate entries in the COO matrix 68 | mat.data = np.concatenate((mat.data, mat.data[:1000])) 69 | mat.row = np.concatenate((mat.row, mat.row[:1000])) 70 | mat.col = np.concatenate((mat.col, mat.col[:1000])) 71 | 72 | for loss in ("warp", "bpr", "warp-kos"): 73 | model = LightFM(loss=loss) 74 | model.fit(mat) 75 | 76 | 77 | def test_predict(): 78 | 79 | no_users, no_items = (10, 100) 80 | 81 | train = sp.coo_matrix((no_users, no_items), dtype=np.int32) 82 | 83 | model = LightFM() 84 | model.fit_partial(train) 85 | 86 | for uid in range(no_users): 87 | scores_arr = model.predict(np.repeat(uid, no_items), np.arange(no_items)) 88 | scores_int = model.predict(uid, np.arange(no_items)) 89 | assert np.allclose(scores_arr, scores_int) 90 | 91 | with pytest.raises(ValueError): 92 | model.predict("foo", np.arange(no_items)) 93 | 94 | 95 | def test_input_dtypes(): 96 | 97 | dtypes = (np.int32, np.int64, np.float32, np.float64) 98 | 99 | no_users, no_items = (10, 100) 100 | no_features = 20 101 | 102 | for dtype in dtypes: 103 | train = sp.coo_matrix((no_users, no_items), dtype=dtype) 104 | 105 | user_features = sp.coo_matrix((no_users, no_features), dtype=dtype) 106 | item_features = sp.coo_matrix((no_items, no_features), dtype=dtype) 107 | 108 | model = LightFM() 109 | model.fit_partial( 110 | train, user_features=user_features, item_features=item_features 111 | ) 112 | 113 | model.predict( 114 | np.random.randint(0, no_users, 10).astype(np.int32), 115 | np.random.randint(0, no_items, 10).astype(np.int32), 116 | user_features=user_features, 117 | item_features=item_features, 118 | ) 119 | 120 | 121 | def test_not_enough_features_fails(): 122 | 123 | no_users, no_items = (10, 100) 124 | no_features = 20 125 | 126 | train = sp.coo_matrix((no_users, no_items), dtype=np.int32) 127 | 128 | user_features = sp.csr_matrix((no_users - 1, no_features), dtype=np.int32) 129 | item_features = sp.csr_matrix((no_items - 1, no_features), dtype=np.int32) 130 | model = LightFM() 131 | with pytest.raises(Exception): 132 | model.fit_partial( 133 | train, user_features=user_features, item_features=item_features 134 | ) 135 | 136 | 137 | def test_feature_inference_fails(): 138 | 139 | # On predict if we try to use feature inference and supply 140 | # higher ids than the number of features that were supplied to fit 141 | # we should complain 142 | 143 | no_users, no_items = (10, 100) 144 | no_features = 20 145 | 146 | train = sp.coo_matrix((no_users, no_items), dtype=np.int32) 147 | 148 | user_features = sp.csr_matrix((no_users, no_features), dtype=np.int32) 149 | item_features = sp.csr_matrix((no_items, 
no_features), dtype=np.int32) 150 | model = LightFM() 151 | model.fit_partial(train, user_features=user_features, item_features=item_features) 152 | 153 | with pytest.raises(ValueError): 154 | model.predict( 155 | np.array([no_features], dtype=np.int32), 156 | np.array([no_features], dtype=np.int32), 157 | ) 158 | 159 | 160 | def test_return_self(): 161 | 162 | no_users, no_items = (10, 100) 163 | 164 | train = sp.coo_matrix((no_users, no_items), dtype=np.int32) 165 | 166 | model = LightFM() 167 | assert model.fit_partial(train) is model 168 | assert model.fit(train) is model 169 | 170 | 171 | def test_param_sanity(): 172 | 173 | with pytest.raises(AssertionError): 174 | LightFM(no_components=-1) 175 | 176 | with pytest.raises(AssertionError): 177 | LightFM(user_alpha=-1.0) 178 | 179 | with pytest.raises(AssertionError): 180 | LightFM(item_alpha=-1.0) 181 | 182 | with pytest.raises(ValueError): 183 | LightFM(max_sampled=-1.0) 184 | 185 | 186 | def test_sample_weight(): 187 | 188 | model = LightFM() 189 | 190 | train = sp.coo_matrix(np.array([[0, 1], [0, 1]])) 191 | 192 | with pytest.raises(ValueError): 193 | # Wrong number of weights 194 | sample_weight = sp.coo_matrix(np.zeros((2, 2))) 195 | 196 | model.fit(train, sample_weight=sample_weight) 197 | 198 | with pytest.raises(ValueError): 199 | # Wrong shape 200 | sample_weight = sp.coo_matrix(np.zeros(2)) 201 | model.fit(train, sample_weight=np.zeros(3)) 202 | 203 | with pytest.raises(ValueError): 204 | # Wrong order of entries 205 | sample_weight = sp.coo_matrix((train.data, (train.row[::-1], train.col[::-1]))) 206 | model.fit(train, sample_weight=np.zeros(3)) 207 | 208 | sample_weight = sp.coo_matrix((train.data, (train.row, train.col))) 209 | model.fit(train, sample_weight=sample_weight) 210 | 211 | model = LightFM(loss="warp-kos") 212 | 213 | with pytest.raises(NotImplementedError): 214 | model.fit(train, sample_weight=np.ones(1)) 215 | 216 | 217 | def test_predict_ranks(): 218 | 219 | no_users, no_items = (10, 100) 220 | 221 | train = sp.coo_matrix((no_users, no_items), dtype=np.float32) 222 | train = sp.rand(no_users, no_items, format="csr", random_state=42) 223 | 224 | model = LightFM() 225 | model.fit_partial(train) 226 | 227 | # Compute ranks for all items 228 | rank_input = sp.csr_matrix(np.ones((no_users, no_items))) 229 | ranks = model.predict_rank(rank_input, num_threads=2).todense() 230 | 231 | assert np.all(ranks.min(axis=1) == 0) 232 | assert np.all(ranks.max(axis=1) == no_items - 1) 233 | 234 | for row in range(no_users): 235 | assert np.all(np.sort(ranks[row]) == np.arange(no_items)) 236 | 237 | # Train set exclusions. All ranks should be zero 238 | # if train interactions is dense. 
239 | ranks = model.predict_rank( 240 | rank_input, train_interactions=rank_input, check_intersections=False 241 | ).todense() 242 | assert np.all(ranks == 0) 243 | 244 | # Max rank should be num_items - 1 - number of positives 245 | # in train in that row 246 | ranks = model.predict_rank( 247 | rank_input, train_interactions=train, check_intersections=False 248 | ).todense() 249 | assert np.all( 250 | np.squeeze(np.array(ranks.max(axis=1))) 251 | == no_items - 1 - np.squeeze(np.array(train.getnnz(axis=1))) 252 | ) 253 | 254 | # check error is raised when train and test have interactions in common 255 | with pytest.raises(ValueError): 256 | model.predict_rank(train, train_interactions=train, check_intersections=True) 257 | 258 | # check error not raised when flag is False 259 | model.predict_rank(train, train_interactions=train, check_intersections=False) 260 | 261 | # check no errors raised when train and test have no interactions in common 262 | not_train = sp.rand(no_users, no_items, format="csr", random_state=43) - train 263 | not_train.data[not_train.data < 0] = 0 264 | not_train.eliminate_zeros() 265 | model.predict_rank(not_train, train_interactions=train, check_intersections=True) 266 | 267 | # Make sure ranks are computed pessimistically when 268 | # there are ties (that is, equal predictions for every 269 | # item will assign maximum rank to each). 270 | model.user_embeddings = np.zeros_like(model.user_embeddings) 271 | model.item_embeddings = np.zeros_like(model.item_embeddings) 272 | model.user_biases = np.zeros_like(model.user_biases) 273 | model.item_biases = np.zeros_like(model.item_biases) 274 | 275 | ranks = model.predict_rank(rank_input, num_threads=2).todense() 276 | 277 | assert np.all(ranks.min(axis=1) == 99) 278 | assert np.all(ranks.max(axis=1) == 99) 279 | 280 | # Wrong input dimensions 281 | with pytest.raises(ValueError): 282 | model.predict_rank(sp.csr_matrix((5, 5)), num_threads=2) 283 | 284 | 285 | def test_exception_on_divergence(): 286 | 287 | no_users, no_items = (1000, 1000) 288 | 289 | train = sp.rand(no_users, no_items, format="csr", random_state=42) 290 | 291 | model = LightFM(learning_rate=10000000.0, loss="warp") 292 | 293 | with pytest.raises(ValueError): 294 | model.fit(train, epochs=10) 295 | 296 | 297 | def test_sklearn_api(): 298 | model = LightFM() 299 | params = model.get_params() 300 | model2 = LightFM(**params) 301 | params2 = model2.get_params() 302 | assert params == params2 303 | model.set_params(**params) 304 | params["invalid_param"] = 666 305 | with pytest.raises(ValueError): 306 | model.set_params(**params) 307 | 308 | 309 | def test_predict_not_fitted(): 310 | 311 | model = LightFM() 312 | 313 | with pytest.raises(ValueError): 314 | model.predict(np.arange(10), np.arange(10)) 315 | 316 | with pytest.raises(ValueError): 317 | model.predict_rank(1) 318 | 319 | with pytest.raises(ValueError): 320 | model.get_user_representations() 321 | 322 | with pytest.raises(ValueError): 323 | model.get_item_representations() 324 | 325 | 326 | def test_nan_features(): 327 | 328 | no_users, no_items = (1000, 1000) 329 | 330 | train = sp.rand(no_users, no_items, format="csr", random_state=42) 331 | 332 | features = sp.identity(no_items) 333 | features.data *= np.nan 334 | 335 | model = LightFM(loss="warp") 336 | 337 | with pytest.raises(ValueError): 338 | model.fit(train, epochs=10, user_features=features, item_features=features) 339 | 340 | 341 | def test_nan_interactions(): 342 | 343 | no_users, no_items = (1000, 1000) 344 | 345 | train = 
sp.rand(no_users, no_items, format="csr", random_state=42) 346 | train.data *= np.nan 347 | 348 | model = LightFM(loss="warp") 349 | 350 | with pytest.raises(ValueError): 351 | model.fit(train) 352 | 353 | 354 | def test_overflow_predict(): 355 | 356 | no_users, no_items = (1000, 1000) 357 | 358 | train = sp.rand(no_users, no_items, format="csr", random_state=42) 359 | 360 | model = LightFM(loss="warp") 361 | 362 | model.fit(train) 363 | 364 | with pytest.raises((ValueError, OverflowError)): 365 | print( 366 | model.predict( 367 | 1231241241231241414, 368 | np.arange(no_items), 369 | user_features=sp.identity(no_users), 370 | ) 371 | ) 372 | 373 | 374 | def test_warp_few_items(): 375 | 376 | no_users, no_items = (1000, 2) 377 | 378 | train = sp.rand(no_users, no_items, format="csr", random_state=42) 379 | 380 | model = LightFM(loss="warp", max_sampled=10) 381 | 382 | model.fit(train) 383 | -------------------------------------------------------------------------------- /tests/test_cross_validation.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from lightfm.cross_validation import random_train_test_split 4 | from lightfm.datasets import fetch_movielens 5 | 6 | 7 | def _assert_disjoint(x, y): 8 | 9 | x = x.tocsr() 10 | y = y.tocoo() 11 | 12 | for (i, j) in zip(y.row, y.col): 13 | assert x[i, j] == 0.0 14 | 15 | 16 | @pytest.mark.parametrize("test_percentage", [0.2, 0.5, 0.7]) 17 | def test_random_train_test_split(test_percentage): 18 | 19 | data = fetch_movielens()["train"] 20 | 21 | train, test = random_train_test_split(data, test_percentage=test_percentage) 22 | 23 | assert test.nnz / float(data.nnz) == test_percentage 24 | _assert_disjoint(train, test) 25 | -------------------------------------------------------------------------------- /tests/test_data.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | 4 | from lightfm.data import Dataset 5 | 6 | 7 | def test_fitting(): 8 | 9 | users, items = 10, 100 10 | 11 | dataset = Dataset() 12 | dataset.fit(range(users), range(items)) 13 | 14 | assert dataset.interactions_shape() == (users, items) 15 | assert dataset.user_features_shape() == (users, users) 16 | assert dataset.item_features_shape() == (items, items) 17 | 18 | assert dataset.build_interactions([])[0].shape == (users, items) 19 | assert dataset.build_user_features([]).getnnz() == users 20 | assert dataset.build_item_features([]).getnnz() == items 21 | 22 | 23 | def test_fitting_no_identity(): 24 | 25 | users, items = 10, 100 26 | 27 | dataset = Dataset(user_identity_features=False, item_identity_features=False) 28 | dataset.fit(range(users), range(items)) 29 | 30 | assert dataset.interactions_shape() == (users, items) 31 | assert dataset.user_features_shape() == (users, 0) 32 | assert dataset.item_features_shape() == (items, 0) 33 | 34 | assert dataset.build_interactions([])[0].shape == (users, items) 35 | assert dataset.build_user_features([], normalize=False).getnnz() == 0 36 | assert dataset.build_item_features([], normalize=False).getnnz() == 0 37 | 38 | 39 | def test_exceptions(): 40 | 41 | users, items = 10, 100 42 | 43 | dataset = Dataset() 44 | dataset.fit(range(users), range(items)) 45 | 46 | with pytest.raises(ValueError): 47 | dataset.build_interactions([(users + 1, 0)]) 48 | 49 | with pytest.raises(ValueError): 50 | dataset.build_interactions([(0, items + 1)]) 51 | 52 | dataset.fit_partial([users + 1], [items + 1]) 53 | 
dataset.build_interactions([(users + 1, 0)]) 54 | dataset.build_interactions([(0, items + 1)]) 55 | 56 | 57 | def test_build_features(): 58 | 59 | users, items = 10, 100 60 | 61 | dataset = Dataset(user_identity_features=False, item_identity_features=False) 62 | dataset.fit( 63 | range(users), 64 | range(items), 65 | ["user:{}".format(x) for x in range(users)], 66 | ["item:{}".format(x) for x in range(items)], 67 | ) 68 | 69 | # Build from lists 70 | user_features = dataset.build_user_features( 71 | [ 72 | (user_id, ["user:{}".format(x) for x in range(users)]) 73 | for user_id in range(users) 74 | ] 75 | ) 76 | assert user_features.getnnz() == users**2 77 | 78 | item_features = dataset.build_item_features( 79 | [ 80 | (item_id, ["item:{}".format(x) for x in range(items)]) 81 | for item_id in range(items) 82 | ] 83 | ) 84 | assert item_features.getnnz() == items**2 85 | 86 | # Build from dicts 87 | user_features = dataset.build_user_features( 88 | [ 89 | (user_id, {"user:{}".format(x): float(x) for x in range(users)}) 90 | for user_id in range(users) 91 | ], 92 | normalize=False, 93 | ) 94 | 95 | assert np.all(user_features.todense() == np.array([list(range(users))] * users)) 96 | 97 | item_features = dataset.build_item_features( 98 | [ 99 | (item_id, {"item:{}".format(x): float(x) for x in range(items)}) 100 | for item_id in range(items) 101 | ], 102 | normalize=False, 103 | ) 104 | 105 | assert np.all(item_features.todense() == np.array([list(range(items))] * items)) 106 | 107 | # Test normalization 108 | item_features = dataset.build_item_features( 109 | [ 110 | (item_id, {"item:{}".format(x): float(x) for x in range(items)}) 111 | for item_id in range(items) 112 | ] 113 | ) 114 | 115 | assert np.all(item_features.sum(1) == 1.0) 116 | -------------------------------------------------------------------------------- /tests/test_datasets.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import numpy as np 4 | 5 | import scipy.sparse as sp 6 | 7 | from lightfm.datasets import fetch_movielens, fetch_stackexchange 8 | 9 | 10 | def test_basic_fetching_movielens(): 11 | 12 | data = fetch_movielens() 13 | 14 | assert isinstance(data["train"], sp.coo_matrix) 15 | assert isinstance(data["test"], sp.coo_matrix) 16 | 17 | assert data["train"].shape == data["test"].shape 18 | assert data["train"].shape == (943, 1682) 19 | assert (data["train"].getnnz() + data["test"].getnnz()) == 100000 20 | 21 | assert data["item_features"].shape == (1682, 1682) 22 | assert len(data["item_feature_labels"]) == 1682 23 | assert data["item_feature_labels"] is data["item_labels"] 24 | 25 | data = fetch_movielens(genre_features=True) 26 | 27 | assert data["item_features"].shape == (1682, len(data["item_feature_labels"])) 28 | assert data["item_feature_labels"] is not data["item_labels"] 29 | 30 | with pytest.raises(ValueError): 31 | data = fetch_movielens(indicator_features=False, genre_features=False) 32 | 33 | 34 | @pytest.mark.skip(reason="Runs out of memory in CI") 35 | def test_basic_fetching_stackexchange(): 36 | 37 | test_fractions = (0.2, 0.5, 0.6) 38 | 39 | for test_fraction in test_fractions: 40 | data = fetch_stackexchange( 41 | "crossvalidated", 42 | min_training_interactions=0, 43 | test_set_fraction=test_fraction, 44 | ) 45 | 46 | train = data["train"] 47 | test = data["test"] 48 | 49 | assert isinstance(train, sp.coo_matrix) 50 | assert isinstance(test, sp.coo_matrix) 51 | 52 | assert train.shape == test.shape 53 | 54 | frac = float(test.getnnz()) 
/ (train.getnnz() + test.getnnz()) 55 | assert abs(frac - test_fraction) < 0.01 56 | 57 | for dataset in ("crossvalidated", "stackoverflow"): 58 | 59 | data = fetch_stackexchange( 60 | dataset, 61 | min_training_interactions=0, 62 | indicator_features=True, 63 | tag_features=False, 64 | ) 65 | assert isinstance(data["item_features"], sp.csr_matrix) 66 | assert ( 67 | data["item_features"].shape[0] 68 | == data["item_features"].shape[1] 69 | == data["train"].shape[1] 70 | ) 71 | 72 | data = fetch_stackexchange( 73 | dataset, 74 | min_training_interactions=0, 75 | indicator_features=False, 76 | tag_features=True, 77 | ) 78 | assert isinstance(data["item_features"], sp.csr_matrix) 79 | assert data["item_features"].shape[0] > data["item_features"].shape[1] 80 | 81 | data = fetch_stackexchange( 82 | dataset, 83 | min_training_interactions=0, 84 | indicator_features=True, 85 | tag_features=True, 86 | ) 87 | assert isinstance(data["item_features"], sp.csr_matrix) 88 | assert data["item_features"].shape[0] < data["item_features"].shape[1] 89 | 90 | if dataset == "crossvalidated": 91 | assert data["train"].shape == (9431, 72360) 92 | else: 93 | assert data["train"].shape == (1349835, 11280896) 94 | 95 | assert np.all(data["train"].data == 1.0) 96 | assert np.all(data["test"].data == 1.0) 97 | -------------------------------------------------------------------------------- /tests/test_evaluation.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import pytest 4 | 5 | import scipy.sparse as sp 6 | 7 | from sklearn.metrics import roc_auc_score 8 | 9 | from lightfm.lightfm import LightFM 10 | from lightfm import evaluation 11 | 12 | 13 | def _generate_data(num_users, num_items, density=0.1, test_fraction=0.2): 14 | # Generate a dataset where every user has interactions 15 | # in both the train and the test set. 
16 | 17 | train = sp.lil_matrix((num_users, num_items), dtype=np.float32) 18 | test = sp.lil_matrix((num_users, num_items), dtype=np.float32) 19 | 20 | for user_id in range(num_users): 21 | positives = np.random.choice( 22 | num_items, size=int(density * num_items), replace=False 23 | ) 24 | 25 | for item_id in positives[: int(test_fraction * len(positives))]: 26 | test[user_id, item_id] = 1.0 27 | 28 | for item_id in positives[int(test_fraction * len(positives)) :]: 29 | train[user_id, item_id] = 1.0 30 | 31 | return train.tocoo(), test.tocoo() 32 | 33 | 34 | def _precision_at_k( 35 | model, ground_truth, k, train=None, user_features=None, item_features=None 36 | ): 37 | # Alternative test implementation 38 | 39 | ground_truth = ground_truth.tocsr() 40 | 41 | no_users, no_items = ground_truth.shape 42 | 43 | pid_array = np.arange(no_items, dtype=np.int32) 44 | 45 | precisions = [] 46 | 47 | uid_array = np.empty(no_items, dtype=np.int32) 48 | 49 | if train is not None: 50 | train = train.tocsr() 51 | 52 | for user_id, row in enumerate(ground_truth): 53 | uid_array.fill(user_id) 54 | 55 | predictions = model.predict( 56 | uid_array, 57 | pid_array, 58 | user_features=user_features, 59 | item_features=item_features, 60 | num_threads=4, 61 | ) 62 | if train is not None: 63 | train_items = train[user_id].indices 64 | top_k = set( 65 | [x for x in np.argsort(-predictions) if x not in train_items][:k] 66 | ) 67 | else: 68 | top_k = set(np.argsort(-predictions)[:k]) 69 | 70 | true_pids = set(row.indices[row.data == 1]) 71 | 72 | if true_pids: 73 | precisions.append(len(top_k & true_pids) / float(k)) 74 | 75 | return sum(precisions) / len(precisions) 76 | 77 | 78 | def _recall_at_k( 79 | model, ground_truth, k, train=None, user_features=None, item_features=None 80 | ): 81 | # Alternative test implementation 82 | 83 | ground_truth = ground_truth.tocsr() 84 | 85 | no_users, no_items = ground_truth.shape 86 | 87 | pid_array = np.arange(no_items, dtype=np.int32) 88 | 89 | recalls = [] 90 | 91 | uid_array = np.empty(no_items, dtype=np.int32) 92 | 93 | if train is not None: 94 | train = train.tocsr() 95 | 96 | for user_id, row in enumerate(ground_truth): 97 | uid_array.fill(user_id) 98 | 99 | predictions = model.predict( 100 | uid_array, 101 | pid_array, 102 | user_features=user_features, 103 | item_features=item_features, 104 | num_threads=4, 105 | ) 106 | if train is not None: 107 | train_items = train[user_id].indices 108 | top_k = set( 109 | [x for x in np.argsort(-predictions) if x not in train_items][:k] 110 | ) 111 | else: 112 | top_k = set(np.argsort(-predictions)[:k]) 113 | 114 | true_pids = set(row.indices[row.data == 1]) 115 | 116 | if true_pids: 117 | recalls.append(len(top_k & true_pids) / float(len(true_pids))) 118 | 119 | return sum(recalls) / len(recalls) 120 | 121 | 122 | def _auc(model, ground_truth, train=None, user_features=None, item_features=None): 123 | 124 | ground_truth = ground_truth.tocsr() 125 | 126 | no_users, no_items = ground_truth.shape 127 | 128 | pid_array = np.arange(no_items, dtype=np.int32) 129 | 130 | scores = [] 131 | 132 | if train is not None: 133 | train = train.tocsr() 134 | 135 | for user_id, row in enumerate(ground_truth): 136 | uid_array = np.empty(no_items, dtype=np.int32) 137 | uid_array.fill(user_id) 138 | predictions = model.predict( 139 | uid_array, 140 | pid_array, 141 | user_features=user_features, 142 | item_features=item_features, 143 | num_threads=4, 144 | ) 145 | 146 | true_pids = row.indices[row.data == 1] 147 | 148 | grnd = np.zeros(no_items, 
dtype=np.int32) 149 | grnd[true_pids] = 1 150 | 151 | if not len(true_pids): 152 | continue 153 | 154 | if train is not None: 155 | train_indices = train[user_id].indices 156 | not_in_train = np.array([x not in train_indices for x in range(no_items)]) 157 | scores.append(roc_auc_score(grnd[not_in_train], predictions[not_in_train])) 158 | else: 159 | scores.append(roc_auc_score(grnd, predictions)) 160 | 161 | return scores 162 | 163 | 164 | def test_precision_at_k(): 165 | 166 | no_users, no_items = (10, 100) 167 | 168 | train, test = _generate_data(no_users, no_items) 169 | 170 | model = LightFM(loss="bpr") 171 | 172 | # We want a high precision to catch the k=1 case 173 | model.fit_partial(test) 174 | 175 | for k in (10, 5, 1): 176 | 177 | # Without omitting train interactions 178 | precision = evaluation.precision_at_k(model, test, k=k) 179 | expected_mean_precision = _precision_at_k(model, test, k) 180 | 181 | assert np.allclose(precision.mean(), expected_mean_precision) 182 | assert len(precision) == (test.getnnz(axis=1) > 0).sum() 183 | assert ( 184 | len(evaluation.precision_at_k(model, train, preserve_rows=True)) 185 | == test.shape[0] 186 | ) 187 | 188 | # With omitting train interactions 189 | precision = evaluation.precision_at_k( 190 | model, test, k=k, train_interactions=train 191 | ) 192 | expected_mean_precision = _precision_at_k(model, test, k, train=train) 193 | 194 | assert np.allclose(precision.mean(), expected_mean_precision) 195 | 196 | 197 | def test_precision_at_k_with_ties(): 198 | 199 | no_users, no_items = (10, 100) 200 | 201 | train, test = _generate_data(no_users, no_items) 202 | 203 | model = LightFM(loss="bpr") 204 | model.fit_partial(train) 205 | 206 | # Make all predictions zero 207 | model.user_embeddings = np.zeros_like(model.user_embeddings) 208 | model.item_embeddings = np.zeros_like(model.item_embeddings) 209 | model.user_biases = np.zeros_like(model.user_biases) 210 | model.item_biases = np.zeros_like(model.item_biases) 211 | 212 | k = 10 213 | 214 | precision = evaluation.precision_at_k(model, test, k=k) 215 | 216 | # Pessimistic precision with all ties 217 | assert precision.mean() == 0.0 218 | 219 | 220 | def test_recall_at_k(): 221 | 222 | no_users, no_items = (10, 100) 223 | 224 | train, test = _generate_data(no_users, no_items) 225 | 226 | model = LightFM(loss="bpr") 227 | model.fit_partial(test) 228 | 229 | for k in (10, 5, 1): 230 | 231 | # Without omitting train interactions 232 | recall = evaluation.recall_at_k(model, test, k=k) 233 | expected_mean_recall = _recall_at_k(model, test, k) 234 | 235 | assert np.allclose(recall.mean(), expected_mean_recall) 236 | assert len(recall) == (test.getnnz(axis=1) > 0).sum() 237 | assert ( 238 | len(evaluation.recall_at_k(model, train, preserve_rows=True)) 239 | == test.shape[0] 240 | ) 241 | 242 | # With omitting train interactions 243 | recall = evaluation.recall_at_k(model, test, k=k, train_interactions=train) 244 | expected_mean_recall = _recall_at_k(model, test, k, train=train) 245 | 246 | assert np.allclose(recall.mean(), expected_mean_recall) 247 | 248 | 249 | def test_auc_score(): 250 | 251 | no_users, no_items = (10, 100) 252 | 253 | train, test = _generate_data(no_users, no_items) 254 | 255 | model = LightFM(loss="bpr") 256 | model.fit_partial(train) 257 | 258 | auc = evaluation.auc_score(model, test, num_threads=2) 259 | expected_auc = np.array(_auc(model, test)) 260 | 261 | assert auc.shape == expected_auc.shape 262 | assert np.abs(auc.mean() - expected_auc.mean()) < 0.01 263 | assert len(auc) 
== (test.getnnz(axis=1) > 0).sum() 264 | assert len(evaluation.auc_score(model, train, preserve_rows=True)) == test.shape[0] 265 | 266 | # With omitting train interactions 267 | auc = evaluation.auc_score(model, test, train_interactions=train, num_threads=2) 268 | expected_auc = np.array(_auc(model, test, train)) 269 | assert np.abs(auc.mean() - expected_auc.mean()) < 0.01 270 | 271 | 272 | def test_intersections_check(): 273 | 274 | no_users, no_items = (10, 100) 275 | 276 | train, test = _generate_data(no_users, no_items) 277 | 278 | model = LightFM(loss="bpr") 279 | model.fit_partial(train) 280 | 281 | # check error is raised when train and test have interactions in common 282 | with pytest.raises(ValueError): 283 | evaluation.auc_score( 284 | model, train, train_interactions=train, check_intersections=True 285 | ) 286 | 287 | with pytest.raises(ValueError): 288 | evaluation.recall_at_k( 289 | model, train, train_interactions=train, check_intersections=True 290 | ) 291 | 292 | with pytest.raises(ValueError): 293 | evaluation.precision_at_k( 294 | model, train, train_interactions=train, check_intersections=True 295 | ) 296 | 297 | with pytest.raises(ValueError): 298 | evaluation.reciprocal_rank( 299 | model, train, train_interactions=train, check_intersections=True 300 | ) 301 | 302 | # check no errors raised when train and test have no interactions in common 303 | evaluation.auc_score( 304 | model, test, train_interactions=train, check_intersections=True 305 | ) 306 | evaluation.recall_at_k( 307 | model, test, train_interactions=train, check_intersections=True 308 | ) 309 | evaluation.precision_at_k( 310 | model, test, train_interactions=train, check_intersections=True 311 | ) 312 | evaluation.reciprocal_rank( 313 | model, test, train_interactions=train, check_intersections=True 314 | ) 315 | 316 | # check no error is raised when there are intersections but flag is False 317 | evaluation.auc_score( 318 | model, train, train_interactions=train, check_intersections=False 319 | ) 320 | evaluation.recall_at_k( 321 | model, train, train_interactions=train, check_intersections=False 322 | ) 323 | evaluation.precision_at_k( 324 | model, train, train_interactions=train, check_intersections=False 325 | ) 326 | evaluation.reciprocal_rank( 327 | model, train, train_interactions=train, check_intersections=False 328 | ) 329 | -------------------------------------------------------------------------------- /tests/test_fast_functions.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import scipy.sparse as sp 4 | 5 | 6 | from lightfm import _lightfm_fast 7 | 8 | 9 | def test_in_positives(): 10 | 11 | mat = sp.csr_matrix(np.array([[0, 1], [1, 0]])).astype(np.float32) 12 | 13 | assert not _lightfm_fast.__test_in_positives(0, 0, _lightfm_fast.CSRMatrix(mat)) 14 | assert _lightfm_fast.__test_in_positives(0, 1, _lightfm_fast.CSRMatrix(mat)) 15 | 16 | assert _lightfm_fast.__test_in_positives(1, 0, _lightfm_fast.CSRMatrix(mat)) 17 | assert not _lightfm_fast.__test_in_positives(1, 1, _lightfm_fast.CSRMatrix(mat)) 18 | --------------------------------------------------------------------------------
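The dump above already contains everything needed to exercise the evaluation metrics it defines. As a quick orientation, here is a minimal usage sketch that is not part of the repository: it fits a WARP model on the bundled MovieLens data and scores it with precision_at_k and auc_score, excluding training interactions in the way the docstrings above describe. The hyperparameter choices (no_components=30, epochs=10, k=10) are illustrative only, not recommendations taken from the source.

# Minimal usage sketch (not part of the repository): ties together
# lightfm.LightFM, lightfm.datasets.fetch_movielens and lightfm.evaluation.
# Hyperparameter values below are arbitrary illustrative choices.
from lightfm import LightFM
from lightfm.datasets import fetch_movielens
from lightfm.evaluation import precision_at_k, auc_score

# fetch_movielens returns disjoint train/test interaction matrices
# (943 users x 1682 items), as asserted in tests/test_datasets.py above.
data = fetch_movielens()
train, test = data["train"], data["test"]

# Fit a WARP-loss model; epochs and no_components are illustrative.
model = LightFM(loss="warp", no_components=30)
model.fit(train, epochs=10, num_threads=2)

# Evaluate on the held-out split, passing train_interactions so that
# known positives from training are not re-recommended, and keeping
# check_intersections=True to guard against a bad data split.
train_precision = precision_at_k(model, train, k=10, num_threads=2).mean()
test_precision = precision_at_k(
    model, test, train_interactions=train, k=10, num_threads=2
).mean()
test_auc = auc_score(
    model, test, train_interactions=train, check_intersections=True, num_threads=2
).mean()

print("precision@10: train {:.3f}, test {:.3f}".format(train_precision, test_precision))
print("test AUC: {:.3f}".format(test_auc))

Passing train_interactions keeps training positives out of the ranking, so the test-set numbers reflect only held-out items; leaving check_intersections at its default of True makes the call fail loudly if the two matrices overlap, which is exactly the behaviour exercised in tests/test_evaluation.py::test_intersections_check above.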