├── rankfm
    ├── __init__.py
    ├── mt19937ar.pxd
    ├── utils.py
    ├── mt19937ar
    │   ├── mt19937ar.h
    │   ├── readme-mt.txt
    │   └── mt19937ar.c
    ├── evaluation.py
    ├── _rankfm.pyx
    └── rankfm.py
├── .gitattributes
├── dist
    ├── rankfm-0.1.0.tar.gz
    ├── rankfm-0.1.1.tar.gz
    ├── rankfm-0.1.2.tar.gz
    ├── rankfm-0.1.3.tar.gz
    ├── rankfm-0.2.3.tar.gz
    ├── rankfm-0.2.5.tar.gz
    ├── rankfm-0.1.0-py3-none-any.whl
    ├── rankfm-0.1.1-py3-none-any.whl
    ├── rankfm-0.1.2-py3-none-any.whl
    ├── rankfm-0.1.3-py3-none-any.whl
    ├── rankfm-0.2.0-cp37-cp37m-macosx_10_7_x86_64.whl
    ├── rankfm-0.2.2-cp37-cp37m-macosx_10_9_x86_64.whl
    ├── rankfm-0.2.3-cp37-cp37m-macosx_10_9_x86_64.whl
    └── rankfm-0.2.5-cp37-cp37m-macosx_10_7_x86_64.whl
├── images
    └── UnderConstruction.png
├── MANIFEST.in
├── Makefile
├── docs
    ├── source
    │   ├── evaluation.rst
    │   ├── rankfm.rst
    │   ├── index.rst
    │   ├── home.rst
    │   ├── quickstart.rst
    │   └── conf.py
    └── Makefile
├── requirements.txt
├── .gitignore
├── .circleci
    └── config.yml
├── setup.py
├── README.md
├── tests
    └── test_rankfm.py
└── LICENSE


/rankfm/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | *.ipynb linguist-detectable=false
2 | *.py linguist-detectable=true
3 | 


--------------------------------------------------------------------------------
/dist/rankfm-0.1.0.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/etlundquist/rankfm/HEAD/dist/rankfm-0.1.0.tar.gz


--------------------------------------------------------------------------------
/dist/rankfm-0.1.1.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/etlundquist/rankfm/HEAD/dist/rankfm-0.1.1.tar.gz


--------------------------------------------------------------------------------
/dist/rankfm-0.1.2.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/etlundquist/rankfm/HEAD/dist/rankfm-0.1.2.tar.gz


--------------------------------------------------------------------------------
/dist/rankfm-0.1.3.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/etlundquist/rankfm/HEAD/dist/rankfm-0.1.3.tar.gz


--------------------------------------------------------------------------------
/dist/rankfm-0.2.3.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/etlundquist/rankfm/HEAD/dist/rankfm-0.2.3.tar.gz


--------------------------------------------------------------------------------
/dist/rankfm-0.2.5.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/etlundquist/rankfm/HEAD/dist/rankfm-0.2.5.tar.gz


--------------------------------------------------------------------------------
/images/UnderConstruction.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/etlundquist/rankfm/HEAD/images/UnderConstruction.png


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | recursive-include rankfm *.h *.c *.pyx *.py
2 | recursive-exclude tests *
3 | recursive-exclude examples *


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | .PHONY: lint test
2 | 
3 | # run flake8 static checks over the package source
4 | lint:
5 | 	python -m flake8 --ignore W3,E3,E5,E74 rankfm/
6 | 
7 | # run the unit test suite (-r Efp: summarize errors, failures, and passes)
8 | test:
9 | 	python -m pytest -r Efp tests/


--------------------------------------------------------------------------------
/dist/rankfm-0.1.0-py3-none-any.whl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/etlundquist/rankfm/HEAD/dist/rankfm-0.1.0-py3-none-any.whl


--------------------------------------------------------------------------------
/dist/rankfm-0.1.1-py3-none-any.whl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/etlundquist/rankfm/HEAD/dist/rankfm-0.1.1-py3-none-any.whl


--------------------------------------------------------------------------------
/dist/rankfm-0.1.2-py3-none-any.whl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/etlundquist/rankfm/HEAD/dist/rankfm-0.1.2-py3-none-any.whl


--------------------------------------------------------------------------------
/dist/rankfm-0.1.3-py3-none-any.whl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/etlundquist/rankfm/HEAD/dist/rankfm-0.1.3-py3-none-any.whl


--------------------------------------------------------------------------------
/docs/source/evaluation.rst:
--------------------------------------------------------------------------------
1 | Model Evaluation
2 | ================
3 | 
4 | .. automodule:: rankfm.evaluation
5 |     :members:
6 |     :undoc-members:
7 | 


--------------------------------------------------------------------------------
/dist/rankfm-0.2.0-cp37-cp37m-macosx_10_7_x86_64.whl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/etlundquist/rankfm/HEAD/dist/rankfm-0.2.0-cp37-cp37m-macosx_10_7_x86_64.whl


--------------------------------------------------------------------------------
/dist/rankfm-0.2.2-cp37-cp37m-macosx_10_9_x86_64.whl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/etlundquist/rankfm/HEAD/dist/rankfm-0.2.2-cp37-cp37m-macosx_10_9_x86_64.whl


--------------------------------------------------------------------------------
/dist/rankfm-0.2.3-cp37-cp37m-macosx_10_9_x86_64.whl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/etlundquist/rankfm/HEAD/dist/rankfm-0.2.3-cp37-cp37m-macosx_10_9_x86_64.whl


--------------------------------------------------------------------------------
/dist/rankfm-0.2.5-cp37-cp37m-macosx_10_7_x86_64.whl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/etlundquist/rankfm/HEAD/dist/rankfm-0.2.5-cp37-cp37m-macosx_10_7_x86_64.whl


--------------------------------------------------------------------------------
/docs/source/rankfm.rst:
--------------------------------------------------------------------------------
1 | RankFM
2 | ======
3 | 
4 | .. autoclass:: rankfm.rankfm.RankFM
5 |     :members:
6 |     :undoc-members:
7 |     :inherited-members:
8 | 
9 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | # core package requirements
 2 | numpy>=1.15
 3 | pandas>=0.24
 4 | 
 5 | # additional development/testing requirements
 6 | flake8>=3.5
 7 | pytest>=5.3
 8 | 
 9 | 
10 | 


--------------------------------------------------------------------------------
/docs/source/index.rst:
--------------------------------------------------------------------------------
 1 | .. include:: home.rst
 2 | 
 3 | 
 4 | Contents 
 5 | ========
 6 | 
 7 | .. toctree::
 8 |    :maxdepth: 2
 9 | 
10 |    Home <home>
11 |    Quickstart <quickstart>
12 |    RankFM Model <rankfm>
13 |    Model Evaluation <evaluation>
14 | 
15 | 


--------------------------------------------------------------------------------
/rankfm/mt19937ar.pxd:
--------------------------------------------------------------------------------
 1 | """
 2 | cython declarations for the Mersenne Twister RNG
 3 | http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/emt.html
 4 | """
 5 | 
 6 | cdef extern from "mt19937ar/mt19937ar.h":
 7 | 
 8 |     # initialize mt[N] by setting a seed
 9 |     void init_genrand(unsigned long s) nogil
10 | 
11 |     # generate a random uint32 number
12 |     unsigned long genrand_int32() nogil
13 | 
14 |     # generate a random [0.0, 1.0) real number
15 |     double genrand_real2() nogil
16 | 
17 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # exclude data and private notebooks
 2 | data/
 3 | output/
 4 | old/
 5 | cython/
 6 | examples/ignore/
 7 | 
 8 | # cython generated files
 9 | *.so
10 | *.html
11 | 
12 | # system files
13 | *.DS_Store
14 | *.pyc
15 | *.ipynb_checkpoints
16 | __pycache__/
17 | 
18 | # distribution / packaging
19 | .Python
20 | build/
21 | eggs/
22 | .eggs/
23 | lib/
24 | lib64/
25 | parts/
26 | sdist/
27 | pip-wheel-metadata/
28 | share/python-wheels/
29 | *.egg-info/
30 | .installed.cfg
31 | *.egg
32 | MANIFEST
33 | 
34 | # spark stuff
35 | */derby.log
36 | */metastore_db/
37 | 


--------------------------------------------------------------------------------
/rankfm/utils.py:
--------------------------------------------------------------------------------
 1 | """
 2 | rankfm general utility functions
 3 | """
 4 | 
 5 | def get_data(obj):
 6 |     """get the numeric data from either a pd.dataframe or np.ndarray
 7 | 
 8 |     :param obj: pd.dataframe or np.ndarray
 9 |     :return: the object's underlying np.ndarray data
10 |     """
11 | 
12 |     if obj.__class__.__name__ in ('DataFrame', 'Series'):
13 |         data = obj.values
14 |     elif obj.__class__.__name__ == 'ndarray':
15 |         data = obj
16 |     else:
17 |         raise TypeError("input data must be in either pd.dataframe/pd.series or np.ndarray format")
18 |     return data
19 | 


--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
 1 | # Minimal makefile for Sphinx documentation
 2 | #
 3 | 
 4 | # You can set these variables from the command line.
 5 | SPHINXOPTS    =
 6 | SPHINXBUILD   = sphinx-build
 7 | SOURCEDIR     = source
 8 | BUILDDIR      = build
 9 | 
10 | # Put it first so that "make" without argument is like "make help".
11 | help:
12 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
13 | 
14 | .PHONY: help Makefile
15 | 
16 | # Catch-all target: route all unknown targets to Sphinx using the new
17 | # "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
18 | %: Makefile
19 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)


--------------------------------------------------------------------------------
/.circleci/config.yml:
--------------------------------------------------------------------------------
 1 | version: 2
 2 | 
 3 | jobs:
 4 |   build:
 5 |     docker:
 6 |       - image: circleci/python:3.6.4
 7 |     steps:
 8 |       - checkout
 9 |       - run:
10 |           command: |
11 |             sudo chown -R circleci:circleci /usr/local/bin
12 |             sudo chown -R circleci:circleci /usr/local/lib/python3.6/site-packages
13 |       - restore_cache:
14 |           key: deps-{{ .Branch }}-{{ checksum "requirements.txt" }}
15 |       - run:
16 |           command: |
17 |             pip install --upgrade pip
18 |             pip install -r requirements.txt
19 |             python setup.py build_ext --inplace
20 |       - save_cache:
21 |           key: deps-{{ .Branch }}-{{ checksum "requirements.txt" }}
22 |           paths:
23 |             - /usr/local/bin
24 |             - /usr/local/lib/python3.6/site-packages
25 |       - run: python -m flake8 --ignore W3,E3,E5,E74 rankfm/
26 |       - run: python -m pytest -r Efp tests/
27 | 
28 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import sys
 3 | import glob
 4 | from setuptools import Extension, setup
 5 | 
 6 | NAME = 'rankfm'
 7 | VERSION = '0.2.5'
 8 | 
 9 | # define the extension packages to include
10 | # ----------------------------------------
11 | 
12 | # prefer the pre-generated C source when it exists (path is relative to the current working directory)
13 | if glob.glob('rankfm/_rankfm.c'):
14 |     print("building extensions with pre-generated C source...")
15 |     use_cython = False
16 |     ext = 'c'
17 | else:
18 |     print("re-generating C source with cythonize...")
19 |     from Cython.Build import cythonize  # imported lazily: Cython is only required on this branch
20 |     use_cython = True
21 |     ext = 'pyx'
22 | 
23 | # compiler flags: optimize machine code and silence warnings - NOTE(review): GCC/Clang-style flags, confirm other compilers are out of scope
24 | disabled_warnings = ['-Wno-unused-function', '-Wno-uninitialized']
25 | compile_args = ['-O2', '-ffast-math'] + disabled_warnings
26 | 
27 | # define the rankfm._rankfm extension: the [.c] or [.pyx] module source (per [ext]) built together with the wrapped MT C source
28 | extensions = [
29 |     Extension(
30 |         name='rankfm._rankfm',
31 |         sources=['rankfm/_rankfm.{ext}'.format(ext=ext), 'rankfm/mt19937ar/mt19937ar.c'],
32 |         extra_compile_args=compile_args
33 |     )
34 | ]
35 | 
36 | # pass the extensions through cythonize() only when no pre-generated C source was found above
37 | if use_cython:
38 |     extensions = cythonize(extensions)
39 | 
40 | # define the main package setup function
41 | # --------------------------------------
42 | 
43 | setup(
44 |     name=NAME,
45 |     version=VERSION,
46 |     description='a python implementation of the generic factorization machines model class '
47 |                 'adapted for collaborative filtering recommendation problems '
48 |                 'with implicit feedback user-item interaction data '
49 |                 'and (optionally) additional user/item side features',
50 |     author='Eric Lundquist',
51 |     author_email='e.t.lundquist@gmail.com',
52 |     url='https://github.com/etlundquist/rankfm',
53 |     keywords=['machine', 'learning', 'recommendation', 'factorization', 'machines', 'implicit'],
54 |     license='GNU General Public License v3.0',
55 |     packages=['rankfm'],
56 |     ext_modules=extensions,
57 |     zip_safe=False,
58 |     python_requires='>=3.6',
59 |     install_requires=['numpy>=1.15', 'pandas>=0.24']
60 | )
61 | 
62 | 


--------------------------------------------------------------------------------
/docs/source/home.rst:
--------------------------------------------------------------------------------
 1 | Welcome to RankFM's Documentation!
 2 | ==================================
 3 | 
 4 | RankFM is a python implementation of the general Factorization Machines model class described in `Rendle 2010 <https://www.csie.ntu.edu.tw/~b97053/paper/Rendle2010FM.pdf>`_ adapted for collaborative filtering recommendation/ranking problems with implicit feedback user-item interaction data. It uses `Bayesian Personalized Ranking (BPR) <https://arxiv.org/pdf/1205.2618.pdf>`_ and a variant of `Weighted Approximate-Rank Pairwise (WARP) <http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.587.3946&rep=rep1&type=pdf>`_ loss to learn model weights via Stochastic Gradient Descent (SGD). It can (optionally) incorporate individual training sample weights and/or user/item auxiliary features to augment the main interaction data for model training.
 5 | 
 6 | The core training/prediction/recommendation methods are written in `Cython <https://cython.org/>`_. This makes it possible to scale to millions of users, items, and interactions. Designed for ease-of-use, RankFM accepts both `pd.DataFrame` and `np.ndarray` inputs. You do not have to convert your data to `scipy.sparse` matrices or re-map user/item identifiers to matrix indexes prior to use - RankFM internally maps all user/item identifiers to zero-based integer indexes, but always converts its outputs back to the original user/item identifiers from your data, which can be arbitrary (non-zero-based, non-consecutive) integers or even strings.
 7 | 
 8 | In addition to the familiar `fit()`, `predict()`, `recommend()` methods, RankFM includes additional utilities `similar_users()` and `similar_items()` to find the most similar users/items to a given user/item based on latent factor space embeddings. A number of popular recommendation/ranking evaluation metric functions have been included in the separate `evaluation` module to streamline model tuning and validation.
 9 | 
10 | Dependencies
11 | ------------
12 | 
13 | * Python 3.6+
14 | * numpy >= 1.15
15 | * pandas >= 0.24
16 | 
17 | Installation
18 | ------------
19 | 
20 | Prerequisites
21 | ^^^^^^^^^^^^^
22 | 
23 | To install RankFM's C extensions you will need the `GNU Compiler Collection (GCC) <https://gcc.gnu.org/>`_. Check to see whether you already have it installed:
24 | 
25 | .. code:: bash
26 | 
27 |   gcc --version
28 | 
29 | If you don't have it already you can easily install it using `Homebrew <https://brew.sh/>`_ on OSX or your default linux package manager:
30 | 
31 | .. code:: bash
32 | 
33 |   # OSX
34 |   brew install gcc
35 | 
36 |   # linux
37 |   sudo yum install gcc
38 | 
39 |   # ensure [gcc] has been installed correctly and is on the system PATH
40 |   gcc --version
41 | 
42 | Package Installation
43 | ^^^^^^^^^^^^^^^^^^^^
44 | 
45 | You can install the latest published version from PyPI using `pip`:
46 | 
47 | .. code:: bash
48 | 
49 |   pip install rankfm
50 | 
51 | Or alternatively install the current development build directly from GitHub:
52 | 
53 | .. code:: bash
54 |   
55 |   pip install git+https://github.com/etlundquist/rankfm.git#egg=rankfm
56 | 
57 | 
58 | 


--------------------------------------------------------------------------------
/rankfm/mt19937ar/mt19937ar.h:
--------------------------------------------------------------------------------
 1 | /* 
 2 |    A C-program for MT19937, with initialization improved 2002/1/26.
 3 |    Coded by Takuji Nishimura and Makoto Matsumoto.
 4 | 
 5 |    Before using, initialize the state by using init_genrand(seed)  
 6 |    or init_by_array(init_key, key_length).
 7 | 
 8 |    Copyright (C) 1997 - 2002, Makoto Matsumoto and Takuji Nishimura,
 9 |    All rights reserved.
10 |    Copyright (C) 2005, Mutsuo Saito
11 |    All rights reserved.
12 | 
13 |    Redistribution and use in source and binary forms, with or without
14 |    modification, are permitted provided that the following conditions
15 |    are met:
16 | 
17 |      1. Redistributions of source code must retain the above copyright
18 |         notice, this list of conditions and the following disclaimer.
19 | 
20 |      2. Redistributions in binary form must reproduce the above copyright
21 |         notice, this list of conditions and the following disclaimer in the
22 |         documentation and/or other materials provided with the distribution.
23 | 
24 |      3. The names of its contributors may not be used to endorse or promote 
25 |         products derived from this software without specific prior written 
26 |         permission.
27 | 
28 |    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
29 |    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
30 |    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
31 |    A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
32 |    CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
33 |    EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
34 |    PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
35 |    PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
36 |    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
37 |    NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
38 |    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
39 | 
40 | 
41 |    Any feedback is very welcome.
42 |    http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/emt.html
43 |    email: m-mat @ math.sci.hiroshima-u.ac.jp (remove space)
44 | */
45 | 
46 | /* initializes mt[N] with a seed */
47 | void init_genrand(unsigned long s);
48 | 
49 | /* initialize by an array with array-length */
50 | /* init_key is the array for initializing keys */
51 | /* key_length is its length */
52 | /* slight change for C++, 2004/2/26 */
53 | void init_by_array(unsigned long init_key[], int key_length);
54 | 
55 | /* generates a random number on [0,0xffffffff]-interval */
56 | unsigned long genrand_int32(void);
57 | 
58 | /* generates a random number on [0,0x7fffffff]-interval */
59 | long genrand_int31(void);
60 | 
61 | /* These real versions are due to Isaku Wada, 2002/01/09 added */
62 | /* generates a random number on [0,1]-real-interval */
63 | double genrand_real1(void);
64 | 
65 | /* generates a random number on [0,1)-real-interval */
66 | double genrand_real2(void);
67 | 
68 | /* generates a random number on (0,1)-real-interval */
69 | double genrand_real3(void);
70 | 
71 | /* generates a random number on [0,1) with 53-bit resolution*/
72 | double genrand_res53(void);
73 | 


--------------------------------------------------------------------------------
/rankfm/mt19937ar/readme-mt.txt:
--------------------------------------------------------------------------------
 1 | This is a Mersenne Twister pseudorandom number generator
 2 | with period 2^19937-1 with improved initialization scheme,
 3 | modified on 2002/1/26 by Takuji Nishimura and Makoto Matsumoto. 
 4 | modified on 2005/4/26 by Mutsuo Saito
 5 | 
 6 | Contents of this tar ball:
 7 | readme-mt.txt	 this file
 8 | mt19937ar.c	 the C source (ar: initialize by ARray)
 9 | mt19937ar.h      the C header file for mt19937ar
10 | mtTest.c         the C test main program of mt19937ar.c
11 | mt19937ar.out	 Test outputs of six types generators. 1000 for each
12 | 
13 | 1. Initialization
14 |   The initialization scheme for the previous versions of MT
15 | (e.g. 1999/10/28 version or earlier) has a tiny problem, that
16 | the most significant bits of the seed is not well reflected 
17 | to the state vector of MT.
18 | 
19 | This version (2002/1/26) has two initialization schemes:
20 | init_genrand(seed) and init_by_array(init_key, key_length).
21 | 
22 | init_genrand(seed) initializes the state vector by using
23 | one unsigned 32-bit integer "seed", which may be zero.
24 | 
25 | init_by_array(init_key, key_length) initializes the state vector 
26 | by using an array init_key[] of unsigned 32-bit integers
27 | of length key_kength. If key_length is smaller than 624,
28 | then each array of 32-bit integers gives distinct initial
29 | state vector. This is useful if you want a larger seed space
30 | than 32-bit word.
31 | 
32 | 2. Generation
33 | After initialization, the following type of pseudorandom numbers
34 | are available. 
35 | 
36 | genrand_int32() generates unsigned 32-bit integers.
37 | genrand_int31() generates unsigned 31-bit integers.
38 | genrand_real1() generates uniform real in [0,1] (32-bit resolution). 
39 | genrand_real2() generates uniform real in [0,1) (32-bit resolution). 
40 | genrand_real3() generates uniform real in (0,1) (32-bit resolution).
41 | genrand_res53() generates uniform real in [0,1) with 53-bit resolution.
42 | 
43 | Note: the last five functions call the first one. 
44 | if you need more speed for these five functions, you may
45 | suppress the function call by copying genrand_int32() and
46 | replacing the last return(), following to these five functions.
47 | 
48 | 3. main()
49 | main() is an example to initialize with an array of length 4,
50 | then 1000 outputs of unsigned 32-bit integers, 
51 | then 1000 outputs of real [0,1) numbers. 
52 | 
53 | 4. The outputs
54 | The output of the mt19937ar.c is in the file mt19937ar.out.
55 | If you revise or translate the code, check the output
56 | by using this file. 
57 | 
58 | 5. Cryptography
59 | This generator is not cryptoraphically secure. 
60 | You need to use a one-way (or hash) function to obtain 
61 | a secure random sequence.
62 | 
63 | 6. Correspondence
64 | See:
65 | URL http://www.math.keio.ac.jp/matumoto/emt.html
66 | email matumoto@math.keio.ac.jp, nisimura@sci.kj.yamagata-u.ac.jp
67 | 
68 | 7. Reference
69 | M. Matsumoto and T. Nishimura,
70 | "Mersenne Twister: A 623-Dimensionally Equidistributed Uniform  
71 | Pseudo-Random Number Generator",
72 | ACM Transactions on Modeling and Computer Simulation,
73 | Vol. 8, No. 1, January 1998, pp 3--30.
74 | 
75 | -------
76 | Copyright (C) 1997 - 2002, Makoto Matsumoto and Takuji Nishimura,
77 | All rights reserved.
78 | Copyright (C) 2005, Mutsuo Saito
79 | All rights reserved.
80 | 


--------------------------------------------------------------------------------
/docs/source/quickstart.rst:
--------------------------------------------------------------------------------
  1 | Quickstart
  2 | ==========
  3 | 
  4 | Let's work through a simple example of fitting a model, generating recommendations, evaluating performance, and assessing some item-item similarities. The data we'll be using here may already be somewhat familiar: you know it, you love it, it's the `MovieLens 1M <https://grouplens.org/datasets/movielens/1m/>`_!
  5 | 
  6 | Let's first look at the required shape of the interaction data:
  7 | 
  8 | ======= =======
  9 | user_id item_id    
 10 | ======= =======
 11 | 3       233
 12 | 5       377
 13 | 8       610
 14 | ======= =======
 15 | 
 16 | It has just two columns: a `user_id` and an `item_id` (you can name these fields whatever you want or use a numpy array instead). Notice that there is no `rating` column - this library is for **implicit feedback** data (e.g. watches, page views, purchases, clicks) as opposed to **explicit feedback** data (e.g. 1-5 ratings, thumbs up/down). Implicit feedback is far more common in real-world recommendation contexts and doesn't suffer from the `missing-not-at-random problem <https://resources.bibblio.org/hubfs/share/2018-01-24-RecSysLDN-Ravelin.pdf>`_ of pure explicit feedback approaches.
 17 | 
 18 | Now let's import the library, initialize our model, and fit on the training data:
 19 | 
 20 | .. code:: python
 21 | 
 22 |   from rankfm.rankfm import RankFM
 23 |   model = RankFM(factors=20, loss='warp', max_samples=20, learning_rate=0.1, learning_schedule='invscaling')
 24 |   model.fit(interactions_train, epochs=20, verbose=True)
 25 | 
 26 | If you set `verbose=True` the model will print the current epoch number as well as the epoch's log-likelihood during training. This can be useful to gauge both computational speed and training gains by epoch. If the log-likelihood is not increasing then try upping the `learning_rate` or lowering the (`alpha`, `beta`) regularization strength terms. If the log-likelihood is starting to bounce up and down try lowering the `learning_rate` or using `learning_schedule='invscaling'` to decrease the learning rate over time. If you run into overflow errors then decrease the feature and/or sample-weight magnitudes and try upping `beta`, especially if you have a small number of dense user-features and/or item-features. Selecting `BPR` loss will lead to faster training times, but `WARP` loss typically yields superior model performance.
 27 | 
 28 | Now let's generate some user-item model scores from the validation data:
 29 | 
 30 | .. code:: python
 31 | 
 32 |   valid_scores = model.predict(interactions_valid, cold_start='nan')
 33 | 
 34 | This will produce an array of real-valued model scores generated using the Factorization Machines model equation. You can interpret it as a measure of the predicted utility of item (i) for user (u). The `cold_start='nan'` option can be used to set scores to `np.nan` for user/item pairs not found in the training data, or `cold_start='drop'` can be specified to drop those pairs so the results contain no missing values.
 35 | 
 36 | Now let's generate our topN recommended movies for each user:
 37 | 
 38 | .. code:: python
 39 | 
 40 |   valid_recs = model.recommend(valid_users, n_items=10, filter_previous=True, cold_start='drop')
 41 | 
 42 | The input should be a `pd.Series`, `np.ndarray` or `list` of `user_id` values. You can use `filter_previous=True` to prevent generating recommendations that include any items observed by the user in the training data, which could be useful depending on your application context. The result will be a `pd.DataFrame` where `user_id` values will be the index and the rows will be each user's top recommended items in descending order (best item is in column 0):
 43 | 
 44 | =======  ====  ====  ====  ====  ====  ====  ====  ==== ====  ====
 45 | user_id     0     1     2     3     4     5     6     7    8     9
 46 | =======  ====  ====  ====  ====  ====  ====  ====  ==== ====  ====
 47 | 3        2396  1265   357    34  2858  3175     1  2028   17   356
 48 | 5         608  1617  1610  3418   590   474   858   377  924  1036
 49 | 8         589  1036  2571  2028  2000  1220  1197   110  780  1954
 50 | =======  ====  ====  ====  ====  ====  ====  ====  ==== ====  ====
 51 | 
 52 | Now let's see how the model is performing with respect to the included validation metrics evaluated on the hold-out data:
 53 | 
 54 | .. code:: python
 55 | 
 56 |   from rankfm.evaluation import hit_rate, reciprocal_rank, discounted_cumulative_gain, precision, recall
 57 | 
 58 |   valid_hit_rate = hit_rate(model, interactions_valid, k=10)
 59 |   valid_reciprocal_rank = reciprocal_rank(model, interactions_valid, k=10)
 60 |   valid_dcg = discounted_cumulative_gain(model, interactions_valid, k=10)
 61 |   valid_precision = precision(model, interactions_valid, k=10)
 62 |   valid_recall = recall(model, interactions_valid, k=10)
 63 | 
 64 | .. parsed-literal::
 65 | 
 66 |   hit_rate: 0.796
 67 |   reciprocal_rank: 0.339
 68 |   dcg: 0.734
 69 |   precision: 0.159
 70 |   recall: 0.077
 71 | 
 72 | `That's a Bingo! <https://www.youtube.com/watch?v=q5pESPQpXxE>`_
 73 | 
 74 | Now let's find the most similar other movies for a few movies based on their embedding representations in latent factor space:
 75 | 
 76 | .. code:: python
 77 | 
 78 |   # Terminator 2: Judgment Day (1991)
 79 |   model.similar_items(589, n_items=10)
 80 | 
 81 | .. parsed-literal::
 82 | 
 83 |   2571                       Matrix, The (1999)
 84 |   1527                Fifth Element, The (1997)
 85 |   2916                      Total Recall (1990)
 86 |   3527                          Predator (1987)
 87 |   780             Independence Day (ID4) (1996)
 88 |   1909    X-Files: Fight the Future, The (1998)
 89 |   733                          Rock, The (1996)
 90 |   1376     Star Trek IV: The Voyage Home (1986)
 91 |   480                      Jurassic Park (1993)
 92 |   1200                            Aliens (1986)
 93 | 
 94 | `I hope you like explosions... <https://www.youtube.com/watch?v=uENYMZNzg9w>`_
 95 | 
 96 | .. code:: python
 97 | 
 98 |   # Being John Malkovich (1999)
 99 |   model.similar_items(2997, n_items=10)
100 | 
101 | .. parsed-literal::
102 | 
103 |   2599           Election (1999)
104 |   3174    Man on the Moon (1999)
105 |   2858    American Beauty (1999)
106 |   3317        Wonder Boys (2000)
107 |   223              Clerks (1994)
108 |   3897      Almost Famous (2000)
109 |   2395           Rushmore (1998)
110 |   2502       Office Space (1999)
111 |   2908     Boys Don't Cry (1999)
112 |   3481      High Fidelity (2000)
113 | 
114 | `Let's get weird... <https://www.youtube.com/watch?v=lIpev8JXJHQ&t=5s>`_
115 | 
116 | 


--------------------------------------------------------------------------------
/docs/source/conf.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | #
  3 | # Configuration file for the Sphinx documentation builder.
  4 | #
  5 | # This file does only contain a selection of the most common options. For a
  6 | # full list see the documentation:
  7 | # http://www.sphinx-doc.org/en/master/config
  8 | 
  9 | # -- Path setup --------------------------------------------------------------
 10 | 
 11 | # If extensions (or modules to document with autodoc) are in another directory,
 12 | # add these directories to sys.path here. If the directory is relative to the
 13 | # documentation root, use os.path.abspath to make it absolute, like shown here.
 14 | 
 15 | import os
 16 | import sys
 17 | import subprocess
 18 | 
 19 | repo_root = os.path.abspath('../..')
 20 | sys.path.insert(0, repo_root)
 21 | 
 22 | cmd = "cd {} && python setup.py build_ext --inplace".format(repo_root)
 23 | subprocess.run(cmd, shell=True, check=True)
 24 | 
 25 | # import dependencies
 26 | # -------------------
 27 | 
 28 | import rankfm
 29 | import sphinx_rtd_theme
 30 | 
 31 | # -- Project information -----------------------------------------------------
 32 | 
 33 | project = 'rankfm'
 34 | copyright = '2020, Eric Lundquist'
 35 | author = 'Eric Lundquist'
 36 | 
 37 | # The short X.Y version
 38 | version = ''
 39 | # The full version, including alpha/beta/rc tags
 40 | release = '0.2.5'
 41 | 
 42 | 
 43 | # -- General configuration ---------------------------------------------------
 44 | 
 45 | # If your documentation needs a minimal Sphinx version, state it here.
 46 | #
 47 | # needs_sphinx = '1.0'
 48 | 
 49 | # Add any Sphinx extension module names here, as strings. They can be
 50 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
 51 | # ones.
 52 | extensions = [
 53 |     'sphinx.ext.autodoc',
 54 |     'sphinx.ext.mathjax',
 55 |     'sphinx.ext.viewcode',
 56 |     'sphinx_rtd_theme'
 57 | ]
 58 | 
 59 | # add autodoc default options
 60 | autodoc_default_options = {
 61 |     'member-order': 'bysource',
 62 |     'special-members': '__init__',
 63 |     'exclude-members': '__weakref__'
 64 | }
 65 | 
 66 | # Add any paths that contain templates here, relative to this directory.
 67 | templates_path = ['_templates']
 68 | 
 69 | # The suffix(es) of source filenames.
 70 | # You can specify multiple suffix as a list of string:
 71 | #
 72 | # source_suffix = ['.rst', '.md']
 73 | source_suffix = '.rst'
 74 | 
 75 | # The master toctree document.
 76 | master_doc = 'index'
 77 | 
 78 | # The language for content autogenerated by Sphinx. Refer to documentation
 79 | # for a list of supported languages.
 80 | #
 81 | # This is also used if you do content translation via gettext catalogs.
 82 | # Usually you set "language" from the command line for these cases.
 83 | language = None
 84 | 
 85 | # List of patterns, relative to source directory, that match files and
 86 | # directories to ignore when looking for source files.
 87 | # This pattern also affects html_static_path and html_extra_path.
 88 | exclude_patterns = []
 89 | 
 90 | # The name of the Pygments (syntax highlighting) style to use.
 91 | pygments_style = None
 92 | 
 93 | 
 94 | # -- Options for HTML output -------------------------------------------------
 95 | 
 96 | # The theme to use for HTML and HTML Help pages.  See the documentation for
 97 | # a list of builtin themes.
 98 | #
 99 | html_theme = 'sphinx_rtd_theme'
100 | 
101 | # Theme options are theme-specific and customize the look and feel of a theme
102 | # further.  For a list of options available for each theme, see the
103 | # documentation.
104 | #
105 | # html_theme_options = {}
106 | 
107 | # Add any paths that contain custom static files (such as style sheets) here,
108 | # relative to this directory. They are copied after the builtin static files,
109 | # so a file named "default.css" will overwrite the builtin "default.css".
110 | html_static_path = ['_static']
111 | 
112 | # Custom sidebar templates, must be a dictionary that maps document names
113 | # to template names.
114 | #
115 | # The default sidebars (for documents that don't match any pattern) are
116 | # defined by theme itself.  Builtin themes are using these templates by
117 | # default: ``['localtoc.html', 'relations.html', 'sourcelink.html',
118 | # 'searchbox.html']``.
119 | #
120 | # html_sidebars = {}
121 | 
122 | 
123 | # -- Options for HTMLHelp output ---------------------------------------------
124 | 
125 | # Output file base name for HTML help builder.
126 | htmlhelp_basename = 'rankfmdoc'
127 | 
128 | 
129 | # -- Options for LaTeX output ------------------------------------------------
130 | 
latex_elements = {
    # The paper size ('letterpaper' or 'a4paper').
    #
    # 'papersize': 'letterpaper',

    # The font size ('10pt', '11pt' or '12pt').
    #
    # 'pointsize': '10pt',

    # Additional stuff for the LaTeX preamble.
    #
    # 'preamble': '',

    # Latex figure (float) alignment
    #
    # 'figure_align': 'htbp',
}

# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title,
#  author, documentclass [howto, manual, or own class]).
latex_documents = [
    (master_doc, 'rankfm.tex', 'rankfm Documentation',
     'Eric Lundquist', 'manual'),
]


# -- Options for manual page output ------------------------------------------

# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
    (master_doc, 'rankfm', 'rankfm Documentation',
     [author], 1)
]


# -- Options for Texinfo output ----------------------------------------------

# Grouping the document tree into Texinfo files. List of tuples
# (source start file, target name, title, author,
#  dir menu entry, description, category)
#
# The description replaces the generator's placeholder text
# ('One line description of project.') with an actual summary of the package.
texinfo_documents = [
    (master_doc, 'rankfm', 'rankfm Documentation',
     author, 'rankfm',
     'Factorization Machines for recommendation and ranking problems with implicit feedback data.',
     'Miscellaneous'),
]


# -- Options for Epub output -------------------------------------------------

# Bibliographic Dublin Core info.
epub_title = project

# The unique identifier of the text. This can be a ISBN number
# or the project homepage.
#
# epub_identifier = ''

# A unique identification for the text.
#
# epub_uid = ''

# A list of files that should not be packed into the epub file.
epub_exclude_files = ['search.html']
196 | 
197 | 
198 | # -- Extension configuration -------------------------------------------------
199 | 


--------------------------------------------------------------------------------
/rankfm/mt19937ar/mt19937ar.c:
--------------------------------------------------------------------------------
  1 | /*
  2 |    A C-program for MT19937, with initialization improved 2002/1/26.
  3 |    Coded by Takuji Nishimura and Makoto Matsumoto.
  4 | 
  5 |    Before using, initialize the state by using init_genrand(seed)
  6 |    or init_by_array(init_key, key_length).
  7 | 
  8 |    Copyright (C) 1997 - 2002, Makoto Matsumoto and Takuji Nishimura,
  9 |    All rights reserved.
 10 |    Copyright (C) 2005, Mutsuo Saito,
 11 |    All rights reserved.
 12 | 
 13 |    Redistribution and use in source and binary forms, with or without
 14 |    modification, are permitted provided that the following conditions
 15 |    are met:
 16 | 
 17 |      1. Redistributions of source code must retain the above copyright
 18 |         notice, this list of conditions and the following disclaimer.
 19 | 
 20 |      2. Redistributions in binary form must reproduce the above copyright
 21 |         notice, this list of conditions and the following disclaimer in the
 22 |         documentation and/or other materials provided with the distribution.
 23 | 
 24 |      3. The names of its contributors may not be used to endorse or promote
 25 |         products derived from this software without specific prior written
 26 |         permission.
 27 | 
 28 |    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 29 |    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 30 |    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 31 |    A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 32 |    CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 33 |    EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 34 |    PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 35 |    PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 36 |    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 37 |    NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 38 |    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 39 | 
 40 | 
 41 |    Any feedback is very welcome.
 42 |    http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/emt.html
 43 |    email: m-mat @ math.sci.hiroshima-u.ac.jp (remove space)
 44 | */
 45 | 
 46 | #include <stdio.h>
 47 | #include "mt19937ar.h"
 48 | 
/* Period parameters */
/* N: number of words in the state vector mt[].                         */
/* M: index offset used by the recurrence in genrand_int32().           */
#define N 624
#define M 397
#define MATRIX_A 0x9908b0dfUL   /* constant vector a */
#define UPPER_MASK 0x80000000UL /* most significant w-r bits */
#define LOWER_MASK 0x7fffffffUL /* least significant r bits */

/* Generator state is held in file-scope statics, so every function in   */
/* this file operates on the same single generator; nothing here guards  */
/* it against concurrent use.                                            */
static unsigned long mt[N]; /* the array for the state vector  */
static int mti=N+1; /* mti==N+1 means mt[N] is not initialized */
 58 | 
/* initializes mt[N] with a seed */
/* Only the low 32 bits of s are used. Every word of mt[] is filled from  */
/* its predecessor, and the loop leaves mti == N, so the next call to     */
/* genrand_int32() regenerates the whole state before returning a value.  */
void init_genrand(unsigned long s)
{
    mt[0]= s & 0xffffffffUL;
    for (mti=1; mti<N; mti++) {
        mt[mti] =
	    (1812433253UL * (mt[mti-1] ^ (mt[mti-1] >> 30)) + mti);
        /* See Knuth TAOCP Vol2. 3rd Ed. P.106 for multiplier. */
        /* In the previous versions, MSBs of the seed affect   */
        /* only MSBs of the array mt[].                        */
        /* 2002/01/09 modified by Makoto Matsumoto             */
        mt[mti] &= 0xffffffffUL;
        /* for >32 bit machines */
    }
}
 74 | 
 75 | /* initialize by an array with array-length */
 76 | /* init_key is the array for initializing keys */
 77 | /* key_length is its length */
 78 | /* slight change for C++, 2004/2/26 */
 79 | void init_by_array(unsigned long init_key[], int key_length)
 80 | {
 81 |     int i, j, k;
 82 |     init_genrand(19650218UL);
 83 |     i=1; j=0;
 84 |     k = (N>key_length ? N : key_length);
 85 |     for (; k; k--) {
 86 |         mt[i] = (mt[i] ^ ((mt[i-1] ^ (mt[i-1] >> 30)) * 1664525UL))
 87 |           + init_key[j] + j; /* non linear */
 88 |         mt[i] &= 0xffffffffUL; /* for WORDSIZE > 32 machines */
 89 |         i++; j++;
 90 |         if (i>=N) { mt[0] = mt[N-1]; i=1; }
 91 |         if (j>=key_length) j=0;
 92 |     }
 93 |     for (k=N-1; k; k--) {
 94 |         mt[i] = (mt[i] ^ ((mt[i-1] ^ (mt[i-1] >> 30)) * 1566083941UL))
 95 |           - i; /* non linear */
 96 |         mt[i] &= 0xffffffffUL; /* for WORDSIZE > 32 machines */
 97 |         i++;
 98 |         if (i>=N) { mt[0] = mt[N-1]; i=1; }
 99 |     }
100 | 
101 |     mt[0] = 0x80000000UL; /* MSB is 1; assuring non-zero initial array */
102 | }
103 | 
/* generates a random number on [0,0xffffffff]-interval */
/* Values are served one at a time from mt[]; once all N words have been  */
/* consumed (mti >= N) the whole state vector is regenerated in place.    */
/* If the generator was never seeded, the default seed 5489 is used.      */
/* Reads and updates the file-scope state mt[] / mti.                     */
unsigned long genrand_int32(void)
{
    unsigned long y;
    static unsigned long mag01[2]={0x0UL, MATRIX_A};
    /* mag01[x] = x * MATRIX_A  for x=0,1 */

    if (mti >= N) { /* generate N words at one time */
        int kk;

        if (mti == N+1)   /* if init_genrand() has not been called, */
            init_genrand(5489UL); /* a default initial seed is used */

        /* words 0 .. N-M-1: the partner word mt[kk+M] lies ahead in the array */
        for (kk=0;kk<N-M;kk++) {
            y = (mt[kk]&UPPER_MASK)|(mt[kk+1]&LOWER_MASK);
            mt[kk] = mt[kk+M] ^ (y >> 1) ^ mag01[y & 0x1UL];
        }
        /* words N-M .. N-2: kk+M would run past the end, so the index wraps (M-N < 0) */
        for (;kk<N-1;kk++) {
            y = (mt[kk]&UPPER_MASK)|(mt[kk+1]&LOWER_MASK);
            mt[kk] = mt[kk+(M-N)] ^ (y >> 1) ^ mag01[y & 0x1UL];
        }
        /* last word: its "next" neighbour is mt[0] */
        y = (mt[N-1]&UPPER_MASK)|(mt[0]&LOWER_MASK);
        mt[N-1] = mt[M-1] ^ (y >> 1) ^ mag01[y & 0x1UL];

        mti = 0;
    }

    y = mt[mti++];

    /* Tempering */
    y ^= (y >> 11);
    y ^= (y << 7) & 0x9d2c5680UL;
    y ^= (y << 15) & 0xefc60000UL;
    y ^= (y >> 18);

    return y;
}
141 | 
/* Returns a random integer on the [0,0x7fffffff] interval: */
/* one 32-bit draw with its lowest bit shifted out.         */
long genrand_int31(void)
{
    unsigned long draw = genrand_int32();

    return (long)(draw >> 1);
}
147 | 
/* Returns a random double on the closed [0,1] interval. */
double genrand_real1(void)
{
    /* scale factor: one 32-bit draw is divided by 2^32-1 */
    const double scale = 1.0/4294967295.0;
    unsigned long draw = genrand_int32();

    return draw*scale;
}
154 | 
/* Returns a random double on the half-open [0,1) interval. */
double genrand_real2(void)
{
    /* scale factor: one 32-bit draw is divided by 2^32 */
    const double scale = 1.0/4294967296.0;
    unsigned long draw = genrand_int32();

    return draw*scale;
}
161 | 
/* Returns a random double on the open (0,1) interval. */
double genrand_real3(void)
{
    /* shift the draw by one half before dividing by 2^32 */
    const double scale = 1.0/4294967296.0;
    double shifted = ((double)genrand_int32()) + 0.5;

    return shifted*scale;
}
168 | 
/* Returns a random double on [0,1) with 53-bit resolution, */
/* assembled from two consecutive 32-bit draws.             */
double genrand_res53(void)
{
    unsigned long a;
    unsigned long b;

    a = genrand_int32()>>5;  /* first draw: keep the top 27 bits  */
    b = genrand_int32()>>6;  /* second draw: keep the top 26 bits */

    /* a*2^26 + b is a 53-bit integer; dividing by 2^53 scales it into [0,1) */
    return (a*67108864.0+b)*(1.0/9007199254740992.0);
}
/* These real versions are due to Isaku Wada, 2002/01/09 added */
176 | 


--------------------------------------------------------------------------------
/rankfm/evaluation.py:
--------------------------------------------------------------------------------
  1 | """
  2 | rankfm model tuning and evaluation functions
  3 | """
  4 | 
  5 | import numpy as np
  6 | import pandas as pd
  7 | from rankfm.utils import get_data
  8 | 
  9 | def hit_rate(model, test_interactions, k=10, filter_previous=False):
 10 |     """evaluate hit-rate (any match) wrt out-of-sample observed interactions
 11 | 
 12 |     :param model: trained RankFM model instance
 13 |     :param test_interactions: pandas dataframe of out-of-sample observed user/item interactions
 14 |     :param k: number of recommendations to generate for each user
 15 |     :param filter_previous: remove observed training items from generated recommendations
 16 |     :return: the hit rate or proportion of test users with any matching items
 17 |     """
 18 | 
 19 |     # ensure that the model has been fit before attempting to generate predictions
 20 |     assert model.is_fit, "you must fit the model prior to evaluating hold-out metrics"
 21 | 
 22 |     # transform interactions into a user -> items dictionary
 23 |     test_user_items = pd.DataFrame(get_data(test_interactions), columns=['user_id', 'item_id'])
 24 |     test_user_items = test_user_items.groupby('user_id')['item_id'].apply(set).to_dict()
 25 |     test_users = list(test_user_items.keys())
 26 | 
 27 |     # generate topK recommendations for all test users also present in the training data
 28 |     test_recs = model.recommend(users=test_users, n_items=k, filter_previous=filter_previous, cold_start='drop')
 29 |     comm_user = test_recs.index.values
 30 | 
 31 |     # calculate the hit rate (percentage of users with any relevant recommendation) wrt common users
 32 |     hit_rate = np.mean([int(len(set(test_recs.loc[u]) & test_user_items[u]) > 0) for u in comm_user])
 33 |     return hit_rate
 34 | 
 35 | 
 36 | def reciprocal_rank(model, test_interactions, k=10, filter_previous=False):
 37 |     """evaluate reciprocal rank wrt out-of-sample observed interactions
 38 | 
 39 |     :param model: trained RankFM model instance
 40 |     :param test_interactions: pandas dataframe of out-of-sample observed user/item interactions
 41 |     :param k: number of recommendations to generate for each user
 42 |     :param filter_previous: remove observed training items from generated recommendations
 43 |     :return: mean reciprocal rank wrt the test users
 44 |     """
 45 | 
 46 |     # ensure that the model has been fit before attempting to generate predictions
 47 |     assert model.is_fit, "you must fit the model prior to evaluating hold-out metrics"
 48 | 
 49 |     # transform interactions into a user -> items dictionary
 50 |     test_user_items = pd.DataFrame(get_data(test_interactions), columns=['user_id', 'item_id'])
 51 |     test_user_items = test_user_items.groupby('user_id')['item_id'].apply(set).to_dict()
 52 |     test_users = list(test_user_items.keys())
 53 | 
 54 |     # generate topK recommendations for all test users also present in the training data
 55 |     test_recs = model.recommend(users=test_users, n_items=k, filter_previous=filter_previous, cold_start='drop')
 56 |     comm_user = test_recs.index.values
 57 | 
 58 |     # calculate the reciprocal rank (inverse rank of the first relevant recommended item) wrt common users
 59 |     match_indexes = [np.where(test_recs.loc[u].isin(set(test_recs.loc[u]) & test_user_items[u]))[0] for u in comm_user]
 60 |     reciprocal_rank = np.mean([1 / (np.min(index) + 1) if len(index) > 0 else 0 for index in match_indexes])
 61 |     return reciprocal_rank
 62 | 
 63 | 
 64 | def discounted_cumulative_gain(model, test_interactions, k=10, filter_previous=False):
 65 |     """evaluate discounted cumulative gain wrt out-of-sample observed interactions
 66 | 
 67 |     :param model: trained RankFM model instance
 68 |     :param test_interactions: pandas dataframe of out-of-sample observed user/item interactions
 69 |     :param k: number of recommendations to generate for each user
 70 |     :param filter_previous: remove observed training items from generated recommendations
 71 |     :return: mean discounted cumulative gain wrt the test users
 72 |     """
 73 | 
 74 |     # ensure that the model has been fit before attempting to generate predictions
 75 |     assert model.is_fit, "you must fit the model prior to evaluating hold-out metrics"
 76 | 
 77 |     # transform interactions into a user -> items dictionary
 78 |     test_user_items = pd.DataFrame(get_data(test_interactions), columns=['user_id', 'item_id'])
 79 |     test_user_items = test_user_items.groupby('user_id')['item_id'].apply(set).to_dict()
 80 |     test_users = list(test_user_items.keys())
 81 | 
 82 |     # generate topK recommendations for all test users also present in the training data
 83 |     test_recs = model.recommend(users=test_users, n_items=k, filter_previous=filter_previous, cold_start='drop')
 84 |     comm_user = test_recs.index.values
 85 | 
 86 |     # calculate the discounted cumulative gain (sum of inverse log scaled ranks of relevant items) wrt common users
 87 |     match_indexes = [np.where(test_recs.loc[u].isin(set(test_recs.loc[u]) & test_user_items[u]))[0] for u in comm_user]
 88 |     discounted_cumulative_gain = np.mean([np.sum(1 / np.log2(index + 2)) if len(index) > 0 else 0 for index in match_indexes])
 89 |     return discounted_cumulative_gain
 90 | 
 91 | 
 92 | def precision(model, test_interactions, k=10, filter_previous=False):
 93 |     """evaluate precision wrt out-of-sample observed interactions
 94 | 
 95 |     :param model: trained RankFM model instance
 96 |     :param test_interactions: pandas dataframe of out-of-sample observed user/item interactions
 97 |     :param k: number of recommendations to generate for each user
 98 |     :param filter_previous: remove observed training items from generated recommendations
 99 |     :return: mean precision wrt the test users
100 |     """
101 | 
102 |     # ensure that the model has been fit before attempting to generate predictions
103 |     assert model.is_fit, "you must fit the model prior to evaluating hold-out metrics"
104 | 
105 |     # transform interactions into a user -> items dictionary
106 |     test_user_items = pd.DataFrame(get_data(test_interactions), columns=['user_id', 'item_id'])
107 |     test_user_items = test_user_items.groupby('user_id')['item_id'].apply(set).to_dict()
108 |     test_users = list(test_user_items.keys())
109 | 
110 |     # generate topK recommendations for all test users also present in the training data
111 |     test_recs = model.recommend(users=test_users, n_items=k, filter_previous=filter_previous, cold_start='drop')
112 |     comm_user = test_recs.index.values
113 | 
114 |     # calculate average precision wrt common users
115 |     precision = np.mean([len(set(test_recs.loc[u]) & test_user_items[u]) / len(test_recs.loc[u]) for u in comm_user])
116 |     return precision
117 | 
118 | 
def recall(model, test_interactions, k=10, filter_previous=False):
    """compute mean recall: the fraction of each user's test items that were recommended

    :param model: trained RankFM model instance
    :param test_interactions: pandas dataframe of out-of-sample observed user/item interactions
    :param k: number of recommendations to generate for each user
    :param filter_previous: remove observed training items from generated recommendations
    :return: mean over evaluated users of (relevant recommended items / observed test items)
    """

    # metrics are only meaningful for a model that has already been trained
    assert model.is_fit, "you must fit the model prior to evaluating hold-out metrics"

    # build a {user_id: {item_id, ...}} lookup from the hold-out interactions
    holdout = pd.DataFrame(get_data(test_interactions), columns=['user_id', 'item_id'])
    items_by_user = holdout.groupby('user_id')['item_id'].apply(set).to_dict()
    users = list(items_by_user.keys())

    # top-k recommendations; cold_start='drop' leaves only users the model can score
    recs = model.recommend(users=users, n_items=k, filter_previous=filter_previous, cold_start='drop')

    per_user = []
    for user in recs.index.values:
        observed = items_by_user[user]
        n_relevant = len(set(recs.loc[user]) & observed)
        per_user.append(n_relevant / len(observed))
    return np.mean(per_user)
144 | 
145 | 
def diversity(model, test_interactions, k=10, filter_previous=False):
    """summarize how widely each item is recommended across the test users

    :param model: trained RankFM model instance
    :param test_interactions: pandas dataframe of out-of-sample observed user/item interactions
    :param k: number of recommendations to generate for each user
    :param filter_previous: remove observed training items from generated recommendations
    :return: dataframe with one row per model item holding the count (cnt_users) and share (pct_users) of users it was recommended to
    """

    # metrics are only meaningful for a model that has already been trained
    assert model.is_fit, "you must fit the model prior to evaluating hold-out metrics"

    # distinct users appearing in the hold-out interactions
    holdout = pd.DataFrame(get_data(test_interactions), columns=['user_id', 'item_id'])
    users = holdout['user_id'].unique()

    # top-k recommendations; cold_start='drop' leaves only users the model can score
    recs = model.recommend(users=users, n_items=k, filter_previous=filter_previous, cold_start='drop')
    n_users = len(recs.index.values)

    # reshape wide [user x rank] recommendations into long (user_id, item_id) pairs
    long_recs = recs.stack().reset_index()
    long_recs = long_recs.drop('level_1', axis=1)
    long_recs.columns = ['user_id', 'item_id']

    # count recommended users per item, adding zero rows for items never recommended
    counts = long_recs.groupby('item_id')['user_id'].count().to_frame('cnt_users')
    counts = counts.reindex(model.item_id.values, fill_value=0)
    counts = counts.sort_values('cnt_users', ascending=False).reset_index()
    counts['pct_users'] = counts['cnt_users'] / n_users
    return counts
176 | 
177 | 
178 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # RankFM
  2 | 
  3 | [![PyPI version](https://badge.fury.io/py/rankfm.svg)](https://badge.fury.io/py/rankfm)
  4 | [![CircleCI](https://circleci.com/gh/etlundquist/rankfm.svg?style=shield)](https://circleci.com/gh/etlundquist/rankfm)
  5 | [![Documentation Status](https://readthedocs.org/projects/rankfm/badge/?version=latest)](https://rankfm.readthedocs.io/en/latest/?badge=latest)
  6 | [![License: GPL v3](https://img.shields.io/badge/License-GPLv3-blue.svg)](https://www.gnu.org/licenses/gpl-3.0)
  7 | 
  8 | RankFM is a python implementation of the general [Factorization Machines](https://www.csie.ntu.edu.tw/~b97053/paper/Rendle2010FM.pdf) model class adapted for collaborative filtering recommendation/ranking problems with implicit feedback user/item interaction data. It uses [Bayesian Personalized Ranking (BPR)](https://arxiv.org/pdf/1205.2618.pdf) and a variant of [Weighted Approximate-Rank Pairwise (WARP)](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.587.3946&rep=rep1&type=pdf) loss to learn model weights via Stochastic Gradient Descent (SGD). It can (optionally) incorporate sample weights and user/item auxiliary features to augment the main interaction data.
  9 | 
 10 | The core (training, prediction, recommendation) methods are written in [Cython](https://cython.org/), making it possible to scale to millions of user/item interactions. Designed for ease-of-use, RankFM accepts both `pd.DataFrame` and `np.ndarray` inputs - you do not have to convert your data to `scipy.sparse` matrices or re-map user/item identifiers prior to use. RankFM internally maps all user/item identifiers to zero-based integer indexes, but always converts its output back to the original user/item identifiers from your data, which can be arbitrary (non-zero-based, non-consecutive) integers or even strings.
 11 | 
 12 | In addition to the familiar `fit()`, `predict()`, `recommend()` methods, RankFM includes additional utilities `similar_users()` and `similar_items()` to find the most similar users/items to a given user/item based on latent factor space embeddings. A number of popular recommendation/ranking evaluation metric functions have been included in the separate `evaluation` module to streamline model tuning and validation.
 13 | 
 14 | * see the **Quickstart** section below to get started with the basic functionality
 15 | * see the `/examples` folder for more in-depth jupyter notebook walkthroughs with several popular open-source data sets
 16 | * see the [Online Documentation](https://rankfm.readthedocs.io/en/latest/) for more comprehensive documentation on the main model class and separate evaluation module
 17 | * see the [Medium Article](https://towardsdatascience.com/factorization-machines-for-item-recommendation-with-implicit-feedback-data-5655a7c749db) for contextual motivation and a detailed mathematical description of the algorithm
 18 | 
 19 | ---
 20 | ### Dependencies
 21 | * Python 3.6+
 22 | * numpy >= 1.15
 23 | * pandas >= 0.24
 24 | 
 25 | ### Installation
 26 | 
 27 | #### Prerequisites
 28 | 
 29 | To install RankFM's C extensions you will need the [GNU Compiler Collection (GCC)](https://gcc.gnu.org/). Check to see whether you already have it installed:
 30 | ```
 31 | gcc --version
 32 | ```
 33 | 
 34 | If you don't have it already you can easily install it using [Homebrew](https://brew.sh/) on OSX or your default linux package manager:
 35 | ```
 36 | # OSX
 37 | brew install gcc
 38 | 
 39 | # linux
 40 | sudo yum install gcc
 41 | 
 42 | # ensure [gcc] has been installed correctly and is on the system PATH
 43 | gcc --version
 44 | ```
 45 | 
 46 | #### Package Installation
 47 | 
 48 | You can install the latest published version from PyPI using `pip`:
 49 | ```
 50 | pip install rankfm
 51 | ```
 52 | Or alternatively install the current development build directly from GitHub:
 53 | ```
 54 | pip install git+https://github.com/etlundquist/rankfm.git#egg=rankfm
 55 | ```
 56 | 
 57 | It's highly recommended that you use an [Anaconda](https://www.anaconda.com/) base environment to ensure that all core numpy C extensions and linear algebra libraries have been installed and configured correctly. Anaconda: it just works.
 58 | 
 59 | ### Quickstart
 60 | Let's work through a simple example of fitting a model, generating recommendations, evaluating performance, and assessing some item-item similarities. The data we'll be using here may already be somewhat familiar: you know it, you love it, it's the [MovieLens 1M](https://grouplens.org/datasets/movielens/1m/)!
 61 | 
 62 | Let's first look at the required shape of the interaction data:
 63 | 
 64 | | user_id | item_id |
 65 | |---------|---------|
 66 | | 3       | 233     |
 67 | | 5       | 377     |
 68 | | 8       | 610     |
 69 | 
 70 | It has just two columns: a `user_id` and an `item_id` (you can name these fields whatever you want or use a numpy array instead). Notice that there is no `rating` column - this library is for **implicit feedback** data (e.g. watches, page views, purchases, clicks) as opposed to **explicit feedback** data (e.g. 1-5 ratings, thumbs up/down). Implicit feedback is far more common in real-world recommendation contexts and doesn't suffer from the [missing-not-at-random problem](https://resources.bibblio.org/hubfs/share/2018-01-24-RecSysLDN-Ravelin.pdf) of pure explicit feedback approaches.
 71 | 
 72 | Now let's import the library, initialize our model, and fit on the training data:
 73 | ```python
 74 | from rankfm.rankfm import RankFM
 75 | model = RankFM(factors=20, loss='warp', max_samples=20, alpha=0.01, sigma=0.1, learning_rate=0.1, learning_schedule='invscaling')
 76 | model.fit(interactions_train, epochs=20, verbose=True)
 77 | # NOTE: this takes about 30 seconds for 750,000 interactions on my 2.3 GHz i5 8GB RAM MacBook
 78 | ```
 79 | If you set `verbose=True` the model will print the current epoch number as well as the epoch's log-likelihood during training. This can be useful to gauge both computational speed and training gains by epoch. If the log likelihood is not increasing then try upping the `learning_rate` or lowering the (`alpha`, `beta`) regularization strength terms. If the log likelihood is starting to bounce up and down try lowering the `learning_rate` or using `learning_schedule='invscaling'` to decrease the learning rate over time. If you run into overflow errors then decrease the feature and/or sample-weight magnitudes and try upping `beta`, especially if you have a small number of dense user-features and/or item-features. Selecting `BPR` loss will lead to faster training times, but `WARP` loss typically yields superior model performance.
 80 | 
 81 | Now let's generate some user-item model scores from the validation data:
 82 | ```python
 83 | valid_scores = model.predict(interactions_valid, cold_start='nan')
 84 | ```
 85 | This will produce an array of real-valued model scores generated using the Factorization Machines model equation. You can interpret it as a measure of the predicted utility of item (i) for user (u). The `cold_start='nan'` option can be used to set scores to `np.nan` for user/item pairs not found in the training data, or `cold_start='drop'` can be specified to drop those pairs so the results contain no missing values.
 86 | 
 87 | Now let's generate our topN recommended movies for each user:
 88 | ```python
 89 | valid_recs = model.recommend(valid_users, n_items=10, filter_previous=True, cold_start='drop')
 90 | ```
 91 | The input should be a `pd.Series`, `np.ndarray` or `list` of `user_id` values. You can use `filter_previous=True` to prevent generating recommendations that include any items observed by the user in the training data, which could be useful depending on your application context. The result will be a `pd.DataFrame` where `user_id` values will be the index and the rows will be each user's top recommended items in descending order (best item is in column 0):
 92 | 
 93 | |   |    0|    1|    2|    3|    4|    5|    6|    7|   8|    9|
 94 | |---|-----|-----|-----|-----|-----|-----|-----|-----|----|-----|
 95 | |3  | 2396| 1265|  357|   34| 2858| 3175|    1| 2028|  17|  356|
 96 | |5  |  608| 1617| 1610| 3418|  590|  474|  858|  377| 924| 1036|
 97 | |8  |  589| 1036| 2571| 2028| 2000| 1220| 1197|  110| 780| 1954|
 98 | 
 99 | Now let's see how the model is performing with respect to the included validation metrics evaluated on the hold-out data:
100 | ```python
101 | from rankfm.evaluation import hit_rate, reciprocal_rank, discounted_cumulative_gain, precision, recall
102 | 
103 | valid_hit_rate = hit_rate(model, interactions_valid, k=10)
104 | valid_reciprocal_rank = reciprocal_rank(model, interactions_valid, k=10)
105 | valid_dcg = discounted_cumulative_gain(model, interactions_valid, k=10)
106 | valid_precision = precision(model, interactions_valid, k=10)
107 | valid_recall = recall(model, interactions_valid, k=10)
108 | ```
109 | ```
110 | hit_rate: 0.796
111 | reciprocal_rank: 0.339
112 | dcg: 0.734
113 | precision: 0.159
114 | recall: 0.077
115 | ```
116 | [That's a Bingo!](https://www.youtube.com/watch?v=q5pESPQpXxE)
117 | 
118 | Now let's find the most similar other movies for a few movies based on their embedding representations in latent factor space:
119 | ```python
120 | # Terminator 2: Judgment Day (1991)
121 | model.similar_items(589, n_items=10)
122 | ```
123 | ```
124 | 2571                       Matrix, The (1999)
125 | 1527                Fifth Element, The (1997)
126 | 2916                      Total Recall (1990)
127 | 3527                          Predator (1987)
128 | 780             Independence Day (ID4) (1996)
129 | 1909    X-Files: Fight the Future, The (1998)
130 | 733                          Rock, The (1996)
131 | 1376     Star Trek IV: The Voyage Home (1986)
132 | 480                      Jurassic Park (1993)
133 | 1200                            Aliens (1986)
134 | ```
135 | [I hope you like explosions...](https://www.youtube.com/watch?v=uENYMZNzg9w)
136 | 
137 | ```python
138 | # Being John Malkovich (1999)
139 | model.similar_items(2997, n_items=10)
140 | ```
141 | ```
142 | 2599           Election (1999)
143 | 3174    Man on the Moon (1999)
144 | 2858    American Beauty (1999)
145 | 3317        Wonder Boys (2000)
146 | 223              Clerks (1994)
147 | 3897      Almost Famous (2000)
148 | 2395           Rushmore (1998)
149 | 2502       Office Space (1999)
150 | 2908     Boys Don't Cry (1999)
151 | 3481      High Fidelity (2000)
152 | ```
153 | [Let's get weird...](https://www.youtube.com/watch?v=lIpev8JXJHQ&t=5s)
154 | 


--------------------------------------------------------------------------------
/tests/test_rankfm.py:
--------------------------------------------------------------------------------
  1 | 
  2 | import pytest
  3 | import numpy as np
  4 | import pandas as pd
  5 | 
  6 | from rankfm.rankfm import RankFM
  7 | from rankfm.evaluation import hit_rate, reciprocal_rank, discounted_cumulative_gain, precision, recall
  8 | 
  9 | # ------------------------------
 10 | # create sample data for testing
 11 | # ------------------------------
 12 | 
 13 | # training interactions
 14 | # ---------------------
 15 | 
 16 | # train interactions as a pd.DataFrame of [int] IDs: users 1-3 with three items each, drawn from items 1-6
 17 | intx_train_pd_int = pd.DataFrame([
 18 |     (1, 1), (1, 3), (1, 5),
 19 |     (2, 1), (2, 2), (2, 6),
 20 |     (3, 3), (3, 6), (3, 4)
 21 | ], columns=['user_id', 'item_id'], dtype=np.int32)
 22 | 
 23 | # the same interaction pattern as a pd.DataFrame of [str] IDs (users X/Y/Z, items A-F)
 24 | intx_train_pd_str = pd.DataFrame([
 25 |     ('X', 'A'), ('X', 'C'), ('X', 'E'),
 26 |     ('Y', 'A'), ('Y', 'B'), ('Y', 'F'),
 27 |     ('Z', 'C'), ('Z', 'F'), ('Z', 'D')
 28 | ], columns=['user_id', 'item_id'])
 29 | 
 30 | # the same integer interactions as a bare np.ndarray (no column names)
 31 | intx_train_np = np.array([
 32 |     (1, 1), (1, 3), (1, 5),
 33 |     (2, 1), (2, 2), (2, 6),
 34 |     (3, 3), (3, 6), (3, 4)
 35 | ])
 36 | 
 37 | # train interactions with an erroneous extra rating column (a third column beyond user/item)
 38 | intx_train_pd_rating = pd.DataFrame([
 39 |     (1, 1, 5), (1, 3, 2), (1, 5, 3),
 40 |     (2, 1, 2), (2, 2, 1), (2, 6, 4),
 41 |     (3, 3, 3), (3, 6, 4), (3, 4, 5)
 42 | ], columns=['user_id', 'item_id', 'rating'], dtype=np.int32)
 43 | 
 44 | # validation interactions that include a user (4) and an item (7) absent from the training interactions above
 45 | intx_valid_disjoint = pd.DataFrame([
 46 |     (1, 1), (1, 3), (1, 5),
 47 |     (2, 1), (2, 2), (2, 7),
 48 |     (4, 3), (4, 7), (4, 4)
 49 | ], columns=['user_id', 'item_id'], dtype=np.int32)
 50 | 
 51 | # user features
 52 | # -------------
 53 | 
 54 | # user features as a pd.DataFrame in the correct format: a [user_id] column followed by numeric feature columns
 55 | uf_pd_good = pd.DataFrame([
 56 |     (1, 0, 1, 5, 3.14),
 57 |     (2, 1, 0, 6, 2.72),
 58 |     (3, 0, 0, 4, 1.62)
 59 | ], columns=['user_id', 'bin_1', 'bin_2', 'int', 'cnt'])
 60 | 
 61 | # the same user features as a np.ndarray (same row values as uf_pd_good, without column names)
 62 | uf_np_good = np.array([
 63 |     (1, 0, 1, 5, 3.14),
 64 |     (2, 1, 0, 6, 2.72),
 65 |     (3, 0, 0, 4, 1.62)
 66 | ])
 67 | 
 68 | # user features lacking the [user_id] column
 69 | uf_no_id = pd.DataFrame([
 70 |     (0, 1, 5, 3.14),
 71 |     (1, 0, 6, 2.72),
 72 |     (0, 0, 4, 1.62)
 73 | ], columns=['bin_1', 'bin_2', 'int', 'cnt'])
 74 | 
 75 | # user features containing a non-numeric (string) feature column
 76 | uf_str_cols = pd.DataFrame([
 77 |     (1, 0, 1, "A", 3.14),
 78 |     (2, 1, 0, "B", 2.72),
 79 |     (3, 0, 0, "C", 1.62)
 80 | ], columns=['user_id', 'bin_1', 'bin_2', 'str', 'cnt'])
 81 | 
 82 | # item features
 83 | # -------------
 84 | 
 85 | # item features as a pd.DataFrame in the correct format: an [item_id] column followed by numeric feature columns
 86 | if_pd_good = pd.DataFrame([
 87 |     (1, 0, 1, 5, 3.14),
 88 |     (2, 1, 0, 6, 2.72),
 89 |     (3, 0, 0, 4, 1.62),
 90 |     (4, 1, 1, 3, 1.05),
 91 |     (5, 1, 0, 6, 0.33),
 92 |     (6, 0, 0, 0, 0.00)
 93 | ], columns=['item_id', 'bin_1', 'bin_2', 'int', 'cnt'])
 94 | 
 95 | # the same item features as a np.ndarray (same row values as if_pd_good, without column names)
 96 | if_np_good = np.array([
 97 |     (1, 0, 1, 5, 3.14),
 98 |     (2, 1, 0, 6, 2.72),
 99 |     (3, 0, 0, 4, 1.62),
100 |     (4, 1, 1, 3, 1.05),
101 |     (5, 1, 0, 6, 0.33),
102 |     (6, 0, 0, 0, 0.00)
103 | ])
104 | 
105 | # item features lacking the [item_id] column
106 | if_no_id = pd.DataFrame([
107 |     (0, 1, 5, 3.14),
108 |     (1, 0, 6, 2.72),
109 |     (0, 0, 4, 1.62),
110 |     (1, 1, 3, 1.05),
111 |     (1, 0, 6, 0.33),
112 |     (0, 0, 0, 0.00)
113 | ], columns=['bin_1', 'bin_2', 'int', 'cnt'])
114 | 
115 | # item features containing a non-numeric (string) feature column
116 | if_str_cols = pd.DataFrame([
117 |     (1, 0, 1, "A", 3.14),
118 |     (2, 1, 0, "B", 2.72),
119 |     (3, 0, 0, "C", 1.62),
120 |     (4, 1, 1, "A", 1.05),
121 |     (5, 1, 0, "F", 0.33),
122 |     (6, 0, 0, "G", 0.00)
123 | ], columns=['item_id', 'bin_1', 'bin_2', 'str', 'cnt'])
124 | 
125 | # user iterables
126 | # --------------
127 | 
128 | train_users = np.array([1, 2, 3])  # exactly the user IDs present in the training interactions
129 | valid_users = np.array([1, 2, 4, 5])  # users 4 and 5 do not appear in the training interactions
130 | 
131 | # ------------------------------
132 | # test basic model functionality
133 | # ------------------------------
134 | 
135 | # model fitting
136 | # -------------
137 | 
138 | params_good = [
139 |     (intx_train_pd_int,       None,       None),  # integer IDs, no features
140 |     (intx_train_pd_str,       None,       None),  # string IDs, no features
141 |     (intx_train_np,           None,       None),  # np.ndarray interactions, no features
142 |     (intx_train_pd_int, uf_pd_good,       None),  # user features only
143 |     (intx_train_pd_int,       None, if_pd_good),  # item features only
144 |     (intx_train_pd_int, uf_pd_good, if_pd_good),  # both feature sets as DataFrames
145 |     (intx_train_pd_int, uf_np_good, if_np_good),  # both feature sets as np.ndarrays
146 | ]
147 | 
148 | @pytest.mark.parametrize("interactions, user_features, item_features", params_good)
149 | def test__fit__good(interactions, user_features, item_features):
150 |     """assert that the model can be successfully fit on the input data"""
151 | 
152 |     model = RankFM(factors=2)
153 |     model.fit(interactions, user_features, item_features, epochs=2, verbose=True)
154 |     assert model.is_fit
155 | 
156 | 
157 | def test__fit__bad__rating_col():
158 |     """ensure that having more than 2 columns in the interaction data causes an assertion failure"""
159 | 
160 |     with pytest.raises(AssertionError):
161 |         model = RankFM(factors=2)
162 |         model.fit(intx_train_pd_rating)
163 | 
164 | 
165 | def test__fit__bad__uf_no_id():
166 |     """ensure that the [user_features] contains a [user_id] column"""
167 | 
168 |     with pytest.raises(KeyError):
169 |         model = RankFM(factors=2)
170 |         model.fit(intx_train_pd_int, user_features=uf_no_id)
171 | 
172 | def test__fit__bad__uf_str_cols():
173 |     """ensure that the [user_features] does not contain string columns"""
174 | 
175 |     with pytest.raises(ValueError):
176 |         model = RankFM(factors=2)
177 |         model.fit(intx_train_pd_int, user_features=uf_str_cols)
178 | 
179 | 
180 | def test__fit__bad__if_no_id():
181 |     """ensure that the [item_features] contains a [item_id] column"""
182 | 
183 |     with pytest.raises(KeyError):
184 |         model = RankFM(factors=2)
185 |         model.fit(intx_train_pd_int, item_features=if_no_id)
186 | 
187 | def test__fit__bad__if_str_cols():
188 |     """ensure that the [item_features] does not contain string columns"""
189 | 
190 |     with pytest.raises(ValueError):
191 |         model = RankFM(factors=2)
192 |         model.fit(intx_train_pd_int, item_features=if_str_cols)
193 | 
194 | # score prediction
195 | # ----------------
196 | 
197 | def test__predict__good__train():
198 |     """test the predict() method on the training inputs"""
199 | 
200 |     model = RankFM(factors=2)
201 |     model.fit(intx_train_pd_int)
202 |     scores = model.predict(intx_train_pd_int)
203 | 
204 |     shape = scores.shape == (9,)
205 |     dtype = scores.dtype == np.float32
206 |     nmiss = np.sum(np.isnan(scores).astype(np.int32)) == 0
207 |     assert shape and dtype and nmiss
208 | 
209 | def test__predict__good__disjoint_nan():
210 |     """test the predict() method on disjoint validation pairs with the cold_start='nan' option"""
211 | 
212 |     model = RankFM(factors=2)
213 |     model.fit(intx_train_pd_int)
214 |     scores = model.predict(intx_valid_disjoint, cold_start='nan')
215 | 
216 |     shape = scores.shape == (9,)
217 |     dtype = scores.dtype == np.float32
218 |     nmiss = np.sum(np.isnan(scores).astype(np.int32)) == 4
219 |     assert shape and dtype and nmiss
220 | 
221 | def test__predict__good__disjoint_drop():
222 |     """test the predict() method on disjoint validation pairs with the cold_start='drop' option"""
223 | 
224 |     model = RankFM(factors=2)
225 |     model.fit(intx_train_pd_int)
226 |     scores = model.predict(intx_valid_disjoint, cold_start='drop')
227 | 
228 |     shape = scores.shape == (5,)
229 |     dtype = scores.dtype == np.float32
230 |     nmiss = np.sum(np.isnan(scores).astype(np.int32)) == 0
231 |     assert shape and dtype and nmiss
232 | 
233 | # user recommendation
234 | # -------------------
235 | 
236 | def test__recommend__good__train():
237 |     """test the recommend() method on the training users"""
238 | 
239 |     model = RankFM(factors=2)
240 |     model.fit(intx_train_pd_int)
241 |     recs = model.recommend(train_users, n_items=3)
242 | 
243 |     klass = isinstance(recs, pd.DataFrame)
244 |     shape = recs.shape == (3, 3)
245 |     index = np.array_equal(recs.index.values, train_users)
246 |     items = recs.isin(intx_train_pd_int['item_id'].values).all().all()
247 |     assert klass and shape and index and items
248 | 
249 | def test__recommend__good__train__filter():
250 |     """test the recommend() method on the training users but filter previous items"""
251 | 
252 |     model = RankFM(factors=2)
253 |     model.fit(intx_train_pd_int)
254 |     recs = model.recommend(train_users, n_items=3, filter_previous=True)
255 | 
256 |     klass = isinstance(recs, pd.DataFrame)
257 |     shape = recs.shape == (3, 3)
258 |     index = np.array_equal(recs.index.values, train_users)
259 |     items = recs.isin(intx_train_pd_int['item_id'].values).all().all()
260 | 
261 |     recs_long = recs.stack().reset_index().drop('level_1', axis=1)
262 |     recs_long.columns = ['user_id', 'item_id']
263 |     intersect = pd.merge(intx_train_pd_int, recs_long, on=['user_id', 'item_id'], how='inner').empty
264 |     assert klass and shape and index and items and intersect
265 | 
266 | def test__recommend__good__valid__nan():
267 |     """test the recommend() method on a disjoint set of validation users"""
268 | 
269 |     model = RankFM(factors=2)
270 |     model.fit(intx_train_pd_int)
271 |     recs = model.recommend(valid_users, n_items=3, cold_start='nan')
272 | 
273 |     klass = isinstance(recs, pd.DataFrame)
274 |     shape = recs.shape == (4, 3)
275 |     index = np.array_equal(sorted(recs.index.values), sorted(valid_users))
276 |     items = recs.dropna().isin(intx_train_pd_int['item_id'].values).all().all()
277 |     new_users = list(set(valid_users) - set(train_users))
278 |     nmiss = recs.loc[new_users].isnull().all().all()
279 |     assert klass and shape and index and items and nmiss
280 | 
281 | def test__recommend__good__valid__drop():
282 |     """test the recommend() method on a disjoint set of validation users"""
283 | 
284 |     model = RankFM(factors=2)
285 |     model.fit(intx_train_pd_int)
286 |     recs = model.recommend(valid_users, n_items=3, cold_start='drop')
287 | 
288 |     klass = isinstance(recs, pd.DataFrame)
289 |     shape = recs.shape == (2, 3)
290 |     index = np.isin(recs.index.values, valid_users).all()
291 |     items = recs.dropna().isin(intx_train_pd_int['item_id'].values).all().all()
292 | 
293 |     same_users = list(set(valid_users) & set(train_users))
294 |     match_users = np.array_equal(sorted(same_users), sorted(recs.index.values))
295 |     assert klass and shape and index and items and match_users
296 | 
297 | # similar items/users
298 | # -------------------
299 | 
300 | def test__similar_items__good():
301 |     """test the similar_items() method for a valid [item_id]"""
302 | 
303 |     model = RankFM(factors=2)
304 |     model.fit(intx_train_pd_int)
305 |     similar = model.similar_items(1, n_items=3)
306 | 
307 |     shape = similar.shape == (3,)
308 |     items = np.isin(similar, intx_train_pd_int['item_id'].unique()).all()
309 |     assert shape and items
310 | 
311 | def test__similar_items__bad():
312 |     """ensure the similar_items() method raises an exception for an item not in training data"""
313 | 
314 |     with pytest.raises(AssertionError):
315 |         model = RankFM(factors=2)
316 |         model.fit(intx_train_pd_int)
317 |         similar = model.similar_items(99, n_items=3)
318 | 
319 | 
320 | def test__similar_users__good():
321 |     """test the similar_users() method for a valid [user_id]"""
322 | 
323 |     model = RankFM(factors=2)
324 |     model.fit(intx_train_pd_int)
325 |     similar = model.similar_users(1, n_users=2)
326 | 
327 |     shape = similar.shape == (2,)
328 |     users = np.isin(similar, intx_train_pd_int['user_id'].unique()).all()
329 |     assert shape and users
330 | 
331 | def test__similar_users__bad():
332 |     """ensure the similar_users() method raises an exception for an user not in training data"""
333 | 
334 |     with pytest.raises(AssertionError):
335 |         model = RankFM(factors=2)
336 |         model.fit(intx_train_pd_int)
337 |         similar = model.similar_users(9, n_users=1)
338 | 
339 | 


--------------------------------------------------------------------------------
/rankfm/_rankfm.pyx:
--------------------------------------------------------------------------------
  1 | #!python
  2 | #cython: language_level=3, boundscheck=False, wraparound=False, cdivision=True
  3 | 
  4 | # -----------------------
  5 | # [C/Python] dependencies
  6 | # -----------------------
  7 | 
  8 | from libc.stdlib cimport malloc, free
  9 | from libc.math cimport log, exp, pow
 10 | 
 11 | cimport cython
 12 | cimport rankfm.mt19937ar as mt
 13 | 
 14 | import numpy as np
 15 | 
 16 | # --------------------
 17 | # [C] helper functions
 18 | # --------------------
 19 | 
 20 | cdef int lsearch(int item, int *items, int n) nogil:
 21 |     """scan the first [n] entries of [items] front to back: return 1 if [item] is present, otherwise 0"""
 22 |     cdef int pos = 0
 23 |     while pos < n:
 24 |         if items[pos] == item:
 25 |             return 1
 26 |         pos += 1
 27 |     return 0
 28 | 
 29 | 
 30 | cdef int bsearch(int item, int *items, int n) nogil:
 31 |     """binary search over the first [n] entries of an ascending [items] array: return 1 if [item] is found, otherwise 0"""
 32 | 
 33 |     cdef int left = 0
 34 |     cdef int right = n - 1
 35 |     cdef int mid
 36 |     while left <= right:
 37 |         # midpoint expressed as an offset from [left]; both bounds are non-negative here so floor division matches truncation
 38 |         mid = left + (right - left) // 2
 39 |         if items[mid] == item:
 40 |             return 1
 41 |         if items[mid] < item:
 42 |             left = mid + 1
 43 |         else:
 44 |             right = mid - 1
 45 |     return 0
 46 | 
 47 | 
 48 | cdef float compute_ui_utility(
 49 |     int F,
 50 |     int P,
 51 |     int Q,
 52 |     float[::1] x_uf,
 53 |     float[::1] x_if,
 54 |     float w_i,
 55 |     float[::1] w_if,
 56 |     float[::1] v_u,
 57 |     float[::1] v_i,
 58 |     float[:, ::1] v_uf,
 59 |     float[:, ::1] v_if,
 60 |     int x_uf_any,
 61 |     int x_if_any
 62 | ) nogil:
 63 |     """pointwise utility of one (user, item) pair: item weight + factor dot product + optional feature terms"""
 64 | 
 65 |     cdef int k, a, b
 66 |     cdef float score = w_i
 67 | 
 68 |     # user * item: np.dot(v_u[u], v_i[i])
 69 |     for k in range(F):
 70 |         score += v_u[k] * v_i[k]
 71 | 
 72 |     # user-features * item, visiting only non-zero features: np.dot(x_uf[u], np.dot(v_uf, v_i[i]))
 73 |     if x_uf_any:
 74 |         for a in range(P):
 75 |             if x_uf[a] != 0.0:
 76 |                 for k in range(F):
 77 |                     score += x_uf[a] * (v_uf[a, k] * v_i[k])
 78 | 
 79 |     # item-feature terms, again visiting only non-zero features
 80 |     if x_if_any:
 81 |         for b in range(Q):
 82 |             if x_if[b] != 0.0:
 83 |                 # linear item-feature term: np.dot(x_if[i], w_if)
 84 |                 score += x_if[b] * w_if[b]
 85 |                 # item-features * user: np.dot(x_if[i], np.dot(v_if, v_u[u]))
 86 |                 for k in range(F):
 87 |                     score += x_if[b] * (v_if[b, k] * v_u[k])
 88 | 
 89 |     return score
 90 | 
 91 | # -------------------------
 92 | # [Python] helper functions
 93 | # -------------------------
 94 | 
 95 | def assert_finite(w_i, w_if, v_u, v_i, v_uf, v_if):
 96 |     """check, array by array, that the sum of each set of model weights is finite; the first failure raises AssertionError"""
 97 | 
 98 |     hint = " are not finite - try decreasing feature/sample_weight magnitudes"
 99 |     labeled = (("item weights [w_i]", w_i), ("item feature weights [w_if]", w_if), ("user factors [v_u]", v_u),
100 |                ("item factors [v_i]", v_i), ("user-feature factors [v_uf]", v_uf), ("item-feature factors [v_if]", v_if))
101 |     for label, weights in labeled:
102 |         total = np.sum(weights)
103 |         assert np.isfinite(total), label + hint
104 | 
105 | 
106 | def reg_penalty(alpha, beta, w_i, w_if, v_u, v_i, v_uf, v_if):
107 |     """sum the squared-weight penalties: [alpha] scales w_i/v_u/v_i and [beta] scales w_if/v_uf/v_if"""
108 | 
109 |     # (strength, weights) pairs listed in the order their penalties are accumulated
110 |     terms = (
111 |         (alpha, w_i), (alpha, v_u), (alpha, v_i),
112 |         (beta, w_if), (beta, v_uf), (beta, v_if),
113 |     )
114 |     total = 0.0
115 |     for strength, weights in terms: total += np.sum(strength * np.square(weights))
116 |     return total
117 | 
118 | # --------------------------------
119 | # [RankFM] core modeling functions
120 | # --------------------------------
121 | 
122 | def _fit(
123 |     int[:, ::1] interactions,
124 |     float[::1] sample_weight,
125 |     dict user_items,
126 |     float[:, ::1] x_uf,
127 |     float[:, ::1] x_if,
128 |     float[::1] w_i,
129 |     float[::1] w_if,
130 |     float[:, ::1] v_u,
131 |     float[:, ::1] v_i,
132 |     float[:, ::1] v_uf,
133 |     float[:, ::1] v_if,
134 |     float alpha,
135 |     float beta,
136 |     float learning_rate,
137 |     str learning_schedule,
138 |     float learning_exponent,
139 |     int max_samples,
140 |     int epochs,
141 |     bint verbose
142 | ):
143 |     """update the weight arrays in place with gradient steps over (user, observed item, WARP-sampled unobserved item) triples for [epochs] passes"""
144 |     #############################
145 |     ### VARIABLE DECLARATIONS ###
146 |     #############################
147 | 
148 |     # constants
149 |     cdef float MARGIN = 1.0
150 | 
151 |     # matrix shapes/indicators
152 |     cdef int N, U, I, F, P, Q
153 |     cdef int x_uf_any, x_if_any
154 | 
155 |     # loop iterators/indices
156 |     cdef int r, u, i, j, f, p, q
157 |     cdef int epoch, row, sampled
158 | 
159 |     # epoch-specific learning rate and log-likelihood
160 |     cdef float eta, log_likelihood
161 | 
162 |     # sample weights and (ui, uj) utility scores
163 |     cdef float sw, ut_ui, ut_uj
164 | 
165 |     # WARP sampling variables
166 |     cdef int min_index
167 |     cdef float pairwise_utility, min_pairwise_utility, multiplier
168 | 
169 |     # loss function derivatives wrt model weights
170 |     cdef float d_outer
171 |     cdef float d_reg_a = 2.0 * alpha
172 |     cdef float d_reg_b = 2.0 * beta
173 |     cdef float d_w_i = 1.0
174 |     cdef float d_w_j = -1.0
175 |     cdef float d_w_if, d_v_i, d_v_j, d_v_u, d_v_uf, d_v_if
176 | 
177 |     #######################################
178 |     ### PYTHON SET-UP PRIOR TO TRAINING ###
179 |     #######################################
180 | 
181 |     # initialize MT random state with a fixed seed, so the negative-item draws repeat across calls (the numpy shuffle below is not seeded here)
182 |     mt.init_genrand(1492)
183 | 
184 |     # calculate matrix shapes
185 |     N = interactions.shape[0]
186 |     U = v_u.shape[0]
187 |     I = v_i.shape[0]
188 |     P = v_uf.shape[0]
189 |     Q = v_if.shape[0]
190 |     F = v_u.shape[1]
191 | 
192 |     # determine whether any user-features/item-features were supplied (an all-zero matrix counts as not supplied)
193 |     x_uf_any = int(np.asarray(x_uf).any())
194 |     x_if_any = int(np.asarray(x_if).any())
195 | 
196 |     # create a shuffle index to diversify each training epoch and register as a memoryview to use in NOGIL
197 |     shuffle_index = np.arange(N, dtype=np.int32)
198 |     cdef int[:] shuffle_index_mv = shuffle_index
199 | 
200 |     # count the total number of items for each user
201 |     items_user = {user: len(items) for user, items in user_items.items()}
202 | 
203 |     # create c-arrays: number of items and sorted array of items for each user - NOTE(review): only freed at the end of the function, so an exception raised during training skips the free() calls
204 |     cdef int *c_items_user = <int*>malloc(U * sizeof(int))
205 |     cdef int **c_user_items = <int**>malloc(U * sizeof(int*))
206 | 
207 |     # fill the c-arrays from the P-arrays to use later in NOGIL blocks (requires [user_items] to hold an indexable entry for every user index 0..U-1)
208 |     for u in range(U):
209 |         c_items_user[u] = items_user[u]
210 |         c_user_items[u] = <int*>malloc(c_items_user[u] * sizeof(int))
211 |         for i in range(c_items_user[u]):
212 |             c_user_items[u][i] = user_items[u][i]
213 | 
214 |     ################################
215 |     ### MAIN TRAINING EPOCH LOOP ###
216 |     ################################
217 | 
218 |     for epoch in range(epochs):
219 |         # pick this epoch's learning rate: constant, or decayed as learning_rate / (epoch + 1) ** learning_exponent
220 |         if learning_schedule == 'constant':
221 |             eta = learning_rate
222 |         elif learning_schedule == 'invscaling':
223 |             eta = learning_rate / pow(epoch + 1, learning_exponent)
224 |         else:
225 |             raise ValueError('unknown [learning_schedule]')
226 | 
227 |         np.random.shuffle(shuffle_index)
228 |         log_likelihood = 0.0
229 | 
230 |         for r in range(N):
231 | 
232 |             # locate the observed (user, item, sample-weight)
233 |             row = shuffle_index_mv[r]
234 |             u = interactions[row, 0]
235 |             i = interactions[row, 1]
236 |             sw = sample_weight[row]
237 | 
238 |             # compute the utility score of the observed (u, i) pair
239 |             ut_ui = compute_ui_utility(F, P, Q, x_uf[u], x_if[i], w_i[i], w_if, v_u[u], v_i[i], v_uf, v_if, x_uf_any, x_if_any)
240 | 
241 |             # WARP sampling loop for the (u, i) pair
242 |             # --------------------------------------
243 |             # track the sampled negative with the lowest pairwise utility seen so far
244 |             min_index = -1
245 |             min_pairwise_utility = 1e6
246 | 
247 |             for sampled in range(1, max_samples + 1):
248 | 
249 |                 # randomly sample an unobserved item (j) for the user - NOTE(review): this loop never exits if the user has observed every item
250 |                 while True:
251 |                     j = mt.genrand_int32() % I
252 |                     if not lsearch(j, c_user_items[u], c_items_user[u]):
253 |                         break
254 | 
255 |                 # compute the utility score of the unobserved (u, j) pair and the subsequent pairwise utility
256 |                 ut_uj = compute_ui_utility(F, P, Q, x_uf[u], x_if[j], w_i[j], w_if, v_u[u], v_i[j], v_uf, v_if, x_uf_any, x_if_any)
257 |                 pairwise_utility = ut_ui - ut_uj
258 | 
259 |                 if pairwise_utility < min_pairwise_utility:
260 |                     min_index = j
261 |                     min_pairwise_utility = pairwise_utility
262 |                 # stop sampling at the first negative whose pairwise utility falls below the margin
263 |                 if pairwise_utility < MARGIN:
264 |                     break
265 | 
266 |             # set the final sampled negative item index and calculate the WARP multiplier from the number of draws taken - NOTE(review): [min_index] stays -1 if max_samples < 1 or no draw scores below 1e6
267 |             j = min_index
268 |             pairwise_utility = min_pairwise_utility
269 |             multiplier = log((I - 1) / sampled) / log(I)
270 |             log_likelihood += log(1 / (1 + exp(-pairwise_utility)))
271 | 
272 |             # gradient step model weight updates
273 |             # ----------------------------------
274 | 
275 |             # calculate the outer derivative [d_LL / d_g(pu)] = 1 / (1 + exp(pu)), the derivative of log(sigmoid(pu))
276 |             d_outer = 1.0 / (exp(pairwise_utility) + 1.0)
277 | 
278 |             # update the [item] weights
279 |             w_i[i] += eta * (sw * multiplier * (d_outer * d_w_i) - (d_reg_a * w_i[i]))
280 |             w_i[j] += eta * (sw * multiplier * (d_outer * d_w_j) - (d_reg_a * w_i[j]))
281 | 
282 |             # update the [item-feature] weights
283 |             if x_if_any:
284 |                 for q in range(Q):
285 |                     d_w_if = x_if[i, q] - x_if[j, q]
286 |                     w_if[q] += eta * (sw * multiplier * (d_outer * d_w_if) - (d_reg_b * w_if[q]))
287 | 
288 |             # update all [factor] weights
289 |             for f in range(F):
290 | 
291 |                 # [user-factor] and [item-factor] derivatives wrt [user-factors] and [item-factors]
292 |                 d_v_u = v_i[i, f] - v_i[j, f]
293 |                 d_v_i = v_u[u, f]
294 |                 d_v_j = -v_u[u, f]
295 | 
296 |                 # add [user-features] to [item-factor] derivatives if supplied
297 |                 if x_uf_any:
298 |                     for p in range(P):
299 |                         d_v_i += v_uf[p, f] * x_uf[u, p]
300 |                         d_v_j -= v_uf[p, f] * x_uf[u, p]
301 | 
302 |                 # add [item-features] in [user-factor] derivatives if supplied
303 |                 if x_if_any:
304 |                     for q in range(Q):
305 |                         d_v_u += v_if[q, f] * (x_if[i, q] - x_if[j, q])
306 | 
307 |                 # update the [user-factor] and [item-factor] weights with the final gradient values
308 |                 v_u[u, f] += eta * (sw * multiplier * (d_outer * d_v_u) - (d_reg_a * v_u[u, f]))
309 |                 v_i[i, f] += eta * (sw * multiplier * (d_outer * d_v_i) - (d_reg_a * v_i[i, f]))
310 |                 v_i[j, f] += eta * (sw * multiplier * (d_outer * d_v_j) - (d_reg_a * v_i[j, f]))
311 | 
312 |                 # update the [user-feature-factor] weights if user features were supplied (uses the item factors already updated just above)
313 |                 if x_uf_any:
314 |                     for p in range(P):
315 |                         if x_uf[u, p] == 0.0:
316 |                             continue
317 |                         d_v_uf = x_uf[u, p] * (v_i[i, f] - v_i[j, f])
318 |                         v_uf[p, f] += eta * (sw * multiplier * (d_outer * d_v_uf) - (d_reg_b * v_uf[p, f]))
319 | 
320 |                 # update the [item-feature-factor] weights if item features were supplied (uses the user factor already updated just above)
321 |                 if x_if_any:
322 |                     for q in range(Q):
323 |                         if x_if[i, q] - x_if[j, q] == 0.0:
324 |                             continue
325 |                         d_v_if = (x_if[i, q] - x_if[j, q]) * v_u[u, f]
326 |                         v_if[q, f] += eta * (sw * multiplier * (d_outer * d_v_if) - (d_reg_b * v_if[q, f]))
327 | 
328 |         # [end epoch]: assert all model weights are finite
329 |         assert_finite(w_i, w_if, v_u, v_i, v_uf, v_if)
330 | 
331 |         # report the penalized log-likelihood for this training epoch (the printed epoch number is zero-based)
332 |         if verbose:
333 |             penalty = reg_penalty(alpha, beta, w_i, w_if, v_u, v_i, v_uf, v_if)
334 |             log_likelihood = round(log_likelihood - penalty, 2)
335 |             print("\ntraining epoch:", epoch)
336 |             print("log likelihood:", log_likelihood)
337 | 
338 |     # [end training]: free memory of c-arrays before exiting function
339 |     for u in range(U):
340 |         free(c_user_items[u])
341 |     free(c_items_user)
342 |     free(c_user_items)
343 | 
344 | 
345 | def _predict(
346 |     float[:, ::1] pairs,
347 |     float[:, ::1] x_uf,
348 |     float[:, ::1] x_if,
349 |     float[::1] w_i,
350 |     float[::1] w_if,
351 |     float[:, ::1] v_u,
352 |     float[:, ::1] v_i,
353 |     float[:, ::1] v_uf,
354 |     float[:, ::1] v_if,
355 | ):
356 |     """score each (user index, item index) row of [pairs], writing NaN wherever either index is NaN"""
357 | 
358 |     # typed locals: matrix dimensions
359 |     cdef int n_pairs = pairs.shape[0]
360 |     cdef int n_uf = v_uf.shape[0]
361 |     cdef int n_if = v_if.shape[0]
362 |     cdef int n_factors = v_u.shape[1]
363 | 
364 |     # typed locals: feature flags, loop index, and the decoded user/item positions
365 |     cdef int has_uf, has_if
366 |     cdef int k, user, item
367 |     cdef float user_raw, item_raw
368 | 
369 |     # a feature matrix containing only zeros is treated as "no features supplied"
370 |     has_uf = int(np.asarray(x_uf).any())
371 |     has_if = int(np.asarray(x_if).any())
372 | 
373 |     # one float32 score per input row
374 |     result = np.empty(n_pairs, dtype=np.float32)
375 | 
376 |     for k in range(n_pairs):
377 |         # raw (float-encoded) user/item positions for this row
378 |         user_raw = pairs[k, 0]
379 |         item_raw = pairs[k, 1]
380 | 
381 |         if not (np.isnan(user_raw) or np.isnan(item_raw)):
382 |             # both positions are known: evaluate the pointwise utility score for the pair
383 |             user = int(user_raw)
384 |             item = int(item_raw)
385 |             result[k] = compute_ui_utility(n_factors, n_uf, n_if, x_uf[user], x_if[item], w_i[item], w_if, v_u[user], v_i[item], v_uf, v_if, has_uf, has_if)
386 |         else:
387 |             # the user or item was not found: emit a missing score
388 |             result[k] = np.nan
389 | 
390 |     return result
391 | 
392 | 
def _recommend(
    float[::1] users,
    dict user_items,
    int n_items,
    bint filter_previous,
    float[:, ::1] x_uf,
    float[:, ::1] x_if,
    float[::1] w_i,
    float[::1] w_if,
    float[:, ::1] v_u,
    float[:, ::1] v_i,
    float[:, ::1] v_uf,
    float[:, ::1] v_if
):
    """select the top-scoring item indexes for each requested user

    :param users: float vector of user indexes where NaN marks a user that was not found
    :param user_items: dict mapping each user index to the collection of item indexes observed for that user
    :param n_items: number of recommended items to generate for each user
    :param filter_previous: skip items found in [user_items] for the user being scored
    :param x_uf: user features matrix, indexed by user index
    :param x_if: item features matrix, indexed by item index
    :param w_i: item scalar weights
    :param w_if: item-feature scalar weights
    :param v_u: user latent factors
    :param v_i: item latent factors
    :param v_uf: user-feature latent factors
    :param v_if: item-feature latent factors
    :return: [U x n_items] float32 np.ndarray of item indexes ordered from highest to lowest score;
             rows for unknown users are all NaN and any slot that could not be filled is NaN
    """

    # declare variables
    cdef int U, I, P, Q, F
    cdef int x_uf_any, x_if_any
    cdef int row, u, i, s, item
    cdef float u_flt

    # calculate matrix shapes
    U = users.shape[0]
    I = w_i.shape[0]
    P = v_uf.shape[0]
    Q = v_if.shape[0]
    F = v_u.shape[1]

    # determine whether any user-features/item-features were supplied
    x_uf_any = int(np.asarray(x_uf).any())
    x_if_any = int(np.asarray(x_if).any())

    # initialize the [UxR] recommendations matrix with NaN rather than uninitialized memory
    # NOTE: unknown users and slots left unfilled (n_items > I, or too few unseen items) stay NaN
    rec_items = np.full((U, n_items), np.nan, dtype=np.float32)

    # initialize a temporary buffer to store all item scores for a given user
    item_scores = np.empty(I, dtype=np.float32)
    cdef float[::1] item_scores_mv = item_scores

    for row in range(U):
        u_flt = users[row]

        # NaN is the only float value that compares unequal to itself
        if u_flt != u_flt:
            # the rec item vector for a user not found is already all NaN
            continue

        # calculate the scores for all items wrt the current user
        u = <int>u_flt
        for i in range(I):
            item_scores_mv[i] = compute_ui_utility(F, P, Q, x_uf[u], x_if[i], w_i[i], w_if, v_u[u], v_i[i], v_uf, v_if, x_uf_any, x_if_any)

        # get a ranked list of item index positions for the user (best first)
        ranked_items = np.argsort(item_scores)[::-1].tolist()

        # build the lookup of previously observed items once per user
        # NOTE: a set gives O(1) membership tests instead of a linear array scan per ranked item
        observed = set(np.asarray(user_items[u]).tolist()) if filter_previous else None

        # select the topN items for each user, optionally skipping previously observed items
        s = 0
        for i in range(I):
            # test before writing so that n_items == 0 never indexes out of bounds
            if s >= n_items:
                break
            item = ranked_items[i]
            if filter_previous and item in observed:
                continue
            rec_items[row, s] = item
            s += 1

    return rec_items
461 | 
462 | 


--------------------------------------------------------------------------------
/rankfm/rankfm.py:
--------------------------------------------------------------------------------
  1 | """
  2 | rankfm main modeling class
  3 | """
  4 | 
  5 | import numpy as np
  6 | import pandas as pd
  7 | 
  8 | from rankfm._rankfm import _fit, _predict, _recommend
  9 | from rankfm.utils import get_data
 10 | 
 11 | class RankFM():
 12 |     """Factorization Machines for Ranking Problems with Implicit Feedback Data"""
 13 | 
 14 |     def __init__(self, factors=10, loss='bpr', max_samples=10, alpha=0.01, beta=0.1, sigma=0.1, learning_rate=0.1, learning_schedule='constant', learning_exponent=0.25):
 15 |         """store hyperparameters and initialize internal model state
 16 | 
 17 |         :param factors: latent factor rank
 18 |         :param loss: optimization/loss function to use for training: ['bpr', 'warp']
 19 |         :param max_samples: maximum number of negative samples to draw for WARP loss
 20 |         :param alpha: L2 regularization penalty on [user, item] model weights
 21 |         :param beta: L2 regularization penalty on [user-feature, item-feature] model weights
 22 |         :param sigma: standard deviation to use for random initialization of factor weights
 23 |         :param learning_rate: initial learning rate for gradient step updates
 24 |         :param learning_schedule: schedule for adjusting learning rates by training epoch: ['constant', 'invscaling']
 25 |         :param learning_exponent: exponent applied to epoch number to adjust learning rate: scaling = 1 / pow(epoch + 1, learning_exponent)
 26 |         :return: None
 27 |         """
 28 | 
 29 |         # validate user input
 30 |         assert isinstance(factors, int) and factors >= 1, "[factors] must be a positive integer"
 31 |         assert isinstance(loss, str) and loss in ('bpr', 'warp'), "[loss] must be in ('bpr', 'warp')"
 32 |         assert isinstance(max_samples, int) and max_samples > 0, "[max_samples] must be a positive integer"
 33 |         assert isinstance(alpha, float) and alpha > 0.0, "[alpha] must be a positive float"
 34 |         assert isinstance(beta, float) and beta > 0.0, "[beta] must be a positive float"
 35 |         assert isinstance(sigma, float) and sigma > 0.0, "[sigma] must be a positive float"
 36 |         assert isinstance(learning_rate, float) and learning_rate > 0.0, "[learning_rate] must be a positive float"
 37 |         assert isinstance(learning_schedule, str) and learning_schedule in ('constant', 'invscaling'), "[learning_schedule] must be in ('constant', 'invscaling')"
 38 |         assert isinstance(learning_exponent, float) and learning_exponent > 0.0, "[learning_exponent] must be a positive float"
 39 | 
 40 |         # store model hyperparameters
 41 |         self.factors = factors
 42 |         self.loss = loss
 43 |         self.max_samples = max_samples
 44 |         self.alpha = alpha
 45 |         self.beta = beta
 46 |         self.sigma = sigma
 47 |         self.learning_rate = learning_rate
 48 |         self.learning_schedule = learning_schedule
 49 |         self.learning_exponent = learning_exponent
 50 | 
 51 |         # set/clear initial model state
 52 |         self._reset_state()
 53 | 
 54 | 
 55 |     # --------------------------------
 56 |     # begin private method definitions
 57 |     # --------------------------------
 58 | 
 59 | 
 60 |     def _reset_state(self):
 61 |         """initialize or reset internal model state"""
 62 | 
 63 |         # [ID, IDX] arrays
 64 |         self.user_id = None
 65 |         self.item_id = None
 66 |         self.user_idx = None
 67 |         self.item_idx = None
 68 | 
 69 |         # [ID <-> IDX] mappings
 70 |         self.index_to_user = None
 71 |         self.index_to_item = None
 72 |         self.user_to_index = None
 73 |         self.item_to_index = None
 74 | 
 75 |         # user/item interactions and importance weights
 76 |         self.interactions = None
 77 |         self.sample_weight = None
 78 | 
 79 |         # set of observed items for each user
 80 |         self.user_items = None
 81 | 
 82 |         # [user, item] features
 83 |         self.x_uf = None
 84 |         self.x_if = None
 85 | 
 86 |         # [item, item-feature] scalar weights
 87 |         self.w_i = None
 88 |         self.w_if = None
 89 | 
 90 |         # [user, item, user-feature, item-feature] latent factors
 91 |         self.v_u = None
 92 |         self.v_i = None
 93 |         self.v_uf = None
 94 |         self.v_if = None
 95 | 
 96 |         # internal model state indicator
 97 |         self.is_fit = False
 98 | 
 99 | 
100 |     def _init_all(self, interactions, user_features=None, item_features=None, sample_weight=None):
101 |         """index the interaction data and user/item features and initialize model weights
102 | 
103 |         :param interactions: dataframe of observed user/item interactions: [user_id, item_id]
104 |         :param user_features: dataframe of user metadata features: [user_id, uf_1, ..., uf_n]
105 |         :param item_features: dataframe of item metadata features: [item_id, if_1, ..., if_n]
106 |         :param sample_weight: vector of importance weights for each observed interaction
107 |         :return: None
108 |         """
109 | 
110 |         assert isinstance(interactions, (np.ndarray, pd.DataFrame)), "[interactions] must be np.ndarray or pd.dataframe"
111 |         assert interactions.shape[1] == 2, "[interactions] should be: [user_id, item_id]"
112 | 
113 |         # save unique arrays of users/items in terms of original identifiers
114 |         interactions_df = pd.DataFrame(get_data(interactions), columns=['user_id', 'item_id'])
115 |         self.user_id = pd.Series(np.sort(np.unique(interactions_df['user_id'])))
116 |         self.item_id = pd.Series(np.sort(np.unique(interactions_df['item_id'])))
117 | 
118 |         # create zero-based index to identifier mappings
119 |         self.index_to_user = self.user_id
120 |         self.index_to_item = self.item_id
121 | 
122 |         # create reverse mappings from identifiers to zero-based index positions
123 |         self.user_to_index = pd.Series(data=self.index_to_user.index, index=self.index_to_user.values)
124 |         self.item_to_index = pd.Series(data=self.index_to_item.index, index=self.index_to_item.values)
125 | 
126 |         # store unique values of user/item indexes and observed interactions for each user
127 |         self.user_idx = np.arange(len(self.user_id), dtype=np.int32)
128 |         self.item_idx = np.arange(len(self.item_id), dtype=np.int32)
129 | 
130 |         # map the interactions to internal index positions
131 |         self._init_interactions(interactions, sample_weight)
132 | 
133 |         # map the user/item features to internal index positions
134 |         self._init_features(user_features, item_features)
135 | 
136 |         # initialize the model weights after the user/item/feature dimensions have been established
137 |         self._init_weights(user_features, item_features)
138 | 
139 | 
140 |     def _init_interactions(self, interactions, sample_weight):
141 |         """map new interaction data to existing internal user/item indexes
142 | 
143 |         :param interactions: dataframe of observed user/item interactions: [user_id, item_id]
144 |         :param sample_weight: vector of importance weights for each observed interaction
145 |         :return: None
146 |         """
147 | 
148 |         assert isinstance(interactions, (np.ndarray, pd.DataFrame)), "[interactions] must be np.ndarray or pd.dataframe"
149 |         assert interactions.shape[1] == 2, "[interactions] should be: [user_id, item_id]"
150 | 
151 |         # map the raw user/item identifiers to internal zero-based index positions
152 |         # NOTE: any user/item pairs not found in the existing indexes will be dropped
153 |         self.interactions = pd.DataFrame(get_data(interactions).copy(), columns=['user_id', 'item_id'])
154 |         self.interactions['user_id'] = self.interactions['user_id'].map(self.user_to_index).astype(np.int32)
155 |         self.interactions['item_id'] = self.interactions['item_id'].map(self.item_to_index).astype(np.int32)
156 |         self.interactions = self.interactions.rename({'user_id': 'user_idx', 'item_id': 'item_idx'}, axis=1).dropna()
157 | 
158 |         # store the sample weights internally or generate a vector of ones if not given
159 |         if sample_weight is not None:
160 |             assert isinstance(sample_weight, (np.ndarray, pd.Series)), "[sample_weight] must be np.ndarray or pd.series"
161 |             assert sample_weight.ndim == 1, "[sample_weight] must a vector (ndim=1)"
162 |             assert len(sample_weight) == len(interactions), "[sample_weight] must have the same length as [interactions]"
163 |             self.sample_weight = np.ascontiguousarray(get_data(sample_weight), dtype=np.float32)
164 |         else:
165 |             self.sample_weight = np.ones(len(self.interactions), dtype=np.float32)
166 | 
167 |         # create a dictionary containing the set of observed items for each user
168 |         # NOTE: if the model has been previously fit extend rather than replace the itemset for each user
169 | 
170 |         if self.is_fit:
171 |             new_user_items = self.interactions.groupby('user_idx')['item_idx'].apply(set).to_dict()
172 |             self.user_items = {user: np.sort(np.array(list(set(self.user_items[user]) | set(new_user_items[user])), dtype=np.int32)) for user in self.user_items.keys()}
173 |         else:
174 |             self.user_items = self.interactions.sort_values(['user_idx', 'item_idx']).groupby('user_idx')['item_idx'].apply(np.array, dtype=np.int32).to_dict()
175 | 
176 |         # format the interactions data as a c-contiguous integer array for cython use
177 |         self.interactions = np.ascontiguousarray(self.interactions, dtype=np.int32)
178 | 
179 | 
180 | 
181 |     def _init_features(self, user_features=None, item_features=None):
182 |         """initialize the user/item features given existing internal user/item indexes
183 | 
184 |         :param user_features: dataframe of user metadata features: [user_id, uf_1, ... , uf_n]
185 |         :param item_features: dataframe of item metadata features: [item_id, if_1, ... , if_n]
186 |         :return: None
187 |         """
188 | 
189 |         # store the user features as a ndarray [UxP] row-ordered by user index position
190 |         if user_features is not None:
191 |             x_uf = pd.DataFrame(user_features.copy())
192 |             x_uf = x_uf.set_index(x_uf.columns[0])
193 |             x_uf.index = x_uf.index.map(self.user_to_index)
194 |             if np.array_equal(sorted(x_uf.index.values), self.user_idx):
195 |                 self.x_uf = np.ascontiguousarray(x_uf.sort_index(), dtype=np.float32)
196 |             else:
197 |                 raise KeyError('the users in [user_features] do not match the users in [interactions]')
198 |         else:
199 |             self.x_uf = np.zeros([len(self.user_idx), 1], dtype=np.float32)
200 | 
201 |         # store the item features as a ndarray [IxQ] row-ordered by item index position
202 |         if item_features is not None:
203 |             x_if = pd.DataFrame(item_features.copy())
204 |             x_if = x_if.set_index(x_if.columns[0])
205 |             x_if.index = x_if.index.map(self.item_to_index)
206 |             if np.array_equal(sorted(x_if.index.values), self.item_idx):
207 |                 self.x_if = np.ascontiguousarray(x_if.sort_index(), dtype=np.float32)
208 |             else:
209 |                 raise KeyError('the items in [item_features] do not match the items in [interactions]')
210 |         else:
211 |             self.x_if = np.zeros([len(self.item_idx), 1], dtype=np.float32)
212 | 
213 | 
214 |     def _init_weights(self, user_features=None, item_features=None):
215 |         """initialize model weights given user/item and user-feature/item-feature indexes/shapes
216 | 
217 |         :param user_features: dataframe of user metadata features: [user_id, uf_1, ... , uf_n]
218 |         :param item_features: dataframe of item metadata features: [item_id, if_1, ... , if_n]
219 |         :return: None
220 |         """
221 | 
222 |         # initialize scalar weights as ndarrays of zeros
223 |         self.w_i = np.zeros(len(self.item_idx)).astype(np.float32)
224 |         self.w_if = np.zeros(self.x_if.shape[1]).astype(np.float32)
225 | 
226 |         # initialize latent factors by drawing random samples from a normal distribution
227 |         self.v_u = np.random.normal(loc=0, scale=self.sigma, size=(len(self.user_idx), self.factors)).astype(np.float32)
228 |         self.v_i = np.random.normal(loc=0, scale=self.sigma, size=(len(self.item_idx), self.factors)).astype(np.float32)
229 | 
230 |         # randomly initialize user feature factors if user features were supplied
231 |         # NOTE: set all user feature factor weights to zero to prevent random scoring influence otherwise
232 |         if user_features is not None:
233 |             scale = (self.alpha / self.beta) * self.sigma
234 |             self.v_uf = np.random.normal(loc=0, scale=scale, size=[self.x_uf.shape[1], self.factors]).astype(np.float32)
235 |         else:
236 |             self.v_uf = np.zeros([self.x_uf.shape[1], self.factors], dtype=np.float32)
237 | 
238 |         # randomly initialize item feature factors if item features were supplied
239 |         # NOTE: set all item feature factor weights to zero to prevent random scoring influence otherwise
240 |         if item_features is not None:
241 |             scale = (self.alpha / self.beta) * self.sigma
242 |             self.v_if = np.random.normal(loc=0, scale=scale, size=[self.x_if.shape[1], self.factors]).astype(np.float32)
243 |         else:
244 |             self.v_if = np.zeros([self.x_if.shape[1], self.factors], dtype=np.float32)
245 | 
246 | 
247 |     # -------------------------------
248 |     # begin public method definitions
249 |     # -------------------------------
250 | 
251 | 
252 |     def fit(self, interactions, user_features=None, item_features=None, sample_weight=None, epochs=1, verbose=False):
253 |         """clear previous model state and learn new model weights using the input data
254 | 
255 |         :param interactions: dataframe of observed user/item interactions: [user_id, item_id]
256 |         :param user_features: dataframe of user metadata features: [user_id, uf_1, ... , uf_n]
257 |         :param item_features: dataframe of item metadata features: [item_id, if_1, ... , if_n]
258 |         :param sample_weight: vector of importance weights for each observed interaction
259 |         :param epochs: number of training epochs (full passes through observed interactions)
260 |         :param verbose: whether to print epoch number and log-likelihood during training
261 |         :return: self
262 |         """
263 | 
264 |         self._reset_state()
265 |         self.fit_partial(interactions, user_features, item_features, sample_weight, epochs, verbose)
266 |         return self
267 | 
268 | 
269 |     def fit_partial(self, interactions, user_features=None, item_features=None, sample_weight=None, epochs=1, verbose=False):
270 |         """learn or update model weights using the input data and resuming from the current model state
271 | 
272 |         :param interactions: dataframe of observed user/item interactions: [user_id, item_id]
273 |         :param user_features: dataframe of user metadata features: [user_id, uf_1, ... , uf_n]
274 |         :param item_features: dataframe of item metadata features: [item_id, if_1, ... , if_n]
275 |         :param sample_weight: vector of importance weights for each observed interaction
276 |         :param epochs: number of training epochs (full passes through observed interactions)
277 |         :param verbose: whether to print epoch number and log-likelihood during training
278 |         :return: self
279 |         """
280 | 
281 |         assert isinstance(epochs, int) and epochs >= 1, "[epochs] must be a positive integer"
282 |         assert isinstance(verbose, bool), "[verbose] must be a boolean value"
283 | 
284 |         if self.is_fit:
285 |             self._init_interactions(interactions, sample_weight)
286 |             self._init_features(user_features, item_features)
287 |         else:
288 |             self._init_all(interactions, user_features, item_features, sample_weight)
289 | 
290 |         # determine the number of negative samples to draw depending on the loss function
291 |         # NOTE: if [loss == 'bpr'] -> [max_samples == 1] and [multiplier ~= 1] for all updates
292 |         # NOTE: the [multiplier] is scaled by total number of items so it's always [0, 1]
293 | 
294 |         if self.loss == 'bpr':
295 |             max_samples = 1
296 |         elif self.loss == 'warp':
297 |             max_samples = self.max_samples
298 |         else:
299 |             raise ValueError('[loss] function not recognized')
300 | 
301 |         # NOTE: the cython private _fit() method updates the model weights in-place via typed memoryviews
302 |         # NOTE: therefore there's nothing returned explicitly by either method
303 | 
304 |         _fit(
305 |             self.interactions,
306 |             self.sample_weight,
307 |             self.user_items,
308 |             self.x_uf,
309 |             self.x_if,
310 |             self.w_i,
311 |             self.w_if,
312 |             self.v_u,
313 |             self.v_i,
314 |             self.v_uf,
315 |             self.v_if,
316 |             self.alpha,
317 |             self.beta,
318 |             self.learning_rate,
319 |             self.learning_schedule,
320 |             self.learning_exponent,
321 |             max_samples,
322 |             epochs,
323 |             verbose
324 |         )
325 | 
326 |         self.is_fit = True
327 |         return self
328 | 
329 | 
330 |     def predict(self, pairs, cold_start='nan'):
331 |         """calculate the predicted pointwise utilities for all (user, item) pairs
332 | 
333 |         :param pairs: dataframe of [user, item] pairs to score
334 |         :param cold_start: whether to generate missing values ('nan') or drop ('drop') user/item pairs not found in training data
335 |         :return: np.array of real-valued model scores
336 |         """
337 | 
338 |         assert isinstance(pairs, (np.ndarray, pd.DataFrame)), "[pairs] must be np.ndarray or pd.dataframe"
339 |         assert pairs.shape[1] == 2, "[pairs] should be: [user_id, item_id]"
340 |         assert self.is_fit, "you must fit the model prior to generating predictions"
341 | 
342 |         pred_pairs = pd.DataFrame(get_data(pairs).copy(), columns=['user_id', 'item_id'])
343 |         pred_pairs['user_id'] = pred_pairs['user_id'].map(self.user_to_index)
344 |         pred_pairs['item_id'] = pred_pairs['item_id'].map(self.item_to_index)
345 |         pred_pairs = np.ascontiguousarray(pred_pairs, dtype=np.float32)
346 | 
347 |         scores = _predict(
348 |             pred_pairs,
349 |             self.x_uf,
350 |             self.x_if,
351 |             self.w_i,
352 |             self.w_if,
353 |             self.v_u,
354 |             self.v_i,
355 |             self.v_uf,
356 |             self.v_if
357 |         )
358 | 
359 |         if cold_start == 'nan':
360 |             return scores
361 |         elif cold_start == 'drop':
362 |             return scores[~np.isnan(scores)]
363 |         else:
364 |             raise ValueError("param [cold_start] must be set to either 'nan' or 'drop'")
365 | 
366 | 
367 |     def recommend(self, users, n_items=10, filter_previous=False, cold_start='nan'):
368 |         """calculate the topN items for each user
369 | 
370 |         :param users: iterable of user identifiers for which to generate recommendations
371 |         :param n_items: number of recommended items to generate for each user
372 |         :param filter_previous: remove observed training items from generated recommendations
373 |         :param cold_start: whether to generate missing values ('nan') or drop ('drop') users not found in training data
374 |         :return: pandas dataframe where the index values are user identifiers and the columns are recommended items
375 |         """
376 | 
377 |         assert getattr(users, '__iter__', False), "[users] must be an iterable (e.g. list, array, series)"
378 |         assert self.is_fit, "you must fit the model prior to generating recommendations"
379 | 
380 |         user_idx = np.ascontiguousarray(pd.Series(users).map(self.user_to_index), dtype=np.float32)
381 |         rec_items = _recommend(
382 |             user_idx,
383 |             self.user_items,
384 |             n_items,
385 |             filter_previous,
386 |             self.x_uf,
387 |             self.x_if,
388 |             self.w_i,
389 |             self.w_if,
390 |             self.v_u,
391 |             self.v_i,
392 |             self.v_uf,
393 |             self.v_if
394 |         )
395 |         rec_items = pd.DataFrame(rec_items, index=users).apply(lambda c: c.map(self.index_to_item))
396 | 
397 |         if cold_start == 'nan':
398 |             return rec_items
399 |         elif cold_start == 'drop':
400 |             return rec_items.dropna(how='any')
401 |         else:
402 |             raise ValueError("param [cold_start] must be set to either 'nan' or 'drop'")
403 | 
404 | 
405 |     def similar_items(self, item_id, n_items=10):
406 |         """find the most similar items wrt latent factor space representation
407 | 
408 |         :param item_id: item to search
409 |         :param n_items: number of similar items to return
410 |         :return: np.array of topN most similar items wrt latent factor representations
411 |         """
412 | 
413 |         assert item_id in self.item_id.values, "you must select an [item_id] present in the training data"
414 |         assert self.is_fit, "you must fit the model prior to generating similarities"
415 | 
416 |         try:
417 |             item_idx = self.item_to_index.loc[item_id]
418 |         except (KeyError, TypeError):
419 |             print("item_id={} not found in training data".format(item_id))
420 | 
421 |         # calculate item latent representations in F dimensional factor space
422 |         lr_item = self.v_i[item_idx] + np.dot(self.v_if.T, self.x_if[item_idx])
423 |         lr_all_items = self.v_i + np.dot(self.x_if, self.v_if)
424 | 
425 |         # calculate the most similar N items excluding the search item
426 |         similarities = pd.Series(np.dot(lr_all_items, lr_item)).drop(item_idx).sort_values(ascending=False)[:n_items]
427 |         most_similar = pd.Series(similarities.index).map(self.index_to_item).values
428 |         return most_similar
429 | 
430 | 
431 |     def similar_users(self, user_id, n_users=10):
432 |         """find the most similar users wrt latent factor space representation
433 | 
434 |         :param user_id: user to search
435 |         :param n_users: number of similar users to return
436 |         :return: np.array of topN most similar users wrt latent factor representations
437 |         """
438 | 
439 |         assert user_id in self.user_id.values, "you must select an [user_id] present in the training data"
440 |         assert self.is_fit, "you must fit the model prior to generating similarities"
441 | 
442 |         try:
443 |             user_idx = self.user_to_index.loc[user_id]
444 |         except (KeyError, TypeError):
445 |             print("user_id={} not found in training data".format(user_id))
446 | 
447 |         # calculate user latent representations in F dimensional factor space
448 |         lr_user = self.v_u[user_idx] + np.dot(self.v_uf.T, self.x_uf[user_idx])
449 |         lr_all_users = self.v_u + np.dot(self.x_uf, self.v_uf)
450 | 
451 |         # calculate the most similar N users excluding the search user
452 |         similarities = pd.Series(np.dot(lr_all_users, lr_user)).drop(user_idx).sort_values(ascending=False)[:n_users]
453 |         most_similar = pd.Series(similarities.index).map(self.index_to_user).values
454 |         return most_similar
455 | 
456 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                     GNU GENERAL PUBLIC LICENSE
  2 |                        Version 3, 29 June 2007
  3 | 
  4 |  Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
  5 |  Everyone is permitted to copy and distribute verbatim copies
  6 |  of this license document, but changing it is not allowed.
  7 | 
  8 |                             Preamble
  9 | 
 10 |   The GNU General Public License is a free, copyleft license for
 11 | software and other kinds of works.
 12 | 
 13 |   The licenses for most software and other practical works are designed
 14 | to take away your freedom to share and change the works.  By contrast,
 15 | the GNU General Public License is intended to guarantee your freedom to
 16 | share and change all versions of a program--to make sure it remains free
 17 | software for all its users.  We, the Free Software Foundation, use the
 18 | GNU General Public License for most of our software; it applies also to
 19 | any other work released this way by its authors.  You can apply it to
 20 | your programs, too.
 21 | 
 22 |   When we speak of free software, we are referring to freedom, not
 23 | price.  Our General Public Licenses are designed to make sure that you
 24 | have the freedom to distribute copies of free software (and charge for
 25 | them if you wish), that you receive source code or can get it if you
 26 | want it, that you can change the software or use pieces of it in new
 27 | free programs, and that you know you can do these things.
 28 | 
 29 |   To protect your rights, we need to prevent others from denying you
 30 | these rights or asking you to surrender the rights.  Therefore, you have
 31 | certain responsibilities if you distribute copies of the software, or if
 32 | you modify it: responsibilities to respect the freedom of others.
 33 | 
 34 |   For example, if you distribute copies of such a program, whether
 35 | gratis or for a fee, you must pass on to the recipients the same
 36 | freedoms that you received.  You must make sure that they, too, receive
 37 | or can get the source code.  And you must show them these terms so they
 38 | know their rights.
 39 | 
 40 |   Developers that use the GNU GPL protect your rights with two steps:
 41 | (1) assert copyright on the software, and (2) offer you this License
 42 | giving you legal permission to copy, distribute and/or modify it.
 43 | 
 44 |   For the developers' and authors' protection, the GPL clearly explains
 45 | that there is no warranty for this free software.  For both users' and
 46 | authors' sake, the GPL requires that modified versions be marked as
 47 | changed, so that their problems will not be attributed erroneously to
 48 | authors of previous versions.
 49 | 
 50 |   Some devices are designed to deny users access to install or run
 51 | modified versions of the software inside them, although the manufacturer
 52 | can do so.  This is fundamentally incompatible with the aim of
 53 | protecting users' freedom to change the software.  The systematic
 54 | pattern of such abuse occurs in the area of products for individuals to
 55 | use, which is precisely where it is most unacceptable.  Therefore, we
 56 | have designed this version of the GPL to prohibit the practice for those
 57 | products.  If such problems arise substantially in other domains, we
 58 | stand ready to extend this provision to those domains in future versions
 59 | of the GPL, as needed to protect the freedom of users.
 60 | 
 61 |   Finally, every program is threatened constantly by software patents.
 62 | States should not allow patents to restrict development and use of
 63 | software on general-purpose computers, but in those that do, we wish to
 64 | avoid the special danger that patents applied to a free program could
 65 | make it effectively proprietary.  To prevent this, the GPL assures that
 66 | patents cannot be used to render the program non-free.
 67 | 
 68 |   The precise terms and conditions for copying, distribution and
 69 | modification follow.
 70 | 
 71 |                        TERMS AND CONDITIONS
 72 | 
 73 |   0. Definitions.
 74 | 
 75 |   "This License" refers to version 3 of the GNU General Public License.
 76 | 
 77 |   "Copyright" also means copyright-like laws that apply to other kinds of
 78 | works, such as semiconductor masks.
 79 | 
 80 |   "The Program" refers to any copyrightable work licensed under this
 81 | License.  Each licensee is addressed as "you".  "Licensees" and
 82 | "recipients" may be individuals or organizations.
 83 | 
 84 |   To "modify" a work means to copy from or adapt all or part of the work
 85 | in a fashion requiring copyright permission, other than the making of an
 86 | exact copy.  The resulting work is called a "modified version" of the
 87 | earlier work or a work "based on" the earlier work.
 88 | 
 89 |   A "covered work" means either the unmodified Program or a work based
 90 | on the Program.
 91 | 
 92 |   To "propagate" a work means to do anything with it that, without
 93 | permission, would make you directly or secondarily liable for
 94 | infringement under applicable copyright law, except executing it on a
 95 | computer or modifying a private copy.  Propagation includes copying,
 96 | distribution (with or without modification), making available to the
 97 | public, and in some countries other activities as well.
 98 | 
 99 |   To "convey" a work means any kind of propagation that enables other
100 | parties to make or receive copies.  Mere interaction with a user through
101 | a computer network, with no transfer of a copy, is not conveying.
102 | 
103 |   An interactive user interface displays "Appropriate Legal Notices"
104 | to the extent that it includes a convenient and prominently visible
105 | feature that (1) displays an appropriate copyright notice, and (2)
106 | tells the user that there is no warranty for the work (except to the
107 | extent that warranties are provided), that licensees may convey the
108 | work under this License, and how to view a copy of this License.  If
109 | the interface presents a list of user commands or options, such as a
110 | menu, a prominent item in the list meets this criterion.
111 | 
112 |   1. Source Code.
113 | 
114 |   The "source code" for a work means the preferred form of the work
115 | for making modifications to it.  "Object code" means any non-source
116 | form of a work.
117 | 
118 |   A "Standard Interface" means an interface that either is an official
119 | standard defined by a recognized standards body, or, in the case of
120 | interfaces specified for a particular programming language, one that
121 | is widely used among developers working in that language.
122 | 
123 |   The "System Libraries" of an executable work include anything, other
124 | than the work as a whole, that (a) is included in the normal form of
125 | packaging a Major Component, but which is not part of that Major
126 | Component, and (b) serves only to enable use of the work with that
127 | Major Component, or to implement a Standard Interface for which an
128 | implementation is available to the public in source code form.  A
129 | "Major Component", in this context, means a major essential component
130 | (kernel, window system, and so on) of the specific operating system
131 | (if any) on which the executable work runs, or a compiler used to
132 | produce the work, or an object code interpreter used to run it.
133 | 
134 |   The "Corresponding Source" for a work in object code form means all
135 | the source code needed to generate, install, and (for an executable
136 | work) run the object code and to modify the work, including scripts to
137 | control those activities.  However, it does not include the work's
138 | System Libraries, or general-purpose tools or generally available free
139 | programs which are used unmodified in performing those activities but
140 | which are not part of the work.  For example, Corresponding Source
141 | includes interface definition files associated with source files for
142 | the work, and the source code for shared libraries and dynamically
143 | linked subprograms that the work is specifically designed to require,
144 | such as by intimate data communication or control flow between those
145 | subprograms and other parts of the work.
146 | 
147 |   The Corresponding Source need not include anything that users
148 | can regenerate automatically from other parts of the Corresponding
149 | Source.
150 | 
151 |   The Corresponding Source for a work in source code form is that
152 | same work.
153 | 
154 |   2. Basic Permissions.
155 | 
156 |   All rights granted under this License are granted for the term of
157 | copyright on the Program, and are irrevocable provided the stated
158 | conditions are met.  This License explicitly affirms your unlimited
159 | permission to run the unmodified Program.  The output from running a
160 | covered work is covered by this License only if the output, given its
161 | content, constitutes a covered work.  This License acknowledges your
162 | rights of fair use or other equivalent, as provided by copyright law.
163 | 
164 |   You may make, run and propagate covered works that you do not
165 | convey, without conditions so long as your license otherwise remains
166 | in force.  You may convey covered works to others for the sole purpose
167 | of having them make modifications exclusively for you, or provide you
168 | with facilities for running those works, provided that you comply with
169 | the terms of this License in conveying all material for which you do
170 | not control copyright.  Those thus making or running the covered works
171 | for you must do so exclusively on your behalf, under your direction
172 | and control, on terms that prohibit them from making any copies of
173 | your copyrighted material outside their relationship with you.
174 | 
175 |   Conveying under any other circumstances is permitted solely under
176 | the conditions stated below.  Sublicensing is not allowed; section 10
177 | makes it unnecessary.
178 | 
179 |   3. Protecting Users' Legal Rights From Anti-Circumvention Law.
180 | 
181 |   No covered work shall be deemed part of an effective technological
182 | measure under any applicable law fulfilling obligations under article
183 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or
184 | similar laws prohibiting or restricting circumvention of such
185 | measures.
186 | 
187 |   When you convey a covered work, you waive any legal power to forbid
188 | circumvention of technological measures to the extent such circumvention
189 | is effected by exercising rights under this License with respect to
190 | the covered work, and you disclaim any intention to limit operation or
191 | modification of the work as a means of enforcing, against the work's
192 | users, your or third parties' legal rights to forbid circumvention of
193 | technological measures.
194 | 
195 |   4. Conveying Verbatim Copies.
196 | 
197 |   You may convey verbatim copies of the Program's source code as you
198 | receive it, in any medium, provided that you conspicuously and
199 | appropriately publish on each copy an appropriate copyright notice;
200 | keep intact all notices stating that this License and any
201 | non-permissive terms added in accord with section 7 apply to the code;
202 | keep intact all notices of the absence of any warranty; and give all
203 | recipients a copy of this License along with the Program.
204 | 
205 |   You may charge any price or no price for each copy that you convey,
206 | and you may offer support or warranty protection for a fee.
207 | 
208 |   5. Conveying Modified Source Versions.
209 | 
210 |   You may convey a work based on the Program, or the modifications to
211 | produce it from the Program, in the form of source code under the
212 | terms of section 4, provided that you also meet all of these conditions:
213 | 
214 |     a) The work must carry prominent notices stating that you modified
215 |     it, and giving a relevant date.
216 | 
217 |     b) The work must carry prominent notices stating that it is
218 |     released under this License and any conditions added under section
219 |     7.  This requirement modifies the requirement in section 4 to
220 |     "keep intact all notices".
221 | 
222 |     c) You must license the entire work, as a whole, under this
223 |     License to anyone who comes into possession of a copy.  This
224 |     License will therefore apply, along with any applicable section 7
225 |     additional terms, to the whole of the work, and all its parts,
226 |     regardless of how they are packaged.  This License gives no
227 |     permission to license the work in any other way, but it does not
228 |     invalidate such permission if you have separately received it.
229 | 
230 |     d) If the work has interactive user interfaces, each must display
231 |     Appropriate Legal Notices; however, if the Program has interactive
232 |     interfaces that do not display Appropriate Legal Notices, your
233 |     work need not make them do so.
234 | 
235 |   A compilation of a covered work with other separate and independent
236 | works, which are not by their nature extensions of the covered work,
237 | and which are not combined with it such as to form a larger program,
238 | in or on a volume of a storage or distribution medium, is called an
239 | "aggregate" if the compilation and its resulting copyright are not
240 | used to limit the access or legal rights of the compilation's users
241 | beyond what the individual works permit.  Inclusion of a covered work
242 | in an aggregate does not cause this License to apply to the other
243 | parts of the aggregate.
244 | 
245 |   6. Conveying Non-Source Forms.
246 | 
247 |   You may convey a covered work in object code form under the terms
248 | of sections 4 and 5, provided that you also convey the
249 | machine-readable Corresponding Source under the terms of this License,
250 | in one of these ways:
251 | 
252 |     a) Convey the object code in, or embodied in, a physical product
253 |     (including a physical distribution medium), accompanied by the
254 |     Corresponding Source fixed on a durable physical medium
255 |     customarily used for software interchange.
256 | 
257 |     b) Convey the object code in, or embodied in, a physical product
258 |     (including a physical distribution medium), accompanied by a
259 |     written offer, valid for at least three years and valid for as
260 |     long as you offer spare parts or customer support for that product
261 |     model, to give anyone who possesses the object code either (1) a
262 |     copy of the Corresponding Source for all the software in the
263 |     product that is covered by this License, on a durable physical
264 |     medium customarily used for software interchange, for a price no
265 |     more than your reasonable cost of physically performing this
266 |     conveying of source, or (2) access to copy the
267 |     Corresponding Source from a network server at no charge.
268 | 
269 |     c) Convey individual copies of the object code with a copy of the
270 |     written offer to provide the Corresponding Source.  This
271 |     alternative is allowed only occasionally and noncommercially, and
272 |     only if you received the object code with such an offer, in accord
273 |     with subsection 6b.
274 | 
275 |     d) Convey the object code by offering access from a designated
276 |     place (gratis or for a charge), and offer equivalent access to the
277 |     Corresponding Source in the same way through the same place at no
278 |     further charge.  You need not require recipients to copy the
279 |     Corresponding Source along with the object code.  If the place to
280 |     copy the object code is a network server, the Corresponding Source
281 |     may be on a different server (operated by you or a third party)
282 |     that supports equivalent copying facilities, provided you maintain
283 |     clear directions next to the object code saying where to find the
284 |     Corresponding Source.  Regardless of what server hosts the
285 |     Corresponding Source, you remain obligated to ensure that it is
286 |     available for as long as needed to satisfy these requirements.
287 | 
288 |     e) Convey the object code using peer-to-peer transmission, provided
289 |     you inform other peers where the object code and Corresponding
290 |     Source of the work are being offered to the general public at no
291 |     charge under subsection 6d.
292 | 
293 |   A separable portion of the object code, whose source code is excluded
294 | from the Corresponding Source as a System Library, need not be
295 | included in conveying the object code work.
296 | 
297 |   A "User Product" is either (1) a "consumer product", which means any
298 | tangible personal property which is normally used for personal, family,
299 | or household purposes, or (2) anything designed or sold for incorporation
300 | into a dwelling.  In determining whether a product is a consumer product,
301 | doubtful cases shall be resolved in favor of coverage.  For a particular
302 | product received by a particular user, "normally used" refers to a
303 | typical or common use of that class of product, regardless of the status
304 | of the particular user or of the way in which the particular user
305 | actually uses, or expects or is expected to use, the product.  A product
306 | is a consumer product regardless of whether the product has substantial
307 | commercial, industrial or non-consumer uses, unless such uses represent
308 | the only significant mode of use of the product.
309 | 
310 |   "Installation Information" for a User Product means any methods,
311 | procedures, authorization keys, or other information required to install
312 | and execute modified versions of a covered work in that User Product from
313 | a modified version of its Corresponding Source.  The information must
314 | suffice to ensure that the continued functioning of the modified object
315 | code is in no case prevented or interfered with solely because
316 | modification has been made.
317 | 
318 |   If you convey an object code work under this section in, or with, or
319 | specifically for use in, a User Product, and the conveying occurs as
320 | part of a transaction in which the right of possession and use of the
321 | User Product is transferred to the recipient in perpetuity or for a
322 | fixed term (regardless of how the transaction is characterized), the
323 | Corresponding Source conveyed under this section must be accompanied
324 | by the Installation Information.  But this requirement does not apply
325 | if neither you nor any third party retains the ability to install
326 | modified object code on the User Product (for example, the work has
327 | been installed in ROM).
328 | 
329 |   The requirement to provide Installation Information does not include a
330 | requirement to continue to provide support service, warranty, or updates
331 | for a work that has been modified or installed by the recipient, or for
332 | the User Product in which it has been modified or installed.  Access to a
333 | network may be denied when the modification itself materially and
334 | adversely affects the operation of the network or violates the rules and
335 | protocols for communication across the network.
336 | 
337 |   Corresponding Source conveyed, and Installation Information provided,
338 | in accord with this section must be in a format that is publicly
339 | documented (and with an implementation available to the public in
340 | source code form), and must require no special password or key for
341 | unpacking, reading or copying.
342 | 
343 |   7. Additional Terms.
344 | 
345 |   "Additional permissions" are terms that supplement the terms of this
346 | License by making exceptions from one or more of its conditions.
347 | Additional permissions that are applicable to the entire Program shall
348 | be treated as though they were included in this License, to the extent
349 | that they are valid under applicable law.  If additional permissions
350 | apply only to part of the Program, that part may be used separately
351 | under those permissions, but the entire Program remains governed by
352 | this License without regard to the additional permissions.
353 | 
354 |   When you convey a copy of a covered work, you may at your option
355 | remove any additional permissions from that copy, or from any part of
356 | it.  (Additional permissions may be written to require their own
357 | removal in certain cases when you modify the work.)  You may place
358 | additional permissions on material, added by you to a covered work,
359 | for which you have or can give appropriate copyright permission.
360 | 
361 |   Notwithstanding any other provision of this License, for material you
362 | add to a covered work, you may (if authorized by the copyright holders of
363 | that material) supplement the terms of this License with terms:
364 | 
365 |     a) Disclaiming warranty or limiting liability differently from the
366 |     terms of sections 15 and 16 of this License; or
367 | 
368 |     b) Requiring preservation of specified reasonable legal notices or
369 |     author attributions in that material or in the Appropriate Legal
370 |     Notices displayed by works containing it; or
371 | 
372 |     c) Prohibiting misrepresentation of the origin of that material, or
373 |     requiring that modified versions of such material be marked in
374 |     reasonable ways as different from the original version; or
375 | 
376 |     d) Limiting the use for publicity purposes of names of licensors or
377 |     authors of the material; or
378 | 
379 |     e) Declining to grant rights under trademark law for use of some
380 |     trade names, trademarks, or service marks; or
381 | 
382 |     f) Requiring indemnification of licensors and authors of that
383 |     material by anyone who conveys the material (or modified versions of
384 |     it) with contractual assumptions of liability to the recipient, for
385 |     any liability that these contractual assumptions directly impose on
386 |     those licensors and authors.
387 | 
388 |   All other non-permissive additional terms are considered "further
389 | restrictions" within the meaning of section 10.  If the Program as you
390 | received it, or any part of it, contains a notice stating that it is
391 | governed by this License along with a term that is a further
392 | restriction, you may remove that term.  If a license document contains
393 | a further restriction but permits relicensing or conveying under this
394 | License, you may add to a covered work material governed by the terms
395 | of that license document, provided that the further restriction does
396 | not survive such relicensing or conveying.
397 | 
398 |   If you add terms to a covered work in accord with this section, you
399 | must place, in the relevant source files, a statement of the
400 | additional terms that apply to those files, or a notice indicating
401 | where to find the applicable terms.
402 | 
403 |   Additional terms, permissive or non-permissive, may be stated in the
404 | form of a separately written license, or stated as exceptions;
405 | the above requirements apply either way.
406 | 
407 |   8. Termination.
408 | 
409 |   You may not propagate or modify a covered work except as expressly
410 | provided under this License.  Any attempt otherwise to propagate or
411 | modify it is void, and will automatically terminate your rights under
412 | this License (including any patent licenses granted under the third
413 | paragraph of section 11).
414 | 
415 |   However, if you cease all violation of this License, then your
416 | license from a particular copyright holder is reinstated (a)
417 | provisionally, unless and until the copyright holder explicitly and
418 | finally terminates your license, and (b) permanently, if the copyright
419 | holder fails to notify you of the violation by some reasonable means
420 | prior to 60 days after the cessation.
421 | 
422 |   Moreover, your license from a particular copyright holder is
423 | reinstated permanently if the copyright holder notifies you of the
424 | violation by some reasonable means, this is the first time you have
425 | received notice of violation of this License (for any work) from that
426 | copyright holder, and you cure the violation prior to 30 days after
427 | your receipt of the notice.
428 | 
429 |   Termination of your rights under this section does not terminate the
430 | licenses of parties who have received copies or rights from you under
431 | this License.  If your rights have been terminated and not permanently
432 | reinstated, you do not qualify to receive new licenses for the same
433 | material under section 10.
434 | 
435 |   9. Acceptance Not Required for Having Copies.
436 | 
437 |   You are not required to accept this License in order to receive or
438 | run a copy of the Program.  Ancillary propagation of a covered work
439 | occurring solely as a consequence of using peer-to-peer transmission
440 | to receive a copy likewise does not require acceptance.  However,
441 | nothing other than this License grants you permission to propagate or
442 | modify any covered work.  These actions infringe copyright if you do
443 | not accept this License.  Therefore, by modifying or propagating a
444 | covered work, you indicate your acceptance of this License to do so.
445 | 
446 |   10. Automatic Licensing of Downstream Recipients.
447 | 
448 |   Each time you convey a covered work, the recipient automatically
449 | receives a license from the original licensors, to run, modify and
450 | propagate that work, subject to this License.  You are not responsible
451 | for enforcing compliance by third parties with this License.
452 | 
453 |   An "entity transaction" is a transaction transferring control of an
454 | organization, or substantially all assets of one, or subdividing an
455 | organization, or merging organizations.  If propagation of a covered
456 | work results from an entity transaction, each party to that
457 | transaction who receives a copy of the work also receives whatever
458 | licenses to the work the party's predecessor in interest had or could
459 | give under the previous paragraph, plus a right to possession of the
460 | Corresponding Source of the work from the predecessor in interest, if
461 | the predecessor has it or can get it with reasonable efforts.
462 | 
463 |   You may not impose any further restrictions on the exercise of the
464 | rights granted or affirmed under this License.  For example, you may
465 | not impose a license fee, royalty, or other charge for exercise of
466 | rights granted under this License, and you may not initiate litigation
467 | (including a cross-claim or counterclaim in a lawsuit) alleging that
468 | any patent claim is infringed by making, using, selling, offering for
469 | sale, or importing the Program or any portion of it.
470 | 
471 |   11. Patents.
472 | 
473 |   A "contributor" is a copyright holder who authorizes use under this
474 | License of the Program or a work on which the Program is based.  The
475 | work thus licensed is called the contributor's "contributor version".
476 | 
477 |   A contributor's "essential patent claims" are all patent claims
478 | owned or controlled by the contributor, whether already acquired or
479 | hereafter acquired, that would be infringed by some manner, permitted
480 | by this License, of making, using, or selling its contributor version,
481 | but do not include claims that would be infringed only as a
482 | consequence of further modification of the contributor version.  For
483 | purposes of this definition, "control" includes the right to grant
484 | patent sublicenses in a manner consistent with the requirements of
485 | this License.
486 | 
487 |   Each contributor grants you a non-exclusive, worldwide, royalty-free
488 | patent license under the contributor's essential patent claims, to
489 | make, use, sell, offer for sale, import and otherwise run, modify and
490 | propagate the contents of its contributor version.
491 | 
492 |   In the following three paragraphs, a "patent license" is any express
493 | agreement or commitment, however denominated, not to enforce a patent
494 | (such as an express permission to practice a patent or covenant not to
495 | sue for patent infringement).  To "grant" such a patent license to a
496 | party means to make such an agreement or commitment not to enforce a
497 | patent against the party.
498 | 
499 |   If you convey a covered work, knowingly relying on a patent license,
500 | and the Corresponding Source of the work is not available for anyone
501 | to copy, free of charge and under the terms of this License, through a
502 | publicly available network server or other readily accessible means,
503 | then you must either (1) cause the Corresponding Source to be so
504 | available, or (2) arrange to deprive yourself of the benefit of the
505 | patent license for this particular work, or (3) arrange, in a manner
506 | consistent with the requirements of this License, to extend the patent
507 | license to downstream recipients.  "Knowingly relying" means you have
508 | actual knowledge that, but for the patent license, your conveying the
509 | covered work in a country, or your recipient's use of the covered work
510 | in a country, would infringe one or more identifiable patents in that
511 | country that you have reason to believe are valid.
512 | 
513 |   If, pursuant to or in connection with a single transaction or
514 | arrangement, you convey, or propagate by procuring conveyance of, a
515 | covered work, and grant a patent license to some of the parties
516 | receiving the covered work authorizing them to use, propagate, modify
517 | or convey a specific copy of the covered work, then the patent license
518 | you grant is automatically extended to all recipients of the covered
519 | work and works based on it.
520 | 
521 |   A patent license is "discriminatory" if it does not include within
522 | the scope of its coverage, prohibits the exercise of, or is
523 | conditioned on the non-exercise of one or more of the rights that are
524 | specifically granted under this License.  You may not convey a covered
525 | work if you are a party to an arrangement with a third party that is
526 | in the business of distributing software, under which you make payment
527 | to the third party based on the extent of your activity of conveying
528 | the work, and under which the third party grants, to any of the
529 | parties who would receive the covered work from you, a discriminatory
530 | patent license (a) in connection with copies of the covered work
531 | conveyed by you (or copies made from those copies), or (b) primarily
532 | for and in connection with specific products or compilations that
533 | contain the covered work, unless you entered into that arrangement,
534 | or that patent license was granted, prior to 28 March 2007.
535 | 
536 |   Nothing in this License shall be construed as excluding or limiting
537 | any implied license or other defenses to infringement that may
538 | otherwise be available to you under applicable patent law.
539 | 
540 |   12. No Surrender of Others' Freedom.
541 | 
542 |   If conditions are imposed on you (whether by court order, agreement or
543 | otherwise) that contradict the conditions of this License, they do not
544 | excuse you from the conditions of this License.  If you cannot convey a
545 | covered work so as to satisfy simultaneously your obligations under this
546 | License and any other pertinent obligations, then as a consequence you may
547 | not convey it at all.  For example, if you agree to terms that obligate you
548 | to collect a royalty for further conveying from those to whom you convey
549 | the Program, the only way you could satisfy both those terms and this
550 | License would be to refrain entirely from conveying the Program.
551 | 
552 |   13. Use with the GNU Affero General Public License.
553 | 
554 |   Notwithstanding any other provision of this License, you have
555 | permission to link or combine any covered work with a work licensed
556 | under version 3 of the GNU Affero General Public License into a single
557 | combined work, and to convey the resulting work.  The terms of this
558 | License will continue to apply to the part which is the covered work,
559 | but the special requirements of the GNU Affero General Public License,
560 | section 13, concerning interaction through a network will apply to the
561 | combination as such.
562 | 
563 |   14. Revised Versions of this License.
564 | 
565 |   The Free Software Foundation may publish revised and/or new versions of
566 | the GNU General Public License from time to time.  Such new versions will
567 | be similar in spirit to the present version, but may differ in detail to
568 | address new problems or concerns.
569 | 
570 |   Each version is given a distinguishing version number.  If the
571 | Program specifies that a certain numbered version of the GNU General
572 | Public License "or any later version" applies to it, you have the
573 | option of following the terms and conditions either of that numbered
574 | version or of any later version published by the Free Software
575 | Foundation.  If the Program does not specify a version number of the
576 | GNU General Public License, you may choose any version ever published
577 | by the Free Software Foundation.
578 | 
579 |   If the Program specifies that a proxy can decide which future
580 | versions of the GNU General Public License can be used, that proxy's
581 | public statement of acceptance of a version permanently authorizes you
582 | to choose that version for the Program.
583 | 
584 |   Later license versions may give you additional or different
585 | permissions.  However, no additional obligations are imposed on any
586 | author or copyright holder as a result of your choosing to follow a
587 | later version.
588 | 
589 |   15. Disclaimer of Warranty.
590 | 
591 |   THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
592 | APPLICABLE LAW.  EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
593 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
594 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
595 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
596 | PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
597 | IS WITH YOU.  SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
598 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
599 | 
600 |   16. Limitation of Liability.
601 | 
602 |   IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
603 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
604 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
605 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
606 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
607 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
608 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
609 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
610 | SUCH DAMAGES.
611 | 
612 |   17. Interpretation of Sections 15 and 16.
613 | 
614 |   If the disclaimer of warranty and limitation of liability provided
615 | above cannot be given local legal effect according to their terms,
616 | reviewing courts shall apply local law that most closely approximates
617 | an absolute waiver of all civil liability in connection with the
618 | Program, unless a warranty or assumption of liability accompanies a
619 | copy of the Program in return for a fee.
620 | 
621 |                      END OF TERMS AND CONDITIONS
622 | 
623 |             How to Apply These Terms to Your New Programs
624 | 
625 |   If you develop a new program, and you want it to be of the greatest
626 | possible use to the public, the best way to achieve this is to make it
627 | free software which everyone can redistribute and change under these terms.
628 | 
629 |   To do so, attach the following notices to the program.  It is safest
630 | to attach them to the start of each source file to most effectively
631 | state the exclusion of warranty; and each file should have at least
632 | the "copyright" line and a pointer to where the full notice is found.
633 | 
634 |     <one line to give the program's name and a brief idea of what it does.>
635 |     Copyright (C) <year>  <name of author>
636 | 
637 |     This program is free software: you can redistribute it and/or modify
638 |     it under the terms of the GNU General Public License as published by
639 |     the Free Software Foundation, either version 3 of the License, or
640 |     (at your option) any later version.
641 | 
642 |     This program is distributed in the hope that it will be useful,
643 |     but WITHOUT ANY WARRANTY; without even the implied warranty of
644 |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
645 |     GNU General Public License for more details.
646 | 
647 |     You should have received a copy of the GNU General Public License
648 |     along with this program.  If not, see <https://www.gnu.org/licenses/>.
649 | 
650 | Also add information on how to contact you by electronic and paper mail.
651 | 
652 |   If the program does terminal interaction, make it output a short
653 | notice like this when it starts in an interactive mode:
654 | 
655 |     <program>  Copyright (C) <year>  <name of author>
656 |     This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
657 |     This is free software, and you are welcome to redistribute it
658 |     under certain conditions; type `show c' for details.
659 | 
660 | The hypothetical commands `show w' and `show c' should show the appropriate
661 | parts of the General Public License.  Of course, your program's commands
662 | might be different; for a GUI interface, you would use an "about box".
663 | 
664 |   You should also get your employer (if you work as a programmer) or school,
665 | if any, to sign a "copyright disclaimer" for the program, if necessary.
666 | For more information on this, and how to apply and follow the GNU GPL, see
667 | <https://www.gnu.org/licenses/>.
668 | 
669 |   The GNU General Public License does not permit incorporating your program
670 | into proprietary programs.  If your program is a subroutine library, you
671 | may consider it more useful to permit linking proprietary applications with
672 | the library.  If this is what you want to do, use the GNU Lesser General
673 | Public License instead of this License.  But first, please read
674 | <https://www.gnu.org/licenses/why-not-lgpl.html>.
675 | 


--------------------------------------------------------------------------------