├── .gitignore ├── .gitmodules ├── .travis.yml ├── .travis └── build-wheels.sh ├── COPYING ├── MANIFEST.in ├── Makefile ├── README.md ├── doc ├── Makefile ├── api.rst ├── conf.py ├── dev_notes.txt ├── guide.rst ├── index.rst ├── make.bat └── tutorial.rst ├── examples ├── warm_start_als.py └── warm_start_mcmc.py ├── fastFM ├── __init__.py ├── als.py ├── base.py ├── bpr.py ├── cffm.pxd ├── datasets.py ├── ffm.pyx ├── mcmc.py ├── sgd.py ├── tests │ ├── test_als.py │ ├── test_base.py │ ├── test_datasets.py │ ├── test_ffm.py │ ├── test_mcmc.py │ ├── test_ranking.py │ ├── test_sgd.py │ └── test_utils.py ├── utils.py └── validation.py ├── requirements.txt ├── setup.cfg └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | # C 2 | *.swp 3 | *.o 4 | *.a 5 | *.so 6 | *.zip 7 | # latex 8 | *.aux 9 | *.bbl 10 | *.blg 11 | *.dvi 12 | *.log 13 | *.toc 14 | # python 15 | *.pyc 16 | fastFM/ffm.c 17 | fastFM.egg-info/ 18 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "fastFM-core"] 2 | path = fastFM-core 3 | url = https://github.com/ibayer/fastFM-core.git 4 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: c 2 | 3 | matrix: 4 | include: 5 | - os: osx 6 | env: 7 | - TRAVIS_PYTHON_VERSION="2.7" 8 | - DEPLOYABLE="true" 9 | - os: osx 10 | env: 11 | - TRAVIS_PYTHON_VERSION="3.5" 12 | - DEPLOYABLE="true" 13 | - os: osx 14 | env: 15 | - TRAVIS_PYTHON_VERSION="3.6" 16 | - DEPLOYABLE="true" 17 | - os: linux 18 | env: 19 | - TRAVIS_PYTHON_VERSION="2.7" 20 | - os: linux 21 | env: 22 | - TRAVIS_PYTHON_VERSION="3.5" 23 | - os: linux 24 | env: 25 | - TRAVIS_PYTHON_VERSION="3.6" 26 | #- services: docker 27 | # sudo: required 28 | # env: 29 | # - 
DEPLOY_TARGET="manylinux1" 30 | # - DEPLOYABLE="true" 31 | 32 | dist: trusty 33 | 34 | before_install: 35 | - | 36 | # Skip if manylinux1 37 | if [ "$DEPLOY_TARGET" = "manylinux1" ]; then 38 | echo "Skip before_install step..." 39 | else 40 | # fastFM-core depends on cblas 41 | if [ "$TRAVIS_OS_NAME" = "linux" ]; then sudo apt-get update -qq; sudo apt-get install -y libopenblas-dev; fi 42 | if [[ "$TRAVIS_PYTHON_VERSION" =~ ^2 ]]; then 43 | if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then 44 | wget https://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh -O miniconda.sh; 45 | else 46 | wget https://repo.continuum.io/miniconda/Miniconda-latest-MacOSX-x86_64.sh -O miniconda.sh; 47 | fi 48 | else 49 | if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then 50 | wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh; 51 | else 52 | wget https://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -O miniconda.sh; 53 | fi 54 | fi 55 | bash miniconda.sh -b -p $HOME/miniconda 56 | export PATH="$HOME/miniconda/bin:$PATH" 57 | hash -r 58 | conda config --set always_yes yes --set changeps1 no 59 | conda update -q conda 60 | # Useful for debugging any issues with conda 61 | conda info -a 62 | conda create -q -n test-environment python=$TRAVIS_PYTHON_VERSION cython numpy pandas scipy scikit-learn nose 63 | source activate test-environment 64 | fi 65 | 66 | install: 67 | - | 68 | git submodule update --init --recursive 69 | if [ "$DEPLOY_TARGET" = "manylinux1" ]; then 70 | : 71 | else 72 | make 73 | python setup.py bdist_wheel 74 | pip install dist/*.whl 75 | fi 76 | 77 | script: 78 | - | 79 | if [ "$DEPLOY_TARGET" = "manylinux1" ]; then 80 | #build for 64-bit 81 | docker run --rm -v `pwd`:/io quay.io/pypa/manylinux1_x86_64 /io/.travis/build-wheels.sh 82 | else 83 | nosetests 84 | fi 85 | 86 | deploy: 87 | provider: releases 88 | api_key: 89 | secure: 
AJcZoe2+OiMJ4VlSkASAeMc/ii0ZRnj2PFaaL7zlSbx1THMpY/49U5BSyqX1PQioPSlTV3ZsIXI3u7KyqoXIQSXWzAuaBzpLTLS85fGSuTvUuexmaJtKU92OC143tuVVLCPnjC992+1uyctjrxMSqgoaUolfYkEftt5RGrMIKl2duGfDXrPXIueHSl8FQGXkmlY6NqkRx2v5kxsAjFcurvwTNU8ptJ84jVKjrE6t1IB61vp2eUcqVR/z6Lwau6mdvIybglnbH4lCMXP98zEIibLA8vbn3XxrC+0uU7Kjz37K6/CsJEPNL5tujJDMRKAupnrkgPsAGTpsAn6O6uLUz0ISgcen8R6KJ7cBli+cq08OZ3JLLoJpqkni62YVSQV+uYkQk9b5Pu09vUTOozJMnOqLSj9hVIswyxGiFPcTFskMgqMdx15M59gd0YpXH633YqwBgRmWNsctp4BKnTaE3iGW6aZc8lrXxpL7qcVAosjmpjLp3jiPXVSRdYf0yHl6pDUj5ZVyu27kAn1/I9JL0nH19zjXF2tUlEjuT9ydHwnhmsgBN/V+JhZxi7ZeEbOZfY1MfekKM/NwSRehVEp/J0XWqWg+kIXRU/rqY1/w0vLVNFeQirpEjUp39eCBydXeS3Bik8uANW2UTxojJo3LBfLLoAT8ZWFb3YrIBAYkzjc= 90 | file_glob: true 91 | file: dist/fastFM-*.whl 92 | skip_cleanup: true 93 | on: 94 | tags: true 95 | condition: $DEPLOYABLE = "true" 96 | -------------------------------------------------------------------------------- /.travis/build-wheels.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | #Author: Likhith Chitneni 4 | #License: BSD 3 Clause license - https://opensource.org/licenses/BSD-3-Clause 5 | # 6 | 7 | set -e -x 8 | 9 | # Install any system packages required here 10 | #yum install -y $PACKAGE_TO_BE_INSTALLED 11 | 12 | #Remove Python 2.6 and 3.3 since numpy requires >=2.7 or >=3.4 13 | rm -rf /opt/python/cpython-2.6.9-* 14 | rm -rf /opt/python/cp33-cp33m 15 | 16 | #Make fastFM-core 17 | cd /io/fastFM-core 18 | make clean && make 19 | cd / 20 | 21 | #Compile wheels 22 | for PYBIN in /opt/python/*/bin; do 23 | "${PYBIN}/pip" install -r /io/requirements.txt 24 | "${PYBIN}/pip" wheel /io/ -w wheelhouse/ 25 | done 26 | 27 | # Bundle external shared libraries into the wheels 28 | for whl in wheelhouse/*.whl; do 29 | auditwheel repair "$whl" -w /io/wheelhouse/ 30 | done 31 | 32 | # Install packages and test 33 | for PYBIN in /opt/python/*/bin; do 34 | "${PYBIN}/pip" install fastFM --no-index -f /io/wheelhouse 35 | "${PYBIN}/pip" 
install nose 36 | (cd "$HOME"; "${PYBIN}/nosetests" /io/fastFM/tests) 37 | done 38 | 39 | mv /io/wheelhouse /io/dist 40 | -------------------------------------------------------------------------------- /COPYING: -------------------------------------------------------------------------------- 1 | New BSD License 2 | 3 | Copyright (c) 2014–2015 Immanuel Bayer 4 | All rights reserved. 5 | 6 | 7 | Redistribution and use in source and binary forms, with or without 8 | modification, are permitted provided that the following conditions are met: 9 | 10 | a. Redistributions of source code must retain the above copyright notice, 11 | this list of conditions and the following disclaimer. 12 | b. Redistributions in binary form must reproduce the above copyright 13 | notice, this list of conditions and the following disclaimer in the 14 | documentation and/or other materials provided with the distribution. 15 | c. Neither the name of the developers nor the names of 16 | its contributors may be used to endorse or promote products 17 | derived from this software without specific prior written 18 | permission. 19 | 20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 22 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 | ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR 25 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 26 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 27 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 28 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 29 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 30 | OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH 31 | DAMAGE. 
32 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | include fastFM/ffm.c 3 | include fastFM/ffm.pyx 4 | include fastFM/cffm.pxd 5 | recursive-include fastFM-core/include * 6 | include fastFM-core/bin/libfastfm.a 7 | include fastFM-core/externals/CXSparse/Lib/libcxsparse.a 8 | recursive-include fastFM-core/externals/CXSparse/Include * 9 | recursive-include fastFM-core/externals/CXSparse/SuiteSparse_config * 10 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | PYTHON ?= python 2 | 3 | all: 4 | ( cd fastFM-core ; $(MAKE) lib ) 5 | $(PYTHON) setup.py build_ext --inplace 6 | 7 | .PHONY : clean 8 | clean: 9 | ( cd fastFM-core ; $(MAKE) clean ) 10 | rm -f fastFM/*.so 11 | rm -f *.so 12 | rm -rf build/ 13 | rm -f fastFM/ffm.c 14 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Citing fastFM 2 | ============= 3 | 4 | The library fastFM is an academic project. The time and resources spent 5 | developing fastFM are therefore justified by the number of citations of 6 | the software. If you publish scientific articles using fastFM, please 7 | cite the following article (bibtex entry 8 | [citation.bib](http://jmlr.org/papers/v17/15-355.bib)). 9 | 10 | > Bayer, I. \"fastFM: A Library for Factorization Machines\" Journal of 11 | > Machine Learning Research 17, pp. 
1-5 (2016) 12 | 13 | fastFM: A Library for Factorization Machines 14 | ============================================ 15 | 16 | [![image](https://travis-ci.org/ibayer/fastFM.svg?branch=master)](https://travis-ci.org/ibayer/fastFM) 17 | [![image](https://img.shields.io/badge/platform-OSX%7CLinux-lightgrey.svg)](https://travis-ci.org/ibayer/fastFM) 18 | [![image](https://img.shields.io/pypi/l/Django.svg)](https://travis-ci.org/ibayer/fastFM) 19 | 20 | This repository allows you to use Factorization Machines in **Python** 21 | (2.7 & 3.x) with the well known **scikit-learn API**. All performance 22 | critical code has been written in C and wrapped with Cython. fastFM 23 | provides stochastic gradient descent (SGD) and coordinate descent (CD) 24 | optimization routines as well as Markov Chain Monte Carlo (MCMC) for 25 | Bayesian inference. The solvers can be used for regression, 26 | classification and ranking problems. Detailed usage instructions can be 27 | found in the [online documentation](http://ibayer.github.io/fastFM) and 28 | on [arXiv](http://arxiv.org/abs/1505.00641). 29 | 30 | Supported Operating Systems 31 | --------------------------- 32 | 33 | fastFM has a continuous integration / testing servers (Travis) for 34 | **Linux (Ubuntu 14.04 LTS)** and **OS X Mavericks**. Other OSs are not 35 | actively supported. 36 | 37 | Usage 38 | ----- 39 | 40 | ``` {.python} 41 | from fastFM import als 42 | fm = als.FMRegression(n_iter=1000, init_stdev=0.1, rank=2, l2_reg_w=0.1, l2_reg_V=0.5) 43 | fm.fit(X_train, y_train) 44 | y_pred = fm.predict(X_test) 45 | ``` 46 | 47 | Tutorials and other information are available 48 | [here](http://arxiv.org/abs/1505.00641). The C code is available as 49 | [subrepository](https://github.com/ibayer/fastFM-core) and provides a 50 | stand alone command line interface. If you still have **questions** 51 | after reading the documentation please open an issue at GitHub. 
52 | 53 | | Task | Solver | Loss | 54 | | :------------- | :----------: | -----------: | 55 | | Regression | als, mcmc, sgd | Square Loss | 56 | | Classification | als, mcmc, sgd | Probit(Map), Probit, Sigmoid| 57 | | Ranking | sgd | BPR | 58 | 59 | *Supported solvers and tasks* 60 | 61 | Installation 62 | ------------ 63 | 64 | **binary install (64bit only)** 65 | 66 | `pip install fastFM` 67 | 68 | **source install** 69 | 70 | *Please make sure, that Python and OS bit version agree, e.g. 32bit 71 | Python on 64bit OS won\'t work.* 72 | 73 | ``` {.bash} 74 | # Install cblas and python-dev header (Linux only). 75 | # - cblas can be installed with libatlas-base-dev or libopenblas-dev (Ubuntu) 76 | $ sudo apt-get install python-dev libopenblas-dev 77 | 78 | # Clone the repo including submodules (or clone + `git submodule update --init --recursive`) 79 | $ git clone --recursive https://github.com/ibayer/fastFM.git 80 | 81 | # Enter the root directory 82 | $ cd fastFM 83 | 84 | # Install Python dependencies (Cython>=0.22, numpy, pandas, scipy, scikit-learn) 85 | $ pip install -r ./requirements.txt 86 | 87 | # Compile the C extension. 88 | $ make # build with default python version (python) 89 | $ PYTHON=python3 make # build with custom python version (python3) 90 | 91 | # Install fastFM 92 | $ pip install . 93 | ``` 94 | 95 | Tests 96 | ----- 97 | 98 | The Python tests (`pip install nose`) can be run with: 99 | `nosetests fastFM/fastFM/tests` 100 | 101 | Please refer to the fastFM-core README for instruction on how to run the 102 | C tests at `fastFM/fastFM-core/src/tests`. 
103 | 104 | Contribution 105 | ------------ 106 | 107 | - Star this repository: keeps contributors motivated 108 | - Open an issue: report bugs or suggest improvements 109 | - Fix errors in the documentation: small changes matter 110 | - Contribute code 111 | 112 | **Contributions are very welcome!** Since this project lives on GitHub 113 | we recommend to open a pull request (PR) for code contributions as early 114 | as possible. This is the fastest way to get feedback and allows [Travis 115 | CI](https://travis-ci.org/ibayer/fastFM) to run checks on your changes. 116 | 117 | Most information you need to setup your **development environment** can 118 | be learned by adapting the great instructions on 119 | 120 | . Please ensure that your contribution conforms to the 121 | [PEP8](http://www.python.org/dev/peps/pep-0008/) Coding Style and 122 | includes unit tests where appropriate. More valuable guidelines that 123 | apply to fastFM can be found at 124 | 125 | . 126 | 127 | **Contributors** 128 | 129 | - [aaossa](https://github.com/aaossa/) 130 | - [altimin](https://github.com/altimin) 131 | - [bdaskalov](https://github.com/bdaskalov) 132 | - [chezou](https://github.com/chezou) 133 | - [macks22](https://github.com/macks22) 134 | - [takuti](https://github.com/takuti) 135 | - [ibayer](https://github.com/ibayer) 136 | 137 | License: BSD 138 | ------------ 139 | 140 | 141 | 142 | 143 | -------------------------------------------------------------------------------- /doc/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # User-friendly check for sphinx-build 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 12 | $(error The '$(SPHINXBUILD)' command was not found. 
Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) 13 | endif 14 | 15 | # Internal variables. 16 | PAPEROPT_a4 = -D latex_paper_size=a4 17 | PAPEROPT_letter = -D latex_paper_size=letter 18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 19 | # the i18n builder cannot share the environment and doctrees with the others 20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 21 | 22 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest coverage gettext 23 | 24 | help: 25 | @echo "Please use \`make ' where is one of" 26 | @echo " html to make standalone HTML files" 27 | @echo " dirhtml to make HTML files named index.html in directories" 28 | @echo " singlehtml to make a single large HTML file" 29 | @echo " pickle to make pickle files" 30 | @echo " json to make JSON files" 31 | @echo " htmlhelp to make HTML files and a HTML help project" 32 | @echo " qthelp to make HTML files and a qthelp project" 33 | @echo " applehelp to make an Apple Help Book" 34 | @echo " devhelp to make HTML files and a Devhelp project" 35 | @echo " epub to make an epub" 36 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 37 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 38 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 39 | @echo " text to make text files" 40 | @echo " man to make manual pages" 41 | @echo " texinfo to make Texinfo files" 42 | @echo " info to make Texinfo files and run them through makeinfo" 43 | @echo " gettext to make PO message catalogs" 44 | @echo " changes to make an overview of all changed/added/deprecated items" 45 | @echo " xml to make 
Docutils-native XML files" 46 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 47 | @echo " linkcheck to check all external links for integrity" 48 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 49 | @echo " coverage to run coverage check of the documentation (if enabled)" 50 | 51 | clean: 52 | rm -rf $(BUILDDIR)/* 53 | 54 | html: 55 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 56 | @echo 57 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 58 | 59 | dirhtml: 60 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 61 | @echo 62 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 63 | 64 | singlehtml: 65 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 66 | @echo 67 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 68 | 69 | pickle: 70 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 71 | @echo 72 | @echo "Build finished; now you can process the pickle files." 73 | 74 | json: 75 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 76 | @echo 77 | @echo "Build finished; now you can process the JSON files." 78 | 79 | htmlhelp: 80 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 81 | @echo 82 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 83 | ".hhp project file in $(BUILDDIR)/htmlhelp." 84 | 85 | qthelp: 86 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 87 | @echo 88 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 89 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 90 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/fastFM.qhcp" 91 | @echo "To view the help file:" 92 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/fastFM.qhc" 93 | 94 | applehelp: 95 | $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp 96 | @echo 97 | @echo "Build finished. 
The help book is in $(BUILDDIR)/applehelp." 98 | @echo "N.B. You won't be able to view it unless you put it in" \ 99 | "~/Library/Documentation/Help or install it in your application" \ 100 | "bundle." 101 | 102 | devhelp: 103 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 104 | @echo 105 | @echo "Build finished." 106 | @echo "To view the help file:" 107 | @echo "# mkdir -p $$HOME/.local/share/devhelp/fastFM" 108 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/fastFM" 109 | @echo "# devhelp" 110 | 111 | epub: 112 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 113 | @echo 114 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 115 | 116 | latex: 117 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 118 | @echo 119 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 120 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 121 | "(use \`make latexpdf' here to do that automatically)." 122 | 123 | latexpdf: 124 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 125 | @echo "Running LaTeX files through pdflatex..." 126 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 127 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 128 | 129 | latexpdfja: 130 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 131 | @echo "Running LaTeX files through platex and dvipdfmx..." 132 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 133 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 134 | 135 | text: 136 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 137 | @echo 138 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 139 | 140 | man: 141 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 142 | @echo 143 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 144 | 145 | texinfo: 146 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 147 | @echo 148 | @echo "Build finished. 
The Texinfo files are in $(BUILDDIR)/texinfo." 149 | @echo "Run \`make' in that directory to run these through makeinfo" \ 150 | "(use \`make info' here to do that automatically)." 151 | 152 | info: 153 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 154 | @echo "Running Texinfo files through makeinfo..." 155 | make -C $(BUILDDIR)/texinfo info 156 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 157 | 158 | gettext: 159 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 160 | @echo 161 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 162 | 163 | changes: 164 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 165 | @echo 166 | @echo "The overview file is in $(BUILDDIR)/changes." 167 | 168 | linkcheck: 169 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 170 | @echo 171 | @echo "Link check complete; look for any errors in the above output " \ 172 | "or in $(BUILDDIR)/linkcheck/output.txt." 173 | 174 | doctest: 175 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 176 | @echo "Testing of doctests in the sources finished, look at the " \ 177 | "results in $(BUILDDIR)/doctest/output.txt." 178 | 179 | coverage: 180 | $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage 181 | @echo "Testing of coverage in the sources finished, look at the " \ 182 | "results in $(BUILDDIR)/coverage/python.txt." 183 | 184 | xml: 185 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 186 | @echo 187 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 188 | 189 | pseudoxml: 190 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 191 | @echo 192 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 
193 | -------------------------------------------------------------------------------- /doc/api.rst: -------------------------------------------------------------------------------- 1 | The fastFM API reference 2 | ======================== 3 | 4 | 5 | The MCMC module 6 | --------------- 7 | 8 | .. automodule:: fastFM.mcmc 9 | :members: 10 | :inherited-members: predict 11 | 12 | The ALS module 13 | --------------- 14 | 15 | .. automodule:: fastFM.als 16 | :members: 17 | :inherited-members: predict 18 | 19 | The SGD module 20 | -------------- 21 | 22 | .. automodule:: fastFM.sgd 23 | :members: 24 | :inherited-members: predict 25 | 26 | The Ranking module 27 | ------------------ 28 | 29 | .. automodule:: fastFM.bpr 30 | :members: 31 | :inherited-members: predict 32 | -------------------------------------------------------------------------------- /doc/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # fastFM documentation build configuration file, created by 4 | # sphinx-quickstart on Mon Jun 15 01:42:19 2015. 5 | # 6 | # This file is execfile()d with the current directory set to its 7 | # containing dir. 8 | # 9 | # Note that not all possible configuration values are present in this 10 | # autogenerated file. 11 | # 12 | # All configuration values have a default; values that are commented out 13 | # serve to show the default. 14 | 15 | import sys 16 | import os 17 | import shlex 18 | 19 | # If extensions (or modules to document with autodoc) are in another directory, 20 | # add these directories to sys.path here. If the directory is relative to the 21 | # documentation root, use os.path.abspath to make it absolute, like shown here. 
22 | #sys.path.insert(0, os.path.abspath('.')) 23 | #sys.path.insert(0, os.path.abspath('../fastFM')) 24 | sys.path.append(os.path.abspath('../fastFM')) 25 | 26 | # -- General configuration ------------------------------------------------ 27 | 28 | # If your documentation needs a minimal Sphinx version, state it here. 29 | #needs_sphinx = '1.0' 30 | 31 | # Add any Sphinx extension module names here, as strings. They can be 32 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 33 | # ones. 34 | extensions = [ 35 | 'matplotlib.sphinxext.plot_directive', 36 | 'sphinx.ext.autodoc', 37 | 'sphinx.ext.doctest', 38 | 'sphinx.ext.mathjax', 39 | 'sphinx.ext.viewcode', 40 | 'sphinx.ext.napoleon', 41 | ] 42 | 43 | # Add any paths that contain templates here, relative to this directory. 44 | templates_path = ['_templates'] 45 | 46 | # The suffix(es) of source filenames. 47 | # You can specify multiple suffix as a list of string: 48 | # source_suffix = ['.rst', '.md'] 49 | source_suffix = '.rst' 50 | 51 | # The encoding of source files. 52 | #source_encoding = 'utf-8-sig' 53 | 54 | # The master toctree document. 55 | master_doc = 'index' 56 | 57 | # General information about the project. 58 | project = u'fastFM' 59 | copyright = u'2016, Immanuel Bayer' 60 | author = u'Immanuel Bayer' 61 | 62 | # The version info for the project you're documenting, acts as replacement for 63 | # |version| and |release|, also used in various other places throughout the 64 | # built documents. 65 | # 66 | # The short X.Y version. 67 | version = '0.2' 68 | # The full version, including alpha/beta/rc tags. 69 | release = '0.2.11' 70 | 71 | # The language for content autogenerated by Sphinx. Refer to documentation 72 | # for a list of supported languages. 73 | # 74 | # This is also used if you do content translation via gettext catalogs. 75 | # Usually you set "language" from the command line for these cases. 
76 | language = None 77 | 78 | # There are two options for replacing |today|: either, you set today to some 79 | # non-false value, then it is used: 80 | #today = '' 81 | # Else, today_fmt is used as the format for a strftime call. 82 | #today_fmt = '%B %d, %Y' 83 | 84 | # List of patterns, relative to source directory, that match files and 85 | # directories to ignore when looking for source files. 86 | exclude_patterns = ['_build'] 87 | 88 | # The reST default role (used for this markup: `text`) to use for all 89 | # documents. 90 | #default_role = None 91 | 92 | # If true, '()' will be appended to :func: etc. cross-reference text. 93 | #add_function_parentheses = True 94 | 95 | # If true, the current module name will be prepended to all description 96 | # unit titles (such as .. function::). 97 | #add_module_names = True 98 | 99 | # If true, sectionauthor and moduleauthor directives will be shown in the 100 | # output. They are ignored by default. 101 | #show_authors = False 102 | 103 | # The name of the Pygments (syntax highlighting) style to use. 104 | #pygments_style = 'sphinx' 105 | pygments_style = 'colorful' 106 | 107 | # A list of ignored prefixes for module index sorting. 108 | #modindex_common_prefix = [] 109 | 110 | # If true, keep warnings as "system message" paragraphs in the built documents. 111 | #keep_warnings = False 112 | 113 | # If true, `todo` and `todoList` produce output, else they produce nothing. 114 | todo_include_todos = False 115 | 116 | 117 | # -- Options for HTML output ---------------------------------------------- 118 | 119 | # The theme to use for HTML and HTML Help pages. See the documentation for 120 | # a list of builtin themes. 121 | #html_theme = 'alabaster' 122 | html_theme = 'haiku' 123 | 124 | # Theme options are theme-specific and customize the look and feel of a theme 125 | # further. For a list of options available for each theme, see the 126 | # documentation. 
127 | #html_theme_options = {} 128 | 129 | # Add any paths that contain custom themes here, relative to this directory. 130 | #html_theme_path = [] 131 | 132 | # The name for this set of Sphinx documents. If None, it defaults to 133 | # " v documentation". 134 | #html_title = None 135 | 136 | # A shorter title for the navigation bar. Default is the same as html_title. 137 | #html_short_title = None 138 | 139 | # The name of an image file (relative to this directory) to place at the top 140 | # of the sidebar. 141 | #html_logo = None 142 | 143 | # The name of an image file (within the static path) to use as favicon of the 144 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 145 | # pixels large. 146 | #html_favicon = None 147 | 148 | # Add any paths that contain custom static files (such as style sheets) here, 149 | # relative to this directory. They are copied after the builtin static files, 150 | # so a file named "default.css" will overwrite the builtin "default.css". 151 | html_static_path = ['_static'] 152 | 153 | # Add any extra paths that contain custom files (such as robots.txt or 154 | # .htaccess) here, relative to this directory. These files are copied 155 | # directly to the root of the documentation. 156 | #html_extra_path = [] 157 | 158 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 159 | # using the given strftime format. 160 | #html_last_updated_fmt = '%b %d, %Y' 161 | 162 | # If true, SmartyPants will be used to convert quotes and dashes to 163 | # typographically correct entities. 164 | #html_use_smartypants = True 165 | 166 | # Custom sidebar templates, maps document names to template names. 167 | #html_sidebars = {} 168 | 169 | # Additional templates that should be rendered to pages, maps page names to 170 | # template names. 171 | #html_additional_pages = {} 172 | 173 | # If false, no module index is generated. 
174 | #html_domain_indices = True 175 | 176 | # If false, no index is generated. 177 | #html_use_index = True 178 | 179 | # If true, the index is split into individual pages for each letter. 180 | #html_split_index = False 181 | 182 | # If true, links to the reST sources are added to the pages. 183 | #html_show_sourcelink = True 184 | 185 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 186 | #html_show_sphinx = True 187 | 188 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 189 | #html_show_copyright = True 190 | 191 | # If true, an OpenSearch description file will be output, and all pages will 192 | # contain a tag referring to it. The value of this option must be the 193 | # base URL from which the finished HTML is served. 194 | #html_use_opensearch = '' 195 | 196 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 197 | #html_file_suffix = None 198 | 199 | # Language to be used for generating the HTML full-text search index. 200 | # Sphinx supports the following languages: 201 | # 'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja' 202 | # 'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr' 203 | #html_search_language = 'en' 204 | 205 | # A dictionary with options for the search language support, empty by default. 206 | # Now only 'ja' uses this config value 207 | #html_search_options = {'type': 'default'} 208 | 209 | # The name of a javascript file (relative to the configuration directory) that 210 | # implements a search results scorer. If empty, the default will be used. 211 | #html_search_scorer = 'scorer.js' 212 | 213 | # Output file base name for HTML help builder. 214 | htmlhelp_basename = 'fastFMdoc' 215 | 216 | # -- Options for LaTeX output --------------------------------------------- 217 | 218 | latex_elements = { 219 | # The paper size ('letterpaper' or 'a4paper'). 220 | #'papersize': 'letterpaper', 221 | 222 | # The font size ('10pt', '11pt' or '12pt'). 
223 | #'pointsize': '10pt', 224 | 225 | # Additional stuff for the LaTeX preamble. 226 | #'preamble': '', 227 | 228 | # Latex figure (float) alignment 229 | #'figure_align': 'htbp', 230 | } 231 | 232 | # Grouping the document tree into LaTeX files. List of tuples 233 | # (source start file, target name, title, 234 | # author, documentclass [howto, manual, or own class]). 235 | latex_documents = [ 236 | (master_doc, 'fastFM.tex', u'fastFM Documentation', 237 | u'Immanuel Bayer', 'manual'), 238 | ] 239 | 240 | # The name of an image file (relative to this directory) to place at the top of 241 | # the title page. 242 | #latex_logo = None 243 | 244 | # For "manual" documents, if this is true, then toplevel headings are parts, 245 | # not chapters. 246 | #latex_use_parts = False 247 | 248 | # If true, show page references after internal links. 249 | #latex_show_pagerefs = False 250 | 251 | # If true, show URL addresses after external links. 252 | #latex_show_urls = False 253 | 254 | # Documents to append as an appendix to all manuals. 255 | #latex_appendices = [] 256 | 257 | # If false, no module index is generated. 258 | #latex_domain_indices = True 259 | 260 | 261 | # -- Options for manual page output --------------------------------------- 262 | 263 | # One entry per manual page. List of tuples 264 | # (source start file, name, description, authors, manual section). 265 | man_pages = [ 266 | (master_doc, 'fastfm', u'fastFM Documentation', 267 | [author], 1) 268 | ] 269 | 270 | # If true, show URL addresses after external links. 271 | #man_show_urls = False 272 | 273 | 274 | # -- Options for Texinfo output ------------------------------------------- 275 | 276 | # Grouping the document tree into Texinfo files. 
List of tuples 277 | # (source start file, target name, title, author, 278 | # dir menu entry, description, category) 279 | texinfo_documents = [ 280 | (master_doc, 'fastFM', u'fastFM Documentation', 281 | author, 'fastFM', 'One line description of project.', 282 | 'Miscellaneous'), 283 | ] 284 | 285 | # Documents to append as an appendix to all manuals. 286 | #texinfo_appendices = [] 287 | 288 | # If false, no module index is generated. 289 | #texinfo_domain_indices = True 290 | 291 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 292 | #texinfo_show_urls = 'footnote' 293 | 294 | # If true, do not generate a @detailmenu in the "Top" node's menu. 295 | #texinfo_no_detailmenu = False 296 | -------------------------------------------------------------------------------- /doc/dev_notes.txt: -------------------------------------------------------------------------------- 1 | update doc: 2 | ghp-import doc/_build/html/ -p -n 3 | 4 | -n Include a .nojekyll file in the branch. 5 | 6 | for new releases: 7 | 8 | - update version & release in `conf.py` 9 | - make sure doc is still correct `make doctest` 10 | 11 | run test server 12 | cd fastFM/doc/ 13 | python -m SimpleHTTPServer 14 | -------------------------------------------------------------------------------- /doc/guide.rst: -------------------------------------------------------------------------------- 1 | Guide 2 | ===== 3 | 4 | How to choose the right Solver. 5 | ------------------------------- 6 | 7 | This section explains the trade off between the three solvers available in fastFM. 8 | The following applies for both **classification** and **regression** tasks. 9 | 10 | .. testcode:: 11 | 12 | import fastFM.mcmc 13 | 14 | - (+) smallest number of hyper parameter 15 | - (+) automatic regularization 16 | - (-) predictions need to be calculated at training time 17 | 18 | `Note: The predict method of the mcmc model returns predictions based on only 19 | the last draw of the model parameters. 
This evaluation is fast 20 | but usually of low quality. Don't use mcmc if you need fast predictions!` 21 | 22 | .. testcode:: 23 | 24 | import fastFM.als 25 | 26 | - (+) fast predictions 27 | - (+) less hyper parameter then SGD 28 | - (-) regularization must be specified 29 | 30 | .. testcode:: 31 | 32 | import fastFM.sgd 33 | 34 | - (+) fast predictions 35 | - (+) can iterate over large datasets (split and iterate over junks using warm start) 36 | - (-) regularization must be specified 37 | - (-) highest number of hyper parameter (requires, `step_size`) 38 | 39 | 40 | Learning Curves 41 | --------------- 42 | 43 | Learning curves are an important tool to understand the model behavior and 44 | enable us to use techniques such as early stopping to avoid over fitting. We can 45 | `warm_start` every fastFM model which allows us to calculate custom statistics during 46 | the model fitting process efficiently. The following example uses `RMSE` and 47 | `R^2` to demonstrate how we can monitor model performance on train and test set 48 | efficiently. Please note that we can replace them with any metric we want. 49 | 50 | .. plot:: 51 | :include-source: 52 | 53 | from fastFM import als 54 | from fastFM.datasets import make_user_item_regression 55 | from sklearn.metrics import mean_squared_error, r2_score 56 | import numpy as np 57 | 58 | X, y, coef = make_user_item_regression(label_stdev=.4) 59 | from sklearn.model_selection import train_test_split 60 | X_train, X_test, y_train, y_test = train_test_split( 61 | X, y, test_size=0.33, random_state=42) 62 | 63 | n_iter = 20 64 | step_size = 1 65 | l2_reg_w = 0 66 | l2_reg_V = 0 67 | 68 | fm = als.FMRegression(n_iter=0, l2_reg_w=0.1, l2_reg_V=0.1, rank=4) 69 | # Allocates and initalizes the model parameter. 
70 | fm.fit(X_train, y_train) 71 | 72 | rmse_train = [] 73 | rmse_test = [] 74 | r2_score_train = [] 75 | r2_score_test = [] 76 | 77 | for i in range(1, n_iter): 78 | fm.fit(X_train, y_train, n_more_iter=step_size) 79 | y_pred = fm.predict(X_test) 80 | 81 | rmse_train.append(np.sqrt(mean_squared_error(fm.predict(X_train), y_train))) 82 | rmse_test.append(np.sqrt(mean_squared_error(fm.predict(X_test), y_test))) 83 | 84 | r2_score_train.append(r2_score(fm.predict(X_train), y_train)) 85 | r2_score_test.append(r2_score(fm.predict(X_test), y_test)) 86 | 87 | 88 | from matplotlib import pyplot as plt 89 | fig, axes = plt.subplots(ncols=2, figsize=(15, 4)) 90 | 91 | x = np.arange(1, n_iter) * step_size 92 | with plt.style.context('fivethirtyeight'): 93 | axes[0].plot(x, rmse_train, label='RMSE-train', color='r', ls="--") 94 | axes[0].plot(x, rmse_test, label='RMSE-test', color='r') 95 | axes[1].plot(x, r2_score_train, label='R^2-train', color='b', ls="--") 96 | axes[1].plot(x, r2_score_test, label='R^2-test', color='b') 97 | axes[0].set_ylabel('RMSE', color='r') 98 | axes[1].set_ylabel('R^2', color='b') 99 | axes[0].legend() 100 | axes[1].legend() 101 | 102 | Visualizing MCMC Traces 103 | ----------------------- 104 | 105 | Our MCMC implementation samples model and hyper parameter at every iteration 106 | and calculates a running mean of the predictions. MCMC traces are an important tool 107 | for evaluating convergence and mixing behavior MCMC chains. The following example 108 | demonstrates how to calculate statistics for predictions, hyper parameter and 109 | model parameter efficiently using the `warm_start` option. 110 | 111 | 112 | .. 
plot:: 113 | :include-source: 114 | 115 | import numpy as np 116 | from sklearn.metrics import mean_squared_error 117 | from sklearn.model_selection import train_test_split 118 | 119 | from fastFM.datasets import make_user_item_regression 120 | from fastFM import mcmc 121 | 122 | n_iter = 100 123 | step_size = 10 124 | seed = 123 125 | rank = 3 126 | 127 | X, y, coef = make_user_item_regression(label_stdev=.4) 128 | X_train, X_test, y_train, y_test = train_test_split( 129 | X, y, test_size=0.33) 130 | 131 | fm = mcmc.FMRegression(n_iter=0, rank=rank, random_state=seed) 132 | # Allocates and initalizes the model and hyper parameter. 133 | fm.fit_predict(X_train, y_train, X_test) 134 | 135 | rmse_test = [] 136 | rmse_new = [] 137 | hyper_param = np.zeros((n_iter -1, 3 + 2 * rank), dtype=np.float64) 138 | for nr, i in enumerate(range(1, n_iter)): 139 | fm.random_state = i * seed 140 | y_pred = fm.fit_predict(X_train, y_train, X_test, n_more_iter=step_size) 141 | rmse_test.append(np.sqrt(mean_squared_error(y_pred, y_test))) 142 | hyper_param[nr, :] = fm.hyper_param_ 143 | 144 | values = np.arange(1, n_iter) 145 | x = values * step_size 146 | burn_in = 5 147 | x = x[burn_in:] 148 | 149 | from matplotlib import pyplot as plt 150 | fig, axes = plt.subplots(nrows=2, ncols=2, sharex=True, figsize=(15, 8)) 151 | 152 | axes[0, 0].plot(x, rmse_test[burn_in:], label='test rmse', color="r") 153 | axes[0, 0].legend() 154 | axes[0, 1].plot(x, hyper_param[burn_in:,0], label='alpha', color="b") 155 | axes[0, 1].legend() 156 | axes[1, 0].plot(x, hyper_param[burn_in:,1], label='lambda_w', color="g") 157 | axes[1, 0].legend() 158 | axes[1, 1].plot(x, hyper_param[burn_in:,3], label='mu_w', color="g") 159 | axes[1, 1].legend() 160 | -------------------------------------------------------------------------------- /doc/index.rst: -------------------------------------------------------------------------------- 1 | .. 
fastFM documentation master file, created by 2 | sphinx-quickstart on Mon Jun 15 01:42:19 2015. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to fastFM's documentation! 7 | ================================== 8 | 9 | This is the documentation for fastFM's python interface. 10 | **Source code** and **install instructions** can be found on https://github.com/ibayer/fastFM. 11 | A short paper describing the library is available on arXiv http://arxiv.org/abs/1505.00641 12 | 13 | 14 | Supported Operating Systems 15 | --------------------------- 16 | fastFM has a continous integration / testing servers (Travis) for **Linux (Ubuntu 14.04 LTS)** 17 | and **OS X Mavericks**. Other OS are not actively supported. 18 | 19 | .. toctree:: 20 | :maxdepth: 3 21 | 22 | tutorial 23 | guide 24 | api 25 | 26 | -------------------------------------------------------------------------------- /doc/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | REM Command file for Sphinx documentation 4 | 5 | if "%SPHINXBUILD%" == "" ( 6 | set SPHINXBUILD=sphinx-build 7 | ) 8 | set BUILDDIR=_build 9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . 10 | set I18NSPHINXOPTS=%SPHINXOPTS% . 11 | if NOT "%PAPER%" == "" ( 12 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% 13 | set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% 14 | ) 15 | 16 | if "%1" == "" goto help 17 | 18 | if "%1" == "help" ( 19 | :help 20 | echo.Please use `make ^` where ^ is one of 21 | echo. html to make standalone HTML files 22 | echo. dirhtml to make HTML files named index.html in directories 23 | echo. singlehtml to make a single large HTML file 24 | echo. pickle to make pickle files 25 | echo. json to make JSON files 26 | echo. htmlhelp to make HTML files and a HTML help project 27 | echo. 
qthelp to make HTML files and a qthelp project 28 | echo. devhelp to make HTML files and a Devhelp project 29 | echo. epub to make an epub 30 | echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter 31 | echo. text to make text files 32 | echo. man to make manual pages 33 | echo. texinfo to make Texinfo files 34 | echo. gettext to make PO message catalogs 35 | echo. changes to make an overview over all changed/added/deprecated items 36 | echo. xml to make Docutils-native XML files 37 | echo. pseudoxml to make pseudoxml-XML files for display purposes 38 | echo. linkcheck to check all external links for integrity 39 | echo. doctest to run all doctests embedded in the documentation if enabled 40 | echo. coverage to run coverage check of the documentation if enabled 41 | goto end 42 | ) 43 | 44 | if "%1" == "clean" ( 45 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i 46 | del /q /s %BUILDDIR%\* 47 | goto end 48 | ) 49 | 50 | 51 | REM Check if sphinx-build is available and fallback to Python version if any 52 | %SPHINXBUILD% 2> nul 53 | if errorlevel 9009 goto sphinx_python 54 | goto sphinx_ok 55 | 56 | :sphinx_python 57 | 58 | set SPHINXBUILD=python -m sphinx.__init__ 59 | %SPHINXBUILD% 2> nul 60 | if errorlevel 9009 ( 61 | echo. 62 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 63 | echo.installed, then set the SPHINXBUILD environment variable to point 64 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 65 | echo.may add the Sphinx directory to PATH. 66 | echo. 67 | echo.If you don't have Sphinx installed, grab it from 68 | echo.http://sphinx-doc.org/ 69 | exit /b 1 70 | ) 71 | 72 | :sphinx_ok 73 | 74 | 75 | if "%1" == "html" ( 76 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html 77 | if errorlevel 1 exit /b 1 78 | echo. 79 | echo.Build finished. The HTML pages are in %BUILDDIR%/html. 
80 | goto end 81 | ) 82 | 83 | if "%1" == "dirhtml" ( 84 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml 85 | if errorlevel 1 exit /b 1 86 | echo. 87 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. 88 | goto end 89 | ) 90 | 91 | if "%1" == "singlehtml" ( 92 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml 93 | if errorlevel 1 exit /b 1 94 | echo. 95 | echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. 96 | goto end 97 | ) 98 | 99 | if "%1" == "pickle" ( 100 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle 101 | if errorlevel 1 exit /b 1 102 | echo. 103 | echo.Build finished; now you can process the pickle files. 104 | goto end 105 | ) 106 | 107 | if "%1" == "json" ( 108 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json 109 | if errorlevel 1 exit /b 1 110 | echo. 111 | echo.Build finished; now you can process the JSON files. 112 | goto end 113 | ) 114 | 115 | if "%1" == "htmlhelp" ( 116 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp 117 | if errorlevel 1 exit /b 1 118 | echo. 119 | echo.Build finished; now you can run HTML Help Workshop with the ^ 120 | .hhp project file in %BUILDDIR%/htmlhelp. 121 | goto end 122 | ) 123 | 124 | if "%1" == "qthelp" ( 125 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp 126 | if errorlevel 1 exit /b 1 127 | echo. 128 | echo.Build finished; now you can run "qcollectiongenerator" with the ^ 129 | .qhcp project file in %BUILDDIR%/qthelp, like this: 130 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\fastFM.qhcp 131 | echo.To view the help file: 132 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\fastFM.ghc 133 | goto end 134 | ) 135 | 136 | if "%1" == "devhelp" ( 137 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp 138 | if errorlevel 1 exit /b 1 139 | echo. 140 | echo.Build finished. 
141 | goto end 142 | ) 143 | 144 | if "%1" == "epub" ( 145 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub 146 | if errorlevel 1 exit /b 1 147 | echo. 148 | echo.Build finished. The epub file is in %BUILDDIR%/epub. 149 | goto end 150 | ) 151 | 152 | if "%1" == "latex" ( 153 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 154 | if errorlevel 1 exit /b 1 155 | echo. 156 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 157 | goto end 158 | ) 159 | 160 | if "%1" == "latexpdf" ( 161 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 162 | cd %BUILDDIR%/latex 163 | make all-pdf 164 | cd %~dp0 165 | echo. 166 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 167 | goto end 168 | ) 169 | 170 | if "%1" == "latexpdfja" ( 171 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 172 | cd %BUILDDIR%/latex 173 | make all-pdf-ja 174 | cd %~dp0 175 | echo. 176 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 177 | goto end 178 | ) 179 | 180 | if "%1" == "text" ( 181 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text 182 | if errorlevel 1 exit /b 1 183 | echo. 184 | echo.Build finished. The text files are in %BUILDDIR%/text. 185 | goto end 186 | ) 187 | 188 | if "%1" == "man" ( 189 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man 190 | if errorlevel 1 exit /b 1 191 | echo. 192 | echo.Build finished. The manual pages are in %BUILDDIR%/man. 193 | goto end 194 | ) 195 | 196 | if "%1" == "texinfo" ( 197 | %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo 198 | if errorlevel 1 exit /b 1 199 | echo. 200 | echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. 201 | goto end 202 | ) 203 | 204 | if "%1" == "gettext" ( 205 | %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale 206 | if errorlevel 1 exit /b 1 207 | echo. 208 | echo.Build finished. The message catalogs are in %BUILDDIR%/locale. 
209 | goto end 210 | ) 211 | 212 | if "%1" == "changes" ( 213 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes 214 | if errorlevel 1 exit /b 1 215 | echo. 216 | echo.The overview file is in %BUILDDIR%/changes. 217 | goto end 218 | ) 219 | 220 | if "%1" == "linkcheck" ( 221 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck 222 | if errorlevel 1 exit /b 1 223 | echo. 224 | echo.Link check complete; look for any errors in the above output ^ 225 | or in %BUILDDIR%/linkcheck/output.txt. 226 | goto end 227 | ) 228 | 229 | if "%1" == "doctest" ( 230 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest 231 | if errorlevel 1 exit /b 1 232 | echo. 233 | echo.Testing of doctests in the sources finished, look at the ^ 234 | results in %BUILDDIR%/doctest/output.txt. 235 | goto end 236 | ) 237 | 238 | if "%1" == "coverage" ( 239 | %SPHINXBUILD% -b coverage %ALLSPHINXOPTS% %BUILDDIR%/coverage 240 | if errorlevel 1 exit /b 1 241 | echo. 242 | echo.Testing of coverage in the sources finished, look at the ^ 243 | results in %BUILDDIR%/coverage/python.txt. 244 | goto end 245 | ) 246 | 247 | if "%1" == "xml" ( 248 | %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml 249 | if errorlevel 1 exit /b 1 250 | echo. 251 | echo.Build finished. The XML files are in %BUILDDIR%/xml. 252 | goto end 253 | ) 254 | 255 | if "%1" == "pseudoxml" ( 256 | %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml 257 | if errorlevel 1 exit /b 1 258 | echo. 259 | echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. 260 | goto end 261 | ) 262 | 263 | :end 264 | -------------------------------------------------------------------------------- /doc/tutorial.rst: -------------------------------------------------------------------------------- 1 | Tutorials 2 | ========= 3 | 4 | The following sections show how to use different features of the fastFM 5 | library. 
This is mostly a demonstration on of the library and no background 6 | on the Factorization Machine (FM) model is given. 7 | I recommend to read [TIST2012]. This paper contains many examples on how FM's 8 | can emulate and extend matrix factorization models through feature engineering. 9 | 10 | 11 | Regression with ALS Solver 12 | -------------------------- 13 | 14 | We first set up a small toy dataset for a regression problem. Please 15 | refere to [SIGIR2011] for background information on the implemented ALS solver. 16 | 17 | .. testcode:: 18 | 19 | from fastFM.datasets import make_user_item_regression 20 | from sklearn.model_selection import train_test_split 21 | 22 | # This sets up a small test dataset. 23 | X, y, _ = make_user_item_regression(label_stdev=.4) 24 | X_train, X_test, y_train, y_test = train_test_split(X, y) 25 | 26 | The number of iterations `n_iter`, the standard deviation `init_stdev` used to 27 | initialize the model parameter and the number of hidden variables `rank` per feature. 28 | This are the parameters that have to be specified for every solver and task. The ALS 29 | solver requires in addition the regularization values for the first `l2_reg_w` 30 | and second order `l2_reg_V` interactions. 31 | 32 | .. testcode:: 33 | 34 | from fastFM import als 35 | fm = als.FMRegression(n_iter=1000, init_stdev=0.1, rank=2, l2_reg_w=0.1, l2_reg_V=0.5) 36 | fm.fit(X_train, y_train) 37 | y_pred = fm.predict(X_test) 38 | 39 | We can easily evaluate our model using the scikit-learn library. 40 | 41 | .. testcode:: 42 | 43 | from sklearn.metrics import mean_squared_error 44 | 'mse:', mean_squared_error(y_test, y_pred) 45 | 46 | 47 | Logit Classification with SGD Solver 48 | ------------------------------------ 49 | 50 | We first have to convert the target of our toy dataset to -1/1 values 51 | in order to work with the classification implementation. Currently only 52 | binary classification is supported. 53 | 54 | .. 
testcode:: 55 | 56 | import numpy as np 57 | # Convert dataset to binary classification task. 58 | y_labels = np.ones_like(y) 59 | y_labels[y < np.mean(y)] = -1 60 | X_train, X_test, y_train, y_test = train_test_split(X, y_labels) 61 | 62 | 63 | We could have used the ALS solver module for this problem as well but 64 | we will use the SGD module instead. In addition to the 65 | hyper parameter needed for the ALS module we need to specify 66 | the SGD specific `step_size` parameter. 67 | 68 | .. testcode:: 69 | 70 | from fastFM import sgd 71 | fm = sgd.FMClassification(n_iter=1000, init_stdev=0.1, l2_reg_w=0, 72 | l2_reg_V=0, rank=2, step_size=0.1) 73 | fm.fit(X_train, y_train) 74 | y_pred = fm.predict(X_test) 75 | 76 | 77 | All classifier implementations can not only return the most likely labels 78 | but also class probabilities via the `predict_proba`. 79 | 80 | .. testcode:: 81 | 82 | y_pred_proba = fm.predict_proba(X_test) 83 | 84 | This is important for classification metrics such as the AUC score that require the class probabilities 85 | as input. 86 | 87 | .. testcode:: 88 | 89 | from sklearn.metrics import accuracy_score, roc_auc_score 90 | 'acc:', accuracy_score(y_test, y_pred) 91 | 'auc:', roc_auc_score(y_test, y_pred_proba) 92 | 93 | 94 | Bayesian Probit Classification with MCMC Solver 95 | ----------------------------------------------- 96 | 97 | The MCMC module needs fewer hyper parameter that any other solver. 98 | This solver is able to integrate out the regularization parameter and frees us 99 | from selecting them manually. Please see [Freuden2011] for the detail on the implemented 100 | Gibbs sampler. 101 | The major drawback of the MCMC solver is that it forces us to calculate predictions 102 | during fitting time using the `fit_predict` function. 103 | It's however possible to select a subset of parameter draws to speed up prediction [RecSys2013]. 
104 | It's also possible to just call `predict` on a trained MCMC model but this returns predictions 105 | that are solely based on the last parameters draw. 106 | These predictions can be used for diagnostic purposes but 107 | are usually not as good as averaged predictions returned by `fit_predict`. 108 | 109 | 110 | .. testcode:: 111 | 112 | from fastFM import mcmc 113 | fm = mcmc.FMClassification(n_iter=1000, rank=2, init_stdev=0.1) 114 | 115 | Our last example shows how to use the MCMC module for binary classification. 116 | Probit regression uses the Cumulative Distribution Function (CDF) of the standard normal Distribution 117 | as link function. Mainly because the CDF leads to an easier Gibbs solver then the 118 | sigmoid function used in the SGD classifier implementation. The results 119 | are in practice usually very similar. 120 | 121 | .. testcode:: 122 | 123 | y_pred = fm.fit_predict(X_train, y_train, X_test) 124 | y_pred_proba = fm.fit_predict_proba(X_train, y_train, X_test) 125 | 126 | 127 | .. testcode:: 128 | 129 | from sklearn.metrics import accuracy_score, roc_auc_score 130 | 'acc:', accuracy_score(y_test, y_pred) 131 | 'auc:', roc_auc_score(y_test, y_pred_proba) 132 | 133 | 134 | 135 | .. [TIST2012] Rendle, Steffen. "Factorization machines with libfm." ACM Transactions on Intelligent Systems and Technology (TIST) 3.3 (2012): 57. 136 | .. [SIGIR2011] Rendle, Steffen, et al. "Fast context-aware recommendations with factorization machines." Proceedings of the 34th international ACM SIGIR conference on Research and development in Information Retrieval. ACM, 2011. 137 | .. [Freuden2011] C Freudenthaler, L Schmidt-Thieme, S Rendle "Bayesian factorization machines" - 2011 - Citeseer 138 | .. 
[RecSys2013] Silbermann, Bayer, and Rendle "Sample selection for MCMC-based recommender systems" Proceedings of the 7th ACM conference on Recommender systems 2013 139 | -------------------------------------------------------------------------------- /examples/warm_start_als.py: -------------------------------------------------------------------------------- 1 | from fastFM.datasets import make_user_item_regression 2 | from fastFM import als 3 | from sklearn.metrics import mean_squared_error 4 | import scipy.sparse as sp 5 | import numpy as np 6 | 7 | if __name__ == "__main__": 8 | 9 | X, y, coef = make_user_item_regression(label_stdev=.4) 10 | from sklearn.cross_validation import train_test_split 11 | X_train, X_test, y_train, y_test = train_test_split( 12 | X, y, test_size=0.33, random_state=42) 13 | X_train = sp.csc_matrix(X_train) 14 | X_test = sp.csc_matrix(X_test) 15 | n_iter = 50 16 | 17 | """ 18 | offset = '../../fastFM-notes/benchmarks/' 19 | train_path = offset + "data/ml-100k/u1.base.libfm" 20 | test_path = offset + "data/ml-100k/u1.test.libfm" 21 | 22 | from sklearn.datasets import load_svmlight_file 23 | X_train, y_train = load_svmlight_file(train_path) 24 | X_test, y_test= load_svmlight_file(test_path) 25 | X_train = sp.csc_matrix(X_train) 26 | X_test = sp.csc_matrix(X_test) 27 | # add padding for features not in test 28 | X_test = sp.hstack([X_test, sp.csc_matrix((X_test.shape[0], X_train.shape[1] - X_test.shape[1]))]) 29 | """ 30 | 31 | n_iter = 50 32 | rank = 4 33 | seed = 333 34 | step_size = 1 35 | l2_reg_w = 0 36 | l2_reg_V = 0 37 | 38 | fm = als.FMRegression(n_iter=0, l2_reg_w=l2_reg_w, 39 | l2_reg_V=l2_reg_V, rank=rank, random_state=seed) 40 | # initalize coefs 41 | fm.fit(X_train, y_train) 42 | 43 | rmse_train = [] 44 | rmse_test = [] 45 | for i in range(1, n_iter): 46 | fm.fit(X_train, y_train, n_more_iter=step_size) 47 | y_pred = fm.predict(X_test) 48 | rmse_train.append(np.sqrt(mean_squared_error(fm.predict(X_train), y_train))) 49 | 
rmse_test.append(np.sqrt(mean_squared_error(fm.predict(X_test), y_test))) 50 | 51 | print '------- restart ----------' 52 | values = np.arange(1, n_iter) 53 | rmse_test_re = [] 54 | rmse_train_re = [] 55 | for i in values: 56 | fm = als.FMRegression(n_iter=i, l2_reg_w=l2_reg_w, 57 | l2_reg_V=l2_reg_V, rank=rank, random_state=seed) 58 | fm.fit(X_train, y_train) 59 | rmse_test_re.append(np.sqrt(mean_squared_error(fm.predict(X_test), y_test))) 60 | rmse_train_re.append(np.sqrt(mean_squared_error(fm.predict(X_train), y_train))) 61 | 62 | from matplotlib import pyplot as plt 63 | 64 | x = np.arange(1, n_iter) * step_size 65 | 66 | with plt.style.context('fivethirtyeight'): 67 | plt.plot(x, rmse_train, label='train') 68 | plt.plot(x, rmse_test, label='test') 69 | plt.plot(values, rmse_train_re, label='train re', linestyle='--') 70 | plt.plot(values, rmse_test_re, label='test re', ls='--') 71 | plt.legend() 72 | plt.show() 73 | -------------------------------------------------------------------------------- /examples/warm_start_mcmc.py: -------------------------------------------------------------------------------- 1 | from fastFM.datasets import make_user_item_regression 2 | from fastFM import mcmc 3 | from sklearn.metrics import mean_squared_error 4 | import scipy.sparse as sp 5 | import numpy as np 6 | 7 | 8 | if __name__ == "__main__": 9 | 10 | 11 | offset = '../../fastFM-notes/benchmarks/' 12 | train_path = offset + "data/ml-100k/u1.base.libfm" 13 | test_path = offset + "data/ml-100k/u1.test.libfm" 14 | 15 | from sklearn.datasets import load_svmlight_file 16 | X_train, y_train = load_svmlight_file(train_path) 17 | X_test, y_test= load_svmlight_file(test_path) 18 | X_train = sp.csc_matrix(X_train) 19 | X_test = sp.csc_matrix(X_test) 20 | # add padding for features not in test 21 | X_test = sp.hstack([X_test, sp.csc_matrix((X_test.shape[0], X_train.shape[1] - X_test.shape[1]))]) 22 | 23 | """ 24 | X_train = sp.csc_matrix(np.array([[6, 1], 25 | [2, 3], 26 | [3, 0], 27 
| [6, 1], 28 | [4, 5]]), dtype=np.float64) 29 | y_train = np.array([298, 266, 29, 298, 848], dtype=np.float64) 30 | X_test = X_train 31 | y_test = y_train 32 | """ 33 | 34 | n_iter = 50 35 | rank = 4 36 | seed = 333 37 | step_size = 1 38 | 39 | """ 40 | X, y, coef = make_user_item_regression(label_stdev=.4, random_state=seed) 41 | from sklearn.cross_validation import train_test_split 42 | X_train, X_test, y_train, y_test = train_test_split( 43 | X, y, test_size=0.33, random_state=seed) 44 | X_train = sp.csc_matrix(X_train) 45 | X_test = sp.csc_matrix(X_test) 46 | X_test = X_train 47 | y_test = y_train 48 | """ 49 | 50 | fm = mcmc.FMRegression(n_iter=0, rank=rank, random_state=seed) 51 | # initalize coefs 52 | fm.fit_predict(X_train, y_train, X_test) 53 | 54 | rmse_test = [] 55 | rmse_new = [] 56 | hyper_param = np.zeros((n_iter -1, 3 + 2 * rank), dtype=np.float64) 57 | for nr, i in enumerate(range(1, n_iter)): 58 | fm.random_state = i * seed 59 | y_pred = fm.fit_predict(X_train, y_train, X_test, n_more_iter=step_size) 60 | rmse_test.append(np.sqrt(mean_squared_error(y_pred, y_test))) 61 | hyper_param[nr, :] = fm.hyper_param_ 62 | 63 | print '------- restart ----------' 64 | values = np.arange(1, n_iter) 65 | rmse_test_re = [] 66 | hyper_param_re = np.zeros((len(values), 3 + 2 * rank), dtype=np.float64) 67 | for nr, i in enumerate(values): 68 | fm = mcmc.FMRegression(n_iter=i, rank=rank, random_state=seed) 69 | y_pred = fm.fit_predict(X_train, y_train, X_test) 70 | rmse_test_re.append(np.sqrt(mean_squared_error(y_pred, y_test))) 71 | hyper_param_re[nr, :] = fm.hyper_param_ 72 | 73 | from matplotlib import pyplot as plt 74 | fig, axes = plt.subplots(nrows=2, ncols=2, sharex=True, figsize=(15, 8)) 75 | 76 | x = values * step_size 77 | burn_in = 5 78 | x = x[burn_in:] 79 | 80 | #with plt.style.context('ggplot'): 81 | axes[0, 0].plot(x, rmse_test[burn_in:], label='test rmse', color="r") 82 | axes[0, 0].plot(values[burn_in:], rmse_test_re[burn_in:], ls="--", color="r") 
83 | axes[0, 0].legend() 84 | 85 | axes[0, 1].plot(x, hyper_param[burn_in:,0], label='alpha', color="b") 86 | axes[0, 1].plot(values[burn_in:], hyper_param_re[burn_in:,0], ls="--", color="b") 87 | axes[0, 1].legend() 88 | 89 | axes[1, 0].plot(x, hyper_param[burn_in:,1], label='lambda_w', color="g") 90 | #axes[2].plot(x, hyper_param[:,2], label='lambda_V', color="r") 91 | axes[1, 0].plot(values[burn_in:], hyper_param_re[burn_in:,1], ls="--", color="g") 92 | #axes[2].plot(values, hyper_param_re[:,2], label='lambda_V', ls="--", color="r") 93 | axes[1, 0].legend() 94 | 95 | axes[1, 1].plot(x, hyper_param[burn_in:,3], label='mu_w', color="g") 96 | #axes[3].plot(x, hyper_param[:,4], label='mu_V', color="r") 97 | axes[1, 1].plot(values[burn_in:], hyper_param_re[burn_in:,3], ls="--", color="g") 98 | #axes[3].plot(values, hyper_param_re[:,4], label='mu_V', ls="--", color="r") 99 | axes[1, 1].legend() 100 | 101 | plt.show() 102 | #plt.savefig("../../fastFM-notes/jmlr/figs/mcmc_trace.pdf", bbox_inches='tight') 103 | -------------------------------------------------------------------------------- /fastFM/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ibayer/fastFM/9f30c5564a8d365105876f4e5d751c46e57dc983/fastFM/__init__.py -------------------------------------------------------------------------------- /fastFM/als.py: -------------------------------------------------------------------------------- 1 | # Author: Immanuel Bayer 2 | # License: BSD 3 clause 3 | 4 | import ffm 5 | import numpy as np 6 | from sklearn.base import RegressorMixin 7 | from .validation import check_consistent_length, check_array 8 | from .base import (FactorizationMachine, BaseFMClassifier, 9 | _validate_class_labels, _check_warm_start) 10 | 11 | 12 | class FMRegression(FactorizationMachine, RegressorMixin): 13 | 14 | """ Factorization Machine Regression trained with a als (coordinate descent) 15 | solver. 
def fit(self, X_train, y_train, n_more_iter=0):
    """ Fit model with specified loss.

    Parameters
    ----------
    X_train : scipy.sparse.csc_matrix, (n_samples, n_features)
        Training design matrix.

    y_train : float | ndarray, shape = (n_samples, )
        Regression targets.

    n_more_iter : int
        Number of iterations to continue from the current Coefficients.
    """
    check_consistent_length(X_train, y_train)
    X_train = check_array(X_train, accept_sparse="csc", dtype=np.float64,
                          order="F")
    y_train = check_array(y_train, ensure_2d=False, dtype=np.float64)

    # n_iter is cumulative, the C solver always receives the total budget
    self.n_iter += n_more_iter

    # a continuation run requires previously fitted coefficients
    if n_more_iter > 0:
        _check_warm_start(self, X_train)
        self.warm_start = True

    self.w0_, self.w_, self.V_ = ffm.ffm_als_fit(self, X_train, y_train)

    # keep track of the total number of iterations spent so far
    self.iter_count = (self.iter_count + n_more_iter
                       if self.iter_count != 0 else self.n_iter)

    # reset to default setting
    self.warm_start = False
    return self
147 | """ 148 | def __init__(self, n_iter=100, init_stdev=0.1, rank=8, random_state=123, 149 | l2_reg_w=0.1, l2_reg_V=0.1, l2_reg=None): 150 | super(FMClassification, self).__init__(n_iter=n_iter, 151 | init_stdev=init_stdev, 152 | rank=rank, 153 | random_state=random_state) 154 | if (l2_reg is not None): 155 | self.l2_reg_V = l2_reg 156 | self.l2_reg_w = l2_reg 157 | else: 158 | self.l2_reg_w = l2_reg_w 159 | self.l2_reg_V = l2_reg_V 160 | self.l2_reg = l2_reg 161 | self.task = "classification" 162 | 163 | def fit(self, X_train, y_train, n_more_iter=0): 164 | """ Fit model with specified loss. 165 | 166 | Parameters 167 | ---------- 168 | X : scipy.sparse.csc_matrix, (n_samples, n_features) 169 | 170 | y : float | ndarray, shape = (n_samples, ) 171 | the targets have to be encodes as {-1, 1}. 172 | 173 | n_more_iter : int 174 | Number of iterations to continue from the current Coefficients. 175 | """ 176 | check_consistent_length(X_train, y_train) 177 | X_train = check_array(X_train, accept_sparse="csc", dtype=np.float64, 178 | order="F") 179 | y_train = _validate_class_labels(y_train) 180 | 181 | self.classes_ = np.unique(y_train) 182 | if len(self.classes_) != 2: 183 | raise ValueError("This solver only supports binary classification" 184 | " but the data contains" 185 | " class: %r" % self.classes_) 186 | 187 | # fastFM-core expects labels to be in {-1,1} 188 | y_train = y_train.copy() 189 | i_class1 = (y_train == self.classes_[0]) 190 | y_train[i_class1] = -1 191 | y_train[~i_class1] = 1 192 | 193 | self.n_iter = self.n_iter + n_more_iter 194 | 195 | if n_more_iter > 0: 196 | _check_warm_start(self, X_train) 197 | self.warm_start = True 198 | 199 | self.w0_, self.w_, self.V_ = ffm.ffm_als_fit(self, X_train, y_train) 200 | 201 | if self.iter_count != 0: 202 | self.iter_count = self.iter_count + n_more_iter 203 | else: 204 | self.iter_count = self.n_iter 205 | 206 | # reset to default setting 207 | self.warm_start = False 208 | return self 209 | 
class FactorizationMachine(BaseEstimator):

    """ Factorization Machine trained with MCMC (Gibbs) sampling.
    The predictions need to be calculated at training time since the
    individual parameter samples are too expensive to store.

    Parameters
    ----------
    n_iter : int, optional
        The number of samples for the MCMC sampler, number or iterations over
        the training set for ALS and number of steps for SGD.

    init_stdev: float, optional
        Sets the stdev for the initialization of the parameter

    random_state: int, optional
        The seed of the pseudo random number generator that
        initializes the parameters and mcmc chain.

    rank: int
        The rank of the factorization used for the second order interactions.

    copy_X : boolean, optional, default True
        If ``True``, X will be copied; else, it may be overwritten.

    Attributes
    ----------
    Attention: these Coefficients are the last sample from the MCMC chain
    and can't be used to calculate predictions.

    w0_ : float
        bias term

    w_ : float | array, shape = (n_features)
        Coefficients for linear combination.

    V_ : float | array, shape = (rank_pair, n_features)
        Coefficients of second order factor matrix.
    """
    def __init__(self, n_iter=100, init_stdev=0.1, rank=8, random_state=123,
                 copy_X=True):
        # hyper parameters exposed via sklearn's get_params()/set_params()
        self.n_iter = n_iter
        self.random_state = random_state
        self.init_stdev = init_stdev
        self.rank = rank
        # warm start bookkeeping: total number of iterations run so far
        self.iter_count = 0
        self.warm_start = False
        # solver flags: skip fitting the bias / the linear part when set
        self.ignore_w_0 = False
        self.ignore_w = False
        # default L2 penalties; subclasses that take l2_reg_* constructor
        # arguments overwrite these values
        self.l2_reg_w = 0.1
        self.l2_reg_V = 0.1
        # fixed step size used by the SGD based solvers (overwritten there)
        self.step_size = 0
        # NOTE(review): copy_X is stored but never read in this class —
        # confirm whether the solvers honour it.
        self.copy_X = copy_X

    def predict(self, X_test):
        """ Return predictions

        Parameters
        ----------
        X : scipy.sparse.csc_matrix, (n_samples, n_features)

        Returns
        ------

        T : array, shape (n_samples)
            The labels are returned for classification.
        """
        # the C code requires fortran ordered float64 CSC input
        X_test = check_array(X_test, accept_sparse="csc", dtype=np.float64,
                             order="F")
        assert sp.isspmatrix_csc(X_test)
        # the model must have been fitted on the same number of features
        assert X_test.shape[1] == len(self.w_)
        return ffm.ffm_predict(self.w0_, self.w_, self.V_, X_test)
class FMRecommender(FactorizationMachine):

    """ Factorization Machine Recommender with pairwise (BPR) loss solver.

    Parameters
    ----------
    n_iter : int, optional
        The number of iterations over individual samples.

    init_stdev: float, optional
        Sets the stdev for the initialization of the parameter

    random_state: int, optional
        The seed of the pseudo random number generator that
        initializes the parameters and mcmc chain.

    rank: int
        The rank of the factorization used for the second order interactions.

    l2_reg_w : float
        L2 penalty weight for linear coefficients.

    l2_reg_V : float
        L2 penalty weight for pairwise coefficients.

    l2_reg : float
        L2 penalty weight for all coefficients (default=0).

    step_size : float
        Stepsize for the SGD solver, the solver uses a fixed step size and
        might require a tunning of the number of iterations `n_iter`.

    Attributes
    ----------

    w0_ : float
        bias term

    w_ : float | array, shape = (n_features)
        Coefficients for linear combination.

    V_ : float | array, shape = (rank_pair, n_features)
        Coefficients of second order factor matrix.
    """

    def __init__(self, n_iter=100, init_stdev=0.1, rank=8, random_state=123,
                 l2_reg_w=0.1, l2_reg_V=0.1, l2_reg=0, step_size=0.1):
        super(FMRecommender, self).\
            __init__(n_iter=n_iter, init_stdev=init_stdev, rank=rank,
                     random_state=random_state)
        # a non-zero l2_reg overrides the individual penalty weights
        if (l2_reg != 0):
            self.l2_reg_V = l2_reg
            self.l2_reg_w = l2_reg
        else:
            self.l2_reg_w = l2_reg_w
            self.l2_reg_V = l2_reg_V
        # store the constructor argument unchanged so that sklearn's
        # get_params()/clone() can recover it (was missing; als and sgd
        # store it, its absence makes get_params raise AttributeError)
        self.l2_reg = l2_reg
        self.step_size = step_size
        self.task = "ranking"

    def fit(self, X, pairs):
        """ Fit model with specified loss.

        Parameters
        ----------
        X : scipy.sparse.csc_matrix, (n_samples, n_features)

        pairs : ndarray, shape = (n_compares, 2)
            Each row `i` defines a pair of sample indices such that
            the first returns a higher value than the second:
            FM(X[pairs[i, 0]]) > FM(X[pairs[i, 1]]).
        """
        # The sgd solver expects a transposed design matrix in column major
        # order (csc_matrix) so individual samples can be accessed
        # column-wise.
        X = X.T
        X = check_array(X, accept_sparse="csc", dtype=np.float64)
        assert_all_finite(pairs)

        pairs = pairs.astype(np.float64)

        # check that pairs contain no real values
        assert_array_equal(pairs, pairs.astype(np.int32))
        # valid sample indices are 0 .. n_samples - 1; after the transpose
        # above n_samples == X.shape[1]. The previous `<=` bound was
        # off-by-one and accepted an out-of-bounds index equal to
        # n_samples.
        assert pairs.max() < X.shape[1]
        assert pairs.min() >= 0
        self.w0_, self.w_, self.V_ = ffm.ffm_fit_sgd_bpr(self, X, pairs)
        return self
def make_user_item_regression(random_state=123, n_user=20, n_item=20,
                              label_stdev=0.4, rank=2, bias=True,
                              first_order=True, stdev_w0=.2, stdev_w=0.3,
                              stdev_V=0.4, mean_w0=2, mean_w=5, mean_V=10):
    """Create a synthetic regression problem over a dense user/item grid.

    Every (user, item) combination yields one sample whose design row
    one-hot encodes the user (first ``n_user`` columns) and the item
    (remaining ``n_item`` columns). Targets are FM predictions under
    randomly drawn parameters, optionally perturbed by Gaussian noise.

    Returns
    -------
    X : scipy.sparse.csc_matrix, (n_user * n_item, n_user + n_item)
    y : ndarray, shape (n_user * n_item, )
    coef : tuple (w0, w, V)
        The ground-truth model parameters.
    """
    n_features = n_user + n_item
    n_samples = n_user * n_item

    # two non-zero entries per sample row: its user and its item column
    user_cols = np.repeat(np.arange(n_user), n_item)
    item_cols = n_user + np.tile(np.arange(n_item), n_user)
    col_idx = np.hstack((user_cols, item_cols))
    row_idx = np.tile(np.arange(n_samples), 2)
    ones = np.ones_like(col_idx, dtype=np.float64)

    X = sp.csc_matrix(sp.coo_matrix((ones, (row_idx, col_idx))))
    assert X.shape == (n_samples, n_features)

    # draw the ground-truth model parameters
    rng = check_random_state(random_state)
    w0 = rng.normal(mean_w0, stdev_w0)
    w = rng.normal(mean_w, stdev_w, n_features)
    V = rng.normal(mean_V, stdev_V, (rank, n_features))

    y = ffm_predict(w0, w, V, X)
    if label_stdev > 0:
        y = rng.normal(y, label_stdev)

    return X, y, (w0, w, V)
# Create a CsMatrix object and return it as a capsule.
# The CSparse struct only *borrows* the pointers of X's indptr/indices/data
# arrays — no reference to X is stored, so the caller must keep X (and those
# arrays) alive for the lifetime of the capsule.
# NOTE(review): the `<...>` pointer casts (e.g. around malloc here and around
# the free'd pointer in del_CsMatrix) appear to have been stripped when this
# file was extracted — confirm against the repository before building.
def CsMatrix(X not None):
    cdef cffm.cs_di *p
    p = malloc(sizeof(cffm.cs_di))
    if p == NULL:
        # NOTE(review): message says "Point" — looks like a copy/paste
        # artefact from an example.
        raise MemoryError("No memory to make a Point")

    cdef int i  # (unused)
    # typed views guarantee c-contiguous int32/float64 buffers
    cdef np.ndarray[int, ndim=1, mode = 'c'] indptr = X.indptr
    cdef np.ndarray[int, ndim=1, mode = 'c'] indices = X.indices
    cdef np.ndarray[double, ndim=1, mode = 'c'] data = X.data

    # Put the scipy data into the CSparse struct. This is just copying some
    # pointers.
    p.nzmax = X.data.shape[0]
    p.m = X.shape[0]
    p.n = X.shape[1]
    p.p = &indptr[0]
    p.i = &indices[0]
    p.x = &data[0]
    p.nz = -1 # to indicate CSC format
    return PyCapsule_New(p, "CsMatrix",
                         del_CsMatrix)
def ffm_als_fit(fm, X, double[:] y):
    """ Run the ALS (coordinate descent) solver from fastFM-core.

    Hyper parameters are read from the estimator object ``fm``; ``X`` is a
    scipy CSC design matrix of shape (n_samples, n_features) and ``y`` the
    target vector. Returns the fitted coefficients (w_0, w, V).
    """
    assert X.shape[0] == len(y) # test shapes
    n_features = X.shape[1]
    # wrap the design matrix and the hyper parameters for the C code.
    # NOTE(review): the `<...>` pointer casts around the
    # PyCapsule_GetPointer calls appear to have been lost when this file
    # was extracted — confirm against the repository.
    X_ = CsMatrix(X)
    pt_X = PyCapsule_GetPointer(X_, "CsMatrix")
    param = FFMParam(fm)
    pt_param = PyCapsule_GetPointer(param, "FFMParam")
    cdef double w_0
    cdef np.ndarray[np.float64_t, ndim=1, mode='c'] w
    cdef np.ndarray[np.float64_t, ndim=2, mode='c'] V

    if fm.warm_start:
        # continue from the estimator's current coefficients; ignored or
        # rank-0 parts are replaced by zero buffers of the right size
        w_0 = 0 if fm.ignore_w_0 else fm.w0_
        w = np.zeros(n_features, dtype=np.float64) if fm.ignore_w else fm.w_
        V = np.zeros((fm.rank, n_features), dtype=np.float64)\
            if fm.rank == 0 else fm.V_
    else:
        # cold start from all-zero coefficients
        w_0 = 0
        w = np.zeros(n_features, dtype=np.float64)
        V = np.zeros((fm.rank, n_features), dtype=np.float64)

    # the C solver writes the results into w_0, w and V in place
    cffm.ffm_als_fit(&w_0, w.data, V.data,
                     pt_X, &y[0], pt_param)
    return w_0, w, V
122 | """ 123 | assert X.shape[1] == len(y) # test shapes 124 | n_features = X.shape[0] 125 | X_ = CsMatrix(X) 126 | pt_X = PyCapsule_GetPointer(X_, "CsMatrix") 127 | param = FFMParam(fm) 128 | pt_param = PyCapsule_GetPointer(param, "FFMParam") 129 | 130 | # allocate the coefs 131 | cdef double w_0 = 0 132 | cdef np.ndarray[np.float64_t, ndim=1, mode='c'] w =\ 133 | np.zeros(n_features, dtype=np.float64) 134 | cdef np.ndarray[np.float64_t, ndim=2, mode='c'] V =\ 135 | np.zeros((fm.rank, n_features), dtype=np.float64) 136 | 137 | cffm.ffm_sgd_fit(&w_0, w.data, V.data, 138 | pt_X, &y[0], pt_param) 139 | return w_0, w, V 140 | 141 | 142 | def ffm_fit_sgd_bpr(fm, X, np.ndarray[np.float64_t, ndim=2, mode='c'] pairs): 143 | n_features = X.shape[0] 144 | X_ = CsMatrix(X) 145 | pt_X = PyCapsule_GetPointer(X_, "CsMatrix") 146 | param = FFMParam(fm) 147 | pt_param = PyCapsule_GetPointer(param, "FFMParam") 148 | 149 | #allocate the coefs 150 | cdef double w_0 = 0 151 | cdef np.ndarray[np.float64_t, ndim=1, mode='c'] w =\ 152 | np.zeros(n_features, dtype=np.float64) 153 | cdef np.ndarray[np.float64_t, ndim=2, mode='c'] V =\ 154 | np.zeros((fm.rank, n_features), dtype=np.float64) 155 | 156 | cffm.ffm_sgd_bpr_fit(&w_0, w.data, V.data, 157 | pt_X, pairs.data, pairs.shape[0], pt_param) 158 | return w_0, w, V 159 | 160 | 161 | def ffm_mcmc_fit_predict(fm, X_train, X_test, double[:] y): 162 | assert X_train.shape[0] == len(y) 163 | assert X_train.shape[1] == X_test.shape[1] 164 | n_features = X_train.shape[1] 165 | param = FFMParam(fm) 166 | pt_param = PyCapsule_GetPointer(param, "FFMParam") 167 | X_train_ = CsMatrix(X_train) 168 | pt_X_train = PyCapsule_GetPointer(X_train_, "CsMatrix") 169 | X_test_ = CsMatrix(X_test) 170 | pt_X_test = PyCapsule_GetPointer(X_test_, "CsMatrix") 171 | 172 | cdef double w_0 173 | cdef np.ndarray[np.float64_t, ndim=1, mode='c'] w 174 | cdef np.ndarray[np.float64_t, ndim=2, mode='c'] V 175 | # allocate the results vector 176 | cdef 
def find_init_stdev(fm, X_train, y_train, X_vali=None, y_vali=None,
                    stdev_range=None):
    """ Search ``stdev_range`` for the ``init_stdev`` with the lowest MSE.

    Each candidate stdev is evaluated by refitting ``fm`` from scratch and
    measuring the mean squared error — on the validation set if one is
    given, otherwise on the training set itself.

    Parameters
    ----------
    fm : mcmc.FMRegression
        The estimator to tune; its ``init_stdev`` attribute is modified.
    X_train : scipy.sparse.csc_matrix, (n_samples, n_features)
    y_train : array, shape (n_samples)
    X_vali : scipy.sparse.csc_matrix, optional
    y_vali : array, optional
        Required whenever ``X_vali`` is given.
    stdev_range : list of float, optional
        Candidate values; defaults to a coarse grid.

    Returns
    -------
    (best_init_stdev, best_mse) : tuple
    """
    if not stdev_range:
        # coarse default grid (a duplicated 0.1 entry used to waste one
        # full refit without changing the result)
        stdev_range = [0.1, 0.2, 0.5, 1.0]

    if not isinstance(fm, FMRegression):
        raise Exception("only implemented for FMRegression")

    # fail early with a clear message instead of crashing inside
    # mean_squared_error(None, ...) further down
    if X_vali is not None and y_vali is None:
        raise ValueError("y_vali is required when X_vali is given")

    # just using a dummy here; its predictions are ignored
    if X_vali is None:
        X_test = X_train[:2, :]
    else:
        X_test = X_vali

    best_init_stdev = 0
    best_mse = np.finfo(np.float64).max
    for init_stdev in stdev_range:
        fm.init_stdev = init_stdev
        y_pred_vali = fm.fit_predict(X_train, y_train, X_test)
        if X_vali is None:
            # no validation data: score on the training set
            y_pred = fm.predict(X_train)
            mse = mean_squared_error(y_pred, y_train)
        else:
            mse = mean_squared_error(y_pred_vali, y_vali)
        if mse < best_mse:
            best_mse = mse
            best_init_stdev = init_stdev
    return best_init_stdev, best_mse
def fit_predict(self, X_train, y_train, X_test, n_more_iter=0):
    """Return average of posterior estimates of the test samples.

    The MCMC sampler evaluates the test predictions while sampling and
    averages them, since the individual parameter samples are too
    expensive to store (see FactorizationMachine).

    Parameters
    ----------
    X_train : scipy.sparse.csc_matrix, (n_samples, n_features)

    y_train : array, shape (n_samples)

    X_test : scipy.sparse.csc_matrix, (n_test_samples, n_features)

    n_more_iter : int
        Number of iterations to continue from the current Coefficients.

    Returns
    -------
    T : array, shape (n_test_samples)
    """
    self.task = "regression"
    X_train, y_train, X_test = _validate_mcmc_fit_input(X_train, y_train,
                                                        X_test)

    # n_iter is cumulative over warm started calls
    self.n_iter = self.n_iter + n_more_iter

    if n_more_iter > 0:
        # continue the chain: requires coefficients, the running
        # prediction average and the hyper parameters of a previous call
        _check_warm_start(self, X_train)
        assert self.prediction_.shape[0] == X_test.shape[0]
        assert self.hyper_param_.shape
        self.warm_start = True
    else:
        self.iter_count = 0

    coef, y_pred = ffm.ffm_mcmc_fit_predict(self, X_train,
                                            X_test, y_train)
    self.w0_, self.w_, self.V_ = coef
    # keep the running prediction average so the chain can be continued
    self.prediction_ = y_pred
    self.warm_start = False

    # keep track of the total number of iterations spent so far
    if self.iter_count != 0:
        self.iter_count = self.iter_count + n_more_iter
    else:
        self.iter_count = self.n_iter

    return y_pred
def fit_predict(self, X_train, y_train, X_test):
    """Return predicted class labels of the test samples, derived from
    the averaged posterior class probabilities.
    Use only with MCMC!

    Parameters
    ----------
    X_train : scipy.sparse.csc_matrix, (n_samples, n_features)

    y_train : array, shape (n_samples)
        Binary class labels (mapped internally to {-1, 1}).

    X_test : scipy.sparse.csc_matrix, (n_test_samples, n_features)

    Returns
    -------
    y_pred : array, shape (n_test_samples)
        Returns predicted class labels.
    """
    proba = self.fit_predict_proba(X_train, y_train, X_test)
    # threshold the posterior mean probability at 0.5 to pick the label
    labels = np.full_like(proba, self.classes_[0], dtype=np.float64)
    labels[proba > .5] = self.classes_[1]
    return labels
203 | 204 | X_test : scipy.sparse.csc_matrix, (n_test_samples, n_features) 205 | 206 | Returns 207 | ------- 208 | y_pred : array, shape (n_test_samples) 209 | Returns probability estimates for the class with lowest 210 | classification label. 211 | 212 | """ 213 | self.task = "classification" 214 | 215 | self.classes_ = np.unique(y_train) 216 | if len(self.classes_) != 2: 217 | raise ValueError("This solver only supports binary classification" 218 | " but the data contains" 219 | " class: %r" % self.classes_) 220 | 221 | # fastFM-core expects labels to be in {-1,1} 222 | y_train = y_train.copy() 223 | i_class1 = (y_train == self.classes_[0]) 224 | y_train[i_class1] = -1 225 | y_train[~i_class1] = 1 226 | 227 | X_train, y_train, X_test = _validate_mcmc_fit_input(X_train, y_train, 228 | X_test) 229 | y_train = _validate_class_labels(y_train) 230 | 231 | coef, y_pred = ffm.ffm_mcmc_fit_predict(self, X_train, 232 | X_test, y_train) 233 | self.w0_, self.w_, self.V_ = coef 234 | return y_pred 235 | -------------------------------------------------------------------------------- /fastFM/sgd.py: -------------------------------------------------------------------------------- 1 | # Author: Immanuel Bayer 2 | # License: BSD 3 clause 3 | 4 | 5 | import ffm 6 | import numpy as np 7 | from sklearn.base import RegressorMixin 8 | from .validation import check_array, check_consistent_length 9 | from .base import (FactorizationMachine, BaseFMClassifier, 10 | _validate_class_labels) 11 | 12 | 13 | class FMRegression(FactorizationMachine, RegressorMixin): 14 | 15 | """ Factorization Machine Regression trained with a stochastic gradient 16 | descent solver. 17 | 18 | Parameters 19 | ---------- 20 | n_iter : int, optional 21 | The number of interations of individual samples . 
might require tuning of the number of iterations `n_iter`.
init_stdev: float, optional
141 | """ 142 | 143 | def __init__(self, n_iter=100, init_stdev=0.1, rank=8, random_state=123, 144 | l2_reg_w=0, l2_reg_V=0, l2_reg=None, step_size=0.1): 145 | super(FMClassification, self).\ 146 | __init__(n_iter=n_iter, init_stdev=init_stdev, rank=rank, 147 | random_state=random_state) 148 | if (l2_reg is not None): 149 | self.l2_reg_V = l2_reg 150 | self.l2_reg_w = l2_reg 151 | else: 152 | self.l2_reg_w = l2_reg_w 153 | self.l2_reg_V = l2_reg_V 154 | self.l2_reg = l2_reg 155 | self.step_size = step_size 156 | self.task = "classification" 157 | 158 | def fit(self, X, y): 159 | """ Fit model with specified loss. 160 | 161 | Parameters 162 | ---------- 163 | X : scipy.sparse.csc_matrix, (n_samples, n_features) 164 | 165 | y : float | ndarray, shape = (n_samples, ) 166 | 167 | the targets have to be encodes as {-1, 1}. 168 | """ 169 | y = _validate_class_labels(y) 170 | self.classes_ = np.unique(y) 171 | if len(self.classes_) != 2: 172 | raise ValueError("This solver only supports binary classification" 173 | " but the data contains" 174 | " class: %r" % self.classes_) 175 | 176 | # fastFM-core expects labels to be in {-1,1} 177 | y_train = y.copy() 178 | i_class1 = (y_train == self.classes_[0]) 179 | y_train[i_class1] = -1 180 | y_train[~i_class1] = 1 181 | 182 | check_consistent_length(X, y) 183 | y = y.astype(np.float64) 184 | 185 | # The sgd solver expects a transposed design matrix in column major 186 | # order (csc_matrix). 
187 | X = X.T # creates a copy 188 | X = check_array(X, accept_sparse="csc", dtype=np.float64) 189 | 190 | self.w0_, self.w_, self.V_ = ffm.ffm_sgd_fit(self, X, y) 191 | return self 192 | -------------------------------------------------------------------------------- /fastFM/tests/test_als.py: -------------------------------------------------------------------------------- 1 | # Author: Immanuel Bayer 2 | # License: BSD 3 clause 3 | 4 | import numpy as np 5 | import scipy.sparse as sp 6 | from sklearn import metrics 7 | from fastFM import als 8 | from fastFM.datasets import make_user_item_regression 9 | from sklearn.metrics import mean_squared_error 10 | from numpy.testing import assert_almost_equal 11 | 12 | 13 | def get_test_problem(task='regression'): 14 | X = sp.csc_matrix(np.array([[6, 1], 15 | [2, 3], 16 | [3, 0], 17 | [6, 1], 18 | [4, 5]]), dtype=np.float64) 19 | y = np.array([298, 266, 29, 298, 848], dtype=np.float64) 20 | V = np.array([[6, 0], 21 | [5, 8]], dtype=np.float64) 22 | w = np.array([9, 2], dtype=np.float64) 23 | w0 = 2 24 | if task == 'classification': 25 | y_labels = np.ones_like(y) 26 | y_labels[y < np.median(y)] = -1 27 | y = y_labels 28 | return w0, w, V, y, X 29 | 30 | 31 | def get_small_data(): 32 | X = sp.csc_matrix(np.array([[1, 2], 33 | [3, 4], 34 | [5, 6]]), dtype=np.float64) 35 | y = np.array([600, 2800, 10000], dtype=np.float64) 36 | return X, y 37 | 38 | 39 | def _test_fm_regression_only_w0(): 40 | X, y = get_small_data() 41 | 42 | fm = als.FMRegression(n_iter=0, l2_reg_w=0, l2_reg_V=0, rank=0) 43 | fm.ignore_w = True 44 | fm.w0_ = 2 45 | fm.fit(X, y, warm_start=True) 46 | assert_almost_equal(fm.w0_, 2, 6) 47 | 48 | fm = als.FMRegression(n_iter=1, l2_reg_w=0, l2_reg_V=0, rank=0) 49 | fm.ignore_w = True 50 | fm.w0_ = 2 51 | fm.fit(X, y, warm_start=True) 52 | assert_almost_equal(fm.w0_, 4466.6666666666661, 6) 53 | 54 | 55 | def _test_raise_when_input_is_dense(): 56 | fm = als.FMRegression(n_iter=0, l2_reg_w=0, l2_reg_V=0, rank=0) 57 
| X = np.arange(3, 4, dtype=np.float64) 58 | y = np.arange(3, dtype=np.float64) 59 | fm.fit(X, y, warm_start=True) 60 | 61 | 62 | def test_fm_linear_regression(): 63 | X, y = get_small_data() 64 | 65 | fm = als.FMRegression(n_iter=1, l2_reg_w=1, l2_reg_V=1, rank=0) 66 | fm.fit(X, y) 67 | 68 | 69 | def test_fm_regression(): 70 | w0, w, V, y, X = get_test_problem() 71 | 72 | fm = als.FMRegression(n_iter=1000, l2_reg_w=0, l2_reg_V=0, rank=2) 73 | fm.fit(X, y) 74 | y_pred = fm.predict(X) 75 | assert_almost_equal(y_pred, y, 3) 76 | # check different size 77 | fm = als.FMRegression(n_iter=1000, l2_reg_w=0, l2_reg_V=0, rank=5) 78 | X_big = sp.hstack([X, X]) 79 | fm.fit(X_big, y) 80 | y_pred = fm.predict(X_big[:2, ]) 81 | 82 | 83 | def test_fm_classification(): 84 | w0, w, V, y, X = get_test_problem(task='classification') 85 | 86 | fm = als.FMClassification(n_iter=1000, 87 | init_stdev=0.1, l2_reg_w=0, l2_reg_V=0, rank=2) 88 | fm.fit(X, y) 89 | y_pred = fm.predict(X) 90 | print(y_pred) 91 | assert metrics.accuracy_score(y, y_pred) > 0.95 92 | # check different size 93 | fm.fit(X[:2, ], y[:2]) 94 | 95 | 96 | def test_als_warm_start(): 97 | X, y, coef = make_user_item_regression(label_stdev=0) 98 | from sklearn.model_selection import train_test_split 99 | X_train, X_test, y_train, y_test = train_test_split( 100 | X, y, test_size=0.33, random_state=42) 101 | X_train = sp.csc_matrix(X_train) 102 | X_test = sp.csc_matrix(X_test) 103 | 104 | fm = als.FMRegression(n_iter=10, l2_reg_w=0, l2_reg_V=0, rank=2) 105 | fm.fit(X_train, y_train) 106 | y_pred = fm.predict(X_test) 107 | error_10_iter = mean_squared_error(y_pred, y_test) 108 | 109 | fm = als.FMRegression(n_iter=5, l2_reg_w=0, l2_reg_V=0, rank=2) 110 | fm.fit(X_train, y_train) 111 | print(fm.iter_count) 112 | y_pred = fm.predict(X_test) 113 | error_5_iter = mean_squared_error(y_pred, y_test) 114 | 115 | fm.fit(sp.csc_matrix(X_train), y_train, n_more_iter=5) 116 | print(fm.iter_count) 117 | y_pred = fm.predict(X_test) 118 | 
# initialize coefficients
= fm.predict(X) 180 | score = metrics.accuracy_score(y, y_pred) 181 | 182 | # 5 iter + 5 more iter 183 | fm = als.FMClassification(n_iter=5, 184 | init_stdev=0.1, l2_reg_w=0, l2_reg_V=0, rank=2) 185 | fm.fit(X, y) 186 | fm.fit(X, y, n_more_iter=5) 187 | y_pred = fm.predict(X) 188 | score_warm_start = metrics.accuracy_score(y, y_pred) 189 | 190 | # 0 iter + 10 more iter 191 | fm = als.FMClassification(n_iter=0, 192 | init_stdev=0.1, l2_reg_w=0, l2_reg_V=0, rank=2) 193 | fm.fit(X, y) 194 | fm.fit(X, y, n_more_iter=10) 195 | y_pred = fm.predict(X) 196 | score_warm_start_2 = metrics.accuracy_score(y, y_pred) 197 | 198 | assert_almost_equal(score, score_warm_start) 199 | assert_almost_equal(score, score_warm_start_2) 200 | 201 | 202 | def test_clone(): 203 | from sklearn.base import clone 204 | 205 | a = als.FMRegression() 206 | b = clone(a) 207 | assert a.get_params() == b.get_params() 208 | 209 | a = als.FMClassification() 210 | b = clone(a) 211 | assert a.get_params() == b.get_params() 212 | 213 | 214 | if __name__ == '__main__': 215 | # test_fm_regression_only_w0() 216 | test_fm_linear_regression() 217 | -------------------------------------------------------------------------------- /fastFM/tests/test_base.py: -------------------------------------------------------------------------------- 1 | # Author: Immanuel Bayer 2 | # License: BSD 3 clause 3 | 4 | import numpy as np 5 | import scipy.sparse as sp 6 | from sklearn import metrics 7 | from fastFM import als 8 | 9 | 10 | def get_test_problem(task='regression'): 11 | X = sp.csc_matrix(np.array([[6, 1], 12 | [2, 3], 13 | [3, 0], 14 | [6, 1], 15 | [4, 5]]), dtype=np.float64) 16 | y = np.array([298, 266, 29, 298, 848], dtype=np.float64) 17 | V = np.array([[6, 0], 18 | [5, 8]], dtype=np.float64) 19 | w = np.array([9, 2], dtype=np.float64) 20 | w0 = 2 21 | if task == 'classification': 22 | y_labels = np.ones_like(y) 23 | y_labels[y < np.median(y)] = -1 24 | y = y_labels 25 | return w0, w, V, y, X 26 | 27 | 28 | def 
test_fm_classification_predict_proba(): 29 | w0, w, V, y, X = get_test_problem(task='classification') 30 | 31 | fm = als.FMClassification(n_iter=1000, 32 | init_stdev=0.1, l2_reg_w=0, l2_reg_V=0, rank=2) 33 | fm.fit(X, y) 34 | y_pred = fm.predict(X) 35 | y_pred = fm.predict_proba(X) 36 | 37 | y[y == -1] = 0 38 | assert metrics.roc_auc_score(y, y_pred) > 0.95 39 | 40 | if __name__ == '__main__': 41 | test_fm_classification_predict_proba() 42 | -------------------------------------------------------------------------------- /fastFM/tests/test_datasets.py: -------------------------------------------------------------------------------- 1 | # Author: Immanuel Bayer 2 | # License: BSD 3 clause 3 | 4 | from fastFM.datasets import make_user_item_regression 5 | from sklearn.metrics import mean_squared_error 6 | import scipy.sparse as sp 7 | 8 | 9 | def test_make_user_item_regression(): 10 | from fastFM.mcmc import FMRegression 11 | X, y, coef = make_user_item_regression(label_stdev=0) 12 | from sklearn.model_selection import train_test_split 13 | X_train, X_test, y_train, y_test = train_test_split( 14 | X, y, test_size=0.33, random_state=42) 15 | 16 | fm = FMRegression(rank=2) 17 | y_pred = fm.fit_predict(sp.csc_matrix(X_train), 18 | y_train, sp.csc_matrix(X_test)) 19 | 20 | # generate data with noisy lables 21 | X, y, coef = make_user_item_regression(label_stdev=2) 22 | from sklearn.model_selection import train_test_split 23 | X_train, X_test, y_train, y_test = train_test_split( 24 | X, y, test_size=0.33, random_state=42) 25 | 26 | fm = FMRegression(rank=2) 27 | y_pred_noise = fm.fit_predict(sp.csc_matrix(X_train), 28 | y_train, sp.csc_matrix(X_test)) 29 | assert mean_squared_error(y_pred_noise, y_test) > \ 30 | mean_squared_error(y_pred, y_test) 31 | -------------------------------------------------------------------------------- /fastFM/tests/test_ffm.py: -------------------------------------------------------------------------------- 1 | # Author: Immanuel Bayer 2 | # 
License: BSD 3 clause 3 | 4 | import numpy as np 5 | import scipy.sparse as sp 6 | from numpy.testing import assert_almost_equal, assert_equal 7 | import ffm 8 | 9 | 10 | def get_test_problem(): 11 | X = sp.csc_matrix(np.array([[6, 1], 12 | [2, 3], 13 | [3, 0], 14 | [6, 1], 15 | [4, 5]]), dtype=np.float64) 16 | y = np.array([298, 266, 29, 298, 848], dtype=np.float64) 17 | V = np.array([[6, 0], 18 | [5, 8]], dtype=np.float64) 19 | w = np.array([9, 2], dtype=np.float64) 20 | w0 = 2 21 | return w0, w, V, y, X 22 | 23 | 24 | def test_ffm_predict(): 25 | w0, w, V, y, X = get_test_problem() 26 | y_pred = ffm.ffm_predict(w0, w, V, X) 27 | assert_equal(y_pred, y) 28 | 29 | if __name__ == '__main__': 30 | pass 31 | -------------------------------------------------------------------------------- /fastFM/tests/test_mcmc.py: -------------------------------------------------------------------------------- 1 | # Author: Immanuel Bayer 2 | # License: BSD 3 clause 3 | 4 | import numpy as np 5 | import scipy.sparse as sp 6 | from sklearn import metrics 7 | from fastFM import mcmc 8 | from fastFM.datasets import make_user_item_regression 9 | from sklearn.metrics import mean_squared_error 10 | from numpy.testing import assert_array_equal, assert_almost_equal 11 | 12 | def get_test_problem(task='regression'): 13 | X = sp.csc_matrix(np.array([[6, 1], 14 | [2, 3], 15 | [3, 0], 16 | [6, 1], 17 | [4, 5]]), dtype=np.float64) 18 | y = np.array([298, 266, 29, 298, 848], dtype=np.float64) 19 | V = np.array([[6, 0], 20 | [5, 8]], dtype=np.float64) 21 | w = np.array([9, 2], dtype=np.float64) 22 | w0 = 2 23 | if task == 'classification': 24 | y_labels = np.ones_like(y) 25 | y_labels[y < np.median(y)] = -1 26 | y = y_labels 27 | return w0, w, V, y, X 28 | 29 | 30 | def test_fm_regression(): 31 | w0, w, V, y, X = get_test_problem() 32 | 33 | fm = mcmc.FMRegression(n_iter=1000, rank=2, init_stdev=0.1) 34 | 35 | y_pred = fm.fit_predict(X, y, X) 36 | assert metrics.r2_score(y_pred, y) > 0.99 37 | 38 
# transform to labels: an easier problem than the default one
mean_squared_error(y_pred, y_test) 94 | 95 | fm = mcmc.FMRegression(n_iter=50, rank=2) 96 | y_pred = fm.fit_predict(X_train, y_train, X_test) 97 | error_5_iter = mean_squared_error(y_pred, y_test) 98 | 99 | y_pred = fm.fit_predict(X_train, y_train, X_test, n_more_iter=50) 100 | error_5_iter_plus_5 = mean_squared_error(y_pred, y_test) 101 | print(error_5_iter, error_5_iter_plus_5, error_10_iter) 102 | print(fm.hyper_param_) 103 | assert_almost_equal(error_10_iter, error_5_iter_plus_5, decimal=2) 104 | 105 | 106 | def test_find_init_stdev(): 107 | X, y, coef = make_user_item_regression(label_stdev=.5) 108 | from sklearn.model_selection import train_test_split 109 | X_train, X_test, y_train, y_test = train_test_split( 110 | X, y, test_size=0.33, random_state=44) 111 | X_train = sp.csc_matrix(X_train) 112 | X_test = sp.csc_matrix(X_test) 113 | 114 | fm = mcmc.FMRegression(n_iter=10, rank=5) 115 | best_init_stdev, mse = mcmc.find_init_stdev(fm, X_train, y_train, 116 | stdev_range=[0.2, 0.5, 1.0]) 117 | best_init_stdev_bad, _ = mcmc.find_init_stdev(fm, X_train, y_train, 118 | stdev_range=[5.]) 119 | print('--' * 30) 120 | best_init_stdev_vali, mse_vali = mcmc.find_init_stdev(fm, 121 | X_train, y_train, 122 | X_test, y_test, 123 | stdev_range=[ 124 | 0.2, 0.5, 1.0]) 125 | assert best_init_stdev < best_init_stdev_bad 126 | assert best_init_stdev_vali == best_init_stdev 127 | assert mse_vali > mse 128 | 129 | 130 | def test_clone(): 131 | from sklearn.base import clone 132 | 133 | a = mcmc.FMRegression() 134 | b = clone(a) 135 | assert a.get_params() == b.get_params() 136 | 137 | a = mcmc.FMClassification() 138 | b = clone(a) 139 | assert a.get_params() == b.get_params() 140 | 141 | 142 | if __name__ == "__main__": 143 | test_linear_fm_classification() 144 | -------------------------------------------------------------------------------- /fastFM/tests/test_ranking.py: -------------------------------------------------------------------------------- 1 | # Author: Immanuel 
Bayer 2 | # License: BSD 3 clause 3 | 4 | import numpy as np 5 | import scipy.sparse as sp 6 | from fastFM import bpr 7 | from fastFM import utils 8 | 9 | 10 | def get_test_problem(task='regression'): 11 | X = sp.csc_matrix(np.array([[6, 1], 12 | [2, 3], 13 | [3, 0], 14 | [6, 1], 15 | [4, 5]]), dtype=np.float64) 16 | y = np.array([298, 266, 29, 298, 848], dtype=np.float64) 17 | V = np.array([[6, 0], 18 | [5, 8]], dtype=np.float64) 19 | w = np.array([9, 2], dtype=np.float64) 20 | w0 = 2 21 | if task == 'classification': 22 | y_labels = np.ones_like(y) 23 | y_labels[y < np.median(y)] = -1 24 | y = y_labels 25 | return w0, w, V, y, X 26 | 27 | 28 | def test_fm_sgr_ranking(): 29 | w0, w, V, y, X = get_test_problem() 30 | X_test = X.copy() 31 | X_train = X.copy() 32 | 33 | import itertools 34 | pairs = [p for p in itertools.combinations(range(len(y)), 2)] 35 | compares = np.zeros((len(pairs), 2), dtype=np.float64) 36 | 37 | for i, p in enumerate(pairs): 38 | if y[p[0]] > y[p[1]]: 39 | compares[i, 0] = p[0] 40 | compares[i, 1] = p[1] 41 | else: 42 | compares[i, 0] = p[1] 43 | compares[i, 1] = p[0] 44 | 45 | print(compares) 46 | fm = bpr.FMRecommender(n_iter=2000, 47 | init_stdev=0.01, l2_reg_w=.5, l2_reg_V=.5, rank=2, 48 | step_size=.002, random_state=11) 49 | fm.fit(X_train, compares) 50 | y_pred = fm.predict(X_test) 51 | y_pred = np.argsort(y_pred) 52 | print(y) 53 | print(y_pred) 54 | print(np.argsort(y)) 55 | assert utils.kendall_tau(np.argsort(y), y_pred) == 1 56 | -------------------------------------------------------------------------------- /fastFM/tests/test_sgd.py: -------------------------------------------------------------------------------- 1 | # Author: Immanuel Bayer 2 | # License: BSD 3 clause 3 | 4 | import numpy as np 5 | import scipy.sparse as sp 6 | from sklearn import metrics 7 | from sklearn.datasets import make_regression 8 | from numpy.testing import assert_almost_equal 9 | from fastFM import sgd 10 | from fastFM import als 11 | 12 | 13 | def 
get_test_problem(task='regression'): 14 | X = sp.csc_matrix(np.array([[6, 1], 15 | [2, 3], 16 | [3, 0], 17 | [6, 1], 18 | [4, 5]]), dtype=np.float64) 19 | y = np.array([298, 266, 29, 298, 848], dtype=np.float64) 20 | V = np.array([[6, 0], 21 | [5, 8]], dtype=np.float64) 22 | w = np.array([9, 2], dtype=np.float64) 23 | w0 = 2 24 | if task == 'classification': 25 | y_labels = np.ones_like(y) 26 | y_labels[y < np.median(y)] = -1 27 | y = y_labels 28 | return w0, w, V, y, X 29 | 30 | 31 | def test_sgd_regression_small_example(): 32 | w0, w, V, y, X = get_test_problem() 33 | X_test = X.copy() 34 | X_train = sp.csc_matrix(X) 35 | 36 | fm = sgd.FMRegression(n_iter=10000, 37 | init_stdev=0.01, l2_reg_w=0.5, l2_reg_V=50.5, rank=2, 38 | step_size=0.0001) 39 | 40 | fm.fit(X_train, y) 41 | y_pred = fm.predict(X_test) 42 | assert metrics.r2_score(y_pred, y) > 0.99 43 | 44 | 45 | def test_first_order_sgd_vs_als_regression(): 46 | X, y = make_regression(n_samples=100, n_features=50, random_state=123) 47 | X = sp.csc_matrix(X) 48 | 49 | fm_sgd = sgd.FMRegression(n_iter=900, init_stdev=0.01, l2_reg_w=0.0, 50 | l2_reg_V=50.5, rank=0, step_size=0.01) 51 | fm_als = als.FMRegression(n_iter=10, l2_reg_w=0, l2_reg_V=0, rank=0) 52 | 53 | y_pred_sgd = fm_sgd.fit(X, y).predict(X) 54 | y_pred_als = fm_als.fit(X, y).predict(X) 55 | 56 | score_als = metrics.r2_score(y_pred_als, y) 57 | score_sgd = metrics.r2_score(y_pred_sgd, y) 58 | 59 | assert_almost_equal(score_als, score_sgd, decimal=2) 60 | 61 | 62 | def test_second_order_sgd_vs_als_regression(): 63 | X, y = make_regression(n_samples=100, n_features=50, random_state=123) 64 | X = sp.csc_matrix(X) 65 | 66 | fm_sgd = sgd.FMRegression(n_iter=50000, init_stdev=0.00, l2_reg_w=0.0, 67 | l2_reg_V=50.5, rank=2, step_size=0.0002) 68 | fm_als = als.FMRegression(n_iter=10, l2_reg_w=0, l2_reg_V=0, rank=2) 69 | 70 | y_pred_als = fm_als.fit(X, y).predict(X) 71 | y_pred_sgd = fm_sgd.fit(X, y).predict(X) 72 | 73 | score_als = metrics.r2_score(y_pred_als, 
y) 74 | score_sgd = metrics.r2_score(y_pred_sgd, y) 75 | 76 | assert_almost_equal(score_sgd, score_als, decimal=2) 77 | 78 | 79 | def test_sgd_classification_small_example(): 80 | w0, w, V, y, X = get_test_problem(task='classification') 81 | X_test = X.copy() 82 | X_train = sp.csc_matrix(X) 83 | 84 | fm = sgd.FMClassification(n_iter=1000, 85 | init_stdev=0.1, l2_reg_w=0, l2_reg_V=0, rank=2, 86 | step_size=0.1) 87 | fm.fit(X_train, y) 88 | y_pred = fm.predict(X_test) 89 | print(y_pred) 90 | assert metrics.accuracy_score(y, y_pred) > 0.95 91 | 92 | 93 | def test_clone(): 94 | from sklearn.base import clone 95 | 96 | a = sgd.FMRegression() 97 | b = clone(a) 98 | assert a.get_params() == b.get_params() 99 | 100 | a = sgd.FMClassification() 101 | b = clone(a) 102 | assert a.get_params() == b.get_params() 103 | 104 | 105 | if __name__ == '__main__': 106 | test_sgd_regression_small_example() 107 | test_first_order_sgd_vs_als_regression() 108 | test_second_order_sgd_vs_als_regression() 109 | -------------------------------------------------------------------------------- /fastFM/tests/test_utils.py: -------------------------------------------------------------------------------- 1 | # Author: Immanuel Bayer 2 | # License: BSD 3 clause 3 | 4 | import numpy as np 5 | from fastFM.utils import kendall_tau 6 | 7 | 8 | def test_ffm_vector_kendall_tau(): 9 | order = np.array([1, 2, 3, 4, 5]) 10 | order_wrong = np.array([5, 3, 4, 2, 1]) 11 | order_inv = np.array([5, 4, 3, 2, 1]) 12 | 13 | assert kendall_tau(order, order) == 1 14 | assert kendall_tau(order, order_inv) == -1 15 | assert kendall_tau(order, order_wrong) != -1 16 | 17 | 18 | if __name__ == '__main__': 19 | test_ffm_vector_kendall_tau() 20 | -------------------------------------------------------------------------------- /fastFM/utils.py: -------------------------------------------------------------------------------- 1 | # Author: Immanuel Bayer 2 | # License: BSD 3 clause 3 | 4 | 5 | def kendall_tau(a, b): 6 | 
n_samples = a.shape[0] 7 | assert a.shape == b.shape 8 | n_concordant = 0 9 | n_disconcordant = 0 10 | 11 | for i in range(n_samples): 12 | for j in range(i+1, n_samples): 13 | if a[i] > a[j] and b[i] > b[j]: 14 | n_concordant = n_concordant + 1 15 | if a[i] < a[j] and b[i] < b[j]: 16 | n_concordant = n_concordant + 1 17 | 18 | if a[i] > a[j] and b[i] < b[j]: 19 | n_disconcordant = n_disconcordant + 1 20 | if a[i] < a[j] and b[i] > b[j]: 21 | n_disconcordant = n_disconcordant + 1 22 | return (n_concordant - n_disconcordant) / (.5 * n_samples * 23 | (n_samples - 1)) 24 | -------------------------------------------------------------------------------- /fastFM/validation.py: -------------------------------------------------------------------------------- 1 | # Static versions of non-core sklearn.utils functions. 2 | # Placed here since they are subject to change. 3 | 4 | """Utilities for input validation""" 5 | 6 | # Authors: Olivier Grisel 7 | # Gael Varoquaux 8 | # Andreas Mueller 9 | # Lars Buitinck 10 | # Alexandre Gramfort 11 | # Nicolas Tresegnie 12 | # License: BSD 3 clause 13 | 14 | import numbers 15 | import warnings 16 | 17 | import numpy as np 18 | import scipy.sparse as sparse 19 | from functools import wraps 20 | 21 | 22 | def _check_matrix_is_sparse(func): 23 | """ 24 | Check that input is a scipy sparse matrix and raise warning otherwise. 25 | """ 26 | @wraps(func) 27 | def wrapper(*args, **kwargs): 28 | if 'accept_sparse' in kwargs and not sparse.isspmatrix(args[0]): 29 | raise TypeError('A dense matrix was passed in, but sparse' 30 | 'data is required.') 31 | result = func(*args, **kwargs) 32 | return result 33 | return wrapper 34 | 35 | 36 | def _ensure_sparse_format(spmatrix, accept_sparse, dtype, order, copy, 37 | force_all_finite): 38 | """Convert a sparse matrix to a given format. 39 | Checks the sparse format of spmatrix and converts if necessary. 
dtype : string, type or None (default=None)
By default, the input is converted to an at least 2d numpy array.
127 | copy : boolean (default=False) 128 | Whether a forced copy will be triggered. If copy=False, a copy might 129 | be triggered by a conversion. 130 | force_all_finite : boolean (default=True) 131 | Whether to raise an error on np.inf and np.nan in X. 132 | ensure_2d : boolean (default=True) 133 | Whether to make X at least 2d. 134 | allow_nd : boolean (default=False) 135 | Whether to allow X.ndim > 2. 136 | ensure_min_samples : int (default=1) 137 | Make sure that the array has a minimum number of samples in its first 138 | axis (rows for a 2D array). Setting to 0 disables this check. 139 | ensure_min_features : int (default=1) 140 | Make sure that the 2D array has some minimum number of features 141 | (columns). The default value of 1 rejects empty datasets. 142 | This check is only enforced when the input data has effectively 2 143 | dimensions or is originally 1D and ``ensure_2d`` is True. Setting to 0 144 | disables this check. 145 | Returns 146 | ------- 147 | X_converted : object 148 | The converted and validated X. 149 | """ 150 | if isinstance(accept_sparse, str): 151 | accept_sparse = [accept_sparse] 152 | 153 | # store whether originally we wanted numeric dtype 154 | dtype_numeric = dtype == "numeric" 155 | 156 | if sparse.issparse(array): 157 | if dtype_numeric: 158 | dtype = None 159 | array = _ensure_sparse_format(array, accept_sparse, dtype, order, 160 | copy, force_all_finite) 161 | else: 162 | if ensure_2d: 163 | array = np.atleast_2d(array) 164 | if dtype_numeric: 165 | if (hasattr(array, "dtype") and 166 | getattr(array.dtype, "kind", None) == "O"): 167 | # if input is object, convert to float. 
168 | dtype = np.float64 169 | else: 170 | dtype = None 171 | array = np.array(array, dtype=dtype, order=order, copy=copy) 172 | # make sure we actually converted to numeric: 173 | if dtype_numeric and array.dtype.kind == "O": 174 | array = array.astype(np.float64) 175 | if not allow_nd and array.ndim >= 3: 176 | raise ValueError("Found array with dim %d. Expected <= 2" % 177 | array.ndim) 178 | if force_all_finite: 179 | assert_all_finite(array) 180 | 181 | shape_repr = _shape_repr(array.shape) 182 | if ensure_min_samples > 0: 183 | n_samples = _num_samples(array) 184 | if n_samples < ensure_min_samples: 185 | raise ValueError("Found array with %d sample(s) (shape=%s) while a" 186 | " minimum of %d is required." 187 | % (n_samples, shape_repr, ensure_min_samples)) 188 | 189 | if ensure_min_features > 0 and array.ndim == 2: 190 | n_features = array.shape[1] 191 | if n_features < ensure_min_features: 192 | raise ValueError("Found array with %d feature(s) (shape=%s) while" 193 | " a minimum of %d is required." 194 | % (n_features, shape_repr, ensure_min_features)) 195 | return array 196 | 197 | 198 | def check_consistent_length(x1, x2): 199 | return x1.shape[0] == x2.shape[0] 200 | 201 | 202 | def check_random_state(seed): 203 | """Turn seed into a np.random.RandomState instance 204 | If seed is None, return the RandomState singleton used by np.random. 205 | If seed is an int, return a new RandomState instance seeded with seed. 206 | If seed is already a RandomState instance, return it. 207 | Otherwise raise ValueError. 
208 | """ 209 | if seed is None or seed is np.random: 210 | return np.random.mtrand._rand 211 | if isinstance(seed, (numbers.Integral, np.integer)): 212 | return np.random.RandomState(seed) 213 | if isinstance(seed, np.random.RandomState): 214 | return seed 215 | raise ValueError('%r cannot be used to seed a numpy.random.RandomState' 216 | ' instance' % seed) 217 | 218 | 219 | def _shape_repr(shape): 220 | """Return a platform independent reprensentation of an array shape 221 | Under Python 2, the `long` type introduces an 'L' suffix when using the 222 | default %r format for tuples of integers (typically used to store the shape 223 | of an array). 224 | Under Windows 64 bit (and Python 2), the `long` type is used by default 225 | in numpy shapes even when the integer dimensions are well below 32 bit. 226 | The platform specific type causes string messages or doctests to change 227 | from one platform to another which is not desirable. 228 | Under Python 3, there is no more `long` type so the `L` suffix is never 229 | introduced in string representation. 230 | >>> _shape_repr((1, 2)) 231 | '(1, 2)' 232 | >>> one = 2 ** 64 / 2 ** 64 # force an upcast to `long` under Python 2 233 | >>> _shape_repr((one, 2 * one)) 234 | '(1, 2)' 235 | >>> _shape_repr((1,)) 236 | '(1,)' 237 | >>> _shape_repr(()) 238 | '()' 239 | """ 240 | if len(shape) == 0: 241 | return "()" 242 | joined = ", ".join("%d" % e for e in shape) 243 | if len(shape) == 1: 244 | # special notation for singleton tuples 245 | joined += ',' 246 | return "(%s)" % joined 247 | 248 | 249 | def _num_samples(x): 250 | """Return number of samples in array-like x.""" 251 | if hasattr(x, 'fit'): 252 | # Don't get num_samples from an ensembles length! 
253 | raise TypeError('Expected sequence or array-like, got ' 254 | 'estimator %s' % x) 255 | if not hasattr(x, '__len__') and not hasattr(x, 'shape'): 256 | if hasattr(x, '__array__'): 257 | x = np.asarray(x) 258 | else: 259 | raise TypeError("Expected sequence or array-like, got %s" % 260 | type(x)) 261 | if hasattr(x, 'shape'): 262 | if len(x.shape) == 0: 263 | raise TypeError("Singleton array %r cannot be considered" 264 | " a valid collection." % x) 265 | return x.shape[0] 266 | else: 267 | return len(x) 268 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | Cython>=0.22 2 | numpy>=1.9.1 3 | scipy>=0.16.0 4 | scikit-learn>=0.18.0 5 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bdist_wheel] 2 | # This flag says that the code is written to work on both Python 2 and Python 3 | # 3. If at all possible, it is good practice to do this. If you cannot, you 4 | # will need to generate wheels for each Python version that you support. 
5 | universal=1 6 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | from distutils.extension import Extension 3 | from Cython.Distutils import build_ext 4 | import numpy 5 | 6 | ext_modules = [ 7 | Extension('ffm', ['fastFM/ffm.pyx'], 8 | libraries=['m', 'fastfm'], 9 | library_dirs=['fastFM/', 'fastFM-core/bin/'], 10 | include_dirs=['fastFM/', 'fastFM-core/include/', 11 | 'fastFM-core/externals/CXSparse/Include/', 12 | numpy.get_include()])] 13 | 14 | setup( 15 | name='fastFM', 16 | cmdclass={'build_ext': build_ext}, 17 | ext_modules=ext_modules, 18 | 19 | packages=['fastFM'], 20 | 21 | package_data={'fastFM': ['fastFM/*.pxd']}, 22 | 23 | version='0.2.11', 24 | url='http://ibayer.github.io/fastFM', 25 | author='Immanuel Bayer', 26 | author_email='immanuel.bayer@uni-konstanz.de', 27 | 28 | # Choose your license 29 | license='BSD', 30 | 31 | # See https://pypi.python.org/pypi?%3Aaction=list_classifiers 32 | classifiers=[ 33 | # How mature is this project? Common values are 34 | # 3 - Alpha 35 | # 4 - Beta 36 | # 5 - Production/Stable 37 | 'Development Status :: 4 - Beta', 38 | 39 | # Indicate who your project is intended for 40 | 'Intended Audience :: Developers', 41 | 'Intended Audience :: Science/Research', 42 | 'Topic :: Scientific/Engineering', 43 | 44 | 'License :: OSI Approved :: BSD License', 45 | 'Operating System :: Unix', 46 | 47 | # Specify the Python versions you support here. In particular, ensure 48 | # that you indicate whether you support Python 2, Python 3 or both. 
49 | 'Programming Language :: Python :: 2', 50 | 'Programming Language :: Python :: 2.6', 51 | 'Programming Language :: Python :: 2.7', 52 | 'Programming Language :: Python :: 3', 53 | 'Programming Language :: Python :: 3.2', 54 | 'Programming Language :: Python :: 3.3', 55 | 'Programming Language :: Python :: 3.4', 56 | 'Programming Language :: Python :: 3.5', 57 | 'Programming Language :: Python :: 3.6', 58 | ], 59 | 60 | # List run-time dependencies here. These will be installed by pip when 61 | # your project is installed. For an analysis of "install_requires" vs pip's 62 | # requirements files see: 63 | # https://packaging.python.org/en/latest/requirements.html 64 | install_requires=['numpy', 'scikit-learn', 'scipy', 'cython'] 65 | ) 66 | --------------------------------------------------------------------------------