├── .github └── workflows │ ├── doctest.yml │ ├── pre-commit.yml │ ├── run-test.yml │ └── wheels.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .readthedocs.yml ├── CMakeLists.txt ├── LICENSE ├── MANIFEST.in ├── README.md ├── codecov.yml ├── cpp_source ├── Faddeeva.cc ├── bind.cpp ├── bind_float.cpp └── declare_module.hpp ├── doc ├── Makefile ├── requirements.txt └── source │ ├── api_reference.rst │ ├── conf.py │ ├── index.rst │ ├── movielens.rst │ ├── ordinal-regression.rst │ ├── quickstart.rst │ ├── rating_vs_cps.png │ └── relation-blocks.rst ├── doc_autobuild.sh ├── examples ├── ml-100k-extended.ipynb ├── ml-100k-regression.py ├── ml-100k-variational.py ├── ml-100k.ipynb ├── ml-10m-regression.py ├── ml-1m-extended.ipynb ├── ml-1m-regression.py ├── oprobit_example.py └── toy.py ├── include ├── Faddeeva │ └── Faddeeva.hh └── myfm │ ├── BaseFMTrainer.hpp │ ├── FM.hpp │ ├── FMLearningConfig.hpp │ ├── FMTrainer.hpp │ ├── HyperParams.hpp │ ├── LearningHistory.hpp │ ├── OProbitSampler.hpp │ ├── definitions.hpp │ ├── predictor.hpp │ ├── util.hpp │ └── variational.hpp ├── mypy.ini ├── pyproject.toml ├── setup.py ├── src └── myfm │ ├── __init__.py │ ├── _myfm.pyi │ ├── base.py │ ├── gibbs.py │ ├── utils │ ├── __init__.py │ ├── benchmark_data │ │ ├── __init__.py │ │ ├── loader_base.py │ │ ├── movielens100k_data.py │ │ ├── movielens10M_data.py │ │ └── movielens1M_data.py │ ├── callbacks │ │ ├── __init__.py │ │ └── libfm.py │ ├── dummy_data.py │ └── encoders │ │ ├── __init__.py │ │ ├── base.py │ │ ├── binning.py │ │ ├── categorical.py │ │ └── multi_value.py │ └── variational.py └── tests ├── __init__.py ├── classification ├── __init__.py └── test_classification.py ├── conftest.py ├── dataset ├── __init__.py ├── test_ml100k.py └── test_ml1m.py ├── oprobit ├── __init__.py └── test_oprobit_1dim.py ├── regression ├── __init__.py ├── test_block.py └── test_fit.py ├── test_utils.py └── utils ├── __init__.py ├── test_binning.py ├── test_categorical.py ├── test_dataframe_encoder.py └── test_multivalue.py /.github/workflows/doctest.yml: -------------------------------------------------------------------------------- 1 | name: Doctest 2 | on: [push] 3 | jobs: 4 | test_readme_and_sphinx_docs: 5 | runs-on: ubuntu-latest 6 | env: 7 | OS: ubuntu-latest 8 | steps: 9 | - uses: actions/checkout@v3 10 | with: 11 | fetch-depth: 0 12 | - name: Setup Python 13 | uses: actions/setup-python@v3 14 | with: 15 | python-version: "3.11" 16 | - name: Build myfm 17 | run: | 18 | pip install --upgrade pip 19 | pip install numpy scipy pandas scikit-learn 20 | pip install . 
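# Pre-fetch the MovieLens 100k archive into what appears to be the data manager's cache path, so the doctest runs below don't have to download it themselves.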
21 | curl http://files.grouplens.org/datasets/movielens/ml-100k.zip -o ~/.ml-100k.zip 22 | - name: Run pytest 23 | run: | 24 | pip install pytest phmdoctest sphinx==4.4.0 sphinx_rtd_theme 25 | - name: Test Readme.md 26 | run: | 27 | GEN_TEST_FILE=phmdoctest_out.py 28 | phmdoctest README.md --outfile "$GEN_TEST_FILE" 29 | pytest "$GEN_TEST_FILE" 30 | rm "$GEN_TEST_FILE" 31 | - name: Run sphinx doctest 32 | run: | 33 | cd doc 34 | make doctest 35 | -------------------------------------------------------------------------------- /.github/workflows/pre-commit.yml: -------------------------------------------------------------------------------- 1 | name: pre-commit 2 | on: 3 | pull_request: 4 | push: 5 | jobs: 6 | pre-commit: 7 | runs-on: ubuntu-latest 8 | env: 9 | SKIP: no-commit-to-branch 10 | steps: 11 | - uses: actions/checkout@v3 12 | - uses: actions/setup-python@v3 13 | - uses: pre-commit/action@v3.0.0 14 | -------------------------------------------------------------------------------- /.github/workflows/run-test.yml: -------------------------------------------------------------------------------- 1 | name: Test & Upload coverage 2 | on: [push] 3 | jobs: 4 | run_pytest_upload_coverage: 5 | runs-on: ubuntu-latest 6 | env: 7 | OS: ubuntu-latest 8 | steps: 9 | - uses: actions/checkout@v3 10 | with: 11 | fetch-depth: 0 12 | - name: Setup Python 13 | uses: actions/setup-python@v3 14 | with: 15 | python-version: "3.11" 16 | - name: Build myfm 17 | run: | 18 | pip install --upgrade pip 19 | pip install numpy scipy pandas 20 | sudo apt-get install lcov 21 | FLAGS="-fprofile-arcs -ftest-coverage" 22 | CFLAGS="$FLAGS" CXXFLAGS="$FLAGS" pip install -e . 23 | - name: Run pytest 24 | run: | 25 | pip install pytest pytest-cov pytest-mock 26 | pytest --cov=./src/myfm tests/ 27 | - name: Generate coverage (ubuntu) 28 | run: | 29 | coverage xml 30 | lcov -d `pwd` -c -o coverage.info 31 | - name: Upload coverage to Codecov 32 | uses: codecov/codecov-action@v1 33 | with: 34 | files: ./coverage.xml,./coverage.info 35 | verbose: false 36 | env_vars: OS,PYTHON 37 | name: codecov-umbrella 38 | fail_ci_if_error: false 39 | -------------------------------------------------------------------------------- /.github/workflows/wheels.yml: -------------------------------------------------------------------------------- 1 | name: Build wheel 2 | on: 3 | push: 4 | branches: 5 | - main 6 | release: 7 | types: 8 | - created 9 | env: 10 | cibuildwheel_version: "2.12.2" 11 | jobs: 12 | build_sdist: 13 | name: Build source distribution 14 | runs-on: ubuntu-latest 15 | steps: 16 | - uses: actions/checkout@v3 17 | with: 18 | fetch-depth: 0 19 | - uses: actions/setup-python@v3 20 | name: Install Python 21 | with: 22 | python-version: "3.11" 23 | - name: Build sdist 24 | run: pip install pybind11 && python setup.py sdist 25 | - uses: actions/upload-artifact@v2 26 | with: 27 | path: dist/*.tar.gz 28 | build_wheels: 29 | name: Build wheels on ${{ matrix.os }} 30 | runs-on: ${{ matrix.os }} 31 | env: 32 | MACOSX_DEPLOYMENT_TARGET: "10.9" 33 | CIBW_BUILD_VERBOSITY: "1" 34 | CIBW_BUILD: "${{ matrix.cibw.build || '*' }}" 35 | CIBW_SKIP: "${{ matrix.cibw.skip || '' }}" 36 | CIBW_ENVIRONMENT: "${{ matrix.cibw.env || '' }}" 37 | CIBW_TEST_COMMAND: "pytest {project}/tests" 38 | CIBW_TEST_REQUIRES: pytest pytest-mock 39 | CIBW_MANYLINUX_X86_64_IMAGE: "${{ matrix.cibw.manylinux_image }}" 40 | CIBW_MANYLINUX_I686_IMAGE: "${{ matrix.cibw.manylinux_image }}" 41 | CIBW_MANYLINUX_AARCH64_IMAGE: "${{ matrix.cibw.manylinux_image }}" 42 | 
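# Architecture selection is per matrix entry below; cibuildwheel's 'auto' default builds only for the host architecture.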
CIBW_ARCHS_LINUX: "${{ matrix.cibw.arch || 'auto' }}" 43 | CIBW_ARCHS_MACOS: "${{ matrix.cibw.arch || 'auto' }}" 44 | strategy: 45 | matrix: 46 | include: 47 | - os: macos-10.15 48 | name: mac 49 | cibw: 50 | arch: x86_64 51 | build: "cp37* cp38*" 52 | 53 | - os: macos-10.15 54 | name: mac-arm 55 | cibw: 56 | arch: universal2 57 | build: "cp39* cp310* cp311*" 58 | 59 | - os: ubuntu-20.04 60 | name: manylinux1 61 | cibw: 62 | build: "cp37*" 63 | skip: "*musllinux*" 64 | manylinux_image: manylinux2010 65 | arch: auto64 66 | 67 | - os: ubuntu-20.04 68 | name: manylinux2014 69 | cibw: 70 | build: "cp38* cp39* cp310* cp311*" 71 | skip: "*musllinux*" 72 | manylinux_image: manylinux2014 73 | arch: auto64 74 | 75 | - os: ubuntu-20.04 76 | name: manylinux_aarch64_cp37 77 | cibw: 78 | build: "cp37*" 79 | skip: "*musllinux*" 80 | manylinux_image: manylinux2014 81 | arch: aarch64 82 | 83 | - os: ubuntu-20.04 84 | name: manylinux_aarch64_cp38 85 | cibw: 86 | build: "cp38*" 87 | skip: "*musllinux*" 88 | manylinux_image: manylinux2014 89 | arch: aarch64 90 | 91 | - os: ubuntu-20.04 92 | name: manylinux_aarch64_cp39 93 | cibw: 94 | build: "cp39*" 95 | skip: "*musllinux*" 96 | manylinux_image: manylinux2014 97 | arch: aarch64 98 | 99 | - os: ubuntu-20.04 100 | name: manylinux_aarch64_cp310 101 | cibw: 102 | build: "cp310*" 103 | skip: "*musllinux*" 104 | manylinux_image: manylinux2014 105 | arch: aarch64 106 | 107 | - os: ubuntu-20.04 108 | name: manylinux_aarch64_cp311 109 | cibw: 110 | build: "cp311*" 111 | skip: "*musllinux*" 112 | manylinux_image: manylinux2014 113 | arch: aarch64 114 | 115 | - os: windows-2019 116 | name: win_amd64 117 | architecture: x64 118 | cibw: 119 | skip: "cp36*" 120 | build: "cp*win_amd64" 121 | 122 | steps: 123 | - uses: actions/checkout@v2 124 | with: 125 | fetch-depth: 0 126 | - uses: actions/setup-python@v2 127 | name: Install Python 128 | - name: register qemu 129 | if: contains(matrix.cibw.arch, 'aarch64') 130 | run: | 131 | docker run --rm --privileged hypriot/qemu-register:v4.2.0 132 | - name: Install cibuildwheel 133 | run: python -m pip install cibuildwheel=="${{env.cibuildwheel_version}}" 134 | - name: Build wheels 135 | run: python -m cibuildwheel --output-dir wheelhouse 136 | 137 | - uses: actions/upload-artifact@v2 138 | with: 139 | path: ./wheelhouse/*.whl 140 | 141 | upload_pypi: 142 | needs: [build_wheels, build_sdist] 143 | runs-on: ubuntu-latest 144 | steps: 145 | - uses: actions/download-artifact@v2 146 | with: 147 | name: artifact 148 | path: dist 149 | - name: Publish package to TestPyPI 150 | uses: pypa/gh-action-pypi-publish@master 151 | with: 152 | user: __token__ 153 | password: ${{ secrets.TEST_PYPI_APITOKEN }} 154 | packages_dir: dist/ 155 | repository_url: https://test.pypi.org/legacy/ 156 | verbose: true 157 | skip_existing: true 158 | - name: Publish package to PyPI 159 | if: github.event_name == 'release' 160 | uses: pypa/gh-action-pypi-publish@master 161 | with: 162 | user: __token__ 163 | password: ${{ secrets.PYPI_APITOKEN }} 164 | packages_dir: dist/ 165 | verbose: true 166 | skip_existing: true 167 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | build 2 | eigen 3 | pybind11 4 | .python-version 5 | .clangd 6 | eigen-eigen-323c052e1731/ 7 | myfm.egg-info/ 8 | tmp 9 | .eggs 10 | *.pyc 11 | *.so 12 | .vscode 13 | compile_commands.json 14 | eigen3.zip 15 | dist/ 16 | *-checkpoint.ipynb 17 | doc/_build/* 18 | 
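# Vendored Eigen copies downloaded during source builds (see README).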
eigen-3.3.7/ 19 | eigen-3.4.0/ 20 | doc/source/_build 21 | 22 | stubs/* 23 | 24 | doc/source/api_reference/*.rst 25 | .cache -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # See https://pre-commit.com for more information 2 | # See https://pre-commit.com/hooks.html for more hooks 3 | repos: 4 | - repo: https://github.com/pre-commit/pre-commit-hooks 5 | rev: v3.2.0 6 | hooks: 7 | - id: check-merge-conflict 8 | - id: check-yaml 9 | - id: end-of-file-fixer 10 | - id: no-commit-to-branch 11 | args: [--branch, main] 12 | - id: trailing-whitespace 13 | - id: check-added-large-files 14 | - repo: https://github.com/PyCQA/isort 15 | rev: 5.12.0 16 | hooks: 17 | - id: isort 18 | name: isort 19 | - repo: https://github.com/psf/black 20 | rev: 22.3.0 21 | hooks: 22 | - id: black 23 | language_version: python3 # Should be a command that runs python3.6+ 24 | - repo: https://github.com/hadialqattan/pycln 25 | rev: v1.1.0 26 | hooks: 27 | - id: pycln 28 | args: [--config=pyproject.toml] -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | # Required 2 | version: 2 3 | 4 | # Build documentation in the docs/ directory with Sphinx 5 | sphinx: 6 | configuration: doc/source/conf.py 7 | 8 | # Optionally build your docs in additional formats such as PDF 9 | formats: 10 | - pdf 11 | 12 | # Optionally set the version of Python and requirements required to build your docs 13 | python: 14 | version: 3.7 15 | install: 16 | - method: pip 17 | path: . 18 | - requirements: doc/requirements.txt -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(CMAKE_C_COMPILER gcc) 2 | set(CMAKE_CXX_COMPILER g++) 3 | 4 | cmake_minimum_required(VERSION 3.0.0) 5 | project(myfm) 6 | 7 | set(CMAKE_BUILD_TYPE Release) 8 | set(CMAKE_CXX_FLAGS " ${CMAKE_CXX_FLAGS_INIT} -std=c++11 -fPIC") 9 | 10 | add_subdirectory(pybind11) 11 | include_directories(include eigen-3.3.7) 12 | pybind11_add_module(_myfm cpp_source/bind.cpp cpp_source/Faddeeva.cc) 13 | 14 | set(CPACK_PROJECT_NAME ${PROJECT_NAME}) 15 | set(CPACK_PROJECT_VERSION ${PROJECT_VERSION}) 16 | set(CMAKE_EXPORT_COMPILE_COMMANDS ON) -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Tomoki Ohtsuki 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software.
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include include/myfm *.hpp 2 | include LICENSE README.md cpp_source/declare_module.hpp include/Faddeeva/Faddeeva.hh cpp_source/Faddeeva.cc -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # myFM 2 | [![Python](https://img.shields.io/badge/python-3.7%20%7C%203.8%20%7C%203.9%20%7C%203.10-blue)](https://www.python.org) 3 | [![pypi](https://img.shields.io/pypi/v/myfm.svg)](https://pypi.python.org/pypi/myfm) 4 | [![GitHub license](https://img.shields.io/badge/license-MIT-blue.svg)](https://github.com/tohtsky/myFM) 5 | [![Build](https://github.com/tohtsky/myFM/workflows/Build%20wheel/badge.svg?branch=main)](https://github.com/tohtsky/myfm) 6 | [![Read the Docs](https://readthedocs.org/projects/myfm/badge/?version=stable)](https://myfm.readthedocs.io/en/stable/) 7 | [![codecov](https://codecov.io/gh/tohtsky/myfm/branch/main/graph/badge.svg?token=kLgOKTQqcV)](https://codecov.io/gh/tohtsky/myfm) 8 | 9 | 10 | myFM is an implementation of Bayesian [Factorization Machines](https://ieeexplore.ieee.org/abstract/document/5694074/) based on Gibbs sampling, which I believe is a wheel worth reinventing. 11 | 12 | Currently this supports most options of the libFM MCMC engine, such as 13 | 14 | - Grouping of input variables (the `-meta` option of [libFM](https://github.com/srendle/libfm)) 15 | - Relation Data format (see the paper ["Scaling Factorization Machines to relational data"](https://dl.acm.org/citation.cfm?id=2488340)) 16 | 17 | There are also functionalities not present in libFM: 18 | 19 | - A Gibbs sampler for ordered probit regression [5] implementing the Metropolis-within-Gibbs scheme of [6]. 20 | - Variational inference for regression and binary classification. 21 | 22 | A tutorial and reference documentation are provided at https://myfm.readthedocs.io/en/latest/. 23 | 24 | # Installation 25 | 26 | The package is pip-installable. 27 | 28 | ``` 29 | pip install myfm 30 | ``` 31 | 32 | Pre-built binaries are available for major operating systems. 33 | 34 | If you are working with a less common OS/architecture, pip will attempt to build myFM from source (you need a decent C++ compiler!). In that case, in addition to installing the Python dependencies (`numpy`, `scipy`, `pandas`, ...), the above command will automatically download Eigen (v3.4.0) into its build directory and use it during the build. 35 | 36 | # Examples 37 | 38 | ## A Toy example 39 | 40 | This example is taken from [pyfm](https://github.com/coreylynch/pyFM) with some modification.
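It one-hot encodes a handful of categorical user/item features with `DictVectorizer` and fits a binary classifier; note that `fm.predict` returns hard labels, while `fm.predict_proba` (used in the quickstart documentation) returns class probabilities.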
41 | 42 | ```python 43 | import myfm 44 | from sklearn.feature_extraction import DictVectorizer 45 | import numpy as np 46 | train = [ 47 | {"user": "1", "item": "5", "age": 19}, 48 | {"user": "2", "item": "43", "age": 33}, 49 | {"user": "3", "item": "20", "age": 55}, 50 | {"user": "4", "item": "10", "age": 20}, 51 | ] 52 | v = DictVectorizer() 53 | X = v.fit_transform(train) 54 | print(X.toarray()) 55 | # print 56 | # [[ 19. 0. 0. 0. 1. 1. 0. 0. 0.] 57 | # [ 33. 0. 0. 1. 0. 0. 1. 0. 0.] 58 | # [ 55. 0. 1. 0. 0. 0. 0. 1. 0.] 59 | # [ 20. 1. 0. 0. 0. 0. 0. 0. 1.]] 60 | y = np.asarray([0, 1, 1, 0]) 61 | fm = myfm.MyFMClassifier(rank=4) 62 | fm.fit(X, y) 63 | fm.predict(v.transform({"user": "1", "item": "10", "age": 24})) 64 | ``` 65 | 66 | ## A Movielens-100k Example 67 | 68 | This example requires `pandas` and `scikit-learn`; the dataset is downloaded and cached by `myfm.utils.benchmark_data.MovieLens100kDataManager`, used below. 69 | 70 | You will be able to obtain a result comparable to SOTA algorithms like GC-MC. See `examples/ml-100k.ipynb` for the detailed version. 71 | 72 | ```python 73 | import numpy as np 74 | from sklearn.preprocessing import OneHotEncoder 75 | from sklearn import metrics 76 | 77 | import myfm 78 | from myfm.utils.benchmark_data import MovieLens100kDataManager 79 | 80 | data_manager = MovieLens100kDataManager() 81 | df_train, df_test = data_manager.load_rating_predefined_split( 82 | fold=3 83 | ) # Note the dependence on the fold 84 | 85 | def test_myfm(df_train, df_test, rank=8, grouping=None, n_iter=100, samples=95): 86 | explanation_columns = ["user_id", "movie_id"] 87 | ohe = OneHotEncoder(handle_unknown="ignore") 88 | X_train = ohe.fit_transform(df_train[explanation_columns]) 89 | X_test = ohe.transform(df_test[explanation_columns]) 90 | y_train = df_train.rating.values 91 | y_test = df_test.rating.values 92 | fm = myfm.MyFMRegressor(rank=rank, random_seed=114514) 93 | 94 | if grouping: 95 | # specify how columns of X_train are grouped 96 | group_shapes = [len(category) for category in ohe.categories_] 97 | assert sum(group_shapes) == X_train.shape[1] 98 | else: 99 | group_shapes = None 100 | 101 | fm.fit( 102 | X_train, 103 | y_train, 104 | group_shapes=group_shapes, 105 | n_iter=n_iter, 106 | n_kept_samples=samples, 107 | ) 108 | prediction = fm.predict(X_test) 109 | rmse = ((y_test - prediction) ** 2).mean() ** 0.5 110 | mae = np.abs(y_test - prediction).mean() 111 | print("rmse={rmse}, mae={mae}".format(rmse=rmse, mae=mae)) 112 | return fm 113 | 114 | 115 | # basic regression 116 | test_myfm(df_train, df_test, rank=8) 117 | # rmse=0.90321, mae=0.71164 118 | 119 | # with grouping 120 | fm = test_myfm(df_train, df_test, rank=8, grouping=True) 121 | # rmse=0.89594, mae=0.70481 122 | ``` 123 | 124 | ## Examples for Relational Data format 125 | 126 | Below is a toy Movielens-like example that utilizes the relational data format proposed in [3]. 127 | 128 | This example, however, is too simplistic to exhibit the computational advantage of this data format.
For an example with drastically reduced computational complexity, see `examples/ml-100k-extended.ipynb`; 129 | 130 | ```python 131 | import pandas as pd 132 | import numpy as np 133 | from myfm import MyFMRegressor, RelationBlock 134 | from sklearn.preprocessing import OneHotEncoder 135 | 136 | users = pd.DataFrame([ 137 | {'user_id': 1, 'age': '20s', 'married': False}, 138 | {'user_id': 2, 'age': '30s', 'married': False}, 139 | {'user_id': 3, 'age': '40s', 'married': True} 140 | ]).set_index('user_id') 141 | 142 | movies = pd.DataFrame([ 143 | {'movie_id': 1, 'comedy': True, 'action': False }, 144 | {'movie_id': 2, 'comedy': False, 'action': True }, 145 | {'movie_id': 3, 'comedy': True, 'action': True} 146 | ]).set_index('movie_id') 147 | 148 | ratings = pd.DataFrame([ 149 | {'user_id': 1, 'movie_id': 1, 'rating': 2}, 150 | {'user_id': 1, 'movie_id': 2, 'rating': 5}, 151 | {'user_id': 2, 'movie_id': 2, 'rating': 4}, 152 | {'user_id': 2, 'movie_id': 3, 'rating': 3}, 153 | {'user_id': 3, 'movie_id': 3, 'rating': 3}, 154 | ]) 155 | 156 | user_ids, user_indices = np.unique(ratings.user_id, return_inverse=True) 157 | movie_ids, movie_indices = np.unique(ratings.movie_id, return_inverse=True) 158 | 159 | user_ohe = OneHotEncoder(handle_unknown='ignore').fit(users.reset_index()) # include user id as feature 160 | movie_ohe = OneHotEncoder(handle_unknown='ignore').fit(movies.reset_index()) 161 | 162 | X_user = user_ohe.transform( 163 | users.reindex(user_ids).reset_index() 164 | ) 165 | X_movie = movie_ohe.transform( 166 | movies.reindex(movie_ids).reset_index() 167 | ) 168 | 169 | block_user = RelationBlock(user_indices, X_user) 170 | block_movie = RelationBlock(movie_indices, X_movie) 171 | 172 | fm = MyFMRegressor(rank=2).fit(None, ratings.rating, X_rel=[block_user, block_movie]) 173 | 174 | prediction_df = pd.DataFrame([ 175 | dict(user_id=user_id,movie_id=movie_id, 176 | user_index=user_index, movie_index=movie_index) 177 | for user_index, user_id in enumerate(user_ids) 178 | for movie_index, movie_id in enumerate(movie_ids) 179 | ]) 180 | predicted_rating = fm.predict(None, [ 181 | RelationBlock(prediction_df.user_index, X_user), 182 | RelationBlock(prediction_df.movie_index, X_movie) 183 | ]) 184 | 185 | prediction_df['prediction'] = predicted_rating 186 | 187 | print( 188 | prediction_df.merge(ratings.rename(columns={'rating':'ground_truth'}), how='left') 189 | ) 190 | ``` 191 | 192 | # References 193 | 194 | 1. Rendle, Steffen. "Factorization machines." 2010 IEEE International Conference on Data Mining. IEEE, 2010. 195 | 1. Rendle, Steffen. "Factorization machines with libfm." ACM Transactions on Intelligent Systems and Technology (TIST) 3.3 (2012): 57. 196 | 1. Rendle, Steffen. "Scaling factorization machines to relational data." Proceedings of the VLDB Endowment. Vol. 6. No. 5. VLDB Endowment, 2013. 197 | 1. Bayer, Immanuel. "fastfm: A library for factorization machines." arXiv preprint arXiv:1505.00641 (2015). 198 | 1. Albert, James H., and Siddhartha Chib. "Bayesian analysis of binary and polychotomous response data." Journal of the American statistical Association 88.422 (1993): 669-679. 199 | 1. Albert, James H., and Siddhartha Chib. "Sequential ordinal modeling with applications to survival data." Biometrics 57.3 (2001): 829-836. 
200 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | ignore: 2 | - "**/Faddeeva.*" 3 | -------------------------------------------------------------------------------- /cpp_source/bind.cpp: -------------------------------------------------------------------------------- 1 | #include "declare_module.hpp" 2 | 3 | PYBIND11_MODULE(_myfm, m) { 4 | declare_functional(m); 5 | } 6 | -------------------------------------------------------------------------------- /cpp_source/bind_float.cpp: -------------------------------------------------------------------------------- 1 | #include "declare_module.hpp" 2 | 3 | PYBIND11_MODULE(_myfm_float, m) { 4 | declare_functional(m); 5 | } 6 | -------------------------------------------------------------------------------- /doc/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /doc/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx==4.4.0 2 | -------------------------------------------------------------------------------- /doc/source/api_reference.rst: -------------------------------------------------------------------------------- 1 | .. _APIReference: 2 | 3 | ============== 4 | API References 5 | ============== 6 | 7 | .. currentmodule:: myfm 8 | 9 | Training API 10 | ------------ 11 | .. autosummary:: 12 | :toctree: api_reference 13 | 14 | RelationBlock 15 | MyFMRegressor 16 | MyFMClassifier 17 | MyFMGibbsRegressor 18 | MyFMGibbsClassifier 19 | MyFMOrderedProbit 20 | VariationalFMRegressor 21 | VariationalFMClassifier 22 | 23 | .. currentmodule:: myfm 24 | 25 | Benchmark Dataset 26 | ----------------- 27 | .. autosummary:: 28 | :toctree: api_reference 29 | 30 | utils.benchmark_data.MovieLens100kDataManager 31 | utils.benchmark_data.MovieLens1MDataManager 32 | utils.benchmark_data.MovieLens10MDataManager 33 | 34 | 35 | Utilities for Sparse Matrix Construction 36 | ---------------------------------------- 37 | 38 | .. autosummary:: 39 | :toctree: api_reference 40 | 41 | utils.encoders.DataFrameEncoder 42 | utils.encoders.CategoryValueToSparseEncoder 43 | utils.encoders.MultipleValuesToSparseEncoder 44 | utils.encoders.BinningEncoder 45 | -------------------------------------------------------------------------------- /doc/source/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. 
For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | 14 | # sys.path.insert(0, os.path.abspath('../../')) 15 | 16 | # -- Project information ----------------------------------------------------- 17 | 18 | project = "myFM" 19 | copyright = "2020, Tomoki Ohtsuki" 20 | author = "Tomoki Ohtsuki" 21 | 22 | # The full version, including alpha/beta/rc tags 23 | release = "0.2.1" 24 | 25 | 26 | # -- General configuration --------------------------------------------------- 27 | 28 | # Add any Sphinx extension module names here, as strings. They can be 29 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 30 | # ones. 31 | extensions = [ 32 | "sphinx.ext.autodoc", 33 | "sphinx.ext.autosummary", 34 | "sphinx.ext.todo", 35 | "sphinx.ext.doctest", 36 | "sphinx.ext.viewcode", 37 | "sphinx.ext.autodoc", 38 | "sphinx.ext.napoleon", 39 | "sphinx_rtd_theme", 40 | ] 41 | 42 | napoleon_google_docstring = False 43 | napoleon_numpy_docstring = True 44 | napoleon_include_private_with_doc = False 45 | napoleon_include_special_with_doc = False 46 | napoleon_use_admonition_for_examples = False 47 | napoleon_use_admonition_for_notes = False 48 | napoleon_use_admonition_for_references = False 49 | napoleon_use_ivar = True 50 | napoleon_use_param = True 51 | napoleon_use_rtype = True 52 | 53 | 54 | autosummary_generate = ["api_reference.rst"] 55 | 56 | 57 | autodoc_default_flags = ["members", "inherited-members", "show-inheritance"] 58 | autodoc_default_options = { 59 | "members": True, 60 | "inherited-members": True, 61 | "show-inheritance": True, 62 | } 63 | 64 | 65 | autoclass_content = "class" 66 | # Add any paths that contain templates here, relative to this directory. 67 | templates_path = ["_templates"] 68 | 69 | # List of patterns, relative to source directory, that match files and 70 | # directories to ignore when looking for source files. 71 | # This pattern also affects html_static_path and html_extra_path. 72 | exclude_patterns = [] 73 | 74 | 75 | # -- Options for HTML output ------------------------------------------------- 76 | 77 | # The theme to use for HTML and HTML Help pages. See the documentation for 78 | # a list of builtin themes. 79 | # 80 | html_theme = "sphinx_rtd_theme" 81 | 82 | # Add any paths that contain custom static files (such as style sheets) here, 83 | # relative to this directory. They are copied after the builtin static files, 84 | # so a file named "default.css" will overwrite the builtin "default.css". 85 | html_static_path = ["_static"] 86 | 87 | master_doc = "index" 88 | -------------------------------------------------------------------------------- /doc/source/index.rst: -------------------------------------------------------------------------------- 1 | .. myfm documentation master file, created by 2 | sphinx-quickstart on Wed Aug 19 13:39:04 2020. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 
5 | 6 | 7 | myFM - Bayesian Factorization Machines in Python/C++ 8 | ==================================================== 9 | 10 | **myFM** is an unofficial implementation of Bayesian Factorization Machines in Python/C++. 11 | Notable features include: 12 | 13 | * Implementation of all the corresponding functionalities in the `libFM `_ MCMC engine (including grouping & relation blocks) 14 | * A simpler and faster implementation using `Pybind11 `_ and `Eigen `_ 15 | * Gibbs sampling for **ordinal regression** with a probit link function. See :ref:`the tutorial ` for its usage. 16 | * Support for variational inference, which converges faster and requires less memory (but is usually less accurate than Gibbs sampling). 17 | 18 | 19 | In most cases, you can install the library from PyPI: :: 20 | 21 | pip install myfm 22 | 23 | It has an interface similar to sklearn's, and you can use it for a wide variety of prediction tasks. 24 | For example, 25 | 26 | .. testcode:: 27 | 28 | from sklearn.datasets import load_breast_cancer 29 | from sklearn.model_selection import train_test_split 30 | from sklearn.preprocessing import StandardScaler 31 | from sklearn import metrics 32 | 33 | from myfm import MyFMClassifier 34 | 35 | dataset = load_breast_cancer() 36 | X = StandardScaler().fit_transform(dataset['data']) 37 | y = dataset['target'] 38 | 39 | X_train, X_test, y_train, y_test = train_test_split( 40 | X, y, random_state=42 41 | ) 42 | fm = MyFMClassifier(rank=2).fit(X_train, y_train) 43 | 44 | print(metrics.roc_auc_score(y_test, fm.predict_proba(X_test))) 45 | # 0.9954 46 | 47 | .. testoutput:: 48 | :hide: 49 | :options: +ELLIPSIS 50 | 51 | 0.99... 52 | 53 | 54 | Try out the following :ref:`examples ` to see how Bayesian approaches to explicit collaborative filtering 55 | are still very competitive (almost unbeaten)! 56 | 57 | .. toctree:: 58 | :caption: Basic Usage 59 | :maxdepth: 1 60 | 61 | quickstart 62 | movielens 63 | relation-blocks 64 | ordinal-regression 65 | 66 | .. toctree:: 67 | :caption: Details 68 | :maxdepth: 1 69 | 70 | api_reference 71 | 72 | 73 | Indices and tables 74 | ================== 75 | 76 | * :ref:`genindex` 77 | * :ref:`search` 78 | -------------------------------------------------------------------------------- /doc/source/movielens.rst: -------------------------------------------------------------------------------- 1 | .. _MovielensIndex: 2 | 3 | ========================================= 4 | A Basic Tutorial with Movielens 100K 5 | ========================================= 6 | 7 | FMs perform remarkably well on datasets with huge and sparse feature matrices, 8 | and the most common examples are (explicit) collaborative filtering tasks. 9 | 10 | Let us examine the power of Bayesian Factorization Machines 11 | by testing a series of APIs in myFM on the well-known Movielens 100k dataset. 12 | 13 | 14 | ------------------------- 15 | Pure Matrix Factorization 16 | ------------------------- 17 | 18 | First, let us consider probabilistic Matrix Factorization. 19 | That is, we model the user :math:`u`'s rating response to movie :math:`i`, 20 | which we write :math:`r_{ui}`, as 21 | 22 | .. math:: 23 | r_{ui} \sim w_0 + b_u + d_i + \vec{u}_u \cdot \vec{v}_i 24 | 25 | This formulation is equivalent to Factorization Machines with 26 | 27 | 1. User IDs treated as a categorical feature with one-hot encoding 28 | 2. 
Movie IDs treated as a categorical feature with one-hot encoding 29 | 30 | So you can efficiently use an encoder like sklearn's `OneHotEncoder `_ 31 | to prepare the input matrix. 32 | 33 | .. testcode :: 34 | 35 | import numpy as np 36 | from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder 37 | from sklearn import metrics 38 | 39 | import myfm 40 | from myfm.utils.benchmark_data import MovieLens100kDataManager 41 | 42 | FM_RANK = 10 43 | 44 | data_manager = MovieLens100kDataManager() 45 | df_train, df_test = data_manager.load_rating_predefined_split(fold=3) 46 | 47 | FEATURE_COLUMNS = ['user_id', 'movie_id'] 48 | ohe = OneHotEncoder(handle_unknown='ignore') 49 | 50 | X_train = ohe.fit_transform(df_train[FEATURE_COLUMNS]) 51 | X_test = ohe.transform(df_test[FEATURE_COLUMNS]) 52 | y_train = df_train.rating.values 53 | y_test = df_test.rating.values 54 | 55 | fm = myfm.MyFMRegressor(rank=FM_RANK, random_seed=42) 56 | fm.fit(X_train, y_train, n_iter=200, n_kept_samples=200) 57 | 58 | prediction = fm.predict(X_test) 59 | rmse = ((y_test - prediction) ** 2).mean() ** .5 60 | mae = np.abs(y_test - prediction).mean() 61 | print(f'rmse={rmse}, mae={mae}') 62 | 63 | .. testoutput:: 64 | :hide: 65 | :options: +ELLIPSIS 66 | 67 | rmse=..., mae=... 68 | 69 | The above script should give you RMSE=0.8944, MAE=0.7031, which is already 70 | impressive compared with other recent methods. 71 | 72 | .. _grouping: 73 | 74 | ------------------------------------------- 75 | Assuming Separate Variance for movie & user 76 | ------------------------------------------- 77 | 78 | In Probabilistic Matrix Factorization, we usually assume 79 | user vectors and item vectors are drawn from separate normal priors: 80 | 81 | .. math:: 82 | \vec{u}_u & \sim \mathcal{N}(\mu_U, \Sigma_U) \\ 83 | \vec{v}_i & \sim \mathcal{N}(\mu_I, \Sigma_I) 84 | 85 | However, we haven't provided any information about which columns are users' and items'. 86 | 87 | You can give :py:class:`myfm.MyFMRegressor` this information (i.e., which parameters share a common mean and variance) via the ``group_shapes`` option: 88 | 89 | .. testcode :: 90 | 91 | fm_grouped = myfm.MyFMRegressor( 92 | rank=FM_RANK, random_seed=42, 93 | ) 94 | fm_grouped.fit( 95 | X_train, y_train, n_iter=200, n_kept_samples=200, 96 | group_shapes=[len(group) for group in ohe.categories_] 97 | ) 98 | 99 | prediction_grouped = fm_grouped.predict(X_test) 100 | rmse = ((y_test - prediction_grouped) ** 2).mean() ** .5 101 | mae = np.abs(y_test - prediction_grouped).mean() 102 | print(f'rmse={rmse}, mae={mae}') 103 | 104 | .. testoutput:: 105 | :hide: 106 | :options: +ELLIPSIS 107 | 108 | rmse=..., mae=... 109 | 110 | 111 | This will slightly improve the performance to RMSE=0.8925, MAE=0.7001. 112 | 113 | 114 | ------------------------------------------- 115 | Adding Side information 116 | ------------------------------------------- 117 | 118 | It is straightforward to include user/item side information. 119 | 120 | First, we retrieve the side information from ``MovieLens100kDataManager``: 121 | 122 | .. 
testcode :: 123 | 124 | user_info = data_manager.load_user_info().set_index('user_id') 125 | user_info["age"] = user_info.age // 5 * 5 126 | user_info["zipcode"] = user_info.zipcode.str[0] 127 | user_info_ohe = OneHotEncoder(handle_unknown='ignore').fit(user_info) 128 | 129 | movie_info = data_manager.load_movie_info().set_index('movie_id') 130 | movie_info['release_year'] = [ 131 | str(x) for x in movie_info['release_date'].dt.year.fillna('NaN') 132 | ] 133 | movie_info = movie_info[['release_year', 'genres']] 134 | movie_info_ohe = OneHotEncoder(handle_unknown='ignore').fit(movie_info[['release_year']]) 135 | movie_genre_mle = MultiLabelBinarizer(sparse_output=True).fit( 136 | movie_info.genres.apply(lambda x: x.split('|')) 137 | ) 138 | 139 | 140 | 141 | Note that the way movie genre information is represented in the ``movie_info`` DataFrame is a bit tricky (all the genres of a movie are packed into a single pipe-separated string). 142 | 143 | We can then augment ``X_train`` / ``X_test`` with auxiliary information. The `hstack `_ function of ``scipy.sparse`` is very convenient for this purpose: 144 | 145 | .. testcode :: 146 | 147 | import scipy.sparse as sps 148 | X_train_extended = sps.hstack([ 149 | X_train, 150 | user_info_ohe.transform( 151 | user_info.reindex(df_train.user_id) 152 | ), 153 | movie_info_ohe.transform( 154 | movie_info.reindex(df_train.movie_id).drop(columns=['genres']) 155 | ), 156 | movie_genre_mle.transform( 157 | movie_info.genres.reindex(df_train.movie_id).apply(lambda x: x.split('|')) 158 | ) 159 | ]) 160 | 161 | X_test_extended = sps.hstack([ 162 | X_test, 163 | user_info_ohe.transform( 164 | user_info.reindex(df_test.user_id) 165 | ), 166 | movie_info_ohe.transform( 167 | movie_info.reindex(df_test.movie_id).drop(columns=['genres']) 168 | ), 169 | movie_genre_mle.transform( 170 | movie_info.genres.reindex(df_test.movie_id).apply(lambda x: x.split('|')) 171 | ) 172 | ]) 173 | 174 | Then we can regress ``X_train_extended`` against ``y_train`` 175 | 176 | .. testcode :: 177 | 178 | group_shapes_extended = ( 179 | [len(group) for group in ohe.categories_] + 180 | [len(group) for group in user_info_ohe.categories_] + 181 | [len(group) for group in movie_info_ohe.categories_] + 182 | [len(movie_genre_mle.classes_)] 183 | ) 184 | 185 | fm_side_info = myfm.MyFMRegressor( 186 | rank=FM_RANK, random_seed=42, 187 | ) 188 | fm_side_info.fit( 189 | X_train_extended, y_train, n_iter=200, n_kept_samples=200, 190 | group_shapes=group_shapes_extended 191 | ) 192 | 193 | prediction_side_info = fm_side_info.predict(X_test_extended) 194 | rmse = ((y_test - prediction_side_info) ** 2).mean() ** .5 195 | mae = np.abs(y_test - prediction_side_info).mean() 196 | print(f'rmse={rmse}, mae={mae}') 197 | 198 | .. testoutput:: 199 | :hide: 200 | :options: +ELLIPSIS 201 | 202 | rmse=..., mae=... 203 | 204 | The result should improve further with RMSE = 0.8855, MAE = 0.6944. 205 | 206 | Unfortunately, the running time is somewhat (~ 4 times) slower compared to 207 | the pure matrix factorization described above. This is to be expected: 208 | the complexity of Bayesian FMs is proportional to :math:`O(\mathrm{NNZ})` 209 | (i.e., the number of non-zero elements of the input sparse matrix), 210 | and we have added many non-zero elements (user/item features) to each row. 211 | 212 | Surprisingly, we can still train the equivalent model 213 | in a running time close to pure MF if we represent the data in the Relational Data Format. 214 | See :ref:`next section ` for how the Relational Data Format works.
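Before moving on, here is a rough sanity check of the :math:`O(\mathrm{NNZ})` scaling discussed above — a minimal sketch reusing the matrices already built in this section; the per-row NNZ ratio should roughly track the observed slowdown:

.. testcode ::

    # average number of non-zero elements per row, with and without side information
    nnz_per_row_plain = X_train.nnz / X_train.shape[0]
    nnz_per_row_extended = X_train_extended.nnz / X_train_extended.shape[0]
    print(nnz_per_row_extended / nnz_per_row_plain > 1.0)

.. testoutput ::
    :hide:

    True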
215 | -------------------------------------------------------------------------------- /doc/source/ordinal-regression.rst: -------------------------------------------------------------------------------- 1 | 2 | .. currentmodule:: myfm 3 | .. _OrdinalRegression: 4 | 5 | ---------------------------- 6 | Ordinal Regression Tutorial 7 | ---------------------------- 8 | 9 | .. _OrdinalUCLA: 10 | 11 | UCLA Dataset 12 | ^^^^^^^^^^^^^^^^^^^^^^ 13 | 14 | Let us first explain the API of :py:class:`myfm.MyFMOrderedProbit` 15 | using the `UCLA dataset `_. 16 | 17 | The data description says 18 | 19 | This hypothetical data set has a three level variable called apply, with levels “unlikely”, “somewhat likely”, and “very likely”, coded 1, 2, and 3, respectively, that we will use as our outcome variable. We also have three variables that we will use as predictors: pared, which is a 0/1 variable indicating whether at least one parent has a graduate degree; public, which is a 0/1 variable where 1 indicates that the undergraduate institution is public and 0 private, and gpa, which is the student’s grade point average. 20 | 21 | We can read the data (in Stata format) using pandas: 22 | 23 | .. testcode :: 24 | 25 | import pandas as pd 26 | df = pd.read_stata("https://stats.idre.ucla.edu/stat/data/ologit.dta") 27 | df.head() 28 | 29 | It should print 30 | 31 | .. csv-table:: 32 | :header-rows: 1 33 | 34 | ,apply,pared,public,gpa 35 | 0,very likely,0,0,3.26 36 | 1,somewhat likely,1,0,3.21 37 | 2,unlikely,1,1,3.94 38 | 3,somewhat likely,0,0,2.81 39 | 4,somewhat likely,0,0,2.53 40 | 41 | We regard the target label ``apply`` as an ordinal categorical variable, 42 | 43 | .. math:: 44 | (\text{unlikely} = 0) < (\text{somewhat likely} = 1) < (\text{very likely} = 2) 45 | 46 | so we map ``apply`` as 47 | 48 | .. testcode :: 49 | 50 | y = df['apply'].map({'unlikely': 0, 'somewhat likely': 1, 'very likely': 2}).values 51 | 52 | Prepare other features as usual. 53 | 54 | .. testcode :: 55 | 56 | from sklearn.model_selection import train_test_split 57 | from sklearn import metrics 58 | 59 | X = df[['pared', 'public', 'gpa']].values 60 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 61 | 62 | Now we can feed the data into :py:class:`myfm.MyFMOrderedProbit`. 63 | 64 | .. testcode :: 65 | 66 | from myfm import MyFMOrderedProbit 67 | clf = MyFMOrderedProbit(rank=0).fit(X_train, y_train, n_iter=200) 68 | 69 | p = clf.predict_proba(X_test) 70 | 71 | print(f'log_loss={metrics.log_loss(y_test, p)}') 72 | # ~ 0.84, slightly better than the constant-model baseline. 73 | 74 | .. testoutput :: 75 | :hide: 76 | :options: +ELLIPSIS 77 | 78 | log_loss=... 79 | 80 | Note that unlike binary probit regression, :py:meth:`MyFMOrderedProbit.predict_proba` 81 | returns a 2D (N_item x N_class) array of class probabilities. 82 | 83 | Movielens ratings as ordinal outcome 84 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 85 | 86 | Let us now turn back to the :ref:`Movielens 100K tutorial `. 87 | 88 | Although we have treated movie ratings as a real target variable 89 | and used :py:class:`MyFMRegressor`, it is more natural to regard them 90 | as ordinal outcomes, as there is no guarantee that the difference between ratings 4 and 5 91 | is equivalent to that between ratings 2 and 3. 92 | 93 | So let us see what happens if we instead use :py:class:`MyFMOrderedProbit` to predict the rating. 94 | If you have followed the steps through :ref:`the previous ''grouping'' section `, 95 | you can train our ordered probit regressor by 96 | 97 | .. 
testcode :: 98 | 99 | import numpy as np 100 | from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder 101 | from sklearn import metrics 102 | 103 | import myfm 104 | from myfm.utils.benchmark_data import MovieLens100kDataManager 105 | 106 | FM_RANK = 10 107 | 108 | data_manager = MovieLens100kDataManager() 109 | df_train, df_test = data_manager.load_rating_predefined_split(fold=3) 110 | 111 | FEATURE_COLUMNS = ['user_id', 'movie_id'] 112 | ohe = OneHotEncoder(handle_unknown='ignore') 113 | 114 | X_train = ohe.fit_transform(df_train[FEATURE_COLUMNS]) 115 | X_test = ohe.transform(df_test[FEATURE_COLUMNS]) 116 | y_train = df_train.rating.values 117 | y_test = df_test.rating.values 118 | 119 | fm = myfm.MyFMOrderedProbit( 120 | rank=FM_RANK, random_seed=42, 121 | ) 122 | fm.fit( 123 | X_train, y_train - 1, n_iter=300, n_kept_samples=300, 124 | group_shapes=[len(group) for group in ohe.categories_] 125 | ) 126 | 127 | Note that we have used ``y_train - 1`` instead of ``y_train``, 128 | because rating ``r`` should be regarded as class ``r-1``. 129 | 130 | 131 | We can predict the class probabilities given ``X_test`` as 132 | 133 | .. testcode :: 134 | 135 | p_ordinal = fm.predict_proba(X_test) 136 | 137 | and the expected rating as 138 | 139 | .. testcode :: 140 | 141 | expected_rating = p_ordinal.dot(np.arange(1, 6)) 142 | rmse = ((y_test - expected_rating) ** 2).mean() ** .5 143 | mae = np.abs(y_test - expected_rating).mean() 144 | print(f'rmse={rmse}, mae={mae}') 145 | 146 | .. testoutput :: 147 | :hide: 148 | :options: +ELLIPSIS 149 | 150 | rmse=..., mae=... 151 | 152 | 153 | which gives us RMSE=0.8906 and MAE=0.6985, a slight improvement over the regression case. 154 | 155 | To see why it has an advantage over regression, let us check 156 | the posterior samples for the cutpoint parameters. 157 | 158 | .. testcode :: 159 | 160 | cutpoints = fm.cutpoint_samples - fm.w0_samples[:, None] 161 | 162 | You can see what the relationship between rating boundaries and cutpoints looks like:: 163 | 164 | from matplotlib import pyplot as plt 165 | cp_mean = cutpoints.mean(axis=0) 166 | cp_std = cutpoints.std(axis=0) 167 | 168 | plt.plot(np.arange(1, 5), cp_mean); 169 | plt.fill_between( 170 | np.arange(1, 5), cp_mean + 2*cp_std, cp_mean - 2 * cp_std, 171 | alpha=0.3 172 | ) 173 | plt.title('rating boundary vs cutpoint') 174 | 175 | This will give you the following figure. The line is slightly non-linear, 176 | which may explain the advantage of the ordinal-regression formulation. 177 | 178 | .. image:: ./rating_vs_cps.png 179 | :alt: The relationship between cutpoints and rating boundaries is shown. 180 | :width: 50% 181 | 182 | You can also improve the performance for the Movielens 1M & 10M datasets. 183 | See our `examples `_ directory. 184 | -------------------------------------------------------------------------------- /doc/source/quickstart.rst: -------------------------------------------------------------------------------- 1 | =========== 2 | Quick Start 3 | =========== 4 | 5 | 6 | ------------ 7 | Installation 8 | ------------ 9 | 10 | On macOS/Linux, first try:: 11 | 12 | pip install myfm 13 | 14 | If it works, you can now try the examples. 15 | 16 | If something goes wrong, read the :ref:`detailed installation guide ` 17 | and figure out what went wrong. 18 | Of course, feel free to create an issue on `GitHub `_! 19 | 20 | 21 | ------------- 22 | A toy example 23 | ------------- 24 | 25 | Let us first look at how :py:class:`myfm.MyFMClassifier` works for `a toy example provided in pyFM `_.
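(This is the same toy data as in the README example; here we additionally call ``predict_proba`` to obtain class probabilities instead of hard labels.)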
26 | 27 | .. doctest :: 28 | 29 | import myfm 30 | from sklearn.feature_extraction import DictVectorizer 31 | import numpy as np 32 | train = [ 33 | {"user": "1", "item": "5", "age": 19}, 34 | {"user": "2", "item": "43", "age": 33}, 35 | {"user": "3", "item": "20", "age": 55}, 36 | {"user": "4", "item": "10", "age": 20}, 37 | ] 38 | v = DictVectorizer() 39 | 40 | X = v.fit_transform(train) 41 | 42 | # Note that X is a sparse matrix 43 | print(X.toarray()) 44 | 45 | # The target variable to be classified. 46 | y = np.asarray([0, 1, 1, 0]) 47 | fm = myfm.MyFMClassifier(rank=4) 48 | fm.fit(X, y) 49 | 50 | # It also supports prediction for new unseen items. 51 | fm.predict_proba(v.transform([{"user": "1", "item": "10", "age": 24}])) 52 | 53 | .. testoutput :: 54 | :hide: 55 | :options: +ELLIPSIS 56 | 57 | [[ 19. 0. 0. 0. 1. 1. 0. 0. 0.] 58 | [ 33. 0. 0. 1. 0. 0. 1. 0. 0.] 59 | [ 55. 0. 1. 0. 0. 0. 0. 1. 0.] 60 | [ 20. 1. 0. 0. 0. 0. 0. 0. 1.]] 61 | 62 | 63 | As the example suggests, :py:class:`myfm.MyFMClassifier` takes 64 | sparse matrices of `scipy.sparse `_ as its input. 65 | In the above example, `sklearn's DictVectorizer `_ 66 | transforms the categorical variables (user id and movie id) into one-hot encoded vectors. 67 | 68 | As you can see, :py:class:`MyFMClassifier` can make predictions against 69 | new (unseen) items despite the fact that it is an MCMC solver. 70 | This is possible because it simply retains all the intermediate (noisy) samples. 71 | 72 | For a more practical example with larger data, move on to the :ref:`Movielens examples `. 73 | -------------------------------------------------------------------------------- /doc/source/rating_vs_cps.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tohtsky/myFM/b9ba70ea38d9370d3ad50a9d25b2ff825eaa30ef/doc/source/rating_vs_cps.png -------------------------------------------------------------------------------- /doc/source/relation-blocks.rst: -------------------------------------------------------------------------------- 1 | .. _RelationBlockTutorial : 2 | 3 | -------------------------------------- 4 | TimeSVD++ Flipped with Relation Blocks 5 | -------------------------------------- 6 | 7 | As mentioned in the :ref:`Movielens example `, 8 | the complexity of Bayesian FMs is proportional to :math:`O(\mathrm{NNZ})`. 9 | This is especially troublesome when we include SVD++-like features in the feature matrix. 10 | In such a case, for each user, we include all of the item IDs that the user has interacted with, 11 | and the complexity grows further by a factor of :math:`O(\mathrm{NNZ} / N_U)`. 12 | 13 | However, we can get away with this catastrophic complexity if we notice the repeated pattern in the input matrix. 14 | Interested readers can refer to `[Rendle, '13] `_ 15 | and `libFM's Manual `_ for details. 16 | 17 | Below, let us see how we can efficiently incorporate SVD++-like features 18 | using the relational data format, again on the Movielens 100K dataset. 19 | 20 | ^^^^^^^^^^^^^^^^^^^^^^^^ 21 | Building SVD++ Features 22 | ^^^^^^^^^^^^^^^^^^^^^^^^ 23 | 24 | In `[Rendle, et al., '19] `_, 25 | in addition to the user/movie id, they made use of the following features to improve the accuracy considerably: 26 | 27 | 1. User Implicit Features: All the movies the user had watched 28 | 2. Movie Implicit Features: All the users who have watched the movie 29 | 3. 
Time Variable: The day of the watch event (regarded as a categorical variable) 30 | 31 | Let us construct these features. 32 | 33 | .. testcode :: 34 | 35 | from collections import defaultdict 36 | import numpy as np 37 | from sklearn.preprocessing import OneHotEncoder 38 | from sklearn import metrics 39 | import myfm 40 | from myfm import RelationBlock 41 | from scipy import sparse as sps 42 | 43 | from myfm.utils.benchmark_data import MovieLens100kDataManager 44 | 45 | data_manager = MovieLens100kDataManager() 46 | 47 | # fold 1 is the toughest one 48 | df_train, df_test = data_manager.load_rating_predefined_split(fold=1) 49 | 50 | date_ohe = OneHotEncoder(handle_unknown='ignore').fit( 51 | df_train.timestamp.dt.date.values.reshape(-1, 1) 52 | ) 53 | def categorize_date(df): 54 | return date_ohe.transform(df.timestamp.dt.date.values[:, np.newaxis]) 55 | 56 | # index "0" is reserved for unknown ids. 57 | user_to_index = defaultdict(lambda: 0, { uid: i+1 for i,uid in enumerate(np.unique(df_train.user_id)) }) 58 | movie_to_index = defaultdict(lambda: 0, { mid: i+1 for i,mid in enumerate(np.unique(df_train.movie_id))}) 59 | USER_ID_SIZE = len(user_to_index) + 1 60 | MOVIE_ID_SIZE = len(movie_to_index) + 1 61 | 62 | Above, we constructed dictionaries which map user/movie ids to the corresponding indices. 63 | We have reserved the index ''0'' for unknown users/movies. 64 | 65 | To do the feature-engineering stated above, we have to memoize which users/movies had interactions with which movies/users. 66 | 67 | .. testcode :: 68 | 69 | # The flags to control the included features. 70 | use_date = True # use date info or not 71 | use_iu = True # use implicit user feature 72 | use_ii = True # use implicit item feature 73 | 74 | movie_vs_watched = dict() 75 | user_vs_watched = dict() 76 | for row in df_train.itertuples(): 77 | user_id = row.user_id 78 | movie_id = row.movie_id 79 | movie_vs_watched.setdefault(movie_id, list()).append(user_id) 80 | user_vs_watched.setdefault(user_id, list()).append(movie_id) 81 | 82 | if use_date: 83 | X_date_train = categorize_date(df_train) 84 | X_date_test = categorize_date(df_test) 85 | else: 86 | X_date_train, X_date_test = (None, None) 87 | 88 | 89 | We can then define functions which map a list of user/movie ids to the features represented in sparse matrix format: 90 | 91 | .. 
testcode :: 92 | 93 | # given user/movie ids, add additional info and return it as a sparse matrix 94 | def augment_user_id(user_ids): 95 | Xs = [] 96 | X_uid = sps.lil_matrix((len(user_ids), USER_ID_SIZE)) 97 | for index, user_id in enumerate(user_ids): 98 | X_uid[index, user_to_index[user_id]] = 1 99 | Xs.append(X_uid) 100 | if use_iu: 101 | X_iu = sps.lil_matrix((len(user_ids), MOVIE_ID_SIZE)) 102 | for index, user_id in enumerate(user_ids): 103 | watched_movies = user_vs_watched.get(user_id, []) 104 | normalizer = 1 / max(len(watched_movies), 1) ** 0.5 105 | for mid in watched_movies: 106 | X_iu[index, movie_to_index[mid]] = normalizer 107 | Xs.append(X_iu) 108 | return sps.hstack(Xs, format='csr') 109 | 110 | def augment_movie_id(movie_ids): 111 | Xs = [] 112 | X_movie = sps.lil_matrix((len(movie_ids), MOVIE_ID_SIZE)) 113 | for index, movie_id in enumerate(movie_ids): 114 | X_movie[index, movie_to_index[movie_id]] = 1 115 | Xs.append(X_movie) 116 | 117 | if use_ii: 118 | X_ii = sps.lil_matrix((len(movie_ids), USER_ID_SIZE)) 119 | for index, movie_id in enumerate(movie_ids): 120 | watched_users = movie_vs_watched.get(movie_id, []) 121 | normalizer = 1 / max(len(watched_users), 1) ** 0.5 122 | for uid in watched_users: 123 | X_ii[index, user_to_index[uid]] = normalizer 124 | Xs.append(X_ii) 125 | 126 | 127 | return sps.hstack(Xs, format='csr') 128 | 129 | ^^^^^^^^^^^^ 130 | A naive way 131 | ^^^^^^^^^^^^ 132 | 133 | We now set up the problem in a non-relational way: 134 | 135 | .. testcode :: 136 | 137 | train_uid_unique, train_uid_index = np.unique(df_train.user_id, return_inverse=True) 138 | train_mid_unique, train_mid_index = np.unique(df_train.movie_id, return_inverse=True) 139 | user_data_train = augment_user_id(train_uid_unique) 140 | movie_data_train = augment_movie_id(train_mid_unique) 141 | 142 | test_uid_unique, test_uid_index = np.unique(df_test.user_id, return_inverse=True) 143 | test_mid_unique, test_mid_index = np.unique(df_test.movie_id, return_inverse=True) 144 | user_data_test = augment_user_id(test_uid_unique) 145 | movie_data_test = augment_movie_id(test_mid_unique) 146 | 147 | X_train_naive = sps.hstack([ 148 | X_date_train, 149 | user_data_train[train_uid_index], 150 | movie_data_train[train_mid_index] 151 | ]) 152 | 153 | X_test_naive = sps.hstack([ 154 | X_date_test, 155 | user_data_test[test_uid_index], 156 | movie_data_test[test_mid_index] 157 | ]) 158 | 159 | fm_naive = myfm.MyFMRegressor(rank=10).fit(X_train_naive, df_train.rating, n_iter=3, n_kept_samples=3) 160 | 161 | In my environment, it takes ~ 2s per iteration, which is much slower than the pure MF example. 162 | 163 | 164 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 165 | The problem formulation with RelationBlock. 166 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 167 | 168 | In the above code, we have already seen a hint for optimizing the performance. 169 | The line :: 170 | 171 | user_data_train[train_uid_index], 172 | 173 | says that each row of the sparse matrix ``user_data_train`` appears many times, 174 | and we will compute the same combination of factors repeatedly. 175 | 176 | The role of :py:class:`myfm.RelationBlock` is to make such a repeated pattern explicit 177 | so that we can drastically reduce the complexity. 178 | 179 | 180 | .. 
testcode :: 181 | 182 | block_user_train = RelationBlock(train_uid_index, user_data_train) 183 | block_movie_train = RelationBlock(train_mid_index, movie_data_train) 184 | block_user_test = RelationBlock(test_uid_index, user_data_test) 185 | block_movie_test = RelationBlock(test_mid_index, movie_data_test) 186 | 187 | We can now feed these blocks into :py:meth:`myfm.MyFMRegressor.fit` by 188 | 189 | .. testcode :: 190 | 191 | fm_rb = myfm.MyFMRegressor(rank=10).fit( 192 | X_date_train, df_train.rating, 193 | X_rel=[block_user_train, block_movie_train], 194 | n_iter=300, n_kept_samples=300 195 | ) 196 | 197 | Note that we cannot express ``X_date_train`` as a relation block, so we have 198 | supplied such non-repeated data as the first argument. 199 | This time, the speed is ~20 iterations/s, an almost 40x speed-up compared to the naive version. 200 | This is also much faster than e.g., `Surprise's implementation of SVD++ `_. 201 | 202 | What the relation format does is to reorganize the computation, 203 | but the result should be the same up to floating point artifacts: 204 | 205 | .. testcode :: 206 | 207 | for i in range(3): 208 | sample_naive = fm_naive.w_samples[i] 209 | sample_rb = fm_rb.w_samples[i] 210 | assert(np.max(np.abs(sample_naive - sample_rb)) < 1e-5) 211 | # the assertions should pass: the differences are tiny 212 | 213 | 214 | The resulting performance measures are RMSE=0.889, MAE=0.7000: 215 | 216 | .. testcode :: 217 | 218 | test_prediction = fm_rb.predict( 219 | X_date_test, 220 | X_rel=[block_user_test, block_movie_test] 221 | ) 222 | rmse = ((df_test.rating.values - test_prediction) ** 2).mean() ** 0.5 223 | mae = np.abs(df_test.rating.values - test_prediction).mean() 224 | print(f'rmse={rmse}, mae={mae}') 225 | 226 | .. testoutput :: 227 | :hide: 228 | :options: +ELLIPSIS 229 | 230 | rmse=..., mae=... 231 | 232 | Note that we still haven't exploited all the available ingredients such as 233 | user/item side-information and :ref:`grouping of the input variables `. 234 | See also `examples notebooks & scripts `_ 235 | for further improved results. 236 | -------------------------------------------------------------------------------- /doc_autobuild.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # a convenient script to run sphinx-autobuild 3 | sphinx-autobuild \ 4 | --host 0.0.0.0 \ 5 | --port 9999 \ 6 | --watch src/myfm/ \ 7 | doc/source doc/build 8 | -------------------------------------------------------------------------------- /examples/ml-100k-regression.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import pickle 3 | from typing import Dict, List, Union 4 | 5 | import numpy as np 6 | import pandas as pd 7 | from scipy import sparse as sps 8 | 9 | import myfm 10 | from myfm import RelationBlock 11 | from myfm.gibbs import MyFMGibbsRegressor, MyFMOrderedProbit 12 | from myfm.utils.benchmark_data.movielens100k_data import MovieLens100kDataManager 13 | from myfm.utils.callbacks import ( 14 | LibFMLikeCallbackBase, 15 | OrderedProbitCallback, 16 | RegressionCallback, 17 | ) 18 | from myfm.utils.encoders import CategoryValueToSparseEncoder 19 | 20 | if __name__ == "__main__": 21 | parser = argparse.ArgumentParser( 22 | description=""" 23 | This script applies the method and evaluation protocol proposed in 24 | the "On the Difficulty of Evaluating Baselines" paper by Rendle et al. 25 | against the smaller Movielens 100K dataset, using myFM.
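Example (run from the examples/ directory): python ml-100k-regression.py 1 -a oprobit -f timesvdpp_flipped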
26 | """, 27 | formatter_class=argparse.ArgumentDefaultsHelpFormatter, 28 | ) 29 | 30 | parser.add_argument( 31 | "fold_index", 32 | type=int, 33 | help="which index set to use as a test within 5-fold predefined CV.", 34 | default=1, 35 | ) 36 | parser.add_argument( 37 | "-a", 38 | "--algorithm", 39 | type=str, 40 | choices=["regression", "oprobit"], 41 | default="regression", 42 | help="specify the output type.", 43 | ) 44 | parser.add_argument( 45 | "-i", "--iteration", type=int, help="mcmc iteration", default=512 46 | ) 47 | parser.add_argument( 48 | "-d", "--dimension", type=int, help="fm embedding dimension", default=10 49 | ) 50 | 51 | parser.add_argument( 52 | "--stricter_protocol", 53 | action="store_true", 54 | help="Whether to use the \"stricter\" protocol (i.e., don't include the test set implicit information) stated in [Rendle, '19].", 55 | default=True, 56 | ) 57 | 58 | parser.add_argument( 59 | "-f", 60 | "--feature", 61 | type=str, 62 | choices=["mf", "svdpp", "timesvd", "timesvdpp", "timesvdpp_flipped"], 63 | help="feature set used in the experiment.", 64 | default="timesvdpp_flipped", 65 | ) 66 | 67 | args = parser.parse_args() 68 | 69 | random_seed = 42 70 | 71 | # Additional features. 72 | # We add 73 | # 1. date of evaluation as categorical variables 74 | # 2. "all users who have evaluated a movie in the train set" or 75 | # 3. "all movies rated by a user" as a feature of user/movie. 76 | if args.feature == "mf": 77 | use_date = False 78 | use_iu = False 79 | use_ii = False 80 | elif args.feature == "svdpp": 81 | use_date = False 82 | use_iu = True 83 | use_ii = False 84 | elif args.feature == "timesvd": 85 | use_date = True 86 | use_iu = False 87 | use_ii = False 88 | elif args.feature == "timesvdpp": 89 | use_date = True 90 | use_iu = True 91 | use_ii = False 92 | elif args.feature == "timesvdpp_flipped": 93 | use_date = True # use date info or not 94 | use_iu = True # use implicit user feature 95 | use_ii = True # use implicit item feature 96 | else: 97 | raise ValueError("unknown feature set specified.") 98 | 99 | FOLD_INDEX = args.fold_index 100 | ITERATION = args.iteration 101 | DIMENSION = args.dimension 102 | if FOLD_INDEX < 1 or FOLD_INDEX >= 6: 103 | raise ValueError("fold_index must be in the range(1, 6).") 104 | ALGORITHM = args.algorithm 105 | data_manager = MovieLens100kDataManager() 106 | df_train, df_test = data_manager.load_rating_predefined_split(fold=FOLD_INDEX) 107 | 108 | if ALGORITHM == "oprobit": 109 | # interpret the rating (1, 2, 3, 4, 5) as class (0, 1, 2, 3, 4). 
110 | for df_ in [df_train, df_test]: 111 | df_["rating"] -= 1 112 | df_["rating"] = df_.rating.astype(np.int32) 113 | 114 | if args.stricter_protocol: 115 | implicit_data_source = df_train 116 | else: 117 | implicit_data_source = pd.concat([df_train, df_test]) 118 | 119 | user_to_internal = CategoryValueToSparseEncoder[int]( 120 | implicit_data_source.user_id.values 121 | ) 122 | movie_to_internal = CategoryValueToSparseEncoder[int]( 123 | implicit_data_source.movie_id.values 124 | ) 125 | 126 | print( 127 | "df_train.shape = {}, df_test.shape = {}".format(df_train.shape, df_test.shape) 128 | ) 129 | # treat the days of events as categorical variable 130 | date_encoder = CategoryValueToSparseEncoder( 131 | implicit_data_source.timestamp.dt.date.values 132 | ) 133 | 134 | def categorize_date(df: pd.DataFrame) -> sps.csr_matrix: 135 | return date_encoder.to_sparse(df.timestamp.dt.date.values) 136 | 137 | movie_vs_watched: Dict[int, List[int]] = dict() 138 | user_vs_watched: Dict[int, List[int]] = dict() 139 | 140 | for row in implicit_data_source.itertuples(): 141 | user_id: int = row.user_id 142 | movie_id: int = row.movie_id 143 | movie_vs_watched.setdefault(movie_id, list()).append(user_id) 144 | user_vs_watched.setdefault(user_id, list()).append(movie_id) 145 | 146 | if use_date: 147 | X_date_train = categorize_date(df_train) 148 | X_date_test = categorize_date(df_test) 149 | else: 150 | X_date_train, X_date_test = (None, None) 151 | 152 | # setup grouping 153 | feature_group_sizes = [] 154 | if use_date: 155 | feature_group_sizes.append( 156 | len(date_encoder), # date 157 | ) 158 | 159 | feature_group_sizes.append(len(user_to_internal)) # user ids 160 | 161 | if use_iu: 162 | # all movies which a user watched 163 | feature_group_sizes.append(len(movie_to_internal)) 164 | 165 | feature_group_sizes.append(len(movie_to_internal)) # movie ids 166 | 167 | if use_ii: 168 | feature_group_sizes.append( 169 | len(user_to_internal) # all the users who watched a movies 170 | ) 171 | 172 | grouping = [i for i, size in enumerate(feature_group_sizes) for _ in range(size)] 173 | 174 | # given user/movie ids, add additional infos and return it as sparse 175 | def augment_user_id(user_ids: List[int]) -> sps.csr_matrix: 176 | X = user_to_internal.to_sparse(user_ids) 177 | if not use_iu: 178 | return X 179 | data: List[float] = [] 180 | row: List[int] = [] 181 | col: List[int] = [] 182 | for index, user_id in enumerate(user_ids): 183 | watched_movies = user_vs_watched.get(user_id, []) 184 | normalizer = 1 / max(len(watched_movies), 1) ** 0.5 185 | for mid in watched_movies: 186 | data.append(normalizer) 187 | col.append(movie_to_internal[mid]) 188 | row.append(index) 189 | return sps.hstack( 190 | [ 191 | X, 192 | sps.csr_matrix( 193 | (data, (row, col)), 194 | shape=(len(user_ids), len(movie_to_internal)), 195 | ), 196 | ], 197 | format="csr", 198 | ) 199 | 200 | def augment_movie_id(movie_ids: List[int]) -> sps.csr_matrix: 201 | X = movie_to_internal.to_sparse(movie_ids) 202 | if not use_ii: 203 | return X 204 | 205 | data: List[float] = [] 206 | row: List[int] = [] 207 | col: List[int] = [] 208 | 209 | for index, movie_id in enumerate(movie_ids): 210 | watched_users = movie_vs_watched.get(movie_id, []) 211 | normalizer = 1 / max(len(watched_users), 1) ** 0.5 212 | for uid in watched_users: 213 | data.append(normalizer) 214 | row.append(index) 215 | col.append(user_to_internal[uid]) 216 | return sps.hstack( 217 | [ 218 | X, 219 | sps.csr_matrix( 220 | (data, (row, col)), 221 | shape=(len(movie_ids), 
len(user_to_internal)), 222 | ), 223 | ], 224 | format="csr") 225 | 226 | # Create RelationBlock. 227 | train_blocks: List[RelationBlock] = [] 228 | test_blocks: List[RelationBlock] = [] 229 | for source, target in [(df_train, train_blocks), (df_test, test_blocks)]: 230 | unique_users, user_map = np.unique(source.user_id, return_inverse=True) 231 | target.append(RelationBlock(user_map, augment_user_id(unique_users))) 232 | unique_movies, movie_map = np.unique(source.movie_id, return_inverse=True) 233 | target.append(RelationBlock(movie_map, augment_movie_id(unique_movies))) 234 | 235 | trace_path = "rmse_{0}_fold_{1}.csv".format(ALGORITHM, FOLD_INDEX) 236 | 237 | callback: LibFMLikeCallbackBase 238 | fm: Union[MyFMGibbsRegressor, MyFMOrderedProbit] 239 | if ALGORITHM == "regression": 240 | fm = myfm.MyFMRegressor(rank=DIMENSION) 241 | callback = RegressionCallback( 242 | n_iter=ITERATION, 243 | X_test=X_date_test, 244 | y_test=df_test.rating.values, 245 | X_rel_test=test_blocks, 246 | clip_min=df_train.rating.min(), 247 | clip_max=df_train.rating.max(), 248 | trace_path=trace_path, 249 | ) 250 | else: 251 | fm = myfm.MyFMOrderedProbit(rank=DIMENSION) 252 | callback = OrderedProbitCallback( 253 | n_iter=ITERATION, 254 | X_test=X_date_test, 255 | y_test=df_test.rating.values, 256 | n_class=5, 257 | X_rel_test=test_blocks, 258 | trace_path=trace_path, 259 | ) 260 | 261 | fm.fit( 262 | X_date_train, 263 | df_train.rating.values, 264 | X_rel=train_blocks, 265 | grouping=grouping, 266 | n_iter=ITERATION, 267 | n_kept_samples=ITERATION, 268 | callback=callback, 269 | ) 270 | with open( 271 | "callback_result_{0}_fold_{1}.pkl".format(ALGORITHM, FOLD_INDEX), "wb" 272 | ) as ofs: 273 | pickle.dump(callback, ofs) 274 | -------------------------------------------------------------------------------- /examples/ml-100k-variational.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from typing import List 3 | 4 | import numpy as np 5 | import pandas as pd 6 | 7 | from myfm import RelationBlock, VariationalFMRegressor 8 | from myfm.utils.benchmark_data.movielens100k_data import MovieLens100kDataManager 9 | from myfm.utils.encoders import ( 10 | CategoryValueToSparseEncoder, 11 | DataFrameEncoder, 12 | MultipleValuesToSparseEncoder, 13 | ) 14 | 15 | if __name__ == "__main__": 16 | parser = argparse.ArgumentParser( 17 | description=""" 18 | This script applies the method and evaluation protocol proposed in the 19 | "On the Difficulty of Evaluating Baselines" paper by Rendle et al. 20 | to the smaller MovieLens 100K dataset, using myFM.
21 | """, 22 | formatter_class=argparse.ArgumentDefaultsHelpFormatter, 23 | ) 24 | 25 | parser.add_argument( 26 | "fold_index", 27 | type=int, 28 | help="which index set to use as a test within 5-fold predefined CV.", 29 | default=1, 30 | ) 31 | 32 | parser.add_argument( 33 | "-i", "--iteration", type=int, help="mcmc iteration", default=512 34 | ) 35 | parser.add_argument( 36 | "-d", "--dimension", type=int, help="fm embedding dimension", default=10 37 | ) 38 | 39 | parser.add_argument( 40 | "--stricter_protocol", 41 | action="store_true", 42 | help="Whether to use the \"stricter\" protocol (i.e., don't include the test set implicit information) stated in [Rendle, '19].", 43 | default=True, 44 | ) 45 | 46 | parser.add_argument( 47 | "-f", 48 | "--feature", 49 | type=str, 50 | choices=["mf", "svdpp", "timesvd", "timesvdpp", "timesvdpp_flipped"], 51 | help="feature set used in the experiment.", 52 | default="timesvdpp_flipped", 53 | ) 54 | 55 | args = parser.parse_args() 56 | 57 | random_seed = 42 58 | 59 | # Additional features. 60 | # We add 61 | # 1. date of evaluation as categorical variables 62 | # 2. "all users who have evaluated a movie in the train set" or 63 | # 3. "all movies rated by a user" as a feature of user/movie. 64 | if args.feature == "mf": 65 | use_date = False 66 | use_iu = False 67 | use_ii = False 68 | elif args.feature == "svdpp": 69 | use_date = False 70 | use_iu = True 71 | use_ii = False 72 | elif args.feature == "timesvd": 73 | use_date = True 74 | use_iu = False 75 | use_ii = False 76 | elif args.feature == "timesvdpp": 77 | use_date = True 78 | use_iu = True 79 | use_ii = False 80 | elif args.feature == "timesvdpp_flipped": 81 | use_date = True # use date info or not 82 | use_iu = True # use implicit user feature 83 | use_ii = True # use implicit item feature 84 | else: 85 | raise ValueError("unknown feature set specified.") 86 | 87 | FOLD_INDEX = args.fold_index 88 | ITERATION = args.iteration 89 | DIMENSION = args.dimension 90 | if FOLD_INDEX < 1 or FOLD_INDEX >= 6: 91 | raise ValueError("fold_index must be in the range(1, 6).") 92 | 93 | data_manager = MovieLens100kDataManager() 94 | df_train, df_test = data_manager.load_rating_predefined_split(fold=FOLD_INDEX) 95 | 96 | if args.stricter_protocol: 97 | implicit_data_source = df_train 98 | else: 99 | implicit_data_source = pd.concat([df_train, df_test]) 100 | 101 | def int_list_to_str(x): 102 | return "|".join([f"{id}" for id in x]) 103 | 104 | user_implicit_profile = ( 105 | implicit_data_source.groupby("user_id")["movie_id"] 106 | .agg(int_list_to_str) 107 | .reset_index() 108 | ) 109 | item_implicit_profile = ( 110 | implicit_data_source.groupby("movie_id")["user_id"] 111 | .agg(int_list_to_str) 112 | .reset_index() 113 | ) 114 | 115 | print( 116 | "df_train.shape = {}, df_test.shape = {}".format(df_train.shape, df_test.shape) 117 | ) 118 | 119 | user_encoder = DataFrameEncoder().add_column( 120 | "user_id", 121 | CategoryValueToSparseEncoder(user_implicit_profile.user_id), 122 | ) 123 | if use_iu: 124 | user_encoder.add_column( 125 | "movie_id", 126 | MultipleValuesToSparseEncoder(user_implicit_profile.movie_id, sep="|"), 127 | ) 128 | 129 | movie_encoder = DataFrameEncoder().add_column( 130 | "movie_id", 131 | CategoryValueToSparseEncoder(item_implicit_profile.movie_id), 132 | ) 133 | if use_ii: 134 | movie_encoder.add_column( 135 | "user_id", 136 | MultipleValuesToSparseEncoder(item_implicit_profile.user_id, sep="|"), 137 | ) 138 | 139 | # treat the days of events as categorical variable 140 | 141 | 
feature_group_sizes: List[int] = [] 142 | if use_date: 143 | date_encoder = CategoryValueToSparseEncoder( 144 | implicit_data_source.timestamp.dt.date 145 | ) 146 | X_date_train = date_encoder.to_sparse(df_train.timestamp.dt.date) 147 | X_date_test = date_encoder.to_sparse(df_test.timestamp.dt.date) 148 | feature_group_sizes.append(len(date_encoder)) 149 | else: 150 | X_date_train, X_date_test = (None, None) 151 | 152 | # setup grouping 153 | feature_group_sizes.extend(user_encoder.encoder_shapes) 154 | feature_group_sizes.extend(movie_encoder.encoder_shapes) 155 | 156 | # Create RelationBlock. 157 | train_blocks: List[RelationBlock] = [] 158 | test_blocks: List[RelationBlock] = [] 159 | for source, target in [(df_train, train_blocks), (df_test, test_blocks)]: 160 | unique_users, user_map = np.unique(source.user_id, return_inverse=True) 161 | target.append( 162 | RelationBlock( 163 | user_map, 164 | user_encoder.encode_df( 165 | user_implicit_profile.set_index("user_id") 166 | .reindex(unique_users) 167 | .fillna("") 168 | .reset_index() 169 | ), 170 | ) 171 | ) 172 | unique_movies, movie_map = np.unique(source.movie_id, return_inverse=True) 173 | target.append( 174 | RelationBlock( 175 | movie_map, 176 | movie_encoder.encode_df( 177 | item_implicit_profile.set_index("movie_id") 178 | .reindex(unique_movies) 179 | .fillna("") 180 | .reset_index() 181 | ), 182 | ) 183 | ) 184 | 185 | trace_path = "rmse_variational_fold_{0}.csv".format(FOLD_INDEX) 186 | fm = VariationalFMRegressor(rank=DIMENSION) 187 | 188 | fm.fit( 189 | X_date_train, 190 | df_train.rating.values, 191 | X_rel=train_blocks, 192 | n_iter=ITERATION, 193 | group_shapes=feature_group_sizes, 194 | ) 195 | rmse = ( 196 | (df_test.rating.values - fm.predict(X_date_test, test_blocks)) ** 2 197 | ).mean() ** 0.5 198 | assert fm.history_ is not None 199 | print("RMSE = {rmse}".format(rmse=rmse)) 200 | -------------------------------------------------------------------------------- /examples/ml-10m-regression.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import pickle 3 | from typing import Dict, List, Union 4 | 5 | import numpy as np 6 | import pandas as pd 7 | from scipy import sparse as sps 8 | 9 | import myfm 10 | from myfm import MyFMOrderedProbit, MyFMRegressor, RelationBlock 11 | from myfm.gibbs import MyFMOrderedProbit 12 | from myfm.utils.benchmark_data import MovieLens10MDataManager 13 | from myfm.utils.callbacks.libfm import ( 14 | LibFMLikeCallbackBase, 15 | OrderedProbitCallback, 16 | RegressionCallback, 17 | ) 18 | from myfm.utils.encoders import CategoryValueToSparseEncoder 19 | 20 | if __name__ == "__main__": 21 | parser = argparse.ArgumentParser( 22 | description=""" 23 | This script applies the method and evaluation protocol proposed in the 24 | "On the Difficulty of Evaluating Baselines" paper by Rendle et al. 25 | to the MovieLens 10M dataset, using myFM.
26 | """, 27 | formatter_class=argparse.ArgumentDefaultsHelpFormatter, 28 | ) 29 | 30 | parser.add_argument( 31 | "fold_index", 32 | type=int, 33 | help="which index set to use as a test within 10-fold CV.", 34 | ) 35 | parser.add_argument( 36 | "-a", 37 | "--algorithm", 38 | type=str, 39 | choices=["regression", "oprobit"], 40 | default="regression", 41 | help="specify the output type.", 42 | ) 43 | parser.add_argument( 44 | "-i", "--iteration", type=int, help="mcmc iteration", default=512 45 | ) 46 | parser.add_argument( 47 | "-d", 48 | "--dimension", 49 | type=int, 50 | help="fm embedding dimension", 51 | default=128, 52 | ) 53 | parser.add_argument( 54 | "--stricter_protocol", 55 | action="store_true", 56 | help="Whether to use the \"stricter\" protocol (i.e., don't include the test set implicit information) stated in [Rendle, '19].", 57 | default=True, 58 | ) 59 | parser.add_argument( 60 | "-f", 61 | "--feature", 62 | type=str, 63 | choices=["mf", "svdpp", "timesvd", "timesvdpp", "timesvdpp_flipped"], 64 | help="feature set used in the experiment.", 65 | default="timesvdpp_flipped", 66 | ) 67 | args = parser.parse_args() 68 | 69 | random_seed = 42 70 | 71 | # Additional features. 72 | # We add 73 | # 1. date of evaluation as categorical variables 74 | # 2. "all users who have evaluated a movie in the train set" or 75 | # 3. "all movies rated by a user" as a feature of user/movie. 76 | if args.feature == "mf": 77 | use_date = False 78 | use_iu = False 79 | use_ii = False 80 | elif args.feature == "svdpp": 81 | use_date = False 82 | use_iu = True 83 | use_ii = False 84 | elif args.feature == "timesvd": 85 | use_date = True 86 | use_iu = False 87 | use_ii = False 88 | elif args.feature == "timesvdpp": 89 | use_date = True 90 | use_iu = True 91 | use_ii = False 92 | elif args.feature == "timesvdpp_flipped": 93 | use_date = True # use date info or not 94 | use_iu = True # use implicit user feature 95 | use_ii = True # use implicit item feature 96 | else: 97 | raise ValueError("unknown feature set specified.") 98 | 99 | FOLD_INDEX = args.fold_index 100 | ITERATION = args.iteration 101 | DIMENSION = args.dimension 102 | if FOLD_INDEX < 0 or FOLD_INDEX >= 10: 103 | raise ValueError("fold_index must be in the range(10).") 104 | ALGORITHM = args.algorithm 105 | data_manager = MovieLens10MDataManager() 106 | df_train, df_test = data_manager.load_rating_kfold_split( 107 | 10, FOLD_INDEX, random_seed 108 | ) 109 | 110 | if ALGORITHM == "oprobit": 111 | # interpret the rating 0.5, 1.0 ... , 5.0 as class (0, 1, ... 
, 10) 112 | for df_ in [df_train, df_test]: 113 | df_["rating"] -= 0.5 114 | df_["rating"] *= 2 115 | df_["rating"] = df_.rating.astype(np.int32) 116 | 117 | if args.stricter_protocol: 118 | implicit_data_source = df_train 119 | else: 120 | implicit_data_source = pd.concat([df_train, df_test]) 121 | 122 | user_to_internal = CategoryValueToSparseEncoder[int]( 123 | implicit_data_source.user_id.values 124 | ) 125 | movie_to_internal = CategoryValueToSparseEncoder[int]( 126 | implicit_data_source.movie_id.values 127 | ) 128 | 129 | print( 130 | "df_train.shape = {}, df_test.shape = {}".format(df_train.shape, df_test.shape) 131 | ) 132 | # treat the days of events as categorical variable 133 | date_encoder = CategoryValueToSparseEncoder[pd.Timestamp]( 134 | implicit_data_source.timestamp.dt.date.values 135 | ) 136 | 137 | def categorize_date(df): 138 | return date_encoder.to_sparse(df.timestamp.dt.date.values) 139 | 140 | movie_vs_watched: Dict[int, List[int]] = dict() 141 | user_vs_watched: Dict[int, List[int]] = dict() 142 | 143 | for row in implicit_data_source.itertuples(): 144 | user_id = row.user_id 145 | movie_id = row.movie_id 146 | movie_vs_watched.setdefault(movie_id, list()).append(user_id) 147 | user_vs_watched.setdefault(user_id, list()).append(movie_id) 148 | 149 | if use_date: 150 | X_date_train = categorize_date(df_train) 151 | X_date_test = categorize_date(df_test) 152 | else: 153 | X_date_train, X_date_test = (None, None) 154 | 155 | # setup grouping 156 | feature_group_sizes = [] 157 | if use_date: 158 | feature_group_sizes.append( 159 | len(date_encoder), # date 160 | ) 161 | 162 | feature_group_sizes.append(len(user_to_internal)) # user ids 163 | 164 | if use_iu: 165 | # all movies which a user watched 166 | feature_group_sizes.append(len(movie_to_internal)) 167 | 168 | feature_group_sizes.append(len(movie_to_internal)) # movie ids 169 | 170 | if use_ii: 171 | feature_group_sizes.append( 172 | len(user_to_internal) # all the users who watched a movies 173 | ) 174 | 175 | grouping = [i for i, size in enumerate(feature_group_sizes) for _ in range(size)] 176 | 177 | def augment_user_id(user_ids: List[int]) -> sps.csr_matrix: 178 | X = user_to_internal.to_sparse(user_ids) 179 | if not use_iu: 180 | return X 181 | data: List[float] = [] 182 | row: List[int] = [] 183 | col: List[int] = [] 184 | for index, user_id in enumerate(user_ids): 185 | watched_movies = user_vs_watched.get(user_id, []) 186 | normalizer = 1 / max(len(watched_movies), 1) ** 0.5 187 | for mid in watched_movies: 188 | data.append(normalizer) 189 | col.append(movie_to_internal[mid]) 190 | row.append(index) 191 | return sps.hstack( 192 | [ 193 | X, 194 | sps.csr_matrix( 195 | (data, (row, col)), 196 | shape=(len(user_ids), len(movie_to_internal)), 197 | ), 198 | ], 199 | format="csr", 200 | ) 201 | 202 | def augment_movie_id(movie_ids: List[int]): 203 | X = movie_to_internal.to_sparse(movie_ids) 204 | if not use_ii: 205 | return X 206 | 207 | data: List[float] = [] 208 | row: List[int] = [] 209 | col: List[int] = [] 210 | 211 | for index, movie_id in enumerate(movie_ids): 212 | watched_users = movie_vs_watched.get(movie_id, []) 213 | normalizer = 1 / max(len(watched_users), 1) ** 0.5 214 | for uid in watched_users: 215 | data.append(normalizer) 216 | row.append(index) 217 | col.append(user_to_internal[uid]) 218 | return sps.hstack( 219 | [ 220 | X, 221 | sps.csr_matrix( 222 | (data, (row, col)), 223 | shape=(len(movie_ids), len(user_to_internal)), 224 | ), 225 | ] 226 | ) 227 | 228 | # Create RelationBlock. 
229 | train_blocks: List[RelationBlock] = [] 230 | test_blocks: List[RelationBlock] = [] 231 | for source, target in [(df_train, train_blocks), (df_test, test_blocks)]: 232 | unique_users, user_map = np.unique(source.user_id, return_inverse=True) 233 | target.append(RelationBlock(user_map, augment_user_id(unique_users))) 234 | unique_movies, movie_map = np.unique(source.movie_id, return_inverse=True) 235 | target.append(RelationBlock(movie_map, augment_movie_id(unique_movies))) 236 | 237 | trace_path = "rmse_{0}_fold_{1}.csv".format(ALGORITHM, FOLD_INDEX) 238 | 239 | callback: LibFMLikeCallbackBase 240 | fm: Union[MyFMRegressor, MyFMOrderedProbit] 241 | if ALGORITHM == "regression": 242 | fm = myfm.MyFMRegressor(rank=DIMENSION) 243 | callback = RegressionCallback( 244 | ITERATION, 245 | X_date_test, 246 | df_test.rating.values, 247 | X_rel_test=test_blocks, 248 | clip_min=0.5, 249 | clip_max=5.0, 250 | trace_path=trace_path, 251 | ) 252 | else: 253 | fm = myfm.MyFMOrderedProbit(rank=DIMENSION) 254 | callback = OrderedProbitCallback( 255 | ITERATION, 256 | X_date_test, 257 | df_test.rating.values, 258 | n_class=10, 259 | X_rel_test=test_blocks, 260 | trace_path=trace_path, 261 | ) 262 | fm.fit( 263 | X_date_train, 264 | df_train.rating.values, 265 | X_rel=train_blocks, 266 | grouping=grouping, 267 | n_iter=callback.n_iter, 268 | callback=callback, 269 | n_kept_samples=1, 270 | ) 271 | with open( 272 | "callback_result_{0}_fold_{1}.pkl".format(ALGORITHM, FOLD_INDEX), "wb" 273 | ) as ofs: 274 | pickle.dump(callback, ofs) 275 | -------------------------------------------------------------------------------- /examples/ml-1m-regression.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import pickle 3 | from typing import Dict, List, Union 4 | 5 | import numpy as np 6 | import pandas as pd 7 | from scipy import sparse as sps 8 | 9 | import myfm 10 | from myfm import MyFMOrderedProbit, MyFMRegressor, RelationBlock 11 | from myfm.gibbs import MyFMOrderedProbit 12 | from myfm.utils.benchmark_data import MovieLens1MDataManager 13 | from myfm.utils.callbacks.libfm import ( 14 | LibFMLikeCallbackBase, 15 | OrderedProbitCallback, 16 | RegressionCallback, 17 | ) 18 | from myfm.utils.encoders import CategoryValueToSparseEncoder 19 | 20 | if __name__ == "__main__": 21 | parser = argparse.ArgumentParser( 22 | description=""" 23 | This script applies the method and evaluation protocol proposed in the 24 | "On the Difficulty of Evaluating Baselines" paper by Rendle et al. 25 | to the smaller MovieLens 1M dataset, using myFM.
26 | """, 27 | formatter_class=argparse.ArgumentDefaultsHelpFormatter, 28 | ) 29 | 30 | parser.add_argument( 31 | "fold_index", 32 | type=int, 33 | help="which index set to use as a test within 10-fold CV.", 34 | ) 35 | parser.add_argument( 36 | "-a", 37 | "--algorithm", 38 | type=str, 39 | choices=["regression", "oprobit"], 40 | default="regression", 41 | help="specify the output type.", 42 | ) 43 | parser.add_argument( 44 | "-i", "--iteration", type=int, help="mcmc iteration", default=512 45 | ) 46 | parser.add_argument( 47 | "-d", "--dimension", type=int, help="fm embedding dimension", default=32 48 | ) 49 | parser.add_argument( 50 | "--stricter_protocol", 51 | action="store_true", 52 | help="Whether to use the \"stricter\" protocol (i.e., don't include the test set implicit information) stated in [Rendle, '19].", 53 | default=True, 54 | ) 55 | parser.add_argument( 56 | "-f", 57 | "--feature", 58 | type=str, 59 | choices=["mf", "svdpp", "timesvd", "timesvdpp", "timesvdpp_flipped"], 60 | help="feature set used in the experiment.", 61 | default="timesvdpp_flipped", 62 | ) 63 | args = parser.parse_args() 64 | 65 | random_seed = 42 66 | 67 | # Additional features. 68 | # We add 69 | # 1. date of evaluation as categorical variables 70 | # 2. "all users who have evaluated a movie in the train set" or 71 | # 3. "all movies rated by a user" as a feature of user/movie. 72 | if args.feature == "mf": 73 | use_date = False 74 | use_iu = False 75 | use_ii = False 76 | elif args.feature == "svdpp": 77 | use_date = False 78 | use_iu = True 79 | use_ii = False 80 | elif args.feature == "timesvd": 81 | use_date = True 82 | use_iu = False 83 | use_ii = False 84 | elif args.feature == "timesvdpp": 85 | use_date = True 86 | use_iu = True 87 | use_ii = False 88 | elif args.feature == "timesvdpp_flipped": 89 | use_date = True # use date info or not 90 | use_iu = True # use implicit user feature 91 | use_ii = True # use implicit item feature 92 | else: 93 | raise ValueError("unknown feature set specified.") 94 | 95 | FOLD_INDEX = args.fold_index 96 | ITERATION = args.iteration 97 | DIMENSION = args.dimension 98 | if FOLD_INDEX < 0 or FOLD_INDEX >= 10: 99 | raise ValueError("fold_index must be in the range(10).") 100 | ALGORITHM = args.algorithm 101 | data_manager = MovieLens1MDataManager() 102 | df_train, df_test = data_manager.load_rating_kfold_split( 103 | 10, FOLD_INDEX, random_seed 104 | ) 105 | 106 | if ALGORITHM == "oprobit": 107 | # interpret the rating (1, 2, 3, 4, 5) as class (0, 1, 2, 3, 4). 
108 | for df_ in [df_train, df_test]: 109 | df_["rating"] -= 1 110 | df_["rating"] = df_.rating.astype(np.int32) 111 | 112 | if args.stricter_protocol: 113 | implicit_data_source = df_train 114 | else: 115 | implicit_data_source = pd.concat([df_train, df_test]) 116 | 117 | user_to_internal = CategoryValueToSparseEncoder[int]( 118 | implicit_data_source.user_id.values 119 | ) 120 | movie_to_internal = CategoryValueToSparseEncoder[int]( 121 | implicit_data_source.movie_id.values 122 | ) 123 | 124 | print( 125 | "df_train.shape = {}, df_test.shape = {}".format(df_train.shape, df_test.shape) 126 | ) 127 | # treat the days of events as categorical variable 128 | date_encoder = CategoryValueToSparseEncoder[pd.Timestamp]( 129 | implicit_data_source.timestamp.dt.date.values 130 | ) 131 | 132 | def categorize_date(df): 133 | return date_encoder.to_sparse(df.timestamp.dt.date.values) 134 | 135 | movie_vs_watched: Dict[int, List[int]] = dict() 136 | user_vs_watched: Dict[int, List[int]] = dict() 137 | 138 | for row in implicit_data_source.itertuples(): 139 | user_id = row.user_id 140 | movie_id = row.movie_id 141 | movie_vs_watched.setdefault(movie_id, list()).append(user_id) 142 | user_vs_watched.setdefault(user_id, list()).append(movie_id) 143 | 144 | if use_date: 145 | X_date_train = categorize_date(df_train) 146 | X_date_test = categorize_date(df_test) 147 | else: 148 | X_date_train, X_date_test = (None, None) 149 | 150 | # setup grouping 151 | feature_group_sizes = [] 152 | if use_date: 153 | feature_group_sizes.append( 154 | len(date_encoder), # date 155 | ) 156 | 157 | feature_group_sizes.append(len(user_to_internal)) # user ids 158 | 159 | if use_iu: 160 | # all movies which a user watched 161 | feature_group_sizes.append(len(movie_to_internal)) 162 | 163 | feature_group_sizes.append(len(movie_to_internal)) # movie ids 164 | 165 | if use_ii: 166 | feature_group_sizes.append( 167 | len(user_to_internal) # all the users who watched a movies 168 | ) 169 | 170 | grouping = [i for i, size in enumerate(feature_group_sizes) for _ in range(size)] 171 | 172 | def augment_user_id(user_ids: List[int]) -> sps.csr_matrix: 173 | X = user_to_internal.to_sparse(user_ids) 174 | if not use_iu: 175 | return X 176 | data: List[float] = [] 177 | row: List[int] = [] 178 | col: List[int] = [] 179 | for index, user_id in enumerate(user_ids): 180 | watched_movies = user_vs_watched.get(user_id, []) 181 | normalizer = 1 / max(len(watched_movies), 1) ** 0.5 182 | for mid in watched_movies: 183 | data.append(normalizer) 184 | col.append(movie_to_internal[mid]) 185 | row.append(index) 186 | return sps.hstack( 187 | [ 188 | X, 189 | sps.csr_matrix( 190 | (data, (row, col)), 191 | shape=(len(user_ids), len(movie_to_internal)), 192 | ), 193 | ], 194 | format="csr", 195 | ) 196 | 197 | def augment_movie_id(movie_ids: List[int]): 198 | X = movie_to_internal.to_sparse(movie_ids) 199 | if not use_ii: 200 | return X 201 | 202 | data: List[float] = [] 203 | row: List[int] = [] 204 | col: List[int] = [] 205 | 206 | for index, movie_id in enumerate(movie_ids): 207 | watched_users = movie_vs_watched.get(movie_id, []) 208 | normalizer = 1 / max(len(watched_users), 1) ** 0.5 209 | for uid in watched_users: 210 | data.append(normalizer) 211 | row.append(index) 212 | col.append(user_to_internal[uid]) 213 | return sps.hstack( 214 | [ 215 | X, 216 | sps.csr_matrix( 217 | (data, (row, col)), 218 | shape=(len(movie_ids), len(user_to_internal)), 219 | ), 220 | ] 221 | ) 222 | 223 | # Create RelationBlock. 
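# The blocks are rebuilt separately for the train and the test split: the
# sets of unique users/movies, and therefore the block matrices and the
# index maps, generally differ between the two splits.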
224 | train_blocks: List[RelationBlock] = [] 225 | test_blocks: List[RelationBlock] = [] 226 | for source, target in [(df_train, train_blocks), (df_test, test_blocks)]: 227 | unique_users, user_map = np.unique(source.user_id, return_inverse=True) 228 | target.append(RelationBlock(user_map, augment_user_id(unique_users))) 229 | unique_movies, movie_map = np.unique(source.movie_id, return_inverse=True) 230 | target.append(RelationBlock(movie_map, augment_movie_id(unique_movies))) 231 | 232 | trace_path = "rmse_{0}_fold_{1}.csv".format(ALGORITHM, FOLD_INDEX) 233 | 234 | callback: LibFMLikeCallbackBase 235 | fm: Union[MyFMRegressor, MyFMOrderedProbit] 236 | if ALGORITHM == "regression": 237 | fm = myfm.MyFMRegressor(rank=DIMENSION) 238 | callback = RegressionCallback( 239 | ITERATION, 240 | X_date_test, 241 | df_test.rating.values, 242 | X_rel_test=test_blocks, 243 | clip_min=0.5, 244 | clip_max=5.0, 245 | trace_path=trace_path, 246 | ) 247 | else: 248 | fm = myfm.MyFMOrderedProbit(rank=DIMENSION) 249 | callback = OrderedProbitCallback( 250 | ITERATION, 251 | X_date_test, 252 | df_test.rating.values, 253 | n_class=5, 254 | X_rel_test=test_blocks, 255 | trace_path=trace_path, 256 | ) 257 | 258 | fm.fit( 259 | X_date_train, 260 | df_train.rating.values, 261 | X_rel=train_blocks, 262 | grouping=grouping, 263 | n_iter=callback.n_iter, 264 | callback=callback, 265 | n_kept_samples=1, 266 | ) 267 | with open( 268 | "callback_result_{0}_fold_{1}.pkl".format(ALGORITHM, FOLD_INDEX), "wb" 269 | ) as ofs: 270 | pickle.dump(callback, ofs) 271 | -------------------------------------------------------------------------------- /examples/oprobit_example.py: -------------------------------------------------------------------------------- 1 | """ 2 | A simple example for ordered probit regression, 3 | taken from the "MCMCoprobit" documentation of MCMCpack: 4 | https://rdrr.io/cran/MCMCpack/man/MCMCoprobit.html 5 | """ 6 | 7 | import numpy as np 8 | 9 | from myfm import MyFMOrderedProbit 10 | 11 | N_DATA = 100 12 | 13 | rns = np.random.RandomState(42) 14 | X = rns.randn(N_DATA, 2) 15 | z = 1 + X[:, 0] * 0.1 - X[:, 1] * 0.5 + rns.randn(N_DATA) 16 | 17 | y = z.copy() 18 | y[z < 0] = 0 19 | y[(z >= 0) & (z < 1)] = 1 20 | y[(z >= 1) & (z < 1.5)] = 2 21 | y[z >= 1.5] = 3 22 | 23 | # Roughly 40x faster than MCMCoprobit in my environment.
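# Note: with rank=0 the pairwise factor term vanishes, so this reduces to a
# plain Bayesian ordered probit on the two linear features; n_iter=11000
# with n_kept_samples=10000 effectively treats the first 1000 draws as burn-in.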
24 | fm = MyFMOrderedProbit(0, random_seed=42).fit( 25 | X, 26 | y, 27 | n_iter=11000, 28 | n_kept_samples=10000, 29 | ) 30 | 31 | print(fm.cutpoint_samples.mean(axis=0)) 32 | -------------------------------------------------------------------------------- /examples/toy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.feature_extraction import DictVectorizer 3 | 4 | import myfm 5 | 6 | train = [ 7 | {"user": "1", "item": "5", "age": 19}, 8 | {"user": "2", "item": "43", "age": 33}, 9 | {"user": "3", "item": "20", "age": 55}, 10 | {"user": "4", "item": "10", "age": 20}, 11 | ] 12 | v = DictVectorizer() 13 | X = v.fit_transform(train) 14 | y = np.asarray([0, 1, 1, 0]) 15 | fm = myfm.MyFMClassifier(rank=4) 16 | fm.fit(X, y) 17 | p = fm.predict_proba(v.transform({"user": "1", "item": "10", "age": 24})) 18 | print(p) 19 | -------------------------------------------------------------------------------- /include/Faddeeva/Faddeeva.hh: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2012 Massachusetts Institute of Technology 2 | * 3 | * Permission is hereby granted, free of charge, to any person obtaining 4 | * a copy of this software and associated documentation files (the 5 | * "Software"), to deal in the Software without restriction, including 6 | * without limitation the rights to use, copy, modify, merge, publish, 7 | * distribute, sublicense, and/or sell copies of the Software, and to 8 | * permit persons to whom the Software is furnished to do so, subject to 9 | * the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be 12 | * included in all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 15 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 16 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 17 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 18 | * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 19 | * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 20 | * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | */ 22 | 23 | /* Available at: http://ab-initio.mit.edu/Faddeeva 24 | 25 | Header file for Faddeeva.cc; see that file for more information. 
*/ 26 | 27 | #ifndef FADDEEVA_HH 28 | #define FADDEEVA_HH 1 29 | 30 | #include 31 | 32 | namespace Faddeeva { 33 | 34 | // compute w(z) = exp(-z^2) erfc(-iz) [ Faddeeva / scaled complex error func ] 35 | extern std::complex w(std::complex z,double relerr=0); 36 | extern double w_im(double x); // special-case code for Im[w(x)] of real x 37 | 38 | // Various functions that we can compute with the help of w(z) 39 | 40 | // compute erfcx(z) = exp(z^2) erfc(z) 41 | extern std::complex erfcx(std::complex z, double relerr=0); 42 | extern double erfcx(double x); // special case for real x 43 | 44 | // compute erf(z), the error function of complex arguments 45 | extern std::complex erf(std::complex z, double relerr=0); 46 | extern double erf(double x); // special case for real x 47 | 48 | // compute erfi(z) = -i erf(iz), the imaginary error function 49 | extern std::complex erfi(std::complex z, double relerr=0); 50 | extern double erfi(double x); // special case for real x 51 | 52 | // compute erfc(z) = 1 - erf(z), the complementary error function 53 | extern std::complex erfc(std::complex z, double relerr=0); 54 | extern double erfc(double x); // special case for real x 55 | 56 | // compute Dawson(z) = sqrt(pi)/2 * exp(-z^2) * erfi(z) 57 | extern std::complex Dawson(std::complex z, double relerr=0); 58 | extern double Dawson(double x); // special case for real x 59 | 60 | } // namespace Faddeeva 61 | 62 | #endif // FADDEEVA_HH 63 | -------------------------------------------------------------------------------- /include/myfm/BaseFMTrainer.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include "FMLearningConfig.hpp" 9 | #include "HyperParams.hpp" 10 | #include "OProbitSampler.hpp" 11 | #include "definitions.hpp" 12 | #include "predictor.hpp" 13 | #include "util.hpp" 14 | 15 | namespace myFM { 16 | template 18 | struct BaseFMTrainer { 19 | // typedef typename Derived::FMType FMType; 20 | // typedef typename Derived::HyperType HyperType; 21 | 22 | typedef typename FMType::Vector Vector; 23 | typedef typename FMType::DenseMatrix DenseMatrix; 24 | typedef typename FMType::SparseMatrix SparseMatrix; 25 | 26 | typedef relational::RelationBlock RelationBlock; 27 | // typedef relational::RelationWiseCache RelationWiseCache; 28 | 29 | typedef FMLearningConfig Config; 30 | typedef typename Config::TASKTYPE TASKTYPE; 31 | 32 | typedef pair, HistoryType> learn_result_type; 33 | 34 | typedef OprobitSampler OprobitSamplerType; 35 | 36 | SparseMatrix X; 37 | vector relations; 38 | SparseMatrix X_t; // transposed 39 | 40 | const size_t dim_all; 41 | const Vector y; 42 | 43 | const int n_train; 44 | int n_class = 0; // Used by ordered probit 45 | 46 | Vector e_train; 47 | Vector q_train; 48 | vector relation_caches; 49 | 50 | const Config learning_config; 51 | 52 | size_t n_nan_occurred = 0; 53 | 54 | inline BaseFMTrainer(const SparseMatrix &X, 55 | const vector &relations, int random_seed, 56 | Config learning_config) {} 57 | 58 | inline BaseFMTrainer(const SparseMatrix &X, 59 | const vector &relations, const Vector &y, 60 | int random_seed, Config learning_config) 61 | : X(X), relations(relations), X_t(X.transpose()), 62 | dim_all(check_row_consistency_return_column(X, relations)), y(y), 63 | n_train(X.rows()), e_train(X.rows()), q_train(X.rows()), 64 | relation_caches(), learning_config(learning_config), 65 | random_seed(random_seed), gen_(random_seed) { 66 | for (auto it = relations.begin(); 
it != relations.end(); it++) { 67 | relation_caches.emplace_back(*it); 68 | } 69 | if (X.rows() != y.rows()) { 70 | throw std::runtime_error(StringBuilder{} 71 | .add("Shape mismatch: X has size") 72 | .space_and_add(X.rows()) 73 | .space_and_add("and y has size") 74 | .space_and_add(y.rows()) 75 | .build()); 76 | } 77 | this->X.makeCompressed(); 78 | this->X_t.makeCompressed(); 79 | if (learning_config.task_type == Config::TASKTYPE::ORDERED) { 80 | 81 | const size_t rows = this->X.rows(); 82 | std::vector existence(rows, false); 83 | for (auto &group_config : learning_config.cutpoint_groups()) { 84 | for (size_t k : group_config.second) { 85 | if (k >= rows) { 86 | throw std::invalid_argument( 87 | "out of range for cutpoint group config."); 88 | } 89 | if (existence[k]) { 90 | std::stringstream ss; 91 | ss << "index " << k << " overlapping in cutpoint config."; 92 | throw std::invalid_argument(ss.str()); 93 | } 94 | existence[k] = true; 95 | } 96 | } 97 | for (size_t i_ = 0; i_ < rows; i_++) { 98 | if (!existence[i_]) { 99 | std::stringstream ss; 100 | ss << "cutpoint group not specified for " << i_ << "."; 101 | throw std::invalid_argument(ss.str()); 102 | } 103 | } 104 | } 105 | } 106 | 107 | inline FMType create_FM(int rank, Real init_std) { 108 | FMType fm(rank); 109 | fm.initialize_weight(dim_all, init_std, gen_); 110 | return fm; 111 | } 112 | 113 | inline HyperType create_Hyper(size_t rank) { 114 | return HyperType{rank, learning_config.get_n_groups()}; 115 | } 116 | 117 | 118 | inline learn_result_type 119 | learn_with_callback(FMType &fm, HyperType &hyper, 120 | std::function *, HistoryType *)> cb); 121 | 122 | inline void initialize_hyper(FMType &fm, HyperType &hyper) { 123 | static_cast(*this).initialize_alpha(); 124 | static_cast(*this).initialize_mu_w(); 125 | static_cast(*this).initialize_lambda_w(); 126 | 127 | static_cast(*this).initialize_mu_V(); 128 | static_cast(*this).initialize_lambda_V(); 129 | } 130 | 131 | inline void initialize_e(FMType &fm, const HyperType &hyper) { 132 | static_cast(*this).initialize_e(fm, hyper); 133 | } 134 | 135 | inline void update_all(FMType &fm, HyperType &hyper) { 136 | update_alpha_(fm, hyper); 137 | 138 | update_w0_(fm, hyper); 139 | 140 | update_lambda_w_(fm, hyper); 141 | 142 | update_mu_w_(fm, hyper); 143 | 144 | update_w_(fm, hyper); 145 | 146 | update_lambda_V_(fm, hyper); 147 | update_mu_V_(fm, hyper); 148 | 149 | update_V_(fm, hyper); 150 | 151 | update_e_(fm, hyper); 152 | } 153 | 154 | inline void update_alpha_(FMType &fm, HyperType &hyper) { 155 | static_cast(*this).update_alpha(fm, hyper); 156 | } 157 | 158 | inline void update_w0_(FMType &fm, HyperType &hyper) { 159 | static_cast(*this).update_w0(fm, hyper); 160 | } 161 | 162 | inline void update_lambda_w_(FMType &fm, HyperType &hyper) { 163 | static_cast(*this).update_lambda_w(fm, hyper); 164 | } 165 | 166 | inline void update_mu_w_(FMType &fm, HyperType &hyper) { 167 | static_cast(*this).update_mu_w(fm, hyper); 168 | } 169 | 170 | inline void update_lambda_V_(FMType &fm, HyperType &hyper) { 171 | static_cast(*this).update_lambda_V(fm, hyper); 172 | } 173 | 174 | inline void update_mu_V_(FMType &fm, HyperType &hyper) { 175 | static_cast(*this).update_mu_V(fm, hyper); 176 | } 177 | 178 | inline void update_w_(FMType &fm, HyperType &hyper) { 179 | static_cast(*this).update_w(fm, hyper); 180 | } 181 | 182 | inline void update_e_(FMType &fm, HyperType &hyper) { 183 | static_cast(*this).update_e(fm, hyper); 184 | } 185 | 186 | inline void update_V_(FMType &fm, HyperType &hyper) 
{ 187 | static_cast(*this).update_V(fm, hyper); 188 | } 189 | 190 | const int random_seed; 191 | 192 | protected: 193 | mt19937 gen_; 194 | // std::vector cutpoint_sampler; 195 | 196 | }; // BaseFMTrainer 197 | } // namespace myFM 198 | -------------------------------------------------------------------------------- /include/myfm/FM.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "definitions.hpp" 3 | #include 4 | #include 5 | 6 | namespace myFM { 7 | 8 | using namespace std; 9 | 10 | template struct FM { 11 | 12 | typedef relational::RelationBlock RelationBlock; 13 | 14 | typedef types::DenseMatrix DenseMatrix; 15 | typedef types::SparseMatrix SparseMatrix; 16 | typedef types::Vector Vector; 17 | 18 | inline FM(int n_factors, size_t n_groups) 19 | : n_factors(n_factors), initialized(false) {} 20 | inline FM(int n_factors) : FM(n_factors, 1) {} 21 | 22 | inline FM(const FM &other) 23 | : n_factors(other.n_factors), w0(other.w0), w(other.w), V(other.V), 24 | cutpoints(other.cutpoints), initialized(other.initialized) {} 25 | 26 | inline FM(Real w0, const Vector &w, const DenseMatrix &V) 27 | : n_factors(V.cols()), w0(w0), w(w), V(V), initialized(true) {} 28 | 29 | inline FM(Real w0, const Vector &w, const DenseMatrix &V, 30 | const vector &cutpoints) 31 | : n_factors(V.cols()), w0(w0), w(w), V(V), cutpoints(cutpoints), 32 | initialized(true) {} 33 | 34 | inline void initialize_weight(int n_features, Real init_std, mt19937 &gen) { 35 | initialized = false; 36 | normal_distribution nd; 37 | 38 | auto get_rand = [&gen, &nd, init_std](Real dummy) { 39 | return nd(gen) * init_std; 40 | }; 41 | V = DenseMatrix{n_features, n_factors}.unaryExpr(get_rand); 42 | w = Vector{n_features}.unaryExpr(get_rand); 43 | w0 = get_rand(1); 44 | initialized = true; 45 | } 46 | 47 | inline Vector predict_score(const SparseMatrix &X, 48 | const vector &relations) const { 49 | Vector result(X.rows()); 50 | predict_score_write_target(result, X, relations); 51 | return result; 52 | } 53 | 54 | inline void 55 | predict_score_write_target(Eigen::Ref target, const SparseMatrix &X, 56 | const vector &relations) const { 57 | // check input consistency 58 | size_t case_size = X.rows(); 59 | size_t feature_size_all = X.cols(); 60 | for (auto const &rel : relations) { 61 | if (case_size != rel.original_to_block.size()) { 62 | throw std::invalid_argument( 63 | "Relation blocks have inconsistent mapper size with case_size"); 64 | } 65 | feature_size_all += rel.feature_size; 66 | } 67 | if (feature_size_all != static_cast(this->w.rows())) { 68 | std::stringstream error_stream; 69 | error_stream << "Total feature size mismatch. 
Should be " 70 | << (this->w.rows()) << ", but got " << feature_size_all 71 | << "."; 72 | throw std::invalid_argument(error_stream.str()); 73 | } 74 | 75 | if (!initialized) { 76 | throw std::runtime_error("get_score called before initialization"); 77 | } 78 | target = w0 + (X * w.head(X.cols())).array(); 79 | size_t offset = X.cols(); 80 | for (auto iter = relations.begin(); iter != relations.end(); iter++) { 81 | Vector w0_cache = (iter->X) * w.segment(offset, iter->feature_size); 82 | size_t j = 0; 83 | for (auto i : (iter->original_to_block)) { 84 | target(j++) += w0_cache(i); 85 | } 86 | offset += iter->feature_size; 87 | } 88 | 89 | Vector q_cache(target.rows()); 90 | size_t buffer_size = 1; 91 | vector buffer_cache(1); 92 | vector block_q_caches; 93 | for (auto &relation : relations) { 94 | buffer_size = std::max(buffer_size, relation.block_size); 95 | } 96 | buffer_cache.resize(buffer_size); 97 | 98 | for (int factor_index = 0; factor_index < this->n_factors; factor_index++) { 99 | q_cache = X * V.col(factor_index).head(X.cols()); 100 | size_t offset = X.cols(); 101 | size_t relation_index = 0; 102 | for (auto iter = relations.begin(); iter != relations.end(); 103 | iter++, relation_index++) { 104 | Eigen::Map block_cache(buffer_cache.data(), iter->block_size); 105 | block_cache = 106 | iter->X * V.col(factor_index).segment(offset, iter->feature_size); 107 | offset += iter->feature_size; 108 | size_t train_case_index = 0; 109 | for (auto i : iter->original_to_block) { 110 | q_cache(train_case_index++) += block_cache(i); 111 | } 112 | } 113 | target.array() += q_cache.array().square() * static_cast(0.5); 114 | 115 | offset = X.cols(); 116 | relation_index = 0; 117 | q_cache = X.cwiseAbs2() * 118 | (V.col(factor_index).head(X.cols()).array().square().matrix()); 119 | for (auto iter = relations.begin(); iter != relations.end(); 120 | iter++, relation_index++) { 121 | Eigen::Map block_cache(buffer_cache.data(), iter->block_size); 122 | block_cache = 123 | (iter->X.cwiseAbs2()) * (V.col(factor_index) 124 | .segment(offset, iter->feature_size) 125 | .array() 126 | .square() 127 | .matrix()); 128 | offset += iter->feature_size; 129 | size_t train_case_index = 0; 130 | for (auto i : iter->original_to_block) { 131 | q_cache(train_case_index++) += block_cache(i); 132 | } 133 | } 134 | target -= q_cache * static_cast(0.5); 135 | } 136 | } 137 | inline DenseMatrix 138 | oprobit_predict_proba(const SparseMatrix &X, 139 | const vector &relations, 140 | size_t cutpoint_index) const { 141 | if (cutpoints.empty()) { 142 | throw std::runtime_error("No cutpoint available for this FM."); 143 | } 144 | int n_cpt = cutpoints.at(cutpoint_index).size(); 145 | DenseMatrix result = DenseMatrix::Zero(X.rows(), n_cpt + 1); 146 | 147 | Vector score(X.rows()); 148 | DenseMatrix cache(X.rows(), n_cpt + 1); 149 | predict_score_write_target(score, X, relations); 150 | for (int cpt_index = 0; cpt_index < n_cpt; cpt_index++) { 151 | cache.col(cpt_index) = 152 | (1 + ((cutpoints.at(cutpoint_index)(cpt_index) - score.array()) * 153 | static_cast(std::sqrt(0.5))) 154 | .erf()) / 155 | 2; 156 | } 157 | cache.col(n_cpt) = (1 - cache.col(n_cpt - 1).array()); 158 | for (int col = n_cpt - 1; col >= 1; col--) { 159 | cache.col(col) -= cache.col(col - 1); 160 | } 161 | return cache; 162 | } 163 | 164 | const int n_factors; 165 | Real w0; 166 | Vector w; 167 | DenseMatrix V; // (n_feature, n_factor) - matrix 168 | vector cutpoints; // ordered probit 169 | 170 | protected: 171 | bool initialized; 172 | }; 173 | 174 | } // 
namespace myFM 175 | -------------------------------------------------------------------------------- /include/myfm/FMLearningConfig.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "OProbitSampler.hpp" 4 | #include "definitions.hpp" 5 | #include "util.hpp" 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | namespace myFM { 12 | template struct FMLearningConfig { 13 | public: 14 | enum class TASKTYPE { REGRESSION, CLASSIFICATION, ORDERED }; 15 | using CutpointGroupType = vector>>; 16 | 17 | inline FMLearningConfig(Real alpha_0, Real beta_0, Real gamma_0, Real mu_0, 18 | Real reg_0, TASKTYPE task_type, Real nu_oprobit, 19 | bool fit_w0, bool fit_linear, 20 | const vector &group_index, int n_iter, 21 | int n_kept_samples, Real cutpoint_scale, 22 | const CutpointGroupType &cutpoint_groups) 23 | : alpha_0(alpha_0), beta_0(beta_0), gamma_0(gamma_0), mu_0(mu_0), 24 | reg_0(reg_0), task_type(task_type), nu_oprobit(nu_oprobit), 25 | fit_w0(fit_w0), fit_linear(fit_linear), n_iter(n_iter), 26 | n_kept_samples(n_kept_samples), cutpoint_scale(cutpoint_scale), 27 | group_index_(group_index), cutpoint_groups_(cutpoint_groups) { 28 | 29 | /* check group_index consistency */ 30 | set all_index(group_index.begin(), group_index.end()); 31 | n_groups_ = all_index.size(); 32 | /* verify that groups from 0 - (n_groups - 1) are contained.*/ 33 | for (size_t i = 0; i < n_groups_; i++) { 34 | if (all_index.find(i) == all_index.cend()) { 35 | throw invalid_argument( 36 | (StringBuilder{})("No matching index for group index ")(i)( 37 | " found.") 38 | .build()); 39 | } 40 | } 41 | group_vs_feature_index_ = vector>{n_groups_}; 42 | 43 | size_t feature_index = 0; 44 | for (auto iter = group_index.cbegin(); iter != group_index.cend(); iter++) { 45 | group_vs_feature_index_[*iter].push_back(feature_index++); 46 | } 47 | 48 | if (n_kept_samples < 0) { 49 | throw invalid_argument("n_kept_samples must be non-negative,"); 50 | } 51 | if (n_iter <= 0) { 52 | throw invalid_argument("n_iter must be positive."); 53 | } 54 | if (n_iter < n_kept_samples) { 55 | throw invalid_argument("n_kept_samples must not exceed n_iter."); 56 | } 57 | } 58 | 59 | FMLearningConfig(const FMLearningConfig &other) = default; 60 | 61 | const Real alpha_0, beta_0, gamma_0; 62 | const Real mu_0; 63 | const Real reg_0; 64 | 65 | const TASKTYPE task_type; 66 | const Real nu_oprobit; 67 | bool fit_w0, fit_linear; 68 | 69 | const int n_iter, n_kept_samples; 70 | 71 | const Real cutpoint_scale; 72 | 73 | private: 74 | const vector group_index_; 75 | size_t n_groups_; 76 | vector> group_vs_feature_index_; 77 | 78 | const CutpointGroupType cutpoint_groups_; 79 | 80 | public: 81 | inline size_t get_n_groups() const { return n_groups_; } 82 | 83 | inline size_t group_index(int at) const { return group_index_.at(at); } 84 | const CutpointGroupType &cutpoint_groups() const { 85 | return this->cutpoint_groups_; 86 | } 87 | 88 | const vector> &group_vs_feature_index() const { 89 | return group_vs_feature_index_; 90 | } 91 | 92 | struct Builder { 93 | Real alpha_0 = 1; 94 | Real beta_0 = 1; 95 | Real gamma_0 = 1; 96 | Real mu_0 = 1; 97 | Real reg_0 = 1; 98 | int n_iter = 100; 99 | int n_kept_samples = 10; 100 | TASKTYPE task_type = TASKTYPE::REGRESSION; 101 | Real nu_oprobit = 5; 102 | bool fit_w0 = true; 103 | bool fit_linear = true; 104 | vector group_index; 105 | Real cutpoint_scale = 10; 106 | CutpointGroupType cutpoint_groups; 107 | 108 | Builder() {} 109 | 110 | inline Builder 
&set_alpha_0(Real arg) { 111 | this->alpha_0 = arg; 112 | return *this; 113 | } 114 | 115 | inline Builder &set_beta_0(Real arg) { 116 | this->beta_0 = arg; 117 | return *this; 118 | } 119 | 120 | inline Builder &set_gamma_0(Real arg) { 121 | this->gamma_0 = arg; 122 | return *this; 123 | } 124 | 125 | inline Builder &set_mu_0(Real arg) { 126 | this->mu_0 = arg; 127 | return *this; 128 | } 129 | inline Builder &set_reg_0(Real arg) { 130 | this->reg_0 = arg; 131 | return *this; 132 | } 133 | 134 | inline Builder &set_n_iter(int arg) { 135 | this->n_iter = arg; 136 | return *this; 137 | } 138 | 139 | inline Builder &set_n_kept_samples(int arg) { 140 | this->n_kept_samples = arg; 141 | return *this; 142 | } 143 | 144 | inline Builder &set_task_type(TASKTYPE arg) { 145 | this->task_type = arg; 146 | return *this; 147 | } 148 | 149 | inline Builder &set_group_index(const vector arg) { 150 | this->group_index = arg; 151 | return *this; 152 | } 153 | 154 | inline Builder &set_identical_groups(size_t n_features) { 155 | vector default_group_index(n_features); 156 | for (auto c = default_group_index.begin(); c != default_group_index.end(); 157 | c++) { 158 | *c = 0; 159 | } 160 | return set_group_index(default_group_index); 161 | } 162 | 163 | inline Builder &set_nu_oprobit(size_t nu_oprobit) { 164 | this->nu_oprobit = nu_oprobit; 165 | return *this; 166 | } 167 | 168 | inline Builder &set_fit_w0(bool fit_w0) { 169 | this->fit_w0 = fit_w0; 170 | return *this; 171 | } 172 | 173 | inline Builder &set_fit_linear(bool fit_linear) { 174 | this->fit_linear = fit_linear; 175 | return *this; 176 | } 177 | 178 | inline Builder &set_cutpoint_scale(Real cutpoint_scale) { 179 | this->cutpoint_scale = cutpoint_scale; 180 | return *this; 181 | } 182 | 183 | inline Builder & 184 | set_cutpoint_groups(const CutpointGroupType &cutpoint_groups) { 185 | this->cutpoint_groups = cutpoint_groups; 186 | return *this; 187 | } 188 | 189 | FMLearningConfig build() { 190 | return FMLearningConfig(alpha_0, beta_0, gamma_0, mu_0, reg_0, task_type, 191 | nu_oprobit, fit_w0, fit_linear, group_index, 192 | n_iter, n_kept_samples, cutpoint_scale, 193 | this->cutpoint_groups); 194 | } 195 | 196 | static FMLearningConfig get_default_config(size_t n_features) { 197 | Builder builder; 198 | return builder.set_identical_groups(n_features).build(); 199 | } 200 | 201 | }; // end Builder 202 | }; 203 | 204 | } // namespace myFM 205 | -------------------------------------------------------------------------------- /include/myfm/HyperParams.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "FM.hpp" 4 | #include "definitions.hpp" 5 | 6 | namespace myFM { 7 | 8 | template struct FMHyperParameters { 9 | using FMType = FM; 10 | using Vector = typename FMType::Vector; 11 | using DenseMatrix = typename FMType::DenseMatrix; 12 | 13 | Real alpha; 14 | 15 | Vector mu_w; // mean for w. will be (n_group) - vector 16 | Vector lambda_w; // variances for w. will be (n_group) - vector 17 | 18 | DenseMatrix mu_V; // mean for V. 
will be (n_group x n_factor) matrix 19 | DenseMatrix lambda_V; // variances for V (n_group x n_factor) - matrix 20 | 21 | inline FMHyperParameters(size_t n_factors, size_t n_groups) 22 | : mu_w(n_groups), lambda_w(n_groups), mu_V(n_groups, n_factors), 23 | lambda_V(n_groups, n_factors) {} 24 | 25 | inline FMHyperParameters(size_t n_factors) 26 | : FMHyperParameters(n_factors, 1) {} 27 | 28 | inline FMHyperParameters(Real alpha, const Vector &mu_w, 29 | const Vector &lambda_w, const DenseMatrix &mu_V, 30 | const DenseMatrix &lambda_V) 31 | : alpha(alpha), mu_w(mu_w), lambda_w(lambda_w), mu_V(mu_V), 32 | lambda_V(lambda_V) {} 33 | 34 | inline FMHyperParameters(const FMHyperParameters &other) 35 | : alpha(other.alpha), mu_w(other.mu_w), lambda_w(other.lambda_w), 36 | mu_V(other.mu_V), lambda_V(other.lambda_V) {} 37 | }; 38 | 39 | } // namespace myFM 40 | -------------------------------------------------------------------------------- /include/myfm/LearningHistory.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "HyperParams.hpp" 4 | 5 | namespace myFM { 6 | template struct GibbsLearningHistory { 7 | std::vector> hypers; 8 | std::vector 9 | n_mh_accept; // will be used for M-H step in ordered probit regression; 10 | std::vector train_log_losses; 11 | }; 12 | } // namespace myFM 13 | -------------------------------------------------------------------------------- /include/myfm/definitions.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include 10 | #include 11 | 12 | namespace myFM { 13 | 14 | using namespace std; 15 | namespace types { 16 | template 17 | using DenseMatrix = Eigen::Matrix; 18 | 19 | template using Vector = Eigen::Matrix; 20 | 21 | template 22 | using SparseMatrix = Eigen::SparseMatrix; 23 | 24 | template using SparseVector = Eigen::SparseVector; 25 | 26 | } // namespace types 27 | 28 | namespace relational { 29 | 30 | template struct RelationBlock { 31 | typedef Eigen::SparseMatrix SparseMatrix; 32 | typedef Eigen::Matrix Vector; 33 | 34 | inline RelationBlock(vector original_to_block, const SparseMatrix &X) 35 | : original_to_block(original_to_block), 36 | mapper_size(original_to_block.size()), X(X), block_size(X.rows()), 37 | feature_size(X.cols()) { 38 | for (auto c : original_to_block) { 39 | if (c >= block_size) 40 | throw runtime_error("index mapping points to non-existing row."); 41 | } 42 | } 43 | 44 | inline RelationBlock(const RelationBlock &other) 45 | : RelationBlock(other.original_to_block, other.X) {} 46 | 47 | const vector original_to_block; 48 | const size_t mapper_size; 49 | const SparseMatrix X; 50 | const size_t block_size; 51 | const size_t feature_size; 52 | }; 53 | 54 | template struct RelationWiseCache { 55 | typedef typename RelationBlock::Vector Vector; 56 | typedef typename RelationBlock::SparseMatrix SparseMatrix; 57 | 58 | inline RelationWiseCache(const RelationBlock &source) 59 | : target(source), X_t(source.X.transpose()), cardinality(source.X.rows()), 60 | y(source.X.rows()), q(source.X.rows()), q_S(source.X.rows()), 61 | c(source.X.rows()), c_S(source.X.rows()), e(source.X.rows()), 62 | e_q(source.X.rows()) { 63 | X_t.makeCompressed(); 64 | cardinality.array() = static_cast(0); 65 | for (auto v : source.original_to_block) { 66 | cardinality(v)++; 67 | } 68 | } 69 | 70 | const RelationBlock ⌖ 71 | SparseMatrix X_t; 72 | Vector cardinality; // for 
each block row, how many main-table rows map to it
73 | 
74 |   Vector y;
75 | 
76 |   Vector q;
77 |   Vector q_S;
78 | 
79 |   Vector c;
80 |   Vector c_S;
81 | 
82 |   Vector e;
83 |   Vector e_q;
84 | };
85 | } // namespace relational
86 | 
87 | } // namespace myFM
88 | 
--------------------------------------------------------------------------------
/include/myfm/predictor.hpp:
--------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include <atomic>
4 | #include <mutex>
5 | #include <thread>
6 | 
7 | #include "FM.hpp"
8 | #include "FMLearningConfig.hpp"
9 | #include "definitions.hpp"
10 | #include "util.hpp"
11 | 
12 | namespace myFM {
13 | 
14 | template <typename Real, typename FMType = FM<Real>> struct Predictor {
15 |   typedef typename FMLearningConfig<Real>::TASKTYPE TASKTYPE;
16 |   typedef typename FMType::SparseMatrix SparseMatrix;
17 |   typedef typename FMType::Vector Vector;
18 |   typedef typename FMType::DenseMatrix DenseMatrix;
19 |   typedef typename FMType::RelationBlock RelationBlock;
20 | 
21 |   inline Predictor(size_t rank, size_t feature_size, TASKTYPE type)
22 |       : rank(rank), feature_size(feature_size), type(type), samples() {}
23 | 
24 |   inline void check_input(const SparseMatrix &X,
25 |                           const vector<RelationBlock> &relations) const {
26 |     auto given_feature_size = check_row_consistency_return_column(X, relations);
27 |     if (feature_size != given_feature_size) {
28 |       throw std::invalid_argument(
29 |           StringBuilder{}("Told to predict for ")(
30 |               given_feature_size)(" but this->feature_size is ")(feature_size)
31 |               .build());
32 |     }
33 |   }
34 | 
35 |   inline Vector predict_parallel(const SparseMatrix &X,
36 |                                  const vector<RelationBlock> &relations,
37 |                                  size_t n_workers) const {
38 |     check_input(X, relations);
39 |     if (samples.empty()) {
40 |       throw std::runtime_error("Told to predict but no sample available.");
41 |     }
42 |     Vector result = Vector::Zero(X.rows());
43 |     const size_t n_samples = this->samples.size();
44 | 
45 |     std::mutex mtx;
46 |     std::atomic<size_t> currently_done(0);
47 |     std::vector<std::thread> workers;
48 | 
49 |     for (size_t i = 0; i < n_workers; i++) {
50 |       workers.emplace_back(
51 |           [this, n_samples, &result, &X, &relations, &currently_done, &mtx] {
52 |             Vector cache(X.rows());
53 |             while (true) {
54 |               size_t cd = currently_done++;
55 |               if (cd >= n_samples)
56 |                 break;
57 |               this->samples[cd].predict_score_write_target(cache, X, relations);
58 |               if (this->type == TASKTYPE::CLASSIFICATION) { // probit link: Phi(score)
59 |                 cache.array() =
60 |                     ((cache.array() * static_cast<Real>(std::sqrt(0.5))).erf() +
61 |                      static_cast<Real>(1)) /
62 |                     static_cast<Real>(2);
63 |               }
64 |               {
65 |                 std::lock_guard<std::mutex> lock{mtx};
66 |                 result += cache;
67 |               }
68 |             }
69 |           });
70 |     }
71 |     for (auto &worker : workers) {
72 |       worker.join();
73 |     }
74 |     result.array() /= static_cast<Real>(n_samples);
75 |     return result;
76 |   }
77 | 
78 |   inline DenseMatrix
79 |   predict_parallel_oprobit(const SparseMatrix &X,
80 |                            const vector<RelationBlock> &relations,
81 |                            size_t n_workers, size_t cutpoint_index) const {
82 |     check_input(X, relations);
83 |     if (samples.empty()) {
84 |       throw std::runtime_error("Told to predict but no sample available.");
85 |     }
86 |     if (this->type != TASKTYPE::ORDERED) {
87 |       throw std::runtime_error(
88 |           "predict_parallel_oprobit must be called for oprobit model.");
89 |     }
90 |     int n_cpt = (this->samples.at(0)).cutpoints.at(cutpoint_index).size();
91 |     DenseMatrix result = DenseMatrix::Zero(X.rows(), n_cpt + 1);
92 |     const size_t n_samples = this->samples.size();
93 | 
94 |     std::mutex mtx;
95 |     std::atomic<size_t> currently_done(0);
96 |     std::vector<std::thread> workers;
97 | 
98 |     for (size_t i = 0; i < n_workers; i++) {
99 |       workers.emplace_back([this, n_samples, &result, &X, &relations,
100 |                             &currently_done,
&mtx, cutpoint_index, n_cpt] {
101 |         Vector score(X.rows());
102 | 
103 |         while (true) {
104 |           size_t cd = currently_done.fetch_add(1);
105 |           if (cd >= n_samples)
106 |             break;
107 | 
108 |           DenseMatrix sample_result =
109 |               this->samples.at(cd).oprobit_predict_proba(X, relations,
110 |                                                          cutpoint_index);
111 | 
112 |           {
113 |             std::lock_guard<std::mutex> lock{mtx};
114 |             result += sample_result;
115 |           }
116 |         }
117 |       });
118 |     }
119 |     for (auto &worker : workers) {
120 |       worker.join();
121 |     }
122 |     result.array() /= static_cast<Real>(n_samples);
123 |     return result;
124 |   }
125 | 
126 |   inline Vector predict(const SparseMatrix &X,
127 |                         const vector<RelationBlock> &relations) const {
128 |     check_input(X, relations);
129 |     if (samples.empty()) {
130 |       throw std::runtime_error("Empty samples!");
131 |     }
132 |     Vector result = Vector::Zero(X.rows());
133 |     Vector cache = Vector(X.rows());
134 |     for (auto iter = samples.cbegin(); iter != samples.cend(); iter++) {
135 |       iter->predict_score_write_target(cache, X, relations);
136 |       if (type == TASKTYPE::REGRESSION) {
137 |         result += cache;
138 |       } else if (type == TASKTYPE::CLASSIFICATION) {
139 |         result.array() +=
140 |             ((cache.array() * static_cast<Real>(std::sqrt(0.5))).erf() +
141 |              static_cast<Real>(1)) /
142 |             static_cast<Real>(2);
143 |       }
144 |     }
145 |     result.array() /= static_cast<Real>(samples.size());
146 |     return result;
147 |   }
148 | 
149 |   inline void set_samples(vector<FMType> &&samples_from) {
150 |     samples = std::forward<vector<FMType>>(samples_from);
151 |   }
152 | 
153 |   inline void add_sample(const FMType &fm) {
154 |     if (fm.w0.rows() != feature_size) {
155 |       throw std::invalid_argument("feature size mismatch!");
156 |     }
157 |     if (fm.V.cols() != rank) {
158 |       throw std::invalid_argument("rank mismatch!");
159 |     }
160 |     samples.emplace_back(fm);
161 |   }
162 | 
163 |   const size_t rank;
164 |   const size_t feature_size;
165 |   const TASKTYPE type;
166 |   vector<FMType> samples;
167 | };
168 | 
169 | } // namespace myFM
170 | 
--------------------------------------------------------------------------------
/include/myfm/util.hpp:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include "Faddeeva/Faddeeva.hh"
3 | #include "definitions.hpp"
4 | #include <random>
5 | #include <tuple>
6 | 
7 | namespace myFM {
8 | using namespace std;
9 | 
10 | /*
11 | Sample from truncated normal distribution.
12 | https://arxiv.org/pdf/0907.4010.pdf
13 | Proposition 2.3.
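
For intuition, the same left-truncation scheme in a short Python sketch
(ours, not part of this header; it assumes only numpy and mirrors the
template below):

    import numpy as np

    def sample_truncated_normal_left_py(rng: np.random.RandomState, mu_minus: float) -> float:
        # Draw z ~ N(0, 1) conditioned on z > mu_minus.
        if mu_minus < 0:
            # Plain rejection from the standard normal is efficient here.
            while True:
                z = rng.normal(0.0, 1.0)
                if z > mu_minus:
                    return z
        # Otherwise use the shifted-exponential proposal of Proposition 2.3.
        alpha_star = (mu_minus + np.sqrt(mu_minus * mu_minus + 4.0)) / 2.0
        while True:
            z = -np.log(rng.uniform()) / alpha_star + mu_minus
            rho = np.exp(-((z - alpha_star) ** 2) / 2.0)
            if rng.uniform() < rho:
                return z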
14 | */
15 | template <typename Real>
16 | inline Real sample_truncated_normal_left(mt19937 &gen, Real mu_minus) {
17 |   if (mu_minus < 0) {
18 |     normal_distribution<Real> dist(0, 1);
19 |     while (true) {
20 |       Real z = dist(gen);
21 |       if (z > mu_minus) {
22 |         return z;
23 |       }
24 |     }
25 |   } else {
26 |     Real alpha_star = (mu_minus + std::sqrt(mu_minus * mu_minus + 4)) / 2;
27 |     uniform_real_distribution<Real> dist(0, 1);
28 |     while (true) {
29 |       Real z = -std::log(dist(gen)) / alpha_star + mu_minus;
30 |       Real rho = std::exp(-(z - alpha_star) * (z - alpha_star) / 2);
31 |       Real u = dist(gen);
32 |       if (u < rho) {
33 |         return z;
34 |       }
35 |     }
36 |   }
37 | }
38 | 
39 | template <typename Real>
40 | inline Real sample_truncated_normal_twoside(mt19937 &gen, Real mu_minus,
41 |                                             Real mu_plus) {
42 |   uniform_real_distribution<Real> proposal(mu_minus, mu_plus);
43 |   uniform_real_distribution<Real> acceptance(0, 1);
44 |   Real rho;
45 |   while (true) {
46 |     Real z = proposal(gen);
47 |     if ((mu_minus <= static_cast<Real>(0)) &&
48 |         (mu_plus >= static_cast<Real>(0))) {
49 |       rho = std::exp(-z * z / 2);
50 |     } else if (mu_plus < static_cast<Real>(0)) {
51 |       rho = std::exp((mu_plus * mu_plus - z * z) / 2);
52 |     } else {
53 |       rho = std::exp((mu_minus * mu_minus - z * z) / 2);
54 |     }
55 |     Real u = acceptance(gen);
56 |     if (u < rho) {
57 |       return z;
58 |     }
59 |   }
60 | }
61 | template <typename Real>
62 | inline Real sample_truncated_normal_left(mt19937 &gen, Real mean, Real std,
63 |                                          Real mu_minus) {
64 |   return mean +
65 |          std * sample_truncated_normal_left(gen, (mu_minus - mean) / std);
66 | }
67 | 
68 | template <typename Real>
69 | inline Real sample_truncated_normal_right(mt19937 &gen, Real mu_plus) {
70 |   return -sample_truncated_normal_left(gen, -mu_plus);
71 | }
72 | 
73 | template <typename Real>
74 | inline Real sample_truncated_normal_right(mt19937 &gen, Real mean, Real std,
75 |                                           Real mu_plus) {
76 |   return mean +
77 |          std * sample_truncated_normal_right(gen, (mu_plus - mean) / std);
78 | }
79 | 
80 | template <typename Real>
81 | inline std::tuple<Real, Real, Real> mean_var_truncated_normal_left(Real mu) {
82 |   static constexpr Real SQRT2 = 1.4142135623730951;
83 |   static constexpr Real SQRTPI = 1.7724538509055159;
84 |   static constexpr Real SQRT2PI = SQRT2 * SQRTPI;
85 | 
86 |   // mean, variance, log(Z)
87 | 
88 |   /*
89 |   q(z) = 1{z > 0} exp( - \frac{1}{2}(z-\mu)^2) / Z
90 |   Z = 1 - \Phi(-\mu)
91 |   E_q[z] = \mu + 1/\sqrt{2\pi} exp(-\mu^2/2) / (1 - \Phi(-\mu))
92 |   */
93 |   Real phi_Z;
94 |   Real lnZ;
95 |   Real mu_square = mu * mu / 2;
96 |   if (mu > 0) {
97 |     Real Z = (1 - Faddeeva::erf(-mu / SQRT2));
98 |     phi_Z = 2 * std::exp(-mu_square) / SQRT2PI / Z;
99 |     lnZ = std::log(Z);
100 |   } else {
101 |     Real Z = (Faddeeva::erfcx(-mu / SQRT2));
102 |     phi_Z = 2 / Z / SQRT2PI;
103 |     lnZ = std::log(Z) - mu_square;
104 |   }
105 |   std::tuple<Real, Real, Real> result(mu + phi_Z,
106 |                                       1 - mu * phi_Z - phi_Z * phi_Z, lnZ);
107 |   return result;
108 | }
109 | 
110 | template <typename Real>
111 | inline std::tuple<Real, Real, Real> mean_var_truncated_normal_right(Real mu) {
112 |   auto result = mean_var_truncated_normal_left(-mu);
113 |   std::get<0>(result) *= -1;
114 |   return result;
115 | }
116 | 
117 | struct StringBuilder {
118 |   inline StringBuilder() : oss_() {}
119 | 
120 |   template <typename T> inline StringBuilder &add(const T &arg) {
121 |     oss_ << arg;
122 |     return *this;
123 |   }
124 | 
125 |   template <typename T> inline StringBuilder &operator()(const T &arg) {
126 |     oss_ << arg;
127 |     return *this;
128 |   }
129 | 
130 |   template <typename T> inline StringBuilder &space_and_add(const T &arg) {
131 |     oss_ << " " << arg;
132 |     return *this;
133 |   }
134 | 
135 |   template <typename T>
136 |   inline StringBuilder &add(const T &arg, const T &fmt) {
137 |     oss_ << fmt << arg;
138 |     return
*this;
139 |   }
140 | 
141 |   inline string build() { return oss_.str(); }
142 | 
143 | private:
144 |   ostringstream oss_;
145 | };
146 | 
147 | template <typename Real>
148 | inline size_t check_row_consistency_return_column(
149 |     const types::SparseMatrix<Real> &X,
150 |     const vector<relational::RelationBlock<Real>> &relations) {
151 |   size_t row = X.rows();
152 |   size_t col = X.cols();
153 |   int i = 0;
154 |   for (const auto &rel : relations) {
155 |     if (row != rel.original_to_block.size()) {
156 |       throw std::runtime_error(
157 |           (StringBuilder{})("main table has size ")(row)(" but the relation[")(
158 |               i)("] has size ")(rel.original_to_block.size())
159 |               .build());
160 |     }
161 |     col += rel.feature_size;
162 |     i++;
163 |   }
164 |   return col;
165 | }
166 | 
167 | template <typename... Cs> void print_to_stream(std::ostream &ss, Cs &&... args);
168 | 
169 | template <typename C, typename... Cs>
170 | inline void print_to_stream(std::ostream &ss, C &&c0, Cs &&... args) {
171 |   ss << c0;
172 |   print_to_stream(ss, std::forward<Cs>(args)...);
173 | }
174 | 
175 | template <> inline void print_to_stream(std::ostream &ss) {}
176 | 
177 | template <typename... Cs> std::string print_to_string(Cs &&... args) {
178 |   std::stringstream ss;
179 |   print_to_stream(ss, std::forward<Cs>(args)...);
180 |   return ss.str();
181 | }
182 | 
183 | } // namespace myFM
184 | 
--------------------------------------------------------------------------------
/mypy.ini:
--------------------------------------------------------------------------------
1 | [mypy]
2 | # Specify the target platform details in config, so your developers are
3 | # free to run mypy on Windows, Linux, or macOS and get consistent
4 | # results.
5 | python_version=3.6
6 | platform=linux
7 | 
8 | show_column_numbers=True
9 | 
10 | follow_imports=normal
11 | 
12 | # suppress errors about unsatisfied imports
13 | ignore_missing_imports=True
14 | 
15 | # be strict
16 | disallow_untyped_calls=True
17 | warn_return_any=True
18 | strict_optional=True
19 | warn_no_return=True
20 | warn_redundant_casts=True
21 | warn_unused_ignores=True
22 | 
23 | # The following are off by default. Flip them on if you feel
24 | # adventurous.
25 | disallow_untyped_defs=True
26 | check_untyped_defs=True
27 | 
28 | # No incremental mode
29 | cache_dir=/dev/null
30 | 
31 | plugins = numpy.typing.mypy_plugin
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = [
3 |     "setuptools>=42",
4 |     "wheel",
5 |     "pybind11>=2.8.0",
6 |     "httpx",
7 |     "setuptools_scm[toml]>=6.2",
8 | ]
9 | 
10 | build-backend = "setuptools.build_meta"
11 | 
12 | [tool.black]
13 | ensure_newline_before_comments = true
14 | force_grid_wrap = 0
15 | include_trailing_comma = true
16 | line_length = 88
17 | multi_line_output = 3
18 | use_parentheses = true
19 | 
20 | [tool.isort]
21 | ensure_newline_before_comments = true
22 | force_grid_wrap = 0
23 | include_trailing_comma = true
24 | known_third_party = ["pybind11"]
25 | line_length = 88
26 | multi_line_output = 3
27 | use_parentheses = true
28 | 
29 | [tool.pycln]
30 | all = true
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import os
2 | from pathlib import Path
3 | from typing import Any
4 | 
5 | from pybind11.setup_helpers import Pybind11Extension, build_ext
6 | from setuptools import find_packages, setup
7 | 
8 | install_requires = [
9 |     "numpy>=1.11",
10 |     "scipy>=1.0",
11 |     "tqdm>=4",
12 |     "pandas>=1.0.0",
13 |     "typing-extensions>=4.0.0",
14 | ]
15 | 
16 | CURRENT_DIR = Path(__file__).resolve().parent
17 | README_FILE = CURRENT_DIR / "README.md"
18 | 
19 | 
20 | class get_eigen_include(object):
21 |     EIGEN3_URL = "https://gitlab.com/libeigen/eigen/-/archive/3.4.0/eigen-3.4.0.zip"
22 |     EIGEN3_DIRNAME = "eigen-3.4.0"
23 | 
24 |     def __str__(self) -> str:
25 |         eigen_include_dir = os.environ.get("EIGEN3_INCLUDE_DIR", None)
26 |         if eigen_include_dir is not None:
27 |             return eigen_include_dir
28 | 
29 |         basedir = Path(__file__).resolve().parent
30 |         target_dir = basedir / self.EIGEN3_DIRNAME
31 |         if target_dir.exists():
32 |             return str(target_dir)
33 | 
34 |         download_target_dir = basedir / "eigen3.zip"
35 |         import zipfile
36 | 
37 |         import httpx
38 | 
39 |         print("Start downloading Eigen library from {}.".format(self.EIGEN3_URL))
40 |         with httpx.stream("GET", self.EIGEN3_URL, verify=False) as response:
41 |             with download_target_dir.open("wb") as ofs:
42 |                 for chunk in response.iter_bytes(chunk_size=1024):
43 |                     ofs.write(chunk)
44 |         print("Downloaded Eigen into {}.".format(download_target_dir))
45 | 
46 |         with zipfile.ZipFile(download_target_dir) as ifs:
47 |             ifs.extractall()
48 | 
49 |         return str(target_dir)
50 | 
51 | 
52 | headers = [
53 |     "include/myfm/definitions.hpp",
54 |     "include/myfm/util.hpp",
55 |     "include/myfm/FM.hpp",
56 |     "include/myfm/HyperParams.hpp",
57 |     "include/myfm/predictor.hpp",
58 |     "include/myfm/FMTrainer.hpp",
59 |     "include/myfm/FMLearningConfig.hpp",
60 |     "include/myfm/OProbitSampler.hpp",
61 |     "include/Faddeeva/Faddeeva.hh",
62 |     "cpp_source/declare_module.hpp",
63 | ]
64 | 
65 | 
66 | ext_modules = [
67 |     Pybind11Extension(
68 |         "myfm._myfm",
69 |         ["cpp_source/bind.cpp", "cpp_source/Faddeeva.cc"],
70 |         include_dirs=[
71 |             # Eigen headers (downloaded on demand) and myFM headers
72 |             get_eigen_include(),
73 |             "include",
74 |         ],
75 |     ),
76 | ]
77 | 
78 | 
79 | def local_scheme(version: Any) -> str:
80 |     return ""
81 | 
82 | 
83 | setup(
84 |     name="myfm",
85 |     use_scm_version={"local_scheme": local_scheme},
86 |     author="Tomoki Ohtsuki",
87 | 
url="https://github.com/tohtsky/myfm", 88 | author_email="tomoki.ohtsuki.19937@outlook.jp", 89 | description="Yet another Bayesian factorization machines.", 90 | long_description=README_FILE.read_text(), 91 | long_description_content_type="text/markdown", 92 | ext_modules=ext_modules, 93 | install_requires=install_requires, 94 | cmdclass={"build_ext": build_ext}, 95 | package_dir={"": "src"}, 96 | zip_safe=False, 97 | headers=headers, 98 | python_requires=">=3.6", 99 | packages=find_packages("src"), 100 | package_data={"myfm": ["*.pyi"]}, 101 | ) 102 | -------------------------------------------------------------------------------- /src/myfm/__init__.py: -------------------------------------------------------------------------------- 1 | from pkg_resources import DistributionNotFound, get_distribution # type: ignore 2 | 3 | try: 4 | __version__ = get_distribution("myfm").version 5 | except DistributionNotFound: # pragma: no cover 6 | # package is not installed 7 | pass # pragma: no cover 8 | 9 | from ._myfm import RelationBlock 10 | from .gibbs import MyFMGibbsClassifier, MyFMGibbsRegressor, MyFMOrderedProbit 11 | from .variational import VariationalFMClassifier, VariationalFMRegressor 12 | 13 | MyFMRegressor = MyFMGibbsRegressor 14 | MyFMClassifier = MyFMGibbsClassifier 15 | 16 | __all__ = [ 17 | "RelationBlock", 18 | "MyFMOrderedProbit", 19 | "MyFMRegressor", 20 | "MyFMClassifier", 21 | "MyFMGibbsRegressor", 22 | "MyFMGibbsClassifier", 23 | "VariationalFMRegressor", 24 | "VariationalFMClassifier", 25 | ] 26 | -------------------------------------------------------------------------------- /src/myfm/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tohtsky/myFM/b9ba70ea38d9370d3ad50a9d25b2ff825eaa30ef/src/myfm/utils/__init__.py -------------------------------------------------------------------------------- /src/myfm/utils/benchmark_data/__init__.py: -------------------------------------------------------------------------------- 1 | from .movielens1M_data import MovieLens1MDataManager 2 | from .movielens10M_data import MovieLens10MDataManager 3 | from .movielens100k_data import MovieLens100kDataManager 4 | 5 | __all__ = [ 6 | "MovieLens100kDataManager", 7 | "MovieLens1MDataManager", 8 | "MovieLens10MDataManager", 9 | ] 10 | -------------------------------------------------------------------------------- /src/myfm/utils/benchmark_data/loader_base.py: -------------------------------------------------------------------------------- 1 | import urllib.request 2 | from abc import ABC, abstractmethod, abstractproperty 3 | from pathlib import Path 4 | from typing import Optional, Tuple 5 | from zipfile import ZipFile 6 | 7 | import numpy as np 8 | import pandas as pd 9 | from numpy.random import RandomState 10 | 11 | 12 | def train_test_split_with_kfold( 13 | df: pd.DataFrame, 14 | K: int, 15 | fold: int, 16 | random_state: Optional[int] = None, 17 | ) -> Tuple[pd.DataFrame, pd.DataFrame]: 18 | rns = RandomState(random_state) 19 | if not ((0 <= fold) and (fold < K)): 20 | raise ValueError("0 <= fold < K") 21 | n_data = df.shape[0] 22 | n_test = n_data // K 23 | if fold < (n_data % K): 24 | n_test += 1 25 | index = np.arange(df.shape[0]) 26 | rns.shuffle(index) 27 | df = df.iloc[index] 28 | test_start_position = (n_data // K) * fold + min((n_data % K), fold) 29 | test_end_position = test_start_position + n_test 30 | return ( 31 | pd.concat( 32 | [df.iloc[:test_start_position], df.iloc[test_end_position:]] 33 | 
        ).reset_index(drop=True),
34 |         df.iloc[test_start_position:test_end_position].reset_index(drop=True),
35 |     )
36 | 
37 | 
38 | class DataLoaderBase(ABC):
39 |     zf: ZipFile
40 | 
41 |     @abstractproperty
42 |     def DOWNLOAD_URL(self) -> str:
43 |         raise NotImplementedError("must be implemented")  # pragma: no cover
44 | 
45 |     @abstractproperty
46 |     def DEFAULT_PATH(self) -> Path:
47 |         raise NotImplementedError("must be implemented")  # pragma: no cover
48 | 
49 |     def __init__(self, zippath: Optional[Path] = None):
50 |         zippath = Path(zippath or self.DEFAULT_PATH)
51 |         if not zippath.exists():
52 |             permission = input(
53 |                 "Could not find {}.\nCan I download and save it there?[y/N]".format(
54 |                     zippath
55 |                 )
56 |             ).lower()
57 |             download = permission == "y"
58 |             if download:
59 |                 print("start download...")
60 |                 urllib.request.urlretrieve(self.DOWNLOAD_URL, zippath)
61 |                 print("complete")
62 |             else:
63 |                 raise RuntimeError("abort.")
64 |         self.zf = ZipFile(zippath)
65 | 
66 | 
67 | class MovieLensBase(DataLoaderBase, ABC):
68 |     @abstractmethod
69 |     def load_rating_all(self) -> pd.DataFrame:
70 |         raise NotImplementedError("must be implemented")
71 | 
72 |     def load_rating_kfold_split(
73 |         self, K: int, fold: int, random_state: Optional[int] = 0
74 |     ) -> Tuple[pd.DataFrame, pd.DataFrame]:
75 |         """Load the entire dataset and split it into train/test sets
76 |         using a K-fold scheme.
77 | 
78 |         Parameters
79 |         ----------
80 |         K : int
81 |             K in the K-fold splitting scheme.
82 |         fold : int
83 |             fold index.
84 |         random_state : Optional[int], optional
85 |             Controls the random state of the split.
86 | 
87 |         Returns
88 |         -------
89 |         Tuple[pd.DataFrame, pd.DataFrame]
90 |             train and test dataframes.
91 | 
92 |         Raises
93 |         ------
94 |         ValueError
95 |             When 0 <= fold < K is not met.
96 |         """
97 |         if not ((0 <= fold) and (fold < K)):
98 |             raise ValueError("0 <= fold < K")
99 |         df_all = self.load_rating_all()
100 |         return train_test_split_with_kfold(df_all, K, fold, random_state)
101 | 
--------------------------------------------------------------------------------
/src/myfm/utils/benchmark_data/movielens100k_data.py:
--------------------------------------------------------------------------------
1 | from io import BytesIO
2 | from pathlib import Path
3 | from typing import List, Tuple
4 | 
5 | import pandas as pd
6 | 
7 | from .loader_base import MovieLensBase
8 | 
9 | 
10 | class MovieLens100kDataManager(MovieLensBase):
11 |     """The data manager for the MovieLens 100k dataset."""
12 | 
13 |     @property
14 |     def DOWNLOAD_URL(self) -> str:
15 |         return "http://files.grouplens.org/datasets/movielens/ml-100k.zip"
16 | 
17 |     @property
18 |     def DEFAULT_PATH(self) -> Path:
19 |         return Path("~/.ml-100k.zip").expanduser()
20 | 
21 |     def _read_interaction(self, byte_stream: bytes) -> pd.DataFrame:
22 |         with BytesIO(byte_stream) as ifs:
23 |             data = pd.read_csv(
24 |                 ifs,
25 |                 sep="\t",
26 |                 header=None,
27 |                 names=["user_id", "movie_id", "rating", "timestamp"],
28 |             )
29 |             data["timestamp"] = pd.to_datetime(data["timestamp"], unit="s")
30 |             return data
31 | 
32 |     def load_rating_all(self) -> pd.DataFrame:
33 |         """Load the entire rating dataset.
34 | 
35 |         Returns
36 |         -------
37 |         pd.DataFrame
38 |             all the available ratings.
39 |         """
40 |         return self._read_interaction(self.zf.read("ml-100k/u.data"))
41 | 
42 |     def load_rating_predefined_split(
43 |         self,
44 |         fold: int,
45 |     ) -> Tuple[pd.DataFrame, pd.DataFrame]:
46 |         """Read the pre-defined train/test split.
47 |         Fold index ranges from 1 to 5.
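
        As a quick, hypothetical illustration (assuming the zip archive is
        already available at the default location):

            dm = MovieLens100kDataManager()
            df_train, df_test = dm.load_rating_predefined_split(fold=1)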
48 | 
49 |         Parameters
50 |         ----------
51 |         fold : int
52 |             specifies the fold index.
53 | 
54 |         Returns
55 |         -------
56 |         Tuple[pd.DataFrame, pd.DataFrame]
57 |             train and test dataframes.
58 | 
59 |         """
60 |         assert fold >= 1 and fold <= 5
61 |         train_path = "ml-100k/u{}.base".format(fold)
62 |         test_path = "ml-100k/u{}.test".format(fold)
63 |         df_train = self._read_interaction(self.zf.read(train_path))
64 |         df_test = self._read_interaction(self.zf.read(test_path))
65 | 
66 |         return df_train, df_test
67 | 
68 |     def load_user_info(self) -> pd.DataFrame:
69 |         """Load user meta information.
70 | 
71 |         Returns
72 |         -------
73 |         pd.DataFrame
74 |             user information
75 |         """
76 |         user_info_bytes = self.zf.read("ml-100k/u.user")
77 |         with BytesIO(user_info_bytes) as ifs:
78 |             return pd.read_csv(
79 |                 ifs,
80 |                 sep="|",
81 |                 header=None,
82 |                 names=["user_id", "age", "gender", "occupation", "zipcode"],
83 |             )
84 | 
85 |     def genres(self) -> List[str]:
86 |         with BytesIO(self.zf.read("ml-100k/u.genre")) as ifs:
87 |             genres: List[str] = list(pd.read_csv(ifs, sep="|", header=None)[0])
88 |             return genres
89 | 
90 |     def load_movie_info(self) -> pd.DataFrame:
91 |         r"""Load movie meta information.
92 | 
93 |         Returns
94 |         -------
95 |         pd.DataFrame
96 |             A dataframe containing meta-information (id, title, release_date, url, genres) about the movies.
97 |             Multiple genres per movie will be concatenated by "|".
98 |         """
99 |         MOVIE_COLUMNS = ["movie_id", "title", "release_date", "unk", "url"]
100 |         genres = self.genres()
101 | 
102 |         with BytesIO(self.zf.read("ml-100k/u.item")) as ifs:
103 |             df_mov = pd.read_csv(
104 |                 ifs,
105 |                 sep="|",
106 |                 encoding="latin-1",
107 |                 header=None,
108 |             )
109 |         df_mov.columns = MOVIE_COLUMNS + genres
110 |         df_mov["release_date"] = pd.to_datetime(df_mov.release_date)
111 |         movie_index, genre_index = df_mov[genres].values.nonzero()
112 |         genre_df = (
113 |             (
114 |                 pd.DataFrame(
115 |                     dict(
116 |                         movie_id=df_mov.movie_id.values[movie_index],
117 |                         genre=[genres[i] for i in genre_index],
118 |                     )
119 |                 )
120 |                 .groupby("movie_id")
121 |                 .genre.agg(lambda x: "|".join(x))
122 |             )
123 |             .reindex(df_mov.movie_id)
124 |             .fillna("")
125 |         )
126 |         df_mov["genres"] = genre_df.values
127 |         return df_mov
--------------------------------------------------------------------------------
/src/myfm/utils/benchmark_data/movielens10M_data.py:
--------------------------------------------------------------------------------
1 | from io import BytesIO
2 | from pathlib import Path
3 | 
4 | import pandas as pd
5 | 
6 | from .loader_base import MovieLensBase
7 | from .movielens1M_data import read_ml1m10m_df
8 | 
9 | 
10 | class MovieLens10MDataManager(MovieLensBase):
11 |     DOWNLOAD_URL = "http://files.grouplens.org/datasets/movielens/ml-10m.zip"
12 |     DEFAULT_PATH = Path("~/.ml-10m.zip").expanduser()
13 | 
14 |     def load_rating_all(self) -> pd.DataFrame:
15 |         with BytesIO(self.zf.read("ml-10M100K/ratings.dat")) as ifs:
16 |             return read_ml1m10m_df(ifs)
--------------------------------------------------------------------------------
/src/myfm/utils/benchmark_data/movielens1M_data.py:
--------------------------------------------------------------------------------
1 | from io import BytesIO
2 | from pathlib import Path
3 | 
4 | import pandas as pd
5 | 
6 | from .movielens100k_data import MovieLensBase
7 | 
8 | 
9 | def read_ml1m10m_df(ifs: BytesIO) -> pd.DataFrame:
10 |     r"""A hacky function to read the MovieLens 1M/10M rating files using pandas' native parser.
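    The raw files separate the four fields by "::", which pandas' C parser
    cannot treat as a single separator. Reading with sep=":" therefore yields
    empty odd-numbered columns, and keeping columns [0, 2, 4, 6] recovers the
    four fields; e.g. the line "1::1193::5::978300760" parses into the seven
    columns 1, NaN, 1193, NaN, 5, NaN, 978300760, of which [0, 2, 4, 6] are kept.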
11 | This hack is taken from irspack: https://github.com/tohtsky/irspack/blob/a1893be54200b0dc765957220deeccc1764fe39c/irspack/dataset/movielens/ML1M.py 12 | """ 13 | df = pd.read_csv( 14 | ifs, 15 | sep=":", 16 | header=None, 17 | )[[0, 2, 4, 6]].copy() 18 | 19 | df.columns = ["user_id", "movie_id", "rating", "timestamp"] 20 | df["timestamp"] = pd.to_datetime(df.timestamp, unit="s") 21 | return df 22 | 23 | 24 | class MovieLens1MDataManager(MovieLensBase): 25 | DOWNLOAD_URL = "http://files.grouplens.org/datasets/movielens/ml-1m.zip" 26 | DEFAULT_PATH = Path("~/.ml-1m.zip").expanduser() 27 | 28 | def load_rating_all(self) -> pd.DataFrame: 29 | """Read all (1M) interactions. 30 | 31 | Returns 32 | ------- 33 | pd.DataFrame 34 | Movielens 1M rating dataframe. 35 | """ 36 | with BytesIO(self.zf.read("ml-1m/ratings.dat")) as ifs: 37 | return read_ml1m10m_df(ifs) 38 | -------------------------------------------------------------------------------- /src/myfm/utils/callbacks/__init__.py: -------------------------------------------------------------------------------- 1 | from .libfm import ( 2 | ClassificationCallback, 3 | LibFMLikeCallbackBase, 4 | OrderedProbitCallback, 5 | RegressionCallback, 6 | ) 7 | 8 | __all__ = [ 9 | "LibFMLikeCallbackBase", 10 | "OrderedProbitCallback", 11 | "ClassificationCallback", 12 | "RegressionCallback", 13 | ] 14 | -------------------------------------------------------------------------------- /src/myfm/utils/callbacks/libfm.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from collections import OrderedDict 3 | from typing import Dict, List, Optional, Tuple 4 | 5 | import numpy as np 6 | import pandas as pd 7 | from scipy import sparse as sps 8 | 9 | from myfm._myfm import FM, FMHyperParameters, LearningHistory, RelationBlock 10 | from myfm.base import REAL, ArrayLike, check_data_consistency, std_cdf 11 | 12 | 13 | class LibFMLikeCallbackBase(ABC): 14 | def __init__( 15 | self, 16 | n_iter: int, 17 | X_test: Optional[ArrayLike], 18 | X_rel_test: List[RelationBlock], 19 | y_test: np.ndarray, 20 | trace_path: Optional[str] = None, 21 | ): 22 | """Provides a LibFM-like callback after each iteration. 
23 | This will be helpful when we cannot afford enough memory to store 24 | all posterior samples.""" 25 | self.n_test_data = check_data_consistency(X_test, X_rel_test) 26 | 27 | self.n_iter = n_iter 28 | if X_test is not None: 29 | self.X_test = sps.csr_matrix(X_test, dtype=REAL) 30 | else: 31 | self.X_test = sps.csr_matrix((self.n_test_data, 0), dtype=REAL) 32 | self.X_rel_test = X_rel_test 33 | self.y_test: np.ndarray = y_test 34 | self.result_trace: List[Dict[str, float]] = [] 35 | self.trace_path = trace_path 36 | self.n_samples = 0 37 | 38 | @abstractmethod 39 | def _measure_score( 40 | self, i: int, fm: FM, hyper: FMHyperParameters 41 | ) -> Tuple[str, Dict[str, float]]: 42 | raise NotImplementedError("must be implemented") 43 | 44 | def __call__( 45 | self, i: int, fm: FM, hyper: FMHyperParameters, history: LearningHistory 46 | ) -> Tuple[bool, Optional[str]]: 47 | description, trace_result = self._measure_score(i, fm, hyper) 48 | self.result_trace.append(trace_result) 49 | 50 | if self.trace_path is not None: 51 | df = pd.DataFrame(self.result_trace) 52 | df.to_csv(self.trace_path, index=False) 53 | 54 | return False, description 55 | 56 | 57 | class RegressionCallback(LibFMLikeCallbackBase): 58 | def __init__( 59 | self, 60 | n_iter: int, 61 | X_test: Optional[ArrayLike], 62 | y_test: np.ndarray, 63 | X_rel_test: List[RelationBlock] = [], 64 | clip_min: Optional[float] = None, 65 | clip_max: Optional[float] = None, 66 | trace_path: Optional[str] = None, 67 | ): 68 | super(RegressionCallback, self).__init__( 69 | n_iter, X_test, X_rel_test, y_test, trace_path=trace_path 70 | ) 71 | self.predictions = np.zeros((self.n_test_data,), dtype=np.float64) 72 | self.prediction_all_but_5 = np.zeros((self.n_test_data,), dtype=np.float64) 73 | self.clip_min = clip_min 74 | self.clip_max = clip_max 75 | 76 | def clip_value(self, arr: np.ndarray) -> None: 77 | if self.clip_min is not None: 78 | arr[arr <= self.clip_min] = self.clip_min 79 | if self.clip_max is not None: 80 | arr[arr >= self.clip_max] = self.clip_max 81 | 82 | def _measure_score( 83 | self, i: int, fm: FM, hyper: FMHyperParameters 84 | ) -> Tuple[str, Dict[str, float]]: 85 | score = fm.predict_score(self.X_test, self.X_rel_test) 86 | self.predictions += score 87 | self.n_samples += 1 88 | prediction_mean = self.predictions / self.n_samples 89 | self.clip_value(prediction_mean) 90 | if i >= 5: 91 | self.prediction_all_but_5 += score 92 | prediction_mean_all_but_5 = self.prediction_all_but_5 / (i + 1 - 5) 93 | self.clip_value(prediction_mean_all_but_5) 94 | rmse_all_but_5 = float( 95 | ((self.y_test - prediction_mean_all_but_5) ** 2).mean() ** 0.5 96 | ) 97 | else: 98 | rmse_all_but_5 = float("nan") 99 | 100 | rmse = float(((self.y_test - prediction_mean) ** 2).mean() ** 0.5) 101 | rmse_this = float(((self.y_test - score) ** 2).mean() ** 0.5) 102 | description = "alpha={0:.4f}, rmse_mean={1:.4f}, rmse_this={2:.4f}, rmse_all_but_5={3:.4f}".format( 103 | hyper.alpha, rmse, rmse_this, rmse_all_but_5 104 | ) 105 | result = OrderedDict( 106 | [ 107 | ("alpha", hyper.alpha), 108 | ("rmse", rmse), 109 | ("rmse_this", rmse_this), 110 | ("rmse_all_but_5", rmse_all_but_5), 111 | ] 112 | ) 113 | return description, result 114 | 115 | 116 | class ClassificationCallback(LibFMLikeCallbackBase): 117 | def __init__( 118 | self, 119 | n_iter: int, 120 | X_test: Optional[ArrayLike], 121 | y_test: np.ndarray, 122 | X_rel_test: List[RelationBlock] = [], 123 | eps: Optional[float] = 1e-15, 124 | trace_path: Optional[str] = None, 125 | ): 126 | 
super(ClassificationCallback, self).__init__( 127 | n_iter, X_test, X_rel_test, y_test, trace_path=trace_path 128 | ) 129 | self.predictions = np.zeros((self.n_test_data,), dtype=np.float64) 130 | self.prediction_all_but_5 = np.zeros((self.n_test_data,), dtype=np.float64) 131 | self.eps = eps 132 | 133 | def clip_value(self, arr: np.ndarray) -> None: 134 | if self.eps is not None: 135 | arr[arr <= self.eps] = self.eps 136 | arr[arr >= (1 - self.eps)] = 1 - self.eps 137 | 138 | def __log_loss(self, arr: np.ndarray) -> float: 139 | result = 0 140 | result += np.log(arr[self.y_test == 1]).sum() 141 | result += np.log(1 - arr[self.y_test == 0]).sum() 142 | return -result 143 | 144 | def __accuracy(self, arr: np.ndarray) -> float: 145 | return float((self.y_test == (arr >= 0.5)).mean()) 146 | 147 | def _measure_score( 148 | self, i: int, fm: FM, hyper: FMHyperParameters 149 | ) -> Tuple[str, Dict[str, float]]: 150 | prob_this = std_cdf(fm.predict_score(self.X_test, self.X_rel_test)) 151 | self.predictions += prob_this 152 | self.n_samples += 1 153 | prediction_mean = self.predictions / self.n_samples 154 | self.clip_value(prediction_mean) 155 | if i >= 5: 156 | self.prediction_all_but_5 += prob_this 157 | prediction_mean_all_but_5 = self.prediction_all_but_5 / (i + 1 - 5) 158 | self.clip_value(prediction_mean_all_but_5) 159 | ll_all_but_5 = self.__log_loss(prediction_mean_all_but_5) 160 | accuracy_all_but_5 = self.__accuracy(prediction_mean_all_but_5) 161 | else: 162 | ll_all_but_5 = float("nan") 163 | accuracy_all_but_5 = float("nan") 164 | 165 | ll = self.__log_loss(prediction_mean) 166 | accuracy = self.__accuracy(prediction_mean) 167 | ll_this = self.__log_loss(prob_this) 168 | accuracy_this = self.__accuracy(prob_this) 169 | description = "ll_mean={0:.4f}, ll_this={1:.4f}, ll_all_but_5={2:.4f}".format( 170 | ll, ll_this, ll_all_but_5 171 | ) 172 | result = OrderedDict( 173 | [ 174 | ("log_loss", ll), 175 | ("log_loss_this", ll_this), 176 | ("log_loss_all_but_5", ll_all_but_5), 177 | ("accuracy", accuracy), 178 | ("accuracy_this", accuracy_this), 179 | ("accuracy_all_but_5", accuracy_all_but_5), 180 | ] 181 | ) 182 | return description, result 183 | 184 | 185 | class OrderedProbitCallback(LibFMLikeCallbackBase): 186 | def __init__( 187 | self, 188 | n_iter: int, 189 | X_test: Optional[ArrayLike], 190 | y_test: np.ndarray, 191 | n_class: int, 192 | X_rel_test: List[RelationBlock] = [], 193 | eps: Optional[float] = 1e-15, 194 | trace_path: Optional[str] = None, 195 | ): 196 | super(OrderedProbitCallback, self).__init__( 197 | n_iter, X_test, X_rel_test, y_test, trace_path=trace_path 198 | ) 199 | self.predictions = np.zeros((self.n_test_data, n_class), dtype=np.float64) 200 | self.prediction_all_but_5 = np.zeros( 201 | (self.n_test_data, n_class), dtype=np.float64 202 | ) 203 | self.n_class = n_class 204 | self.eps = eps 205 | self.y_test = self.y_test.astype(np.int32) 206 | assert (self.y_test.min() >= 0) and (self.y_test.max() <= (self.n_class - 1)) 207 | 208 | def __log_loss(self, arr: np.ndarray) -> float: 209 | ps = arr[np.arange(self.y_test.shape[0]), self.y_test].copy() 210 | ps[ps <= self.eps] = self.eps 211 | return -float(np.log(ps).sum()) 212 | 213 | def __accuracy(self, arr: np.ndarray) -> float: 214 | return float((self.y_test == (arr.argmax(axis=1))).mean()) 215 | 216 | def __rmse(self, arr: np.ndarray) -> float: 217 | result: float = ( 218 | float(((self.y_test - arr.dot(np.arange(self.n_class))) ** 2).mean()) ** 0.5 219 | ) 220 | return result 221 | 222 | def _measure_score( 
223 | self, i: int, fm: FM, hyper: FMHyperParameters 224 | ) -> Tuple[str, Dict[str, float]]: 225 | prob_this = fm.oprobit_predict_proba(self.X_test, self.X_rel_test, 0) 226 | self.predictions += prob_this 227 | self.n_samples += 1 228 | prediction_mean = self.predictions / self.n_samples 229 | if i >= 5: 230 | self.prediction_all_but_5 += prob_this 231 | prediction_mean_all_but_5 = self.prediction_all_but_5 / (i + 1 - 5) 232 | ll_all_but_5 = self.__log_loss(prediction_mean_all_but_5) 233 | accuracy_all_but_5 = self.__accuracy(prediction_mean_all_but_5) 234 | rmse_all_but_5 = self.__rmse(prediction_mean_all_but_5) 235 | else: 236 | ll_all_but_5 = float("nan") 237 | accuracy_all_but_5 = float("nan") 238 | rmse_all_but_5 = float("nan") 239 | 240 | ll = self.__log_loss(prediction_mean) 241 | accuracy = self.__accuracy(prediction_mean) 242 | rmse = self.__rmse(prediction_mean) 243 | ll_this = self.__log_loss(prob_this) 244 | accuracy_this = self.__accuracy(prob_this) 245 | rmse_this = self.__rmse(prob_this) 246 | description = "ll_mean={0:.4f}, ll_this={1:.4f}, ll_all_but_5={2:.4f}".format( 247 | ll, ll_this, ll_all_but_5 248 | ) 249 | result = OrderedDict( 250 | [ 251 | ("log_loss", ll), 252 | ("log_loss_this", ll_this), 253 | ("log_loss_all_but_5", ll_all_but_5), 254 | ("accuracy", accuracy), 255 | ("accuracy_this", accuracy_this), 256 | ("accuracy_all_but_5", accuracy_all_but_5), 257 | ("rmse", rmse), 258 | ("rmse_this", rmse_this), 259 | ("rmse_all_but_5", rmse_all_but_5), 260 | ] 261 | ) 262 | return description, result 263 | -------------------------------------------------------------------------------- /src/myfm/utils/dummy_data.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple 2 | 3 | import numpy as np 4 | import pandas as pd 5 | from scipy import sparse as sps 6 | 7 | from myfm.base import DenseArray, RelationBlock 8 | 9 | 10 | def gen_dummy_rating_df( 11 | random_seed: int = 0, 12 | factor_rank: int = 3, 13 | size: int = 100, 14 | user_colname: str = "userId", 15 | item_colname: str = "itemId", 16 | timestamp_colname: str = "timestamp", 17 | rating_colname: str = "rating", 18 | ) -> pd.DataFrame: 19 | rns = np.random.RandomState(random_seed) 20 | user_indices_all = np.arange(max(int(size / 3), 10)) 21 | item_indices_all = np.arange(max(int(size / 2), 10)) 22 | user_factor = rns.normal( 23 | 0, 1 / factor_rank**0.5, size=(user_indices_all.shape[0], factor_rank) 24 | ) 25 | item_factor = rns.normal(0, 1, size=(item_indices_all.shape[0], factor_rank)) 26 | 27 | time = pd.Timestamp("2000-01-01") + pd.to_timedelta( 28 | rns.randint(-365, 365, size=size), unit="day" 29 | ) 30 | 31 | result_df = pd.DataFrame( 32 | { 33 | user_colname: rns.choice(user_indices_all, size=size, replace=True) + 1, 34 | item_colname: rns.choice(item_indices_all, size=size, replace=True) + 1, 35 | timestamp_colname: time, 36 | } 37 | ) 38 | score = ( 39 | user_factor[result_df[user_colname].values - 1, :] 40 | * item_factor[result_df[item_colname].values - 1, :] 41 | ).sum(axis=1) 42 | cutpoints: List[float] = list(np.percentile(score, [20, 40, 60, 80])) # type: ignore 43 | rating = np.ones((size,), dtype=np.int64) 44 | for cp in cutpoints: 45 | rating += score >= cp 46 | result_df[rating_colname] = rating 47 | return result_df 48 | 49 | 50 | def gen_dummy_X( 51 | random_seed: int = 0, 52 | factor_rank: int = 3, 53 | size: int = 100, 54 | ) -> Tuple[List[RelationBlock], DenseArray, List[int]]: 55 | user_column = "userId" 56 | item_column = 
"itemId" 57 | rating_column = "rating" 58 | df_ = gen_dummy_rating_df( 59 | random_seed, 60 | factor_rank=factor_rank, 61 | size=size, 62 | user_colname=user_column, 63 | item_colname=item_column, 64 | rating_colname=rating_column, 65 | ) 66 | blocks = [] 67 | shapes = [] 68 | for colname in [user_column, item_column]: 69 | categorical_expression = pd.Categorical(df_[colname]) 70 | X = sps.identity( 71 | len(categorical_expression.categories), dtype=np.float64 72 | ).tocsr() 73 | ind = categorical_expression.codes 74 | blocks.append(RelationBlock(ind, X)) 75 | shapes.append(X.shape[1]) 76 | return (blocks, df_[rating_column].values, shapes) 77 | 78 | 79 | __all__ = ["gen_dummy_rating_df"] 80 | -------------------------------------------------------------------------------- /src/myfm/utils/encoders/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import DataFrameEncoder 2 | from .binning import BinningEncoder 3 | from .categorical import CategoryValueToSparseEncoder 4 | from .multi_value import MultipleValuesToSparseEncoder 5 | 6 | __all__ = [ 7 | "DataFrameEncoder", 8 | "CategoryValueToSparseEncoder", 9 | "BinningEncoder", 10 | "MultipleValuesToSparseEncoder", 11 | ] 12 | -------------------------------------------------------------------------------- /src/myfm/utils/encoders/base.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from collections import OrderedDict 3 | from typing import Any, Dict, List 4 | 5 | import pandas as pd 6 | import scipy.sparse as sps 7 | 8 | 9 | class SparseEncoderBase(ABC): 10 | r"""The base class for encoders into sparse matrices.""" 11 | 12 | @abstractmethod 13 | def to_sparse(self, x: List[Any]) -> sps.csr_matrix: 14 | raise NotImplementedError("must be implemented") # pragma: no cover 15 | 16 | @abstractmethod 17 | def __len__(self) -> int: 18 | raise NotImplementedError("must be implemented") # pragma: no cover 19 | 20 | @abstractmethod 21 | def names(self) -> List[str]: 22 | r"""Description of each non-zero entry.""" 23 | raise NotImplementedError("must be implemented") # pragma: no cover 24 | 25 | 26 | class DataFrameEncoder: 27 | """Encode pandas.DataFrame into concatenated sparse matrices.""" 28 | 29 | def __init__(self) -> None: 30 | r"""Construct the encoders starting from empty one.""" 31 | self.col_encoders: Dict[str, SparseEncoderBase] = OrderedDict() 32 | 33 | def all_names(self) -> List[str]: 34 | return [ 35 | f"{col_name}__{description}" 36 | for col_name, encoder in self.col_encoders.items() 37 | for description in encoder.names() 38 | ] 39 | 40 | @property 41 | def encoder_shapes(self) -> List[int]: 42 | r"""Show how the columns for an encoded CSR matrix are organized. 43 | 44 | Returns 45 | ------- 46 | List[int] 47 | list of length of internal encoders. 48 | """ 49 | return [len(enc) for enc in self.col_encoders.values()] 50 | 51 | def add_column( 52 | self, colname: str, encoder: SparseEncoderBase 53 | ) -> "DataFrameEncoder": 54 | r"""Add a column name to be encoded / encoder pair. 55 | 56 | Parameters 57 | ---------- 58 | colname : str 59 | The column name to be encoded. 60 | encoder : SparseEncoderBase 61 | The corresponding encoder. 62 | """ 63 | self.col_encoders[colname] = encoder 64 | return self 65 | 66 | def encode_df(self, df: pd.DataFrame) -> sps.csr_matrix: 67 | r"""Encode the dataframe into a concatenated CSR matrix. 
68 | 
69 |         Parameters
70 |         ----------
71 |         df : pd.DataFrame
72 |             The source.
73 | 
74 |         Returns
75 |         -------
76 |         sps.csr_matrix
77 |             The result.
78 |         """
79 |         matrices: List[sps.csr_matrix] = []
80 |         for colname, encoder in self.col_encoders.items():
81 |             matrices.append(encoder.to_sparse(df[colname]))
82 | 
83 |         return sps.hstack(matrices, format="csr")
--------------------------------------------------------------------------------
/src/myfm/utils/encoders/binning.py:
--------------------------------------------------------------------------------
1 | from typing import TYPE_CHECKING, List, TypeVar
2 | 
3 | import numpy as np
4 | from scipy import sparse as sps
5 | 
6 | from myfm.base import DenseArray
7 | 
8 | from .base import SparseEncoderBase
9 | 
10 | if TYPE_CHECKING:
11 |     from numpy.typing import ArrayLike
12 | else:
13 |     ArrayLike = object
14 | 
15 | Numeric = TypeVar("Numeric", int, float)
16 | 
17 | 
18 | class BinningEncoder(SparseEncoderBase):
19 |     """The class to one-hot encode a List of numerical values into a sparse matrix representation by binning."""
20 | 
21 |     def __init__(self, x: ArrayLike, n_percentiles: int = 10) -> None:
22 |         """Initializes the encoder by computing the percentile values of the input.
23 | 
24 |         Parameters
25 |         ----------
26 |         x:
27 |             list of numerical values.
28 |         n_percentiles:
29 |             number of percentiles computed against x, by default 10.
30 | 
31 |         """
32 |         if n_percentiles <= 0:
33 |             raise ValueError("n_percentiles must be greater than 0.")
34 |         self.percentages = np.linspace(0, 100, n_percentiles + 2)[1:-1]
35 |         x_arr = np.asfarray(x)
36 |         temp_percentiles: DenseArray = np.percentile(
37 |             x_arr[~np.isnan(x_arr)], self.percentages
38 |         )
39 |         self.percentiles = np.unique(temp_percentiles)
40 | 
41 |     def names(self) -> List[str]:
42 |         return (
43 |             ["NaN"]
44 |             + [f"<={val}" for val in self.percentiles]
45 |             + [f">{self.percentiles[-1]}"]
46 |         )
47 | 
48 |     def to_sparse(self, x: ArrayLike) -> sps.csr_matrix:
49 |         x_array = np.asarray(x, dtype=np.float64)
50 |         N = x_array.shape[0]
51 |         non_na_index = ~np.isnan(x_array)
52 |         x_not_na = x_array[non_na_index]
53 |         cols = np.zeros(N, dtype=np.int64)
54 |         cols[non_na_index] += 1
55 |         for p in self.percentiles:
56 |             cols[non_na_index] += x_not_na > p
57 |         return sps.csr_matrix(
58 |             (np.ones(N, dtype=np.float64), (np.arange(N), cols)),
59 |             shape=(N, len(self)),
60 |         )
61 | 
62 |     def __len__(self) -> int:
63 |         return len(self.percentiles) + 2
--------------------------------------------------------------------------------
/src/myfm/utils/encoders/categorical.py:
--------------------------------------------------------------------------------
1 | from collections import Counter
2 | from typing import Dict, Generic, Iterable, List, Optional, TypeVar, Union
3 | 
4 | import numpy as np
5 | import scipy.sparse as sps
6 | from typing_extensions import Literal
7 | 
8 | from .base import SparseEncoderBase
9 | 
10 | T = TypeVar("T", int, float, str)
11 | 
12 | 
13 | class CategoryValueToSparseEncoder(Generic[T], SparseEncoderBase):
14 |     """The class to one-hot encode a List of items into a sparse matrix representation."""
15 | 
16 |     def __init__(
17 |         self,
18 |         items: Iterable[T],
19 |         min_freq: int = 1,
20 |         handle_unknown: Literal["create", "ignore", "raise"] = "create",
21 |     ):
22 |         r"""Construct the encoder by providing a list of items.
23 | 
24 |         Parameters
25 |         ----------
26 |         items : Iterable[T]
27 |             The items list.
28 |         min_freq : int, optional
29 |             The minimal frequency for an item to be retained in the known items list, by default 1
30 |         handle_unknown: Literal["create", "ignore", "raise"], optional
31 |             How to handle previously unseen values during encoding.
32 |             If "create", then there is a single category named "__UNK__" for unknown values,
33 |             and it is treated as the 0th category.
34 |             If "ignore", such an item will be ignored.
35 |             If "raise", a `KeyError` is raised.
36 |             Defaults to "create".
37 |         """
38 |         counter_ = Counter(items)
39 |         unique_items = sorted([x for x, freq in counter_.items() if freq >= min_freq])
40 |         self._item_index_offset = 1 if handle_unknown == "create" else 0
41 |         self.handle_unknown = handle_unknown
42 |         self._dict: Dict[T, int] = {
43 |             item: i + self._item_index_offset for i, item in enumerate(unique_items)
44 |         }
45 |         self.values: List[Union[str, T]] = []
46 |         if self.handle_unknown == "create":
47 |             self.values.append("__UNK__")
48 |         self.values.extend(unique_items)
49 | 
50 |     def _get_index(self, x: T) -> Optional[int]:
51 |         try:
52 |             return self._dict[x]
53 |         except KeyError:
54 |             if self.handle_unknown == "create":
55 |                 return 0
56 |             elif self.handle_unknown == "ignore":
57 |                 return None
58 |             raise
59 | 
60 |     def __getitem__(self, x: T) -> int:
61 |         result = self._get_index(x)
62 |         if result is None:
63 |             raise KeyError(f"{x} not found.")
64 |         return result
65 | 
66 |     def names(self) -> List[str]:
67 |         return [str(y) for y in self.values]
68 | 
69 |     def to_sparse(self, items: Iterable[T]) -> sps.csr_matrix:
70 |         rows = []
71 |         cols = []
72 |         n_row = 0
73 |         for i, x in enumerate(items):
74 |             n_row += 1
75 |             index = self._get_index(x)
76 |             if index is None:
77 |                 continue
78 |             rows.append(i)
79 |             cols.append(index)
80 |         return sps.csr_matrix(
81 |             (
82 |                 np.ones(len(rows), dtype=np.float64),
83 |                 (rows, cols),
84 |             ),
85 |             shape=(n_row, len(self)),
86 |         )
87 | 
88 |     def __len__(self) -> int:
89 |         return len(self._dict) + self._item_index_offset
--------------------------------------------------------------------------------
/src/myfm/utils/encoders/multi_value.py:
--------------------------------------------------------------------------------
1 | from typing import Iterable
2 | 
3 | import scipy.sparse as sps
4 | from typing_extensions import Literal
5 | 
6 | from .categorical import CategoryValueToSparseEncoder
7 | 
8 | 
9 | class MultipleValuesToSparseEncoder(CategoryValueToSparseEncoder[str]):
10 |     """The class to N-hot encode a List of items into a sparse matrix representation."""
11 | 
12 |     def __init__(
13 |         self,
14 |         items: Iterable[str],
15 |         min_freq: int = 1,
16 |         sep: str = ",",
17 |         normalize: bool = True,
18 |         handle_unknown: Literal["create", "ignore", "raise"] = "create",
19 |     ):
20 |         """Construct the encoder by providing a list of strings,
21 |         each of which is a list of strings concatenated by `sep`.
22 | 
23 |         Parameters
24 |         ----------
25 |         items : Iterable[str]
26 |             Iterable of strings, each of which is a concatenated list of possibly multiple items.
27 |         min_freq : int, optional
28 |             The minimal frequency for an item to be retained in the known items list, by default 1.
29 |         sep: str, optional
30 |             Tells how to split each string back into a list. Defaults to `','`.
31 |         normalize: bool, optional
32 |             If `True`, each non-zero entry in the encoded matrix will be `1 / N ** 0.5`,
33 |             where `N` is the number of non-zero entries in that row. Defaults to `True`.
34 |         handle_unknown: Literal["create", "ignore", "raise"], optional
35 |             How to handle previously unseen values during encoding.
36 |             If "create", then there is a single category named "__UNK__" for unknown values,
37 |             and it is treated as the 0th category.
38 |             If "ignore", such an item will be ignored.
39 |             If "raise", a `KeyError` is raised.
40 |             Defaults to "create".
41 |         """
42 |         items_flatten = [
43 |             y for x in items for y in set(x.split(sep)) if y
44 |         ]  # ignore empty string.
45 |         self.sep = sep
46 |         self.normalize = normalize
47 |         super().__init__(
48 |             items_flatten, min_freq=min_freq, handle_unknown=handle_unknown
49 |         )
50 | 
51 |     def to_sparse(self, items: Iterable[str]) -> sps.csr_matrix:
52 |         indptr = [0]
53 |         indices = []
54 |         data = []
55 |         n_row = 0
56 |         cursor = 0
57 |         for row in items:
58 |             n_row += 1
59 |             row_values = row.split(self.sep)  # avoid shadowing the `items` argument
60 |             indices_local = sorted(
61 |                 list(
62 |                     {
63 |                         index
64 |                         for index in [self._get_index(v) for v in row_values if v]
65 |                         if index is not None
66 |                     }
67 |                 )
68 |             )
69 | 
70 |             if not indices_local:
71 |                 indptr.append(cursor)
72 |                 continue
73 |             n = len(indices_local)
74 |             value = 1.0 / (float(n) ** 0.5) if self.normalize else 1.0
75 |             indices.extend(indices_local)
76 |             data.extend([value] * n)
77 |             cursor += n
78 |             indptr.append(cursor)
79 |         return sps.csr_matrix(
80 |             (data, indices, indptr),
81 |             shape=(n_row, len(self)),
82 |         )
--------------------------------------------------------------------------------
/src/myfm/variational.py:
--------------------------------------------------------------------------------
1 | from typing import Callable, List, Optional, Tuple, TypeVar
2 | 
3 | import numpy as np
4 | import scipy.sparse as sps
5 | 
6 | from ._myfm import (
7 |     ConfigBuilder,
8 |     FMLearningConfig,
9 |     RelationBlock,
10 |     VariationalFM,
11 |     VariationalFMHyperParameters,
12 |     VariationalLearningHistory,
13 |     VariationalPredictor,
14 |     create_train_vfm,
15 | )
16 | from .base import (
17 |     REAL,
18 |     ArrayLike,
19 |     ClassifierMixin,
20 |     MyFMBase,
21 |     RegressorMixin,
22 |     check_data_consistency,
23 | )
24 | 
25 | ArrayOrDenseArray = TypeVar("ArrayOrDenseArray", np.ndarray, float)
26 | 
27 | 
28 | def runtime_error_to_optional(
29 |     fm: "MyFMVariationalBase",
30 |     retrieve_method: Callable[[VariationalFM], ArrayOrDenseArray],
31 | ) -> Optional[ArrayOrDenseArray]:
32 |     try:
33 |         predictor = fm._fetch_predictor()
34 |     except RuntimeError:  # the model has not been fit yet
35 |         return None
36 |     weights = predictor.weights()
37 |     return retrieve_method(weights)
38 | 
39 | 
40 | class MyFMVariationalBase(
41 |     MyFMBase[
42 |         VariationalFM,
43 |         VariationalFMHyperParameters,
44 |         VariationalPredictor,
45 |         VariationalLearningHistory,
46 |     ]
47 | ):
48 |     @property
49 |     def w0_mean(self) -> Optional[float]:
50 |         r"""Mean of variational posterior distribution of global bias `w0`.
51 |         If the model is not fit yet, returns `None`.
52 | 
53 |         Returns:
54 |             Mean of variational posterior distribution of global bias `w0`.
55 |         """
56 | 
57 |         def _retrieve(fm: VariationalFM) -> float:
58 |             return fm.w0
59 | 
60 |         return runtime_error_to_optional(self, _retrieve)
61 | 
62 |     @property
63 |     def w0_var(self) -> Optional[float]:
64 |         r"""Variance of variational posterior distribution of global bias `w0`.
65 |         If the model is not fit yet, returns `None`.
66 | 
67 |         Returns:
68 |             Variance of variational posterior distribution of global bias `w0`.
69 | """ 70 | 71 | def _retrieve(fm: VariationalFM) -> float: 72 | return fm.w0_var 73 | 74 | return runtime_error_to_optional(self, _retrieve) 75 | 76 | @property 77 | def w_mean(self) -> Optional[np.ndarray]: 78 | r"""Mean of variational posterior distribution of linear coefficnent `w`. 79 | If the model is not fit yet, returns `None`. 80 | 81 | Returns: 82 | Mean of variational posterior distribution of linear coefficnent `w`. 83 | """ 84 | 85 | def _retrieve(fm: VariationalFM) -> np.ndarray: 86 | return fm.w 87 | 88 | return runtime_error_to_optional(self, _retrieve) 89 | 90 | @property 91 | def w_var(self) -> Optional[np.ndarray]: 92 | r"""Variance of variational posterior distribution of linear coefficnent `w`. 93 | If the model is not fit yet, returns `None`. 94 | 95 | Returns: 96 | Variance of variational posterior distribution of linear coefficnent `w`. 97 | """ 98 | 99 | def _retrieve(fm: VariationalFM) -> np.ndarray: 100 | return fm.w_var 101 | 102 | return runtime_error_to_optional(self, _retrieve) 103 | 104 | @property 105 | def V_mean(self) -> Optional[np.ndarray]: 106 | r"""Mean of variational posterior distribution of factorized quadratic coefficnent `V`. 107 | If the model is not fit yet, returns `None`. 108 | 109 | Returns: 110 | Mean of variational posterior distribution of factorized quadratic coefficient `V`. 111 | """ 112 | 113 | def _retrieve(fm: VariationalFM) -> np.ndarray: 114 | return fm.V 115 | 116 | return runtime_error_to_optional(self, _retrieve) 117 | 118 | @property 119 | def V_var(self) -> Optional[np.ndarray]: 120 | r"""Variance of variational posterior distribution of factorized quadratic coefficnent `V`. 121 | If the model is not fit yet, returns `None`. 122 | 123 | Returns: 124 | Variance of variational posterior distribution of factorized quadratic coefficient `V`. 
125 | """ 126 | 127 | def _retrieve(fm: VariationalFM) -> np.ndarray: 128 | return fm.V_var 129 | 130 | return runtime_error_to_optional(self, _retrieve) 131 | 132 | @classmethod 133 | def _train_core( 134 | cls, 135 | rank: int, 136 | init_stdev: float, 137 | X: sps.csr_matrix, 138 | X_rel: List[RelationBlock], 139 | y: np.ndarray, 140 | random_seed: int, 141 | config: FMLearningConfig, 142 | callback: Callable[ 143 | [ 144 | int, 145 | VariationalFM, 146 | VariationalFMHyperParameters, 147 | VariationalLearningHistory, 148 | ], 149 | bool, 150 | ], 151 | ) -> Tuple[VariationalPredictor, VariationalLearningHistory]: 152 | return create_train_vfm( 153 | rank, init_stdev, X, X_rel, y, random_seed, config, callback 154 | ) 155 | 156 | def _predict_core( 157 | self, 158 | X: Optional[ArrayLike], 159 | X_rel: List[RelationBlock] = [], 160 | ) -> np.ndarray: 161 | predictor = self._fetch_predictor() 162 | shape = check_data_consistency(X, X_rel) 163 | if X is None: 164 | X = sps.csr_matrix((shape, 0), dtype=REAL) 165 | else: 166 | X = sps.csr_matrix(X) 167 | return predictor.predict(X, X_rel) 168 | 169 | 170 | class VariationalFMRegressor( 171 | RegressorMixin[VariationalFM, VariationalFMHyperParameters], 172 | MyFMVariationalBase, 173 | ): 174 | """Variational Inference for Regression Task.""" 175 | 176 | def fit( 177 | self, 178 | X: ArrayLike, 179 | y: np.ndarray, 180 | X_rel: List[RelationBlock] = [], 181 | X_test: Optional[ArrayLike] = None, 182 | y_test: Optional[np.ndarray] = None, 183 | X_rel_test: List[RelationBlock] = [], 184 | n_iter: int = 100, 185 | grouping: Optional[List[int]] = None, 186 | group_shapes: Optional[List[int]] = None, 187 | callback: Optional[ 188 | Callable[ 189 | [ 190 | int, 191 | VariationalFM, 192 | VariationalFMHyperParameters, 193 | VariationalLearningHistory, 194 | ], 195 | Tuple[bool, Optional[str]], 196 | ] 197 | ] = None, 198 | config_builder: Optional[ConfigBuilder] = None, 199 | ) -> "VariationalFMRegressor": 200 | r"""Performs batch variational inference fit the data. 201 | 202 | Parameters 203 | ---------- 204 | X : 2D array-like. 205 | Input variable. 206 | 207 | y : 1D array-like. 208 | Target variable. 209 | 210 | X_rel: list of RelationBlock, optional (default=[]) 211 | Relation blocks which supplements X. 212 | 213 | n_iter : int, optional (default = 100) 214 | Iterations to perform. 215 | 216 | grouping: Integer List, optional (default = None) 217 | If not `None`, this specifies which column of X belongs to which group. 218 | That is, if grouping[i] is g, then, :math:`w_i` and :math:`V_{i, r}` 219 | will be distributed according to 220 | :math:`\mathcal{N}(\mu_w[g], \lambda_w[g])` and :math:`\mathcal{N}(\mu_V[g, r], \lambda_V[g,r])`, 221 | respectively. 222 | If `None`, all the columns of X are assumed to belong to a single group, 0. 223 | 224 | group_shapes: Integer array, optional (default = None) 225 | If not `None`, this specifies each variable group's size. 226 | Ignored if grouping is not None. 227 | For example, if ``group_shapes = [n_1, n_2]``, 228 | this is equivalent to ``grouping = [0] * n_1 + [1] * n_2`` 229 | 230 | callback: function(int, fm, hyper, history) -> bool, optional(default = None) 231 | Called at the every end of each Gibbs iteration. 
232 | """ 233 | self._fit( 234 | X, 235 | y, 236 | X_rel=X_rel, 237 | X_test=X_test, 238 | X_rel_test=X_rel_test, 239 | y_test=y_test, 240 | n_iter=n_iter, 241 | grouping=grouping, 242 | callback=callback, 243 | group_shapes=group_shapes, 244 | config_builder=config_builder, 245 | ) 246 | return self 247 | 248 | def predict( 249 | self, X: Optional[ArrayLike], X_rel: List[RelationBlock] = [] 250 | ) -> np.ndarray: 251 | r"""Make a prediction based on variational mean. 252 | 253 | Parameters 254 | ---------- 255 | X : Optional[ArrayLike] 256 | Main Table. When None, treated as a matrix without columns. 257 | X_rel : List[RelationBlock], optional 258 | Relations, by default [] 259 | 260 | Returns 261 | ------- 262 | np.ndarray 263 | [description] 264 | """ 265 | return self._predict_core(X, X_rel) 266 | 267 | 268 | class VariationalFMClassifier( 269 | ClassifierMixin[VariationalFM, VariationalFMHyperParameters], 270 | MyFMVariationalBase, 271 | ): 272 | """Variational Inference for Classification Task.""" 273 | 274 | def fit( 275 | self, 276 | X: ArrayLike, 277 | y: np.ndarray, 278 | X_rel: List[RelationBlock] = [], 279 | X_test: Optional[ArrayLike] = None, 280 | y_test: Optional[np.ndarray] = None, 281 | X_rel_test: List[RelationBlock] = [], 282 | n_iter: int = 100, 283 | grouping: Optional[List[int]] = None, 284 | group_shapes: Optional[List[int]] = None, 285 | callback: Optional[ 286 | Callable[ 287 | [ 288 | int, 289 | VariationalFM, 290 | VariationalFMHyperParameters, 291 | VariationalLearningHistory, 292 | ], 293 | Tuple[bool, Optional[str]], 294 | ] 295 | ] = None, 296 | config_builder: Optional[ConfigBuilder] = None, 297 | ) -> "VariationalFMClassifier": 298 | r"""Performs batch variational inference fit the data. 299 | 300 | Parameters 301 | ---------- 302 | X : Optional[ArrayLike]. 303 | Main table. When None, treated as a matrix without columns. 304 | 305 | y : 1D array-like. 306 | Target variable. 307 | 308 | X_rel: list of RelationBlock, optional (default=[]) 309 | Relation blocks which supplements X. 310 | 311 | n_iter : int, optional (default = 100) 312 | Iterations to perform. 313 | 314 | grouping: Integer List, optional (default = None) 315 | If not `None`, this specifies which column of X belongs to which group. 316 | That is, if grouping[i] is g, then, :math:`w_i` and :math:`V_{i, r}` 317 | will be distributed according to 318 | :math:`\mathcal{N}(\mu_w[g], \lambda_w[g])` and :math:`\mathcal{N}(\mu_V[g, r], \lambda_V[g,r])`, 319 | respectively. 320 | If `None`, all the columns of X are assumed to belong to a single group, 0. 321 | 322 | group_shapes: Integer array, optional (default = None) 323 | If not `None`, this specifies each variable group's size. 324 | Ignored if grouping is not None. 325 | For example, if ``group_shapes = [n_1, n_2]``, 326 | this is equivalent to ``grouping = [0] * n_1 + [1] * n_2`` 327 | 328 | callback: function(int, fm, hyper) -> bool, optional(default = None) 329 | Called at the every end of each Gibbs iteration. 330 | """ 331 | self._fit( 332 | X, 333 | y, 334 | X_rel=X_rel, 335 | X_test=X_test, 336 | X_rel_test=X_rel_test, 337 | y_test=y_test, 338 | n_iter=n_iter, 339 | grouping=grouping, 340 | callback=callback, 341 | group_shapes=group_shapes, 342 | config_builder=config_builder, 343 | ) 344 | return self 345 | 346 | def predict( 347 | self, X: Optional[ArrayLike], X_rel: List[RelationBlock] = [] 348 | ) -> np.ndarray: 349 | r"""Based on the class probability, return binary classified outcome based on threshold = 0.5. 
346 |     def predict(
347 |         self, X: Optional[ArrayLike], X_rel: List[RelationBlock] = []
348 |     ) -> np.ndarray:
349 |         r"""Returns the binary outcome obtained by thresholding the class probability at 0.5.
350 |         If you want the class probability instead, use the `predict_proba` method.
351 |
352 |         Parameters
353 |         ----------
354 |         X : Optional[ArrayLike]
355 |             Main table. When None, treated as a matrix without columns.
356 |         X_rel : List[RelationBlock], optional
357 |             Relations, by default []
358 |
359 |         Returns
360 |         -------
361 |         np.ndarray
362 |             0/1 predictions based on the probability.
363 |         """
364 |         return self.predict_proba(X, X_rel) > 0.5
365 |
366 |     def predict_proba(
367 |         self, X: Optional[ArrayLike], X_rel: List[RelationBlock] = []
368 |     ) -> np.ndarray:
369 |         r"""Compute the probability that the outcome will be 1 based on the variational mean.
370 |
371 |         Parameters
372 |         ----------
373 |         X : Optional[ArrayLike]
374 |             Main table. When None, treated as a matrix without columns.
375 |         X_rel : List[RelationBlock], optional
376 |             Relations, by default []
377 |
378 |         Returns
379 |         -------
380 |         np.ndarray
381 |             The probability that the outcome is 1.
382 |         """
383 |         return self._predict_core(X, X_rel)
384 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tohtsky/myFM/b9ba70ea38d9370d3ad50a9d25b2ff825eaa30ef/tests/__init__.py
--------------------------------------------------------------------------------
/tests/classification/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tohtsky/myFM/b9ba70ea38d9370d3ad50a9d25b2ff825eaa30ef/tests/classification/__init__.py
--------------------------------------------------------------------------------
/tests/classification/test_classification.py:
--------------------------------------------------------------------------------
1 | from typing import Tuple
2 |
3 | import numpy as np
4 | import pytest
5 | from scipy import sparse as sps
6 |
7 | from myfm import MyFMGibbsClassifier, VariationalFMClassifier
8 | from myfm.utils.callbacks import ClassificationCallback
9 |
10 | from ..test_utils import FMWeights
11 |
12 |
13 | @pytest.mark.parametrize("use_libfm_callback", [True, False])
14 | def test_middle_clf(
15 |     middle_data: Tuple[sps.csr_matrix, np.ndarray],
16 |     stub_weight: FMWeights,
17 |     use_libfm_callback: bool,
18 | ) -> None:
19 |     rns = np.random.RandomState(0)
20 |     X, score = middle_data
21 |     score_noised = score + rns.normal(0, 1, size=score.shape)
22 |     score_noised -= score_noised.mean()
23 |     y = score_noised > 0
24 |     if use_libfm_callback:
25 |         callback = ClassificationCallback(200, X, y)
26 |     else:
27 |         callback = None
28 |
29 |     fm = MyFMGibbsClassifier(3).fit(
30 |         X, y, X_test=X, y_test=y, n_iter=200, n_kept_samples=200, callback=callback
31 |     )
32 |     if use_libfm_callback:
33 |         np.testing.assert_allclose(fm.predict_proba(X), callback.predictions / 200)
34 |
35 |     vfm_before_fit = VariationalFMClassifier(3)
36 |     assert vfm_before_fit.w0_mean is None
37 |     assert vfm_before_fit.w0_var is None
38 |     assert vfm_before_fit.w_mean is None
39 |     assert vfm_before_fit.w_var is None
40 |     assert vfm_before_fit.V_mean is None
41 |     assert vfm_before_fit.V_var is None
42 |
43 |     vfm = vfm_before_fit.fit(
44 |         X, y, X_test=X, y_test=y, n_iter=200  # , n_kept_samples=50
45 |     )
46 |
47 |     assert vfm.w0_mean is not None
48 |     assert vfm.w0_var is not None
49 |     assert vfm.w_mean is not None
50 |     assert vfm.w_var is not None
51 |     assert vfm.V_mean is not None
52 |     assert vfm.V_var is not None
53 |
54 |     assert fm.predictor_ is not None
55 |
56 |
last_samples = fm.predictor_.samples[-20:] 57 | 58 | for i in range(3): 59 | for j in range(i + 1, 3): 60 | cross_term = stub_weight.factors[:, i].dot(stub_weight.factors[:, j]) 61 | if abs(cross_term) < 0.5: 62 | continue 63 | sign = cross_term / abs(cross_term) 64 | assert vfm.V_mean[i].dot(vfm.V_mean[j]) > sign * cross_term * 0.8 65 | assert vfm.V_mean[i].dot(vfm.V_mean[j]) < sign * cross_term * 1.2 66 | 67 | for s in last_samples: 68 | sample_cross_term = s.V[i].dot(s.V[j]) 69 | assert sample_cross_term > sign * cross_term * 0.5 70 | assert sample_cross_term < sign * cross_term * 2 71 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple 2 | 3 | import numpy as np 4 | import pytest 5 | import scipy.sparse as sps 6 | 7 | from .test_utils import FMWeights, prediction 8 | 9 | N_FEATURES = 3 10 | N_LATENT = 4 11 | 12 | 13 | @pytest.fixture 14 | def stub_weight() -> FMWeights: 15 | weights = FMWeights( 16 | -3.0, 17 | np.asfarray([1.0, 2.0, -1.0]), 18 | np.asfarray( 19 | [[1.0, -1.0, 0], [0.0, 1.0, 1.0], [1.0, 1.0, 1.0], [-1.0, 0, -1.0]] 20 | ), 21 | ) 22 | return weights 23 | 24 | 25 | def create_data( 26 | n_train: int, stub_weight: FMWeights 27 | ) -> Tuple[sps.csr_matrix, np.ndarray]: 28 | rns = np.random.RandomState(0) 29 | rows: List[int] = [] 30 | cols: List[int] = [] 31 | data: List[float] = [] 32 | for row in range(n_train): 33 | indices = np.where(rns.random(N_FEATURES) > 0.5)[0] 34 | for ind in indices: 35 | rows.append(row) 36 | cols.append(ind) 37 | data.append(float(rns.choice([-2, -1, 1, 2]))) 38 | X = sps.csr_matrix((data, (rows, cols))) 39 | p = prediction(X, weight=stub_weight) 40 | return X, p 41 | 42 | 43 | @pytest.fixture 44 | def middle_data(stub_weight: FMWeights) -> Tuple[sps.csr_matrix, np.ndarray]: 45 | return create_data(1000, stub_weight) 46 | -------------------------------------------------------------------------------- /tests/dataset/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tohtsky/myFM/b9ba70ea38d9370d3ad50a9d25b2ff825eaa30ef/tests/dataset/__init__.py -------------------------------------------------------------------------------- /tests/dataset/test_ml100k.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from pytest_mock import MockerFixture 4 | 5 | from myfm.utils.benchmark_data import MovieLens100kDataManager 6 | 7 | 8 | def test_ml100k(mocker: MockerFixture) -> None: 9 | mocker.patch("builtins.input", return_value="y") 10 | dm = MovieLens100kDataManager() 11 | unique_key_pair = ["user_id", "movie_id"] 12 | 13 | df_all_recovered = dm.load_rating_all().sort_values(unique_key_pair) 14 | 15 | user_infos = dm.load_user_info() 16 | assert np.all(df_all_recovered["user_id"].isin(user_infos["user_id"])) 17 | assert np.all(user_infos["age"] >= 0) 18 | assert np.all(user_infos["gender"].isin(["M", "F"]).values) 19 | 20 | movie_infos = dm.load_movie_info() 21 | genres = dm.genres() 22 | for genre_concat in movie_infos["genres"]: 23 | for genre in genre_concat.split("|"): 24 | assert genre in genres 25 | 26 | for k in [2, 3]: 27 | df_train, df_test = dm.load_rating_predefined_split(k) 28 | df_reconcat = pd.concat([df_train, df_test]).sort_values(unique_key_pair) 29 | for key in ["user_id", "movie_id", "timestamp"]: 30 | assert 
np.all(df_all_recovered[key].values == df_reconcat[key].values) 31 | 32 | N_manual_fold = 7 33 | df_tests = [] 34 | for i in range(N_manual_fold): 35 | df_tr, df_te = dm.load_rating_kfold_split(N_manual_fold, i) 36 | assert ( 37 | pd.concat([df_tr, df_te]).drop_duplicates(unique_key_pair).shape[0] 38 | == df_all_recovered.shape[0] 39 | ) 40 | assert df_tr.shape[0] + df_te.shape[0] == df_all_recovered.shape[0] 41 | test_size = df_all_recovered.shape[0] // N_manual_fold 42 | assert df_te.shape[0] in {test_size, test_size + 1} 43 | df_tests.append(df_te) 44 | df_tests_concat = pd.concat(df_tests) 45 | assert df_tests_concat.shape[0] == df_all_recovered.shape[0] 46 | assert ( 47 | df_tests_concat.drop_duplicates(unique_key_pair).shape[0] 48 | == df_all_recovered.shape[0] 49 | ) 50 | -------------------------------------------------------------------------------- /tests/dataset/test_ml1m.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from pathlib import Path 3 | from tempfile import TemporaryDirectory 4 | from zipfile import ZipFile 5 | 6 | import numpy as np 7 | import pytest 8 | from pytest_mock import MockerFixture 9 | 10 | from myfm.utils.benchmark_data import MovieLens1MDataManager 11 | from myfm.utils.dummy_data import gen_dummy_rating_df 12 | 13 | 14 | def test_ml1m(mocker: MockerFixture) -> None: 15 | if sys.platform == "win32": 16 | pytest.skip("Skip on Windows.") 17 | dummy_df = gen_dummy_rating_df(user_colname="user_id", item_colname="movie_id") 18 | dummy_df["timestamp"] = (dummy_df["timestamp"].view(np.int64) / 1e9).astype( 19 | np.int64 20 | ) 21 | with TemporaryDirectory() as temp_dir: 22 | target = Path(temp_dir) / "ml1m.zip" 23 | mocker.patch("builtins.input", return_value="NOO") 24 | with pytest.raises(RuntimeError): 25 | dm = MovieLens1MDataManager(target) 26 | df_stringified = "\n".join( 27 | [ 28 | "::".join([str(v) for v in row]) 29 | for row in dummy_df[ 30 | ["user_id", "movie_id", "rating", "timestamp"] 31 | ].values 32 | ] 33 | ) 34 | with ZipFile(target, "w") as zf: 35 | zf.writestr("ml-1m/ratings.dat", df_stringified) 36 | dm = MovieLens1MDataManager(target) 37 | unique_key_pair = ["user_id", "movie_id", "rating"] 38 | df_all_recovered = dm.load_rating_all() 39 | for key in unique_key_pair: 40 | assert np.all(df_all_recovered[key] == dummy_df[key]) 41 | -------------------------------------------------------------------------------- /tests/oprobit/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tohtsky/myFM/b9ba70ea38d9370d3ad50a9d25b2ff825eaa30ef/tests/oprobit/__init__.py -------------------------------------------------------------------------------- /tests/oprobit/test_oprobit_1dim.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | 4 | from myfm import MyFMOrderedProbit 5 | from myfm.base import std_cdf 6 | from myfm.utils.callbacks import OrderedProbitCallback 7 | 8 | 9 | @pytest.mark.parametrize("use_libfm_callback", [True, False]) 10 | def test_oprobit(use_libfm_callback: bool) -> None: 11 | N_train = 1000 12 | cps = np.asfarray([0.0, 0.5, 1.5]) 13 | rns = np.random.RandomState(0) 14 | X = rns.normal(0, 2, size=N_train) 15 | coeff = 0.5 16 | y = np.zeros(N_train, dtype=np.float64) 17 | score = X * coeff + rns.randn(N_train) 18 | for cp_value in cps: 19 | y += (score > cp_value).astype(np.int64) 20 | if use_libfm_callback: 21 | callback = 
OrderedProbitCallback(100, X_test=X[:, None], y_test=y, n_class=4)
22 |     else:
23 |         callback = None
24 |     fm = MyFMOrderedProbit(0, fit_w0=False)
25 |     fm.fit(
26 |         X[:, None],
27 |         y,
28 |         callback=callback,
29 |         n_iter=100,
30 |         n_kept_samples=100,
31 |     )
32 |
33 |     assert fm.predictor_ is not None
34 |     for cutpoint_sample in fm.cutpoint_samples[-10:]:
35 |         cp_1, cp_2, cp_3 = cutpoint_sample
36 |         assert abs(cp_1) < 0.25
37 |         assert abs(cp_2 - cp_1 - 0.5) < 0.25
38 |         assert abs(cp_3 - cp_1 - 1.5) < 0.25
39 |
40 |     p_using_core = fm.predict_proba(X[:, None])
41 |
42 |     if use_libfm_callback:
43 |         assert callback is not None
44 |         np.testing.assert_allclose(callback.predictions / 100, p_using_core)
45 |     result_manual = np.zeros((X.shape[0], 4))
46 |
47 |     n_ = 0
48 |     for sample in fm.predictor_.samples:
49 |         n_ += 1
50 |         score = sample.predict_score(X[:, None], [])
51 |         cdf = std_cdf((sample.cutpoints[0][np.newaxis, :] - score[:, np.newaxis]))
52 |         diff = np.hstack(
53 |             [
54 |                 np.zeros((score.shape[0], 1)),
55 |                 cdf,
56 |                 np.ones((score.shape[0], 1)),
57 |             ]
58 |         )
59 |         result_manual += diff[:, 1:] - diff[:, :-1]
60 |     result_manual /= n_
61 |     np.testing.assert_allclose(result_manual, p_using_core)
62 |
--------------------------------------------------------------------------------
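Editor's note (not part of the repository): the manual loop at the end of the
test above reconstructs what `predict_proba` averages over the kept samples.
For cutpoints c_1 < ... < c_K and a per-sample score s(x), each sample
contributes

    P(y = k | x) = \Phi(c_{k+1} - s(x)) - \Phi(c_k - s(x)),   with c_0 = -\infty, c_{K+1} = +\infty,

where \Phi is the standard normal CDF (`std_cdf`); the `np.hstack` of a zero
column, the CDF values, and a one column implements exactly these boundary
conventions before the adjacent differences are taken.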
/tests/regression/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tohtsky/myFM/b9ba70ea38d9370d3ad50a9d25b2ff825eaa30ef/tests/regression/__init__.py
--------------------------------------------------------------------------------
/tests/regression/test_block.py:
--------------------------------------------------------------------------------
1 | import pickle
2 | import tempfile
3 |
4 | import numpy as np
5 | from scipy import sparse as sps
6 |
7 | from myfm import MyFMRegressor, RelationBlock, VariationalFMRegressor
8 |
9 |
10 | def test_block_vfm() -> None:
11 |     N_train = 1000
12 |     rns = np.random.RandomState(1)
13 |     user_block = sps.csr_matrix(
14 |         [[1, 0, 1], [0, 1, 1], [1, 1, 0]],
15 |         dtype=np.float64,
16 |     )
17 |     user_indices = rns.randint(0, user_block.shape[0], size=N_train)
18 |     item_block = sps.csr_matrix(
19 |         [
20 |             [1, 0, 0, 1],
21 |             [0, 1, 1, 0],
22 |         ],
23 |         dtype=np.float64,
24 |     )
25 |     item_indices = rns.randint(0, item_block.shape[0], size=N_train)
26 |     tm_column = rns.randn(N_train, 1)
27 |
28 |     X_flatten = sps.hstack(
29 |         [tm_column, user_block[user_indices], item_block[item_indices]]
30 |     )
31 |     X_flatten_squared = X_flatten.copy()
32 |     X_flatten_squared.data = X_flatten_squared.data**2
33 |     factor = rns.randn(X_flatten.shape[1], 3)
34 |     f2 = (factor**2).sum(axis=1)
35 |     Xf = X_flatten.dot(factor)
36 |
37 |     gb = 3.0
38 |     linear_weights = rns.randn(X_flatten.shape[1])
39 |     y = (
40 |         gb
41 |         + X_flatten.dot(linear_weights)
42 |         + 0.5 * ((Xf**2).sum(axis=1) - X_flatten_squared.dot(f2))
43 |         + rns.normal(1.0, size=X_flatten.shape[0])
44 |     )
45 |
46 |     blocks = [
47 |         RelationBlock(user_indices, user_block),
48 |         RelationBlock(item_indices, item_block),
49 |     ]
50 |     with tempfile.TemporaryFile() as temp_fs:
51 |         pickle.dump(blocks, temp_fs)
52 |         del blocks
53 |         temp_fs.seek(0)
54 |         blocks = pickle.load(temp_fs)
55 |     fm_flatten = VariationalFMRegressor(3).fit(
56 |         X_flatten,
57 |         y,
58 |         n_iter=100,
59 |     )
60 |     fm_blocked_serialized = VariationalFMRegressor(3).fit(
61 |         tm_column,
62 |         y,
63 |         blocks,
64 |         n_iter=100,
65 |     )
66 |
67 |     with tempfile.TemporaryFile() as temp_fs:
68 |         pickle.dump(fm_blocked_serialized, temp_fs)
69 |         del fm_blocked_serialized
70 |         temp_fs.seek(0)
71 |         fm_blocked: VariationalFMRegressor = pickle.load(temp_fs)
72 |
73 |     np.testing.assert_allclose(fm_flatten.w_mean, fm_blocked.w_mean)
74 |     np.testing.assert_allclose(fm_flatten.V_mean, fm_blocked.V_mean)
75 |     prediction_flatten = fm_flatten.predict(tm_column, blocks)
76 |     prediction_blocked = fm_blocked.predict(X_flatten)
77 |     np.testing.assert_allclose(prediction_flatten, prediction_blocked)
78 |
79 |
80 | def test_block() -> None:
81 |     rns = np.random.RandomState(0)
82 |     N_train = 100
83 |     user_block = sps.csr_matrix(
84 |         [
85 |             [1, 0, 0],
86 |             [0, 1, 0],
87 |             [0, 0, 1],
88 |         ],
89 |         dtype=np.float64,
90 |     )
91 |     user_indices = rns.randint(0, user_block.shape[0], size=N_train)
92 |     item_block = sps.csr_matrix(
93 |         [
94 |             [1, 0],
95 |             [0, 1],
96 |         ],
97 |         dtype=np.float64,
98 |     )
99 |
100 |     group_shapes = [1, user_block.shape[1], item_block.shape[1]]
101 |     item_indices = rns.randint(0, item_block.shape[0], size=N_train)
102 |     tm_column = rns.randn(N_train, 1)
103 |
104 |     X_flatten = sps.hstack(
105 |         [tm_column, user_block[user_indices], item_block[item_indices]]
106 |     )
107 |     X_flatten_squared = X_flatten.copy()
108 |     X_flatten_squared.data = X_flatten_squared.data**2
109 |
110 |     weights = rns.randn(3, X_flatten.shape[1])
111 |     Xw = X_flatten.dot(weights.T)
112 |     X2w2 = X_flatten_squared.dot((weights**2).sum(axis=0))
113 |     y = 0.5 * ((Xw**2).sum(axis=1) - X2w2) + rns.randn(N_train)
114 |
115 |     blocks = [
116 |         RelationBlock(user_indices, user_block),
117 |         RelationBlock(item_indices, item_block),
118 |     ]
119 |     fm_flatten = MyFMRegressor(2, fit_w0=False).fit(
120 |         X_flatten,
121 |         y,
122 |         group_shapes=group_shapes,
123 |         n_iter=30,
124 |         n_kept_samples=30,
125 |     )
126 |     fm_blocked = MyFMRegressor(2, fit_w0=False).fit(
127 |         tm_column,
128 |         y,
129 |         blocks,
130 |         group_shapes=group_shapes,
131 |         n_iter=30,
132 |         n_kept_samples=30,
133 |     )
134 |     assert fm_flatten.predictor_ is not None
135 |     assert fm_blocked.predictor_ is not None
136 |     for s_flatten, s_blocked in zip(
137 |         fm_flatten.predictor_.samples, fm_blocked.predictor_.samples
138 |     ):
139 |         np.testing.assert_allclose(s_flatten.V, s_blocked.V)
140 |
141 |     with tempfile.TemporaryFile() as temp_fs:
142 |         pickle.dump(fm_blocked, temp_fs)
143 |         del fm_blocked
144 |         temp_fs.seek(0)
145 |         fm_blocked = pickle.load(temp_fs)
146 |
147 |     prediction_flatten = fm_flatten.predict(tm_column, blocks, n_workers=2)
148 |     prediction_blocked = fm_blocked.predict(X_flatten, n_workers=None)
149 |     np.testing.assert_allclose(prediction_flatten, prediction_blocked)
150 |
--------------------------------------------------------------------------------
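Editor's note (not part of the repository): both tests above exercise the same
equivalence. A `RelationBlock(indices, block)` passed alongside a main table is
the same model as materializing the rows `block[indices]` and stacking them
horizontally onto the main table, exactly as the tests build it:

    X_flatten = sps.hstack([tm_column, user_block[user_indices], item_block[item_indices]])

so the blocked and flattened fits must agree up to floating-point noise, which
is what the `assert_allclose` calls on the weights and predictions verify.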
/tests/regression/test_fit.py:
--------------------------------------------------------------------------------
1 | from typing import Optional, Tuple
2 |
3 | import numpy as np
4 | import pytest
5 | from scipy import sparse as sps
6 |
7 | from myfm import MyFMGibbsRegressor, VariationalFMRegressor
8 | from myfm.base import DenseArray
9 | from myfm.utils.callbacks import RegressionCallback
10 |
11 | from ..test_utils import FMWeights
12 |
13 |
14 | def assert_unwrap(x: Optional[DenseArray]) -> DenseArray:
15 |     assert x is not None
16 |     return x
17 |
18 |
19 | @pytest.mark.parametrize("alpha_inv", [0.3, 1.0, 3])
20 | def test_middle_reg(
21 |     alpha_inv: float,
22 |     middle_data: Tuple[sps.csr_matrix, np.ndarray],
23 |     stub_weight: FMWeights,
24 | ) -> None:
25 |     rns = np.random.RandomState(0)
26 |     X, score = middle_data
27 |     y = score + alpha_inv * rns.normal(0, 1, size=score.shape)
28 |
29 |     callback = RegressionCallback(100, X_test=X, y_test=y)
30 |
31 |     fm_init = MyFMGibbsRegressor(3)
32 |     assert fm_init.w0_samples is None
33 |     assert fm_init.w_samples is None
34 |     assert fm_init.V_samples is None
35 |     fm = fm_init.fit(
36 |         X, y, X_test=X, y_test=y, n_iter=100, n_kept_samples=100, callback=callback
37 |     )
38 |
39 |     np.testing.assert_allclose(fm.predict(X), callback.predictions / 100)
40 |     vfm = VariationalFMRegressor(3).fit(X, y, X_test=X, y_test=y, n_iter=50)
41 |     vfm_weights = vfm.predictor_.weights()
42 |     hp_trace = fm.get_hyper_trace()
43 |     last_alphas = hp_trace["alpha"].iloc[-20:].values
44 |     assert np.all(last_alphas > ((1 / alpha_inv**2) / 2))
45 |     assert np.all(last_alphas < ((1 / alpha_inv**2) * 2))
46 |
47 |     last_w0_samples = assert_unwrap(fm.w0_samples)[-20:]
48 |     assert np.all(last_w0_samples < (stub_weight.global_bias + 0.5))
49 |     assert np.all(last_w0_samples > (stub_weight.global_bias - 0.5))
50 |
51 |     last_w_samples = assert_unwrap(fm.w_samples)[-20:]
52 |
53 |     for w_ in last_w_samples:
54 |         assert np.all(w_ < (stub_weight.weight + 1.0))
55 |         assert np.all(w_ > (stub_weight.weight - 1.0))
56 |
57 |     last_V_samples = assert_unwrap(fm.V_samples)[-20:]
58 |
59 |     for i in range(3):
60 |         for j in range(i + 1, 3):
61 |             cross_term = stub_weight.factors[:, i].dot(stub_weight.factors[:, j])
62 |             if abs(cross_term) < 0.1:
63 |                 continue
64 |             sign = cross_term / abs(cross_term)
65 |             vfm_cross_term = vfm_weights.V[i].dot(vfm_weights.V[j])
66 |             assert vfm_cross_term > sign * cross_term * 0.8
67 |             assert vfm_cross_term < sign * cross_term * 1.25
68 |
69 |             for V_ in last_V_samples:
70 |                 sample_cross_term = V_[i].dot(V_[j])
71 |                 assert sample_cross_term > sign * cross_term * 0.5
72 |                 assert sample_cross_term < sign * cross_term * 2
73 |
--------------------------------------------------------------------------------
/tests/test_utils.py:
--------------------------------------------------------------------------------
1 | from typing import NamedTuple
2 |
3 | import numpy as np
4 | import scipy.sparse as sps
5 |
6 | N_FEATURES = 3
7 | N_LATENT = 4
8 |
9 |
10 | class FMWeights(NamedTuple):
11 |     global_bias: float
12 |     weight: np.ndarray
13 |     factors: np.ndarray
14 |
15 |
16 | def prediction(X: sps.csr_matrix, weight: FMWeights) -> np.ndarray:
17 |     X2 = X.copy()
18 |     X2.data[:] = X2.data**2
19 |     result = np.zeros(X.shape[0], dtype=np.float64)
20 |     result[:] = weight.global_bias
21 |     result += X.dot(weight.weight)
22 |     w2 = (weight.factors**2).sum(axis=0)
23 |     Xw = X.dot(weight.factors.T)
24 |     result += ((Xw**2).sum(axis=1) - (X2.dot(w2))) * 0.5
25 |     return result
26 |
--------------------------------------------------------------------------------
/tests/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tohtsky/myFM/b9ba70ea38d9370d3ad50a9d25b2ff825eaa30ef/tests/utils/__init__.py
--------------------------------------------------------------------------------
/tests/utils/test_binning.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | from myfm.utils.encoders import BinningEncoder
4 |
5 |
6 | def test_binning_dense() -> None:
7 |     rns = np.random.RandomState(0)
8 |     v = rns.randn(1000)
9 |     v[0] = np.nan
10 |     enc = BinningEncoder(v)
11 |     assert enc.percentiles.shape[0] == 10
12 |     X = enc.to_sparse(v)
13 |     assert np.all(X.sum(axis=1).A1 == 1.0)
14 |     assert X.shape[1] == 12
15 |     assert X[0, 0]
== 1.0 16 | for j in np.where(v[1:] <= enc.percentiles[0])[0]: 17 | assert X[j + 1, 1] == 1.0 18 | for k in range(1, 10): 19 | for j in np.where( 20 | (v[1:] <= enc.percentiles[k]) & (v[1:] > enc.percentiles[k - 1]) 21 | )[0]: 22 | assert X[j + 1, k + 1] == 1.0 23 | for j in np.where(v[1:] > enc.percentiles[-1])[0]: 24 | assert X[j + 1, 11] == 1.0 25 | 26 | 27 | def test_binning_sparse() -> None: 28 | rns = np.random.RandomState(0) 29 | v = rns.poisson(2, size=1000) 30 | enc = BinningEncoder(v) 31 | X = enc.to_sparse(v) 32 | assert np.all(X.sum(axis=1).A1 == 1.0) 33 | assert X.shape[1] == len(enc) 34 | for j in np.where(v == 0)[0]: 35 | assert X[j, 1] == 1.0 36 | 37 | for j in np.where(v == 1.0)[0]: 38 | assert X[j, 2] == 1.0 39 | 40 | for k in range(2, len(enc.percentiles)): 41 | for j in np.where((v <= enc.percentiles[k]) & (v > enc.percentiles[k - 1]))[0]: 42 | assert X[j, k + 1] == 1.0 43 | 44 | for j in np.where(v > enc.percentiles[-1])[0]: 45 | assert X[j, len(enc.percentiles) + 1] == 1.0 46 | -------------------------------------------------------------------------------- /tests/utils/test_categorical.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | 4 | from myfm.utils.encoders import CategoryValueToSparseEncoder 5 | 6 | TEST_ITEMS = [ 7 | "item1", 8 | "item2", 9 | "item3", 10 | "item1", 11 | "item2", 12 | "item3", 13 | "item1", 14 | "item2", 15 | ] 16 | 17 | 18 | def test_categorical_encs_create() -> None: 19 | enc = CategoryValueToSparseEncoder(TEST_ITEMS, handle_unknown="create") 20 | X = enc.to_sparse(["item4", "item1", "item2", "item3"]) 21 | for i in range(4): 22 | for j in range(len(enc)): 23 | if i == j: 24 | assert X[i, j] == 1 25 | else: 26 | assert X[i, j] == 0 27 | 28 | enc_cutoff = CategoryValueToSparseEncoder( 29 | TEST_ITEMS, handle_unknown="create", min_freq=3 30 | ) 31 | assert len(enc_cutoff) == 3 32 | X_cutoffed = enc_cutoff.to_sparse(["item4", "item1", "item2", "item3"]) 33 | np.testing.assert_allclose( 34 | X_cutoffed.toarray(), np.asfarray([[1, 0, 0], [0, 1, 0], [0, 0, 1], [1, 0, 0]]) 35 | ) 36 | 37 | 38 | def test_categorical_encs_ignore() -> None: 39 | enc = CategoryValueToSparseEncoder(TEST_ITEMS, handle_unknown="ignore") 40 | X = enc.to_sparse(["item4", "item1", "item2", "item3"]) 41 | np.testing.assert_allclose( 42 | X.toarray(), np.asfarray([[0, 0, 0], [1, 0, 0], [0, 1, 0], [0, 0, 1]]) 43 | ) 44 | enc_cutoff = CategoryValueToSparseEncoder( 45 | TEST_ITEMS, handle_unknown="ignore", min_freq=3 46 | ) 47 | X = enc_cutoff.to_sparse(["item4", "item1", "item2", "item3"]) 48 | np.testing.assert_allclose( 49 | X.toarray(), np.asfarray([[0, 0], [1, 0], [0, 1], [0, 0]]) 50 | ) 51 | 52 | 53 | def test_categorical_encs_raise() -> None: 54 | enc = CategoryValueToSparseEncoder(TEST_ITEMS, handle_unknown="raise") 55 | with pytest.raises(KeyError): 56 | X = enc.to_sparse(["item4", "item1", "item2", "item3"]) 57 | X = enc.to_sparse(["item1", "item2", "item3"]) 58 | 59 | np.testing.assert_allclose( 60 | X.toarray(), np.asfarray([[1, 0, 0], [0, 1, 0], [0, 0, 1]]) 61 | ) 62 | -------------------------------------------------------------------------------- /tests/utils/test_dataframe_encoder.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | from myfm.utils.encoders import ( 5 | BinningEncoder, 6 | DataFrameEncoder, 7 | MultipleValuesToSparseEncoder, 8 | ) 9 | from myfm.utils.encoders.categorical import 
CategoryValueToSparseEncoder 10 | 11 | 12 | def test_dfe() -> None: 13 | rns = np.random.RandomState(0) 14 | N = 1000 15 | categories = [1, 2, 3] 16 | 17 | multi_categories = ["i1", "i2", "i3", "i4"] 18 | multi_values = [] 19 | cnts = [] 20 | for _ in range(N): 21 | n = rns.randint(0, len(multi_categories) + 1) 22 | cnts.append(n) 23 | v = ",".join(rns.choice(multi_categories, size=n, replace=False)) 24 | multi_values.append(v) 25 | df = pd.DataFrame( 26 | dict( 27 | numerical_value=rns.randn(N), 28 | categorical_value=rns.choice(categories, size=N, replace=True), 29 | multi_values=multi_values, 30 | ) 31 | ) 32 | dfe = DataFrameEncoder().add_column( 33 | "numerical_value", BinningEncoder(df.numerical_value) 34 | ) 35 | assert np.all(dfe.encode_df(df).sum(axis=1).A1 == 1.0) 36 | dfe.add_column( 37 | "categorical_value", CategoryValueToSparseEncoder(df.categorical_value) 38 | ) 39 | assert np.all(dfe.encode_df(df).sum(axis=1).A1 == 2.0) 40 | dfe.add_column( 41 | "multi_values", MultipleValuesToSparseEncoder(df.multi_values, normalize=False) 42 | ) 43 | for nnz, cnt in zip(dfe.encode_df(df).sum(axis=1).A1, cnts): 44 | assert nnz == cnt + 2 45 | cursor = 0 46 | names = dfe.all_names() 47 | for s, name_prefix in zip( 48 | dfe.encoder_shapes, ["numerical_value", "categorical_value", "multi_values"] 49 | ): 50 | for X_col_name in names[cursor : cursor + s]: 51 | assert X_col_name.startswith(name_prefix) 52 | cursor += s 53 | -------------------------------------------------------------------------------- /tests/utils/test_multivalue.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from myfm.utils.encoders import MultipleValuesToSparseEncoder 4 | 5 | TEST_ITEMS = [ 6 | "item1", 7 | "item2, item1", 8 | "item3, item2", 9 | "item2", 10 | "item3, item1", 11 | ] 12 | 13 | 14 | def test_categorical_encs_create() -> None: 15 | enc = MultipleValuesToSparseEncoder(TEST_ITEMS, handle_unknown="create") 16 | X = enc.to_sparse(["item4,item1", "item1,item2,item3", "item2", "item3"]) 17 | nnz_rows = (X.toarray() > 0).astype(np.int32).sum(axis=1) 18 | np.testing.assert_allclose(nnz_rows, np.asarray([2, 3, 1, 1])) 19 | --------------------------------------------------------------------------------
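Editor's note (not part of the repository): a hedged reading of the encoder
behavior exercised by the tests above. With ``handle_unknown="create"``,
column 0 of the encoded matrix appears to be reserved for unseen values and
for values dropped by ``min_freq``; the snippet below restates that
observation and is illustrative only:

    enc = CategoryValueToSparseEncoder(["a", "a", "b"], handle_unknown="create")
    enc.to_sparse(["c"]).toarray()  # assumed: [[1, 0, 0]] -- "c" lands in the unknown column

`MultipleValuesToSparseEncoder` applies the same mapping per comma-separated
token, which is why the per-row nonzero counts in the last test come out as
2, 3, 1, 1.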