├── .github └── workflows │ ├── doctest.yml │ ├── pre-commit.yml │ ├── run-test.yml │ └── wheels.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .readthedocs.yml ├── CMakeLists.txt ├── LICENSE ├── MANIFEST.in ├── README.md ├── codecov.yml ├── cpp_source ├── Faddeeva.cc ├── bind.cpp ├── bind_float.cpp └── declare_module.hpp ├── doc ├── Makefile ├── requirements.txt └── source │ ├── api_reference.rst │ ├── conf.py │ ├── index.rst │ ├── movielens.rst │ ├── ordinal-regression.rst │ ├── quickstart.rst │ ├── rating_vs_cps.png │ └── relation-blocks.rst ├── doc_autobuild.sh ├── examples ├── ml-100k-extended.ipynb ├── ml-100k-regression.py ├── ml-100k-variational.py ├── ml-100k.ipynb ├── ml-10m-regression.py ├── ml-1m-extended.ipynb ├── ml-1m-regression.py ├── oprobit_example.py └── toy.py ├── include ├── Faddeeva │ └── Faddeeva.hh └── myfm │ ├── BaseFMTrainer.hpp │ ├── FM.hpp │ ├── FMLearningConfig.hpp │ ├── FMTrainer.hpp │ ├── HyperParams.hpp │ ├── LearningHistory.hpp │ ├── OProbitSampler.hpp │ ├── definitions.hpp │ ├── predictor.hpp │ ├── util.hpp │ └── variational.hpp ├── mypy.ini ├── pyproject.toml ├── setup.py ├── src └── myfm │ ├── __init__.py │ ├── _myfm.pyi │ ├── base.py │ ├── gibbs.py │ ├── utils │ ├── __init__.py │ ├── benchmark_data │ │ ├── __init__.py │ │ ├── loader_base.py │ │ ├── movielens100k_data.py │ │ ├── movielens10M_data.py │ │ └── movielens1M_data.py │ ├── callbacks │ │ ├── __init__.py │ │ └── libfm.py │ ├── dummy_data.py │ └── encoders │ │ ├── __init__.py │ │ ├── base.py │ │ ├── binning.py │ │ ├── categorical.py │ │ └── multi_value.py │ └── variational.py └── tests ├── __init__.py ├── classification ├── __init__.py └── test_classification.py ├── conftest.py ├── dataset ├── __init__.py ├── test_ml100k.py └── test_ml1m.py ├── oprobit ├── __init__.py └── test_oprobit_1dim.py ├── regression ├── __init__.py ├── test_block.py └── test_fit.py ├── test_utils.py └── utils ├── __init__.py ├── test_binning.py ├── test_categorical.py ├── test_dataframe_encoder.py └── test_multivalue.py /.github/workflows/doctest.yml: -------------------------------------------------------------------------------- 1 | name: Doctest 2 | on: [push] 3 | jobs: 4 | test_readme_and_sphinx_docs: 5 | runs-on: ubuntu-latest 6 | env: 7 | OS: ubuntu-latest 8 | steps: 9 | - uses: actions/checkout@v3 10 | with: 11 | fetch-depth: 0 12 | - name: Setup Python 13 | uses: actions/setup-python@v3 14 | with: 15 | python-version: "3.11" 16 | - name: Build myfm 17 | run: | 18 | pip install --upgrade pip 19 | pip install numpy scipy pandas scikit-learn 20 | pip install . 
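# Pre-fetch the MovieLens 100k archive into what appears to be the data manager's cache path, so the doctest runs below don't have to download it themselves.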
21 | curl http://files.grouplens.org/datasets/movielens/ml-100k.zip -o ~/.ml-100k.zip 22 | - name: Run pytest 23 | run: | 24 | pip install pytest phmdoctest sphinx==4.4.0 sphinx_rtd_theme 25 | - name: Test Readme.md 26 | run: | 27 | GEN_TEST_FILE=phmdoctest_out.py 28 | phmdoctest README.md --outfile "$GEN_TEST_FILE" 29 | pytest "$GEN_TEST_FILE" 30 | rm "$GEN_TEST_FILE" 31 | - name: Run sphinx doctest 32 | run: | 33 | cd doc 34 | make doctest 35 | -------------------------------------------------------------------------------- /.github/workflows/pre-commit.yml: -------------------------------------------------------------------------------- 1 | name: pre-commit 2 | on: 3 | pull_request: 4 | push: 5 | jobs: 6 | pre-commit: 7 | runs-on: ubuntu-latest 8 | env: 9 | SKIP: no-commit-to-branch 10 | steps: 11 | - uses: actions/checkout@v3 12 | - uses: actions/setup-python@v3 13 | - uses: pre-commit/action@v3.0.0 14 | -------------------------------------------------------------------------------- /.github/workflows/run-test.yml: -------------------------------------------------------------------------------- 1 | name: Test & Upload coverage 2 | on: [push] 3 | jobs: 4 | run_pytest_upload_coverage: 5 | runs-on: ubuntu-latest 6 | env: 7 | OS: ubuntu-latest 8 | steps: 9 | - uses: actions/checkout@v3 10 | with: 11 | fetch-depth: 0 12 | - name: Setup Python 13 | uses: actions/setup-python@v3 14 | with: 15 | python-version: "3.11" 16 | - name: Build myfm 17 | run: | 18 | pip install --upgrade pip 19 | pip install numpy scipy pandas 20 | sudo apt-get install lcov 21 | FLAGS="-fprofile-arcs -ftest-coverage" 22 | CFLAGS="$FLAGS" CXXFLAGS="$FLAGS" pip install -e . 23 | - name: Run pytest 24 | run: | 25 | pip install pytest pytest-cov pytest-mock 26 | pytest --cov=./src/myfm tests/ 27 | - name: Generate coverage (ubuntu) 28 | run: | 29 | coverage xml 30 | lcov -d `pwd` -c -o coverage.info 31 | - name: Upload coverage to Codecov 32 | uses: codecov/codecov-action@v1 33 | with: 34 | files: ./coverage.xml,./coverage.info 35 | verbose: false 36 | env_vars: OS,PYTHON 37 | name: codecov-umbrella 38 | fail_ci_if_error: false 39 | -------------------------------------------------------------------------------- /.github/workflows/wheels.yml: -------------------------------------------------------------------------------- 1 | name: Build wheel 2 | on: 3 | push: 4 | branches: 5 | - main 6 | release: 7 | types: 8 | - created 9 | env: 10 | cibuildwheel_version: "2.12.2" 11 | jobs: 12 | build_sdist: 13 | name: Build source distribution 14 | runs-on: ubuntu-latest 15 | steps: 16 | - uses: actions/checkout@v3 17 | with: 18 | fetch-depth: 0 19 | - uses: actions/setup-python@v3 20 | name: Install Python 21 | with: 22 | python-version: "3.11" 23 | - name: Build sdist 24 | run: pip install pybind11 && python setup.py sdist 25 | - uses: actions/upload-artifact@v2 26 | with: 27 | path: dist/*.tar.gz 28 | build_wheels: 29 | name: Build wheels on ${{ matrix.os }} 30 | runs-on: ${{ matrix.os }} 31 | env: 32 | MACOSX_DEPLOYMENT_TARGET: "10.9" 33 | CIBW_BUILD_VERBOSITY: "1" 34 | CIBW_BUILD: "${{ matrix.cibw.build || '*' }}" 35 | CIBW_SKIP: "${{ matrix.cibw.skip || '' }}" 36 | CIBW_ENVIRONMENT: "${{ matrix.cibw.env || '' }}" 37 | CIBW_TEST_COMMAND: "pytest {project}/tests" 38 | CIBW_TEST_REQUIRES: pytest pytest-mock 39 | CIBW_MANYLINUX_X86_64_IMAGE: "${{ matrix.cibw.manylinux_image }}" 40 | CIBW_MANYLINUX_I686_IMAGE: "${{ matrix.cibw.manylinux_image }}" 41 | CIBW_MANYLINUX_AARCH64_IMAGE: "${{ matrix.cibw.manylinux_image }}" 42 | 
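# Architecture selection is per matrix entry below; cibuildwheel's 'auto' default builds only for the host architecture.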
CIBW_ARCHS_LINUX: "${{ matrix.cibw.arch || 'auto' }}" 43 | CIBW_ARCHS_MACOS: "${{ matrix.cibw.arch || 'auto' }}" 44 | strategy: 45 | matrix: 46 | include: 47 | - os: macos-10.15 48 | name: mac 49 | cibw: 50 | arch: x86_64 51 | build: "cp37* cp38*" 52 | 53 | - os: macos-10.15 54 | name: mac-arm 55 | cibw: 56 | arch: universal2 57 | build: "cp39* cp310* cp311*" 58 | 59 | - os: ubuntu-20.04 60 | name: manylinux1 61 | cibw: 62 | build: "cp37*" 63 | skip: "*musllinux*" 64 | manylinux_image: manylinux2010 65 | arch: auto64 66 | 67 | - os: ubuntu-20.04 68 | name: manylinux2014 69 | cibw: 70 | build: "cp38* cp39* cp310* cp311*" 71 | skip: "*musllinux*" 72 | manylinux_image: manylinux2014 73 | arch: auto64 74 | 75 | - os: ubuntu-20.04 76 | name: manylinux_aarch64_cp37 77 | cibw: 78 | build: "cp37*" 79 | skip: "*musllinux*" 80 | manylinux_image: manylinux2014 81 | arch: aarch64 82 | 83 | - os: ubuntu-20.04 84 | name: manylinux_aarch64_cp38 85 | cibw: 86 | build: "cp38*" 87 | skip: "*musllinux*" 88 | manylinux_image: manylinux2014 89 | arch: aarch64 90 | 91 | - os: ubuntu-20.04 92 | name: manylinux_aarch64_cp39 93 | cibw: 94 | build: "cp39*" 95 | skip: "*musllinux*" 96 | manylinux_image: manylinux2014 97 | arch: aarch64 98 | 99 | - os: ubuntu-20.04 100 | name: manylinux_aarch64_cp310 101 | cibw: 102 | build: "cp310*" 103 | skip: "*musllinux*" 104 | manylinux_image: manylinux2014 105 | arch: aarch64 106 | 107 | - os: ubuntu-20.04 108 | name: manylinux_aarch64_cp311 109 | cibw: 110 | build: "cp311*" 111 | skip: "*musllinux*" 112 | manylinux_image: manylinux2014 113 | arch: aarch64 114 | 115 | - os: windows-2019 116 | name: win_amd64 117 | architecture: x64 118 | cibw: 119 | skip: "cp36*" 120 | build: "cp*win_amd64" 121 | 122 | steps: 123 | - uses: actions/checkout@v2 124 | with: 125 | fetch-depth: 0 126 | - uses: actions/setup-python@v2 127 | name: Install Python 128 | - name: register qemu 129 | if: contains(matrix.cibw.arch, 'aarch64') 130 | run: | 131 | docker run --rm --privileged hypriot/qemu-register:v4.2.0 132 | - name: Install cibuildwheel 133 | run: python -m pip install cibuildwheel=="${{env.cibuildwheel_version}}" 134 | - name: Build wheels 135 | run: python -m cibuildwheel --output-dir wheelhouse 136 | 137 | - uses: actions/upload-artifact@v2 138 | with: 139 | path: ./wheelhouse/*.whl 140 | 141 | upload_pypi: 142 | needs: [build_wheels, build_sdist] 143 | runs-on: ubuntu-latest 144 | steps: 145 | - uses: actions/download-artifact@v2 146 | with: 147 | name: artifact 148 | path: dist 149 | - name: Publish package to TestPyPI 150 | uses: pypa/gh-action-pypi-publish@master 151 | with: 152 | user: __token__ 153 | password: ${{ secrets.TEST_PYPI_APITOKEN }} 154 | packages_dir: dist/ 155 | repository_url: https://test.pypi.org/legacy/ 156 | verbose: true 157 | skip_existing: true 158 | - name: Publish package to PyPI 159 | if: github.event_name == 'release' 160 | uses: pypa/gh-action-pypi-publish@master 161 | with: 162 | user: __token__ 163 | password: ${{ secrets.PYPI_APITOKEN }} 164 | packages_dir: dist/ 165 | verbose: true 166 | skip_existing: true 167 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | build 2 | eigen 3 | pybind11 4 | .python-version 5 | .clangd 6 | eigen-eigen-323c052e1731/ 7 | myfm.egg-info/ 8 | tmp 9 | .eggs 10 | *.pyc 11 | *.so 12 | .vscode 13 | compile_commands.json 14 | eigen3.zip 15 | dist/ 16 | *-checkpoint.ipynb 17 | doc/_build/* 18 | 
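# Vendored Eigen copies downloaded during source builds (see README).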
eigen-3.3.7/ 19 | eigen-3.4.0/ 20 | doc/source/_build 21 | 22 | stubs/* 23 | 24 | doc/source/api_reference/*.rst 25 | .cache -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # See https://pre-commit.com for more information 2 | # See https://pre-commit.com/hooks.html for more hooks 3 | repos: 4 | - repo: https://github.com/pre-commit/pre-commit-hooks 5 | rev: v3.2.0 6 | hooks: 7 | - id: check-merge-conflict 8 | - id: check-yaml 9 | - id: end-of-file-fixer 10 | - id: no-commit-to-branch 11 | args: [--branch, main] 12 | - id: trailing-whitespace 13 | - id: check-added-large-files 14 | - repo: https://github.com/PyCQA/isort 15 | rev: 5.12.0 16 | hooks: 17 | - id: isort 18 | name: isort 19 | - repo: https://github.com/psf/black 20 | rev: 22.3.0 21 | hooks: 22 | - id: black 23 | language_version: python3 # Should be a command that runs python3.6+ 24 | - repo: https://github.com/hadialqattan/pycln 25 | rev: v1.1.0 26 | hooks: 27 | - id: pycln 28 | args: [--config=pyproject.toml] -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | # Required 2 | version: 2 3 | 4 | # Build documentation in the docs/ directory with Sphinx 5 | sphinx: 6 | configuration: doc/source/conf.py 7 | 8 | # Optionally build your docs in additional formats such as PDF 9 | formats: 10 | - pdf 11 | 12 | # Optionally set the version of Python and requirements required to build your docs 13 | python: 14 | version: 3.7 15 | install: 16 | - method: pip 17 | path: . 18 | - requirements: doc/requirements.txt -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(CMAKE_C_COMPILER gcc) 2 | set(CMAKE_CXX_COMPILER g++) 3 | 4 | cmake_minimum_required(VERSION 3.0.0) 5 | project(myfm) 6 | 7 | set(CMAKE_BUILD_TYPE Release) 8 | set(CMAKE_CXX_FLAGS " ${CMAKE_CXX_FLAGS_INIT} -std=c++11 -fPIC") 9 | 10 | add_subdirectory(pybind11) 11 | include_directories(include eigen-3.3.7) 12 | pybind11_add_module(_myfm cpp_source/bind.cpp cpp_source/Faddeeva.cc) 13 | 14 | set(CPACK_PROJECT_NAME ${PROJECT_NAME}) 15 | set(CPACK_PROJECT_VERSION ${PROJECT_VERSION}) 16 | set(CMAKE_EXPORT_COMPILE_COMMANDS ON) -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Tomoki Ohtsuki 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software.
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include include/myfm *.hpp 2 | include LICENSE README.md cpp_source/declare_module.hpp include/Faddeeva/Faddeeva.hh cpp_source/Faddeeva.cc -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # myFM 2 | [![Python](https://img.shields.io/badge/python-3.7%20%7C%203.8%20%7C%203.9%20%7C%203.10-blue)](https://www.python.org) 3 | [![pypi](https://img.shields.io/pypi/v/myfm.svg)](https://pypi.python.org/pypi/myfm) 4 | [![GitHub license](https://img.shields.io/badge/license-MIT-blue.svg)](https://github.com/tohtsky/myFM) 5 | [![Build](https://github.com/tohtsky/myFM/workflows/Build%20wheel/badge.svg?branch=main)](https://github.com/tohtsky/myfm) 6 | [![Read the Docs](https://readthedocs.org/projects/myfm/badge/?version=stable)](https://myfm.readthedocs.io/en/stable/) 7 | [![codecov](https://codecov.io/gh/tohtsky/myfm/branch/main/graph/badge.svg?token=kLgOKTQqcV)](https://codecov.io/gh/tohtsky/myfm) 8 | 9 | 10 | myFM is an implementation of Bayesian [Factorization Machines](https://ieeexplore.ieee.org/abstract/document/5694074/) based on Gibbs sampling, which I believe is a wheel worth reinventing. 11 | 12 | Currently this supports most options of the libFM MCMC engine, such as 13 | 14 | - Grouping of input variables (the `-meta` option of [libFM](https://github.com/srendle/libfm)) 15 | - Relation Data format (see the paper ["Scaling Factorization Machines to relational data"](https://dl.acm.org/citation.cfm?id=2488340)) 16 | 17 | There are also functionalities not present in libFM: 18 | 19 | - A Gibbs sampler for ordered probit regression [5] implementing the Metropolis-within-Gibbs scheme of [6]. 20 | - Variational inference for regression and binary classification. 21 | 22 | A tutorial and reference documentation are provided at https://myfm.readthedocs.io/en/latest/. 23 | 24 | # Installation 25 | 26 | The package is pip-installable. 27 | 28 | ``` 29 | pip install myfm 30 | ``` 31 | 32 | Pre-built binaries are available for major operating systems. 33 | 34 | If you are working with a less common OS/architecture, pip will attempt to build myFM from source (you need a decent C++ compiler!). In that case, in addition to installing the Python dependencies (`numpy`, `scipy`, `pandas`, ...), the above command will automatically download Eigen (v3.4.0) into its build directory and use it during the build. 35 | 36 | # Examples 37 | 38 | ## A Toy example 39 | 40 | This example is taken from [pyfm](https://github.com/coreylynch/pyFM) with some modification.
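It one-hot encodes a handful of categorical user/item features with `DictVectorizer` and fits a binary classifier; note that `fm.predict` returns hard labels, while `fm.predict_proba` (used in the quickstart documentation) returns class probabilities.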
41 | 42 | ```python 43 | import myfm 44 | from sklearn.feature_extraction import DictVectorizer 45 | import numpy as np 46 | train = [ 47 | {"user": "1", "item": "5", "age": 19}, 48 | {"user": "2", "item": "43", "age": 33}, 49 | {"user": "3", "item": "20", "age": 55}, 50 | {"user": "4", "item": "10", "age": 20}, 51 | ] 52 | v = DictVectorizer() 53 | X = v.fit_transform(train) 54 | print(X.toarray()) 55 | # print 56 | # [[ 19. 0. 0. 0. 1. 1. 0. 0. 0.] 57 | # [ 33. 0. 0. 1. 0. 0. 1. 0. 0.] 58 | # [ 55. 0. 1. 0. 0. 0. 0. 1. 0.] 59 | # [ 20. 1. 0. 0. 0. 0. 0. 0. 1.]] 60 | y = np.asarray([0, 1, 1, 0]) 61 | fm = myfm.MyFMClassifier(rank=4) 62 | fm.fit(X, y) 63 | fm.predict(v.transform({"user": "1", "item": "10", "age": 24})) 64 | ``` 65 | 66 | ## A Movielens-100k Example 67 | 68 | This example requires `pandas` and `scikit-learn`; the dataset is downloaded and cached by `myfm.utils.benchmark_data.MovieLens100kDataManager`, used below. 69 | 70 | You will be able to obtain a result comparable to SOTA algorithms like GC-MC. See `examples/ml-100k.ipynb` for the detailed version. 71 | 72 | ```python 73 | import numpy as np 74 | from sklearn.preprocessing import OneHotEncoder 75 | from sklearn import metrics 76 | 77 | import myfm 78 | from myfm.utils.benchmark_data import MovieLens100kDataManager 79 | 80 | data_manager = MovieLens100kDataManager() 81 | df_train, df_test = data_manager.load_rating_predefined_split( 82 | fold=3 83 | ) # Note the dependence on the fold 84 | 85 | def test_myfm(df_train, df_test, rank=8, grouping=None, n_iter=100, samples=95): 86 | explanation_columns = ["user_id", "movie_id"] 87 | ohe = OneHotEncoder(handle_unknown="ignore") 88 | X_train = ohe.fit_transform(df_train[explanation_columns]) 89 | X_test = ohe.transform(df_test[explanation_columns]) 90 | y_train = df_train.rating.values 91 | y_test = df_test.rating.values 92 | fm = myfm.MyFMRegressor(rank=rank, random_seed=114514) 93 | 94 | if grouping: 95 | # specify how columns of X_train are grouped 96 | group_shapes = [len(category) for category in ohe.categories_] 97 | assert sum(group_shapes) == X_train.shape[1] 98 | else: 99 | group_shapes = None 100 | 101 | fm.fit( 102 | X_train, 103 | y_train, 104 | group_shapes=group_shapes, 105 | n_iter=n_iter, 106 | n_kept_samples=samples, 107 | ) 108 | prediction = fm.predict(X_test) 109 | rmse = ((y_test - prediction) ** 2).mean() ** 0.5 110 | mae = np.abs(y_test - prediction).mean() 111 | print("rmse={rmse}, mae={mae}".format(rmse=rmse, mae=mae)) 112 | return fm 113 | 114 | 115 | # basic regression 116 | test_myfm(df_train, df_test, rank=8) 117 | # rmse=0.90321, mae=0.71164 118 | 119 | # with grouping 120 | fm = test_myfm(df_train, df_test, rank=8, grouping=True) 121 | # rmse=0.89594, mae=0.70481 122 | ``` 123 | 124 | ## Examples for Relational Data format 125 | 126 | Below is a toy Movielens-like example that utilizes the relational data format proposed in [3]. 127 | 128 | This example, however, is too simplistic to exhibit the computational advantage of this data format.
For an example with drastically reduced computational complexity, see `examples/ml-100k-extended.ipynb`; 129 | 130 | ```python 131 | import pandas as pd 132 | import numpy as np 133 | from myfm import MyFMRegressor, RelationBlock 134 | from sklearn.preprocessing import OneHotEncoder 135 | 136 | users = pd.DataFrame([ 137 | {'user_id': 1, 'age': '20s', 'married': False}, 138 | {'user_id': 2, 'age': '30s', 'married': False}, 139 | {'user_id': 3, 'age': '40s', 'married': True} 140 | ]).set_index('user_id') 141 | 142 | movies = pd.DataFrame([ 143 | {'movie_id': 1, 'comedy': True, 'action': False }, 144 | {'movie_id': 2, 'comedy': False, 'action': True }, 145 | {'movie_id': 3, 'comedy': True, 'action': True} 146 | ]).set_index('movie_id') 147 | 148 | ratings = pd.DataFrame([ 149 | {'user_id': 1, 'movie_id': 1, 'rating': 2}, 150 | {'user_id': 1, 'movie_id': 2, 'rating': 5}, 151 | {'user_id': 2, 'movie_id': 2, 'rating': 4}, 152 | {'user_id': 2, 'movie_id': 3, 'rating': 3}, 153 | {'user_id': 3, 'movie_id': 3, 'rating': 3}, 154 | ]) 155 | 156 | user_ids, user_indices = np.unique(ratings.user_id, return_inverse=True) 157 | movie_ids, movie_indices = np.unique(ratings.movie_id, return_inverse=True) 158 | 159 | user_ohe = OneHotEncoder(handle_unknown='ignore').fit(users.reset_index()) # include user id as feature 160 | movie_ohe = OneHotEncoder(handle_unknown='ignore').fit(movies.reset_index()) 161 | 162 | X_user = user_ohe.transform( 163 | users.reindex(user_ids).reset_index() 164 | ) 165 | X_movie = movie_ohe.transform( 166 | movies.reindex(movie_ids).reset_index() 167 | ) 168 | 169 | block_user = RelationBlock(user_indices, X_user) 170 | block_movie = RelationBlock(movie_indices, X_movie) 171 | 172 | fm = MyFMRegressor(rank=2).fit(None, ratings.rating, X_rel=[block_user, block_movie]) 173 | 174 | prediction_df = pd.DataFrame([ 175 | dict(user_id=user_id,movie_id=movie_id, 176 | user_index=user_index, movie_index=movie_index) 177 | for user_index, user_id in enumerate(user_ids) 178 | for movie_index, movie_id in enumerate(movie_ids) 179 | ]) 180 | predicted_rating = fm.predict(None, [ 181 | RelationBlock(prediction_df.user_index, X_user), 182 | RelationBlock(prediction_df.movie_index, X_movie) 183 | ]) 184 | 185 | prediction_df['prediction'] = predicted_rating 186 | 187 | print( 188 | prediction_df.merge(ratings.rename(columns={'rating':'ground_truth'}), how='left') 189 | ) 190 | ``` 191 | 192 | # References 193 | 194 | 1. Rendle, Steffen. "Factorization machines." 2010 IEEE International Conference on Data Mining. IEEE, 2010. 195 | 1. Rendle, Steffen. "Factorization machines with libfm." ACM Transactions on Intelligent Systems and Technology (TIST) 3.3 (2012): 57. 196 | 1. Rendle, Steffen. "Scaling factorization machines to relational data." Proceedings of the VLDB Endowment. Vol. 6. No. 5. VLDB Endowment, 2013. 197 | 1. Bayer, Immanuel. "fastfm: A library for factorization machines." arXiv preprint arXiv:1505.00641 (2015). 198 | 1. Albert, James H., and Siddhartha Chib. "Bayesian analysis of binary and polychotomous response data." Journal of the American statistical Association 88.422 (1993): 669-679. 199 | 1. Albert, James H., and Siddhartha Chib. "Sequential ordinal modeling with applications to survival data." Biometrics 57.3 (2001): 829-836. 
200 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | ignore: 2 | - "**/Faddeeva.*" 3 | -------------------------------------------------------------------------------- /cpp_source/bind.cpp: -------------------------------------------------------------------------------- 1 | #include "declare_module.hpp" 2 | 3 | PYBIND11_MODULE(_myfm, m) { 4 | declare_functional(m); 5 | } 6 | -------------------------------------------------------------------------------- /cpp_source/bind_float.cpp: -------------------------------------------------------------------------------- 1 | #include "declare_module.hpp" 2 | 3 | PYBIND11_MODULE(_myfm_float, m) { 4 | declare_functional(m); 5 | } 6 | -------------------------------------------------------------------------------- /doc/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /doc/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx==4.4.0 2 | -------------------------------------------------------------------------------- /doc/source/api_reference.rst: -------------------------------------------------------------------------------- 1 | .. _APIReference: 2 | 3 | ============== 4 | API References 5 | ============== 6 | 7 | .. currentmodule:: myfm 8 | 9 | Training API 10 | ------------ 11 | .. autosummary:: 12 | :toctree: api_reference 13 | 14 | RelationBlock 15 | MyFMRegressor 16 | MyFMClassifier 17 | MyFMGibbsRegressor 18 | MyFMGibbsClassifier 19 | MyFMOrderedProbit 20 | VariationalFMRegressor 21 | VariationalFMClassifier 22 | 23 | .. currentmodule:: myfm 24 | 25 | Benchmark Dataset 26 | ----------------- 27 | .. autosummary:: 28 | :toctree: api_reference 29 | 30 | utils.benchmark_data.MovieLens100kDataManager 31 | utils.benchmark_data.MovieLens1MDataManager 32 | utils.benchmark_data.MovieLens10MDataManager 33 | 34 | 35 | Utilities for Sparse Matrix Construction 36 | ---------------------------------------- 37 | 38 | .. autosummary:: 39 | :toctree: api_reference 40 | 41 | utils.encoders.DataFrameEncoder 42 | utils.encoders.CategoryValueToSparseEncoder 43 | utils.encoders.MultipleValuesToSparseEncoder 44 | utils.encoders.BinningEncoder 45 | -------------------------------------------------------------------------------- /doc/source/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. 
For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | 14 | # sys.path.insert(0, os.path.abspath('../../')) 15 | 16 | # -- Project information ----------------------------------------------------- 17 | 18 | project = "myFM" 19 | copyright = "2020, Tomoki Ohtsuki" 20 | author = "Tomoki Ohtsuki" 21 | 22 | # The full version, including alpha/beta/rc tags 23 | release = "0.2.1" 24 | 25 | 26 | # -- General configuration --------------------------------------------------- 27 | 28 | # Add any Sphinx extension module names here, as strings. They can be 29 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 30 | # ones. 31 | extensions = [ 32 | "sphinx.ext.autodoc", 33 | "sphinx.ext.autosummary", 34 | "sphinx.ext.todo", 35 | "sphinx.ext.doctest", 36 | "sphinx.ext.viewcode", 37 | "sphinx.ext.autodoc", 38 | "sphinx.ext.napoleon", 39 | "sphinx_rtd_theme", 40 | ] 41 | 42 | napoleon_google_docstring = False 43 | napoleon_numpy_docstring = True 44 | napoleon_include_private_with_doc = False 45 | napoleon_include_special_with_doc = False 46 | napoleon_use_admonition_for_examples = False 47 | napoleon_use_admonition_for_notes = False 48 | napoleon_use_admonition_for_references = False 49 | napoleon_use_ivar = True 50 | napoleon_use_param = True 51 | napoleon_use_rtype = True 52 | 53 | 54 | autosummary_generate = ["api_reference.rst"] 55 | 56 | 57 | autodoc_default_flags = ["members", "inherited-members", "show-inheritance"] 58 | autodoc_default_options = { 59 | "members": True, 60 | "inherited-members": True, 61 | "show-inheritance": True, 62 | } 63 | 64 | 65 | autoclass_content = "class" 66 | # Add any paths that contain templates here, relative to this directory. 67 | templates_path = ["_templates"] 68 | 69 | # List of patterns, relative to source directory, that match files and 70 | # directories to ignore when looking for source files. 71 | # This pattern also affects html_static_path and html_extra_path. 72 | exclude_patterns = [] 73 | 74 | 75 | # -- Options for HTML output ------------------------------------------------- 76 | 77 | # The theme to use for HTML and HTML Help pages. See the documentation for 78 | # a list of builtin themes. 79 | # 80 | html_theme = "sphinx_rtd_theme" 81 | 82 | # Add any paths that contain custom static files (such as style sheets) here, 83 | # relative to this directory. They are copied after the builtin static files, 84 | # so a file named "default.css" will overwrite the builtin "default.css". 85 | html_static_path = ["_static"] 86 | 87 | master_doc = "index" 88 | -------------------------------------------------------------------------------- /doc/source/index.rst: -------------------------------------------------------------------------------- 1 | .. myfm documentation master file, created by 2 | sphinx-quickstart on Wed Aug 19 13:39:04 2020. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 
5 | 6 | 7 | myFM - Bayesian Factorization Machines in Python/C++ 8 | ==================================================== 9 | 10 | **myFM** is an unofficial implementation of Bayesian Factorization Machines in Python/C++. 11 | Notable features include: 12 | 13 | * Implementation of all the corresponding functionalities in the `libFM `_ MCMC engine (including grouping & relation blocks) 14 | * A simpler and faster implementation using `Pybind11 `_ and `Eigen `_ 15 | * Gibbs sampling for **ordinal regression** with a probit link function. See :ref:`the tutorial ` for its usage. 16 | * Support for variational inference, which converges faster and requires less memory (but is usually less accurate than Gibbs sampling). 17 | 18 | 19 | In most cases, you can install the library from PyPI: :: 20 | 21 | pip install myfm 22 | 23 | It has an interface similar to sklearn's, and you can use it for a wide variety of prediction tasks. 24 | For example, 25 | 26 | .. testcode:: 27 | 28 | from sklearn.datasets import load_breast_cancer 29 | from sklearn.model_selection import train_test_split 30 | from sklearn.preprocessing import StandardScaler 31 | from sklearn import metrics 32 | 33 | from myfm import MyFMClassifier 34 | 35 | dataset = load_breast_cancer() 36 | X = StandardScaler().fit_transform(dataset['data']) 37 | y = dataset['target'] 38 | 39 | X_train, X_test, y_train, y_test = train_test_split( 40 | X, y, random_state=42 41 | ) 42 | fm = MyFMClassifier(rank=2).fit(X_train, y_train) 43 | 44 | print(metrics.roc_auc_score(y_test, fm.predict_proba(X_test))) 45 | # 0.9954 46 | 47 | .. testoutput:: 48 | :hide: 49 | :options: +ELLIPSIS 50 | 51 | 0.99... 52 | 53 | 54 | Try out the following :ref:`examples ` to see how Bayesian approaches to explicit collaborative filtering 55 | are still very competitive (almost unbeaten)! 56 | 57 | .. toctree:: 58 | :caption: Basic Usage 59 | :maxdepth: 1 60 | 61 | quickstart 62 | movielens 63 | relation-blocks 64 | ordinal-regression 65 | 66 | .. toctree:: 67 | :caption: Details 68 | :maxdepth: 1 69 | 70 | api_reference 71 | 72 | 73 | Indices and tables 74 | ================== 75 | 76 | * :ref:`genindex` 77 | * :ref:`search` 78 | -------------------------------------------------------------------------------- /doc/source/movielens.rst: -------------------------------------------------------------------------------- 1 | .. _MovielensIndex: 2 | 3 | ========================================= 4 | A Basic Tutorial with Movielens 100K 5 | ========================================= 6 | 7 | FMs perform remarkably well on datasets with huge and sparse feature matrices, 8 | and the most common examples are (explicit) collaborative filtering tasks. 9 | 10 | Let us examine the power of Bayesian Factorization Machines 11 | by testing a series of APIs in myFM on the well-known Movielens 100k dataset. 12 | 13 | 14 | ------------------------- 15 | Pure Matrix Factorization 16 | ------------------------- 17 | 18 | First, let us consider probabilistic Matrix Factorization. 19 | That is, we model the user :math:`u`'s rating response to movie :math:`i`, 20 | which we write :math:`r_{ui}`, as 21 | 22 | .. math:: 23 | r_{ui} \sim w_0 + b_u + d_i + \vec{u}_u \cdot \vec{v}_i 24 | 25 | This formulation is equivalent to Factorization Machines with 26 | 27 | 1. User IDs treated as a categorical feature with one-hot encoding 28 | 2. 
Movie IDs treated as a categorical feature with one-hot encoding 29 | 30 | So you can efficiently use an encoder like sklearn's `OneHotEncoder `_ 31 | to prepare the input matrix. 32 | 33 | .. testcode :: 34 | 35 | import numpy as np 36 | from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder 37 | from sklearn import metrics 38 | 39 | import myfm 40 | from myfm.utils.benchmark_data import MovieLens100kDataManager 41 | 42 | FM_RANK = 10 43 | 44 | data_manager = MovieLens100kDataManager() 45 | df_train, df_test = data_manager.load_rating_predefined_split(fold=3) 46 | 47 | FEATURE_COLUMNS = ['user_id', 'movie_id'] 48 | ohe = OneHotEncoder(handle_unknown='ignore') 49 | 50 | X_train = ohe.fit_transform(df_train[FEATURE_COLUMNS]) 51 | X_test = ohe.transform(df_test[FEATURE_COLUMNS]) 52 | y_train = df_train.rating.values 53 | y_test = df_test.rating.values 54 | 55 | fm = myfm.MyFMRegressor(rank=FM_RANK, random_seed=42) 56 | fm.fit(X_train, y_train, n_iter=200, n_kept_samples=200) 57 | 58 | prediction = fm.predict(X_test) 59 | rmse = ((y_test - prediction) ** 2).mean() ** .5 60 | mae = np.abs(y_test - prediction).mean() 61 | print(f'rmse={rmse}, mae={mae}') 62 | 63 | .. testoutput:: 64 | :hide: 65 | :options: +ELLIPSIS 66 | 67 | rmse=..., mae=... 68 | 69 | The above script should give you RMSE=0.8944, MAE=0.7031, which is already 70 | impressive compared with other recent methods. 71 | 72 | .. _grouping: 73 | 74 | ------------------------------------------- 75 | Assuming Separate Variance for movie & user 76 | ------------------------------------------- 77 | 78 | In Probabilistic Matrix Factorization, we usually assume 79 | user vectors and item vectors are drawn from separate normal priors: 80 | 81 | .. math:: 82 | \vec{u}_u & \sim \mathcal{N}(\mu_U, \Sigma_U) \\ 83 | \vec{v}_i & \sim \mathcal{N}(\mu_I, \Sigma_I) 84 | 85 | However, we haven't provided any information about which columns are users' and items'. 86 | 87 | You can give :py:class:`myfm.MyFMRegressor` this information (i.e., which parameters share a common mean and variance) via the ``group_shapes`` option: 88 | 89 | .. testcode :: 90 | 91 | fm_grouped = myfm.MyFMRegressor( 92 | rank=FM_RANK, random_seed=42, 93 | ) 94 | fm_grouped.fit( 95 | X_train, y_train, n_iter=200, n_kept_samples=200, 96 | group_shapes=[len(group) for group in ohe.categories_] 97 | ) 98 | 99 | prediction_grouped = fm_grouped.predict(X_test) 100 | rmse = ((y_test - prediction_grouped) ** 2).mean() ** .5 101 | mae = np.abs(y_test - prediction_grouped).mean() 102 | print(f'rmse={rmse}, mae={mae}') 103 | 104 | .. testoutput:: 105 | :hide: 106 | :options: +ELLIPSIS 107 | 108 | rmse=..., mae=... 109 | 110 | 111 | This will slightly improve the performance to RMSE=0.8925, MAE=0.7001. 112 | 113 | 114 | ------------------------------------------- 115 | Adding Side information 116 | ------------------------------------------- 117 | 118 | It is straightforward to include user/item side information. 119 | 120 | First, we retrieve the side information from ``MovieLens100kDataManager``: 121 | 122 | .. 
testcode :: 123 | 124 | user_info = data_manager.load_user_info().set_index('user_id') 125 | user_info["age"] = user_info.age // 5 * 5 126 | user_info["zipcode"] = user_info.zipcode.str[0] 127 | user_info_ohe = OneHotEncoder(handle_unknown='ignore').fit(user_info) 128 | 129 | movie_info = data_manager.load_movie_info().set_index('movie_id') 130 | movie_info['release_year'] = [ 131 | str(x) for x in movie_info['release_date'].dt.year.fillna('NaN') 132 | ] 133 | movie_info = movie_info[['release_year', 'genres']] 134 | movie_info_ohe = OneHotEncoder(handle_unknown='ignore').fit(movie_info[['release_year']]) 135 | movie_genre_mle = MultiLabelBinarizer(sparse_output=True).fit( 136 | movie_info.genres.apply(lambda x: x.split('|')) 137 | ) 138 | 139 | 140 | 141 | Note that the way movie genre information is represented in the ``movie_info`` DataFrame is a bit tricky (all the genres of a movie are packed into a single pipe-separated string). 142 | 143 | We can then augment ``X_train`` / ``X_test`` with auxiliary information. The `hstack `_ function of ``scipy.sparse`` is very convenient for this purpose: 144 | 145 | .. testcode :: 146 | 147 | import scipy.sparse as sps 148 | X_train_extended = sps.hstack([ 149 | X_train, 150 | user_info_ohe.transform( 151 | user_info.reindex(df_train.user_id) 152 | ), 153 | movie_info_ohe.transform( 154 | movie_info.reindex(df_train.movie_id).drop(columns=['genres']) 155 | ), 156 | movie_genre_mle.transform( 157 | movie_info.genres.reindex(df_train.movie_id).apply(lambda x: x.split('|')) 158 | ) 159 | ]) 160 | 161 | X_test_extended = sps.hstack([ 162 | X_test, 163 | user_info_ohe.transform( 164 | user_info.reindex(df_test.user_id) 165 | ), 166 | movie_info_ohe.transform( 167 | movie_info.reindex(df_test.movie_id).drop(columns=['genres']) 168 | ), 169 | movie_genre_mle.transform( 170 | movie_info.genres.reindex(df_test.movie_id).apply(lambda x: x.split('|')) 171 | ) 172 | ]) 173 | 174 | Then we can regress ``X_train_extended`` against ``y_train`` 175 | 176 | .. testcode :: 177 | 178 | group_shapes_extended = ( 179 | [len(group) for group in ohe.categories_] + 180 | [len(group) for group in user_info_ohe.categories_] + 181 | [len(group) for group in movie_info_ohe.categories_] + 182 | [len(movie_genre_mle.classes_)] 183 | ) 184 | 185 | fm_side_info = myfm.MyFMRegressor( 186 | rank=FM_RANK, random_seed=42, 187 | ) 188 | fm_side_info.fit( 189 | X_train_extended, y_train, n_iter=200, n_kept_samples=200, 190 | group_shapes=group_shapes_extended 191 | ) 192 | 193 | prediction_side_info = fm_side_info.predict(X_test_extended) 194 | rmse = ((y_test - prediction_side_info) ** 2).mean() ** .5 195 | mae = np.abs(y_test - prediction_side_info).mean() 196 | print(f'rmse={rmse}, mae={mae}') 197 | 198 | .. testoutput:: 199 | :hide: 200 | :options: +ELLIPSIS 201 | 202 | rmse=..., mae=... 203 | 204 | The result should improve further with RMSE = 0.8855, MAE = 0.6944. 205 | 206 | Unfortunately, the running time is somewhat (~ 4 times) slower compared to 207 | the pure matrix factorization described above. This is to be expected: 208 | the complexity of Bayesian FMs is proportional to :math:`O(\mathrm{NNZ})` 209 | (i.e., the number of non-zero elements of the input sparse matrix), 210 | and we have added many non-zero elements (user/item features) to each row. 211 | 212 | Surprisingly, we can still train the equivalent model 213 | in a running time close to pure MF if we represent the data in the Relational Data Format. 214 | See :ref:`next section ` for how the Relational Data Format works.
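Before moving on, here is a rough sanity check of the :math:`O(\mathrm{NNZ})` scaling discussed above — a minimal sketch reusing the matrices already built in this section; the per-row NNZ ratio should roughly track the observed slowdown:

.. testcode ::

    # average number of non-zero elements per row, with and without side information
    nnz_per_row_plain = X_train.nnz / X_train.shape[0]
    nnz_per_row_extended = X_train_extended.nnz / X_train_extended.shape[0]
    print(nnz_per_row_extended / nnz_per_row_plain > 1.0)

.. testoutput ::
    :hide:

    True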
215 | -------------------------------------------------------------------------------- /doc/source/ordinal-regression.rst: -------------------------------------------------------------------------------- 1 | 2 | .. currentmodule:: myfm 3 | .. _OrdinalRegression: 4 | 5 | ---------------------------- 6 | Ordinal Regression Tutorial 7 | ---------------------------- 8 | 9 | .. _OrdinalUCLA: 10 | 11 | UCLA Dataset 12 | ^^^^^^^^^^^^^^^^^^^^^^ 13 | 14 | Let us first explain the API of :py:class:`myfm.MyFMOrderedProbit` 15 | using the `UCLA dataset `_. 16 | 17 | The data description says 18 | 19 | This hypothetical data set has a three level variable called apply, with levels “unlikely”, “somewhat likely”, and “very likely”, coded 1, 2, and 3, respectively, that we will use as our outcome variable. We also have three variables that we will use as predictors: pared, which is a 0/1 variable indicating whether at least one parent has a graduate degree; public, which is a 0/1 variable where 1 indicates that the undergraduate institution is public and 0 private, and gpa, which is the student’s grade point average. 20 | 21 | We can read the data (in Stata format) using pandas: 22 | 23 | .. testcode :: 24 | 25 | import pandas as pd 26 | df = pd.read_stata("https://stats.idre.ucla.edu/stat/data/ologit.dta") 27 | df.head() 28 | 29 | It should print 30 | 31 | .. csv-table:: 32 | :header-rows: 1 33 | 34 | ,apply,pared,public,gpa 35 | 0,very likely,0,0,3.26 36 | 1,somewhat likely,1,0,3.21 37 | 2,unlikely,1,1,3.94 38 | 3,somewhat likely,0,0,2.81 39 | 4,somewhat likely,0,0,2.53 40 | 41 | We regard the target label ``apply`` as an ordinal categorical variable, 42 | 43 | .. math:: 44 | (\text{unlikely} = 0) < (\text{somewhat likely} = 1) < (\text{very likely} = 2) 45 | 46 | so we map ``apply`` as 47 | 48 | .. testcode :: 49 | 50 | y = df['apply'].map({'unlikely': 0, 'somewhat likely': 1, 'very likely': 2}).values 51 | 52 | Prepare other features as usual. 53 | 54 | .. testcode :: 55 | 56 | from sklearn.model_selection import train_test_split 57 | from sklearn import metrics 58 | 59 | X = df[['pared', 'public', 'gpa']].values 60 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 61 | 62 | Now we can feed the data into :py:class:`myfm.MyFMOrderedProbit`. 63 | 64 | .. testcode :: 65 | 66 | from myfm import MyFMOrderedProbit 67 | clf = MyFMOrderedProbit(rank=0).fit(X_train, y_train, n_iter=200) 68 | 69 | p = clf.predict_proba(X_test) 70 | 71 | print(f'log_loss={metrics.log_loss(y_test, p)}') 72 | # ~ 0.84, slightly better than the constant-model baseline. 73 | 74 | .. testoutput :: 75 | :hide: 76 | :options: +ELLIPSIS 77 | 78 | log_loss=... 79 | 80 | Note that unlike binary probit regression, :py:meth:`MyFMOrderedProbit.predict_proba` 81 | returns a 2D (N_item x N_class) array of class probabilities. 82 | 83 | Movielens ratings as ordinal outcome 84 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 85 | 86 | Let us now turn back to the :ref:`Movielens 100K tutorial `. 87 | 88 | Although we have treated movie ratings as a real target variable 89 | and used :py:class:`MyFMRegressor`, it is more natural to regard them 90 | as ordinal outcomes, as there is no guarantee that the difference between ratings 4 and 5 91 | is equivalent to that between ratings 2 and 3. 92 | 93 | So let us see what happens if we instead use :py:class:`MyFMOrderedProbit` to predict the rating. 94 | If you have followed the steps through :ref:`the previous ''grouping'' section `, 95 | you can train our ordered probit regressor by 96 | 97 | .. 
testcode :: 98 | 99 | import numpy as np 100 | from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder 101 | from sklearn import metrics 102 | 103 | import myfm 104 | from myfm.utils.benchmark_data import MovieLens100kDataManager 105 | 106 | FM_RANK = 10 107 | 108 | data_manager = MovieLens100kDataManager() 109 | df_train, df_test = data_manager.load_rating_predefined_split(fold=3) 110 | 111 | FEATURE_COLUMNS = ['user_id', 'movie_id'] 112 | ohe = OneHotEncoder(handle_unknown='ignore') 113 | 114 | X_train = ohe.fit_transform(df_train[FEATURE_COLUMNS]) 115 | X_test = ohe.transform(df_test[FEATURE_COLUMNS]) 116 | y_train = df_train.rating.values 117 | y_test = df_test.rating.values 118 | 119 | fm = myfm.MyFMOrderedProbit( 120 | rank=FM_RANK, random_seed=42, 121 | ) 122 | fm.fit( 123 | X_train, y_train - 1, n_iter=300, n_kept_samples=300, 124 | group_shapes=[len(group) for group in ohe.categories_] 125 | ) 126 | 127 | Note that we have used ``y_train - 1`` instead of ``y_train``, 128 | because rating ``r`` should be regarded as class ``r-1``. 129 | 130 | 131 | We can predict the class probabilities given ``X_test`` as 132 | 133 | .. testcode :: 134 | 135 | p_ordinal = fm.predict_proba(X_test) 136 | 137 | and the expected rating as 138 | 139 | .. testcode :: 140 | 141 | expected_rating = p_ordinal.dot(np.arange(1, 6)) 142 | rmse = ((y_test - expected_rating) ** 2).mean() ** .5 143 | mae = np.abs(y_test - expected_rating).mean() 144 | print(f'rmse={rmse}, mae={mae}') 145 | 146 | .. testoutput :: 147 | :hide: 148 | :options: +ELLIPSIS 149 | 150 | rmse=..., mae=... 151 | 152 | 153 | which gives us RMSE=0.8906 and MAE=0.6985, a slight improvement over the regression case. 154 | 155 | To see why it has an advantage over regression, let us check 156 | the posterior samples for the cutpoint parameters. 157 | 158 | .. testcode :: 159 | 160 | cutpoints = fm.cutpoint_samples - fm.w0_samples[:, None] 161 | 162 | You can see what the relationship between rating boundaries and cutpoints looks like:: 163 | 164 | from matplotlib import pyplot as plt 165 | cp_mean = cutpoints.mean(axis=0) 166 | cp_std = cutpoints.std(axis=0) 167 | 168 | plt.plot(np.arange(1, 5), cp_mean); 169 | plt.fill_between( 170 | np.arange(1, 5), cp_mean + 2*cp_std, cp_mean - 2 * cp_std, 171 | alpha=0.3 172 | ) 173 | plt.title('rating boundary vs cutpoint') 174 | 175 | This will give you the following figure. The line is slightly non-linear, 176 | which may explain the advantage of the ordinal-regression formulation. 177 | 178 | .. image:: ./rating_vs_cps.png 179 | :alt: The relationship between cutpoints and rating boundaries is shown. 180 | :width: 50% 181 | 182 | You can also improve the performance for the Movielens 1M & 10M datasets. 183 | See our `examples `_ directory. 184 | -------------------------------------------------------------------------------- /doc/source/quickstart.rst: -------------------------------------------------------------------------------- 1 | =========== 2 | Quick Start 3 | =========== 4 | 5 | 6 | ------------ 7 | Installation 8 | ------------ 9 | 10 | On macOS/Linux, first try:: 11 | 12 | pip install myfm 13 | 14 | If it works, you can now try the examples. 15 | 16 | If something goes wrong, read the :ref:`detailed installation guide ` 17 | and figure out what went wrong. 18 | Of course, feel free to create an issue on `GitHub `_! 19 | 20 | 21 | ------------- 22 | A toy example 23 | ------------- 24 | 25 | Let us first look at how :py:class:`myfm.MyFMClassifier` works for `a toy example provided in pyFM `_.
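(This is the same toy data as in the README example; here we additionally call ``predict_proba`` to obtain class probabilities instead of hard labels.)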
26 | 27 | .. doctest :: 28 | 29 | import myfm 30 | from sklearn.feature_extraction import DictVectorizer 31 | import numpy as np 32 | train = [ 33 | {"user": "1", "item": "5", "age": 19}, 34 | {"user": "2", "item": "43", "age": 33}, 35 | {"user": "3", "item": "20", "age": 55}, 36 | {"user": "4", "item": "10", "age": 20}, 37 | ] 38 | v = DictVectorizer() 39 | 40 | X = v.fit_transform(train) 41 | 42 | # Note that X is a sparse matrix 43 | print(X.toarray()) 44 | 45 | # The target variable to be classified. 46 | y = np.asarray([0, 1, 1, 0]) 47 | fm = myfm.MyFMClassifier(rank=4) 48 | fm.fit(X, y) 49 | 50 | # It also supports prediction for new unseen items. 51 | fm.predict_proba(v.transform([{"user": "1", "item": "10", "age": 24}])) 52 | 53 | .. testoutput :: 54 | :hide: 55 | :options: +ELLIPSIS 56 | 57 | [[ 19. 0. 0. 0. 1. 1. 0. 0. 0.] 58 | [ 33. 0. 0. 1. 0. 0. 1. 0. 0.] 59 | [ 55. 0. 1. 0. 0. 0. 0. 1. 0.] 60 | [ 20. 1. 0. 0. 0. 0. 0. 0. 1.]] 61 | 62 | 63 | As the example suggests, :py:class:`myfm.MyFMClassifier` takes 64 | sparse matrices of `scipy.sparse `_ as its input. 65 | In the above example, `sklearn's DictVectorizer `_ 66 | transforms the categorical variables (user id and movie id) into one-hot encoded vectors. 67 | 68 | As you can see, :py:class:`MyFMClassifier` can make predictions against 69 | new (unseen) items despite the fact that it is an MCMC solver. 70 | This is possible because it simply retains all the intermediate (noisy) samples. 71 | 72 | For a more practical example with larger data, move on to the :ref:`Movielens examples `. 73 | -------------------------------------------------------------------------------- /doc/source/rating_vs_cps.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tohtsky/myFM/b9ba70ea38d9370d3ad50a9d25b2ff825eaa30ef/doc/source/rating_vs_cps.png -------------------------------------------------------------------------------- /doc/source/relation-blocks.rst: -------------------------------------------------------------------------------- 1 | .. _RelationBlockTutorial : 2 | 3 | -------------------------------------- 4 | TimeSVD++ Flipped with Relation Blocks 5 | -------------------------------------- 6 | 7 | As mentioned in the :ref:`Movielens example `, 8 | the complexity of Bayesian FMs is proportional to :math:`O(\mathrm{NNZ})`. 9 | This is especially troublesome when we include SVD++-like features in the feature matrix. 10 | In such a case, for each user, we include all of the item IDs that the user has interacted with, 11 | and the complexity grows further by a factor of :math:`O(\mathrm{NNZ} / N_U)`. 12 | 13 | However, we can get away with this catastrophic complexity if we notice the repeated pattern in the input matrix. 14 | Interested readers can refer to `[Rendle, '13] `_ 15 | and `libFM's Manual `_ for details. 16 | 17 | Below, let us see how we can efficiently incorporate SVD++-like features 18 | using the relational data format, again on the Movielens 100K dataset. 19 | 20 | ^^^^^^^^^^^^^^^^^^^^^^^^ 21 | Building SVD++ Features 22 | ^^^^^^^^^^^^^^^^^^^^^^^^ 23 | 24 | In `[Rendle, et al., '19] `_, 25 | in addition to the user/movie id, they made use of the following features to improve the accuracy considerably: 26 | 27 | 1. User Implicit Features: All the movies the user had watched 28 | 2. Movie Implicit Features: All the users who have watched the movie 29 | 3. 
Time Variable: The day of the watch event (regarded as a categorical variable) 30 | 31 | Let us construct these features. 32 | 33 | .. testcode :: 34 | 35 | from collections import defaultdict 36 | import numpy as np 37 | from sklearn.preprocessing import OneHotEncoder 38 | from sklearn import metrics 39 | import myfm 40 | from myfm import RelationBlock 41 | from scipy import sparse as sps 42 | 43 | from myfm.utils.benchmark_data import MovieLens100kDataManager 44 | 45 | data_manager = MovieLens100kDataManager() 46 | 47 | # fold 1 is the toughest one 48 | df_train, df_test = data_manager.load_rating_predefined_split(fold=1) 49 | 50 | date_ohe = OneHotEncoder(handle_unknown='ignore').fit( 51 | df_train.timestamp.dt.date.values.reshape(-1, 1) 52 | ) 53 | def categorize_date(df): 54 | return date_ohe.transform(df.timestamp.dt.date.values[:, np.newaxis]) 55 | 56 | # index "0" is reserved for unknown ids. 57 | user_to_index = defaultdict(lambda: 0, { uid: i+1 for i,uid in enumerate(np.unique(df_train.user_id)) }) 58 | movie_to_index = defaultdict(lambda: 0, { mid: i+1 for i,mid in enumerate(np.unique(df_train.movie_id))}) 59 | USER_ID_SIZE = len(user_to_index) + 1 60 | MOVIE_ID_SIZE = len(movie_to_index) + 1 61 | 62 | Above, we constructed dictionaries which map user/movie ids to the corresponding indices. 63 | We have reserved the index ''0'' for unknown users/movies. 64 | 65 | To do the feature-engineering stated above, we have to memoize which users/movies had interactions with which movies/users. 66 | 67 | .. testcode :: 68 | 69 | # The flags to control the included features. 70 | use_date = True # use date info or not 71 | use_iu = True # use implicit user feature 72 | use_ii = True # use implicit item feature 73 | 74 | movie_vs_watched = dict() 75 | user_vs_watched = dict() 76 | for row in df_train.itertuples(): 77 | user_id = row.user_id 78 | movie_id = row.movie_id 79 | movie_vs_watched.setdefault(movie_id, list()).append(user_id) 80 | user_vs_watched.setdefault(user_id, list()).append(movie_id) 81 | 82 | if use_date: 83 | X_date_train = categorize_date(df_train) 84 | X_date_test = categorize_date(df_test) 85 | else: 86 | X_date_train, X_date_test = (None, None) 87 | 88 | 89 | We can then define functions which map a list of user/movie ids to the features represented in sparse matrix format: 90 | 91 | .. 
testcode :: 92 | 93 | # given user/movie ids, add additional info and return it as a sparse matrix 94 | def augment_user_id(user_ids): 95 | Xs = [] 96 | X_uid = sps.lil_matrix((len(user_ids), USER_ID_SIZE)) 97 | for index, user_id in enumerate(user_ids): 98 | X_uid[index, user_to_index[user_id]] = 1 99 | Xs.append(X_uid) 100 | if use_iu: 101 | X_iu = sps.lil_matrix((len(user_ids), MOVIE_ID_SIZE)) 102 | for index, user_id in enumerate(user_ids): 103 | watched_movies = user_vs_watched.get(user_id, []) 104 | normalizer = 1 / max(len(watched_movies), 1) ** 0.5 105 | for mid in watched_movies: 106 | X_iu[index, movie_to_index[mid]] = normalizer 107 | Xs.append(X_iu) 108 | return sps.hstack(Xs, format='csr') 109 | 110 | def augment_movie_id(movie_ids): 111 | Xs = [] 112 | X_movie = sps.lil_matrix((len(movie_ids), MOVIE_ID_SIZE)) 113 | for index, movie_id in enumerate(movie_ids): 114 | X_movie[index, movie_to_index[movie_id]] = 1 115 | Xs.append(X_movie) 116 | 117 | if use_ii: 118 | X_ii = sps.lil_matrix((len(movie_ids), USER_ID_SIZE)) 119 | for index, movie_id in enumerate(movie_ids): 120 | watched_users = movie_vs_watched.get(movie_id, []) 121 | normalizer = 1 / max(len(watched_users), 1) ** 0.5 122 | for uid in watched_users: 123 | X_ii[index, user_to_index[uid]] = normalizer 124 | Xs.append(X_ii) 125 | 126 | 127 | return sps.hstack(Xs, format='csr') 128 | 129 | ^^^^^^^^^^^^ 130 | A naive way 131 | ^^^^^^^^^^^^ 132 | 133 | We now set up the problem in a non-relational way: 134 | 135 | .. testcode :: 136 | 137 | train_uid_unique, train_uid_index = np.unique(df_train.user_id, return_inverse=True) 138 | train_mid_unique, train_mid_index = np.unique(df_train.movie_id, return_inverse=True) 139 | user_data_train = augment_user_id(train_uid_unique) 140 | movie_data_train = augment_movie_id(train_mid_unique) 141 | 142 | test_uid_unique, test_uid_index = np.unique(df_test.user_id, return_inverse=True) 143 | test_mid_unique, test_mid_index = np.unique(df_test.movie_id, return_inverse=True) 144 | user_data_test = augment_user_id(test_uid_unique) 145 | movie_data_test = augment_movie_id(test_mid_unique) 146 | 147 | X_train_naive = sps.hstack([ 148 | X_date_train, 149 | user_data_train[train_uid_index], 150 | movie_data_train[train_mid_index] 151 | ]) 152 | 153 | X_test_naive = sps.hstack([ 154 | X_date_test, 155 | user_data_test[test_uid_index], 156 | movie_data_test[test_mid_index] 157 | ]) 158 | 159 | fm_naive = myfm.MyFMRegressor(rank=10).fit(X_train_naive, df_train.rating, n_iter=3, n_kept_samples=3) 160 | 161 | In my environment, it takes ~ 2s per iteration, which is much slower than the pure MF example. 162 | 163 | 164 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 165 | The problem formulation with RelationBlock. 166 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 167 | 168 | In the above code, we have already seen a hint for optimizing the performance. 169 | The line :: 170 | 171 | user_data_train[train_uid_index], 172 | 173 | says that each row of the sparse matrix ``user_data_train`` appears many times, 174 | and we will compute the same combination of factors repeatedly. 175 | 176 | The role of :py:class:`myfm.RelationBlock` is to make such a repeated pattern explicit 177 | so that we can drastically reduce the complexity. 178 | 179 | 180 | .. 
testcode :: 181 | 182 | block_user_train = RelationBlock(train_uid_index, user_data_train) 183 | block_movie_train = RelationBlock(train_mid_index, movie_data_train) 184 | block_user_test = RelationBlock(test_uid_index, user_data_test) 185 | block_movie_test = RelationBlock(test_mid_index, movie_data_test) 186 | 187 | We can now feed these blocks into :py:meth:`myfm.MyFMRegressor.fit` by 188 | 189 | .. testcode :: 190 | 191 | fm_rb = myfm.MyFMRegressor(rank=10).fit( 192 | X_date_train, df_train.rating, 193 | X_rel=[block_user_train, block_movie_train], 194 | n_iter=300, n_kept_samples=300 195 | ) 196 | 197 | Note that we cannot express ``X_date_train`` as a relation block, so we have 198 | supplied such non-repeated data as the first argument. 199 | This time, the speed is ~20 iterations/s, an almost 40x speed-up compared to the naive version. 200 | This is also much faster than e.g., `Surprise's implementation of SVD++ `_. 201 | 202 | What the relation format does is to reorganize the computation, 203 | but the result should be the same up to floating point artifacts: 204 | 205 | .. testcode :: 206 | 207 | for i in range(3): 208 | sample_naive = fm_naive.w_samples[i] 209 | sample_rb = fm_rb.w_samples[i] 210 | assert(np.max(np.abs(sample_naive - sample_rb)) < 1e-5) 211 | # the assertions should pass: the differences are tiny 212 | 213 | 214 | The resulting performance measures are RMSE=0.889, MAE=0.7000: 215 | 216 | .. testcode :: 217 | 218 | test_prediction = fm_rb.predict( 219 | X_date_test, 220 | X_rel=[block_user_test, block_movie_test] 221 | ) 222 | rmse = ((df_test.rating.values - test_prediction) ** 2).mean() ** 0.5 223 | mae = np.abs(df_test.rating.values - test_prediction).mean() 224 | print(f'rmse={rmse}, mae={mae}') 225 | 226 | .. testoutput :: 227 | :hide: 228 | :options: +ELLIPSIS 229 | 230 | rmse=..., mae=... 231 | 232 | Note that we still haven't exploited all the available ingredients such as 233 | user/item side-information and :ref:`grouping of the input variables `. 234 | See also `examples notebooks & scripts `_ 235 | for further improved results. 236 | -------------------------------------------------------------------------------- /doc_autobuild.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # a convenient script to run sphinx-autobuild 3 | sphinx-autobuild \ 4 | --host 0.0.0.0 \ 5 | --port 9999 \ 6 | --watch src/myfm/ \ 7 | doc/source doc/build 8 | -------------------------------------------------------------------------------- /examples/ml-100k-regression.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import pickle 3 | from typing import Dict, List, Union 4 | 5 | import numpy as np 6 | import pandas as pd 7 | from scipy import sparse as sps 8 | 9 | import myfm 10 | from myfm import RelationBlock 11 | from myfm.gibbs import MyFMGibbsRegressor, MyFMOrderedProbit 12 | from myfm.utils.benchmark_data.movielens100k_data import MovieLens100kDataManager 13 | from myfm.utils.callbacks import ( 14 | LibFMLikeCallbackBase, 15 | OrderedProbitCallback, 16 | RegressionCallback, 17 | ) 18 | from myfm.utils.encoders import CategoryValueToSparseEncoder 19 | 20 | if __name__ == "__main__": 21 | parser = argparse.ArgumentParser( 22 | description=""" 23 | This script applies the method and evaluation protocol proposed in 24 | the "On the Difficulty of Evaluating Baselines" paper by Rendle et al. 25 | against the smaller Movielens 100K dataset, using myFM.
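Example (run from the examples/ directory): python ml-100k-regression.py 1 -a oprobit -f timesvdpp_flipped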
26 | """, 27 | formatter_class=argparse.ArgumentDefaultsHelpFormatter, 28 | ) 29 | 30 | parser.add_argument( 31 | "fold_index", 32 | type=int, 33 | help="which index set to use as a test within 5-fold predefined CV.", 34 | default=1, 35 | ) 36 | parser.add_argument( 37 | "-a", 38 | "--algorithm", 39 | type=str, 40 | choices=["regression", "oprobit"], 41 | default="regression", 42 | help="specify the output type.", 43 | ) 44 | parser.add_argument( 45 | "-i", "--iteration", type=int, help="mcmc iteration", default=512 46 | ) 47 | parser.add_argument( 48 | "-d", "--dimension", type=int, help="fm embedding dimension", default=10 49 | ) 50 | 51 | parser.add_argument( 52 | "--stricter_protocol", 53 | action="store_true", 54 | help="Whether to use the \"stricter\" protocol (i.e., don't include the test set implicit information) stated in [Rendle, '19].", 55 | default=True, 56 | ) 57 | 58 | parser.add_argument( 59 | "-f", 60 | "--feature", 61 | type=str, 62 | choices=["mf", "svdpp", "timesvd", "timesvdpp", "timesvdpp_flipped"], 63 | help="feature set used in the experiment.", 64 | default="timesvdpp_flipped", 65 | ) 66 | 67 | args = parser.parse_args() 68 | 69 | random_seed = 42 70 | 71 | # Additional features. 72 | # We add 73 | # 1. date of evaluation as categorical variables 74 | # 2. "all users who have evaluated a movie in the train set" or 75 | # 3. "all movies rated by a user" as a feature of user/movie. 76 | if args.feature == "mf": 77 | use_date = False 78 | use_iu = False 79 | use_ii = False 80 | elif args.feature == "svdpp": 81 | use_date = False 82 | use_iu = True 83 | use_ii = False 84 | elif args.feature == "timesvd": 85 | use_date = True 86 | use_iu = False 87 | use_ii = False 88 | elif args.feature == "timesvdpp": 89 | use_date = True 90 | use_iu = True 91 | use_ii = False 92 | elif args.feature == "timesvdpp_flipped": 93 | use_date = True # use date info or not 94 | use_iu = True # use implicit user feature 95 | use_ii = True # use implicit item feature 96 | else: 97 | raise ValueError("unknown feature set specified.") 98 | 99 | FOLD_INDEX = args.fold_index 100 | ITERATION = args.iteration 101 | DIMENSION = args.dimension 102 | if FOLD_INDEX < 1 or FOLD_INDEX >= 6: 103 | raise ValueError("fold_index must be in the range(1, 6).") 104 | ALGORITHM = args.algorithm 105 | data_manager = MovieLens100kDataManager() 106 | df_train, df_test = data_manager.load_rating_predefined_split(fold=FOLD_INDEX) 107 | 108 | if ALGORITHM == "oprobit": 109 | # interpret the rating (1, 2, 3, 4, 5) as class (0, 1, 2, 3, 4). 
110 | for df_ in [df_train, df_test]: 111 | df_["rating"] -= 1 112 | df_["rating"] = df_.rating.astype(np.int32) 113 | 114 | if args.stricter_protocol: 115 | implicit_data_source = df_train 116 | else: 117 | implicit_data_source = pd.concat([df_train, df_test]) 118 | 119 | user_to_internal = CategoryValueToSparseEncoder[int]( 120 | implicit_data_source.user_id.values 121 | ) 122 | movie_to_internal = CategoryValueToSparseEncoder[int]( 123 | implicit_data_source.movie_id.values 124 | ) 125 | 126 | print( 127 | "df_train.shape = {}, df_test.shape = {}".format(df_train.shape, df_test.shape) 128 | ) 129 | # treat the days of events as categorical variable 130 | date_encoder = CategoryValueToSparseEncoder( 131 | implicit_data_source.timestamp.dt.date.values 132 | ) 133 | 134 | def categorize_date(df: pd.DataFrame) -> sps.csr_matrix: 135 | return date_encoder.to_sparse(df.timestamp.dt.date.values) 136 | 137 | movie_vs_watched: Dict[int, List[int]] = dict() 138 | user_vs_watched: Dict[int, List[int]] = dict() 139 | 140 | for row in implicit_data_source.itertuples(): 141 | user_id: int = row.user_id 142 | movie_id: int = row.movie_id 143 | movie_vs_watched.setdefault(movie_id, list()).append(user_id) 144 | user_vs_watched.setdefault(user_id, list()).append(movie_id) 145 | 146 | if use_date: 147 | X_date_train = categorize_date(df_train) 148 | X_date_test = categorize_date(df_test) 149 | else: 150 | X_date_train, X_date_test = (None, None) 151 | 152 | # setup grouping 153 | feature_group_sizes = [] 154 | if use_date: 155 | feature_group_sizes.append( 156 | len(date_encoder), # date 157 | ) 158 | 159 | feature_group_sizes.append(len(user_to_internal)) # user ids 160 | 161 | if use_iu: 162 | # all movies which a user watched 163 | feature_group_sizes.append(len(movie_to_internal)) 164 | 165 | feature_group_sizes.append(len(movie_to_internal)) # movie ids 166 | 167 | if use_ii: 168 | feature_group_sizes.append( 169 | len(user_to_internal) # all the users who watched a movies 170 | ) 171 | 172 | grouping = [i for i, size in enumerate(feature_group_sizes) for _ in range(size)] 173 | 174 | # given user/movie ids, add additional infos and return it as sparse 175 | def augment_user_id(user_ids: List[int]) -> sps.csr_matrix: 176 | X = user_to_internal.to_sparse(user_ids) 177 | if not use_iu: 178 | return X 179 | data: List[float] = [] 180 | row: List[int] = [] 181 | col: List[int] = [] 182 | for index, user_id in enumerate(user_ids): 183 | watched_movies = user_vs_watched.get(user_id, []) 184 | normalizer = 1 / max(len(watched_movies), 1) ** 0.5 185 | for mid in watched_movies: 186 | data.append(normalizer) 187 | col.append(movie_to_internal[mid]) 188 | row.append(index) 189 | return sps.hstack( 190 | [ 191 | X, 192 | sps.csr_matrix( 193 | (data, (row, col)), 194 | shape=(len(user_ids), len(movie_to_internal)), 195 | ), 196 | ], 197 | format="csr", 198 | ) 199 | 200 | def augment_movie_id(movie_ids: List[int]) -> sps.csr_matrix: 201 | X = movie_to_internal.to_sparse(movie_ids) 202 | if not use_ii: 203 | return X 204 | 205 | data: List[float] = [] 206 | row: List[int] = [] 207 | col: List[int] = [] 208 | 209 | for index, movie_id in enumerate(movie_ids): 210 | watched_users = movie_vs_watched.get(movie_id, []) 211 | normalizer = 1 / max(len(watched_users), 1) ** 0.5 212 | for uid in watched_users: 213 | data.append(normalizer) 214 | row.append(index) 215 | col.append(user_to_internal[uid]) 216 | return sps.hstack( 217 | [ 218 | X, 219 | sps.csr_matrix( 220 | (data, (row, col)), 221 | shape=(len(movie_ids), 
len(user_to_internal)), 222 | ), 223 | ], 224 | format="csr") 225 | 226 | # Create RelationBlock. 227 | train_blocks: List[RelationBlock] = [] 228 | test_blocks: List[RelationBlock] = [] 229 | for source, target in [(df_train, train_blocks), (df_test, test_blocks)]: 230 | unique_users, user_map = np.unique(source.user_id, return_inverse=True) 231 | target.append(RelationBlock(user_map, augment_user_id(unique_users))) 232 | unique_movies, movie_map = np.unique(source.movie_id, return_inverse=True) 233 | target.append(RelationBlock(movie_map, augment_movie_id(unique_movies))) 234 | 235 | trace_path = "rmse_{0}_fold_{1}.csv".format(ALGORITHM, FOLD_INDEX) 236 | 237 | callback: LibFMLikeCallbackBase 238 | fm: Union[MyFMGibbsRegressor, MyFMOrderedProbit] 239 | if ALGORITHM == "regression": 240 | fm = myfm.MyFMRegressor(rank=DIMENSION) 241 | callback = RegressionCallback( 242 | n_iter=ITERATION, 243 | X_test=X_date_test, 244 | y_test=df_test.rating.values, 245 | X_rel_test=test_blocks, 246 | clip_min=df_train.rating.min(), 247 | clip_max=df_train.rating.max(), 248 | trace_path=trace_path, 249 | ) 250 | else: 251 | fm = myfm.MyFMOrderedProbit(rank=DIMENSION) 252 | callback = OrderedProbitCallback( 253 | n_iter=ITERATION, 254 | X_test=X_date_test, 255 | y_test=df_test.rating.values, 256 | n_class=5, 257 | X_rel_test=test_blocks, 258 | trace_path=trace_path, 259 | ) 260 | 261 | fm.fit( 262 | X_date_train, 263 | df_train.rating.values, 264 | X_rel=train_blocks, 265 | grouping=grouping, 266 | n_iter=ITERATION, 267 | n_kept_samples=ITERATION, 268 | callback=callback, 269 | ) 270 | with open( 271 | "callback_result_{0}_fold_{1}.pkl".format(ALGORITHM, FOLD_INDEX), "wb" 272 | ) as ofs: 273 | pickle.dump(callback, ofs) 274 | -------------------------------------------------------------------------------- /examples/ml-100k-variational.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from typing import List 3 | 4 | import numpy as np 5 | import pandas as pd 6 | 7 | from myfm import RelationBlock, VariationalFMRegressor 8 | from myfm.utils.benchmark_data.movielens100k_data import MovieLens100kDataManager 9 | from myfm.utils.encoders import ( 10 | CategoryValueToSparseEncoder, 11 | DataFrameEncoder, 12 | MultipleValuesToSparseEncoder, 13 | ) 14 | 15 | if __name__ == "__main__": 16 | parser = argparse.ArgumentParser( 17 | description=""" 18 | This script applies the method and evaluation protocol proposed in the 19 | "On the Difficulty of Evaluating Baselines" paper by Rendle et al. 20 | to the smaller MovieLens 100K dataset, using myFM.
21 | """, 22 | formatter_class=argparse.ArgumentDefaultsHelpFormatter, 23 | ) 24 | 25 | parser.add_argument( 26 | "fold_index", 27 | type=int, 28 | help="which index set to use as a test within 5-fold predefined CV.", 29 | default=1, 30 | ) 31 | 32 | parser.add_argument( 33 | "-i", "--iteration", type=int, help="mcmc iteration", default=512 34 | ) 35 | parser.add_argument( 36 | "-d", "--dimension", type=int, help="fm embedding dimension", default=10 37 | ) 38 | 39 | parser.add_argument( 40 | "--stricter_protocol", 41 | action="store_true", 42 | help="Whether to use the \"stricter\" protocol (i.e., don't include the test set implicit information) stated in [Rendle, '19].", 43 | default=True, 44 | ) 45 | 46 | parser.add_argument( 47 | "-f", 48 | "--feature", 49 | type=str, 50 | choices=["mf", "svdpp", "timesvd", "timesvdpp", "timesvdpp_flipped"], 51 | help="feature set used in the experiment.", 52 | default="timesvdpp_flipped", 53 | ) 54 | 55 | args = parser.parse_args() 56 | 57 | random_seed = 42 58 | 59 | # Additional features. 60 | # We add 61 | # 1. date of evaluation as categorical variables 62 | # 2. "all users who have evaluated a movie in the train set" or 63 | # 3. "all movies rated by a user" as a feature of user/movie. 64 | if args.feature == "mf": 65 | use_date = False 66 | use_iu = False 67 | use_ii = False 68 | elif args.feature == "svdpp": 69 | use_date = False 70 | use_iu = True 71 | use_ii = False 72 | elif args.feature == "timesvd": 73 | use_date = True 74 | use_iu = False 75 | use_ii = False 76 | elif args.feature == "timesvdpp": 77 | use_date = True 78 | use_iu = True 79 | use_ii = False 80 | elif args.feature == "timesvdpp_flipped": 81 | use_date = True # use date info or not 82 | use_iu = True # use implicit user feature 83 | use_ii = True # use implicit item feature 84 | else: 85 | raise ValueError("unknown feature set specified.") 86 | 87 | FOLD_INDEX = args.fold_index 88 | ITERATION = args.iteration 89 | DIMENSION = args.dimension 90 | if FOLD_INDEX < 1 or FOLD_INDEX >= 6: 91 | raise ValueError("fold_index must be in the range(1, 6).") 92 | 93 | data_manager = MovieLens100kDataManager() 94 | df_train, df_test = data_manager.load_rating_predefined_split(fold=FOLD_INDEX) 95 | 96 | if args.stricter_protocol: 97 | implicit_data_source = df_train 98 | else: 99 | implicit_data_source = pd.concat([df_train, df_test]) 100 | 101 | def int_list_to_str(x): 102 | return "|".join([f"{id}" for id in x]) 103 | 104 | user_implicit_profile = ( 105 | implicit_data_source.groupby("user_id")["movie_id"] 106 | .agg(int_list_to_str) 107 | .reset_index() 108 | ) 109 | item_implicit_profile = ( 110 | implicit_data_source.groupby("movie_id")["user_id"] 111 | .agg(int_list_to_str) 112 | .reset_index() 113 | ) 114 | 115 | print( 116 | "df_train.shape = {}, df_test.shape = {}".format(df_train.shape, df_test.shape) 117 | ) 118 | 119 | user_encoder = DataFrameEncoder().add_column( 120 | "user_id", 121 | CategoryValueToSparseEncoder(user_implicit_profile.user_id), 122 | ) 123 | if use_iu: 124 | user_encoder.add_column( 125 | "movie_id", 126 | MultipleValuesToSparseEncoder(user_implicit_profile.movie_id, sep="|"), 127 | ) 128 | 129 | movie_encoder = DataFrameEncoder().add_column( 130 | "movie_id", 131 | CategoryValueToSparseEncoder(item_implicit_profile.movie_id), 132 | ) 133 | if use_ii: 134 | movie_encoder.add_column( 135 | "user_id", 136 | MultipleValuesToSparseEncoder(item_implicit_profile.user_id, sep="|"), 137 | ) 138 | 139 | # treat the days of events as categorical variable 140 | 141 | 
feature_group_sizes: List[int] = [] 142 | if use_date: 143 | date_encoder = CategoryValueToSparseEncoder( 144 | implicit_data_source.timestamp.dt.date 145 | ) 146 | X_date_train = date_encoder.to_sparse(df_train.timestamp.dt.date) 147 | X_date_test = date_encoder.to_sparse(df_test.timestamp.dt.date) 148 | feature_group_sizes.append(len(date_encoder)) 149 | else: 150 | X_date_train, X_date_test = (None, None) 151 | 152 | # setup grouping 153 | feature_group_sizes.extend(user_encoder.encoder_shapes) 154 | feature_group_sizes.extend(movie_encoder.encoder_shapes) 155 | 156 | # Create RelationBlock. 157 | train_blocks: List[RelationBlock] = [] 158 | test_blocks: List[RelationBlock] = [] 159 | for source, target in [(df_train, train_blocks), (df_test, test_blocks)]: 160 | unique_users, user_map = np.unique(source.user_id, return_inverse=True) 161 | target.append( 162 | RelationBlock( 163 | user_map, 164 | user_encoder.encode_df( 165 | user_implicit_profile.set_index("user_id") 166 | .reindex(unique_users) 167 | .fillna("") 168 | .reset_index() 169 | ), 170 | ) 171 | ) 172 | unique_movies, movie_map = np.unique(source.movie_id, return_inverse=True) 173 | target.append( 174 | RelationBlock( 175 | movie_map, 176 | movie_encoder.encode_df( 177 | item_implicit_profile.set_index("movie_id") 178 | .reindex(unique_movies) 179 | .fillna("") 180 | .reset_index() 181 | ), 182 | ) 183 | ) 184 | 185 | trace_path = "rmse_variational_fold_{0}.csv".format(FOLD_INDEX) 186 | fm = VariationalFMRegressor(rank=DIMENSION) 187 | 188 | fm.fit( 189 | X_date_train, 190 | df_train.rating.values, 191 | X_rel=train_blocks, 192 | n_iter=ITERATION, 193 | group_shapes=feature_group_sizes, 194 | ) 195 | rmse = ( 196 | (df_test.rating.values - fm.predict(X_date_test, test_blocks)) ** 2 197 | ).mean() ** 0.5 198 | assert fm.history_ is not None 199 | print("RMSE = {rmse}".format(rmse=rmse)) 200 | -------------------------------------------------------------------------------- /examples/ml-10m-regression.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import pickle 3 | from typing import Dict, List, Union 4 | 5 | import numpy as np 6 | import pandas as pd 7 | from scipy import sparse as sps 8 | 9 | import myfm 10 | from myfm import MyFMOrderedProbit, MyFMRegressor, RelationBlock 11 | from myfm.gibbs import MyFMOrderedProbit 12 | from myfm.utils.benchmark_data import MovieLens10MDataManager 13 | from myfm.utils.callbacks.libfm import ( 14 | LibFMLikeCallbackBase, 15 | OrderedProbitCallback, 16 | RegressionCallback, 17 | ) 18 | from myfm.utils.encoders import CategoryValueToSparseEncoder 19 | 20 | if __name__ == "__main__": 21 | parser = argparse.ArgumentParser( 22 | description=""" 23 | This script applies the method and evaluation protocol proposed in the 24 | "On the Difficulty of Evaluating Baselines" paper by Rendle et al. 25 | to the MovieLens 10M dataset, using myFM.
26 | """, 27 | formatter_class=argparse.ArgumentDefaultsHelpFormatter, 28 | ) 29 | 30 | parser.add_argument( 31 | "fold_index", 32 | type=int, 33 | help="which index set to use as a test within 10-fold CV.", 34 | ) 35 | parser.add_argument( 36 | "-a", 37 | "--algorithm", 38 | type=str, 39 | choices=["regression", "oprobit"], 40 | default="regression", 41 | help="specify the output type.", 42 | ) 43 | parser.add_argument( 44 | "-i", "--iteration", type=int, help="mcmc iteration", default=512 45 | ) 46 | parser.add_argument( 47 | "-d", 48 | "--dimension", 49 | type=int, 50 | help="fm embedding dimension", 51 | default=128, 52 | ) 53 | parser.add_argument( 54 | "--stricter_protocol", 55 | action="store_true", 56 | help="Whether to use the \"stricter\" protocol (i.e., don't include the test set implicit information) stated in [Rendle, '19].", 57 | default=True, 58 | ) 59 | parser.add_argument( 60 | "-f", 61 | "--feature", 62 | type=str, 63 | choices=["mf", "svdpp", "timesvd", "timesvdpp", "timesvdpp_flipped"], 64 | help="feature set used in the experiment.", 65 | default="timesvdpp_flipped", 66 | ) 67 | args = parser.parse_args() 68 | 69 | random_seed = 42 70 | 71 | # Additional features. 72 | # We add 73 | # 1. date of evaluation as categorical variables 74 | # 2. "all users who have evaluated a movie in the train set" or 75 | # 3. "all movies rated by a user" as a feature of user/movie. 76 | if args.feature == "mf": 77 | use_date = False 78 | use_iu = False 79 | use_ii = False 80 | elif args.feature == "svdpp": 81 | use_date = False 82 | use_iu = True 83 | use_ii = False 84 | elif args.feature == "timesvd": 85 | use_date = True 86 | use_iu = False 87 | use_ii = False 88 | elif args.feature == "timesvdpp": 89 | use_date = True 90 | use_iu = True 91 | use_ii = False 92 | elif args.feature == "timesvdpp_flipped": 93 | use_date = True # use date info or not 94 | use_iu = True # use implicit user feature 95 | use_ii = True # use implicit item feature 96 | else: 97 | raise ValueError("unknown feature set specified.") 98 | 99 | FOLD_INDEX = args.fold_index 100 | ITERATION = args.iteration 101 | DIMENSION = args.dimension 102 | if FOLD_INDEX < 0 or FOLD_INDEX >= 10: 103 | raise ValueError("fold_index must be in the range(10).") 104 | ALGORITHM = args.algorithm 105 | data_manager = MovieLens10MDataManager() 106 | df_train, df_test = data_manager.load_rating_kfold_split( 107 | 10, FOLD_INDEX, random_seed 108 | ) 109 | 110 | if ALGORITHM == "oprobit": 111 | # interpret the rating 0.5, 1.0 ... , 5.0 as class (0, 1, ... 
, 10) 112 | for df_ in [df_train, df_test]: 113 | df_["rating"] -= 0.5 114 | df_["rating"] *= 2 115 | df_["rating"] = df_.rating.astype(np.int32) 116 | 117 | if args.stricter_protocol: 118 | implicit_data_source = df_train 119 | else: 120 | implicit_data_source = pd.concat([df_train, df_test]) 121 | 122 | user_to_internal = CategoryValueToSparseEncoder[int]( 123 | implicit_data_source.user_id.values 124 | ) 125 | movie_to_internal = CategoryValueToSparseEncoder[int]( 126 | implicit_data_source.movie_id.values 127 | ) 128 | 129 | print( 130 | "df_train.shape = {}, df_test.shape = {}".format(df_train.shape, df_test.shape) 131 | ) 132 | # treat the days of events as categorical variable 133 | date_encoder = CategoryValueToSparseEncoder[pd.Timestamp]( 134 | implicit_data_source.timestamp.dt.date.values 135 | ) 136 | 137 | def categorize_date(df): 138 | return date_encoder.to_sparse(df.timestamp.dt.date.values) 139 | 140 | movie_vs_watched: Dict[int, List[int]] = dict() 141 | user_vs_watched: Dict[int, List[int]] = dict() 142 | 143 | for row in implicit_data_source.itertuples(): 144 | user_id = row.user_id 145 | movie_id = row.movie_id 146 | movie_vs_watched.setdefault(movie_id, list()).append(user_id) 147 | user_vs_watched.setdefault(user_id, list()).append(movie_id) 148 | 149 | if use_date: 150 | X_date_train = categorize_date(df_train) 151 | X_date_test = categorize_date(df_test) 152 | else: 153 | X_date_train, X_date_test = (None, None) 154 | 155 | # setup grouping 156 | feature_group_sizes = [] 157 | if use_date: 158 | feature_group_sizes.append( 159 | len(date_encoder), # date 160 | ) 161 | 162 | feature_group_sizes.append(len(user_to_internal)) # user ids 163 | 164 | if use_iu: 165 | # all movies which a user watched 166 | feature_group_sizes.append(len(movie_to_internal)) 167 | 168 | feature_group_sizes.append(len(movie_to_internal)) # movie ids 169 | 170 | if use_ii: 171 | feature_group_sizes.append( 172 | len(user_to_internal) # all the users who watched a movies 173 | ) 174 | 175 | grouping = [i for i, size in enumerate(feature_group_sizes) for _ in range(size)] 176 | 177 | def augment_user_id(user_ids: List[int]) -> sps.csr_matrix: 178 | X = user_to_internal.to_sparse(user_ids) 179 | if not use_iu: 180 | return X 181 | data: List[float] = [] 182 | row: List[int] = [] 183 | col: List[int] = [] 184 | for index, user_id in enumerate(user_ids): 185 | watched_movies = user_vs_watched.get(user_id, []) 186 | normalizer = 1 / max(len(watched_movies), 1) ** 0.5 187 | for mid in watched_movies: 188 | data.append(normalizer) 189 | col.append(movie_to_internal[mid]) 190 | row.append(index) 191 | return sps.hstack( 192 | [ 193 | X, 194 | sps.csr_matrix( 195 | (data, (row, col)), 196 | shape=(len(user_ids), len(movie_to_internal)), 197 | ), 198 | ], 199 | format="csr", 200 | ) 201 | 202 | def augment_movie_id(movie_ids: List[int]): 203 | X = movie_to_internal.to_sparse(movie_ids) 204 | if not use_ii: 205 | return X 206 | 207 | data: List[float] = [] 208 | row: List[int] = [] 209 | col: List[int] = [] 210 | 211 | for index, movie_id in enumerate(movie_ids): 212 | watched_users = movie_vs_watched.get(movie_id, []) 213 | normalizer = 1 / max(len(watched_users), 1) ** 0.5 214 | for uid in watched_users: 215 | data.append(normalizer) 216 | row.append(index) 217 | col.append(user_to_internal[uid]) 218 | return sps.hstack( 219 | [ 220 | X, 221 | sps.csr_matrix( 222 | (data, (row, col)), 223 | shape=(len(movie_ids), len(user_to_internal)), 224 | ), 225 | ] 226 | ) 227 | 228 | # Create RelationBlock. 
229 | train_blocks: List[RelationBlock] = [] 230 | test_blocks: List[RelationBlock] = [] 231 | for source, target in [(df_train, train_blocks), (df_test, test_blocks)]: 232 | unique_users, user_map = np.unique(source.user_id, return_inverse=True) 233 | target.append(RelationBlock(user_map, augment_user_id(unique_users))) 234 | unique_movies, movie_map = np.unique(source.movie_id, return_inverse=True) 235 | target.append(RelationBlock(movie_map, augment_movie_id(unique_movies))) 236 | 237 | trace_path = "rmse_{0}_fold_{1}.csv".format(ALGORITHM, FOLD_INDEX) 238 | 239 | callback: LibFMLikeCallbackBase 240 | fm: Union[MyFMRegressor, MyFMOrderedProbit] 241 | if ALGORITHM == "regression": 242 | fm = myfm.MyFMRegressor(rank=DIMENSION) 243 | callback = RegressionCallback( 244 | ITERATION, 245 | X_date_test, 246 | df_test.rating.values, 247 | X_rel_test=test_blocks, 248 | clip_min=0.5, 249 | clip_max=5.0, 250 | trace_path=trace_path, 251 | ) 252 | else: 253 | fm = myfm.MyFMOrderedProbit(rank=DIMENSION) 254 | callback = OrderedProbitCallback( 255 | ITERATION, 256 | X_date_test, 257 | df_test.rating.values, 258 | n_class=10, 259 | X_rel_test=test_blocks, 260 | trace_path=trace_path, 261 | ) 262 | fm.fit( 263 | X_date_train, 264 | df_train.rating.values, 265 | X_rel=train_blocks, 266 | grouping=grouping, 267 | n_iter=callback.n_iter, 268 | callback=callback, 269 | n_kept_samples=1, 270 | ) 271 | with open( 272 | "callback_result_{0}_fold_{1}.pkl".format(ALGORITHM, FOLD_INDEX), "wb" 273 | ) as ofs: 274 | pickle.dump(callback, ofs) 275 | -------------------------------------------------------------------------------- /examples/ml-1m-regression.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import pickle 3 | from typing import Dict, List, Union 4 | 5 | import numpy as np 6 | import pandas as pd 7 | from scipy import sparse as sps 8 | 9 | import myfm 10 | from myfm import MyFMOrderedProbit, MyFMRegressor, RelationBlock 11 | from myfm.gibbs import MyFMOrderedProbit 12 | from myfm.utils.benchmark_data import MovieLens1MDataManager 13 | from myfm.utils.callbacks.libfm import ( 14 | LibFMLikeCallbackBase, 15 | OrderedProbitCallback, 16 | RegressionCallback, 17 | ) 18 | from myfm.utils.encoders import CategoryValueToSparseEncoder 19 | 20 | if __name__ == "__main__": 21 | parser = argparse.ArgumentParser( 22 | description=""" 23 | This script applies the method and evaluation protocol proposed in the 24 | "On the Difficulty of Evaluating Baselines" paper by Rendle et al. 25 | to the smaller MovieLens 1M dataset, using myFM.
26 | """, 27 | formatter_class=argparse.ArgumentDefaultsHelpFormatter, 28 | ) 29 | 30 | parser.add_argument( 31 | "fold_index", 32 | type=int, 33 | help="which index set to use as a test within 10-fold CV.", 34 | ) 35 | parser.add_argument( 36 | "-a", 37 | "--algorithm", 38 | type=str, 39 | choices=["regression", "oprobit"], 40 | default="regression", 41 | help="specify the output type.", 42 | ) 43 | parser.add_argument( 44 | "-i", "--iteration", type=int, help="mcmc iteration", default=512 45 | ) 46 | parser.add_argument( 47 | "-d", "--dimension", type=int, help="fm embedding dimension", default=32 48 | ) 49 | parser.add_argument( 50 | "--stricter_protocol", 51 | action="store_true", 52 | help="Whether to use the \"stricter\" protocol (i.e., don't include the test set implicit information) stated in [Rendle, '19].", 53 | default=True, 54 | ) 55 | parser.add_argument( 56 | "-f", 57 | "--feature", 58 | type=str, 59 | choices=["mf", "svdpp", "timesvd", "timesvdpp", "timesvdpp_flipped"], 60 | help="feature set used in the experiment.", 61 | default="timesvdpp_flipped", 62 | ) 63 | args = parser.parse_args() 64 | 65 | random_seed = 42 66 | 67 | # Additional features. 68 | # We add 69 | # 1. date of evaluation as categorical variables 70 | # 2. "all users who have evaluated a movie in the train set" or 71 | # 3. "all movies rated by a user" as a feature of user/movie. 72 | if args.feature == "mf": 73 | use_date = False 74 | use_iu = False 75 | use_ii = False 76 | elif args.feature == "svdpp": 77 | use_date = False 78 | use_iu = True 79 | use_ii = False 80 | elif args.feature == "timesvd": 81 | use_date = True 82 | use_iu = False 83 | use_ii = False 84 | elif args.feature == "timesvdpp": 85 | use_date = True 86 | use_iu = True 87 | use_ii = False 88 | elif args.feature == "timesvdpp_flipped": 89 | use_date = True # use date info or not 90 | use_iu = True # use implicit user feature 91 | use_ii = True # use implicit item feature 92 | else: 93 | raise ValueError("unknown feature set specified.") 94 | 95 | FOLD_INDEX = args.fold_index 96 | ITERATION = args.iteration 97 | DIMENSION = args.dimension 98 | if FOLD_INDEX < 0 or FOLD_INDEX >= 10: 99 | raise ValueError("fold_index must be in the range(10).") 100 | ALGORITHM = args.algorithm 101 | data_manager = MovieLens1MDataManager() 102 | df_train, df_test = data_manager.load_rating_kfold_split( 103 | 10, FOLD_INDEX, random_seed 104 | ) 105 | 106 | if ALGORITHM == "oprobit": 107 | # interpret the rating (1, 2, 3, 4, 5) as class (0, 1, 2, 3, 4). 
108 | for df_ in [df_train, df_test]: 109 | df_["rating"] -= 1 110 | df_["rating"] = df_.rating.astype(np.int32) 111 | 112 | if args.stricter_protocol: 113 | implicit_data_source = df_train 114 | else: 115 | implicit_data_source = pd.concat([df_train, df_test]) 116 | 117 | user_to_internal = CategoryValueToSparseEncoder[int]( 118 | implicit_data_source.user_id.values 119 | ) 120 | movie_to_internal = CategoryValueToSparseEncoder[int]( 121 | implicit_data_source.movie_id.values 122 | ) 123 | 124 | print( 125 | "df_train.shape = {}, df_test.shape = {}".format(df_train.shape, df_test.shape) 126 | ) 127 | # treat the days of events as categorical variable 128 | date_encoder = CategoryValueToSparseEncoder[pd.Timestamp]( 129 | implicit_data_source.timestamp.dt.date.values 130 | ) 131 | 132 | def categorize_date(df): 133 | return date_encoder.to_sparse(df.timestamp.dt.date.values) 134 | 135 | movie_vs_watched: Dict[int, List[int]] = dict() 136 | user_vs_watched: Dict[int, List[int]] = dict() 137 | 138 | for row in implicit_data_source.itertuples(): 139 | user_id = row.user_id 140 | movie_id = row.movie_id 141 | movie_vs_watched.setdefault(movie_id, list()).append(user_id) 142 | user_vs_watched.setdefault(user_id, list()).append(movie_id) 143 | 144 | if use_date: 145 | X_date_train = categorize_date(df_train) 146 | X_date_test = categorize_date(df_test) 147 | else: 148 | X_date_train, X_date_test = (None, None) 149 | 150 | # setup grouping 151 | feature_group_sizes = [] 152 | if use_date: 153 | feature_group_sizes.append( 154 | len(date_encoder), # date 155 | ) 156 | 157 | feature_group_sizes.append(len(user_to_internal)) # user ids 158 | 159 | if use_iu: 160 | # all movies which a user watched 161 | feature_group_sizes.append(len(movie_to_internal)) 162 | 163 | feature_group_sizes.append(len(movie_to_internal)) # movie ids 164 | 165 | if use_ii: 166 | feature_group_sizes.append( 167 | len(user_to_internal) # all the users who watched a movies 168 | ) 169 | 170 | grouping = [i for i, size in enumerate(feature_group_sizes) for _ in range(size)] 171 | 172 | def augment_user_id(user_ids: List[int]) -> sps.csr_matrix: 173 | X = user_to_internal.to_sparse(user_ids) 174 | if not use_iu: 175 | return X 176 | data: List[float] = [] 177 | row: List[int] = [] 178 | col: List[int] = [] 179 | for index, user_id in enumerate(user_ids): 180 | watched_movies = user_vs_watched.get(user_id, []) 181 | normalizer = 1 / max(len(watched_movies), 1) ** 0.5 182 | for mid in watched_movies: 183 | data.append(normalizer) 184 | col.append(movie_to_internal[mid]) 185 | row.append(index) 186 | return sps.hstack( 187 | [ 188 | X, 189 | sps.csr_matrix( 190 | (data, (row, col)), 191 | shape=(len(user_ids), len(movie_to_internal)), 192 | ), 193 | ], 194 | format="csr", 195 | ) 196 | 197 | def augment_movie_id(movie_ids: List[int]): 198 | X = movie_to_internal.to_sparse(movie_ids) 199 | if not use_ii: 200 | return X 201 | 202 | data: List[float] = [] 203 | row: List[int] = [] 204 | col: List[int] = [] 205 | 206 | for index, movie_id in enumerate(movie_ids): 207 | watched_users = movie_vs_watched.get(movie_id, []) 208 | normalizer = 1 / max(len(watched_users), 1) ** 0.5 209 | for uid in watched_users: 210 | data.append(normalizer) 211 | row.append(index) 212 | col.append(user_to_internal[uid]) 213 | return sps.hstack( 214 | [ 215 | X, 216 | sps.csr_matrix( 217 | (data, (row, col)), 218 | shape=(len(movie_ids), len(user_to_internal)), 219 | ), 220 | ] 221 | ) 222 | 223 | # Create RelationBlock. 
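# The blocks are rebuilt separately for the train and the test split: the
# sets of unique users/movies, and therefore the block matrices and the
# index maps, generally differ between the two splits.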
224 | train_blocks: List[RelationBlock] = [] 225 | test_blocks: List[RelationBlock] = [] 226 | for source, target in [(df_train, train_blocks), (df_test, test_blocks)]: 227 | unique_users, user_map = np.unique(source.user_id, return_inverse=True) 228 | target.append(RelationBlock(user_map, augment_user_id(unique_users))) 229 | unique_movies, movie_map = np.unique(source.movie_id, return_inverse=True) 230 | target.append(RelationBlock(movie_map, augment_movie_id(unique_movies))) 231 | 232 | trace_path = "rmse_{0}_fold_{1}.csv".format(ALGORITHM, FOLD_INDEX) 233 | 234 | callback: LibFMLikeCallbackBase 235 | fm: Union[MyFMRegressor, MyFMOrderedProbit] 236 | if ALGORITHM == "regression": 237 | fm = myfm.MyFMRegressor(rank=DIMENSION) 238 | callback = RegressionCallback( 239 | ITERATION, 240 | X_date_test, 241 | df_test.rating.values, 242 | X_rel_test=test_blocks, 243 | clip_min=0.5, 244 | clip_max=5.0, 245 | trace_path=trace_path, 246 | ) 247 | else: 248 | fm = myfm.MyFMOrderedProbit(rank=DIMENSION) 249 | callback = OrderedProbitCallback( 250 | ITERATION, 251 | X_date_test, 252 | df_test.rating.values, 253 | n_class=5, 254 | X_rel_test=test_blocks, 255 | trace_path=trace_path, 256 | ) 257 | 258 | fm.fit( 259 | X_date_train, 260 | df_train.rating.values, 261 | X_rel=train_blocks, 262 | grouping=grouping, 263 | n_iter=callback.n_iter, 264 | callback=callback, 265 | n_kept_samples=1, 266 | ) 267 | with open( 268 | "callback_result_{0}_fold_{1}.pkl".format(ALGORITHM, FOLD_INDEX), "wb" 269 | ) as ofs: 270 | pickle.dump(callback, ofs) 271 | -------------------------------------------------------------------------------- /examples/oprobit_example.py: -------------------------------------------------------------------------------- 1 | """ 2 | A simple example for ordered probit regression, 3 | taken from the "MCMCoprobit" documentation of MCMCpack: 4 | https://rdrr.io/cran/MCMCpack/man/MCMCoprobit.html 5 | """ 6 | 7 | import numpy as np 8 | 9 | from myfm import MyFMOrderedProbit 10 | 11 | N_DATA = 100 12 | 13 | rns = np.random.RandomState(42) 14 | X = rns.randn(N_DATA, 2) 15 | z = 1 + X[:, 0] * 0.1 - X[:, 1] * 0.5 + rns.randn(N_DATA) 16 | 17 | y = z.copy() 18 | y[z < 0] = 0 19 | y[(z >= 0) & (z < 1)] = 1 20 | y[(z >= 1) & (z < 1.5)] = 2 21 | y[z >= 1.5] = 3 22 | 23 | # Roughly 40x faster than MCMCoprobit in my environment.
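# Note: with rank=0 the pairwise factor term vanishes, so this reduces to a
# plain Bayesian ordered probit on the two linear features; n_iter=11000
# with n_kept_samples=10000 effectively treats the first 1000 draws as burn-in.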
24 | fm = MyFMOrderedProbit(0, random_seed=42).fit( 25 | X, 26 | y, 27 | n_iter=11000, 28 | n_kept_samples=10000, 29 | ) 30 | 31 | print(fm.cutpoint_samples.mean(axis=0)) 32 | -------------------------------------------------------------------------------- /examples/toy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.feature_extraction import DictVectorizer 3 | 4 | import myfm 5 | 6 | train = [ 7 | {"user": "1", "item": "5", "age": 19}, 8 | {"user": "2", "item": "43", "age": 33}, 9 | {"user": "3", "item": "20", "age": 55}, 10 | {"user": "4", "item": "10", "age": 20}, 11 | ] 12 | v = DictVectorizer() 13 | X = v.fit_transform(train) 14 | y = np.asarray([0, 1, 1, 0]) 15 | fm = myfm.MyFMClassifier(rank=4) 16 | fm.fit(X, y) 17 | p = fm.predict_proba(v.transform({"user": "1", "item": "10", "age": 24})) 18 | print(p) 19 | -------------------------------------------------------------------------------- /include/Faddeeva/Faddeeva.hh: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2012 Massachusetts Institute of Technology 2 | * 3 | * Permission is hereby granted, free of charge, to any person obtaining 4 | * a copy of this software and associated documentation files (the 5 | * "Software"), to deal in the Software without restriction, including 6 | * without limitation the rights to use, copy, modify, merge, publish, 7 | * distribute, sublicense, and/or sell copies of the Software, and to 8 | * permit persons to whom the Software is furnished to do so, subject to 9 | * the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be 12 | * included in all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 15 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 16 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 17 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 18 | * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 19 | * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 20 | * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | */ 22 | 23 | /* Available at: http://ab-initio.mit.edu/Faddeeva 24 | 25 | Header file for Faddeeva.cc; see that file for more information. 
*/ 26 | 27 | #ifndef FADDEEVA_HH 28 | #define FADDEEVA_HH 1 29 | 30 | #include 31 | 32 | namespace Faddeeva { 33 | 34 | // compute w(z) = exp(-z^2) erfc(-iz) [ Faddeeva / scaled complex error func ] 35 | extern std::complex w(std::complex z,double relerr=0); 36 | extern double w_im(double x); // special-case code for Im[w(x)] of real x 37 | 38 | // Various functions that we can compute with the help of w(z) 39 | 40 | // compute erfcx(z) = exp(z^2) erfc(z) 41 | extern std::complex erfcx(std::complex z, double relerr=0); 42 | extern double erfcx(double x); // special case for real x 43 | 44 | // compute erf(z), the error function of complex arguments 45 | extern std::complex erf(std::complex z, double relerr=0); 46 | extern double erf(double x); // special case for real x 47 | 48 | // compute erfi(z) = -i erf(iz), the imaginary error function 49 | extern std::complex erfi(std::complex z, double relerr=0); 50 | extern double erfi(double x); // special case for real x 51 | 52 | // compute erfc(z) = 1 - erf(z), the complementary error function 53 | extern std::complex erfc(std::complex z, double relerr=0); 54 | extern double erfc(double x); // special case for real x 55 | 56 | // compute Dawson(z) = sqrt(pi)/2 * exp(-z^2) * erfi(z) 57 | extern std::complex Dawson(std::complex z, double relerr=0); 58 | extern double Dawson(double x); // special case for real x 59 | 60 | } // namespace Faddeeva 61 | 62 | #endif // FADDEEVA_HH 63 | -------------------------------------------------------------------------------- /include/myfm/BaseFMTrainer.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include "FMLearningConfig.hpp" 9 | #include "HyperParams.hpp" 10 | #include "OProbitSampler.hpp" 11 | #include "definitions.hpp" 12 | #include "predictor.hpp" 13 | #include "util.hpp" 14 | 15 | namespace myFM { 16 | template 18 | struct BaseFMTrainer { 19 | // typedef typename Derived::FMType FMType; 20 | // typedef typename Derived::HyperType HyperType; 21 | 22 | typedef typename FMType::Vector Vector; 23 | typedef typename FMType::DenseMatrix DenseMatrix; 24 | typedef typename FMType::SparseMatrix SparseMatrix; 25 | 26 | typedef relational::RelationBlock RelationBlock; 27 | // typedef relational::RelationWiseCache RelationWiseCache; 28 | 29 | typedef FMLearningConfig Config; 30 | typedef typename Config::TASKTYPE TASKTYPE; 31 | 32 | typedef pair, HistoryType> learn_result_type; 33 | 34 | typedef OprobitSampler OprobitSamplerType; 35 | 36 | SparseMatrix X; 37 | vector relations; 38 | SparseMatrix X_t; // transposed 39 | 40 | const size_t dim_all; 41 | const Vector y; 42 | 43 | const int n_train; 44 | int n_class = 0; // Used by ordered probit 45 | 46 | Vector e_train; 47 | Vector q_train; 48 | vector relation_caches; 49 | 50 | const Config learning_config; 51 | 52 | size_t n_nan_occurred = 0; 53 | 54 | inline BaseFMTrainer(const SparseMatrix &X, 55 | const vector &relations, int random_seed, 56 | Config learning_config) {} 57 | 58 | inline BaseFMTrainer(const SparseMatrix &X, 59 | const vector &relations, const Vector &y, 60 | int random_seed, Config learning_config) 61 | : X(X), relations(relations), X_t(X.transpose()), 62 | dim_all(check_row_consistency_return_column(X, relations)), y(y), 63 | n_train(X.rows()), e_train(X.rows()), q_train(X.rows()), 64 | relation_caches(), learning_config(learning_config), 65 | random_seed(random_seed), gen_(random_seed) { 66 | for (auto it = relations.begin(); 
it != relations.end(); it++) { 67 | relation_caches.emplace_back(*it); 68 | } 69 | if (X.rows() != y.rows()) { 70 | throw std::runtime_error(StringBuilder{} 71 | .add("Shape mismatch: X has size") 72 | .space_and_add(X.rows()) 73 | .space_and_add("and y has size") 74 | .space_and_add(y.rows()) 75 | .build()); 76 | } 77 | this->X.makeCompressed(); 78 | this->X_t.makeCompressed(); 79 | if (learning_config.task_type == Config::TASKTYPE::ORDERED) { 80 | 81 | const size_t rows = this->X.rows(); 82 | std::vector existence(rows, false); 83 | for (auto &group_config : learning_config.cutpoint_groups()) { 84 | for (size_t k : group_config.second) { 85 | if (k >= rows) { 86 | throw std::invalid_argument( 87 | "out of range for cutpoint group config."); 88 | } 89 | if (existence[k]) { 90 | std::stringstream ss; 91 | ss << "index " << k << " overlapping in cutpoint config."; 92 | throw std::invalid_argument(ss.str()); 93 | } 94 | existence[k] = true; 95 | } 96 | } 97 | for (size_t i_ = 0; i_ < rows; i_++) { 98 | if (!existence[i_]) { 99 | std::stringstream ss; 100 | ss << "cutpoint group not specified for " << i_ << "."; 101 | throw std::invalid_argument(ss.str()); 102 | } 103 | } 104 | } 105 | } 106 | 107 | inline FMType create_FM(int rank, Real init_std) { 108 | FMType fm(rank); 109 | fm.initialize_weight(dim_all, init_std, gen_); 110 | return fm; 111 | } 112 | 113 | inline HyperType create_Hyper(size_t rank) { 114 | return HyperType{rank, learning_config.get_n_groups()}; 115 | } 116 | 117 | 118 | inline learn_result_type 119 | learn_with_callback(FMType &fm, HyperType &hyper, 120 | std::function *, HistoryType *)> cb); 121 | 122 | inline void initialize_hyper(FMType &fm, HyperType &hyper) { 123 | static_cast(*this).initialize_alpha(); 124 | static_cast(*this).initialize_mu_w(); 125 | static_cast(*this).initialize_lambda_w(); 126 | 127 | static_cast(*this).initialize_mu_V(); 128 | static_cast(*this).initialize_lambda_V(); 129 | } 130 | 131 | inline void initialize_e(FMType &fm, const HyperType &hyper) { 132 | static_cast(*this).initialize_e(fm, hyper); 133 | } 134 | 135 | inline void update_all(FMType &fm, HyperType &hyper) { 136 | update_alpha_(fm, hyper); 137 | 138 | update_w0_(fm, hyper); 139 | 140 | update_lambda_w_(fm, hyper); 141 | 142 | update_mu_w_(fm, hyper); 143 | 144 | update_w_(fm, hyper); 145 | 146 | update_lambda_V_(fm, hyper); 147 | update_mu_V_(fm, hyper); 148 | 149 | update_V_(fm, hyper); 150 | 151 | update_e_(fm, hyper); 152 | } 153 | 154 | inline void update_alpha_(FMType &fm, HyperType &hyper) { 155 | static_cast(*this).update_alpha(fm, hyper); 156 | } 157 | 158 | inline void update_w0_(FMType &fm, HyperType &hyper) { 159 | static_cast(*this).update_w0(fm, hyper); 160 | } 161 | 162 | inline void update_lambda_w_(FMType &fm, HyperType &hyper) { 163 | static_cast(*this).update_lambda_w(fm, hyper); 164 | } 165 | 166 | inline void update_mu_w_(FMType &fm, HyperType &hyper) { 167 | static_cast(*this).update_mu_w(fm, hyper); 168 | } 169 | 170 | inline void update_lambda_V_(FMType &fm, HyperType &hyper) { 171 | static_cast(*this).update_lambda_V(fm, hyper); 172 | } 173 | 174 | inline void update_mu_V_(FMType &fm, HyperType &hyper) { 175 | static_cast(*this).update_mu_V(fm, hyper); 176 | } 177 | 178 | inline void update_w_(FMType &fm, HyperType &hyper) { 179 | static_cast(*this).update_w(fm, hyper); 180 | } 181 | 182 | inline void update_e_(FMType &fm, HyperType &hyper) { 183 | static_cast(*this).update_e(fm, hyper); 184 | } 185 | 186 | inline void update_V_(FMType &fm, HyperType &hyper) 
{ 187 | static_cast(*this).update_V(fm, hyper); 188 | } 189 | 190 | const int random_seed; 191 | 192 | protected: 193 | mt19937 gen_; 194 | // std::vector cutpoint_sampler; 195 | 196 | }; // BaseFMTrainer 197 | } // namespace myFM 198 | -------------------------------------------------------------------------------- /include/myfm/FM.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "definitions.hpp" 3 | #include 4 | #include 5 | 6 | namespace myFM { 7 | 8 | using namespace std; 9 | 10 | template struct FM { 11 | 12 | typedef relational::RelationBlock RelationBlock; 13 | 14 | typedef types::DenseMatrix DenseMatrix; 15 | typedef types::SparseMatrix SparseMatrix; 16 | typedef types::Vector Vector; 17 | 18 | inline FM(int n_factors, size_t n_groups) 19 | : n_factors(n_factors), initialized(false) {} 20 | inline FM(int n_factors) : FM(n_factors, 1) {} 21 | 22 | inline FM(const FM &other) 23 | : n_factors(other.n_factors), w0(other.w0), w(other.w), V(other.V), 24 | cutpoints(other.cutpoints), initialized(other.initialized) {} 25 | 26 | inline FM(Real w0, const Vector &w, const DenseMatrix &V) 27 | : n_factors(V.cols()), w0(w0), w(w), V(V), initialized(true) {} 28 | 29 | inline FM(Real w0, const Vector &w, const DenseMatrix &V, 30 | const vector &cutpoints) 31 | : n_factors(V.cols()), w0(w0), w(w), V(V), cutpoints(cutpoints), 32 | initialized(true) {} 33 | 34 | inline void initialize_weight(int n_features, Real init_std, mt19937 &gen) { 35 | initialized = false; 36 | normal_distribution nd; 37 | 38 | auto get_rand = [&gen, &nd, init_std](Real dummy) { 39 | return nd(gen) * init_std; 40 | }; 41 | V = DenseMatrix{n_features, n_factors}.unaryExpr(get_rand); 42 | w = Vector{n_features}.unaryExpr(get_rand); 43 | w0 = get_rand(1); 44 | initialized = true; 45 | } 46 | 47 | inline Vector predict_score(const SparseMatrix &X, 48 | const vector &relations) const { 49 | Vector result(X.rows()); 50 | predict_score_write_target(result, X, relations); 51 | return result; 52 | } 53 | 54 | inline void 55 | predict_score_write_target(Eigen::Ref target, const SparseMatrix &X, 56 | const vector &relations) const { 57 | // check input consistency 58 | size_t case_size = X.rows(); 59 | size_t feature_size_all = X.cols(); 60 | for (auto const &rel : relations) { 61 | if (case_size != rel.original_to_block.size()) { 62 | throw std::invalid_argument( 63 | "Relation blocks have inconsistent mapper size with case_size"); 64 | } 65 | feature_size_all += rel.feature_size; 66 | } 67 | if (feature_size_all != static_cast(this->w.rows())) { 68 | std::stringstream error_stream; 69 | error_stream << "Total feature size mismatch. 
Should be " 70 | << (this->w.rows()) << ", but got " << feature_size_all 71 | << "."; 72 | throw std::invalid_argument(error_stream.str()); 73 | } 74 | 75 | if (!initialized) { 76 | throw std::runtime_error("get_score called before initialization"); 77 | } 78 | target = w0 + (X * w.head(X.cols())).array(); 79 | size_t offset = X.cols(); 80 | for (auto iter = relations.begin(); iter != relations.end(); iter++) { 81 | Vector w0_cache = (iter->X) * w.segment(offset, iter->feature_size); 82 | size_t j = 0; 83 | for (auto i : (iter->original_to_block)) { 84 | target(j++) += w0_cache(i); 85 | } 86 | offset += iter->feature_size; 87 | } 88 | 89 | Vector q_cache(target.rows()); 90 | size_t buffer_size = 1; 91 | vector buffer_cache(1); 92 | vector block_q_caches; 93 | for (auto &relation : relations) { 94 | buffer_size = std::max(buffer_size, relation.block_size); 95 | } 96 | buffer_cache.resize(buffer_size); 97 | 98 | for (int factor_index = 0; factor_index < this->n_factors; factor_index++) { 99 | q_cache = X * V.col(factor_index).head(X.cols()); 100 | size_t offset = X.cols(); 101 | size_t relation_index = 0; 102 | for (auto iter = relations.begin(); iter != relations.end(); 103 | iter++, relation_index++) { 104 | Eigen::Map block_cache(buffer_cache.data(), iter->block_size); 105 | block_cache = 106 | iter->X * V.col(factor_index).segment(offset, iter->feature_size); 107 | offset += iter->feature_size; 108 | size_t train_case_index = 0; 109 | for (auto i : iter->original_to_block) { 110 | q_cache(train_case_index++) += block_cache(i); 111 | } 112 | } 113 | target.array() += q_cache.array().square() * static_cast(0.5); 114 | 115 | offset = X.cols(); 116 | relation_index = 0; 117 | q_cache = X.cwiseAbs2() * 118 | (V.col(factor_index).head(X.cols()).array().square().matrix()); 119 | for (auto iter = relations.begin(); iter != relations.end(); 120 | iter++, relation_index++) { 121 | Eigen::Map block_cache(buffer_cache.data(), iter->block_size); 122 | block_cache = 123 | (iter->X.cwiseAbs2()) * (V.col(factor_index) 124 | .segment(offset, iter->feature_size) 125 | .array() 126 | .square() 127 | .matrix()); 128 | offset += iter->feature_size; 129 | size_t train_case_index = 0; 130 | for (auto i : iter->original_to_block) { 131 | q_cache(train_case_index++) += block_cache(i); 132 | } 133 | } 134 | target -= q_cache * static_cast(0.5); 135 | } 136 | } 137 | inline DenseMatrix 138 | oprobit_predict_proba(const SparseMatrix &X, 139 | const vector &relations, 140 | size_t cutpoint_index) const { 141 | if (cutpoints.empty()) { 142 | throw std::runtime_error("No cutpoint available for this FM."); 143 | } 144 | int n_cpt = cutpoints.at(cutpoint_index).size(); 145 | DenseMatrix result = DenseMatrix::Zero(X.rows(), n_cpt + 1); 146 | 147 | Vector score(X.rows()); 148 | DenseMatrix cache(X.rows(), n_cpt + 1); 149 | predict_score_write_target(score, X, relations); 150 | for (int cpt_index = 0; cpt_index < n_cpt; cpt_index++) { 151 | cache.col(cpt_index) = 152 | (1 + ((cutpoints.at(cutpoint_index)(cpt_index) - score.array()) * 153 | static_cast(std::sqrt(0.5))) 154 | .erf()) / 155 | 2; 156 | } 157 | cache.col(n_cpt) = (1 - cache.col(n_cpt - 1).array()); 158 | for (int col = n_cpt - 1; col >= 1; col--) { 159 | cache.col(col) -= cache.col(col - 1); 160 | } 161 | return cache; 162 | } 163 | 164 | const int n_factors; 165 | Real w0; 166 | Vector w; 167 | DenseMatrix V; // (n_feature, n_factor) - matrix 168 | vector cutpoints; // ordered probit 169 | 170 | protected: 171 | bool initialized; 172 | }; 173 | 174 | } // 
namespace myFM 175 | -------------------------------------------------------------------------------- /include/myfm/FMLearningConfig.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "OProbitSampler.hpp" 4 | #include "definitions.hpp" 5 | #include "util.hpp" 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | namespace myFM { 12 | template struct FMLearningConfig { 13 | public: 14 | enum class TASKTYPE { REGRESSION, CLASSIFICATION, ORDERED }; 15 | using CutpointGroupType = vector>>; 16 | 17 | inline FMLearningConfig(Real alpha_0, Real beta_0, Real gamma_0, Real mu_0, 18 | Real reg_0, TASKTYPE task_type, Real nu_oprobit, 19 | bool fit_w0, bool fit_linear, 20 | const vector &group_index, int n_iter, 21 | int n_kept_samples, Real cutpoint_scale, 22 | const CutpointGroupType &cutpoint_groups) 23 | : alpha_0(alpha_0), beta_0(beta_0), gamma_0(gamma_0), mu_0(mu_0), 24 | reg_0(reg_0), task_type(task_type), nu_oprobit(nu_oprobit), 25 | fit_w0(fit_w0), fit_linear(fit_linear), n_iter(n_iter), 26 | n_kept_samples(n_kept_samples), cutpoint_scale(cutpoint_scale), 27 | group_index_(group_index), cutpoint_groups_(cutpoint_groups) { 28 | 29 | /* check group_index consistency */ 30 | set all_index(group_index.begin(), group_index.end()); 31 | n_groups_ = all_index.size(); 32 | /* verify that groups from 0 - (n_groups - 1) are contained.*/ 33 | for (size_t i = 0; i < n_groups_; i++) { 34 | if (all_index.find(i) == all_index.cend()) { 35 | throw invalid_argument( 36 | (StringBuilder{})("No matching index for group index ")(i)( 37 | " found.") 38 | .build()); 39 | } 40 | } 41 | group_vs_feature_index_ = vector>{n_groups_}; 42 | 43 | size_t feature_index = 0; 44 | for (auto iter = group_index.cbegin(); iter != group_index.cend(); iter++) { 45 | group_vs_feature_index_[*iter].push_back(feature_index++); 46 | } 47 | 48 | if (n_kept_samples < 0) { 49 | throw invalid_argument("n_kept_samples must be non-negative,"); 50 | } 51 | if (n_iter <= 0) { 52 | throw invalid_argument("n_iter must be positive."); 53 | } 54 | if (n_iter < n_kept_samples) { 55 | throw invalid_argument("n_kept_samples must not exceed n_iter."); 56 | } 57 | } 58 | 59 | FMLearningConfig(const FMLearningConfig &other) = default; 60 | 61 | const Real alpha_0, beta_0, gamma_0; 62 | const Real mu_0; 63 | const Real reg_0; 64 | 65 | const TASKTYPE task_type; 66 | const Real nu_oprobit; 67 | bool fit_w0, fit_linear; 68 | 69 | const int n_iter, n_kept_samples; 70 | 71 | const Real cutpoint_scale; 72 | 73 | private: 74 | const vector group_index_; 75 | size_t n_groups_; 76 | vector> group_vs_feature_index_; 77 | 78 | const CutpointGroupType cutpoint_groups_; 79 | 80 | public: 81 | inline size_t get_n_groups() const { return n_groups_; } 82 | 83 | inline size_t group_index(int at) const { return group_index_.at(at); } 84 | const CutpointGroupType &cutpoint_groups() const { 85 | return this->cutpoint_groups_; 86 | } 87 | 88 | const vector> &group_vs_feature_index() const { 89 | return group_vs_feature_index_; 90 | } 91 | 92 | struct Builder { 93 | Real alpha_0 = 1; 94 | Real beta_0 = 1; 95 | Real gamma_0 = 1; 96 | Real mu_0 = 1; 97 | Real reg_0 = 1; 98 | int n_iter = 100; 99 | int n_kept_samples = 10; 100 | TASKTYPE task_type = TASKTYPE::REGRESSION; 101 | Real nu_oprobit = 5; 102 | bool fit_w0 = true; 103 | bool fit_linear = true; 104 | vector group_index; 105 | Real cutpoint_scale = 10; 106 | CutpointGroupType cutpoint_groups; 107 | 108 | Builder() {} 109 | 110 | inline Builder 
&set_alpha_0(Real arg) { 111 | this->alpha_0 = arg; 112 | return *this; 113 | } 114 | 115 | inline Builder &set_beta_0(Real arg) { 116 | this->beta_0 = arg; 117 | return *this; 118 | } 119 | 120 | inline Builder &set_gamma_0(Real arg) { 121 | this->gamma_0 = arg; 122 | return *this; 123 | } 124 | 125 | inline Builder &set_mu_0(Real arg) { 126 | this->mu_0 = arg; 127 | return *this; 128 | } 129 | inline Builder &set_reg_0(Real arg) { 130 | this->reg_0 = arg; 131 | return *this; 132 | } 133 | 134 | inline Builder &set_n_iter(int arg) { 135 | this->n_iter = arg; 136 | return *this; 137 | } 138 | 139 | inline Builder &set_n_kept_samples(int arg) { 140 | this->n_kept_samples = arg; 141 | return *this; 142 | } 143 | 144 | inline Builder &set_task_type(TASKTYPE arg) { 145 | this->task_type = arg; 146 | return *this; 147 | } 148 | 149 | inline Builder &set_group_index(const vector arg) { 150 | this->group_index = arg; 151 | return *this; 152 | } 153 | 154 | inline Builder &set_identical_groups(size_t n_features) { 155 | vector default_group_index(n_features); 156 | for (auto c = default_group_index.begin(); c != default_group_index.end(); 157 | c++) { 158 | *c = 0; 159 | } 160 | return set_group_index(default_group_index); 161 | } 162 | 163 | inline Builder &set_nu_oprobit(size_t nu_oprobit) { 164 | this->nu_oprobit = nu_oprobit; 165 | return *this; 166 | } 167 | 168 | inline Builder &set_fit_w0(bool fit_w0) { 169 | this->fit_w0 = fit_w0; 170 | return *this; 171 | } 172 | 173 | inline Builder &set_fit_linear(bool fit_linear) { 174 | this->fit_linear = fit_linear; 175 | return *this; 176 | } 177 | 178 | inline Builder &set_cutpoint_scale(Real cutpoint_scale) { 179 | this->cutpoint_scale = cutpoint_scale; 180 | return *this; 181 | } 182 | 183 | inline Builder & 184 | set_cutpoint_groups(const CutpointGroupType &cutpoint_groups) { 185 | this->cutpoint_groups = cutpoint_groups; 186 | return *this; 187 | } 188 | 189 | FMLearningConfig build() { 190 | return FMLearningConfig(alpha_0, beta_0, gamma_0, mu_0, reg_0, task_type, 191 | nu_oprobit, fit_w0, fit_linear, group_index, 192 | n_iter, n_kept_samples, cutpoint_scale, 193 | this->cutpoint_groups); 194 | } 195 | 196 | static FMLearningConfig get_default_config(size_t n_features) { 197 | Builder builder; 198 | return builder.set_identical_groups(n_features).build(); 199 | } 200 | 201 | }; // end Builder 202 | }; 203 | 204 | } // namespace myFM 205 | -------------------------------------------------------------------------------- /include/myfm/HyperParams.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "FM.hpp" 4 | #include "definitions.hpp" 5 | 6 | namespace myFM { 7 | 8 | template struct FMHyperParameters { 9 | using FMType = FM; 10 | using Vector = typename FMType::Vector; 11 | using DenseMatrix = typename FMType::DenseMatrix; 12 | 13 | Real alpha; 14 | 15 | Vector mu_w; // mean for w. will be (n_group) - vector 16 | Vector lambda_w; // variances for w. will be (n_group) - vector 17 | 18 | DenseMatrix mu_V; // mean for V. 
will be (n_group x n_factor) matrix 19 | DenseMatrix lambda_V; // variances for V (n_group x n_factor) - matrix 20 | 21 | inline FMHyperParameters(size_t n_factors, size_t n_groups) 22 | : mu_w(n_groups), lambda_w(n_groups), mu_V(n_groups, n_factors), 23 | lambda_V(n_groups, n_factors) {} 24 | 25 | inline FMHyperParameters(size_t n_factors) 26 | : FMHyperParameters(n_factors, 1) {} 27 | 28 | inline FMHyperParameters(Real alpha, const Vector &mu_w, 29 | const Vector &lambda_w, const DenseMatrix &mu_V, 30 | const DenseMatrix &lambda_V) 31 | : alpha(alpha), mu_w(mu_w), lambda_w(lambda_w), mu_V(mu_V), 32 | lambda_V(lambda_V) {} 33 | 34 | inline FMHyperParameters(const FMHyperParameters &other) 35 | : alpha(other.alpha), mu_w(other.mu_w), lambda_w(other.lambda_w), 36 | mu_V(other.mu_V), lambda_V(other.lambda_V) {} 37 | }; 38 | 39 | } // namespace myFM 40 | -------------------------------------------------------------------------------- /include/myfm/LearningHistory.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "HyperParams.hpp" 4 | 5 | namespace myFM { 6 | template struct GibbsLearningHistory { 7 | std::vector> hypers; 8 | std::vector 9 | n_mh_accept; // will be used for M-H step in ordered probit regression; 10 | std::vector train_log_losses; 11 | }; 12 | } // namespace myFM 13 | -------------------------------------------------------------------------------- /include/myfm/definitions.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include 10 | #include 11 | 12 | namespace myFM { 13 | 14 | using namespace std; 15 | namespace types { 16 | template 17 | using DenseMatrix = Eigen::Matrix; 18 | 19 | template using Vector = Eigen::Matrix; 20 | 21 | template 22 | using SparseMatrix = Eigen::SparseMatrix; 23 | 24 | template using SparseVector = Eigen::SparseVector; 25 | 26 | } // namespace types 27 | 28 | namespace relational { 29 | 30 | template struct RelationBlock { 31 | typedef Eigen::SparseMatrix SparseMatrix; 32 | typedef Eigen::Matrix Vector; 33 | 34 | inline RelationBlock(vector original_to_block, const SparseMatrix &X) 35 | : original_to_block(original_to_block), 36 | mapper_size(original_to_block.size()), X(X), block_size(X.rows()), 37 | feature_size(X.cols()) { 38 | for (auto c : original_to_block) { 39 | if (c >= block_size) 40 | throw runtime_error("index mapping points to non-existing row."); 41 | } 42 | } 43 | 44 | inline RelationBlock(const RelationBlock &other) 45 | : RelationBlock(other.original_to_block, other.X) {} 46 | 47 | const vector original_to_block; 48 | const size_t mapper_size; 49 | const SparseMatrix X; 50 | const size_t block_size; 51 | const size_t feature_size; 52 | }; 53 | 54 | template struct RelationWiseCache { 55 | typedef typename RelationBlock::Vector Vector; 56 | typedef typename RelationBlock::SparseMatrix SparseMatrix; 57 | 58 | inline RelationWiseCache(const RelationBlock &source) 59 | : target(source), X_t(source.X.transpose()), cardinality(source.X.rows()), 60 | y(source.X.rows()), q(source.X.rows()), q_S(source.X.rows()), 61 | c(source.X.rows()), c_S(source.X.rows()), e(source.X.rows()), 62 | e_q(source.X.rows()) { 63 | X_t.makeCompressed(); 64 | cardinality.array() = static_cast(0); 65 | for (auto v : source.original_to_block) { 66 | cardinality(v)++; 67 | } 68 | } 69 | 70 | const RelationBlock ⌖ 71 | SparseMatrix X_t; 72 | Vector cardinality; // for 
each block row, how many main-table rows map to it
73 | 
74 |   Vector y;
75 | 
76 |   Vector q;
77 |   Vector q_S;
78 | 
79 |   Vector c;
80 |   Vector c_S;
81 | 
82 |   Vector e;
83 |   Vector e_q;
84 | };
85 | } // namespace relational
86 | 
87 | } // namespace myFM
88 | 
--------------------------------------------------------------------------------
/include/myfm/predictor.hpp:
--------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include <atomic>
4 | #include <mutex>
5 | #include <thread>
6 | 
7 | #include "FM.hpp"
8 | #include "FMLearningConfig.hpp"
9 | #include "definitions.hpp"
10 | #include "util.hpp"
11 | 
12 | namespace myFM {
13 | 
14 | template <typename Real, typename FMType = FM<Real>> struct Predictor {
15 |   typedef typename FMLearningConfig<Real>::TASKTYPE TASKTYPE;
16 |   typedef typename FMType::SparseMatrix SparseMatrix;
17 |   typedef typename FMType::Vector Vector;
18 |   typedef typename FMType::DenseMatrix DenseMatrix;
19 |   typedef typename FMType::RelationBlock RelationBlock;
20 | 
21 |   inline Predictor(size_t rank, size_t feature_size, TASKTYPE type)
22 |       : rank(rank), feature_size(feature_size), type(type), samples() {}
23 | 
24 |   inline void check_input(const SparseMatrix &X,
25 |                           const vector<RelationBlock> &relations) const {
26 |     auto given_feature_size = check_row_consistency_return_column(X, relations);
27 |     if (feature_size != given_feature_size) {
28 |       throw std::invalid_argument(
29 |           StringBuilder{}("Told to predict for ")(
30 |               given_feature_size)(" but this->feature_size is ")(feature_size)
31 |               .build());
32 |     }
33 |   }
34 | 
35 |   inline Vector predict_parallel(const SparseMatrix &X,
36 |                                  const vector<RelationBlock> &relations,
37 |                                  size_t n_workers) const {
38 |     check_input(X, relations);
39 |     if (samples.empty()) {
40 |       throw std::runtime_error("Told to predict but no sample available.");
41 |     }
42 |     Vector result = Vector::Zero(X.rows());
43 |     const size_t n_samples = this->samples.size();
44 | 
45 |     std::mutex mtx;
46 |     std::atomic<size_t> currently_done(0);
47 |     std::vector<std::thread> workers;
48 | 
49 |     for (size_t i = 0; i < n_workers; i++) {
50 |       workers.emplace_back(
51 |           [this, n_samples, &result, &X, &relations, &currently_done, &mtx] {
52 |             Vector cache(X.rows());
53 |             while (true) {
54 |               size_t cd = currently_done++;
55 |               if (cd >= n_samples)
56 |                 break;
57 |               this->samples[cd].predict_score_write_target(cache, X, relations);
58 |               if (this->type == TASKTYPE::CLASSIFICATION) { // probit link: Phi(score)
59 |                 cache.array() =
60 |                     ((cache.array() * static_cast<Real>(std::sqrt(0.5))).erf() +
61 |                      static_cast<Real>(1)) /
62 |                     static_cast<Real>(2);
63 |               }
64 |               {
65 |                 std::lock_guard<std::mutex> lock{mtx};
66 |                 result += cache;
67 |               }
68 |             }
69 |           });
70 |     }
71 |     for (auto &worker : workers) {
72 |       worker.join();
73 |     }
74 |     result.array() /= static_cast<Real>(n_samples);
75 |     return result;
76 |   }
77 | 
78 |   inline DenseMatrix
79 |   predict_parallel_oprobit(const SparseMatrix &X,
80 |                            const vector<RelationBlock> &relations,
81 |                            size_t n_workers, size_t cutpoint_index) const {
82 |     check_input(X, relations);
83 |     if (samples.empty()) {
84 |       throw std::runtime_error("Told to predict but no sample available.");
85 |     }
86 |     if (this->type != TASKTYPE::ORDERED) {
87 |       throw std::runtime_error(
88 |           "predict_parallel_oprobit must be called for oprobit model.");
89 |     }
90 |     int n_cpt = (this->samples.at(0)).cutpoints.at(cutpoint_index).size();
91 |     DenseMatrix result = DenseMatrix::Zero(X.rows(), n_cpt + 1);
92 |     const size_t n_samples = this->samples.size();
93 | 
94 |     std::mutex mtx;
95 |     std::atomic<size_t> currently_done(0);
96 |     std::vector<std::thread> workers;
97 | 
98 |     for (size_t i = 0; i < n_workers; i++) {
99 |       workers.emplace_back([this, n_samples, &result, &X, &relations,
100 |                             &currently_done,
&mtx, cutpoint_index, n_cpt] {
101 |         Vector score(X.rows());
102 | 
103 |         while (true) {
104 |           size_t cd = currently_done.fetch_add(1);
105 |           if (cd >= n_samples)
106 |             break;
107 | 
108 |           DenseMatrix sample_result =
109 |               this->samples.at(cd).oprobit_predict_proba(X, relations,
110 |                                                          cutpoint_index);
111 | 
112 |           {
113 |             std::lock_guard<std::mutex> lock{mtx};
114 |             result += sample_result;
115 |           }
116 |         }
117 |       });
118 |     }
119 |     for (auto &worker : workers) {
120 |       worker.join();
121 |     }
122 |     result.array() /= static_cast<Real>(n_samples);
123 |     return result;
124 |   }
125 | 
126 |   inline Vector predict(const SparseMatrix &X,
127 |                         const vector<RelationBlock> &relations) const {
128 |     check_input(X, relations);
129 |     if (samples.empty()) {
130 |       throw std::runtime_error("Empty samples!");
131 |     }
132 |     Vector result = Vector::Zero(X.rows());
133 |     Vector cache = Vector(X.rows());
134 |     for (auto iter = samples.cbegin(); iter != samples.cend(); iter++) {
135 |       iter->predict_score_write_target(cache, X, relations);
136 |       if (type == TASKTYPE::REGRESSION) {
137 |         result += cache;
138 |       } else if (type == TASKTYPE::CLASSIFICATION) {
139 |         result.array() +=
140 |             ((cache.array() * static_cast<Real>(std::sqrt(0.5))).erf() +
141 |              static_cast<Real>(1)) /
142 |             static_cast<Real>(2);
143 |       }
144 |     }
145 |     result.array() /= static_cast<Real>(samples.size());
146 |     return result;
147 |   }
148 | 
149 |   inline void set_samples(vector<FMType> &&samples_from) {
150 |     samples = std::forward<vector<FMType>>(samples_from);
151 |   }
152 | 
153 |   inline void add_sample(const FMType &fm) {
154 |     if (fm.w0.rows() != feature_size) {
155 |       throw std::invalid_argument("feature size mismatch!");
156 |     }
157 |     if (fm.V.cols() != rank) {
158 |       throw std::invalid_argument("rank mismatch!");
159 |     }
160 |     samples.emplace_back(fm);
161 |   }
162 | 
163 |   const size_t rank;
164 |   const size_t feature_size;
165 |   const TASKTYPE type;
166 |   vector<FMType> samples;
167 | };
168 | 
169 | } // namespace myFM
170 | 
--------------------------------------------------------------------------------
/include/myfm/util.hpp:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include "Faddeeva/Faddeeva.hh"
3 | #include "definitions.hpp"
4 | #include <random>
5 | #include <tuple>
6 | 
7 | namespace myFM {
8 | using namespace std;
9 | 
10 | /*
11 | Sample from truncated normal distribution.
12 | https://arxiv.org/pdf/0907.4010.pdf
13 | Proposition 2.3.
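
For intuition, the same left-truncation scheme in a short Python sketch
(ours, not part of this header; it assumes only numpy and mirrors the
template below):

    import numpy as np

    def sample_truncated_normal_left_py(rng: np.random.RandomState, mu_minus: float) -> float:
        # Draw z ~ N(0, 1) conditioned on z > mu_minus.
        if mu_minus < 0:
            # Plain rejection from the standard normal is efficient here.
            while True:
                z = rng.normal(0.0, 1.0)
                if z > mu_minus:
                    return z
        # Otherwise use the shifted-exponential proposal of Proposition 2.3.
        alpha_star = (mu_minus + np.sqrt(mu_minus * mu_minus + 4.0)) / 2.0
        while True:
            z = -np.log(rng.uniform()) / alpha_star + mu_minus
            rho = np.exp(-((z - alpha_star) ** 2) / 2.0)
            if rng.uniform() < rho:
                return z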
14 | */
15 | template <typename Real>
16 | inline Real sample_truncated_normal_left(mt19937 &gen, Real mu_minus) {
17 |   if (mu_minus < 0) {
18 |     normal_distribution<Real> dist(0, 1);
19 |     while (true) {
20 |       Real z = dist(gen);
21 |       if (z > mu_minus) {
22 |         return z;
23 |       }
24 |     }
25 |   } else {
26 |     Real alpha_star = (mu_minus + std::sqrt(mu_minus * mu_minus + 4)) / 2;
27 |     uniform_real_distribution<Real> dist(0, 1);
28 |     while (true) {
29 |       Real z = -std::log(dist(gen)) / alpha_star + mu_minus;
30 |       Real rho = std::exp(-(z - alpha_star) * (z - alpha_star) / 2);
31 |       Real u = dist(gen);
32 |       if (u < rho) {
33 |         return z;
34 |       }
35 |     }
36 |   }
37 | }
38 | 
39 | template <typename Real>
40 | inline Real sample_truncated_normal_twoside(mt19937 &gen, Real mu_minus,
41 |                                             Real mu_plus) {
42 |   uniform_real_distribution<Real> proposal(mu_minus, mu_plus);
43 |   uniform_real_distribution<Real> acceptance(0, 1);
44 |   Real rho;
45 |   while (true) {
46 |     Real z = proposal(gen);
47 |     if ((mu_minus <= static_cast<Real>(0)) &&
48 |         (mu_plus >= static_cast<Real>(0))) {
49 |       rho = std::exp(-z * z / 2);
50 |     } else if (mu_plus < static_cast<Real>(0)) {
51 |       rho = std::exp((mu_plus * mu_plus - z * z) / 2);
52 |     } else {
53 |       rho = std::exp((mu_minus * mu_minus - z * z) / 2);
54 |     }
55 |     Real u = acceptance(gen);
56 |     if (u < rho) {
57 |       return z;
58 |     }
59 |   }
60 | }
61 | template <typename Real>
62 | inline Real sample_truncated_normal_left(mt19937 &gen, Real mean, Real std,
63 |                                          Real mu_minus) {
64 |   return mean +
65 |          std * sample_truncated_normal_left(gen, (mu_minus - mean) / std);
66 | }
67 | 
68 | template <typename Real>
69 | inline Real sample_truncated_normal_right(mt19937 &gen, Real mu_plus) {
70 |   return -sample_truncated_normal_left(gen, -mu_plus);
71 | }
72 | 
73 | template <typename Real>
74 | inline Real sample_truncated_normal_right(mt19937 &gen, Real mean, Real std,
75 |                                           Real mu_plus) {
76 |   return mean +
77 |          std * sample_truncated_normal_right(gen, (mu_plus - mean) / std);
78 | }
79 | 
80 | template <typename Real>
81 | inline std::tuple<Real, Real, Real> mean_var_truncated_normal_left(Real mu) {
82 |   static constexpr Real SQRT2 = 1.4142135623730951;
83 |   static constexpr Real SQRTPI = 1.7724538509055159;
84 |   static constexpr Real SQRT2PI = SQRT2 * SQRTPI;
85 | 
86 |   // mean, variance, log(Z)
87 | 
88 |   /*
89 |   q(z) = 1{z > 0} exp( - \frac{1}{2}(z-\mu)^2) / Z
90 |   Z = 1 - \Phi(-\mu)
91 |   E_q[z] = \mu + 1/\sqrt{2\pi} exp(-\mu^2/2) / (1 - \Phi(-\mu))
92 |   */
93 |   Real phi_Z;
94 |   Real lnZ;
95 |   Real mu_square = mu * mu / 2;
96 |   if (mu > 0) {
97 |     Real Z = (1 - Faddeeva::erf(-mu / SQRT2));
98 |     phi_Z = 2 * std::exp(-mu_square) / SQRT2PI / Z;
99 |     lnZ = std::log(Z);
100 |   } else {
101 |     Real Z = (Faddeeva::erfcx(-mu / SQRT2));
102 |     phi_Z = 2 / Z / SQRT2PI;
103 |     lnZ = std::log(Z) - mu_square;
104 |   }
105 |   std::tuple<Real, Real, Real> result(mu + phi_Z,
106 |                                       1 - mu * phi_Z - phi_Z * phi_Z, lnZ);
107 |   return result;
108 | }
109 | 
110 | template <typename Real>
111 | inline std::tuple<Real, Real, Real> mean_var_truncated_normal_right(Real mu) {
112 |   auto result = mean_var_truncated_normal_left(-mu);
113 |   std::get<0>(result) *= -1;
114 |   return result;
115 | }
116 | 
117 | struct StringBuilder {
118 |   inline StringBuilder() : oss_() {}
119 | 
120 |   template <typename T> inline StringBuilder &add(const T &arg) {
121 |     oss_ << arg;
122 |     return *this;
123 |   }
124 | 
125 |   template <typename T> inline StringBuilder &operator()(const T &arg) {
126 |     oss_ << arg;
127 |     return *this;
128 |   }
129 | 
130 |   template <typename T> inline StringBuilder &space_and_add(const T &arg) {
131 |     oss_ << " " << arg;
132 |     return *this;
133 |   }
134 | 
135 |   template <typename T>
136 |   inline StringBuilder &add(const T &arg, const T &fmt) {
137 |     oss_ << fmt << arg;
138 |     return
*this;
139 |   }
140 | 
141 |   inline string build() { return oss_.str(); }
142 | 
143 | private:
144 |   ostringstream oss_;
145 | };
146 | 
147 | template <typename Real>
148 | inline size_t check_row_consistency_return_column(
149 |     const types::SparseMatrix<Real> &X,
150 |     const vector<relational::RelationBlock<Real>> &relations) {
151 |   size_t row = X.rows();
152 |   size_t col = X.cols();
153 |   int i = 0;
154 |   for (const auto &rel : relations) {
155 |     if (row != rel.original_to_block.size()) {
156 |       throw std::runtime_error(
157 |           (StringBuilder{})("main table has size ")(row)(" but the relation[")(
158 |               i)("] has size ")(rel.original_to_block.size())
159 |               .build());
160 |     }
161 |     col += rel.feature_size;
162 |     i++;
163 |   }
164 |   return col;
165 | }
166 | 
167 | template <typename... Cs> void print_to_stream(std::ostream &ss, Cs &&... args);
168 | 
169 | template <typename C, typename... Cs>
170 | inline void print_to_stream(std::ostream &ss, C &&c0, Cs &&... args) {
171 |   ss << c0;
172 |   print_to_stream(ss, std::forward<Cs>(args)...);
173 | }
174 | 
175 | template <> inline void print_to_stream(std::ostream &ss) {}
176 | 
177 | template <typename... Cs> std::string print_to_string(Cs &&... args) {
178 |   std::stringstream ss;
179 |   print_to_stream(ss, std::forward<Cs>(args)...);
180 |   return ss.str();
181 | }
182 | 
183 | } // namespace myFM
184 | 
--------------------------------------------------------------------------------
/mypy.ini:
--------------------------------------------------------------------------------
1 | [mypy]
2 | # Specify the target platform details in config, so your developers are
3 | # free to run mypy on Windows, Linux, or macOS and get consistent
4 | # results.
5 | python_version=3.6
6 | platform=linux
7 | 
8 | show_column_numbers=True
9 | 
10 | follow_imports=normal
11 | 
12 | # suppress errors about unsatisfied imports
13 | ignore_missing_imports=True
14 | 
15 | # be strict
16 | disallow_untyped_calls=True
17 | warn_return_any=True
18 | strict_optional=True
19 | warn_no_return=True
20 | warn_redundant_casts=True
21 | warn_unused_ignores=True
22 | 
23 | # The following are off by default. Flip them on if you feel
24 | # adventurous.
25 | disallow_untyped_defs=True
26 | check_untyped_defs=True
27 | 
28 | # No incremental mode
29 | cache_dir=/dev/null
30 | 
31 | plugins = numpy.typing.mypy_plugin
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = [
3 |     "setuptools>=42",
4 |     "wheel",
5 |     "pybind11>=2.8.0",
6 |     "httpx",
7 |     "setuptools_scm[toml]>=6.2",
8 | ]
9 | 
10 | build-backend = "setuptools.build_meta"
11 | 
12 | [tool.black]
13 | ensure_newline_before_comments = true
14 | force_grid_wrap = 0
15 | include_trailing_comma = true
16 | line_length = 88
17 | multi_line_output = 3
18 | use_parentheses = true
19 | 
20 | [tool.isort]
21 | ensure_newline_before_comments = true
22 | force_grid_wrap = 0
23 | include_trailing_comma = true
24 | known_third_party = ["pybind11"]
25 | line_length = 88
26 | multi_line_output = 3
27 | use_parentheses = true
28 | 
29 | [tool.pycln]
30 | all = true
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import os
2 | from pathlib import Path
3 | from typing import Any
4 | 
5 | from pybind11.setup_helpers import Pybind11Extension, build_ext
6 | from setuptools import find_packages, setup
7 | 
8 | install_requires = [
9 |     "numpy>=1.11",
10 |     "scipy>=1.0",
11 |     "tqdm>=4",
12 |     "pandas>=1.0.0",
13 |     "typing-extensions>=4.0.0",
14 | ]
15 | 
16 | CURRENT_DIR = Path(__file__).resolve().parent
17 | README_FILE = CURRENT_DIR / "README.md"
18 | 
19 | 
20 | class get_eigen_include(object):
21 |     EIGEN3_URL = "https://gitlab.com/libeigen/eigen/-/archive/3.4.0/eigen-3.4.0.zip"
22 |     EIGEN3_DIRNAME = "eigen-3.4.0"
23 | 
24 |     def __str__(self) -> str:
25 |         eigen_include_dir = os.environ.get("EIGEN3_INCLUDE_DIR", None)
26 |         if eigen_include_dir is not None:
27 |             return eigen_include_dir
28 | 
29 |         basedir = Path(__file__).resolve().parent
30 |         target_dir = basedir / self.EIGEN3_DIRNAME
31 |         if target_dir.exists():
32 |             return str(target_dir)
33 | 
34 |         download_target_dir = basedir / "eigen3.zip"
35 |         import zipfile
36 | 
37 |         import httpx
38 | 
39 |         print("Start downloading Eigen library from {}.".format(self.EIGEN3_URL))
40 |         with httpx.stream("GET", self.EIGEN3_URL, verify=False) as response:
41 |             with download_target_dir.open("wb") as ofs:
42 |                 for chunk in response.iter_bytes(chunk_size=1024):
43 |                     ofs.write(chunk)
44 |         print("Downloaded Eigen into {}.".format(download_target_dir))
45 | 
46 |         with zipfile.ZipFile(download_target_dir) as ifs:
47 |             ifs.extractall()
48 | 
49 |         return str(target_dir)
50 | 
51 | 
52 | headers = [
53 |     "include/myfm/definitions.hpp",
54 |     "include/myfm/util.hpp",
55 |     "include/myfm/FM.hpp",
56 |     "include/myfm/HyperParams.hpp",
57 |     "include/myfm/predictor.hpp",
58 |     "include/myfm/FMTrainer.hpp",
59 |     "include/myfm/FMLearningConfig.hpp",
60 |     "include/myfm/OProbitSampler.hpp",
61 |     "include/Faddeeva/Faddeeva.hh",
62 |     "cpp_source/declare_module.hpp",
63 | ]
64 | 
65 | 
66 | ext_modules = [
67 |     Pybind11Extension(
68 |         "myfm._myfm",
69 |         ["cpp_source/bind.cpp", "cpp_source/Faddeeva.cc"],
70 |         include_dirs=[
71 |             # Eigen headers (downloaded on demand) and myFM headers
72 |             get_eigen_include(),
73 |             "include",
74 |         ],
75 |     ),
76 | ]
77 | 
78 | 
79 | def local_scheme(version: Any) -> str:
80 |     return ""
81 | 
82 | 
83 | setup(
84 |     name="myfm",
85 |     use_scm_version={"local_scheme": local_scheme},
86 |     author="Tomoki Ohtsuki",
87 | 
url="https://github.com/tohtsky/myfm", 88 | author_email="tomoki.ohtsuki.19937@outlook.jp", 89 | description="Yet another Bayesian factorization machines.", 90 | long_description=README_FILE.read_text(), 91 | long_description_content_type="text/markdown", 92 | ext_modules=ext_modules, 93 | install_requires=install_requires, 94 | cmdclass={"build_ext": build_ext}, 95 | package_dir={"": "src"}, 96 | zip_safe=False, 97 | headers=headers, 98 | python_requires=">=3.6", 99 | packages=find_packages("src"), 100 | package_data={"myfm": ["*.pyi"]}, 101 | ) 102 | -------------------------------------------------------------------------------- /src/myfm/__init__.py: -------------------------------------------------------------------------------- 1 | from pkg_resources import DistributionNotFound, get_distribution # type: ignore 2 | 3 | try: 4 | __version__ = get_distribution("myfm").version 5 | except DistributionNotFound: # pragma: no cover 6 | # package is not installed 7 | pass # pragma: no cover 8 | 9 | from ._myfm import RelationBlock 10 | from .gibbs import MyFMGibbsClassifier, MyFMGibbsRegressor, MyFMOrderedProbit 11 | from .variational import VariationalFMClassifier, VariationalFMRegressor 12 | 13 | MyFMRegressor = MyFMGibbsRegressor 14 | MyFMClassifier = MyFMGibbsClassifier 15 | 16 | __all__ = [ 17 | "RelationBlock", 18 | "MyFMOrderedProbit", 19 | "MyFMRegressor", 20 | "MyFMClassifier", 21 | "MyFMGibbsRegressor", 22 | "MyFMGibbsClassifier", 23 | "VariationalFMRegressor", 24 | "VariationalFMClassifier", 25 | ] 26 | -------------------------------------------------------------------------------- /src/myfm/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tohtsky/myFM/b9ba70ea38d9370d3ad50a9d25b2ff825eaa30ef/src/myfm/utils/__init__.py -------------------------------------------------------------------------------- /src/myfm/utils/benchmark_data/__init__.py: -------------------------------------------------------------------------------- 1 | from .movielens1M_data import MovieLens1MDataManager 2 | from .movielens10M_data import MovieLens10MDataManager 3 | from .movielens100k_data import MovieLens100kDataManager 4 | 5 | __all__ = [ 6 | "MovieLens100kDataManager", 7 | "MovieLens1MDataManager", 8 | "MovieLens10MDataManager", 9 | ] 10 | -------------------------------------------------------------------------------- /src/myfm/utils/benchmark_data/loader_base.py: -------------------------------------------------------------------------------- 1 | import urllib.request 2 | from abc import ABC, abstractmethod, abstractproperty 3 | from pathlib import Path 4 | from typing import Optional, Tuple 5 | from zipfile import ZipFile 6 | 7 | import numpy as np 8 | import pandas as pd 9 | from numpy.random import RandomState 10 | 11 | 12 | def train_test_split_with_kfold( 13 | df: pd.DataFrame, 14 | K: int, 15 | fold: int, 16 | random_state: Optional[int] = None, 17 | ) -> Tuple[pd.DataFrame, pd.DataFrame]: 18 | rns = RandomState(random_state) 19 | if not ((0 <= fold) and (fold < K)): 20 | raise ValueError("0 <= fold < K") 21 | n_data = df.shape[0] 22 | n_test = n_data // K 23 | if fold < (n_data % K): 24 | n_test += 1 25 | index = np.arange(df.shape[0]) 26 | rns.shuffle(index) 27 | df = df.iloc[index] 28 | test_start_position = (n_data // K) * fold + min((n_data % K), fold) 29 | test_end_position = test_start_position + n_test 30 | return ( 31 | pd.concat( 32 | [df.iloc[:test_start_position], df.iloc[test_end_position:]] 33 | 
        ).reset_index(drop=True),
34 |         df.iloc[test_start_position:test_end_position].reset_index(drop=True),
35 |     )
36 | 
37 | 
38 | class DataLoaderBase(ABC):
39 |     zf: ZipFile
40 | 
41 |     @abstractproperty
42 |     def DOWNLOAD_URL(self) -> str:
43 |         raise NotImplementedError("must be implemented")  # pragma: no cover
44 | 
45 |     @abstractproperty
46 |     def DEFAULT_PATH(self) -> Path:
47 |         raise NotImplementedError("must be implemented")  # pragma: no cover
48 | 
49 |     def __init__(self, zippath: Optional[Path] = None):
50 |         zippath = Path(zippath or self.DEFAULT_PATH)
51 |         if not zippath.exists():
52 |             permission = input(
53 |                 "Could not find {}.\nCan I download and save it there?[y/N]".format(
54 |                     zippath
55 |                 )
56 |             ).lower()
57 |             download = permission == "y"
58 |             if download:
59 |                 print("start download...")
60 |                 urllib.request.urlretrieve(self.DOWNLOAD_URL, zippath)
61 |                 print("complete")
62 |             else:
63 |                 raise RuntimeError("abort.")
64 |         self.zf = ZipFile(zippath)
65 | 
66 | 
67 | class MovieLensBase(DataLoaderBase, ABC):
68 |     @abstractmethod
69 |     def load_rating_all(self) -> pd.DataFrame:
70 |         raise NotImplementedError("must be implemented")
71 | 
72 |     def load_rating_kfold_split(
73 |         self, K: int, fold: int, random_state: Optional[int] = 0
74 |     ) -> Tuple[pd.DataFrame, pd.DataFrame]:
75 |         """Load the entire dataset and split it into train/test sets
76 |         using a K-fold scheme.
77 | 
78 |         Parameters
79 |         ----------
80 |         K : int
81 |             K in the K-fold splitting scheme.
82 |         fold : int
83 |             fold index.
84 |         random_state : Optional[int], optional
85 |             Controls the random state of the split.
86 | 
87 |         Returns
88 |         -------
89 |         Tuple[pd.DataFrame, pd.DataFrame]
90 |             train and test dataframes.
91 | 
92 |         Raises
93 |         ------
94 |         ValueError
95 |             When 0 <= fold < K is not met.
96 |         """
97 |         if not ((0 <= fold) and (fold < K)):
98 |             raise ValueError("0 <= fold < K")
99 |         df_all = self.load_rating_all()
100 |         return train_test_split_with_kfold(df_all, K, fold, random_state)
101 | 
--------------------------------------------------------------------------------
/src/myfm/utils/benchmark_data/movielens100k_data.py:
--------------------------------------------------------------------------------
1 | from io import BytesIO
2 | from pathlib import Path
3 | from typing import List, Tuple
4 | 
5 | import pandas as pd
6 | 
7 | from .loader_base import MovieLensBase
8 | 
9 | 
10 | class MovieLens100kDataManager(MovieLensBase):
11 |     """The data manager for the MovieLens 100k dataset."""
12 | 
13 |     @property
14 |     def DOWNLOAD_URL(self) -> str:
15 |         return "http://files.grouplens.org/datasets/movielens/ml-100k.zip"
16 | 
17 |     @property
18 |     def DEFAULT_PATH(self) -> Path:
19 |         return Path("~/.ml-100k.zip").expanduser()
20 | 
21 |     def _read_interaction(self, byte_stream: bytes) -> pd.DataFrame:
22 |         with BytesIO(byte_stream) as ifs:
23 |             data = pd.read_csv(
24 |                 ifs,
25 |                 sep="\t",
26 |                 header=None,
27 |                 names=["user_id", "movie_id", "rating", "timestamp"],
28 |             )
29 |             data["timestamp"] = pd.to_datetime(data["timestamp"], unit="s")
30 |             return data
31 | 
32 |     def load_rating_all(self) -> pd.DataFrame:
33 |         """Load the entire rating dataset.
34 | 
35 |         Returns
36 |         -------
37 |         pd.DataFrame
38 |             all the available ratings.
39 |         """
40 |         return self._read_interaction(self.zf.read("ml-100k/u.data"))
41 | 
42 |     def load_rating_predefined_split(
43 |         self,
44 |         fold: int,
45 |     ) -> Tuple[pd.DataFrame, pd.DataFrame]:
46 |         """Read the pre-defined train/test split.
47 |         Fold index ranges from 1 to 5.
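
        As a quick, hypothetical illustration (assuming the zip archive is
        already available at the default location):

            dm = MovieLens100kDataManager()
            df_train, df_test = dm.load_rating_predefined_split(fold=1)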
48 | 
49 |         Parameters
50 |         ----------
51 |         fold : int
52 |             specifies the fold index.
53 | 
54 |         Returns
55 |         -------
56 |         Tuple[pd.DataFrame, pd.DataFrame]
57 |             train and test dataframes.
58 | 
59 |         """
60 |         assert fold >= 1 and fold <= 5
61 |         train_path = "ml-100k/u{}.base".format(fold)
62 |         test_path = "ml-100k/u{}.test".format(fold)
63 |         df_train = self._read_interaction(self.zf.read(train_path))
64 |         df_test = self._read_interaction(self.zf.read(test_path))
65 | 
66 |         return df_train, df_test
67 | 
68 |     def load_user_info(self) -> pd.DataFrame:
69 |         """Load user meta information.
70 | 
71 |         Returns
72 |         -------
73 |         pd.DataFrame
74 |             user information
75 |         """
76 |         user_info_bytes = self.zf.read("ml-100k/u.user")
77 |         with BytesIO(user_info_bytes) as ifs:
78 |             return pd.read_csv(
79 |                 ifs,
80 |                 sep="|",
81 |                 header=None,
82 |                 names=["user_id", "age", "gender", "occupation", "zipcode"],
83 |             )
84 | 
85 |     def genres(self) -> List[str]:
86 |         with BytesIO(self.zf.read("ml-100k/u.genre")) as ifs:
87 |             genres: List[str] = list(pd.read_csv(ifs, sep="|", header=None)[0])
88 |             return genres
89 | 
90 |     def load_movie_info(self) -> pd.DataFrame:
91 |         r"""Load movie meta information.
92 | 
93 |         Returns
94 |         -------
95 |         pd.DataFrame
96 |             A dataframe containing meta-information (id, title, release_date, url, genres) about the movies.
97 |             Multiple genres per movie will be concatenated by "|".
98 |         """
99 |         MOVIE_COLUMNS = ["movie_id", "title", "release_date", "unk", "url"]
100 |         genres = self.genres()
101 | 
102 |         with BytesIO(self.zf.read("ml-100k/u.item")) as ifs:
103 |             df_mov = pd.read_csv(
104 |                 ifs,
105 |                 sep="|",
106 |                 encoding="latin-1",
107 |                 header=None,
108 |             )
109 |         df_mov.columns = MOVIE_COLUMNS + genres
110 |         df_mov["release_date"] = pd.to_datetime(df_mov.release_date)
111 |         movie_index, genre_index = df_mov[genres].values.nonzero()
112 |         genre_df = (
113 |             (
114 |                 pd.DataFrame(
115 |                     dict(
116 |                         movie_id=df_mov.movie_id.values[movie_index],
117 |                         genre=[genres[i] for i in genre_index],
118 |                     )
119 |                 )
120 |                 .groupby("movie_id")
121 |                 .genre.agg(lambda x: "|".join(x))
122 |             )
123 |             .reindex(df_mov.movie_id)
124 |             .fillna("")
125 |         )
126 |         df_mov["genres"] = genre_df.values
127 |         return df_mov
--------------------------------------------------------------------------------
/src/myfm/utils/benchmark_data/movielens10M_data.py:
--------------------------------------------------------------------------------
1 | from io import BytesIO
2 | from pathlib import Path
3 | 
4 | import pandas as pd
5 | 
6 | from .loader_base import MovieLensBase
7 | from .movielens1M_data import read_ml1m10m_df
8 | 
9 | 
10 | class MovieLens10MDataManager(MovieLensBase):
11 |     DOWNLOAD_URL = "http://files.grouplens.org/datasets/movielens/ml-10m.zip"
12 |     DEFAULT_PATH = Path("~/.ml-10m.zip").expanduser()
13 | 
14 |     def load_rating_all(self) -> pd.DataFrame:
15 |         with BytesIO(self.zf.read("ml-10M100K/ratings.dat")) as ifs:
16 |             return read_ml1m10m_df(ifs)
--------------------------------------------------------------------------------
/src/myfm/utils/benchmark_data/movielens1M_data.py:
--------------------------------------------------------------------------------
1 | from io import BytesIO
2 | from pathlib import Path
3 | 
4 | import pandas as pd
5 | 
6 | from .movielens100k_data import MovieLensBase
7 | 
8 | 
9 | def read_ml1m10m_df(ifs: BytesIO) -> pd.DataFrame:
10 |     r"""A hacky function to read the MovieLens 1M/10M rating files using pandas' native parser.
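    The raw files separate the four fields by "::", which pandas' C parser
    cannot treat as a single separator. Reading with sep=":" therefore yields
    empty odd-numbered columns, and keeping columns [0, 2, 4, 6] recovers the
    four fields; e.g. the line "1::1193::5::978300760" parses into the seven
    columns 1, NaN, 1193, NaN, 5, NaN, 978300760, of which [0, 2, 4, 6] are kept.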
11 | This hack is taken from irspack: https://github.com/tohtsky/irspack/blob/a1893be54200b0dc765957220deeccc1764fe39c/irspack/dataset/movielens/ML1M.py 12 | """ 13 | df = pd.read_csv( 14 | ifs, 15 | sep=":", 16 | header=None, 17 | )[[0, 2, 4, 6]].copy() 18 | 19 | df.columns = ["user_id", "movie_id", "rating", "timestamp"] 20 | df["timestamp"] = pd.to_datetime(df.timestamp, unit="s") 21 | return df 22 | 23 | 24 | class MovieLens1MDataManager(MovieLensBase): 25 | DOWNLOAD_URL = "http://files.grouplens.org/datasets/movielens/ml-1m.zip" 26 | DEFAULT_PATH = Path("~/.ml-1m.zip").expanduser() 27 | 28 | def load_rating_all(self) -> pd.DataFrame: 29 | """Read all (1M) interactions. 30 | 31 | Returns 32 | ------- 33 | pd.DataFrame 34 | Movielens 1M rating dataframe. 35 | """ 36 | with BytesIO(self.zf.read("ml-1m/ratings.dat")) as ifs: 37 | return read_ml1m10m_df(ifs) 38 | -------------------------------------------------------------------------------- /src/myfm/utils/callbacks/__init__.py: -------------------------------------------------------------------------------- 1 | from .libfm import ( 2 | ClassificationCallback, 3 | LibFMLikeCallbackBase, 4 | OrderedProbitCallback, 5 | RegressionCallback, 6 | ) 7 | 8 | __all__ = [ 9 | "LibFMLikeCallbackBase", 10 | "OrderedProbitCallback", 11 | "ClassificationCallback", 12 | "RegressionCallback", 13 | ] 14 | -------------------------------------------------------------------------------- /src/myfm/utils/callbacks/libfm.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from collections import OrderedDict 3 | from typing import Dict, List, Optional, Tuple 4 | 5 | import numpy as np 6 | import pandas as pd 7 | from scipy import sparse as sps 8 | 9 | from myfm._myfm import FM, FMHyperParameters, LearningHistory, RelationBlock 10 | from myfm.base import REAL, ArrayLike, check_data_consistency, std_cdf 11 | 12 | 13 | class LibFMLikeCallbackBase(ABC): 14 | def __init__( 15 | self, 16 | n_iter: int, 17 | X_test: Optional[ArrayLike], 18 | X_rel_test: List[RelationBlock], 19 | y_test: np.ndarray, 20 | trace_path: Optional[str] = None, 21 | ): 22 | """Provides a LibFM-like callback after each iteration. 
23 | This will be helpful when we cannot afford enough memory to store 24 | all posterior samples.""" 25 | self.n_test_data = check_data_consistency(X_test, X_rel_test) 26 | 27 | self.n_iter = n_iter 28 | if X_test is not None: 29 | self.X_test = sps.csr_matrix(X_test, dtype=REAL) 30 | else: 31 | self.X_test = sps.csr_matrix((self.n_test_data, 0), dtype=REAL) 32 | self.X_rel_test = X_rel_test 33 | self.y_test: np.ndarray = y_test 34 | self.result_trace: List[Dict[str, float]] = [] 35 | self.trace_path = trace_path 36 | self.n_samples = 0 37 | 38 | @abstractmethod 39 | def _measure_score( 40 | self, i: int, fm: FM, hyper: FMHyperParameters 41 | ) -> Tuple[str, Dict[str, float]]: 42 | raise NotImplementedError("must be implemented") 43 | 44 | def __call__( 45 | self, i: int, fm: FM, hyper: FMHyperParameters, history: LearningHistory 46 | ) -> Tuple[bool, Optional[str]]: 47 | description, trace_result = self._measure_score(i, fm, hyper) 48 | self.result_trace.append(trace_result) 49 | 50 | if self.trace_path is not None: 51 | df = pd.DataFrame(self.result_trace) 52 | df.to_csv(self.trace_path, index=False) 53 | 54 | return False, description 55 | 56 | 57 | class RegressionCallback(LibFMLikeCallbackBase): 58 | def __init__( 59 | self, 60 | n_iter: int, 61 | X_test: Optional[ArrayLike], 62 | y_test: np.ndarray, 63 | X_rel_test: List[RelationBlock] = [], 64 | clip_min: Optional[float] = None, 65 | clip_max: Optional[float] = None, 66 | trace_path: Optional[str] = None, 67 | ): 68 | super(RegressionCallback, self).__init__( 69 | n_iter, X_test, X_rel_test, y_test, trace_path=trace_path 70 | ) 71 | self.predictions = np.zeros((self.n_test_data,), dtype=np.float64) 72 | self.prediction_all_but_5 = np.zeros((self.n_test_data,), dtype=np.float64) 73 | self.clip_min = clip_min 74 | self.clip_max = clip_max 75 | 76 | def clip_value(self, arr: np.ndarray) -> None: 77 | if self.clip_min is not None: 78 | arr[arr <= self.clip_min] = self.clip_min 79 | if self.clip_max is not None: 80 | arr[arr >= self.clip_max] = self.clip_max 81 | 82 | def _measure_score( 83 | self, i: int, fm: FM, hyper: FMHyperParameters 84 | ) -> Tuple[str, Dict[str, float]]: 85 | score = fm.predict_score(self.X_test, self.X_rel_test) 86 | self.predictions += score 87 | self.n_samples += 1 88 | prediction_mean = self.predictions / self.n_samples 89 | self.clip_value(prediction_mean) 90 | if i >= 5: 91 | self.prediction_all_but_5 += score 92 | prediction_mean_all_but_5 = self.prediction_all_but_5 / (i + 1 - 5) 93 | self.clip_value(prediction_mean_all_but_5) 94 | rmse_all_but_5 = float( 95 | ((self.y_test - prediction_mean_all_but_5) ** 2).mean() ** 0.5 96 | ) 97 | else: 98 | rmse_all_but_5 = float("nan") 99 | 100 | rmse = float(((self.y_test - prediction_mean) ** 2).mean() ** 0.5) 101 | rmse_this = float(((self.y_test - score) ** 2).mean() ** 0.5) 102 | description = "alpha={0:.4f}, rmse_mean={1:.4f}, rmse_this={2:.4f}, rmse_all_but_5={3:.4f}".format( 103 | hyper.alpha, rmse, rmse_this, rmse_all_but_5 104 | ) 105 | result = OrderedDict( 106 | [ 107 | ("alpha", hyper.alpha), 108 | ("rmse", rmse), 109 | ("rmse_this", rmse_this), 110 | ("rmse_all_but_5", rmse_all_but_5), 111 | ] 112 | ) 113 | return description, result 114 | 115 | 116 | class ClassificationCallback(LibFMLikeCallbackBase): 117 | def __init__( 118 | self, 119 | n_iter: int, 120 | X_test: Optional[ArrayLike], 121 | y_test: np.ndarray, 122 | X_rel_test: List[RelationBlock] = [], 123 | eps: Optional[float] = 1e-15, 124 | trace_path: Optional[str] = None, 125 | ): 126 | 
super(ClassificationCallback, self).__init__( 127 | n_iter, X_test, X_rel_test, y_test, trace_path=trace_path 128 | ) 129 | self.predictions = np.zeros((self.n_test_data,), dtype=np.float64) 130 | self.prediction_all_but_5 = np.zeros((self.n_test_data,), dtype=np.float64) 131 | self.eps = eps 132 | 133 | def clip_value(self, arr: np.ndarray) -> None: 134 | if self.eps is not None: 135 | arr[arr <= self.eps] = self.eps 136 | arr[arr >= (1 - self.eps)] = 1 - self.eps 137 | 138 | def __log_loss(self, arr: np.ndarray) -> float: 139 | result = 0 140 | result += np.log(arr[self.y_test == 1]).sum() 141 | result += np.log(1 - arr[self.y_test == 0]).sum() 142 | return -result 143 | 144 | def __accuracy(self, arr: np.ndarray) -> float: 145 | return float((self.y_test == (arr >= 0.5)).mean()) 146 | 147 | def _measure_score( 148 | self, i: int, fm: FM, hyper: FMHyperParameters 149 | ) -> Tuple[str, Dict[str, float]]: 150 | prob_this = std_cdf(fm.predict_score(self.X_test, self.X_rel_test)) 151 | self.predictions += prob_this 152 | self.n_samples += 1 153 | prediction_mean = self.predictions / self.n_samples 154 | self.clip_value(prediction_mean) 155 | if i >= 5: 156 | self.prediction_all_but_5 += prob_this 157 | prediction_mean_all_but_5 = self.prediction_all_but_5 / (i + 1 - 5) 158 | self.clip_value(prediction_mean_all_but_5) 159 | ll_all_but_5 = self.__log_loss(prediction_mean_all_but_5) 160 | accuracy_all_but_5 = self.__accuracy(prediction_mean_all_but_5) 161 | else: 162 | ll_all_but_5 = float("nan") 163 | accuracy_all_but_5 = float("nan") 164 | 165 | ll = self.__log_loss(prediction_mean) 166 | accuracy = self.__accuracy(prediction_mean) 167 | ll_this = self.__log_loss(prob_this) 168 | accuracy_this = self.__accuracy(prob_this) 169 | description = "ll_mean={0:.4f}, ll_this={1:.4f}, ll_all_but_5={2:.4f}".format( 170 | ll, ll_this, ll_all_but_5 171 | ) 172 | result = OrderedDict( 173 | [ 174 | ("log_loss", ll), 175 | ("log_loss_this", ll_this), 176 | ("log_loss_all_but_5", ll_all_but_5), 177 | ("accuracy", accuracy), 178 | ("accuracy_this", accuracy_this), 179 | ("accuracy_all_but_5", accuracy_all_but_5), 180 | ] 181 | ) 182 | return description, result 183 | 184 | 185 | class OrderedProbitCallback(LibFMLikeCallbackBase): 186 | def __init__( 187 | self, 188 | n_iter: int, 189 | X_test: Optional[ArrayLike], 190 | y_test: np.ndarray, 191 | n_class: int, 192 | X_rel_test: List[RelationBlock] = [], 193 | eps: Optional[float] = 1e-15, 194 | trace_path: Optional[str] = None, 195 | ): 196 | super(OrderedProbitCallback, self).__init__( 197 | n_iter, X_test, X_rel_test, y_test, trace_path=trace_path 198 | ) 199 | self.predictions = np.zeros((self.n_test_data, n_class), dtype=np.float64) 200 | self.prediction_all_but_5 = np.zeros( 201 | (self.n_test_data, n_class), dtype=np.float64 202 | ) 203 | self.n_class = n_class 204 | self.eps = eps 205 | self.y_test = self.y_test.astype(np.int32) 206 | assert (self.y_test.min() >= 0) and (self.y_test.max() <= (self.n_class - 1)) 207 | 208 | def __log_loss(self, arr: np.ndarray) -> float: 209 | ps = arr[np.arange(self.y_test.shape[0]), self.y_test].copy() 210 | ps[ps <= self.eps] = self.eps 211 | return -float(np.log(ps).sum()) 212 | 213 | def __accuracy(self, arr: np.ndarray) -> float: 214 | return float((self.y_test == (arr.argmax(axis=1))).mean()) 215 | 216 | def __rmse(self, arr: np.ndarray) -> float: 217 | result: float = ( 218 | float(((self.y_test - arr.dot(np.arange(self.n_class))) ** 2).mean()) ** 0.5 219 | ) 220 | return result 221 | 222 | def _measure_score( 
223 | self, i: int, fm: FM, hyper: FMHyperParameters 224 | ) -> Tuple[str, Dict[str, float]]: 225 | prob_this = fm.oprobit_predict_proba(self.X_test, self.X_rel_test, 0) 226 | self.predictions += prob_this 227 | self.n_samples += 1 228 | prediction_mean = self.predictions / self.n_samples 229 | if i >= 5: 230 | self.prediction_all_but_5 += prob_this 231 | prediction_mean_all_but_5 = self.prediction_all_but_5 / (i + 1 - 5) 232 | ll_all_but_5 = self.__log_loss(prediction_mean_all_but_5) 233 | accuracy_all_but_5 = self.__accuracy(prediction_mean_all_but_5) 234 | rmse_all_but_5 = self.__rmse(prediction_mean_all_but_5) 235 | else: 236 | ll_all_but_5 = float("nan") 237 | accuracy_all_but_5 = float("nan") 238 | rmse_all_but_5 = float("nan") 239 | 240 | ll = self.__log_loss(prediction_mean) 241 | accuracy = self.__accuracy(prediction_mean) 242 | rmse = self.__rmse(prediction_mean) 243 | ll_this = self.__log_loss(prob_this) 244 | accuracy_this = self.__accuracy(prob_this) 245 | rmse_this = self.__rmse(prob_this) 246 | description = "ll_mean={0:.4f}, ll_this={1:.4f}, ll_all_but_5={2:.4f}".format( 247 | ll, ll_this, ll_all_but_5 248 | ) 249 | result = OrderedDict( 250 | [ 251 | ("log_loss", ll), 252 | ("log_loss_this", ll_this), 253 | ("log_loss_all_but_5", ll_all_but_5), 254 | ("accuracy", accuracy), 255 | ("accuracy_this", accuracy_this), 256 | ("accuracy_all_but_5", accuracy_all_but_5), 257 | ("rmse", rmse), 258 | ("rmse_this", rmse_this), 259 | ("rmse_all_but_5", rmse_all_but_5), 260 | ] 261 | ) 262 | return description, result 263 | -------------------------------------------------------------------------------- /src/myfm/utils/dummy_data.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple 2 | 3 | import numpy as np 4 | import pandas as pd 5 | from scipy import sparse as sps 6 | 7 | from myfm.base import DenseArray, RelationBlock 8 | 9 | 10 | def gen_dummy_rating_df( 11 | random_seed: int = 0, 12 | factor_rank: int = 3, 13 | size: int = 100, 14 | user_colname: str = "userId", 15 | item_colname: str = "itemId", 16 | timestamp_colname: str = "timestamp", 17 | rating_colname: str = "rating", 18 | ) -> pd.DataFrame: 19 | rns = np.random.RandomState(random_seed) 20 | user_indices_all = np.arange(max(int(size / 3), 10)) 21 | item_indices_all = np.arange(max(int(size / 2), 10)) 22 | user_factor = rns.normal( 23 | 0, 1 / factor_rank**0.5, size=(user_indices_all.shape[0], factor_rank) 24 | ) 25 | item_factor = rns.normal(0, 1, size=(item_indices_all.shape[0], factor_rank)) 26 | 27 | time = pd.Timestamp("2000-01-01") + pd.to_timedelta( 28 | rns.randint(-365, 365, size=size), unit="day" 29 | ) 30 | 31 | result_df = pd.DataFrame( 32 | { 33 | user_colname: rns.choice(user_indices_all, size=size, replace=True) + 1, 34 | item_colname: rns.choice(item_indices_all, size=size, replace=True) + 1, 35 | timestamp_colname: time, 36 | } 37 | ) 38 | score = ( 39 | user_factor[result_df[user_colname].values - 1, :] 40 | * item_factor[result_df[item_colname].values - 1, :] 41 | ).sum(axis=1) 42 | cutpoints: List[float] = list(np.percentile(score, [20, 40, 60, 80])) # type: ignore 43 | rating = np.ones((size,), dtype=np.int64) 44 | for cp in cutpoints: 45 | rating += score >= cp 46 | result_df[rating_colname] = rating 47 | return result_df 48 | 49 | 50 | def gen_dummy_X( 51 | random_seed: int = 0, 52 | factor_rank: int = 3, 53 | size: int = 100, 54 | ) -> Tuple[List[RelationBlock], DenseArray, List[int]]: 55 | user_column = "userId" 56 | item_column = 
"itemId" 57 | rating_column = "rating" 58 | df_ = gen_dummy_rating_df( 59 | random_seed, 60 | factor_rank=factor_rank, 61 | size=size, 62 | user_colname=user_column, 63 | item_colname=item_column, 64 | rating_colname=rating_column, 65 | ) 66 | blocks = [] 67 | shapes = [] 68 | for colname in [user_column, item_column]: 69 | categorical_expression = pd.Categorical(df_[colname]) 70 | X = sps.identity( 71 | len(categorical_expression.categories), dtype=np.float64 72 | ).tocsr() 73 | ind = categorical_expression.codes 74 | blocks.append(RelationBlock(ind, X)) 75 | shapes.append(X.shape[1]) 76 | return (blocks, df_[rating_column].values, shapes) 77 | 78 | 79 | __all__ = ["gen_dummy_rating_df"] 80 | -------------------------------------------------------------------------------- /src/myfm/utils/encoders/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import DataFrameEncoder 2 | from .binning import BinningEncoder 3 | from .categorical import CategoryValueToSparseEncoder 4 | from .multi_value import MultipleValuesToSparseEncoder 5 | 6 | __all__ = [ 7 | "DataFrameEncoder", 8 | "CategoryValueToSparseEncoder", 9 | "BinningEncoder", 10 | "MultipleValuesToSparseEncoder", 11 | ] 12 | -------------------------------------------------------------------------------- /src/myfm/utils/encoders/base.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from collections import OrderedDict 3 | from typing import Any, Dict, List 4 | 5 | import pandas as pd 6 | import scipy.sparse as sps 7 | 8 | 9 | class SparseEncoderBase(ABC): 10 | r"""The base class for encoders into sparse matrices.""" 11 | 12 | @abstractmethod 13 | def to_sparse(self, x: List[Any]) -> sps.csr_matrix: 14 | raise NotImplementedError("must be implemented") # pragma: no cover 15 | 16 | @abstractmethod 17 | def __len__(self) -> int: 18 | raise NotImplementedError("must be implemented") # pragma: no cover 19 | 20 | @abstractmethod 21 | def names(self) -> List[str]: 22 | r"""Description of each non-zero entry.""" 23 | raise NotImplementedError("must be implemented") # pragma: no cover 24 | 25 | 26 | class DataFrameEncoder: 27 | """Encode pandas.DataFrame into concatenated sparse matrices.""" 28 | 29 | def __init__(self) -> None: 30 | r"""Construct the encoders starting from empty one.""" 31 | self.col_encoders: Dict[str, SparseEncoderBase] = OrderedDict() 32 | 33 | def all_names(self) -> List[str]: 34 | return [ 35 | f"{col_name}__{description}" 36 | for col_name, encoder in self.col_encoders.items() 37 | for description in encoder.names() 38 | ] 39 | 40 | @property 41 | def encoder_shapes(self) -> List[int]: 42 | r"""Show how the columns for an encoded CSR matrix are organized. 43 | 44 | Returns 45 | ------- 46 | List[int] 47 | list of length of internal encoders. 48 | """ 49 | return [len(enc) for enc in self.col_encoders.values()] 50 | 51 | def add_column( 52 | self, colname: str, encoder: SparseEncoderBase 53 | ) -> "DataFrameEncoder": 54 | r"""Add a column name to be encoded / encoder pair. 55 | 56 | Parameters 57 | ---------- 58 | colname : str 59 | The column name to be encoded. 60 | encoder : SparseEncoderBase 61 | The corresponding encoder. 62 | """ 63 | self.col_encoders[colname] = encoder 64 | return self 65 | 66 | def encode_df(self, df: pd.DataFrame) -> sps.csr_matrix: 67 | r"""Encode the dataframe into a concatenated CSR matrix. 
68 | 
69 |         Parameters
70 |         ----------
71 |         df : pd.DataFrame
72 |             The source.
73 | 
74 |         Returns
75 |         -------
76 |         sps.csr_matrix
77 |             The result.
78 |         """
79 |         matrices: List[sps.csr_matrix] = []
80 |         for colname, encoder in self.col_encoders.items():
81 |             matrices.append(encoder.to_sparse(df[colname]))
82 | 
83 |         return sps.hstack(matrices, format="csr")
--------------------------------------------------------------------------------
/src/myfm/utils/encoders/binning.py:
--------------------------------------------------------------------------------
1 | from typing import TYPE_CHECKING, List, TypeVar
2 | 
3 | import numpy as np
4 | from scipy import sparse as sps
5 | 
6 | from myfm.base import DenseArray
7 | 
8 | from .base import SparseEncoderBase
9 | 
10 | if TYPE_CHECKING:
11 |     from numpy.typing import ArrayLike
12 | else:
13 |     ArrayLike = object
14 | 
15 | Numeric = TypeVar("Numeric", int, float)
16 | 
17 | 
18 | class BinningEncoder(SparseEncoderBase):
19 |     """The class to one-hot encode a List of numerical values into a sparse matrix representation by binning."""
20 | 
21 |     def __init__(self, x: ArrayLike, n_percentiles: int = 10) -> None:
22 |         """Initializes the encoder by computing the percentile values of the input.
23 | 
24 |         Parameters
25 |         ----------
26 |         x:
27 |             list of numerical values.
28 |         n_percentiles:
29 |             number of percentiles computed against x, by default 10.
30 | 
31 |         """
32 |         if n_percentiles <= 0:
33 |             raise ValueError("n_percentiles must be greater than 0.")
34 |         self.percentages = np.linspace(0, 100, n_percentiles + 2)[1:-1]
35 |         x_arr = np.asfarray(x)
36 |         temp_percentiles: DenseArray = np.percentile(
37 |             x_arr[~np.isnan(x_arr)], self.percentages
38 |         )
39 |         self.percentiles = np.unique(temp_percentiles)
40 | 
41 |     def names(self) -> List[str]:
42 |         return (
43 |             ["NaN"]
44 |             + [f"<={val}" for val in self.percentiles]
45 |             + [f">{self.percentiles[-1]}"]
46 |         )
47 | 
48 |     def to_sparse(self, x: ArrayLike) -> sps.csr_matrix:
49 |         x_array = np.asarray(x, dtype=np.float64)
50 |         N = x_array.shape[0]
51 |         non_na_index = ~np.isnan(x_array)
52 |         x_not_na = x_array[non_na_index]
53 |         cols = np.zeros(N, dtype=np.int64)
54 |         cols[non_na_index] += 1
55 |         for p in self.percentiles:
56 |             cols[non_na_index] += x_not_na > p
57 |         return sps.csr_matrix(
58 |             (np.ones(N, dtype=np.float64), (np.arange(N), cols)),
59 |             shape=(N, len(self)),
60 |         )
61 | 
62 |     def __len__(self) -> int:
63 |         return len(self.percentiles) + 2
--------------------------------------------------------------------------------
/src/myfm/utils/encoders/categorical.py:
--------------------------------------------------------------------------------
1 | from collections import Counter
2 | from typing import Dict, Generic, Iterable, List, Optional, TypeVar, Union
3 | 
4 | import numpy as np
5 | import scipy.sparse as sps
6 | from typing_extensions import Literal
7 | 
8 | from .base import SparseEncoderBase
9 | 
10 | T = TypeVar("T", int, float, str)
11 | 
12 | 
13 | class CategoryValueToSparseEncoder(Generic[T], SparseEncoderBase):
14 |     """The class to one-hot encode a List of items into a sparse matrix representation."""
15 | 
16 |     def __init__(
17 |         self,
18 |         items: Iterable[T],
19 |         min_freq: int = 1,
20 |         handle_unknown: Literal["create", "ignore", "raise"] = "create",
21 |     ):
22 |         r"""Construct the encoder by providing a list of items.
23 | 
24 |         Parameters
25 |         ----------
26 |         items : Iterable[T]
27 |             The items list.
28 |         min_freq : int, optional
29 |             The minimal frequency for an item to be retained in the known items list, by default 1
30 |         handle_unknown: Literal["create", "ignore", "raise"], optional
31 |             How to handle previously unseen values during encoding.
32 |             If "create", then there is a single category named "__UNK__" for unknown values,
33 |             and it is treated as the 0th category.
34 |             If "ignore", such an item will be ignored.
35 |             If "raise", a `KeyError` is raised.
36 |             Defaults to "create".
37 |         """
38 |         counter_ = Counter(items)
39 |         unique_items = sorted([x for x, freq in counter_.items() if freq >= min_freq])
40 |         self._item_index_offset = 1 if handle_unknown == "create" else 0
41 |         self.handle_unknown = handle_unknown
42 |         self._dict: Dict[T, int] = {
43 |             item: i + self._item_index_offset for i, item in enumerate(unique_items)
44 |         }
45 |         self.values: List[Union[str, T]] = []
46 |         if self.handle_unknown == "create":
47 |             self.values.append("__UNK__")
48 |         self.values.extend(unique_items)
49 | 
50 |     def _get_index(self, x: T) -> Optional[int]:
51 |         try:
52 |             return self._dict[x]
53 |         except KeyError:
54 |             if self.handle_unknown == "create":
55 |                 return 0
56 |             elif self.handle_unknown == "ignore":
57 |                 return None
58 |             raise
59 | 
60 |     def __getitem__(self, x: T) -> int:
61 |         result = self._get_index(x)
62 |         if result is None:
63 |             raise KeyError(f"{x} not found.")
64 |         return result
65 | 
66 |     def names(self) -> List[str]:
67 |         return [str(y) for y in self.values]
68 | 
69 |     def to_sparse(self, items: Iterable[T]) -> sps.csr_matrix:
70 |         rows = []
71 |         cols = []
72 |         n_row = 0
73 |         for i, x in enumerate(items):
74 |             n_row += 1
75 |             index = self._get_index(x)
76 |             if index is None:
77 |                 continue
78 |             rows.append(i)
79 |             cols.append(index)
80 |         return sps.csr_matrix(
81 |             (
82 |                 np.ones(len(rows), dtype=np.float64),
83 |                 (rows, cols),
84 |             ),
85 |             shape=(n_row, len(self)),
86 |         )
87 | 
88 |     def __len__(self) -> int:
89 |         return len(self._dict) + self._item_index_offset
--------------------------------------------------------------------------------
/src/myfm/utils/encoders/multi_value.py:
--------------------------------------------------------------------------------
1 | from typing import Iterable
2 | 
3 | import scipy.sparse as sps
4 | from typing_extensions import Literal
5 | 
6 | from .categorical import CategoryValueToSparseEncoder
7 | 
8 | 
9 | class MultipleValuesToSparseEncoder(CategoryValueToSparseEncoder[str]):
10 |     """The class to N-hot encode a List of items into a sparse matrix representation."""
11 | 
12 |     def __init__(
13 |         self,
14 |         items: Iterable[str],
15 |         min_freq: int = 1,
16 |         sep: str = ",",
17 |         normalize: bool = True,
18 |         handle_unknown: Literal["create", "ignore", "raise"] = "create",
19 |     ):
20 |         """Construct the encoder by providing a list of strings,
21 |         each of which is a list of strings concatenated by `sep`.
22 | 
23 |         Parameters
24 |         ----------
25 |         items : Iterable[str]
26 |             Iterable of strings, each of which is a concatenated list of possibly multiple items.
27 |         min_freq : int, optional
28 |             The minimal frequency for an item to be retained in the known items list, by default 1.
29 |         sep: str, optional
30 |             Tells how to split each string back into a list. Defaults to `','`.
31 |         normalize: bool, optional
32 |             If `True`, each non-zero entry in the encoded matrix will be `1 / N ** 0.5`,
33 |             where `N` is the number of non-zero entries in that row. Defaults to `True`.
34 |         handle_unknown: Literal["create", "ignore", "raise"], optional
35 |             How to handle previously unseen values during encoding.
36 |             If "create", then there is a single category named "__UNK__" for unknown values,
37 |             and it is treated as the 0th category.
38 |             If "ignore", such an item will be ignored.
39 |             If "raise", a `KeyError` is raised.
40 |             Defaults to "create".
41 |         """
42 |         items_flatten = [
43 |             y for x in items for y in set(x.split(sep)) if y
44 |         ]  # ignore empty string.
45 |         self.sep = sep
46 |         self.normalize = normalize
47 |         super().__init__(
48 |             items_flatten, min_freq=min_freq, handle_unknown=handle_unknown
49 |         )
50 | 
51 |     def to_sparse(self, items: Iterable[str]) -> sps.csr_matrix:
52 |         indptr = [0]
53 |         indices = []
54 |         data = []
55 |         n_row = 0
56 |         cursor = 0
57 |         for row in items:
58 |             n_row += 1
59 |             row_values = row.split(self.sep)  # avoid shadowing the `items` argument
60 |             indices_local = sorted(
61 |                 list(
62 |                     {
63 |                         index
64 |                         for index in [self._get_index(v) for v in row_values if v]
65 |                         if index is not None
66 |                     }
67 |                 )
68 |             )
69 | 
70 |             if not indices_local:
71 |                 indptr.append(cursor)
72 |                 continue
73 |             n = len(indices_local)
74 |             value = 1.0 / (float(n) ** 0.5) if self.normalize else 1.0
75 |             indices.extend(indices_local)
76 |             data.extend([value] * n)
77 |             cursor += n
78 |             indptr.append(cursor)
79 |         return sps.csr_matrix(
80 |             (data, indices, indptr),
81 |             shape=(n_row, len(self)),
82 |         )
--------------------------------------------------------------------------------
/src/myfm/variational.py:
--------------------------------------------------------------------------------
1 | from typing import Callable, List, Optional, Tuple, TypeVar
2 | 
3 | import numpy as np
4 | import scipy.sparse as sps
5 | 
6 | from ._myfm import (
7 |     ConfigBuilder,
8 |     FMLearningConfig,
9 |     RelationBlock,
10 |     VariationalFM,
11 |     VariationalFMHyperParameters,
12 |     VariationalLearningHistory,
13 |     VariationalPredictor,
14 |     create_train_vfm,
15 | )
16 | from .base import (
17 |     REAL,
18 |     ArrayLike,
19 |     ClassifierMixin,
20 |     MyFMBase,
21 |     RegressorMixin,
22 |     check_data_consistency,
23 | )
24 | 
25 | ArrayOrDenseArray = TypeVar("ArrayOrDenseArray", np.ndarray, float)
26 | 
27 | 
28 | def runtime_error_to_optional(
29 |     fm: "MyFMVariationalBase",
30 |     retrieve_method: Callable[[VariationalFM], ArrayOrDenseArray],
31 | ) -> Optional[ArrayOrDenseArray]:
32 |     try:
33 |         predictor = fm._fetch_predictor()
34 |     except RuntimeError:  # the model has not been fit yet
35 |         return None
36 |     weights = predictor.weights()
37 |     return retrieve_method(weights)
38 | 
39 | 
40 | class MyFMVariationalBase(
41 |     MyFMBase[
42 |         VariationalFM,
43 |         VariationalFMHyperParameters,
44 |         VariationalPredictor,
45 |         VariationalLearningHistory,
46 |     ]
47 | ):
48 |     @property
49 |     def w0_mean(self) -> Optional[float]:
50 |         r"""Mean of variational posterior distribution of global bias `w0`.
51 |         If the model is not fit yet, returns `None`.
52 | 
53 |         Returns:
54 |             Mean of variational posterior distribution of global bias `w0`.
55 |         """
56 | 
57 |         def _retrieve(fm: VariationalFM) -> float:
58 |             return fm.w0
59 | 
60 |         return runtime_error_to_optional(self, _retrieve)
61 | 
62 |     @property
63 |     def w0_var(self) -> Optional[float]:
64 |         r"""Variance of variational posterior distribution of global bias `w0`.
65 |         If the model is not fit yet, returns `None`.
66 | 
67 |         Returns:
68 |             Variance of variational posterior distribution of global bias `w0`.
69 | """ 70 | 71 | def _retrieve(fm: VariationalFM) -> float: 72 | return fm.w0_var 73 | 74 | return runtime_error_to_optional(self, _retrieve) 75 | 76 | @property 77 | def w_mean(self) -> Optional[np.ndarray]: 78 | r"""Mean of variational posterior distribution of linear coefficnent `w`. 79 | If the model is not fit yet, returns `None`. 80 | 81 | Returns: 82 | Mean of variational posterior distribution of linear coefficnent `w`. 83 | """ 84 | 85 | def _retrieve(fm: VariationalFM) -> np.ndarray: 86 | return fm.w 87 | 88 | return runtime_error_to_optional(self, _retrieve) 89 | 90 | @property 91 | def w_var(self) -> Optional[np.ndarray]: 92 | r"""Variance of variational posterior distribution of linear coefficnent `w`. 93 | If the model is not fit yet, returns `None`. 94 | 95 | Returns: 96 | Variance of variational posterior distribution of linear coefficnent `w`. 97 | """ 98 | 99 | def _retrieve(fm: VariationalFM) -> np.ndarray: 100 | return fm.w_var 101 | 102 | return runtime_error_to_optional(self, _retrieve) 103 | 104 | @property 105 | def V_mean(self) -> Optional[np.ndarray]: 106 | r"""Mean of variational posterior distribution of factorized quadratic coefficnent `V`. 107 | If the model is not fit yet, returns `None`. 108 | 109 | Returns: 110 | Mean of variational posterior distribution of factorized quadratic coefficient `V`. 111 | """ 112 | 113 | def _retrieve(fm: VariationalFM) -> np.ndarray: 114 | return fm.V 115 | 116 | return runtime_error_to_optional(self, _retrieve) 117 | 118 | @property 119 | def V_var(self) -> Optional[np.ndarray]: 120 | r"""Variance of variational posterior distribution of factorized quadratic coefficnent `V`. 121 | If the model is not fit yet, returns `None`. 122 | 123 | Returns: 124 | Variance of variational posterior distribution of factorized quadratic coefficient `V`. 
125 | """ 126 | 127 | def _retrieve(fm: VariationalFM) -> np.ndarray: 128 | return fm.V_var 129 | 130 | return runtime_error_to_optional(self, _retrieve) 131 | 132 | @classmethod 133 | def _train_core( 134 | cls, 135 | rank: int, 136 | init_stdev: float, 137 | X: sps.csr_matrix, 138 | X_rel: List[RelationBlock], 139 | y: np.ndarray, 140 | random_seed: int, 141 | config: FMLearningConfig, 142 | callback: Callable[ 143 | [ 144 | int, 145 | VariationalFM, 146 | VariationalFMHyperParameters, 147 | VariationalLearningHistory, 148 | ], 149 | bool, 150 | ], 151 | ) -> Tuple[VariationalPredictor, VariationalLearningHistory]: 152 | return create_train_vfm( 153 | rank, init_stdev, X, X_rel, y, random_seed, config, callback 154 | ) 155 | 156 | def _predict_core( 157 | self, 158 | X: Optional[ArrayLike], 159 | X_rel: List[RelationBlock] = [], 160 | ) -> np.ndarray: 161 | predictor = self._fetch_predictor() 162 | shape = check_data_consistency(X, X_rel) 163 | if X is None: 164 | X = sps.csr_matrix((shape, 0), dtype=REAL) 165 | else: 166 | X = sps.csr_matrix(X) 167 | return predictor.predict(X, X_rel) 168 | 169 | 170 | class VariationalFMRegressor( 171 | RegressorMixin[VariationalFM, VariationalFMHyperParameters], 172 | MyFMVariationalBase, 173 | ): 174 | """Variational Inference for Regression Task.""" 175 | 176 | def fit( 177 | self, 178 | X: ArrayLike, 179 | y: np.ndarray, 180 | X_rel: List[RelationBlock] = [], 181 | X_test: Optional[ArrayLike] = None, 182 | y_test: Optional[np.ndarray] = None, 183 | X_rel_test: List[RelationBlock] = [], 184 | n_iter: int = 100, 185 | grouping: Optional[List[int]] = None, 186 | group_shapes: Optional[List[int]] = None, 187 | callback: Optional[ 188 | Callable[ 189 | [ 190 | int, 191 | VariationalFM, 192 | VariationalFMHyperParameters, 193 | VariationalLearningHistory, 194 | ], 195 | Tuple[bool, Optional[str]], 196 | ] 197 | ] = None, 198 | config_builder: Optional[ConfigBuilder] = None, 199 | ) -> "VariationalFMRegressor": 200 | r"""Performs batch variational inference fit the data. 201 | 202 | Parameters 203 | ---------- 204 | X : 2D array-like. 205 | Input variable. 206 | 207 | y : 1D array-like. 208 | Target variable. 209 | 210 | X_rel: list of RelationBlock, optional (default=[]) 211 | Relation blocks which supplements X. 212 | 213 | n_iter : int, optional (default = 100) 214 | Iterations to perform. 215 | 216 | grouping: Integer List, optional (default = None) 217 | If not `None`, this specifies which column of X belongs to which group. 218 | That is, if grouping[i] is g, then, :math:`w_i` and :math:`V_{i, r}` 219 | will be distributed according to 220 | :math:`\mathcal{N}(\mu_w[g], \lambda_w[g])` and :math:`\mathcal{N}(\mu_V[g, r], \lambda_V[g,r])`, 221 | respectively. 222 | If `None`, all the columns of X are assumed to belong to a single group, 0. 223 | 224 | group_shapes: Integer array, optional (default = None) 225 | If not `None`, this specifies each variable group's size. 226 | Ignored if grouping is not None. 227 | For example, if ``group_shapes = [n_1, n_2]``, 228 | this is equivalent to ``grouping = [0] * n_1 + [1] * n_2`` 229 | 230 | callback: function(int, fm, hyper, history) -> bool, optional(default = None) 231 | Called at the every end of each Gibbs iteration. 
232 | """ 233 | self._fit( 234 | X, 235 | y, 236 | X_rel=X_rel, 237 | X_test=X_test, 238 | X_rel_test=X_rel_test, 239 | y_test=y_test, 240 | n_iter=n_iter, 241 | grouping=grouping, 242 | callback=callback, 243 | group_shapes=group_shapes, 244 | config_builder=config_builder, 245 | ) 246 | return self 247 | 248 | def predict( 249 | self, X: Optional[ArrayLike], X_rel: List[RelationBlock] = [] 250 | ) -> np.ndarray: 251 | r"""Make a prediction based on variational mean. 252 | 253 | Parameters 254 | ---------- 255 | X : Optional[ArrayLike] 256 | Main Table. When None, treated as a matrix without columns. 257 | X_rel : List[RelationBlock], optional 258 | Relations, by default [] 259 | 260 | Returns 261 | ------- 262 | np.ndarray 263 | [description] 264 | """ 265 | return self._predict_core(X, X_rel) 266 | 267 | 268 | class VariationalFMClassifier( 269 | ClassifierMixin[VariationalFM, VariationalFMHyperParameters], 270 | MyFMVariationalBase, 271 | ): 272 | """Variational Inference for Classification Task.""" 273 | 274 | def fit( 275 | self, 276 | X: ArrayLike, 277 | y: np.ndarray, 278 | X_rel: List[RelationBlock] = [], 279 | X_test: Optional[ArrayLike] = None, 280 | y_test: Optional[np.ndarray] = None, 281 | X_rel_test: List[RelationBlock] = [], 282 | n_iter: int = 100, 283 | grouping: Optional[List[int]] = None, 284 | group_shapes: Optional[List[int]] = None, 285 | callback: Optional[ 286 | Callable[ 287 | [ 288 | int, 289 | VariationalFM, 290 | VariationalFMHyperParameters, 291 | VariationalLearningHistory, 292 | ], 293 | Tuple[bool, Optional[str]], 294 | ] 295 | ] = None, 296 | config_builder: Optional[ConfigBuilder] = None, 297 | ) -> "VariationalFMClassifier": 298 | r"""Performs batch variational inference fit the data. 299 | 300 | Parameters 301 | ---------- 302 | X : Optional[ArrayLike]. 303 | Main table. When None, treated as a matrix without columns. 304 | 305 | y : 1D array-like. 306 | Target variable. 307 | 308 | X_rel: list of RelationBlock, optional (default=[]) 309 | Relation blocks which supplements X. 310 | 311 | n_iter : int, optional (default = 100) 312 | Iterations to perform. 313 | 314 | grouping: Integer List, optional (default = None) 315 | If not `None`, this specifies which column of X belongs to which group. 316 | That is, if grouping[i] is g, then, :math:`w_i` and :math:`V_{i, r}` 317 | will be distributed according to 318 | :math:`\mathcal{N}(\mu_w[g], \lambda_w[g])` and :math:`\mathcal{N}(\mu_V[g, r], \lambda_V[g,r])`, 319 | respectively. 320 | If `None`, all the columns of X are assumed to belong to a single group, 0. 321 | 322 | group_shapes: Integer array, optional (default = None) 323 | If not `None`, this specifies each variable group's size. 324 | Ignored if grouping is not None. 325 | For example, if ``group_shapes = [n_1, n_2]``, 326 | this is equivalent to ``grouping = [0] * n_1 + [1] * n_2`` 327 | 328 | callback: function(int, fm, hyper) -> bool, optional(default = None) 329 | Called at the every end of each Gibbs iteration. 330 | """ 331 | self._fit( 332 | X, 333 | y, 334 | X_rel=X_rel, 335 | X_test=X_test, 336 | X_rel_test=X_rel_test, 337 | y_test=y_test, 338 | n_iter=n_iter, 339 | grouping=grouping, 340 | callback=callback, 341 | group_shapes=group_shapes, 342 | config_builder=config_builder, 343 | ) 344 | return self 345 | 346 | def predict( 347 | self, X: Optional[ArrayLike], X_rel: List[RelationBlock] = [] 348 | ) -> np.ndarray: 349 | r"""Based on the class probability, return binary classified outcome based on threshold = 0.5. 
346 |     def predict(
347 |         self, X: Optional[ArrayLike], X_rel: List[RelationBlock] = []
348 |     ) -> np.ndarray:
349 |         r"""Returns the binary outcome obtained by thresholding the class probability at 0.5.
350 |         If you want the class probability instead, use the `predict_proba` method.
351 |
352 |         Parameters
353 |         ----------
354 |         X : Optional[ArrayLike]
355 |             Main table. When None, treated as a matrix without columns.
356 |         X_rel : List[RelationBlock], optional
357 |             Relations, by default []
358 |
359 |         Returns
360 |         -------
361 |         np.ndarray
362 |             0/1 predictions based on the probability.
363 |         """
364 |         return self.predict_proba(X, X_rel) > 0.5
365 |
366 |     def predict_proba(
367 |         self, X: Optional[ArrayLike], X_rel: List[RelationBlock] = []
368 |     ) -> np.ndarray:
369 |         r"""Compute the probability that the outcome will be 1 based on the variational mean.
370 |
371 |         Parameters
372 |         ----------
373 |         X : Optional[ArrayLike]
374 |             Main table. When None, treated as a matrix without columns.
375 |         X_rel : List[RelationBlock], optional
376 |             Relations, by default []
377 |
378 |         Returns
379 |         -------
380 |         np.ndarray
381 |             The probability that the outcome is 1.
382 |         """
383 |         return self._predict_core(X, X_rel)
384 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tohtsky/myFM/b9ba70ea38d9370d3ad50a9d25b2ff825eaa30ef/tests/__init__.py
--------------------------------------------------------------------------------
/tests/classification/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tohtsky/myFM/b9ba70ea38d9370d3ad50a9d25b2ff825eaa30ef/tests/classification/__init__.py
--------------------------------------------------------------------------------
/tests/classification/test_classification.py:
--------------------------------------------------------------------------------
1 | from typing import Tuple
2 |
3 | import numpy as np
4 | import pytest
5 | from scipy import sparse as sps
6 |
7 | from myfm import MyFMGibbsClassifier, VariationalFMClassifier
8 | from myfm.utils.callbacks import ClassificationCallback
9 |
10 | from ..test_utils import FMWeights
11 |
12 |
13 | @pytest.mark.parametrize("use_libfm_callback", [True, False])
14 | def test_middle_clf(
15 |     middle_data: Tuple[sps.csr_matrix, np.ndarray],
16 |     stub_weight: FMWeights,
17 |     use_libfm_callback: bool,
18 | ) -> None:
19 |     rns = np.random.RandomState(0)
20 |     X, score = middle_data
21 |     score_noised = score + rns.normal(0, 1, size=score.shape)
22 |     score_noised -= score_noised.mean()
23 |     y = score_noised > 0
24 |     if use_libfm_callback:
25 |         callback = ClassificationCallback(200, X, y)
26 |     else:
27 |         callback = None
28 |
29 |     fm = MyFMGibbsClassifier(3).fit(
30 |         X, y, X_test=X, y_test=y, n_iter=200, n_kept_samples=200, callback=callback
31 |     )
32 |     if use_libfm_callback:
33 |         np.testing.assert_allclose(fm.predict_proba(X), callback.predictions / 200)
34 |
35 |     vfm_before_fit = VariationalFMClassifier(3)
36 |     assert vfm_before_fit.w0_mean is None
37 |     assert vfm_before_fit.w0_var is None
38 |     assert vfm_before_fit.w_mean is None
39 |     assert vfm_before_fit.w_var is None
40 |     assert vfm_before_fit.V_mean is None
41 |     assert vfm_before_fit.V_var is None
42 |
43 |     vfm = vfm_before_fit.fit(
44 |         X, y, X_test=X, y_test=y, n_iter=200  # , n_kept_samples=50
45 |     )
46 |
47 |     assert vfm.w0_mean is not None
48 |     assert vfm.w0_var is not None
49 |     assert vfm.w_mean is not None
50 |     assert vfm.w_var is not None
51 |     assert vfm.V_mean is not None
52 |     assert vfm.V_var is not None
53 |
54 |     assert fm.predictor_ is not None
55 |
56 |
last_samples = fm.predictor_.samples[-20:] 57 | 58 | for i in range(3): 59 | for j in range(i + 1, 3): 60 | cross_term = stub_weight.factors[:, i].dot(stub_weight.factors[:, j]) 61 | if abs(cross_term) < 0.5: 62 | continue 63 | sign = cross_term / abs(cross_term) 64 | assert vfm.V_mean[i].dot(vfm.V_mean[j]) > sign * cross_term * 0.8 65 | assert vfm.V_mean[i].dot(vfm.V_mean[j]) < sign * cross_term * 1.2 66 | 67 | for s in last_samples: 68 | sample_cross_term = s.V[i].dot(s.V[j]) 69 | assert sample_cross_term > sign * cross_term * 0.5 70 | assert sample_cross_term < sign * cross_term * 2 71 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple 2 | 3 | import numpy as np 4 | import pytest 5 | import scipy.sparse as sps 6 | 7 | from .test_utils import FMWeights, prediction 8 | 9 | N_FEATURES = 3 10 | N_LATENT = 4 11 | 12 | 13 | @pytest.fixture 14 | def stub_weight() -> FMWeights: 15 | weights = FMWeights( 16 | -3.0, 17 | np.asfarray([1.0, 2.0, -1.0]), 18 | np.asfarray( 19 | [[1.0, -1.0, 0], [0.0, 1.0, 1.0], [1.0, 1.0, 1.0], [-1.0, 0, -1.0]] 20 | ), 21 | ) 22 | return weights 23 | 24 | 25 | def create_data( 26 | n_train: int, stub_weight: FMWeights 27 | ) -> Tuple[sps.csr_matrix, np.ndarray]: 28 | rns = np.random.RandomState(0) 29 | rows: List[int] = [] 30 | cols: List[int] = [] 31 | data: List[float] = [] 32 | for row in range(n_train): 33 | indices = np.where(rns.random(N_FEATURES) > 0.5)[0] 34 | for ind in indices: 35 | rows.append(row) 36 | cols.append(ind) 37 | data.append(float(rns.choice([-2, -1, 1, 2]))) 38 | X = sps.csr_matrix((data, (rows, cols))) 39 | p = prediction(X, weight=stub_weight) 40 | return X, p 41 | 42 | 43 | @pytest.fixture 44 | def middle_data(stub_weight: FMWeights) -> Tuple[sps.csr_matrix, np.ndarray]: 45 | return create_data(1000, stub_weight) 46 | -------------------------------------------------------------------------------- /tests/dataset/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tohtsky/myFM/b9ba70ea38d9370d3ad50a9d25b2ff825eaa30ef/tests/dataset/__init__.py -------------------------------------------------------------------------------- /tests/dataset/test_ml100k.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from pytest_mock import MockerFixture 4 | 5 | from myfm.utils.benchmark_data import MovieLens100kDataManager 6 | 7 | 8 | def test_ml100k(mocker: MockerFixture) -> None: 9 | mocker.patch("builtins.input", return_value="y") 10 | dm = MovieLens100kDataManager() 11 | unique_key_pair = ["user_id", "movie_id"] 12 | 13 | df_all_recovered = dm.load_rating_all().sort_values(unique_key_pair) 14 | 15 | user_infos = dm.load_user_info() 16 | assert np.all(df_all_recovered["user_id"].isin(user_infos["user_id"])) 17 | assert np.all(user_infos["age"] >= 0) 18 | assert np.all(user_infos["gender"].isin(["M", "F"]).values) 19 | 20 | movie_infos = dm.load_movie_info() 21 | genres = dm.genres() 22 | for genre_concat in movie_infos["genres"]: 23 | for genre in genre_concat.split("|"): 24 | assert genre in genres 25 | 26 | for k in [2, 3]: 27 | df_train, df_test = dm.load_rating_predefined_split(k) 28 | df_reconcat = pd.concat([df_train, df_test]).sort_values(unique_key_pair) 29 | for key in ["user_id", "movie_id", "timestamp"]: 30 | assert 
np.all(df_all_recovered[key].values == df_reconcat[key].values) 31 | 32 | N_manual_fold = 7 33 | df_tests = [] 34 | for i in range(N_manual_fold): 35 | df_tr, df_te = dm.load_rating_kfold_split(N_manual_fold, i) 36 | assert ( 37 | pd.concat([df_tr, df_te]).drop_duplicates(unique_key_pair).shape[0] 38 | == df_all_recovered.shape[0] 39 | ) 40 | assert df_tr.shape[0] + df_te.shape[0] == df_all_recovered.shape[0] 41 | test_size = df_all_recovered.shape[0] // N_manual_fold 42 | assert df_te.shape[0] in {test_size, test_size + 1} 43 | df_tests.append(df_te) 44 | df_tests_concat = pd.concat(df_tests) 45 | assert df_tests_concat.shape[0] == df_all_recovered.shape[0] 46 | assert ( 47 | df_tests_concat.drop_duplicates(unique_key_pair).shape[0] 48 | == df_all_recovered.shape[0] 49 | ) 50 | -------------------------------------------------------------------------------- /tests/dataset/test_ml1m.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from pathlib import Path 3 | from tempfile import TemporaryDirectory 4 | from zipfile import ZipFile 5 | 6 | import numpy as np 7 | import pytest 8 | from pytest_mock import MockerFixture 9 | 10 | from myfm.utils.benchmark_data import MovieLens1MDataManager 11 | from myfm.utils.dummy_data import gen_dummy_rating_df 12 | 13 | 14 | def test_ml1m(mocker: MockerFixture) -> None: 15 | if sys.platform == "win32": 16 | pytest.skip("Skip on Windows.") 17 | dummy_df = gen_dummy_rating_df(user_colname="user_id", item_colname="movie_id") 18 | dummy_df["timestamp"] = (dummy_df["timestamp"].view(np.int64) / 1e9).astype( 19 | np.int64 20 | ) 21 | with TemporaryDirectory() as temp_dir: 22 | target = Path(temp_dir) / "ml1m.zip" 23 | mocker.patch("builtins.input", return_value="NOO") 24 | with pytest.raises(RuntimeError): 25 | dm = MovieLens1MDataManager(target) 26 | df_stringified = "\n".join( 27 | [ 28 | "::".join([str(v) for v in row]) 29 | for row in dummy_df[ 30 | ["user_id", "movie_id", "rating", "timestamp"] 31 | ].values 32 | ] 33 | ) 34 | with ZipFile(target, "w") as zf: 35 | zf.writestr("ml-1m/ratings.dat", df_stringified) 36 | dm = MovieLens1MDataManager(target) 37 | unique_key_pair = ["user_id", "movie_id", "rating"] 38 | df_all_recovered = dm.load_rating_all() 39 | for key in unique_key_pair: 40 | assert np.all(df_all_recovered[key] == dummy_df[key]) 41 | -------------------------------------------------------------------------------- /tests/oprobit/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tohtsky/myFM/b9ba70ea38d9370d3ad50a9d25b2ff825eaa30ef/tests/oprobit/__init__.py -------------------------------------------------------------------------------- /tests/oprobit/test_oprobit_1dim.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | 4 | from myfm import MyFMOrderedProbit 5 | from myfm.base import std_cdf 6 | from myfm.utils.callbacks import OrderedProbitCallback 7 | 8 | 9 | @pytest.mark.parametrize("use_libfm_callback", [True, False]) 10 | def test_oprobit(use_libfm_callback: bool) -> None: 11 | N_train = 1000 12 | cps = np.asfarray([0.0, 0.5, 1.5]) 13 | rns = np.random.RandomState(0) 14 | X = rns.normal(0, 2, size=N_train) 15 | coeff = 0.5 16 | y = np.zeros(N_train, dtype=np.float64) 17 | score = X * coeff + rns.randn(N_train) 18 | for cp_value in cps: 19 | y += (score > cp_value).astype(np.int64) 20 | if use_libfm_callback: 21 | callback = 
OrderedProbitCallback(100, X_test=X[:, None], y_test=y, n_class=4)
22 |     else:
23 |         callback = None
24 |     fm = MyFMOrderedProbit(0, fit_w0=False)
25 |     fm.fit(
26 |         X[:, None],
27 |         y,
28 |         callback=callback,
29 |         n_iter=100,
30 |         n_kept_samples=100,
31 |     )
32 |
33 |     assert fm.predictor_ is not None
34 |     for cutpoint_sample in fm.cutpoint_samples[-10:]:
35 |         cp_1, cp_2, cp_3 = cutpoint_sample
36 |         assert abs(cp_1) < 0.25
37 |         assert abs(cp_2 - cp_1 - 0.5) < 0.25
38 |         assert abs(cp_3 - cp_1 - 1.5) < 0.25
39 |
40 |     p_using_core = fm.predict_proba(X[:, None])
41 |
42 |     if use_libfm_callback:
43 |         assert callback is not None
44 |         np.testing.assert_allclose(callback.predictions / 100, p_using_core)
45 |     result_manual = np.zeros((X.shape[0], 4))
46 |
47 |     n_ = 0
48 |     for sample in fm.predictor_.samples:
49 |         n_ += 1
50 |         score = sample.predict_score(X[:, None], [])
51 |         cdf = std_cdf((sample.cutpoints[0][np.newaxis, :] - score[:, np.newaxis]))
52 |         diff = np.hstack(
53 |             [
54 |                 np.zeros((score.shape[0], 1)),
55 |                 cdf,
56 |                 np.ones((score.shape[0], 1)),
57 |             ]
58 |         )
59 |         result_manual += diff[:, 1:] - diff[:, :-1]
60 |     result_manual /= n_
61 |     np.testing.assert_allclose(result_manual, p_using_core)
62 |
--------------------------------------------------------------------------------
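Editor's note (not part of the repository): the manual loop at the end of the
test above reconstructs what `predict_proba` averages over the kept samples.
For cutpoints c_1 < ... < c_K and a per-sample score s(x), each sample
contributes

    P(y = k | x) = \Phi(c_{k+1} - s(x)) - \Phi(c_k - s(x)),   with c_0 = -\infty, c_{K+1} = +\infty,

where \Phi is the standard normal CDF (`std_cdf`); the `np.hstack` of a zero
column, the CDF values, and a one column implements exactly these boundary
conventions before the adjacent differences are taken.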
/tests/regression/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tohtsky/myFM/b9ba70ea38d9370d3ad50a9d25b2ff825eaa30ef/tests/regression/__init__.py
--------------------------------------------------------------------------------
/tests/regression/test_block.py:
--------------------------------------------------------------------------------
1 | import pickle
2 | import tempfile
3 |
4 | import numpy as np
5 | from scipy import sparse as sps
6 |
7 | from myfm import MyFMRegressor, RelationBlock, VariationalFMRegressor
8 |
9 |
10 | def test_block_vfm() -> None:
11 |     N_train = 1000
12 |     rns = np.random.RandomState(1)
13 |     user_block = sps.csr_matrix(
14 |         [[1, 0, 1], [0, 1, 1], [1, 1, 0]],
15 |         dtype=np.float64,
16 |     )
17 |     user_indices = rns.randint(0, user_block.shape[0], size=N_train)
18 |     item_block = sps.csr_matrix(
19 |         [
20 |             [1, 0, 0, 1],
21 |             [0, 1, 1, 0],
22 |         ],
23 |         dtype=np.float64,
24 |     )
25 |     item_indices = rns.randint(0, item_block.shape[0], size=N_train)
26 |     tm_column = rns.randn(N_train, 1)
27 |
28 |     X_flatten = sps.hstack(
29 |         [tm_column, user_block[user_indices], item_block[item_indices]]
30 |     )
31 |     X_flatten_squared = X_flatten.copy()
32 |     X_flatten_squared.data = X_flatten_squared.data**2
33 |     factor = rns.randn(X_flatten.shape[1], 3)
34 |     f2 = (factor**2).sum(axis=1)
35 |     Xf = X_flatten.dot(factor)
36 |
37 |     gb = 3.0
38 |     linear_weights = rns.randn(X_flatten.shape[1])
39 |     y = (
40 |         gb
41 |         + X_flatten.dot(linear_weights)
42 |         + 0.5 * ((Xf**2).sum(axis=1) - X_flatten_squared.dot(f2))
43 |         + rns.normal(1.0, size=X_flatten.shape[0])
44 |     )
45 |
46 |     blocks = [
47 |         RelationBlock(user_indices, user_block),
48 |         RelationBlock(item_indices, item_block),
49 |     ]
50 |     with tempfile.TemporaryFile() as temp_fs:
51 |         pickle.dump(blocks, temp_fs)
52 |         del blocks
53 |         temp_fs.seek(0)
54 |         blocks = pickle.load(temp_fs)
55 |     fm_flatten = VariationalFMRegressor(3).fit(
56 |         X_flatten,
57 |         y,
58 |         n_iter=100,
59 |     )
60 |     fm_blocked_serialized = VariationalFMRegressor(3).fit(
61 |         tm_column,
62 |         y,
63 |         blocks,
64 |         n_iter=100,
65 |     )
66 |
67 |     with tempfile.TemporaryFile() as temp_fs:
68 |         pickle.dump(fm_blocked_serialized, temp_fs)
69 |         del fm_blocked_serialized
70 |         temp_fs.seek(0)
71 |         fm_blocked: VariationalFMRegressor = pickle.load(temp_fs)
72 |
73 |     np.testing.assert_allclose(fm_flatten.w_mean, fm_blocked.w_mean)
74 |     np.testing.assert_allclose(fm_flatten.V_mean, fm_blocked.V_mean)
75 |     prediction_flatten = fm_flatten.predict(tm_column, blocks)
76 |     prediction_blocked = fm_blocked.predict(X_flatten)
77 |     np.testing.assert_allclose(prediction_flatten, prediction_blocked)
78 |
79 |
80 | def test_block() -> None:
81 |     rns = np.random.RandomState(0)
82 |     N_train = 100
83 |     user_block = sps.csr_matrix(
84 |         [
85 |             [1, 0, 0],
86 |             [0, 1, 0],
87 |             [0, 0, 1],
88 |         ],
89 |         dtype=np.float64,
90 |     )
91 |     user_indices = rns.randint(0, user_block.shape[0], size=N_train)
92 |     item_block = sps.csr_matrix(
93 |         [
94 |             [1, 0],
95 |             [0, 1],
96 |         ],
97 |         dtype=np.float64,
98 |     )
99 |
100 |     group_shapes = [1, user_block.shape[1], item_block.shape[1]]
101 |     item_indices = rns.randint(0, item_block.shape[0], size=N_train)
102 |     tm_column = rns.randn(N_train, 1)
103 |
104 |     X_flatten = sps.hstack(
105 |         [tm_column, user_block[user_indices], item_block[item_indices]]
106 |     )
107 |     X_flatten_squared = X_flatten.copy()
108 |     X_flatten_squared.data = X_flatten_squared.data**2
109 |
110 |     weights = rns.randn(3, X_flatten.shape[1])
111 |     Xw = X_flatten.dot(weights.T)
112 |     X2w2 = X_flatten_squared.dot((weights**2).sum(axis=0))
113 |     y = 0.5 * ((Xw**2).sum(axis=1) - X2w2) + rns.randn(N_train)
114 |
115 |     blocks = [
116 |         RelationBlock(user_indices, user_block),
117 |         RelationBlock(item_indices, item_block),
118 |     ]
119 |     fm_flatten = MyFMRegressor(2, fit_w0=False).fit(
120 |         X_flatten,
121 |         y,
122 |         group_shapes=group_shapes,
123 |         n_iter=30,
124 |         n_kept_samples=30,
125 |     )
126 |     fm_blocked = MyFMRegressor(2, fit_w0=False).fit(
127 |         tm_column,
128 |         y,
129 |         blocks,
130 |         group_shapes=group_shapes,
131 |         n_iter=30,
132 |         n_kept_samples=30,
133 |     )
134 |     assert fm_flatten.predictor_ is not None
135 |     assert fm_blocked.predictor_ is not None
136 |     for s_flatten, s_blocked in zip(
137 |         fm_flatten.predictor_.samples, fm_blocked.predictor_.samples
138 |     ):
139 |         np.testing.assert_allclose(s_flatten.V, s_blocked.V)
140 |
141 |     with tempfile.TemporaryFile() as temp_fs:
142 |         pickle.dump(fm_blocked, temp_fs)
143 |         del fm_blocked
144 |         temp_fs.seek(0)
145 |         fm_blocked = pickle.load(temp_fs)
146 |
147 |     prediction_flatten = fm_flatten.predict(tm_column, blocks, n_workers=2)
148 |     prediction_blocked = fm_blocked.predict(X_flatten, n_workers=None)
149 |     np.testing.assert_allclose(prediction_flatten, prediction_blocked)
150 |
--------------------------------------------------------------------------------
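Editor's note (not part of the repository): both tests above exercise the same
equivalence. A `RelationBlock(indices, block)` passed alongside a main table is
the same model as materializing the rows `block[indices]` and stacking them
horizontally onto the main table, exactly as the tests build it:

    X_flatten = sps.hstack([tm_column, user_block[user_indices], item_block[item_indices]])

so the blocked and flattened fits must agree up to floating-point noise, which
is what the `assert_allclose` calls on the weights and predictions verify.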
/tests/regression/test_fit.py:
--------------------------------------------------------------------------------
1 | from typing import Optional, Tuple
2 |
3 | import numpy as np
4 | import pytest
5 | from scipy import sparse as sps
6 |
7 | from myfm import MyFMGibbsRegressor, VariationalFMRegressor
8 | from myfm.base import DenseArray
9 | from myfm.utils.callbacks import RegressionCallback
10 |
11 | from ..test_utils import FMWeights
12 |
13 |
14 | def assert_unwrap(x: Optional[DenseArray]) -> DenseArray:
15 |     assert x is not None
16 |     return x
17 |
18 |
19 | @pytest.mark.parametrize("alpha_inv", [0.3, 1.0, 3])
20 | def test_middle_reg(
21 |     alpha_inv: float,
22 |     middle_data: Tuple[sps.csr_matrix, np.ndarray],
23 |     stub_weight: FMWeights,
24 | ) -> None:
25 |     rns = np.random.RandomState(0)
26 |     X, score = middle_data
27 |     y = score + alpha_inv * rns.normal(0, 1, size=score.shape)
28 |
29 |     callback = RegressionCallback(100, X_test=X, y_test=y)
30 |
31 |     fm_init = MyFMGibbsRegressor(3)
32 |     assert fm_init.w0_samples is None
33 |     assert fm_init.w_samples is None
34 |     assert fm_init.V_samples is None
35 |     fm = fm_init.fit(
36 |         X, y, X_test=X, y_test=y, n_iter=100, n_kept_samples=100, callback=callback
37 |     )
38 |
39 |     np.testing.assert_allclose(fm.predict(X), callback.predictions / 100)
40 |     vfm = VariationalFMRegressor(3).fit(X, y, X_test=X, y_test=y, n_iter=50)
41 |     vfm_weights = vfm.predictor_.weights()
42 |     hp_trace = fm.get_hyper_trace()
43 |     last_alphas = hp_trace["alpha"].iloc[-20:].values
44 |     assert np.all(last_alphas > ((1 / alpha_inv**2) / 2))
45 |     assert np.all(last_alphas < ((1 / alpha_inv**2) * 2))
46 |
47 |     last_w0_samples = assert_unwrap(fm.w0_samples)[-20:]
48 |     assert np.all(last_w0_samples < (stub_weight.global_bias + 0.5))
49 |     assert np.all(last_w0_samples > (stub_weight.global_bias - 0.5))
50 |
51 |     last_w_samples = assert_unwrap(fm.w_samples)[-20:]
52 |
53 |     for w_ in last_w_samples:
54 |         assert np.all(w_ < (stub_weight.weight + 1.0))
55 |         assert np.all(w_ > (stub_weight.weight - 1.0))
56 |
57 |     last_V_samples = assert_unwrap(fm.V_samples)[-20:]
58 |
59 |     for i in range(3):
60 |         for j in range(i + 1, 3):
61 |             cross_term = stub_weight.factors[:, i].dot(stub_weight.factors[:, j])
62 |             if abs(cross_term) < 0.1:
63 |                 continue
64 |             sign = cross_term / abs(cross_term)
65 |             vfm_cross_term = vfm_weights.V[i].dot(vfm_weights.V[j])
66 |             assert vfm_cross_term > sign * cross_term * 0.8
67 |             assert vfm_cross_term < sign * cross_term * 1.25
68 |
69 |             for V_ in last_V_samples:
70 |                 sample_cross_term = V_[i].dot(V_[j])
71 |                 assert sample_cross_term > sign * cross_term * 0.5
72 |                 assert sample_cross_term < sign * cross_term * 2
73 |
--------------------------------------------------------------------------------
/tests/test_utils.py:
--------------------------------------------------------------------------------
1 | from typing import NamedTuple
2 |
3 | import numpy as np
4 | import scipy.sparse as sps
5 |
6 | N_FEATURES = 3
7 | N_LATENT = 4
8 |
9 |
10 | class FMWeights(NamedTuple):
11 |     global_bias: float
12 |     weight: np.ndarray
13 |     factors: np.ndarray
14 |
15 |
16 | def prediction(X: sps.csr_matrix, weight: FMWeights) -> np.ndarray:
17 |     X2 = X.copy()
18 |     X2.data[:] = X2.data**2
19 |     result = np.zeros(X.shape[0], dtype=np.float64)
20 |     result[:] = weight.global_bias
21 |     result += X.dot(weight.weight)
22 |     w2 = (weight.factors**2).sum(axis=0)
23 |     Xw = X.dot(weight.factors.T)
24 |     result += ((Xw**2).sum(axis=1) - (X2.dot(w2))) * 0.5
25 |     return result
26 |
--------------------------------------------------------------------------------
/tests/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tohtsky/myFM/b9ba70ea38d9370d3ad50a9d25b2ff825eaa30ef/tests/utils/__init__.py
--------------------------------------------------------------------------------
/tests/utils/test_binning.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | from myfm.utils.encoders import BinningEncoder
4 |
5 |
6 | def test_binning_dense() -> None:
7 |     rns = np.random.RandomState(0)
8 |     v = rns.randn(1000)
9 |     v[0] = np.nan
10 |     enc = BinningEncoder(v)
11 |     assert enc.percentiles.shape[0] == 10
12 |     X = enc.to_sparse(v)
13 |     assert np.all(X.sum(axis=1).A1 == 1.0)
14 |     assert X.shape[1] == 12
15 |     assert X[0, 0]
== 1.0 16 | for j in np.where(v[1:] <= enc.percentiles[0])[0]: 17 | assert X[j + 1, 1] == 1.0 18 | for k in range(1, 10): 19 | for j in np.where( 20 | (v[1:] <= enc.percentiles[k]) & (v[1:] > enc.percentiles[k - 1]) 21 | )[0]: 22 | assert X[j + 1, k + 1] == 1.0 23 | for j in np.where(v[1:] > enc.percentiles[-1])[0]: 24 | assert X[j + 1, 11] == 1.0 25 | 26 | 27 | def test_binning_sparse() -> None: 28 | rns = np.random.RandomState(0) 29 | v = rns.poisson(2, size=1000) 30 | enc = BinningEncoder(v) 31 | X = enc.to_sparse(v) 32 | assert np.all(X.sum(axis=1).A1 == 1.0) 33 | assert X.shape[1] == len(enc) 34 | for j in np.where(v == 0)[0]: 35 | assert X[j, 1] == 1.0 36 | 37 | for j in np.where(v == 1.0)[0]: 38 | assert X[j, 2] == 1.0 39 | 40 | for k in range(2, len(enc.percentiles)): 41 | for j in np.where((v <= enc.percentiles[k]) & (v > enc.percentiles[k - 1]))[0]: 42 | assert X[j, k + 1] == 1.0 43 | 44 | for j in np.where(v > enc.percentiles[-1])[0]: 45 | assert X[j, len(enc.percentiles) + 1] == 1.0 46 | -------------------------------------------------------------------------------- /tests/utils/test_categorical.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | 4 | from myfm.utils.encoders import CategoryValueToSparseEncoder 5 | 6 | TEST_ITEMS = [ 7 | "item1", 8 | "item2", 9 | "item3", 10 | "item1", 11 | "item2", 12 | "item3", 13 | "item1", 14 | "item2", 15 | ] 16 | 17 | 18 | def test_categorical_encs_create() -> None: 19 | enc = CategoryValueToSparseEncoder(TEST_ITEMS, handle_unknown="create") 20 | X = enc.to_sparse(["item4", "item1", "item2", "item3"]) 21 | for i in range(4): 22 | for j in range(len(enc)): 23 | if i == j: 24 | assert X[i, j] == 1 25 | else: 26 | assert X[i, j] == 0 27 | 28 | enc_cutoff = CategoryValueToSparseEncoder( 29 | TEST_ITEMS, handle_unknown="create", min_freq=3 30 | ) 31 | assert len(enc_cutoff) == 3 32 | X_cutoffed = enc_cutoff.to_sparse(["item4", "item1", "item2", "item3"]) 33 | np.testing.assert_allclose( 34 | X_cutoffed.toarray(), np.asfarray([[1, 0, 0], [0, 1, 0], [0, 0, 1], [1, 0, 0]]) 35 | ) 36 | 37 | 38 | def test_categorical_encs_ignore() -> None: 39 | enc = CategoryValueToSparseEncoder(TEST_ITEMS, handle_unknown="ignore") 40 | X = enc.to_sparse(["item4", "item1", "item2", "item3"]) 41 | np.testing.assert_allclose( 42 | X.toarray(), np.asfarray([[0, 0, 0], [1, 0, 0], [0, 1, 0], [0, 0, 1]]) 43 | ) 44 | enc_cutoff = CategoryValueToSparseEncoder( 45 | TEST_ITEMS, handle_unknown="ignore", min_freq=3 46 | ) 47 | X = enc_cutoff.to_sparse(["item4", "item1", "item2", "item3"]) 48 | np.testing.assert_allclose( 49 | X.toarray(), np.asfarray([[0, 0], [1, 0], [0, 1], [0, 0]]) 50 | ) 51 | 52 | 53 | def test_categorical_encs_raise() -> None: 54 | enc = CategoryValueToSparseEncoder(TEST_ITEMS, handle_unknown="raise") 55 | with pytest.raises(KeyError): 56 | X = enc.to_sparse(["item4", "item1", "item2", "item3"]) 57 | X = enc.to_sparse(["item1", "item2", "item3"]) 58 | 59 | np.testing.assert_allclose( 60 | X.toarray(), np.asfarray([[1, 0, 0], [0, 1, 0], [0, 0, 1]]) 61 | ) 62 | -------------------------------------------------------------------------------- /tests/utils/test_dataframe_encoder.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | from myfm.utils.encoders import ( 5 | BinningEncoder, 6 | DataFrameEncoder, 7 | MultipleValuesToSparseEncoder, 8 | ) 9 | from myfm.utils.encoders.categorical import 
CategoryValueToSparseEncoder 10 | 11 | 12 | def test_dfe() -> None: 13 | rns = np.random.RandomState(0) 14 | N = 1000 15 | categories = [1, 2, 3] 16 | 17 | multi_categories = ["i1", "i2", "i3", "i4"] 18 | multi_values = [] 19 | cnts = [] 20 | for _ in range(N): 21 | n = rns.randint(0, len(multi_categories) + 1) 22 | cnts.append(n) 23 | v = ",".join(rns.choice(multi_categories, size=n, replace=False)) 24 | multi_values.append(v) 25 | df = pd.DataFrame( 26 | dict( 27 | numerical_value=rns.randn(N), 28 | categorical_value=rns.choice(categories, size=N, replace=True), 29 | multi_values=multi_values, 30 | ) 31 | ) 32 | dfe = DataFrameEncoder().add_column( 33 | "numerical_value", BinningEncoder(df.numerical_value) 34 | ) 35 | assert np.all(dfe.encode_df(df).sum(axis=1).A1 == 1.0) 36 | dfe.add_column( 37 | "categorical_value", CategoryValueToSparseEncoder(df.categorical_value) 38 | ) 39 | assert np.all(dfe.encode_df(df).sum(axis=1).A1 == 2.0) 40 | dfe.add_column( 41 | "multi_values", MultipleValuesToSparseEncoder(df.multi_values, normalize=False) 42 | ) 43 | for nnz, cnt in zip(dfe.encode_df(df).sum(axis=1).A1, cnts): 44 | assert nnz == cnt + 2 45 | cursor = 0 46 | names = dfe.all_names() 47 | for s, name_prefix in zip( 48 | dfe.encoder_shapes, ["numerical_value", "categorical_value", "multi_values"] 49 | ): 50 | for X_col_name in names[cursor : cursor + s]: 51 | assert X_col_name.startswith(name_prefix) 52 | cursor += s 53 | -------------------------------------------------------------------------------- /tests/utils/test_multivalue.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from myfm.utils.encoders import MultipleValuesToSparseEncoder 4 | 5 | TEST_ITEMS = [ 6 | "item1", 7 | "item2, item1", 8 | "item3, item2", 9 | "item2", 10 | "item3, item1", 11 | ] 12 | 13 | 14 | def test_categorical_encs_create() -> None: 15 | enc = MultipleValuesToSparseEncoder(TEST_ITEMS, handle_unknown="create") 16 | X = enc.to_sparse(["item4,item1", "item1,item2,item3", "item2", "item3"]) 17 | nnz_rows = (X.toarray() > 0).astype(np.int32).sum(axis=1) 18 | np.testing.assert_allclose(nnz_rows, np.asarray([2, 3, 1, 1])) 19 | --------------------------------------------------------------------------------
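Editor's note (not part of the repository): a hedged reading of the encoder
behavior exercised by the tests above. With ``handle_unknown="create"``,
column 0 of the encoded matrix appears to be reserved for unseen values and
for values dropped by ``min_freq``; the snippet below restates that
observation and is illustrative only:

    enc = CategoryValueToSparseEncoder(["a", "a", "b"], handle_unknown="create")
    enc.to_sparse(["c"]).toarray()  # assumed: [[1, 0, 0]] -- "c" lands in the unknown column

`MultipleValuesToSparseEncoder` applies the same mapping per comma-separated
token, which is why the per-row nonzero counts in the last test come out as
2, 3, 1, 1.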