├── .codecov.yml ├── .github ├── PULL_REQUEST_TEMPLATE.md ├── dependabot.yml └── workflows │ ├── build.yml │ ├── docs.yml │ ├── documentation-links.yml │ ├── lint.yml │ ├── release.yml │ └── tests.yml ├── .gitignore ├── .readthedocs.yaml ├── CHANGELOG ├── CODE_OF_CONDUCT.md ├── LICENSE ├── MANIFEST.in ├── README.rst ├── docs ├── requirements.txt └── src │ ├── bibliography.rst │ ├── changelog.rst │ ├── conf.py │ ├── contributing.rst │ ├── getting-started.rst │ ├── index.rst │ ├── installation.rst │ ├── references │ ├── VoronoiFPS-Schematic.pdf │ ├── clustering.rst │ ├── datasets.rst │ ├── decomposition.rst │ ├── index.rst │ ├── linear_models.rst │ ├── metrics.rst │ ├── neighbors.rst │ ├── preprocessing.rst │ ├── selection.rst │ └── utils.rst │ └── tutorials.rst ├── examples ├── README.rst ├── neighbors │ ├── README.rst │ ├── pamm.py │ └── sparse-kde.py ├── pcovc │ ├── PCovC_Comparison.py │ ├── PCovC_Hyperparameters.py │ └── README.rst ├── pcovr │ ├── PCovR-WHODataset.py │ ├── PCovR.py │ ├── PCovR_Regressors.py │ ├── PCovR_Scaling.py │ └── README.rst ├── reconstruction │ ├── PlotGFRE.py │ ├── PlotLFRE.py │ ├── PlotPointwiseGFRE.py │ └── README.rst ├── regression │ ├── OrthogonalRegressionNonAnalytic.py │ ├── README.rst │ └── Ridge2FoldCVRegularization.py └── selection │ ├── FeatureSelection-WHODataset.py │ ├── FeatureSelection.py │ ├── GCH-ROY.py │ ├── README.rst │ └── Selectors-Pipelines.py ├── pyproject.toml ├── src └── skmatter │ ├── __init__.py │ ├── _selection.py │ ├── clustering │ ├── __init__.py │ └── _quick_shift.py │ ├── datasets │ ├── __init__.py │ ├── _base.py │ ├── data │ │ ├── beran_roy_properties.npz │ │ ├── csd-1000r.npz │ │ ├── degenerate_CH4_manifold.npz │ │ ├── h2o-blyp-piglet.npz │ │ ├── nice_dataset.npz │ │ └── who_dataset.csv │ └── descr │ │ ├── csd-1000r.rst │ │ ├── degenerate_CH4_manifold.rst │ │ ├── h2o-blyp-piglet.rst │ │ ├── nice_dataset.rst │ │ └── who_dataset.rst │ ├── decomposition │ ├── __init__.py │ ├── _kernel_pcovr.py │ ├── _pcov.py │ ├── _pcovc.py │ └── _pcovr.py │ ├── feature_selection │ ├── __init__.py │ └── _base.py │ ├── linear_model │ ├── __init__.py │ ├── _base.py │ └── _ridge.py │ ├── metrics │ ├── __init__.py │ ├── _pairwise.py │ ├── _prediction_rigidities.py │ └── _reconstruction_measures.py │ ├── model_selection │ ├── __init__.py │ └── _split.py │ ├── neighbors │ ├── __init__.py │ └── _sparsekde.py │ ├── preprocessing │ ├── __init__.py │ └── _data.py │ ├── sample_selection │ ├── __init__.py │ ├── _base.py │ └── _voronoi_fps.py │ └── utils │ ├── __init__.py │ ├── _orthogonalizers.py │ ├── _pcovc_utils.py │ ├── _pcovr_utils.py │ ├── _progress_bar.py │ └── _sparsekde.py ├── tests ├── .gitignore ├── test_check_estimators.py ├── test_clustering.py ├── test_datasets.py ├── test_dch.py ├── test_feature_pcov_cur.py ├── test_feature_pcov_fps.py ├── test_feature_simple_cur.py ├── test_feature_simple_fps.py ├── test_greedy_selector.py ├── test_kernel_normalizer.py ├── test_kernel_pcovr.py ├── test_linear_model.py ├── test_metrics.py ├── test_model_selection.py ├── test_neighbors.py ├── test_orthogonalizers.py ├── test_pcovc.py ├── test_pcovr.py ├── test_pcovr_distances.py ├── test_progress_bar.py ├── test_sample_pcov_cur.py ├── test_sample_pcov_fps.py ├── test_sample_simple_cur.py ├── test_sample_simple_fps.py ├── test_sparse_kernel_centerer.py ├── test_standard_flexible_scaler.py └── test_voronoi_fps.py └── tox.ini /.codecov.yml: -------------------------------------------------------------------------------- 1 | coverage: 2 | ignore: 3 | - tests/.* 4 | 
status: 5 | project: 6 | default: 7 | target: 95% 8 | patch: 9 | default: 10 | target: 95% 11 | 12 | comment: false 13 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Contributor (creator of PR) checklist 5 | ------------------------------------- 6 | - [ ] Tests updated (for new features and bugfixes)? 7 | - [ ] Documentation updated (for new features)? 8 | - [ ] Issue referenced (for PRs that solve an issue)? 9 | 10 | For Reviewer 11 | ------------ 12 | - [ ] CHANGELOG updated if important change? 13 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "github-actions" 4 | directory: "/" 5 | schedule: 6 | interval: monthly 7 | open-pull-requests-limit: 1 8 | groups: 9 | action-dependencies: 10 | patterns: 11 | - "*" # A wildcard to create one PR for all dependencies in the ecosystem 12 | -------------------------------------------------------------------------------- /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | # This workflow builds and checks the package for release 2 | name: Build 3 | 4 | on: 5 | pull_request: 6 | branches: [main] 7 | 8 | jobs: 9 | build: 10 | runs-on: ubuntu-latest 11 | 12 | steps: 13 | - uses: actions/checkout@v4 14 | 15 | - name: Set up Python 16 | uses: actions/setup-python@v5 17 | with: 18 | python-version: "3.13" 19 | 20 | - name: install tests dependencies 21 | run: python -m pip install tox 22 | 23 | - name: Test build integrity 24 | run: tox -e build 25 | -------------------------------------------------------------------------------- /.github/workflows/docs.yml: -------------------------------------------------------------------------------- 1 | name: Documentation 2 | 3 | on: 4 | push: 5 | branches: [main] 6 | tags: ["*"] 7 | pull_request: 8 | # Check all PR 9 | 10 | jobs: 11 | build: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - uses: actions/checkout@v4 15 | 16 | - name: setup Python 17 | uses: actions/setup-python@v5 18 | with: 19 | python-version: "3.13" 20 | 21 | - name: install tests dependencies 22 | run: python -m pip install tox 23 | 24 | - name: build documentation 25 | run: tox -e docs 26 | -------------------------------------------------------------------------------- /.github/workflows/documentation-links.yml: -------------------------------------------------------------------------------- 1 | name: readthedocs/actions 2 | 3 | on: 4 | pull_request_target: 5 | types: 6 | - opened 7 | 8 | permissions: 9 | pull-requests: write 10 | 11 | jobs: 12 | documentation-links: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - uses: readthedocs/actions/preview@v1 16 | with: 17 | project-slug: scikit-matter 18 | -------------------------------------------------------------------------------- /.github/workflows/lint.yml: -------------------------------------------------------------------------------- 1 | name: Lint 2 | 3 | on: 4 | pull_request: 5 | branches: [main] 6 | 7 | jobs: 8 | lint: 9 | runs-on: ubuntu-latest 10 | 11 | steps: 12 | - uses: actions/checkout@v4 13 | 14 | - name: Set up Python 15 | uses: actions/setup-python@v5 16 | with: 17 | python-version: "3.13" 18 | 19 | - name: install tests dependencies 20 | run: python -m pip install 
tox 21 | 22 | - name: Lint the code 23 | run: tox -e lint 24 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Release 2 | 3 | on: 4 | push: 5 | tags: ["*"] 6 | 7 | jobs: 8 | build: 9 | name: Build distribution 10 | runs-on: ubuntu-latest 11 | environment: 12 | name: pypi 13 | url: https://pypi.org/project/skmatter 14 | permissions: 15 | id-token: write 16 | contents: write 17 | 18 | steps: 19 | - uses: actions/checkout@v4 20 | with: 21 | fetch-depth: 0 22 | - name: setup Python 23 | uses: actions/setup-python@v5 24 | with: 25 | python-version: "3.13" 26 | - run: python -m pip install tox 27 | - name: Build package 28 | run: tox -e build 29 | - name: Publish distribution to PyPI 30 | if: startsWith(github.ref, 'refs/tags/v') 31 | uses: pypa/gh-action-pypi-publish@release/v1 32 | - name: Publish to GitHub release 33 | if: startsWith(github.ref, 'refs/tags/v') 34 | uses: softprops/action-gh-release@v2 35 | with: 36 | files: | 37 | dist/*.tar.gz 38 | dist/*.whl 39 | prerelease: ${{ contains(github.ref, '-rc') }} 40 | env: 41 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 42 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: 4 | push: 5 | branches: [main] 6 | pull_request: 7 | # Check all PR 8 | 9 | jobs: 10 | tests: 11 | runs-on: ${{ matrix.os }} 12 | strategy: 13 | matrix: 14 | os: [ubuntu-latest, macos-latest, windows-latest] 15 | python-version: ["3.10", "3.13"] 16 | 17 | steps: 18 | - uses: actions/checkout@v4 19 | 20 | - name: Set up Python ${{ matrix.python-version }} 21 | uses: actions/setup-python@v5 22 | with: 23 | python-version: ${{ matrix.python-version }} 24 | 25 | - name: install tests dependencies 26 | run: python -m pip install tox coverage[toml] 27 | 28 | - name: run Python tests 29 | run: | 30 | tox -e tests 31 | coverage xml 32 | 33 | - name: upload to codecov.io 34 | uses: codecov/codecov-action@v5 35 | with: 36 | fail_ci_if_error: true 37 | files: ./tests/coverage.xml 38 | token: ${{ secrets.CODECOV_TOKEN }} 39 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.coverage* 2 | *.pyc 3 | *.ipynb_checkpoints* 4 | __pycache__ 5 | *.egg-info 6 | *.swp 7 | *.swo 8 | *DS_Store 9 | 10 | .tox/ 11 | build/ 12 | dist/ 13 | docs/src/examples 14 | sg_execution_times.rst 15 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yaml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Set the version of Python and other tools we need 9 | build: 10 | os: ubuntu-lts-latest 11 | tools: 12 | python: "3.13" 13 | 14 | # Build documentation in the docs/ directory with Sphinx 15 | sphinx: 16 | configuration: docs/src/conf.py 17 | 18 | # Optionally build your docs in additional formats such as PDF 19 | formats: 20 | - pdf 21 | 22 | python: 23 | install: 24 | - requirements: docs/requirements.txt 25 | - method: pip 26 | path: . 
27 |     extra_requirements:
28 |       # The documentation runs "examples" to produce outputs via sphinx-gallery.
29 |       - examples
30 | 
-------------------------------------------------------------------------------- /CHANGELOG:
1 | CHANGELOG file
2 | --------------
3 | 
4 | The rules for the CHANGELOG file:
5 | 
6 | - entries are sorted newest-first
7 | - summarize sets of changes (do not reproduce every git log comment here)
8 | - do not ever delete anything
9 | - keep the format consistent (88 char width, Y/M/D date format) and do not use tabs but
10 |   spaces for formatting
11 | 
12 | .. inclusion-marker-changelog-start
13 | 
14 | 0.3.0 (XXXX/XX/XX)
15 | ------------------
16 | - Add ``_BasePCov`` class (#248)
17 | - Add ``PCovC`` class that inherits shared functionality from ``_BasePCov`` (#248)
18 | - Add ``PCovC`` testing suite and examples (#248)
19 | - Modify ``PCovR`` to inherit shared functionality from ``_BasePCov`` (#248)
20 | - Update to sklearn >= 1.6.0 and scipy >= 1.15.0 (#239)
21 | - Fix a moved function import from scipy and bump the scipy dependency to 1.15.0 (#236)
22 | - Fix rendering issues for `SparseKDE` and `QuickShift` (#236)
23 | - Update ``FPS`` to allow a numpy array of ints as an initialize parameter (#145)
24 | - Supported Python versions now range from 3.9 to 3.12
25 | - Update ``skmatter.datasets`` submodule to support sklearn 1.5.0 (#229)
26 | - Add `SparseKDE` class (#222)
27 | - Add `QuickShift` class (#222)
28 | - Add an example on how to conduct the PAMM algorithm with `SparseKDE` and `QuickShift`
29 |   (#222)
30 | - Add H2O-BLYP-Piglet dataset (#222)
31 | - Add two distance metrics that support the periodic boundary condition,
32 |   `periodic_pairwise_euclidean_distances` and `pairwise_mahalanobis_distances` (#222)
33 | 
34 | 0.2.0 (2023/08/24)
35 | ------------------
36 | - Add this ``CHANGELOG`` file (#198)
37 | - Update example of WHO feature selection (#212)
38 | - Rename ``RidgeRegression2FoldCV`` -> ``Ridge2FoldCV`` (#211)
39 | - Add metrics for prediction rigidity (#209)
40 | - Overhaul of documentation page (#200 to #204)
41 | - Rename and add member variables (#197)
42 | - Fix/check estimator (#196)
43 | - Fix small typo in ``PCovR`` class documentation (#194)
44 | - Resolve issue of missing function call section in WHO dataset docs (#181, #192)
45 | - Speed up tests (#190)
46 | - Remove kernel optimization from WHO example (#189)
47 | - Ignore rendered examples for linting (#188)
48 | - Add more info on documentation landing pages & ``CODE_OF_CONDUCT`` (#186)
49 | - Add contributors pictures to ``README``, show pip install instructions in docs (#185)
50 | - Add linting and tests for docstring and documentation code (#184)
51 | - Restructure requirements (#171, #183)
52 | - Update ``README.md`` to show banners (#176)
53 | - Modernize package infrastructure (#172)
54 | - Add an example of GCH for molecular materials (#171)
55 | - Port examples to ``sphinx_gallery`` (#170)
56 | 
57 | 0.1.4 (2023/03/14)
58 | ------------------
59 | - documentation formatting fixes for math and datasets (#161, #163)
60 | - changing the way the distance to the convex hull is computed in the
61 |   ``DirectionalConvexHull`` due to numerical issues with the old method (#165)
62 | 
63 | 0.1.3 (2023/03/02)
64 | ------------------
65 | - Refactor ``scikit-cosmo`` to ``scikit-matter`` (#157, #151)
66 | - Deprecation warning was added to link to renamed package (#154)
67 | - dropped Python `<3.8` support, because we are now using ``scikit-learn`` version
68 |   `>=1.1.0` (#139, #146, #152)
69 | - WHO dataset and examples were added (#149)
70 | - nice dataset was added (#143)
71 | - overhaul of documentation (#142, #150)
72 | - added ``DirectionalConvexHull`` class (#140)
73 | - added test_precomputed_regression function to ``KPCovR`` (#136)
74 | - other bugfixes (#141, #148)
75 | 
76 | 
77 | 0.1.2 (2022/07/04)
78 | ------------------
79 | - fixed a bug in the orthonormalization step of ``PCov-CUR`` (#118)
80 | - users can now initialize ``FPS`` with a list of selected points, allowing them to
81 |   restart the selection from the middle (#116)
82 | - KPCovR is now able to use a pre-fitted regressor in the same way that ``PCovR`` can
83 |   (#113)
84 | 
85 | 0.1.1 (2021/11/30)
86 | ------------------
87 | - fixed a bug in the ``orthonormalization`` step of ``PCov-CUR`` (#118)
88 | - users can now initialize ``FPS`` with a list of selected points, allowing them to
89 |   restart the selection from the middle (#116)
90 | - KPCovR is now able to use a pre-fitted regressor in the same way that ``PCovR`` can (#113)
91 | 
92 | 0.1.0 (2021/05/12)
93 | ------------------
94 | - first release out of the lab
95 | 
96 | .. inclusion-marker-changelog-end
97 | 
-------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md:
1 | # Contributor Covenant Code of Conduct
2 | 
3 | ## Our Pledge
4 | 
5 | We as members, contributors, and leaders pledge to make participation in our
6 | community a harassment-free experience for everyone, regardless of age, body
7 | size, visible or invisible disability, ethnicity, sex characteristics, gender
8 | identity and expression, level of experience, education, socio-economic status,
9 | nationality, personal appearance, race, religion, or sexual identity
10 | and orientation.
11 | 
12 | We pledge to act and interact in ways that contribute to an open, welcoming,
13 | diverse, inclusive, and healthy community.
14 | 
15 | ## Our Standards
16 | 
17 | Examples of behavior that contributes to a positive environment for our
18 | community include:
19 | 
20 | * Demonstrating empathy and kindness toward other people
21 | * Being respectful of differing opinions, viewpoints, and experiences
22 | * Giving and gracefully accepting constructive feedback
23 | * Accepting responsibility and apologizing to those affected by our mistakes,
24 |   and learning from the experience
25 | * Focusing on what is best not just for us as individuals, but for the
26 |   overall community
27 | 
28 | Examples of unacceptable behavior include:
29 | 
30 | * The use of sexualized language or imagery, and sexual attention or
31 |   advances of any kind
32 | * Trolling, insulting or derogatory comments, and personal or political attacks
33 | * Public or private harassment
34 | * Publishing others' private information, such as a physical or email
35 |   address, without their explicit permission
36 | * Other conduct which could reasonably be considered inappropriate in a
37 |   professional setting
38 | 
39 | ## Enforcement Responsibilities
40 | 
41 | Community leaders are responsible for clarifying and enforcing our standards of
42 | acceptable behavior and will take appropriate and fair corrective action in
43 | response to any behavior that they deem inappropriate, threatening, offensive,
44 | or harmful.
45 | 46 | Community leaders have the right and responsibility to remove, edit, or reject 47 | comments, commits, code, wiki edits, issues, and other contributions that are 48 | not aligned to this Code of Conduct, and will communicate reasons for moderation 49 | decisions when appropriate. 50 | 51 | ## Scope 52 | 53 | This Code of Conduct applies within all community spaces, and also applies when 54 | an individual is officially representing the community in public spaces. 55 | Examples of representing our community include using an official e-mail address, 56 | posting via an official social media account, or acting as an appointed 57 | representative at an online or offline event. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported to the community leaders responsible for enforcement at 63 | philip.loche@epfl.ch. 64 | All complaints will be reviewed and investigated promptly and fairly. 65 | 66 | All community leaders are obligated to respect the privacy and security of the 67 | reporter of any incident. 68 | 69 | ## Enforcement Guidelines 70 | 71 | Community leaders will follow these Community Impact Guidelines in determining 72 | the consequences for any action they deem in violation of this Code of Conduct: 73 | 74 | ### 1. Correction 75 | 76 | **Community Impact**: Use of inappropriate language or other behavior deemed 77 | unprofessional or unwelcome in the community. 78 | 79 | **Consequence**: A private, written warning from community leaders, providing 80 | clarity around the nature of the violation and an explanation of why the 81 | behavior was inappropriate. A public apology may be requested. 82 | 83 | ### 2. Warning 84 | 85 | **Community Impact**: A violation through a single incident or series 86 | of actions. 87 | 88 | **Consequence**: A warning with consequences for continued behavior. No 89 | interaction with the people involved, including unsolicited interaction with 90 | those enforcing the Code of Conduct, for a specified period of time. This 91 | includes avoiding interactions in community spaces as well as external channels 92 | like social media. Violating these terms may lead to a temporary or 93 | permanent ban. 94 | 95 | ### 3. Temporary Ban 96 | 97 | **Community Impact**: A serious violation of community standards, including 98 | sustained inappropriate behavior. 99 | 100 | **Consequence**: A temporary ban from any sort of interaction or public 101 | communication with the community for a specified period of time. No public or 102 | private interaction with the people involved, including unsolicited interaction 103 | with those enforcing the Code of Conduct, is allowed during this period. 104 | Violating these terms may lead to a permanent ban. 105 | 106 | ### 4. Permanent Ban 107 | 108 | **Community Impact**: Demonstrating a pattern of violation of community 109 | standards, including sustained inappropriate behavior, harassment of an 110 | individual, or aggression toward or disparagement of classes of individuals. 111 | 112 | **Consequence**: A permanent ban from any sort of public interaction within 113 | the community. 114 | 115 | ## Attribution 116 | 117 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 118 | version 2.0, available at 119 | https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. 120 | 121 | Community Impact Guidelines were inspired by [Mozilla's code of conduct 122 | enforcement ladder](https://github.com/mozilla/diversity). 
123 | 124 | [homepage]: https://www.contributor-covenant.org 125 | 126 | For answers to common questions about this code of conduct, see the FAQ at 127 | https://www.contributor-covenant.org/faq. Translations are available at 128 | https://www.contributor-covenant.org/translations. 129 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2020 the sklearn-matter contributors 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | graft src 2 | 3 | include LICENSE 4 | include README.rst 5 | 6 | prune docs 7 | prune examples 8 | prune tests 9 | prune .github 10 | prune .tox 11 | 12 | exclude CHANGELOG 13 | exclude CODE_OF_CONDUCT.md 14 | exclude .gitignore 15 | exclude .codecov.yml 16 | exclude .readthedocs.yaml 17 | exclude tox.ini 18 | 19 | global-exclude *.py[cod] __pycache__/* *.so *.dylib 20 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | scikit-matter 2 | ============= 3 | |tests| |codecov| |pypi| |conda| |docs| |doi| 4 | 5 | A collection of ``scikit-learn`` compatible utilities that implement methods born out of 6 | the materials science and chemistry communities. 7 | 8 | For details, tutorials, and examples, please have a look at our `documentation`_. 9 | 10 | .. _`documentation`: https://scikit-matter.readthedocs.io 11 | 12 | .. marker-installation 13 | 14 | Installation 15 | ------------ 16 | You can install *scikit-matter* either via pip using 17 | 18 | .. code-block:: bash 19 | 20 | pip install skmatter 21 | 22 | or conda 23 | 24 | .. 
code-block:: bash
25 | 
26 |     conda install -c conda-forge skmatter
27 | 
28 | You can then ``import skmatter`` and use scikit-matter in your projects!
29 | 
30 | .. marker-ci-tests
31 | 
32 | Tests
33 | -----
34 | We are testing our code for Python 3.10 and 3.13 on the latest versions of Ubuntu,
35 | macOS and Windows.
36 | 
37 | .. marker-issues
38 | 
39 | Having problems or ideas?
40 | -------------------------
41 | Having a problem with scikit-matter? Please let us know by `submitting an issue
42 | <https://github.com/scikit-learn-contrib/scikit-matter/issues>`_.
43 | 
44 | Submit new features or bug fixes through a `pull request
45 | <https://github.com/scikit-learn-contrib/scikit-matter/pulls>`_.
46 | 
47 | .. marker-contributing
48 | 
49 | Call for Contributions
50 | ----------------------
51 | We always welcome new contributors. If you want to help us, take a look at our
52 | `contribution guidelines`_; afterwards, you may start with an open issue marked as
53 | `good first issue`_.
54 | 
55 | Writing code is not the only way to contribute to the project. You can also:
56 | 
57 | * review `pull requests`_
58 | * help us stay on top of new and old `issues`_
59 | * develop `examples and tutorials`_
60 | * maintain and `improve our documentation`_
61 | * contribute `new datasets`_
62 | 
63 | .. _`contribution guidelines`: https://scikit-matter.readthedocs.io/en/latest/contributing.html
64 | .. _`good first issue`: https://github.com/scikit-learn-contrib/scikit-matter/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22
65 | .. _`pull requests`: https://github.com/scikit-learn-contrib/scikit-matter/pulls
66 | .. _`issues`: https://github.com/scikit-learn-contrib/scikit-matter/issues
67 | .. _`improve our documentation`: https://scikit-matter.readthedocs.io/en/latest/contributing.html#contributing-to-the-documentation
68 | .. _`examples and tutorials`: https://scikit-matter.readthedocs.io/en/latest/contributing.html#contributing-new-examples
69 | .. _`new datasets`: https://scikit-matter.readthedocs.io/en/latest/contributing.html#contributing-datasets
70 | 
71 | .. marker-citing
72 | 
73 | Citing scikit-matter
74 | --------------------
75 | If you use *scikit-matter* for your work, please cite:
76 | 
77 | Goscinski A, Principe VP, Fraux G et al. scikit-matter:
78 | A Suite of Generalisable Machine Learning Methods Born out of Chemistry
79 | and Materials Science. Open Res Europe 2023, 3:81.
80 | `10.12688/openreseurope.15789.2`_
81 | 
82 | .. _`10.12688/openreseurope.15789.2`: https://doi.org/10.12688/openreseurope.15789.2
83 | 
84 | .. marker-contributors
85 | 
86 | Contributors
87 | ------------
88 | Thanks go to all the people who make scikit-matter possible:
89 | 
90 | .. image:: https://contrib.rocks/image?repo=scikit-learn-contrib/scikit-matter
91 |    :target: https://github.com/scikit-learn-contrib/scikit-matter/graphs/contributors
92 | 
93 | .. |tests| image:: https://github.com/scikit-learn-contrib/scikit-matter/workflows/Tests/badge.svg
94 |    :alt: GitHub Actions Tests Job Status
95 |    :target: action_
96 | 
97 | .. |codecov| image:: https://codecov.io/gh/scikit-learn-contrib/scikit-matter/branch/main/graph/badge.svg?token=UZJPJG34SM
98 |    :alt: Code coverage
99 |    :target: https://codecov.io/gh/scikit-learn-contrib/scikit-matter/
100 | 
101 | .. |docs| image:: https://img.shields.io/badge/documentation-latest-success
102 |    :alt: Documentation
103 |    :target: documentation_
104 | 
105 | .. |pypi| image:: https://img.shields.io/pypi/v/skmatter.svg
106 |    :alt: Latest PyPI version
107 |    :target: https://pypi.org/project/skmatter
108 | 
109 | .. |conda| image:: https://anaconda.org/conda-forge/skmatter/badges/version.svg
110 |    :alt: Latest conda version
111 |    :target: https://anaconda.org/conda-forge/skmatter
112 | 
113 | .. |doi| image:: https://img.shields.io/badge/DOI-10.12688-blue
114 |    :alt: ORE Paper
115 |    :target: `10.12688/openreseurope.15789.2`_
116 | 
117 | .. _`action`: https://github.com/scikit-learn-contrib/scikit-matter/actions?query=branch%3Amain
-------------------------------------------------------------------------------- /docs/requirements.txt:
1 | sphinx
2 | sphinx-gallery
3 | sphinx-toggleprompt
4 | pydata-sphinx-theme
5 | tomli
-------------------------------------------------------------------------------- /docs/src/bibliography.rst:
1 | References
2 | ############
3 | 
4 | .. [deJong1992]
5 |     S. de Jong, H.A.L. Kiers,
6 |     "Principal covariates regression: Part I. Theory", Chemom. Intell. Lab. Syst. 14
7 |     (1992) 155-164. https://doi.org/10.1016/0169-7439(92)80100-I
8 | 
9 | .. [Gasparotto2014]
10 |     Piero Gasparotto, Michele Ceriotti,
11 |     "Recognizing molecular patterns by machine learning: An agnostic structural
12 |     definition of the hydrogen bond", J. Chem. Phys., 141 (17): 174110.
13 |     https://doi.org/10.1063/1.4900655.
14 | 
15 | .. [Imbalzano2018]
16 |     Giulio Imbalzano, Andrea Anelli, Daniele Giofré, Sinja Klees, Jörg Behler, and
17 |     Michele Ceriotti, "Automatic selection of atomic fingerprints and reference
18 |     configurations for machine-learning potentials." The Journal of Chemical Physics
19 |     148, 24 (2018): 241730. https://aip.scitation.org/doi/10.1063/1.5024611.
20 | 
21 | .. [Ceriotti2019]
22 |     Michele Ceriotti, Lyndon Emsley, Federico Paruzzo, Albert Hofstetter, Félix Musil,
23 |     Sandip De, Edgar A. Engel, and Andrea Anelli. "Chemical Shifts in Molecular Solids
24 |     by Machine Learning Datasets", Materials Cloud Archive 2019.0023/v2 (2019).
25 |     https://doi.org/10.24435/materialscloud:2019.0023/v2.
26 | 
27 | .. [Helfrecht2020]
28 |     Benjamin A. Helfrecht, Rose K. Cersonsky, Guillaume Fraux, and Michele Ceriotti,
29 |     "Structure-property maps with Kernel principal covariates regression." 2020 Mach.
30 |     Learn.: Sci. Technol. 1 045021.
31 |     https://iopscience.iop.org/article/10.1088/2632-2153/aba9ef.
32 | 
33 | .. [Pozdnyakov2020]
34 |     Pozdnyakov, S. N., Willatt, M. J., Bartók, A. P., Ortner, C., Csányi, G., &
35 |     Ceriotti, M. (2020). "Incompleteness of Atomic Structure Representations." Physical
36 |     Review Letters, 125(16). https://doi.org/10.1103/physrevlett.125.166001
37 | 
38 | .. [Goscinski2021]
39 |     Alexander Goscinski, Guillaume Fraux, Giulio Imbalzano, and Michele Ceriotti, "The
40 |     role of feature space in atomistic learning." 2021 Mach. Learn.: Sci. Technol. 2
41 |     025028. https://iopscience.iop.org/article/10.1088/2632-2153/abdaf7.
42 | 
43 | .. [Cersonsky2021]
44 |     Rose K. Cersonsky, Benjamin A. Helfrecht, Edgar A. Engel, Sergei Kliavinek, and
45 |     Michele Ceriotti, "Improving Sample and Feature Selection with Principal Covariates
46 |     Regression." 2021 Mach. Learn.: Sci. Technol. 2 035038.
47 |     https://iopscience.iop.org/article/10.1088/2632-2153/abfe7c.
48 | 
49 | .. [Jorgensen2025]
50 |     Christian Jorgensen, Arthur Y. Lin, Rhushil Vasavada, and Rose K. Cersonsky,
51 |     "Interpretable Visualizations of Data Spaces for Classification Problems",
52 |     2025. arXiv:2503.05861.
53 |     https://doi.org/10.48550/arXiv.2503.05861.
54 | 
-------------------------------------------------------------------------------- /docs/src/changelog.rst:
1 | Changelog
2 | =========
3 | 
4 | .. include:: ../../CHANGELOG
5 |    :start-after: inclusion-marker-changelog-start
6 |    :end-before: inclusion-marker-changelog-end
-------------------------------------------------------------------------------- /docs/src/getting-started.rst:
1 | Getting started
2 | ===============
3 | 
4 | This guide illustrates the main functionalities that ``scikit-matter`` provides. It
5 | assumes a very basic working knowledge of how ``scikit-learn`` works. Please refer to
6 | our :ref:`installation` instructions for installing ``scikit-matter``.
7 | 
8 | For a detailed explanation of the functionalities, please look at the
9 | :ref:`selection-api`.
10 | 
11 | .. _getting_started-selection:
12 | 
13 | Feature and Sample Selection
14 | ----------------------------
15 | 
16 | .. automodule:: skmatter._selection
17 |    :noindex:
18 | 
19 | Notebook Examples
20 | ^^^^^^^^^^^^^^^^^
21 | 
22 | .. include:: examples/selection/index.rst
23 |    :start-line: 4
24 | 
25 | 
26 | .. _getting_started-reconstruction:
27 | 
28 | Metrics
29 | -------
30 | 
31 | .. automodule:: skmatter.metrics
32 |    :noindex:
33 | 
34 | Notebook Examples
35 | ^^^^^^^^^^^^^^^^^
36 | 
37 | .. include:: examples/reconstruction/index.rst
38 |    :start-line: 4
39 | 
40 | .. _getting_started-hybrid:
41 | 
42 | Hybrid Mapping Techniques
43 | -------------------------
44 | 
45 | .. automodule:: skmatter.decomposition
46 |    :noindex:
47 | 
48 | Notebook Examples
49 | ^^^^^^^^^^^^^^^^^
50 | 
51 | .. include:: examples/pcovr/index.rst
52 |    :start-line: 4
53 | .. include:: examples/pcovc/index.rst
54 |    :start-line: 4
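
Minimal Example
^^^^^^^^^^^^^^^

As a quick, self-contained sketch of the hybrid mappers (the dataset, scaling, and
hyperparameters here are purely illustrative, not a recommendation):

.. code-block:: python

    from sklearn.datasets import load_diabetes
    from sklearn.preprocessing import StandardScaler

    from skmatter.decomposition import PCovR

    X, y = load_diabetes(return_X_y=True)
    X = StandardScaler().fit_transform(X)
    y = (y - y.mean()) / y.std()

    # mixing interpolates between a PCA-like loss (1.0) and a regression-like loss (0.0)
    pcovr = PCovR(mixing=0.5, n_components=2)
    pcovr.fit(X, y)

    T = pcovr.transform(X)  # latent-space projection of the samples
    y_pred = pcovr.predict(X)  # regression prediction from the same model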
-------------------------------------------------------------------------------- /docs/src/index.rst:
1 | .. automodule:: skmatter
2 | 
3 | .. raw:: html
4 | 
5 |     <!-- opening markup of the HTML card grid -->
12 | 
13 | .. only:: html
14 | 
15 |   :ref:`getting_started-selection`
16 | 
17 |   .. image:: /examples/selection/images/thumb/sphx_glr_FeatureSelection-WHODataset_thumb.png
18 |     :alt:
19 | 
20 | .. raw:: html
21 | 
22 |     <!-- card-body markup -->
23 |     Supervised and unsupervised selection methods based on
24 |     CUR matrix decomposition and Farthest Point Sampling.
26 |     <!-- closing markup of the first card and opening markup of the second -->
33 | 
34 | .. only:: html
35 | 
36 |   :ref:`getting_started-hybrid`
37 | 
38 |   .. image:: /examples/pcovr/images/thumb/sphx_glr_PCovR_thumb.png
39 |     :alt:
40 | 
41 | .. raw:: html
42 | 
43 |     <!-- card-body markup -->
44 |     PCovR and PCovC combine a PCA-like and an LR-like loss to determine
45 |     the decomposition matrix that projects features into the latent space.
46 |     <!-- closing markup of the second card and opening markup of the third -->
53 | 
54 | .. only:: html
55 | 
56 |   :ref:`getting_started-reconstruction`
57 | 
58 |   .. image:: /examples/reconstruction/images/thumb/sphx_glr_PlotLFRE_thumb.png
59 |     :alt:
60 | 
61 | .. raw:: html
62 | 
63 |     <!-- card-body markup -->
64 |     Error measures for quantifying the linearly
65 |     decodable information capacity between features.
66 |     <!-- closing markup of the card grid -->
72 | 73 | .. include:: ../../README.rst 74 | :start-after: marker-issues 75 | :end-before: marker-contributing 76 | 77 | .. include:: ../../README.rst 78 | :start-after: marker-citing 79 | :end-before: marker-contributors 80 | 81 | If you would like to contribute to scikit-matter, check out our :ref:`contributing` 82 | page! 83 | 84 | .. toctree:: 85 | :hidden: 86 | 87 | getting-started 88 | installation 89 | references/index 90 | tutorials 91 | contributing 92 | changelog 93 | bibliography 94 | -------------------------------------------------------------------------------- /docs/src/installation.rst: -------------------------------------------------------------------------------- 1 | .. _installation: 2 | 3 | .. include:: ../../README.rst 4 | :start-after: marker-installation 5 | :end-before: marker-ci-tests 6 | 7 | Install from source 8 | ------------------- 9 | 10 | For development purposes you should clone the repository and install the current 11 | development version from the source code 12 | 13 | .. code-block:: bash 14 | 15 | git clone https://github.com/lab-cosmo/scikit-matter 16 | cd scikit-matter 17 | pip install . 18 | 19 | Alternatively, if you don't have special privileges, install 20 | the package using the ``--user`` flag: 21 | 22 | .. code-block:: bash 23 | 24 | pip install . --user 25 | 26 | You're ready to import skmatter from your code! Have a look at the :ref:`api-reference` 27 | for how to use the code. 28 | -------------------------------------------------------------------------------- /docs/src/references/VoronoiFPS-Schematic.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/scikit-matter/77fb4eac5ba9ebe7c1c17bf6b82d41c02dd57787/docs/src/references/VoronoiFPS-Schematic.pdf -------------------------------------------------------------------------------- /docs/src/references/clustering.rst: -------------------------------------------------------------------------------- 1 | Clustering 2 | ========== 3 | 4 | .. automodule:: skmatter.clustering 5 | 6 | .. _quick-shift-api: 7 | 8 | Quick Shift 9 | ------------ 10 | 11 | .. autoclass:: skmatter.clustering.QuickShift 12 | -------------------------------------------------------------------------------- /docs/src/references/datasets.rst: -------------------------------------------------------------------------------- 1 | Datasets 2 | ======== 3 | 4 | .. include:: ../../../src/skmatter/datasets/descr/csd-1000r.rst 5 | 6 | .. include:: ../../../src/skmatter/datasets/descr/degenerate_CH4_manifold.rst 7 | 8 | .. include:: ../../../src/skmatter/datasets/descr/h2o-blyp-piglet.rst 9 | 10 | .. include:: ../../../src/skmatter/datasets/descr/nice_dataset.rst 11 | 12 | .. include:: ../../../src/skmatter/datasets/descr/who_dataset.rst 13 | -------------------------------------------------------------------------------- /docs/src/references/decomposition.rst: -------------------------------------------------------------------------------- 1 | Hybrid Mapping Techniques 2 | ========================= 3 | 4 | .. _PCovR-api: 5 | 6 | PCovR 7 | ----- 8 | 9 | .. autoclass:: skmatter.decomposition.PCovR 10 | :show-inheritance: 11 | :special-members: 12 | 13 | .. automethod:: fit 14 | 15 | .. automethod:: _fit_feature_space 16 | .. automethod:: _fit_sample_space 17 | 18 | .. automethod:: transform 19 | .. automethod:: predict 20 | .. automethod:: inverse_transform 21 | .. automethod:: score 22 | 23 | .. _PCovC-api: 24 | 25 | PCovC 26 | ----- 27 | 28 | .. 
autoclass:: skmatter.decomposition.PCovC 29 | :show-inheritance: 30 | :special-members: 31 | 32 | .. automethod:: fit 33 | 34 | .. automethod:: _fit_feature_space 35 | .. automethod:: _fit_sample_space 36 | 37 | .. automethod:: transform 38 | .. automethod:: predict 39 | .. automethod:: inverse_transform 40 | .. automethod:: decision_function 41 | .. automethod:: score 42 | 43 | .. _KPCovR-api: 44 | 45 | Kernel PCovR 46 | ------------ 47 | 48 | .. autoclass:: skmatter.decomposition.KernelPCovR 49 | :show-inheritance: 50 | :special-members: 51 | 52 | .. automethod:: fit 53 | .. automethod:: transform 54 | .. automethod:: predict 55 | .. automethod:: inverse_transform 56 | .. automethod:: score 57 | -------------------------------------------------------------------------------- /docs/src/references/index.rst: -------------------------------------------------------------------------------- 1 | .. _api-reference: 2 | 3 | API Reference 4 | ============= 5 | 6 | .. toctree:: 7 | :maxdepth: 1 8 | :caption: Contents: 9 | 10 | preprocessing 11 | selection 12 | linear_models 13 | clustering 14 | decomposition 15 | metrics 16 | neighbors 17 | datasets 18 | utils 19 | -------------------------------------------------------------------------------- /docs/src/references/linear_models.rst: -------------------------------------------------------------------------------- 1 | Linear Models 2 | ============= 3 | 4 | Orthogonal Regression 5 | --------------------- 6 | 7 | .. autoclass:: skmatter.linear_model.OrthogonalRegression 8 | 9 | Ridge Regression with Two-fold Cross Validation 10 | ----------------------------------------------- 11 | 12 | .. autoclass:: skmatter.linear_model.Ridge2FoldCV 13 | 14 | PCovR 15 | ----- 16 | 17 | Principal Covariates Regression is a linear model, see :ref:`PCovR-api`. 18 | -------------------------------------------------------------------------------- /docs/src/references/metrics.rst: -------------------------------------------------------------------------------- 1 | Metrics 2 | ======= 3 | 4 | .. automodule:: skmatter.metrics 5 | 6 | .. _GRE-api: 7 | 8 | Global Reconstruction Error 9 | --------------------------- 10 | 11 | .. autofunction:: skmatter.metrics.pointwise_global_reconstruction_error 12 | .. autofunction:: skmatter.metrics.global_reconstruction_error 13 | 14 | .. _GRD-api: 15 | 16 | Global Reconstruction Distortion 17 | -------------------------------- 18 | 19 | .. autofunction:: skmatter.metrics.pointwise_global_reconstruction_distortion 20 | .. autofunction:: skmatter.metrics.global_reconstruction_distortion 21 | 22 | .. _LRE-api: 23 | 24 | Local Reconstruction Error 25 | -------------------------- 26 | 27 | .. autofunction:: skmatter.metrics.pointwise_local_reconstruction_error 28 | .. autofunction:: skmatter.metrics.local_reconstruction_error 29 | 30 | .. _LPR-api: 31 | 32 | Local Prediction Rigidity 33 | ------------------------- 34 | 35 | .. autofunction:: skmatter.metrics.local_prediction_rigidity 36 | 37 | .. _CPR-api: 38 | 39 | Component-wise Prediction Rigidity 40 | ---------------------------------- 41 | 42 | .. autofunction:: skmatter.metrics.componentwise_prediction_rigidity 43 | 44 | 45 | .. _pairwise-euclidian-api: 46 | 47 | Pairwise Euclidean Distances 48 | ---------------------------- 49 | 50 | .. autofunction:: skmatter.metrics.periodic_pairwise_euclidean_distances 51 | 52 | .. _pairwise-mahalanobis-api: 53 | 54 | Pairwise Mahalanobis Distance 55 | ----------------------------- 56 | 57 | .. 
autofunction:: skmatter.metrics.pairwise_mahalanobis_distances
58 | 
-------------------------------------------------------------------------------- /docs/src/references/neighbors.rst:
1 | Neighbors
2 | =========
3 | 
4 | .. automodule:: skmatter.neighbors
5 | 
6 | .. _sparse-kde-api:
7 | 
8 | Sparse Kernel Density Estimation
9 | --------------------------------
10 | 
11 | .. autoclass:: skmatter.neighbors.SparseKDE
12 |    :show-inheritance:
13 | 
14 |    .. automethod:: fit
15 |    .. automethod:: score_samples
16 |    .. automethod:: score
-------------------------------------------------------------------------------- /docs/src/references/preprocessing.rst:
1 | Preprocessing
2 | =============
3 | 
4 | .. automodule:: skmatter.preprocessing
5 | 
6 | KernelNormalizer
7 | ----------------
8 | 
9 | .. autoclass:: skmatter.preprocessing.KernelNormalizer
10 |    :members:
11 |    :undoc-members:
12 |    :inherited-members:
13 | 
14 | 
15 | SparseKernelCenterer
16 | --------------------
17 | 
18 | .. autoclass:: skmatter.preprocessing.SparseKernelCenterer
19 |    :members:
20 |    :undoc-members:
21 |    :inherited-members:
22 | 
23 | StandardFlexibleScaler
24 | ----------------------
25 | 
26 | .. autoclass:: skmatter.preprocessing.StandardFlexibleScaler
27 |    :members:
28 |    :undoc-members:
29 |    :inherited-members:
-------------------------------------------------------------------------------- /docs/src/references/selection.rst:
1 | .. _selection-api:
2 | 
3 | Feature and Sample Selection
4 | ============================
5 | 
6 | .. automodule:: skmatter._selection
7 | 
8 | .. _CUR-api:
9 | 
10 | CUR
11 | ---
12 | 
13 | CUR decomposition begins by approximating a matrix :math:`{\mathbf{X}}` using a subset
14 | of columns and rows
15 | 
16 | .. math::
17 |     \mathbf{\hat{X}} \approx \mathbf{X}_\mathbf{c} \left(\mathbf{X}_\mathbf{c}^-
18 |     \mathbf{X} \mathbf{X}_\mathbf{r}^-\right) \mathbf{X}_\mathbf{r}.
19 | 
20 | These subsets of rows and columns, denoted :math:`\mathbf{X}_\mathbf{r}` and
21 | :math:`\mathbf{X}_\mathbf{c}`, respectively, can be determined by iterative maximization
22 | of a leverage score :math:`\pi`, representative of the relative importance of each
23 | column or row. From here on, we will call selection methods derived from the
24 | CUR decomposition "CUR" as a shorthand for "CUR-derived selection". In each iteration of
25 | CUR, we select the column or row that maximizes :math:`\pi` and orthogonalize the
26 | remaining columns or rows. These steps are iterated until a sufficient number of
27 | features has been selected. This iterative approach, albeit comparatively time
28 | consuming, is the most deterministic and efficient route to reducing the number of
29 | features needed to approximate :math:`\mathbf{X}`, compared to selecting all
30 | features in a single iteration based upon their relative :math:`\pi` importance.
31 | 
32 | The feature and sample selection versions of CUR differ only in the computation of
33 | :math:`\pi`. In sample selection, :math:`\pi` is computed from the left singular
34 | vectors, whereas in feature selection it is computed from the right singular
35 | vectors.
36 | 
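For illustration, a minimal sketch of the feature-selection variant (the random data
and the number of selections here are arbitrary):

.. code-block:: python

    import numpy as np

    from skmatter.feature_selection import CUR

    X = np.random.default_rng(0).normal(size=(100, 20))

    # iteratively pick the five highest-leverage columns, orthogonalizing in between
    selector = CUR(n_to_select=5)
    selector.fit(X)

    X_c = selector.transform(X)  # the selected columns of X, shape (100, 5)
    print(selector.selected_idx_)
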
37 | .. autoclass:: skmatter.feature_selection.CUR
38 |    :members:
39 |    :private-members: _compute_pi
40 |    :undoc-members:
41 |    :inherited-members:
42 | 
43 | .. autoclass:: skmatter.sample_selection.CUR
44 |    :members:
45 |    :private-members: _compute_pi
46 |    :undoc-members:
47 |    :inherited-members:
48 | 
49 | .. _PCov-CUR-api:
50 | 
51 | PCov-CUR
52 | --------
53 | 
54 | PCov-CUR extends upon CUR by using augmented right or left singular vectors inspired by
55 | Principal Covariates Regression, as demonstrated in [Cersonsky2021]_. These methods
56 | employ the modified kernel and covariance matrices introduced in :ref:`PCovR-api` and
57 | available via the Utility Classes.
58 | 
59 | Again, the feature and sample selection versions of PCov-CUR differ only in the
60 | computation of :math:`\pi`.
61 | 
62 | .. autoclass:: skmatter.feature_selection.PCovCUR
63 |    :members:
64 |    :private-members: _compute_pi
65 |    :undoc-members:
66 |    :inherited-members:
67 | 
68 | .. autoclass:: skmatter.sample_selection.PCovCUR
69 |    :members:
70 |    :private-members: _compute_pi
71 |    :undoc-members:
72 |    :inherited-members:
73 | 
74 | .. _FPS-api:
75 | 
76 | Farthest Point-Sampling (FPS)
77 | -----------------------------
78 | 
79 | Farthest Point Sampling is a common selection technique intended to exploit the
80 | diversity of the input space.
81 | 
82 | In FPS, the selection of the first point is made at random or by a separate metric. Each
83 | subsequent selection is made to maximize the Hausdorff distance, i.e., the minimum
84 | distance between a point and all previous selections. It is common to use the Euclidean
85 | distance; however, other distance metrics may be employed.
86 | 
87 | Similar to CUR, the feature and sample selection versions of FPS differ only in the way
88 | distance is computed (feature selection does so column-wise, sample selection does so
89 | row-wise), and are built off of the same base class.
90 | 
91 | These selectors can be instantiated using :py:class:`skmatter.feature_selection.FPS` and
92 | :py:class:`skmatter.sample_selection.FPS`.
93 | 
94 | .. autoclass:: skmatter.feature_selection.FPS
95 |    :members:
96 |    :undoc-members:
97 |    :inherited-members:
98 | 
99 | .. autoclass:: skmatter.sample_selection.FPS
100 |    :members:
101 |    :undoc-members:
102 |    :inherited-members:
103 | 
104 | .. _PCov-FPS-api:
105 | 
106 | PCov-FPS
107 | --------
108 | 
109 | PCov-FPS extends upon FPS much like PCov-CUR does to CUR. Instead of using the Euclidean
110 | distance solely in the space of :math:`\mathbf{X}`, we use a combined distance in terms
111 | of :math:`\mathbf{X}` and :math:`\mathbf{y}`.
112 | 
113 | .. autoclass:: skmatter.feature_selection.PCovFPS
114 |    :members:
115 |    :undoc-members:
116 |    :inherited-members:
117 | 
118 | .. autoclass:: skmatter.sample_selection.PCovFPS
119 |    :members:
120 |    :undoc-members:
121 |    :inherited-members:
122 | 
123 | .. _Voronoi-FPS-api:
124 | 
125 | Voronoi FPS
126 | -----------
127 | 
128 | .. autoclass:: skmatter.sample_selection.VoronoiFPS
129 |    :members:
130 |    :undoc-members:
131 |    :inherited-members:
132 | 
133 | 
134 | When *Not* to Use Voronoi FPS
135 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
136 | 
137 | In many cases, this algorithm may not improve efficiency. For example, for
138 | simple metrics (such as Euclidean distance), Voronoi FPS will likely not accelerate, and
139 | may even decelerate, computations compared to FPS.
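
As a rough usage sketch (random data; ``n_to_select`` chosen arbitrarily), the two
sample selectors are used interchangeably and should produce the same selections:

.. code-block:: python

    import numpy as np

    from skmatter.sample_selection import FPS, VoronoiFPS

    X = np.random.default_rng(0).normal(size=(1000, 4))

    # plain FPS updates the Hausdorff distances against every sample per iteration
    fps = FPS(initialize=0, n_to_select=100).fit(X)

    # Voronoi FPS tracks Voronoi cells to skip most of those distance updates
    voronoi_fps = VoronoiFPS(initialize=0, n_to_select=100).fit(X)

    print(fps.selected_idx_)
    print(voronoi_fps.selected_idx_)
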
140 | The sweet spot for Voronoi FPS is when the number of selected samples is already large
141 | enough to divide the space into meaningful Voronoi polyhedra, but still small compared
142 | to the total number of samples; outside this regime, the cost of bookkeeping
143 | significantly degrades its speed relative to FPS.
144 | 
145 | .. _DCH-api:
146 | 
147 | Directional Convex Hull (DCH)
148 | -----------------------------
149 | 
150 | .. autoclass:: skmatter.sample_selection.DirectionalConvexHull
151 |    :members:
152 |    :undoc-members:
153 |    :inherited-members:
-------------------------------------------------------------------------------- /docs/src/references/utils.rst:
1 | Utility Classes
2 | ===============
3 | 
4 | .. _PCovR_dist-api:
5 | 
6 | Modified Gram Matrix :math:`\mathbf{\tilde{K}}`
7 | -----------------------------------------------
8 | 
9 | .. autofunction:: skmatter.utils.pcovr_kernel
10 | 
11 | 
12 | Modified Covariance Matrix :math:`\mathbf{\tilde{C}}`
13 | -----------------------------------------------------
14 | 
15 | .. autofunction:: skmatter.utils.pcovr_covariance
16 | 
17 | Orthogonalizers for CUR
18 | -----------------------
19 | 
20 | When computing non-iterative CUR, it is necessary to orthogonalize the input matrices
21 | after each selection. For this, we have supplied a feature and a sample orthogonalizer
22 | for feature and sample selection.
23 | 
24 | .. autofunction:: skmatter.utils.X_orthogonalizer
25 | .. autofunction:: skmatter.utils.Y_feature_orthogonalizer
26 | .. autofunction:: skmatter.utils.Y_sample_orthogonalizer
27 | 
28 | 
29 | Random Partitioning with Overlaps
30 | ---------------------------------
31 | 
32 | .. autofunction:: skmatter.model_selection.train_test_split
33 | 
34 | 
35 | Effective Dimension of Covariance Matrix
36 | ----------------------------------------
37 | 
38 | .. autofunction:: skmatter.utils.effdim
39 | 
40 | Oracle Approximating Shrinkage
41 | ------------------------------
42 | 
43 | .. autofunction:: skmatter.utils.oas
-------------------------------------------------------------------------------- /docs/src/tutorials.rst:
1 | .. include:: ../../examples/README.rst
2 | 
3 | .. toctree::
4 | 
5 |     examples/pcovr/index
6 |     examples/pcovc/index
7 |     examples/selection/index
8 |     examples/regression/index
9 |     examples/reconstruction/index
10 |     examples/neighbors/index
-------------------------------------------------------------------------------- /examples/README.rst:
1 | Examples
2 | ========
3 | 
4 | For a thorough tutorial of the methods introduced in ``scikit-matter``, we
5 | suggest you check out the pedagogic notebooks in our companion project
6 | `kernel-tutorials <https://github.com/lab-cosmo/kernel-tutorials>`_.
7 | 
8 | To run the examples locally, install ``scikit-matter`` with the ``examples``
9 | optional dependencies.
10 | 
11 | .. code-block:: bash
12 | 
13 |     pip install skmatter[examples]
-------------------------------------------------------------------------------- /examples/neighbors/README.rst:
1 | Neighbors
2 | =========
-------------------------------------------------------------------------------- /examples/neighbors/sparse-kde.py:
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 | 
4 | """
5 | Sparse KDE examples
6 | ===================
7 | 
8 | Example for the usage of the :class:`skmatter.neighbors.SparseKDE` class. This class is
9 | specifically designed for conducting probabilistic analysis of molecular motifs
10 | (`PAMM <https://doi.org/10.1063/1.4900655>`_),
11 | which is quite useful for analyzing motifs like H-bonds, coordination polyhedra, and
12 | protein secondary structure.
13 | 
14 | Here we show how to use the sparse KDE model to fit a probability distribution to
15 | sampled data and how to use PAMM to analyze H-bonds.
16 | 
17 | We start from a simple system, which consists of three 2D Gaussians. Our task is to
18 | estimate the parameters of these Gaussians from our sampled data.
19 | 
20 | Here we first sample from these three Gaussians.
21 | """
22 | 
23 | 
24 | # %%
25 | import time
26 | 
27 | import matplotlib.pyplot as plt
28 | import numpy as np
29 | from scipy.stats import gaussian_kde
30 | 
31 | from skmatter.feature_selection import FPS
32 | from skmatter.neighbors import SparseKDE
33 | 
34 | 
35 | # %%
36 | means = np.array([[0, 0], [4, 4], [6, -2]])
37 | covariances = np.array(
38 |     [[[1, 0.5], [0.5, 1]], [[1, 0.5], [0.5, 0.5]], [[1, -0.5], [-0.5, 1]]]
39 | )
40 | N_SAMPLES = 100_000
41 | samples = np.concatenate(
42 |     [
43 |         np.random.multivariate_normal(means[0], covariances[0], N_SAMPLES),
44 |         np.random.multivariate_normal(means[1], covariances[1], N_SAMPLES),
45 |         np.random.multivariate_normal(means[2], covariances[2], N_SAMPLES),
46 |     ]
47 | )
48 | 
49 | # %%
50 | # We can visualize the sampled data:
51 | #
52 | #
53 | 
54 | # %%
55 | fig, ax = plt.subplots()
56 | ax.scatter(samples[:, 0], samples[:, 1], alpha=0.05, s=1)
57 | ax.scatter(means[:, 0], means[:, 1], marker="+", color="red", s=100)
58 | ax.set_xlabel("x")
59 | ax.set_ylabel("y")
60 | plt.show()
61 | 
62 | # %%
63 | # Sparse KDE requires a discretization of the sample space. Here, we use
64 | # the FPS method to generate grid points in the sample space:
65 | #
66 | #
67 | 
68 | # %%
69 | start1 = time.time()
70 | selector = FPS(n_to_select=int(np.sqrt(3 * N_SAMPLES)))
71 | grids = selector.fit_transform(samples.T).T
72 | end1 = time.time()
73 | fig, ax = plt.subplots()
74 | ax.scatter(samples[:, 0], samples[:, 1], alpha=0.05, s=1)
75 | ax.scatter(means[:, 0], means[:, 1], marker="+", color="red", s=100)
76 | ax.scatter(grids[:, 0], grids[:, 1], color="orange", s=1)
77 | ax.set_xlabel("x")
78 | ax.set_ylabel("y")
79 | plt.show()
80 | 
81 | # %%
82 | # Now we can fit the sparse KDE (this usually takes tens of seconds):
83 | #
84 | #
85 | 
86 | # %%
87 | start2 = time.time()
88 | estimator = SparseKDE(samples, None, fpoints=0.5)
89 | estimator.fit(grids)
90 | end2 = time.time()
91 | 
92 | # %%
93 | # We can compare the fit with the original distribution by plotting both.
94 | #
95 | # For convenience, we create a class for the Gaussian mixture model to help us plot
96 | # the result.
97 | 
98 | 
99 | # %%
100 | class GaussianMixtureModel:
101 |     """A reference Gaussian mixture whose density can be evaluated point-wise."""
102 | 
103 |     def __init__(
104 |         self,
105 |         weights: np.ndarray,
106 |         means: np.ndarray,
107 |         covariances: np.ndarray,
108 |         period: np.ndarray = None,
109 |     ):
110 |         self.weights = weights
111 |         self.means = means
112 |         self.covariances = covariances
113 |         self.period = period
114 |         self.dimension = self.means.shape[1]
115 |         self.cov_inv = np.linalg.inv(self.covariances)
116 |         self.cov_det = np.linalg.det(self.covariances)
117 |         self.norm = 1 / np.sqrt((2 * np.pi) ** self.dimension * self.cov_det)
118 | 
119 |     def __call__(self, x: np.ndarray, i: int = None):
120 |         if len(x.shape) == 1:
121 |             x = x[np.newaxis, :]
122 |         if self.period is not None:
123 |             # displacement vectors to the means under periodic boundary conditions
124 |             xij = rij(self.period, x, self.means)
125 |         else:
126 |             xij = x - self.means
127 |         p = (
128 |             self.weights
129 |             * self.norm
130 |             * np.exp(
131 |                 -0.5 * (xij[:, np.newaxis, :] @ self.cov_inv @ xij[:, :, np.newaxis])
132 |             ).reshape(-1)
133 |         )
134 |         sum_p = np.sum(p)
135 |         if i is None:
136 |             return sum_p
137 | 
138 |         return np.sum(p[i]) / sum_p
139 | 
140 | 
141 | # %%
142 | def rij(period: np.ndarray, xi: np.ndarray, xj: np.ndarray) -> np.ndarray:
143 |     """Get the position vectors between two points. PBC are taken into account."""
144 |     xij = xi - xj
145 |     if period is not None:
146 |         xij -= np.round(xij / period) * period
147 | 
148 |     return xij
149 | 
150 | 
151 | # %%
152 | # The original model that we want to fit:
153 | original_model = GaussianMixtureModel(np.full(3, 1 / 3), means, covariances)
154 | # The fitted model:
155 | fitted_model = GaussianMixtureModel(
156 |     estimator._sample_weights, estimator._grids, estimator.bandwidth_
157 | )
158 | 
159 | # To plot the probability density contours, we need to create a grid of points:
160 | x, y = np.meshgrid(np.linspace(-6, 12, 100), np.linspace(-8, 8))
161 | points = np.concatenate(np.stack([x, y], axis=-1))
162 | probs = np.array([original_model(point) for point in points])
163 | fitted_probs = np.array([fitted_model(point) for point in points])
164 | 
165 | fig, ax = plt.subplots()
166 | ct1 = ax.contour(x, y, probs.reshape(x.shape), colors="blue")
167 | ct2 = ax.contour(x, y, fitted_probs.reshape(x.shape), colors="orange")
168 | h1, _ = ct1.legend_elements()
169 | h2, _ = ct2.legend_elements()
170 | ax.legend(
171 |     [h1[0], h2[0]],
172 |     ["original", "fitted"],
173 | )
174 | ax.set_xlabel("x")
175 | ax.set_ylabel("y")
176 | plt.show()
177 | 
178 | # %%
179 | # The performance of the probability density estimation can be characterized by the
180 | # Mean Integrated Squared Error (MISE), defined as
181 | # :math:`\text{MISE}=\text{E}[\int (\hat{P}(\textbf{x})-P(\textbf{x}))^2 d\textbf{x}]`.
182 | # Below, we approximate the integral by a Riemann sum over the plotting grid.
183 | 
184 | # %%
185 | ISE = np.sum((probs - fitted_probs) ** 2 * (x[0][1] - x[0][0]) * (y[1][0] - y[0][0]))
186 | print(f"Time sparse-kde: {end2 - start2} s")
187 | print(f"ISE = {ISE:.2e}")
188 | 
189 | # %%
190 | # We can compare the result with scipy's ``gaussian_kde``. (This usually takes
191 | # several minutes to run.)
192 | 
193 | # %%
194 | data = np.vstack([x.ravel(), y.ravel()])
195 | start = time.time()
196 | kde = gaussian_kde(samples.T)
197 | scipy_probs = kde(data).T
198 | end = time.time()
199 | print(f"Time scipy: {end - start} s")
200 | ISE_kde = np.sum(
201 |     (probs - scipy_probs) ** 2 * (x[0][1] - x[0][0]) * (y[1][0] - y[0][0])
202 | )
203 | print(f"ISE_kde = {ISE_kde:.2e}")
204 | 
205 | # %%
Even though 206 | # we have not specified the number of Gaussians, it still performs well. This 207 | # allows us to fit data distributions automatically, at comparable quality, in a much 208 | # shorter time than scipy. 209 | -------------------------------------------------------------------------------- /examples/pcovc/PCovC_Comparison.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | """ 5 | Comparing PCovC with PCA and LDA 6 | ================================ 7 | """ 8 | # %% 9 | # 10 | 11 | import matplotlib.pyplot as plt 12 | import numpy as np 13 | from sklearn.datasets import load_breast_cancer 14 | from sklearn.decomposition import PCA 15 | from sklearn.discriminant_analysis import LinearDiscriminantAnalysis 16 | from sklearn.linear_model import LogisticRegressionCV 17 | from sklearn.preprocessing import StandardScaler 18 | 19 | from skmatter.decomposition import PCovC 20 | 21 | 22 | plt.rcParams["image.cmap"] = "tab10" 23 | plt.rcParams["scatter.edgecolors"] = "k" 24 | 25 | random_state = 0 26 | 27 | # %% 28 | # 29 | # For this, we will use the :func:`sklearn.datasets.load_breast_cancer` dataset from 30 | # ``sklearn``. 31 | 32 | X, y = load_breast_cancer(return_X_y=True) 33 | 34 | scaler = StandardScaler() 35 | X_scaled = scaler.fit_transform(X) 36 | 37 | # %% 38 | # 39 | # PCA 40 | # --- 41 | # 42 | 43 | pca = PCA(n_components=2) 44 | 45 | pca.fit(X_scaled, y) 46 | T_pca = pca.transform(X_scaled) 47 | 48 | fig, ax = plt.subplots() 49 | scatter = ax.scatter(T_pca[:, 0], T_pca[:, 1], c=y) 50 | ax.set(xlabel="PC$_1$", ylabel="PC$_2$") 51 | ax.legend( 52 | scatter.legend_elements()[0][::-1], 53 | load_breast_cancer().target_names[::-1], 54 | loc="upper right", 55 | title="Classes", 56 | ) 57 | 58 | # %% 59 | # 60 | # LDA 61 | # --- 62 | # 63 | 64 | lda = LinearDiscriminantAnalysis(n_components=1) 65 | lda.fit(X_scaled, y) 66 | 67 | T_lda = lda.transform(X_scaled) 68 | 69 | fig, ax = plt.subplots() 70 | ax.scatter(T_lda[:], np.zeros(len(T_lda[:])), c=y) 71 | ax.set(xlabel="LDA$_1$") 72 | 73 | # %% 74 | # 75 | # PCovC 76 | # ------------------- 77 | # 78 | # Below, we see the map produced 79 | # by a PCovC model with :math:`\alpha` = 0.5 and a logistic 80 | # regression classifier.
81 | 82 | mixing = 0.5 83 | 84 | pcovc = PCovC( 85 | mixing=mixing, 86 | n_components=2, 87 | random_state=random_state, 88 | classifier=LogisticRegressionCV(), 89 | ) 90 | pcovc.fit(X_scaled, y) 91 | 92 | T_pcovc = pcovc.transform(X_scaled) 93 | 94 | fig, ax = plt.subplots() 95 | ax.scatter(T_pcovc[:, 0], T_pcovc[:, 1], c=y) 96 | ax.set(xlabel="PCov$_1$", ylabel="PCov$_2$") 97 | 98 | # %% 99 | # 100 | # A side-by-side comparison of the 101 | # three maps (PCA, LDA, and PCovC): 102 | 103 | fig, axs = plt.subplots(1, 3, figsize=(18, 5)) 104 | axs[0].scatter(T_pca[:, 0], T_pca[:, 1], c=y) 105 | axs[0].set_title("PCA") 106 | axs[1].scatter(T_lda, np.zeros(len(T_lda)), c=y) 107 | axs[1].set_title("LDA") 108 | axs[2].scatter(T_pcovc[:, 0], T_pcovc[:, 1], c=y) 109 | axs[2].set_title("PCovC") 110 | plt.show() 111 | -------------------------------------------------------------------------------- /examples/pcovc/PCovC_Hyperparameters.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | """ 5 | PCovC Hyperparameter Tuning 6 | =========================== 7 | """ 8 | # %% 9 | # 10 | 11 | import matplotlib.pyplot as plt 12 | from matplotlib.colors import LinearSegmentedColormap 13 | from sklearn.datasets import load_iris 14 | from sklearn.decomposition import PCA 15 | from sklearn.inspection import DecisionBoundaryDisplay 16 | from sklearn.linear_model import LogisticRegressionCV, Perceptron, RidgeClassifierCV 17 | from sklearn.preprocessing import StandardScaler 18 | from sklearn.svm import LinearSVC 19 | 20 | from skmatter.decomposition import PCovC 21 | 22 | 23 | plt.rcParams["image.cmap"] = "tab10" 24 | plt.rcParams["scatter.edgecolors"] = "k" 25 | 26 | random_state = 10 27 | n_components = 2 28 | 29 | # %% 30 | # 31 | # For this, we will use the :func:`sklearn.datasets.load_iris` dataset from 32 | # ``sklearn``. 33 | 34 | X, y = load_iris(return_X_y=True) 35 | 36 | scaler = StandardScaler() 37 | X_scaled = scaler.fit_transform(X) 38 | 39 | # %% 40 | # 41 | # PCA 42 | # --- 43 | # 44 | 45 | pca = PCA(n_components=n_components) 46 | 47 | pca.fit(X_scaled, y) 48 | T_pca = pca.transform(X_scaled) 49 | 50 | fig, axis = plt.subplots() 51 | scatter = axis.scatter(T_pca[:, 0], T_pca[:, 1], c=y) 52 | axis.set(xlabel="PC$_1$", ylabel="PC$_2$") 53 | axis.legend( 54 | scatter.legend_elements()[0], 55 | load_iris().target_names, 56 | loc="lower right", 57 | title="Classes", 58 | ) 59 | 60 | # %% 61 | # 62 | # Effect of Mixing Parameter :math:`\alpha` on PCovC Map 63 | # ------------------------------------------------------ 64 | # 65 | # Below, we see how different :math:`\alpha` values for our PCovC model 66 | # result in varying class distinctions between setosa, versicolor, 67 | # and virginica on the PCovC map. 
68 | 69 | n_mixing = 5 70 | mixing_params = [0, 0.25, 0.50, 0.75, 1] 71 | 72 | fig, axs = plt.subplots(1, n_mixing, figsize=(4 * n_mixing, 4), sharey="row") 73 | 74 | for i, mixing in enumerate(mixing_params): 75 | 76 | 77 | pcovc = PCovC( 78 | mixing=mixing, 79 | n_components=n_components, 80 | random_state=random_state, 81 | classifier=LogisticRegressionCV(), 82 | ) 83 | 84 | pcovc.fit(X_scaled, y) 85 | T = pcovc.transform(X_scaled) 86 | 87 | axs[i].set_xticks([]) 88 | axs[i].set_yticks([]) 89 | 90 | axs[i].set_title(r"$\alpha=$" + str(mixing)) 91 | axs[i].set_xlabel("PCov$_1$") 92 | axs[i].scatter(T[:, 0], T[:, 1], c=y) 93 | 94 | axs[0].set_ylabel("PCov$_2$") 95 | 96 | fig.subplots_adjust(wspace=0) 97 | 98 | # %% 99 | # 100 | # Effect of PCovC Classifier on PCovC Map and Decision Boundaries 101 | # --------------------------------------------------------------- 102 | # 103 | # Here, we see how a PCovC model (:math:`\alpha` = 0.5) fitted with 104 | # different classifiers produces varying PCovC maps. In addition, 105 | # we see the varying decision boundaries produced by the 106 | # respective PCovC classifiers. 107 | 108 | mixing = 0.5 109 | fig, axs = plt.subplots(1, 4, figsize=(16, 4)) 110 | 111 | models = { 112 | RidgeClassifierCV(): "Ridge Classification", 113 | LogisticRegressionCV(random_state=random_state): "Logistic Regression", 114 | LinearSVC(random_state=random_state): "Support Vector Classification", 115 | Perceptron(random_state=random_state): "Single-Layer Perceptron", 116 | } 117 | 118 | for i, (model, title) in enumerate(models.items()): 119 | 120 | 121 | pcovc = PCovC( 122 | mixing=mixing, 123 | n_components=n_components, 124 | random_state=random_state, 125 | classifier=model, 126 | ) 127 | 128 | pcovc.fit(X_scaled, y) 129 | T = pcovc.transform(X_scaled) 130 | 131 | graph = axs[i] 132 | graph.set_title(title) 133 | 134 | DecisionBoundaryDisplay.from_estimator( 135 | estimator=pcovc.classifier_, 136 | X=T, 137 | ax=graph, 138 | response_method="predict", 139 | grid_resolution=1000, 140 | ) 141 | 142 | scatter = graph.scatter(T[:, 0], T[:, 1], c=y) 143 | 144 | graph.set_xlabel("PCov$_1$") 145 | graph.set_xticks([]) 146 | graph.set_yticks([]) 147 | 148 | axs[0].set_ylabel("PCov$_2$") 149 | axs[0].legend( 150 | scatter.legend_elements()[0], 151 | load_iris().target_names, 152 | loc="lower right", 153 | title="Classes", 154 | fontsize=8, 155 | ) 156 | 157 | fig.subplots_adjust(wspace=0.04) 158 | plt.show() 159 | -------------------------------------------------------------------------------- /examples/pcovc/README.rst: -------------------------------------------------------------------------------- 1 | PCovC 2 | ===== 3 | -------------------------------------------------------------------------------- /examples/pcovr/PCovR.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | """ 5 | Construct a PCovR Map 6 | ===================== 7 | """ 8 | # %% 9 | # 10 | 11 | 12 | import numpy as np 13 | from matplotlib import cm 14 | from matplotlib import pyplot as plt 15 | from sklearn.datasets import load_diabetes 16 | from sklearn.kernel_ridge import KernelRidge 17 | from sklearn.linear_model import Ridge 18 | from sklearn.preprocessing import StandardScaler 19 | 20 | from skmatter.decomposition import KernelPCovR, PCovR 21 | 22 | 23 | cmapX = cm.plasma 24 | cmapy = cm.Greys 25 | 26 | # %% 27 | # 28 | # For this, we will use the
:func:`sklearn.datasets.load_diabetes` dataset from 29 | # ``sklearn``. 30 | 31 | X, y = load_diabetes(return_X_y=True) 32 | y = y.reshape(X.shape[0], -1) 33 | 34 | X_scaler = StandardScaler() 35 | X_scaled = X_scaler.fit_transform(X) 36 | 37 | y_scaler = StandardScaler() 38 | y_scaled = y_scaler.fit_transform(y) 39 | 40 | # %% 41 | # 42 | # Computing a simple PCovR and making a fancy plot of the results 43 | # --------------------------------------------------------------- 44 | 45 | mixing = 0.5 46 | pcovr = PCovR( 47 | mixing=mixing, 48 | regressor=Ridge(alpha=1e-8, fit_intercept=False, tol=1e-12), 49 | n_components=2, 50 | ) 51 | pcovr.fit(X_scaled, y_scaled) 52 | T = pcovr.transform(X_scaled) 53 | yp = y_scaler.inverse_transform(pcovr.predict(X_scaled).reshape(-1, 1)) 54 | 55 | fig, ((axT, axy), (caxT, caxy)) = plt.subplots( 56 | 2, 2, figsize=(8, 5), gridspec_kw=dict(height_ratios=(1, 0.1)) 57 | ) 58 | 59 | scatT = axT.scatter(T[:, 0], T[:, 1], s=50, alpha=0.8, c=y, cmap=cmapX, edgecolor="k") 60 | axT.set_xlabel(r"$PC_1$") 61 | axT.set_ylabel(r"$PC_2$") 62 | fig.colorbar(scatT, cax=caxT, label="y", orientation="horizontal") 63 | 64 | scaty = axy.scatter(y, yp, s=50, alpha=0.8, c=np.abs(y - yp), cmap=cmapy, edgecolor="k") 65 | axy.plot(axy.get_xlim(), axy.get_xlim(), "r--") 66 | fig.suptitle(r"$\alpha=$" + str(mixing)) 67 | 68 | axy.set_xlabel(r"True $y$") 69 | axy.set_ylabel(r"Predicted $y$") 70 | fig.colorbar( 71 | scaty, cax=caxy, label="Error in y", orientation="horizontal" 72 | ) 73 | 74 | fig.tight_layout() 75 | 76 | # %% 77 | # 78 | # Surveying many Mixing Parameters 79 | # -------------------------------- 80 | 81 | n_alpha = 5 82 | 83 | fig, axes = plt.subplots(2, n_alpha, figsize=(4 * n_alpha, 10), sharey="row") 84 | 85 | for i, mixing in enumerate(np.linspace(0, 1, n_alpha)): 86 | pcovr = PCovR( 87 | mixing=mixing, 88 | regressor=Ridge(alpha=1e-8, fit_intercept=False, tol=1e-12), 89 | n_components=2, 90 | ) 91 | pcovr.fit(X_scaled, y_scaled) 92 | T = pcovr.transform(X_scaled) 93 | yp = y_scaler.inverse_transform(pcovr.predict(X_scaled).reshape(-1, 1)) 94 | 95 | axes[0, i].scatter( 96 | T[:, 0], T[:, 1], s=50, alpha=0.8, c=y, cmap=cmapX, edgecolor="k" 97 | ) 98 | axes[0, i].set_title(r"$\alpha=$" + str(mixing)) 99 | axes[0, i].set_xlabel(r"$PC_1$") 100 | axes[0, i].set_xticks([]) 101 | axes[0, i].set_yticks([]) 102 | 103 | axes[1, i].scatter( 104 | y, yp, s=50, alpha=0.8, c=np.abs(y - yp), cmap=cmapy, edgecolor="k" 105 | ) 106 | axes[1, i].set_title(r"$\alpha=$" + str(mixing)) 107 | axes[1, i].set_xlabel("y") 108 | 109 | axes[0, 0].set_ylabel(r"$PC_2$") 110 | axes[1, 0].set_ylabel("Predicted y") 111 | 112 | fig.subplots_adjust(wspace=0, hspace=0.25) 113 | plt.show() 114 | 115 | # %% 116 | # 117 | # Construct a Kernel PCovR Map 118 | # ============================ 119 | # 120 | # Moving from PCovR to KernelPCovR is much like moving from PCA to KernelPCA in 121 | # ``sklearn``. Like KernelPCA, KernelPCovR can compute any pairwise kernel supported by 122 | # ``sklearn`` or operate on a precomputed kernel.
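| # As a minimal sketch of the precomputed route (assuming the ``precomputed``
| # option mirrors its behavior in ``KernelPCA``):
| #
| # .. code-block:: python
| #
| #     from sklearn.metrics.pairwise import rbf_kernel
| #
| #     K = rbf_kernel(X_scaled, gamma=0.1)
| #     kpcovr_pre = KernelPCovR(
| #         mixing=0.5,
| #         regressor=KernelRidge(alpha=1e-8, kernel="precomputed"),
| #         kernel="precomputed",
| #         n_components=2,
| #     )
| #     kpcovr_pre.fit(K, y_scaled)
| #     T_pre = kpcovr_pre.transform(K)
| #
| # Below, we instead let ``KernelPCovR`` build its RBF kernel internally.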
123 | 124 | 125 | mixing = 0.5 126 | kpcovr = KernelPCovR( 127 | mixing=mixing, 128 | regressor=KernelRidge( 129 | alpha=1e-8, 130 | kernel="rbf", 131 | gamma=0.1, 132 | ), 133 | kernel="rbf", 134 | gamma=0.1, 135 | n_components=2, 136 | ) 137 | kpcovr.fit(X_scaled, y_scaled) 138 | T = kpcovr.transform(X_scaled) 139 | yp = y_scaler.inverse_transform(kpcovr.predict(X_scaled).reshape(-1, 1)) 140 | 141 | fig, ((axT, axy), (caxT, caxy)) = plt.subplots( 142 | 2, 2, figsize=(8, 5), gridspec_kw=dict(height_ratios=(1, 0.1)) 143 | ) 144 | 145 | scatT = axT.scatter(T[:, 0], T[:, 1], s=50, alpha=0.8, c=y, cmap=cmapX, edgecolor="k") 146 | axT.set_xlabel(r"$PC_1$") 147 | axT.set_ylabel(r"$PC_2$") 148 | fig.colorbar(scatT, cax=caxT, label="y", orientation="horizontal") 149 | 150 | scaty = axy.scatter(y, yp, s=50, alpha=0.8, c=np.abs(y - yp), cmap=cmapy, edgecolor="k") 151 | axy.plot(axy.get_xlim(), axy.get_xlim(), "r--") 152 | fig.suptitle(r"$\alpha=$" + str(mixing)) 153 | 154 | axy.set_xlabel(r"$y$") 155 | axy.set_ylabel(r"Predicted $y$") 156 | fig.colorbar(scaty, cax=caxy, label="Error in y", orientation="horizontal") 157 | 158 | fig.tight_layout() 159 | 160 | # %% 161 | # 162 | # As you can see, the regression error has decreased considerably compared to the 163 | # linear case, meaning that the map on the left correlates better with the target 164 | # values. 165 | # 166 | # Note on KernelPCovR for Atoms, Molecules, and Structures 167 | # -------------------------------------------------------- 168 | # 169 | # When applying this to datasets involving collections of atoms and their atomic 170 | # descriptors, it is important to consider the nature of the property you are learning 171 | # and the samples you are comparing before constructing a kernel -- for example, whether 172 | # the analysis is to be based on whole structures or individual atomic environments. 173 | # For more detail, see Appendix C of 174 | # `Helfrecht 2020 `_ or, 175 | # regarding kernels involving gradients, 176 | # `Musil 2021 `_. 177 | -------------------------------------------------------------------------------- /examples/pcovr/PCovR_Regressors.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | """ 4 | Choosing Different Regressors for PCovR 5 | ======================================= 6 | """ 7 | # %% 8 | # 9 | import time 10 | 11 | from matplotlib import pyplot as plt 12 | from sklearn.datasets import load_diabetes 13 | from sklearn.linear_model import Ridge 14 | from sklearn.preprocessing import StandardScaler 15 | 16 | from skmatter.decomposition import PCovR 17 | 18 | 19 | # %% 20 | # 21 | # For this, we will use the :func:`sklearn.datasets.load_diabetes` dataset from 22 | # ``sklearn``. 23 | 24 | mixing = 0.5 25 | 26 | X, y = load_diabetes(return_X_y=True) 27 | 28 | X_scaler = StandardScaler() 29 | X_scaled = X_scaler.fit_transform(X) 30 | 31 | y_scaler = StandardScaler() 32 | y_scaled = y_scaler.fit_transform(y.reshape(-1, 1)) 33 | 34 | 35 | # %% 36 | # 37 | # Use the default regressor in PCovR 38 | # ---------------------------------- 39 | # 40 | # When there is no regressor supplied, PCovR uses 41 | # ``sklearn.linear_model.Ridge(alpha=1e-6, fit_intercept=False, tol=1e-12)``.
42 | 43 | pcovr1 = PCovR(mixing=mixing, n_components=2) 44 | 45 | t0 = time.perf_counter() 46 | pcovr1.fit(X_scaled, y_scaled) 47 | t1 = time.perf_counter() 48 | 49 | print(f"Regressor is {pcovr1.regressor_} and fit took {1e3 * (t1 - t0):0.2} ms.") 50 | 51 | 52 | # %% 53 | # 54 | # Use a fitted regressor 55 | # ---------------------- 56 | # 57 | # You can pass a fitted regressor to ``PCovR`` to rely on the predetermined regression 58 | # parameters. Currently, scikit-matter supports the ``scikit-learn`` classes 59 | # :class:`LinearModel `, :class:`Ridge 60 | # `, and :class:`RidgeCV `, 61 | # with plans to support any regressor with similar architecture in the future. 62 | 63 | regressor = Ridge(alpha=1e-6, fit_intercept=False, tol=1e-12) 64 | 65 | t0 = time.perf_counter() 66 | regressor.fit(X_scaled, y_scaled) 67 | t1 = time.perf_counter() 68 | 69 | print(f"Fit took {1e3 * (t1 - t0):0.2} ms.") 70 | 71 | 72 | # %% 73 | # 74 | 75 | pcovr2 = PCovR(mixing=mixing, n_components=2, regressor=regressor) 76 | 77 | t0 = time.perf_counter() 78 | pcovr2.fit(X_scaled, y_scaled) 79 | t1 = time.perf_counter() 80 | 81 | print(f"Regressor is {pcovr2.regressor_} and fit took {1e3 * (t1 - t0):0.2} ms.") 82 | 83 | # %% 84 | # 85 | # Use a pre-predicted y 86 | # --------------------- 87 | # 88 | # With ``regressor='precomputed'``, you can pass a regression output :math:`\hat{Y}` and 89 | # optional regression weights :math:`W` to PCovR. If ``W=None``, then PCovR will 90 | # determine :math:`W` as the least-squares solution between :math:`X` and 91 | # :math:`\hat{Y}`. 92 | 93 | regressor = Ridge(alpha=1e-6, fit_intercept=False, tol=1e-12) 94 | 95 | t0 = time.perf_counter() 96 | regressor.fit(X_scaled, y_scaled) 97 | t1 = time.perf_counter() 98 | 99 | print(f"Fit took {1e3 * (t1 - t0):0.2} ms.") 100 | 101 | W = regressor.coef_ 102 | 103 | # %% 104 | # 105 | 106 | pcovr3 = PCovR(mixing=mixing, n_components=2, regressor="precomputed") 107 | 108 | t0 = time.perf_counter() 109 | pcovr3.fit(X_scaled, y_scaled, W=W) 110 | t1 = time.perf_counter() 111 | 112 | print(f"Fit took {1e3 * (t1 - t0):0.2} ms.") 113 | 114 | # %% 115 | # 116 | # Comparing Results 117 | # ----------------- 118 | # 119 | # Because we used the same regressor in all three models, they will yield the same 120 | # result. 121 | 122 | fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(12, 4), sharex=True, sharey=True) 123 | 124 | ax1.scatter(*pcovr1.transform(X_scaled).T, c=y) 125 | ax2.scatter(*pcovr2.transform(X_scaled).T, c=y) 126 | ax3.scatter(*pcovr3.transform(X_scaled).T, c=y) 127 | 128 | ax1.set_ylabel("PCov$_2$") 129 | ax1.set_xlabel("PCov$_1$") 130 | ax2.set_xlabel("PCov$_1$") 131 | ax3.set_xlabel("PCov$_1$") 132 | 133 | ax1.set_title("Default Regressor") 134 | ax2.set_title("Pre-fit Regressor") 135 | ax3.set_title("Precomputed Regression Result") 136 | 137 | fig.show() 138 | 139 | # %% 140 | # 141 | # As you can imagine, these three options have different use cases -- if you 142 | # are working with a large dataset, you should always pre-fit to save on time!
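| # %%
| #
| # As a final sanity check that all three routes produced the same projection
| # (a sketch; each latent coordinate is defined only up to a sign, so we
| # compare absolute values):
|
| import numpy as np
|
| T1 = np.abs(pcovr1.transform(X_scaled))
| T2 = np.abs(pcovr2.transform(X_scaled))
| T3 = np.abs(pcovr3.transform(X_scaled))
| print(np.allclose(T1, T2), np.allclose(T2, T3))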
143 | -------------------------------------------------------------------------------- /examples/pcovr/PCovR_Scaling.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | """ 4 | The Importance of Data Scaling in PCovR / KernelPCovR 5 | ===================================================== 6 | """ 7 | # %% 8 | # 9 | 10 | import numpy as np 11 | from matplotlib import pyplot as plt 12 | from sklearn.datasets import load_diabetes 13 | from sklearn.preprocessing import StandardScaler 14 | 15 | from skmatter.decomposition import PCovR 16 | 17 | 18 | # %% 19 | # 20 | # In PCovR and KernelPCovR, we combine multiple aspects of the dataset, primarily 21 | # the features and targets. As such, the results largely depend on the relative 22 | # contributions of each aspect to the 23 | # mixed model. 24 | 25 | X, y = load_diabetes(return_X_y=True) 26 | 27 | # %% 28 | # 29 | # Take the diabetes dataset from ``sklearn``. In raw form, the magnitudes of the 30 | # features and targets are: 31 | 32 | print( 33 | "Norm of the features: %0.2f \nNorm of the targets: %0.2f" 34 | % (np.linalg.norm(X), np.linalg.norm(y)) 35 | ) 36 | 37 | # %% 38 | # 39 | # For the diabetes dataset, we can use the ``StandardScaler`` class from sklearn, 40 | # as the features and targets are independent. 41 | 42 | x_scaler = StandardScaler() 43 | y_scaler = StandardScaler() 44 | 45 | X_scaled = x_scaler.fit_transform(X) 46 | y_scaled = y_scaler.fit_transform(y.reshape(-1, 1)) 47 | 48 | # %% 49 | # 50 | # Looking at the results at ``mixing=0.5``, we see an especially large difference in the 51 | # latent-space projections. 52 | 53 | 54 | pcovr_unscaled = PCovR(mixing=0.5, n_components=4).fit(X, y) 55 | T_unscaled = pcovr_unscaled.transform(X) 56 | Yp_unscaled = pcovr_unscaled.predict(X) 57 | 58 | pcovr_scaled = PCovR(mixing=0.5, n_components=4).fit(X_scaled, y_scaled) 59 | T_scaled = pcovr_scaled.transform(X_scaled) 60 | Yp_scaled = y_scaler.inverse_transform(pcovr_scaled.predict(X_scaled)) 61 | 62 | fig, ((ax1_T, ax2_T), (ax1_Y, ax2_Y)) = plt.subplots(2, 2, figsize=(8, 10)) 63 | 64 | ax1_T.scatter(T_unscaled[:, 0], T_unscaled[:, 1], c=y, cmap="plasma", ec="k") 65 | ax1_T.set_xlabel("PCov1") 66 | ax1_T.set_ylabel("PCov2") 67 | ax1_T.set_title("Latent Projection\nWithout Scaling") 68 | 69 | ax2_T.scatter(T_scaled[:, 0], T_scaled[:, 1], c=y, cmap="plasma", ec="k") 70 | ax2_T.set_xlabel("PCov1") 71 | ax2_T.set_ylabel("PCov2") 72 | ax2_T.set_title("Latent Projection\nWith Scaling") 73 | 74 | ax1_Y.scatter(Yp_unscaled, y, c=np.abs(y - Yp_unscaled), cmap="bone_r", ec="k") 75 | ax1_Y.plot(ax1_Y.get_xlim(), ax1_Y.get_xlim(), "r--") 76 | ax1_Y.set_xlabel("Predicted Y, unscaled") 77 | ax1_Y.set_ylabel("True Y, unscaled") 78 | ax1_Y.set_title("Regression\nWithout Scaling") 79 | 80 | ax2_Y.scatter( 81 | Yp_scaled, y, c=np.abs(y.ravel() - Yp_scaled.ravel()), cmap="bone_r", ec="k" 82 | ) 83 | ax2_Y.plot(ax2_Y.get_xlim(), ax2_Y.get_xlim(), "r--") 84 | ax2_Y.set_xlabel("Predicted Y, unscaled") 85 | ax2_Y.set_ylabel("True Y, unscaled") 86 | ax2_Y.set_title("Regression\nWith Scaling") 87 | 88 | fig.subplots_adjust(hspace=0.5, wspace=0.3) 89 | 90 | # %% 91 | # 92 | # Also, we see that when the datasets are unscaled, the total loss (loss in recreating 93 | # the original dataset and regression loss) does not vary with ``mixing``, as expected.
94 | # Typically, the regression loss should *gradually* increase with ``mixing`` 95 | # (and vice-versa for the loss in reconstructing the original features). When the 96 | # inputs are not scaled, however, only in the case of ``mixing`` = 0 or 1 will the 97 | # losses drastically change, depending on which component is dominating the model. 98 | # Here, because the features dominate the model, this jump occurs as ``mixing`` goes to 99 | # 0. With the scaled inputs, there is still a jump when ``mixing > 0`` due to the change 100 | # in matrix rank. 101 | 102 | mixings = np.linspace(0, 1, 21) 103 | losses_unscaled = np.zeros((2, len(mixings))) 104 | losses_scaled = np.zeros((2, len(mixings))) 105 | 106 | nc = 4 107 | 108 | for mi, mixing in enumerate(mixings): 109 | pcovr_unscaled = PCovR(mixing=mixing, n_components=nc).fit(X, y) 110 | t_unscaled = pcovr_unscaled.transform(X) 111 | yp_unscaled = pcovr_unscaled.predict(T=t_unscaled) 112 | xr_unscaled = pcovr_unscaled.inverse_transform(t_unscaled) 113 | losses_unscaled[:, mi] = ( 114 | np.linalg.norm(xr_unscaled - X) ** 2.0 / np.linalg.norm(X) ** 2, 115 | np.linalg.norm(yp_unscaled - y) ** 2.0 / np.linalg.norm(y) ** 2, 116 | ) 117 | 118 | pcovr_scaled = PCovR(mixing=mixing, n_components=nc).fit(X_scaled, y_scaled) 119 | t_scaled = pcovr_scaled.transform(X_scaled) 120 | yp_scaled = pcovr_scaled.predict(T=t_scaled) 121 | xr_scaled = pcovr_scaled.inverse_transform(t_scaled) 122 | losses_scaled[:, mi] = ( 123 | np.linalg.norm(xr_scaled - X_scaled) ** 2.0 / np.linalg.norm(X_scaled) ** 2, 124 | np.linalg.norm(yp_scaled - y_scaled) ** 2.0 / np.linalg.norm(y_scaled) ** 2, 125 | ) 126 | 127 | fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(8, 4), sharey=True, sharex=True) 128 | ax1.plot(mixings, losses_unscaled[0], marker="o", label=r"$\ell_{X}$") 129 | ax1.plot(mixings, losses_unscaled[1], marker="o", label=r"$\ell_{Y}$") 130 | ax1.plot(mixings, np.sum(losses_unscaled, axis=0), marker="o", label=r"$\ell$") 131 | ax1.legend(fontsize=12) 132 | ax1.set_title("With Inputs Unscaled") 133 | ax1.set_xlabel(r"Mixing parameter $\alpha$") 134 | ax1.set_ylabel(r"Loss $\ell$") 135 | 136 | ax2.plot(mixings, losses_scaled[0], marker="o", label=r"$\ell_{X}$") 137 | ax2.plot(mixings, losses_scaled[1], marker="o", label=r"$\ell_{Y}$") 138 | ax2.plot(mixings, np.sum(losses_scaled, axis=0), marker="o", label=r"$\ell$") 139 | ax2.legend(fontsize=12) 140 | ax2.set_title("With Inputs Scaled") 141 | ax2.set_xlabel(r"Mixing parameter $\alpha$") 142 | ax2.set_ylabel(r"Loss $\ell$") 143 | 144 | fig.show() 145 | 146 | # %% 147 | # 148 | # **Note**: When the relative magnitude of the features or targets is important, such 149 | # as in :func:`skmatter.datasets.load_csd_1000r`, one should use the 150 | # :class:`skmatter.preprocessing.StandardFlexibleScaler`.
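| # %%
| #
| # A minimal sketch of that alternative: with ``column_wise=False``, the scaler
| # normalizes by a single global variance, preserving the relative magnitudes of
| # the columns.
|
| from skmatter.preprocessing import StandardFlexibleScaler
|
| X_sfs = StandardFlexibleScaler(column_wise=False).fit_transform(X)
| print("Norm of flexibly scaled features: %0.2f" % np.linalg.norm(X_sfs))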
151 | -------------------------------------------------------------------------------- /examples/pcovr/README.rst: -------------------------------------------------------------------------------- 1 | PCovR and KernelPCovR 2 | ===================== 3 | -------------------------------------------------------------------------------- /examples/reconstruction/PlotGFRE.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | """ 5 | Global Feature Reconstruction Error (GFRE) and Distortion (GFRD) 6 | ================================================================ 7 | Example for the usage of the :func:`skmatter.metrics.global_reconstruction_error` as 8 | the global feature reconstruction error (GFRE) and 9 | :func:`skmatter.metrics.global_reconstruction_distortion` as the global feature 10 | reconstruction distortion (GFRD). We apply the global reconstruction measures on the 11 | degenerate CH4 manifold dataset. This dataset was specifically constructed to be 12 | representable by 4-body features (bispectrum) but not by 3-body features (power 13 | spectrum). In other words, the dataset contains environments which are different but 14 | have the same 3-body features. For more details about the dataset please refer to 15 | `Pozdnyakov 2020 `_. 16 | 17 | The ``skmatter`` dataset already contains the 3- and 4-body features computed with 18 | `librascal `_ so we can load them and compare 19 | them with the GFRE/GFRD. 20 | """ 21 | # %% 22 | # 23 | 24 | import matplotlib as mpl 25 | import matplotlib.pyplot as plt 26 | import numpy as np 27 | 28 | from skmatter.datasets import load_degenerate_CH4_manifold 29 | from skmatter.metrics import ( 30 | global_reconstruction_distortion, 31 | global_reconstruction_error, 32 | ) 33 | 34 | 35 | mpl.rc("font", size=20) 36 | 37 | # load features 38 | degenerate_manifold = load_degenerate_CH4_manifold() 39 | power_spectrum_features = degenerate_manifold.data.SOAP_power_spectrum 40 | bispectrum_features = degenerate_manifold.data.SOAP_bispectrum 41 | 42 | # %% 43 | # 44 | 45 | gfre_matrix = np.zeros((2, 2)) 46 | print("Computing GFRE...") 47 | 48 | 49 | # reconstruction error of power spectrum features using power spectrum features 50 | gfre_matrix[0, 0] = global_reconstruction_error( 51 | power_spectrum_features, power_spectrum_features 52 | ) 53 | 54 | # reconstruction error of bispectrum features using power spectrum features 55 | gfre_matrix[0, 1] = global_reconstruction_error( 56 | power_spectrum_features, bispectrum_features 57 | ) 58 | 59 | 60 | # reconstruction error of power spectrum features using bispectrum features 61 | gfre_matrix[1, 0] = global_reconstruction_error( 62 | bispectrum_features, power_spectrum_features 63 | ) 64 | 65 | # reconstruction error of bispectrum features using bispectrum features 66 | gfre_matrix[1, 1] = global_reconstruction_error( 67 | bispectrum_features, bispectrum_features 68 | ) 69 | 70 | print("Computing GFRE finished.") 71 | 72 | 73 | # %% 74 | # 75 | 76 | 77 | gfrd_matrix = np.zeros((2, 2)) 78 | print("Computing GFRD...") 79 | 80 | 81 | # reconstruction distortion of power spectrum features using power spectrum features 82 | gfrd_matrix[0, 0] = global_reconstruction_distortion( 83 | power_spectrum_features, power_spectrum_features 84 | ) 85 | 86 | # reconstruction distortion of power spectrum features using bispectrum features 87 | gfrd_matrix[0, 1] = global_reconstruction_distortion( 88 | power_spectrum_features, bispectrum_features 89 | ) 90 | 91
| # reconstruction distortion of bispectrum features using power spectrum features 92 | gfrd_matrix[1, 0] = global_reconstruction_distortion( 93 | bispectrum_features, power_spectrum_features 94 | ) 95 | 96 | 97 | # reconstruction distortion of bispectrum features using bispectrum features 98 | gfrd_matrix[1, 1] = global_reconstruction_distortion( 99 | bispectrum_features, bispectrum_features 100 | ) 101 | 102 | print("Computing GFRD finished.") 103 | 104 | 105 | # %% 106 | # 107 | 108 | 109 | fig, (axGFRE, axGFRD, cbar_ax) = plt.subplots( 110 | 1, 111 | 3, 112 | figsize=(10, 4), 113 | gridspec_kw=dict(width_ratios=(1, 1, 0.2)), 114 | ) 115 | 116 | 117 | pcm1 = axGFRE.imshow(gfre_matrix, vmin=0, vmax=0.25) 118 | axGFRE.set_ylabel("F") 119 | axGFRE.set_xlabel("F'") 120 | axGFRE.set_title("GFRE(F, F')") 121 | 122 | axGFRE.set_xticks([0, 1]) 123 | axGFRE.set_xticklabels(["3-body", "4-body"]) 124 | axGFRE.set_yticks([0, 1]) 125 | axGFRE.set_yticklabels(["3-body", "4-body"]) 126 | 127 | pcm2 = axGFRD.imshow(gfrd_matrix, vmin=0, vmax=0.25) 128 | axGFRD.set_xlabel("F'") 129 | axGFRD.set_title("GFRD(F, F')") 130 | 131 | axGFRD.set_xticks([0, 1]) 132 | axGFRD.set_xticklabels(["3-body", "4-body"]) 133 | axGFRD.set_yticks([0, 1]) 134 | axGFRD.set_yticklabels(["", ""]) 135 | 136 | cbar = fig.colorbar(pcm2, cax=cbar_ax, label="GFRE or GFRD") 137 | plt.show() 138 | 139 | # %% 140 | # 141 | # It can be seen that the reconstruction error of the 4-body features from the 3-body 142 | # features, shown in the upper right corner of the left plot, is large, indicating that 143 | # the dataset contains 4-body information that cannot be well linearly reconstructed 144 | # using 3-body information. This is expected, since the dataset was specifically 145 | # designed for this purpose (for more information please read 146 | # `Pozdnyakov 2020 `_). On the other 147 | # hand, the 3-body features can be perfectly reconstructed from the 4-body features, 148 | # as seen in the lower left corner of the left plot. However, this reconstruction 149 | # distorts the 4-body features significantly, as seen in the lower left corner of the 150 | # right plot, which is typical behaviour of higher-order features and can also be 151 | # observed for polynomial kernel features. 152 | -------------------------------------------------------------------------------- /examples/reconstruction/PlotLFRE.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | """ 4 | Pointwise Local Reconstruction Error 5 | ==================================== 6 | Example for the usage of the 7 | :func:`skmatter.metrics.pointwise_local_reconstruction_error` as the pointwise local 8 | feature reconstruction error (LFRE) on the degenerate CH4 manifold. We apply the local 9 | reconstruction measure on the degenerate CH4 manifold dataset. This dataset was 10 | specifically constructed to be representable by 4-body features (bispectrum) but not 11 | by 3-body features (power spectrum). In other words, the dataset contains environments 12 | which are different but have the same 3-body features. For more details about the 13 | dataset please refer to `Pozdnyakov 2020 14 | `_. 15 | 16 | The skmatter dataset already contains the 3- and 4-body features computed with `librascal 17 | `_ so we can load them and compare them with the 18 | LFRE.
19 | """ 20 | # %% 21 | # 22 | 23 | 24 | import matplotlib as mpl 25 | import matplotlib.pyplot as plt 26 | import numpy as np 27 | 28 | from skmatter.datasets import load_degenerate_CH4_manifold 29 | from skmatter.metrics import pointwise_local_reconstruction_error 30 | 31 | 32 | mpl.rc("font", size=20) 33 | 34 | 35 | # load features 36 | degenerate_manifold = load_degenerate_CH4_manifold() 37 | power_spectrum_features = degenerate_manifold.data.SOAP_power_spectrum 38 | bispectrum_features = degenerate_manifold.data.SOAP_bispectrum 39 | 40 | print(degenerate_manifold.DESCR) 41 | 42 | 43 | # %% 44 | # 45 | 46 | 47 | n_local_points = 20 48 | 49 | print("Computing pointwise LFRE...") 50 | 51 | # %% 52 | 53 | # local reconstruction error of power spectrum features using bispectrum features 54 | power_spectrum_to_bispectrum_pointwise_lfre = pointwise_local_reconstruction_error( 55 | power_spectrum_features, 56 | bispectrum_features, 57 | n_local_points, 58 | train_idx=np.arange(0, len(power_spectrum_features), 2), 59 | test_idx=np.arange(0, len(power_spectrum_features)), 60 | estimator=None, 61 | n_jobs=4, 62 | ) 63 | 64 | # local reconstruction error of bispectrum features using power spectrum features 65 | bispectrum_to_power_spectrum_pointwise_lfre = pointwise_local_reconstruction_error( 66 | bispectrum_features, 67 | power_spectrum_features, 68 | n_local_points, 69 | train_idx=np.arange(0, len(power_spectrum_features), 2), 70 | test_idx=np.arange(0, len(power_spectrum_features)), 71 | estimator=None, 72 | n_jobs=4, 73 | ) 74 | 75 | print("Computing pointwise LFRE finished.") 76 | 77 | print( 78 | "LFRE(3-body, 4-body) = ", 79 | np.linalg.norm(power_spectrum_to_bispectrum_pointwise_lfre) 80 | / np.sqrt(len(power_spectrum_to_bispectrum_pointwise_lfre)), 81 | ) 82 | 83 | print( 84 | "LFRE(4-body, 3-body) = ", 85 | np.linalg.norm(bispectrum_to_power_spectrum_pointwise_lfre) 86 | / np.sqrt(len(power_spectrum_to_bispectrum_pointwise_lfre)), 87 | ) 88 | 89 | 90 | # %% 91 | # 92 | 93 | 94 | fig, (ax34, ax43) = plt.subplots( 95 | 1, 2, constrained_layout=True, figsize=(16, 7.5), sharey="row", sharex=True 96 | ) 97 | 98 | vmax = 0.5 99 | 100 | X, Y = np.meshgrid(np.linspace(0.7, 0.9, 9), np.linspace(-0.1, 0.1, 9)) 101 | pcm = ax34.contourf( 102 | X, 103 | Y, 104 | power_spectrum_to_bispectrum_pointwise_lfre[81:].reshape(9, 9).T, 105 | vmin=0, 106 | vmax=vmax, 107 | ) 108 | 109 | ax43.contourf( 110 | X, 111 | Y, 112 | bispectrum_to_power_spectrum_pointwise_lfre[81:].reshape(9, 9).T, 113 | vmin=0, 114 | vmax=vmax, 115 | ) 116 | 117 | ax34.axhline(y=0, color="red", linewidth=5) 118 | ax43.axhline(y=0, color="red", linewidth=5) 119 | ax34.set_ylabel(r"v/$\pi$") 120 | ax34.set_xlabel(r"u/$\pi$") 121 | ax43.set_xlabel(r"u/$\pi$") 122 | 123 | ax34.set_title(r"$X^-$ LFRE(3-body, 4-body)") 124 | ax43.set_title(r"$X^-$ LFRE(4-body, 3-body)") 125 | 126 | cbar = fig.colorbar(pcm, ax=[ax34, ax43], label="LFRE", location="bottom") 127 | 128 | plt.show() 129 | 130 | # %% 131 | # 132 | # The environments span a manifold which is described by the coordinates :math:`v/\pi` 133 | # and :math:`u/\pi` (please refer to 134 | # `Pozdnyakov 2020 `_ for a concrete 135 | # understanding of the manifold). The LFRE is presented for each environment in the 136 | # manifold in the two contour plots. It can be seen that the reconstruction error 137 | # of 4-body features using 3-body features (the left plot) is most significant along the 138 | # degenerate line (the horizontal red line). 
This agrees with the fact that the 3-body 139 | # features remain the same on the degenerate line and can therefore not reconstruct the 140 | # 4-body features. On the other hand, the 4-body features can perfectly reconstruct the 141 | # 3-body features, as seen in the right plot. 142 | -------------------------------------------------------------------------------- /examples/reconstruction/PlotPointwiseGFRE.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | """ 5 | Pointwise GFRE applied on RKHS features 6 | ======================================= 7 | Example for the usage of the 8 | :func:`skmatter.metrics.pointwise_global_reconstruction_error` as the pointwise global 9 | feature reconstruction error (pointwise GFRE). We apply the pointwise global feature 10 | reconstruction error on the degenerate CH4 manifold dataset containing 3- and 4-body 11 | features computed with `librascal `_. We will 12 | show that using reproducing kernel Hilbert space (RKHS) features can improve the quality 13 | of the reconstruction, with the downside of being less general. 14 | """ 15 | 16 | # %% 17 | # 18 | 19 | 20 | import matplotlib as mpl 21 | import matplotlib.pyplot as plt 22 | import numpy as np 23 | from sklearn.model_selection import train_test_split 24 | from sklearn.preprocessing import KernelCenterer 25 | 26 | from skmatter.datasets import load_degenerate_CH4_manifold 27 | from skmatter.metrics import ( 28 | global_reconstruction_error, 29 | pointwise_global_reconstruction_error, 30 | ) 31 | from skmatter.preprocessing import StandardFlexibleScaler 32 | 33 | 34 | mpl.rc("font", size=20) 35 | 36 | # load features 37 | degenerate_manifold = load_degenerate_CH4_manifold() 38 | power_spectrum_features = degenerate_manifold.data.SOAP_power_spectrum 39 | bispectrum_features = degenerate_manifold.data.SOAP_bispectrum 40 | 41 | # %% 42 | # 43 | # We compare 3-body features with their mapping to the reproducing kernel Hilbert space 44 | # (RKHS) projected to the sample space using the nonlinear radial basis function (RBF) 45 | # kernel 46 | # 47 | # .. math:: 48 | # k^{\textrm{RBF}}(\mathbf{x},\mathbf{x}') = 49 | # \exp(-\gamma \|\mathbf{x}-\mathbf{x}'\|^2),\quad \gamma\in\mathbb{R}_+ 50 | # 51 | # The projected RKHS features are computed using the eigendecomposition of the 52 | # positive-definite kernel matrix :math:`K` 53 | # 54 | # ..
math:: 55 | # K = ADA^T = AD^{\frac12}(AD^{\frac12})^T = \Phi\Phi^T 56 | 57 | 58 | def compute_standardized_rbf_rkhs_features(features, gamma): 59 | """Compute the standardized RBF RKHS features.""" 60 | # standardize features 61 | features = StandardFlexibleScaler().fit_transform(features) 62 | 63 | # compute the pairwise squared distances \|x - x'\|^2 64 | squared_distance = ( 65 | np.sum(features**2, axis=1)[:, np.newaxis] 66 | + np.sum(features**2, axis=1)[np.newaxis, :] 67 | - 2 * features.dot(features.T) 68 | ) 69 | # compute the RBF kernel 70 | kernel = np.exp(-gamma * squared_distance) 71 | 72 | # center kernel 73 | kernel = KernelCenterer().fit_transform(kernel) 74 | 75 | # compute D and A 76 | D, A = np.linalg.eigh(kernel) 77 | 78 | # retain features associated with an eigenvalue above 1e-9 for denoising 79 | select_idx = np.where(D > 1e-9)[0] 80 | 81 | # compute rkhs features 82 | rbf_rkhs_features = A[:, select_idx] @ np.diag(np.sqrt(D[select_idx])) 83 | 84 | # standardize the RKHS features; 85 | # this step could be omitted since it is done by the reconstruction measure by 86 | # default 87 | standardized_rbf_rkhs_features = StandardFlexibleScaler().fit_transform( 88 | rbf_rkhs_features 89 | ) 90 | return standardized_rbf_rkhs_features 91 | 92 | 93 | gamma = 1 94 | rbf_power_spectrum_features = compute_standardized_rbf_rkhs_features( 95 | power_spectrum_features, gamma=gamma 96 | ) 97 | 98 | # %% 99 | # 100 | 101 | # split into train and test idx 102 | idx = np.arange(len(power_spectrum_features)) 103 | 104 | train_idx, test_idx = train_test_split(idx, random_state=42) 105 | 106 | print("Computing pointwise GFRE...") 107 | 108 | # pointwise global reconstruction error of bispectrum features using power spectrum 109 | # features 110 | power_spectrum_to_bispectrum_pointwise_gfre = pointwise_global_reconstruction_error( 111 | power_spectrum_features, bispectrum_features, train_idx=train_idx, test_idx=test_idx 112 | ) 113 | 114 | # pointwise global reconstruction error of bispectrum features using power spectrum 115 | # features mapped to the RKHS 116 | power_spectrum_rbf_to_bispectrum_pointwise_gfre = pointwise_global_reconstruction_error( 117 | rbf_power_spectrum_features, 118 | bispectrum_features, 119 | train_idx=train_idx, 120 | test_idx=test_idx, 121 | ) 122 | 123 | print("Computing pointwise GFRE finished.") 124 | 125 | print("Computing GFRE...") 126 | 127 | # global reconstruction error of bispectrum features using power spectrum features 128 | power_spectrum_to_bispectrum_gfre = global_reconstruction_error( 129 | power_spectrum_features, bispectrum_features, train_idx=train_idx, test_idx=test_idx 130 | ) 131 | 132 | # global reconstruction error of bispectrum features using power spectrum features 133 | # mapped to the RKHS 134 | power_spectrum_rbf_to_bispectrum_gfre = global_reconstruction_error( 135 | rbf_power_spectrum_features, 136 | bispectrum_features, 137 | train_idx=train_idx, 138 | test_idx=test_idx, 139 | ) 140 | 141 | print("Computing GFRE finished.") 142 | 143 | 144 | # %% 145 | # 146 | 147 | fig, axes = plt.subplots(1, 1, figsize=(12, 7)) 148 | 149 | bins = np.linspace(0, 0.5, 10) 150 | axes.hist( 151 | power_spectrum_to_bispectrum_pointwise_gfre, 152 | bins, 153 | alpha=0.5, 154 | label="pointwise GFRE(3-body, 4-body)", 155 | ) 156 | axes.hist( 157 | power_spectrum_rbf_to_bispectrum_pointwise_gfre, 158 | bins, 159 | color="r", 160 | alpha=0.5, 161 | label="pointwise GFRE(3-body RBF, 4-body)", 162 | ) 163 | axes.axvline( 164 | power_spectrum_to_bispectrum_gfre, 165 | color="darkblue", 166 |
label="GFRE(3-body, 4-body)", 167 | linewidth=4, 168 | ) 169 | axes.axvline( 170 | power_spectrum_rbf_to_bispectrum_gfre, 171 | color="darkred", 172 | label="GFRE(3-body RBF RKHS, 4-body)", 173 | linewidth=4, 174 | ) 175 | axes.set_title(f"3-body vs 4-body RBF gamma={gamma} comparison") 176 | axes.set_xlabel("pointwise GFRE") 177 | axes.set_ylabel("number of samples") 178 | axes.legend(fontsize=13) 179 | plt.show() 180 | 181 | 182 | # %% 183 | # 184 | 185 | 186 | print("GFRE(3-body, 4-body) =", power_spectrum_to_bispectrum_gfre) 187 | print("GFRE(3-body RBF RKHS, 4-body) = ", power_spectrum_rbf_to_bispectrum_gfre) 188 | 189 | # %% 190 | # 191 | # It can be seen that RBF RKHS features improve the linear reconstruction of the 192 | # 4-body features (~0.22 in contrast to ~0.19) while also spreading the error for 193 | # individual samples across a wider span of [0, 0.45] in contrast to [0.17, 0.32]. 194 | # This indicates that the reconstruction using the RBF RKHS is less generally 195 | # applicable but instead specific to this dataset 196 | -------------------------------------------------------------------------------- /examples/reconstruction/README.rst: -------------------------------------------------------------------------------- 1 | Feature Reconstruction Measures 2 | =============================== 3 | -------------------------------------------------------------------------------- /examples/regression/README.rst: -------------------------------------------------------------------------------- 1 | Regression 2 | ========== 3 | -------------------------------------------------------------------------------- /examples/selection/FeatureSelection.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | """ 4 | PCovR-Inspired Feature Selection 5 | ================================ 6 | """ 7 | # %% 8 | # 9 | import numpy as np 10 | from matplotlib import cm 11 | from matplotlib import pyplot as plt 12 | from sklearn.linear_model import RidgeCV 13 | from sklearn.preprocessing import StandardScaler 14 | 15 | from skmatter.datasets import load_csd_1000r 16 | from skmatter.feature_selection import CUR, FPS, PCovCUR, PCovFPS 17 | from skmatter.preprocessing import StandardFlexibleScaler 18 | 19 | 20 | cmap = cm.brg 21 | 22 | # %% 23 | # 24 | # For this, we will use the provided CSD dataset, which has 100 features to select from. 25 | 26 | X, y = load_csd_1000r(return_X_y=True) 27 | X = StandardFlexibleScaler(column_wise=False).fit_transform(X) 28 | y = StandardScaler().fit_transform(y.reshape(X.shape[0], -1)) 29 | 30 | 31 | # %% 32 | # 33 | 34 | n = X.shape[-1] // 2 35 | lr = RidgeCV(cv=2, alphas=np.logspace(-10, 1), fit_intercept=False) 36 | 37 | # %% 38 | # 39 | # Feature Selection with CUR + PCovR 40 | # ---------------------------------- 41 | # 42 | # First, let's demonstrate CUR feature selection, and show the ten features chosen with 43 | # a mixing parameter of 0.0, 0.5, and 1.0 perform. 
44 | 45 | for m in np.arange(0, 1.01, 0.5, dtype=np.float32): 46 | if m < 1.0: 47 | idx = PCovCUR(mixing=m, n_to_select=n).fit(X, y).selected_idx_ 48 | else: 49 | idx = CUR(n_to_select=n).fit(X, y).selected_idx_ 50 | 51 | plt.loglog( 52 | range(1, n + 1), 53 | np.array( 54 | [ 55 | lr.fit(X[:, idx[: ni + 1]], y).score(X[:, idx[: ni + 1]], y) 56 | for ni in range(n) 57 | ] 58 | ), 59 | label=m, 60 | c=cmap(m), 61 | marker="o", 62 | ) 63 | 64 | plt.xlabel("Number of Features Selected") 65 | plt.ylabel(r"$R^2$") 66 | plt.legend(title="Mixing \nParameter") 67 | plt.show() 68 | 69 | # %% 70 | # 71 | # Non-iterative feature selection with CUR + PCovR 72 | # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 73 | # 74 | # Computing a non-iterative CUR is more efficient, although it can result in poorer 75 | # performance for larger datasets. You can also use a greater number of 76 | # eigenvectors to compute the feature importance by varying ``k``; for optimal 77 | # results, ``k`` should not exceed the number of targets. 78 | 79 | m = 0.0 80 | 81 | idx = PCovCUR(mixing=m, n_to_select=n).fit(X, y).selected_idx_ 82 | idx_non_it = PCovCUR(mixing=m, recompute_every=0, n_to_select=n).fit(X, y).selected_idx_ 83 | 84 | plt.loglog( 85 | range(1, n + 1), 86 | np.array( 87 | [ 88 | lr.fit(X[:, idx[: ni + 1]], y).score(X[:, idx[: ni + 1]], y) 89 | for ni in range(n) 90 | ] 91 | ), 92 | label="Iterative", 93 | marker="o", 94 | ) 95 | plt.loglog( 96 | range(1, n + 1), 97 | np.array( 98 | [ 99 | lr.fit(X[:, idx_non_it[: ni + 1]], y).score(X[:, idx_non_it[: ni + 1]], y) 100 | for ni in range(n) 101 | ] 102 | ), 103 | label="Non-Iterative", 104 | marker="s", 105 | ) 106 | 107 | plt.xlabel("Number of Features Selected") 108 | plt.ylabel(r"$R^2$") 109 | plt.legend() 110 | plt.show() 111 | 112 | # %% 113 | # 114 | # Feature Selection with FPS + PCovR 115 | # ---------------------------------- 116 | # 117 | # Next, let's look at FPS. We'll choose the first index from CUR at m = 0, which is 46.
118 | 119 | 120 | for m in np.arange(0, 1.01, 0.5, dtype=np.float32): 121 | if m < 1.0: 122 | idx = PCovFPS(mixing=m, n_to_select=n, initialize=46).fit(X, y).selected_idx_ 123 | else: 124 | idx = FPS(n_to_select=n, initialize=46).fit(X, y).selected_idx_ 125 | 126 | plt.loglog( 127 | range(1, n + 1), 128 | np.array( 129 | [ 130 | lr.fit(X[:, idx[: ni + 1]], y).score(X[:, idx[: ni + 1]], y) 131 | for ni in range(n) 132 | ] 133 | ), 134 | label=m, 135 | c=cmap(m), 136 | marker="o", 137 | ) 138 | 139 | plt.xlabel("Number of Features Selected") 140 | plt.ylabel(r"$R^2$") 141 | plt.legend(title="Mixing \nParameter") 142 | plt.show() 143 | -------------------------------------------------------------------------------- /examples/selection/README.rst: -------------------------------------------------------------------------------- 1 | Feature and Sample Selection 2 | ============================ 3 | -------------------------------------------------------------------------------- /examples/selection/Selectors-Pipelines.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | """ 5 | Using scikit-matter selectors with scikit-learn pipelines 6 | ========================================================= 7 | """ 8 | 9 | # %% 10 | # 11 | 12 | 13 | import numpy as np 14 | from matplotlib import pyplot as plt 15 | from sklearn.datasets import load_diabetes 16 | from sklearn.linear_model import RidgeCV 17 | from sklearn.model_selection import train_test_split 18 | from sklearn.pipeline import Pipeline 19 | from sklearn.preprocessing import StandardScaler 20 | 21 | from skmatter.feature_selection import CUR, FPS 22 | 23 | 24 | # %% 25 | # 26 | # Simple integration of scikit-matter selectors 27 | # --------------------------------------------- 28 | # 29 | # This example shows how to use FPS to subselect features before training a RidgeCV. 30 | 31 | 32 | scaler = StandardScaler() 33 | selector = FPS(n_to_select=4) 34 | ridge = RidgeCV(cv=2, alphas=np.logspace(-8, 2, 10)) 35 | 36 | X, y = load_diabetes(return_X_y=True) 37 | 38 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) 39 | 40 | pipe = Pipeline([("scaler", scaler), ("selector", selector), ("ridge", ridge)]) 41 | pipe.fit(X_train.copy(), y_train.copy()) 42 | 43 | plt.scatter(y_test, pipe.predict(X_test)) 44 | plt.gca().set_aspect("equal") 45 | plt.plot(plt.xlim(), plt.xlim(), "r--") 46 | plt.xlabel("True Values") 47 | plt.ylabel("Predicted Values") 48 | plt.show() 49 | 50 | 51 | # %% 52 | # 53 | # Stacking selectors one after another 54 | # ------------------------------------ 55 | # 56 | # This example shows how to apply an FPS selector followed by a CUR selector 57 | # to subselect features before training a RidgeCV.
58 | 59 | 60 | scaler = StandardScaler() 61 | fps = FPS(n_to_select=8) 62 | cur = CUR(n_to_select=4) 63 | ridge = RidgeCV(cv=2, alphas=np.logspace(-8, 2, 10)) 64 | 65 | X, y = load_diabetes(return_X_y=True) 66 | 67 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) 68 | 69 | pipe = Pipeline( 70 | [("scaler", scaler), ("selector1", fps), ("selector2", cur), ("ridge", ridge)] 71 | ) 72 | pipe.fit(X_train.copy(), y_train.copy()) 73 | 74 | plt.scatter(y_test, pipe.predict(X_test)) 75 | plt.gca().set_aspect("equal") 76 | plt.plot(plt.xlim(), plt.xlim(), "r--") 77 | plt.xlabel("True Values") 78 | plt.ylabel("Predicted Values") 79 | plt.show() 80 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ 3 | "setuptools", 4 | "wheel", 5 | ] 6 | build-backend = "setuptools.build_meta" 7 | 8 | [project] 9 | name = "skmatter" 10 | description = "A collection of scikit-learn compatible utilities that implement methods born out of the materials science and chemistry communities." 11 | authors = [ 12 | {name = "Rose K. Cersonsky", email="rose.cersonsky@wisc.edu"}, 13 | {name = "Guillaume Fraux"}, 14 | {name = "Sergei Kliavinek"}, 15 | {name = "Alexander Goscinski"}, 16 | {name = "Benjamin A. Helfrecht"}, 17 | {name = "Victor P. Principe"}, 18 | {name = "Philip Loche"}, 19 | {name = "Michele Ceriotti"} 20 | ] 21 | readme = "README.rst" 22 | requires-python = ">=3.10" 23 | license = {text = "BSD-3-Clause"} 24 | classifiers = [ 25 | "Development Status :: 4 - Beta", 26 | "Environment :: Console", 27 | "Intended Audience :: Science/Research", 28 | "License :: OSI Approved :: BSD License", 29 | "Natural Language :: English", 30 | "Operating System :: POSIX", 31 | "Operating System :: MacOS :: MacOS X", 32 | "Operating System :: Microsoft :: Windows", 33 | "Programming Language :: Python :: 3", 34 | "Programming Language :: Python :: 3.10", 35 | "Programming Language :: Python :: 3.11", 36 | "Programming Language :: Python :: 3.12", 37 | "Programming Language :: Python :: 3.13", 38 | "Topic :: Scientific/Engineering", 39 | ] 40 | dependencies = [ 41 | "scikit-learn >= 1.6.0", 42 | "scipy >= 1.15.0", # explicit to adhere to scikit-learn dependencies 43 | ] 44 | dynamic = ["version"] 45 | 46 | [project.optional-dependencies] 47 | examples = [ 48 | "matplotlib", 49 | "pandas", 50 | "tqdm", 51 | ] 52 | 53 | [project.urls] 54 | homepage = "http://scikit-matter.readthedocs.io" 55 | documentation = "http://scikit-matter.readthedocs.io" 56 | repository = "https://github.com/scikit-learn-contrib/scikit-matter" 57 | issues = "https://github.com/scikit-learn-contrib/scikit-matter/issues" 58 | changelog = "http://scikit-matter.readthedocs.io/en/latest/changelog.html" 59 | 60 | [tool.setuptools.packages.find] 61 | where = ["src"] 62 | 63 | [tool.setuptools.dynamic] 64 | version = {attr = "skmatter.__version__"} 65 | 66 | [tool.coverage.run] 67 | branch = true 68 | data_file = 'tests/.coverage' 69 | 70 | [tool.coverage.report] 71 | include = [ 72 | "src/skmatter/*" 73 | ] 74 | 75 | [tool.coverage.xml] 76 | output = 'tests/coverage.xml' 77 | 78 | [tool.isort] 79 | skip = "__init__.py" 80 | profile = "black" 81 | line_length = 88 82 | indent = 4 83 | include_trailing_comma = true 84 | lines_after_imports = 2 85 | known_first_party = "skmatter" 86 | 87 | [tool.pytest.ini_options] 88 | testpaths = ["tests"] 89 | addopts = [ 90 | "--cov", 91 |
"--cov-append", 92 | "--cov-report=", 93 | "--import-mode=append", 94 | ] 95 | 96 | [tool.ruff] 97 | exclude = ["docs/src/examples/"] 98 | lint.ignore = [ 99 | "F401", 100 | "E203", 101 | "D100", 102 | "D101", 103 | "D102", 104 | "D205", 105 | "D400", 106 | "D401", 107 | ] 108 | line-length = 88 109 | lint.select = [ 110 | "D", 111 | "E", 112 | "F", 113 | "W", 114 | ] 115 | 116 | [tool.ruff.lint.pydocstyle] 117 | convention = "numpy" 118 | 119 | [tool.ruff.lint.per-file-ignores] 120 | "examples/**" = [ 121 | "D205", 122 | "D400", 123 | ] 124 | -------------------------------------------------------------------------------- /src/skmatter/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | scikit-matter 3 | ============= 4 | 5 | scikit-matter is a toolbox of methods developed in the computational chemical and 6 | materials science community, following the `scikit-learn `_ API and 7 | coding guidelines to promote usability and interoperability with existing workflows. 8 | """ 9 | 10 | __version__ = "0.3.0-dev" 11 | -------------------------------------------------------------------------------- /src/skmatter/clustering/__init__.py: -------------------------------------------------------------------------------- 1 | r""" 2 | The module implements the quick shift clustering algorithm, which is used in 3 | probabilistic analysis of molecular motifs (PAMM). See `Gasparotto and Ceriotti 4 | `_ for more details. 5 | """ 6 | 7 | from ._quick_shift import QuickShift 8 | 9 | __all__ = [ 10 | "QuickShift", 11 | ] 12 | -------------------------------------------------------------------------------- /src/skmatter/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | """Datasets used for example and testing.""" 2 | 3 | from ._base import ( 4 | load_csd_1000r, 5 | load_degenerate_CH4_manifold, 6 | load_hbond_dataset, 7 | load_nice_dataset, 8 | load_roy_dataset, 9 | load_who_dataset, 10 | ) 11 | 12 | 13 | __all__ = [ 14 | "load_degenerate_CH4_manifold", 15 | "load_csd_1000r", 16 | "load_hbond_dataset", 17 | "load_nice_dataset", 18 | "load_roy_dataset", 19 | "load_who_dataset", 20 | ] 21 | -------------------------------------------------------------------------------- /src/skmatter/datasets/_base.py: -------------------------------------------------------------------------------- 1 | from os.path import dirname, join 2 | 3 | import numpy as np 4 | import sklearn 5 | 6 | 7 | if sklearn.__version__ >= "1.5.0": 8 | from sklearn.utils._optional_dependencies import check_pandas_support 9 | else: 10 | from sklearn.utils import check_pandas_support 11 | 12 | from sklearn.utils import Bunch 13 | 14 | 15 | def load_nice_dataset(): 16 | """Load and returns NICE dataset. 17 | 18 | Returns 19 | ------- 20 | nice_data : sklearn.utils.Bunch 21 | Dictionary-like object, with the following attributes: 22 | data : `sklearn.utils.Bunch` -- 23 | contains the keys ``X`` and ``y``. 24 | Structural NICE features and energies, respectively. 25 | DESCR: `str` -- 26 | The full description of the dataset. 
27 | """ 28 | module_path = dirname(__file__) 29 | target_filename = join(module_path, "data", "nice_dataset.npz") 30 | raw_data = np.load(target_filename) 31 | data = Bunch( 32 | X=raw_data["structural_features"], 33 | y=raw_data["energies"], 34 | ) 35 | with open(join(module_path, "descr", "nice_dataset.rst")) as rst_file: 36 | fdescr = rst_file.read() 37 | return Bunch(data=data, DESCR=fdescr) 38 | 39 | 40 | def load_degenerate_CH4_manifold(): 41 | """Load and return the degenerate manifold dataset. 42 | 43 | Returns 44 | ------- 45 | degenerate_CH4_manifold_data : sklearn.utils.Bunch 46 | Dictionary-like object, with the following attributes: 47 | 48 | data : `sklearn.utils.Bunch` -- 49 | contains the keys ``SOAP_power_spectrum`` and ``SOAP_bispectrum``. 50 | Two representations of the carbon environments of the 51 | degenerate manifold dataset. 52 | 53 | DESCR: `str` -- 54 | The full description of the dataset. 55 | """ 56 | module_path = dirname(__file__) 57 | target_filename = join(module_path, "data", "degenerate_CH4_manifold.npz") 58 | raw_data = np.load(target_filename) 59 | data = Bunch( 60 | SOAP_power_spectrum=raw_data["SOAP_power_spectrum"], 61 | SOAP_bispectrum=raw_data["SOAP_bispectrum"], 62 | ) 63 | with open(join(module_path, "descr", "degenerate_CH4_manifold.rst")) as rst_file: 64 | fdescr = rst_file.read() 65 | 66 | return Bunch(data=data, DESCR=fdescr) 67 | 68 | 69 | def load_csd_1000r(return_X_y=False): 70 | """Load and return the minimal CSD dataset. 71 | 72 | Returns 73 | ------- 74 | csd1000r : sklearn.utils.Bunch 75 | Dictionary-like object, with the following attributes: 76 | 77 | data : `sklearn.utils.Bunch` -- 78 | contains the keys ``X`` and ``Y``, corresponding to the 79 | FPS-reduced SOAP vectors and local NMR chemical shielding, respectively, 80 | for 100 selected environments of the CSD-1000r dataset. 81 | 82 | DESCR: `str` -- 83 | The full description of the dataset. 84 | """ 85 | module_path = dirname(__file__) 86 | target_filename = join(module_path, "data", "csd-1000r.npz") 87 | raw_data = np.load(target_filename) 88 | if not return_X_y: 89 | data = Bunch( 90 | X=raw_data["X"], 91 | y=raw_data["Y"], 92 | ) 93 | with open(join(module_path, "descr", "csd-1000r.rst")) as rst_file: 94 | fdescr = rst_file.read() 95 | 96 | return Bunch(data=data, DESCR=fdescr) 97 | else: 98 | return raw_data["X"], raw_data["Y"] 99 | 100 | 101 | def load_who_dataset(): 102 | """Load and returns WHO dataset. 103 | 104 | Returns 105 | ------- 106 | who_dataset : sklearn.utils.Bunch 107 | Dictionary-like object, with the following attributes: 108 | data : `pandas.core.frame.DataFrame` -- the WHO dataset 109 | as a Pandas dataframe. 110 | DESCR: `str` -- The full description of the dataset. 111 | """ 112 | module_path = dirname(__file__) 113 | target_filename = join(module_path, "data", "who_dataset.csv") 114 | pd = check_pandas_support("load_who_dataset") 115 | raw_data = pd.read_csv(target_filename) 116 | with open(join(module_path, "descr", "who_dataset.rst")) as rst_file: 117 | fdescr = rst_file.read() 118 | return Bunch(data=raw_data, DESCR=fdescr) 119 | 120 | 121 | def load_roy_dataset(): 122 | """Load and returns the ROY dataset, which contains densities, 123 | energies and SOAP-derived descriptors for 264 structures of polymorphs of ROY, 124 | from [Beran et Al, Chemical Science (2022)](https://doi.org/10.1039/D1SC06074K) 125 | Each structure is labeled as "Known" or "Unknown". 
138 | 
139 |     Returns
140 |     -------
141 |     roy_dataset : sklearn.utils.Bunch
142 |         Dictionary-like object, with the following attributes:
143 |             densities : `np.array` -- the densities of the structures
144 |             structure_types : `np.array` -- the types of the structures
145 |             features : `np.array` -- SOAP-derived descriptors for the structures
146 |             energies : `np.array` -- energies of the structures
147 |     """
148 |     module_path = dirname(__file__)
149 |     target_properties = join(module_path, "data", "beran_roy_properties.npz")
150 |     properties = np.load(target_properties)
151 | 
152 |     return Bunch(
153 |         densities=properties["densities"],
154 |         energies=properties["energies"],
155 |         structure_types=properties["structure_types"],
156 |         features=properties["feats"],
157 |     )
158 | 
159 | 
160 | def load_hbond_dataset():
161 |     """Load and return the hydrogen bond dataset, which contains
162 |     a set of 3D descriptors for 27233 hydrogen bonds and corresponding
163 |     weights, from `Gasparotto et al., The Journal of Chemical Physics
164 |     <https://doi.org/10.1063/1.4900655>`_.
165 | 
166 |     Returns
167 |     -------
168 |     hbond_dataset : sklearn.utils.Bunch
169 |         Dictionary-like object, with the following attributes:
170 |             descriptors : `numpy.ndarray` -- the descriptors of the hydrogen bond dataset
171 |             weights : `numpy.ndarray` -- the weights of each sample in the dataset
172 |     """
173 |     module_path = dirname(__file__)
174 |     target_filename = join(module_path, "data", "h2o-blyp-piglet.npz")
175 |     raw_data = np.load(target_filename)
176 | 
177 |     with open(join(module_path, "descr", "h2o-blyp-piglet.rst")) as rst_file:
178 |         fdescr = rst_file.read()
179 | 
180 |     return Bunch(
181 |         descriptors=raw_data["descriptors"],
182 |         weights=raw_data["weights"],
183 |         DESCR=fdescr,
184 |     )
185 | 
--------------------------------------------------------------------------------
/src/skmatter/datasets/data/beran_roy_properties.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/scikit-matter/77fb4eac5ba9ebe7c1c17bf6b82d41c02dd57787/src/skmatter/datasets/data/beran_roy_properties.npz
--------------------------------------------------------------------------------
/src/skmatter/datasets/data/csd-1000r.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/scikit-matter/77fb4eac5ba9ebe7c1c17bf6b82d41c02dd57787/src/skmatter/datasets/data/csd-1000r.npz
--------------------------------------------------------------------------------
/src/skmatter/datasets/data/degenerate_CH4_manifold.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/scikit-matter/77fb4eac5ba9ebe7c1c17bf6b82d41c02dd57787/src/skmatter/datasets/data/degenerate_CH4_manifold.npz
--------------------------------------------------------------------------------
/src/skmatter/datasets/data/h2o-blyp-piglet.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/scikit-matter/77fb4eac5ba9ebe7c1c17bf6b82d41c02dd57787/src/skmatter/datasets/data/h2o-blyp-piglet.npz
--------------------------------------------------------------------------------
/src/skmatter/datasets/data/nice_dataset.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/scikit-matter/77fb4eac5ba9ebe7c1c17bf6b82d41c02dd57787/src/skmatter/datasets/data/nice_dataset.npz -------------------------------------------------------------------------------- /src/skmatter/datasets/descr/csd-1000r.rst: -------------------------------------------------------------------------------- 1 | .. _csd: 2 | 3 | CSD-1000R 4 | ######### 5 | 6 | This dataset, intended for model testing, contains the SOAP power spectrum features and 7 | local NMR chemical shieldings for 100 environments selected from CSD-1000r, originally 8 | published in [Ceriotti2019]_. 9 | 10 | Function Call 11 | ------------- 12 | 13 | .. function:: skmatter.datasets.load_csd_1000r 14 | 15 | Data Set Characteristics 16 | ------------------------ 17 | 18 | :Number of Instances: Each representation 100 19 | 20 | :Number of Features: Each representation 100 21 | 22 | The representations were computed with [C1]_ using the hyperparameters: 23 | 24 | :rascal hyperparameters: 25 | 26 | +---------------------------+------------+ 27 | | key | value | 28 | +===========================+============+ 29 | | interaction_cutoff: | 3.5 | 30 | +---------------------------+------------+ 31 | | max_radial: | 6 | 32 | +---------------------------+------------+ 33 | | max_angular: | 6 | 34 | +---------------------------+------------+ 35 | | gaussian_sigma_constant: | 0.4 | 36 | +---------------------------+------------+ 37 | | gaussian_sigma_type: | "Constant"| 38 | +---------------------------+------------+ 39 | | cutoff_smooth_width: | 0.5 | 40 | +---------------------------+------------+ 41 | | normalize: | True | 42 | +---------------------------+------------+ 43 | 44 | Of the 2'520 resulting features, 100 were selected via FPS using [C2]_. 45 | 46 | Chemical Properties 47 | ------------------- 48 | 49 | The CSD-1000R dataset consists of 100 atomic environments selected from crystal 50 | structures in the Cambridge Structural Database (CSD) [C3]_. These environments 51 | represent a diverse set of chemical compositions and bonding types, including: 52 | 53 | - Metals, metalloids, and non-metals 54 | - Covalent, ionic, and metallic bonding environments 55 | - Various coordination numbers and geometries 56 | 57 | The dataset captures local chemical environments relevant for modeling properties 58 | such as nuclear magnetic resonance (NMR) chemical shieldings, aiding in the 59 | understanding of structure-property relationships in materials chemistry. 60 | 61 | For more detailed chemical information, users can refer to the original Cambridge 62 | Structural Database [C3]_ or the publication by Ceriotti et al. (2019) [C4]_. 63 | 64 | References 65 | ---------- 66 | 67 | .. [C1] https://github.com/lab-cosmo/librascal commit ade202a6 68 | .. [C2] https://github.com/lab-cosmo/scikit-matter commit 4ed1d92 69 | .. [C3] https://www.ccdc.cam.ac.uk/structures/ 70 | .. [C4] https://www.nature.com/articles/s41597-019-0224-1 71 | 72 | Reference Code 73 | -------------- 74 | 75 | .. 
code-block:: python
76 | 
77 |     # NOTE: imports added for completeness -- the original script assumed
78 |     # ASE for I/O and librascal (see [C1]_) for the SOAP features
79 |     import numpy as np
80 |     from ase.io import read
81 |     from rascal.representations import SphericalInvariants as SOAP
82 | 
83 |     from skmatter.preprocessing import StandardFlexibleScaler
84 |     from skmatter.sample_selection import FPS
85 | 
86 |     # read all of the frames and book-keep the centers and species
87 |     filename = "/path/to/CSD-1000R.xyz"
88 |     frames = np.asarray(
89 |         read(filename, ":"),
90 |         dtype=object,
91 |     )
92 | 
93 |     n_centers = np.array([len(frame) for frame in frames])
94 |     center_idx = np.array([i for i, f in enumerate(frames) for _ in f])
95 |     n_env_accum = np.zeros(len(frames) + 1, dtype=int)
96 |     n_env_accum[1:] = np.cumsum(n_centers)
97 | 
98 |     numbers = np.concatenate([frame.numbers for frame in frames])
99 | 
100 |     # compute radial soap vectors as first pass
101 |     hypers = dict(
102 |         soap_type="PowerSpectrum",
103 |         interaction_cutoff=2.5,
104 |         max_radial=6,
105 |         max_angular=0,
106 |         gaussian_sigma_type="Constant",
107 |         gaussian_sigma_constant=0.4,
108 |         cutoff_smooth_width=0.5,
109 |         normalize=False,
110 |         global_species=[1, 6, 7, 8],
111 |         expansion_by_species_method="user defined",
112 |     )
113 |     soap = SOAP(**hypers)
114 | 
115 |     X_raw = StandardFlexibleScaler(column_wise=False).fit_transform(
116 |         soap.transform(frames).get_features(soap)
117 |     )
118 | 
119 |     # rank the environments in terms of diversity
120 |     n_samples = 500
121 |     i_selected = FPS(n_to_select=n_samples, initialize=0).fit(X_raw).selected_idx_
122 | 
123 |     # book-keep which frames these samples belong in
124 |     f_selected = center_idx[i_selected]
125 |     reduced_f_selected = list(sorted(set(f_selected)))
126 |     frames_selected = frames[f_selected].copy()
127 |     ci_selected = i_selected - n_env_accum[f_selected]
128 | 
129 |     properties_select = [
130 |         frames[fi].arrays["CS_local"][ci] for fi, ci in zip(f_selected, ci_selected)
131 |     ]
132 | 
--------------------------------------------------------------------------------
/src/skmatter/datasets/descr/degenerate_CH4_manifold.rst:
--------------------------------------------------------------------------------
1 | .. _degenerate_manifold:
2 | 
3 | Degenerate CH4 manifold
4 | #######################
5 | 
6 | The dataset contains two representations (SOAP power spectrum and bispectrum) of the
7 | two manifolds spanned by the carbon atoms of two times 81 methane structures. In the
8 | SOAP power spectrum representation the two manifolds intersect, creating a degenerate
9 | manifold/line along which the representation remains the same. In contrast, for
10 | higher body-order representations such as the (SOAP) bispectrum, the carbon atoms
11 | can be uniquely represented and do not create a degenerate manifold. Following the
12 | naming convention of [Pozdnyakov2020]_, for each representation the first 81 samples
13 | correspond to the X minus manifold and the second 81 samples contain the X plus
14 | manifold.
15 | 
16 | Function Call
17 | -------------
18 | 
19 | .. function:: skmatter.datasets.load_degenerate_CH4_manifold
20 | 
21 | Data Set Characteristics
22 | ------------------------
23 | 
24 | :Number of Instances: Each representation 162
25 | 
26 | :Number of Features: Each representation 12
27 | 
28 | The representations were computed with [D1]_ using the hyperparameters:
29 | 
30 | :rascal hyperparameters:
31 | 
32 | +---------------------------+------------+
33 | | key                       | value      |
34 | +===========================+============+
35 | | radial_basis:             | "GTO"      |
36 | +---------------------------+------------+
37 | | interaction_cutoff:       | 4          |
38 | +---------------------------+------------+
39 | | max_radial:               | 2          |
40 | +---------------------------+------------+
41 | | max_angular:              | 2          |
42 | +---------------------------+------------+
43 | | gaussian_sigma_constant:  | 0.5        |
44 | +---------------------------+------------+
45 | | gaussian_sigma_type:      | "Constant" |
46 | +---------------------------+------------+
47 | | cutoff_smooth_width:      | 0.5        |
48 | +---------------------------+------------+
49 | | normalize:                | False      |
50 | +---------------------------+------------+
51 | 
52 | The SOAP bispectrum features were in addition reduced to 12 features with principal
53 | component analysis (PCA) [D2]_.
54 | 
55 | References
56 | ----------
57 | 
58 | .. [D1] https://github.com/lab-cosmo/librascal commit 8d9ad7a
59 | .. [D2] https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html
60 | 
--------------------------------------------------------------------------------
/src/skmatter/datasets/descr/h2o-blyp-piglet.rst:
--------------------------------------------------------------------------------
1 | .. _water:
2 | 
3 | H2O-BLYP-Piglet
4 | ###############
5 | 
6 | This dataset contains 27233 hydrogen bond descriptors and corresponding weights from a
7 | trajectory of a classical simulation performed with a BLYP exchange-correlation
8 | functional and a DZVP basis set. The simulation box contained 64 water molecules. This
9 | dataset was originally published in
10 | [Gasparotto2014]_.
11 | 
12 | Function Call
13 | -------------
14 | 
15 | .. function:: skmatter.datasets.load_hbond_dataset
16 | 
17 | Data Set Characteristics
18 | ------------------------
19 | 
20 | :Number of Instances: 27233
21 | 
22 | :Number of Features: 3
23 | 
24 | Reference
25 | ---------
26 | 
27 | [1] https://github.com/lab-cosmo/pamm/tree/master/examples/water
28 | 
29 | Reference Code
30 | --------------
31 | 
32 | [2] https://github.com/GardevoirX/pypamm/blob/master/tutorials/water/tutorial.ipynb
33 | 
34 | [3] https://github.com/lab-cosmo/pamm/blob/master/examples/water/README
35 | 
--------------------------------------------------------------------------------
/src/skmatter/datasets/descr/nice_dataset.rst:
--------------------------------------------------------------------------------
1 | .. _nice-dataset:
2 | 
3 | NICE dataset
4 | ############
5 | 
6 | This is a toy dataset containing NICE [1, 4] (N-body Iterative Contraction of
7 | Equivariants) features for the first 500 configurations of the dataset [2, 3] with
8 | randomly displaced methane configurations.
9 | 
10 | Function Call
11 | -------------
12 | 
13 | .. function:: skmatter.datasets.load_nice_dataset
14 | 
15 | Data Set Characteristics
16 | ------------------------
17 | 
18 | :Number of Instances: 500
19 | 
20 | :Number of Features: 160
21 | 
22 | The representations were computed with the NICE package [4] using the following
23 | definition of the NICE calculator:
24 | 
25 | .. code-block:: python
26 | 
27 |     StandardSequence(
28 |         [
29 |             StandardBlock(
30 |                 ThresholdExpansioner(num_expand=150),
31 |                 CovariantsPurifierBoth(max_take=10),
32 |                 IndividualLambdaPCAsBoth(n_components=50),
33 |                 ThresholdExpansioner(num_expand=300, mode="invariants"),
34 |                 InvariantsPurifier(max_take=50),
35 |                 InvariantsPCA(n_components=30),
36 |             ),
37 |             StandardBlock(
38 |                 ThresholdExpansioner(num_expand=150),
39 |                 CovariantsPurifierBoth(max_take=10),
40 |                 IndividualLambdaPCAsBoth(n_components=50),
41 |                 ThresholdExpansioner(num_expand=300, mode="invariants"),
42 |                 InvariantsPurifier(max_take=50),
43 |                 InvariantsPCA(n_components=20),
44 |             ),
45 |             StandardBlock(
46 |                 None,
47 |                 None,
48 |                 None,
49 |                 ThresholdExpansioner(num_expand=300, mode="invariants"),
50 |                 InvariantsPurifier(max_take=50),
51 |                 InvariantsPCA(n_components=20),
52 |             ),
53 |         ],
54 |         initial_scaler=InitialScaler(mode="signal integral", individually=True),
55 |     )
56 | 
57 | 
58 | References
59 | ----------
60 | 
61 | [1] Jigyasa Nigam, Sergey Pozdnyakov, and Michele Ceriotti. "Recursive evaluation and
62 |     iterative contraction of N-body equivariant features." The Journal of Chemical
63 |     Physics 153.12 (2020): 121101.
64 | 
65 | [2] Incompleteness of Atomic Structure Representations
66 |     Sergey N. Pozdnyakov, Michael J. Willatt, Albert P. Bartók, Christoph Ortner,
67 |     Gábor Csányi, and Michele Ceriotti
68 | 
69 | [3] https://archive.materialscloud.org/record/2020.110
70 | 
71 | Reference Code
72 | --------------
73 | 
74 | [4] https://github.com/lab-cosmo/nice
75 | 
--------------------------------------------------------------------------------
/src/skmatter/datasets/descr/who_dataset.rst:
--------------------------------------------------------------------------------
1 | .. _who:
2 | 
3 | WHO dataset
4 | ###########
5 | 
6 | ``who_dataset.csv`` is a compilation of multiple publicly-available datasets
7 | through data.worldbank.org. Specifically, the following versioned datasets are used:
8 | 
9 | - NY.GDP.PCAP.CD (v2_4770383) [1]_
10 | - SE.XPD.TOTL.GD.ZS (v2_4773094) [2]_
11 | - SH.DYN.AIDS.ZS (v2_4770518) [3]_
12 | - SH.IMM.IDPT (v2_4770682) [4]_
13 | - SH.IMM.MEAS (v2_4774112) [5]_
14 | - SH.TBS.INCD (v2_4770775) [6]_
15 | - SH.XPD.CHEX.GD.ZS (v2_4771258) [7]_
16 | - SN.ITK.DEFC.ZS (v2_4771336) [8]_
17 | - SP.DYN.LE00.IN (v2_4770556) [9]_
18 | - SP.POP.TOTL (v2_4770385) [10]_
19 | 
20 | where the corresponding file names are ``API_{dataset}_DS2_excel_en_{version}.xls``.
21 | 
22 | This dataset, intended only for demonstration, contains 2020 country-year pairings and
23 | the corresponding values above.
24 | Function Call
25 | -------------
26 | 
27 | .. function:: skmatter.datasets.load_who_dataset
28 | 
29 | Data Set Characteristics
30 | ------------------------
31 | 
32 | :Number of Instances: 2020
33 | 
34 | :Number of Features: 10
35 | 
36 | References
37 | ----------
38 | 
39 | .. [1] https://data.worldbank.org/indicator/NY.GDP.PCAP.CD
40 | .. [2] https://data.worldbank.org/indicator/SE.XPD.TOTL.GD.ZS
41 | .. [3] https://data.worldbank.org/indicator/SH.DYN.AIDS.ZS
42 | .. [4] https://data.worldbank.org/indicator/SH.IMM.IDPT
43 | .. [5] https://data.worldbank.org/indicator/SH.IMM.MEAS
44 | .. [6] https://data.worldbank.org/indicator/SH.TBS.INCD
45 | .. [7] https://data.worldbank.org/indicator/SH.XPD.CHEX.GD.ZS
46 | .. [8] https://data.worldbank.org/indicator/SN.ITK.DEFC.ZS
47 | .. [9] https://data.worldbank.org/indicator/SP.DYN.LE00.IN
48 | .. 
[10] https://data.worldbank.org/indicator/SP.POP.TOTL 49 | 50 | 51 | Reference Code 52 | -------------- 53 | 54 | The following script is compiled, where the datasets have been placed in a 55 | folder named ``who_data``: 56 | 57 | .. code-block:: python 58 | 59 | import os 60 | import pandas as pd 61 | import numpy as np 62 | 63 | files = os.listdir("who_data/") 64 | indicators = [f[4 : f[4:].index("_") + 4] for f in files] 65 | indicator_codes = {} 66 | data_dict = {} 67 | entries = [] 68 | 69 | for file in files: 70 | data = pd.read_excel( 71 | "who_data/" + file, 72 | header=3, 73 | sheet_name="Data", 74 | index_col=0, 75 | ) 76 | 77 | indicator = data["Indicator Code"].values[0] 78 | indicator_codes[indicator] = data["Indicator Name"].values[0] 79 | 80 | for index in data.index: 81 | for year in range(1900, 2022): 82 | if str(year) in data.loc[index] and not np.isnan( 83 | data.loc[index].loc[str(year)] 84 | ): 85 | if (index, year) not in data_dict: 86 | data_dict[(index, year)] = np.nan * np.ones(len(indicators)) 87 | data_dict[(index, year)][indicators.index(indicator)] = data.loc[ 88 | index 89 | ].loc[str(year)] 90 | 91 | with open("who_data.csv", "w") as outf: 92 | outf.write("Country,Year," + ",".join(indicators) + "\n") 93 | for key, data in data_dict.items(): 94 | if np.count_nonzero(~np.isnan(np.array(data, dtype=float))) == len( 95 | indicators 96 | ): 97 | outf.write( 98 | "{},{},{}\n".format( 99 | key[0].replace(",", " "), 100 | key[1], 101 | ",".join([str(d) for d in data]), 102 | ) 103 | ) 104 | -------------------------------------------------------------------------------- /src/skmatter/decomposition/__init__.py: -------------------------------------------------------------------------------- 1 | r""" 2 | Often, one wants to construct new ML features from their current representation 3 | in order to compress data or visualise trends in the dataset. In the archetypal 4 | method for this dimensionality reduction, principal components analysis (PCA), 5 | features are transformed into the latent space which best preserves the 6 | variance of the original data. 7 | 8 | This module provides the Principal Covariates 9 | Regression (PCovR), as introduced by [deJong1992]_, which is a modification to PCA 10 | that incorporates target information, such that the resulting embedding could 11 | be tuned using a mixing parameter α to improve performance in regression tasks 12 | (:math:`\alpha = 0` corresponding to linear regression and :math:`\alpha = 1` 13 | corresponding to PCA). Also provided is Principal Covariates Classification (PCovC), 14 | proposed in [Jorgensen2025]_, which can similarly be used for classification problems. 15 | 16 | [Helfrecht2020]_ introduced the non-linear version of PCovR, 17 | Kernel Principal Covariates Regression (KPCovR), where the mixing parameter α 18 | now interpolates between kernel ridge regression (:math:`\alpha = 0`) and 19 | kernel principal components analysis (KPCA, :math:`\alpha = 1`). 20 | 21 | The module includes: 22 | 23 | * :ref:`PCovR-api` the standard Principal Covariates Regression. Utilises a 24 | combination between a PCA-like and an LR-like loss, and therefore attempts to find 25 | a low-dimensional projection of the feature vectors that simultaneously minimises 26 | information loss and error in predicting the target properties using only the 27 | latent space vectors :math:`\mathbf{T}`. 28 | * :ref:`PCovC-api` the standard Principal Covariates Classification, proposed in 29 | [Jorgensen2025]_. 
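30 | * :ref:`KPCovR-api` the Kernel Principal Covariates Regression.
31 |   A kernel-based variation on the
32 |   original PCovR method, proposed in [Helfrecht2020]_.
33 | 
34 | A minimal usage sketch of PCovR on random data (for illustration only; in
35 | practice the inputs are typically centered and scaled first, e.g. with
36 | :class:`skmatter.preprocessing.StandardFlexibleScaler`):
37 | 
38 | >>> import numpy as np
39 | >>> from skmatter.decomposition import PCovR
40 | >>> X = np.random.RandomState(0).normal(size=(20, 4))
41 | >>> y = X @ np.array([1.0, -1.0, 0.5, 0.0])
42 | >>> T = PCovR(mixing=0.5, n_components=2).fit_transform(X, y)
43 | >>> T.shape
44 | (20, 2)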
45 | """
46 | 
47 | from ._pcov import _BasePCov
48 | 
49 | from ._pcovr import PCovR
50 | from ._pcovc import PCovC
51 | 
52 | from ._kernel_pcovr import KernelPCovR
53 | 
54 | __all__ = [
55 |     "_BasePCov",
56 |     "PCovR",
57 |     "PCovC",
58 |     "KernelPCovR",
59 | ]
60 | 
--------------------------------------------------------------------------------
/src/skmatter/feature_selection/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | The :mod:`skmatter.feature_selection` module includes FPS and CUR selection, each
3 | with the optional PCov-flavor.
4 | """
5 | 
6 | from ._base import (
7 |     CUR,
8 |     FPS,
9 |     PCovCUR,
10 |     PCovFPS,
11 | )
12 | 
13 | __all__ = ["PCovFPS", "PCovCUR", "FPS", "CUR"]
14 | 
--------------------------------------------------------------------------------
/src/skmatter/linear_model/__init__.py:
--------------------------------------------------------------------------------
1 | """Classes for building linear models."""
2 | 
3 | from ._base import OrthogonalRegression
4 | from ._ridge import Ridge2FoldCV
5 | 
6 | __all__ = ["OrthogonalRegression", "Ridge2FoldCV"]
7 | 
--------------------------------------------------------------------------------
/src/skmatter/linear_model/_base.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from scipy.linalg import orthogonal_procrustes
3 | from sklearn.base import BaseEstimator, MultiOutputMixin, RegressorMixin
4 | from sklearn.linear_model import LinearRegression
5 | from sklearn.utils import check_array, check_X_y
6 | from sklearn.utils.validation import check_is_fitted
7 | 
8 | 
9 | class OrthogonalRegression(MultiOutputMixin, RegressorMixin, BaseEstimator):
10 |     r"""Orthogonal regression by solving the Procrustes problem
11 | 
12 |     Linear regression with the additional constraint that the weight matrix
13 |     must be an orthogonal matrix/projection. It minimizes the Procrustes
14 |     problem:
15 | 
16 |     .. math::
17 | 
18 |         \min_\Omega \|y - X\Omega\|_F \quad\mathrm{subject\ to}\quad \Omega^T\Omega=I
19 | 
20 |     Parameters
21 |     ----------
22 |     use_orthogonal_projector : bool, default=True
23 |         Controls if orthogonal projectors are used to predict y fitting on X.
24 |         If this parameter is set to False, X and y are padded with zeros to the
25 |         larger number of features of X and y. The projection method is similar
26 |         to the procedure in the computation of the GFRD in the first version of
27 |         Ref. [Goscinski2021]_. The method has been adapted to obtain a full
28 |         weight matrix.
29 | 
30 |         The projection can introduce nonanalytic behavior with respect to
31 |         changes in dimensions of X for cases where X n_features > y n_targets.
32 |         See ``examples/OrthogonalRegressionNonAnalytic_no-doc.ipynb``
33 | 
34 |     linear_estimator : object implementing fit/predict, default=None
35 |         The linear estimator is used when `use_orthogonal_projector`
36 |         is set to True, to compute the projection matrix
37 | 
38 |     Attributes
39 |     ----------
40 |     max_components_ : int
41 |         The source X and target y are padded with zeros to match in feature/target
42 |         dimension, when `use_orthogonal_projector` is set to False. This attribute
43 |         is set to the maximum of the feature and target dimension.
44 | 
45 |     coef_ : numpy.ndarray of shape (n_features,) or (n_targets, n_features) or (max_components, max_components)
46 |         Weight matrix. The shape (max_components, max_components) is used if
47 |         `use_orthogonal_projector` is set to False.
48 |     """ # NoQa: E501
49 | 
50 |     def __init__(self, use_orthogonal_projector=True, linear_estimator=None):
51 |         self.use_orthogonal_projector = use_orthogonal_projector
52 |         self.linear_estimator = linear_estimator
53 | 
54 |     def fit(self, X, y):
55 |         """
56 |         Parameters
57 |         ----------
58 |         X : numpy.ndarray of shape (n_samples, n_features)
59 |             Training data, where ``n_samples`` is the number of samples and
60 |             ``n_features`` is the number of features.
61 |         y : numpy.ndarray of shape (n_samples, n_targets)
62 |             Training data, where ``n_samples`` is the number of samples and
63 |             ``n_targets`` is the number of target properties.
64 |         """
65 |         X, y = check_X_y(
66 |             X,
67 |             y,
68 |             y_numeric=True,
69 |             ensure_min_features=1,
70 |             ensure_min_samples=1,
71 |             multi_output=True,
72 |         )
73 | 
74 |         self.n_samples_in_, self.n_features_in_ = X.shape
75 |         if self.use_orthogonal_projector:
76 |             # check estimator
77 |             linear_estimator = (
78 |                 LinearRegression()
79 |                 if self.linear_estimator is None
80 |                 else self.linear_estimator
81 |             )
82 |             # compute orthogonal projectors
83 |             linear_estimator.fit(X, y)
84 |             coef = np.reshape(linear_estimator.coef_.T, (X.shape[1], -1))
85 |             U, _, Vt = np.linalg.svd(coef, full_matrices=False)
86 | 
87 |             # compute weights by solving the Procrustes problem
88 |             self.coef_ = (
89 |                 U
90 |                 @ orthogonal_procrustes(X @ U, y.reshape(X.shape[0], -1) @ Vt.T)[0]
91 |                 @ Vt
92 |             ).T
93 |         else:
94 |             self.max_components_ = max(X.shape[1], y.shape[1])
95 |             X = np.pad(X, [(0, 0), (0, self.max_components_ - X.shape[1])])
96 |             y = np.pad(y, [(0, 0), (0, self.max_components_ - y.shape[1])])
97 |             self.coef_ = orthogonal_procrustes(X, y)[0].T
98 | 
99 |         return self
100 | 
101 |     def predict(self, X):
102 |         """
103 |         Parameters
104 |         ----------
105 |         X : numpy.ndarray of shape (n_samples, n_features)
106 |             Samples to predict, where n_samples is the number of samples and
107 |             n_features is the number of features.
108 |         """
109 |         X = check_array(X, ensure_min_features=1, ensure_min_samples=1)
110 |         check_is_fitted(self, ["coef_"])
111 | 
112 |         if not self.use_orthogonal_projector:
113 |             X = np.pad(X, [(0, 0), (0, self.max_components_ - X.shape[1])])
114 |         return X @ self.coef_.T
115 | 
--------------------------------------------------------------------------------
/src/skmatter/metrics/__init__.py:
--------------------------------------------------------------------------------
1 | """Set of metrics that can be used for an enhanced understanding of your machine
2 | learning model.
3 | 
4 | First are the easily-interpretable error measures of the relative information
5 | capacity of feature space `F` with respect to feature space `F'`. The methods
6 | return a value between 0 and 1, where 0 means that `F` and `F'` are completely
7 | distinct in terms of linearly-decodable information, and where 1 means that `F'`
8 | is contained in `F`. All methods are implemented as the root mean-square error
9 | for the regression of the feature matrix `X_F'` (or sometimes called `Y` in the
10 | doc) from `X_F` (or sometimes called `X` in the doc) for transformations with
11 | different constraints (linear, orthogonal, locally-linear). By default a custom
12 | 2-fold cross-validation :py:class:`skmatter.linear_model.Ridge2FoldCV`
13 | is used to ensure the generalization of the transformation and efficiency of the
14 | computation, since we deal with a multi-target regression problem. Methods were
15 | applied to compare different forms of featurizations through different
16 | hyperparameters and induced metrics and kernels [Goscinski2021]_.
17 | 
18 | These reconstruction measures are available:
19 | 
20 | * :ref:`GRE-api` (GRE) computes the amount of linearly-decodable information
21 |   recovered through a global linear reconstruction.
22 | * :ref:`GRD-api` (GRD) computes the amount of distortion contained in a global
23 |   linear reconstruction.
24 | * :ref:`LRE-api` (LRE) computes the amount of decodable information recovered
25 |   through a local linear reconstruction for the k-nearest neighborhood of each
26 |   sample.
27 | 
28 | Next, we offer a set of prediction rigidity metrics, which can be used to
29 | quantify the robustness of the local or component-wise predictions that the
30 | machine learning model has been trained to make, based on the training dataset
31 | composition.
32 | 
33 | These prediction rigidities are available:
34 | 
35 | * :ref:`LPR-api` (LPR) computes the local prediction rigidity of a linear or
36 |   kernel model.
37 | * :ref:`CPR-api` (CPR) computes the component-wise prediction rigidity of a
38 |   linear or kernel model.
39 | 
40 | There are also two distance metrics compatible with the periodic boundary conditions
41 | available.
42 | 
43 | .. note::
44 |     Currently only rectangular cells are supported.
45 |     Cell format: [side_length_1, ..., side_length_n]
46 | 
47 | * :ref:`pairwise-euclidian-api` computes the euclidean distance between two sets
48 |   of points. It is compatible with the periodic boundary conditions.
49 |   If the cell length is not provided, it will fall back to the ``scikit-learn`` version
50 |   of the euclidean distance :func:`sklearn.metrics.pairwise.euclidean_distances`.
51 | * :ref:`pairwise-mahalanobis-api` computes the Mahalanobis distance between two sets
52 |   of points. It is compatible with the periodic boundary conditions.
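53 | 
54 | A minimal sketch of a reconstruction measure (here the GRE; ``X_prime`` is
55 | a column subset of ``X``, so it is linearly decodable from ``X`` and the
56 | error is small -- see :ref:`GRE-api` for the full signature):
57 | 
58 | >>> import numpy as np
59 | >>> from skmatter.metrics import global_reconstruction_error
60 | >>> X = np.random.RandomState(0).normal(size=(100, 5))
61 | >>> X_prime = X[:, :3]  # candidate feature space F'
62 | >>> gre = global_reconstruction_error(X, X_prime)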
63 | """
64 | 
65 | from ._reconstruction_measures import (
66 |     check_global_reconstruction_measures_input,
67 |     check_local_reconstruction_measures_input,
68 |     global_reconstruction_distortion,
69 |     global_reconstruction_error,
70 |     local_reconstruction_error,
71 |     pointwise_global_reconstruction_distortion,
72 |     pointwise_global_reconstruction_error,
73 |     pointwise_local_reconstruction_error,
74 | )
75 | 
76 | from ._prediction_rigidities import (
77 |     local_prediction_rigidity,
78 |     componentwise_prediction_rigidity,
79 | )
80 | 
81 | from ._pairwise import (
82 |     periodic_pairwise_euclidean_distances,
83 |     pairwise_mahalanobis_distances,
84 | )
85 | 
86 | __all__ = [
87 |     "pointwise_global_reconstruction_error",
88 |     "global_reconstruction_error",
89 |     "pointwise_global_reconstruction_distortion",
90 |     "global_reconstruction_distortion",
91 |     "pointwise_local_reconstruction_error",
92 |     "local_reconstruction_error",
93 |     "check_global_reconstruction_measures_input",
94 |     "check_local_reconstruction_measures_input",
95 |     "local_prediction_rigidity",
96 |     "componentwise_prediction_rigidity",
97 |     "periodic_pairwise_euclidean_distances",
98 |     "pairwise_mahalanobis_distances",
99 | ]
100 | 
101 | DIST_METRICS = {
102 |     "periodic_euclidean": periodic_pairwise_euclidean_distances,
103 | }
104 | 
--------------------------------------------------------------------------------
/src/skmatter/metrics/_pairwise.py:
--------------------------------------------------------------------------------
1 | from typing import Union
2 | 
3 | import numpy as np
4 | from sklearn.metrics.pairwise import _euclidean_distances, check_pairwise_arrays
5 | 
6 | 
7 | def periodic_pairwise_euclidean_distances(
8 |     X,
9 |     Y=None,
10 |     *,
11 |     squared=False,
12 |     cell_length=None,
13 | ):
14 |     r"""
15 |     Compute the pairwise distance matrix between each pair from a vector array X and Y.
16 | 
17 |     .. math::
18 |         d_{i, j} = \sqrt{\sum_{k=1}^n (x_{i, k} - y_{j, k})^2}
19 | 
20 |     For efficiency reasons, the euclidean distance between a pair of row
21 |     vector x and y is computed as::
22 | 
23 |         dist(x, y) = sqrt(dot(x, x) - 2 * dot(x, y) + dot(y, y))
24 | 
25 |     This formulation has two advantages over other ways of computing distances. First,
26 |     it is computationally efficient when dealing with sparse data. Second, if one
27 |     argument varies but the other remains unchanged, then `dot(x, x)` and/or `dot(y, y)`
28 |     can be pre-computed.
29 | 
30 |     However, this is not the most precise way of doing this computation, because this
31 |     equation potentially suffers from "catastrophic cancellation". Also, the distance
32 |     matrix returned by this function may not be exactly symmetric as required by, e.g.,
33 |     ``scipy.spatial.distance`` functions.
34 | 
35 |     Read more in the :ref:`User Guide `.
36 | 
37 |     Parameters
38 |     ----------
39 |     X : {array-like, sparse matrix} of shape (n_samples_X, n_components)
40 |         An array where each row is a sample and each column is a component.
41 |     Y : {array-like, sparse matrix} of shape (n_samples_Y, n_components), \
42 |             default=None
43 |         An array where each row is a sample and each column is a component.
44 |         If `None`, method uses `Y=X`.
45 |     squared : bool, default=False
46 |         If ``True``, return the squared distances.
47 |     cell_length : array-like of shape (n_components,), default=None
48 |         The side length of rectangular cell used for periodic boundary conditions.
49 |         `None` for non-periodic boundary conditions.
50 | 
51 |         .. note::
52 |             Only side lengths of rectangular cells are supported.
53 |             Cell format: `[side_length_1, ..., side_length_n]`
54 | 
55 |     Returns
56 |     -------
57 |     distances : ndarray of shape (n_samples_X, n_samples_Y)
58 |         Returns the distances between the row vectors of `X`
59 |         and the row vectors of `Y`.
60 | 
61 |     Examples
62 |     --------
63 |     >>> import numpy as np
64 |     >>> from skmatter.metrics import periodic_pairwise_euclidean_distances
65 |     >>> X = np.array([[0, 1], [1, 1]])
66 |     >>> origin = np.array([[0, 0]])
67 |     >>> # distance between rows of X
68 |     >>> periodic_pairwise_euclidean_distances(X, X)
69 |     array([[0., 1.],
70 |            [1., 0.]])
71 |     >>> # get distance to origin
72 |     >>> periodic_pairwise_euclidean_distances(X, origin, cell_length=[0.5, 0.7])
73 |     array([[0.3],
74 |            [0.3]])
75 |     """
76 |     _check_dimension(X, cell_length)
77 |     X, Y = check_pairwise_arrays(X, Y)
78 | 
79 |     if cell_length is None:
80 |         return _euclidean_distances(X, Y, squared=squared)
81 |     else:
82 |         return _periodic_euclidean_distances(X, Y, squared=squared, cell=cell_length)
83 | 
84 | 
85 | def _periodic_euclidean_distances(X, Y=None, *, squared=False, cell=None):
86 |     X, Y = np.array(X).astype(float), np.array(Y).astype(float)
87 |     XY = np.concatenate([x - Y for x in X])
88 |     XY -= np.round(XY / cell) * cell
89 |     distance = np.linalg.norm(XY, axis=1).reshape(X.shape[0], Y.shape[0])
90 |     if squared:
91 |         distance **= 2
92 |     return distance
93 | 
94 | 
95 | def pairwise_mahalanobis_distances(
96 |     X: np.ndarray,
97 |     Y: np.ndarray,
98 |     cov_inv: np.ndarray,
99 |     cell_length: Union[np.ndarray, None] = None,
100 |     squared: bool = False,
101 | ):
102 |     r"""
103 |     Calculate the pairwise Mahalanobis distance between two arrays.
104 | 
105 |     This metric is used for calculating the distances between observations from Gaussian
106 |     distributions. It is defined as:
107 | 
108 |     .. math::
109 |         d_{\Sigma}(x, y)^2 = (x - y)^T \Sigma^{-1} (x - y)
110 | 
111 |     where :math:`\Sigma` is the covariance matrix, :math:`x` and :math:`y` are
112 |     observations from the same distribution.
113 | 
114 |     Parameters
115 |     ----------
116 |     X : numpy.ndarray of shape (n_samples_X, n_components)
117 |         An array where each row is a sample and each column is a component.
118 |     Y : np.ndarray of shape (n_samples_Y, n_components)
119 |         An array where each row is a sample and each column is a component.
120 |     cov_inv : np.ndarray
121 |         The inverse covariance matrix of shape (n_components, n_components).
122 |     cell_length : np.ndarray, optional, default=None
123 |         The cell size for periodic boundary conditions.
124 |         None for non-periodic boundary conditions.
125 | 
126 |         .. note::
127 |             Only rectangular cells are supported.
128 |             Cell format: `[side_length_1, ..., side_length_n]`
129 | 
130 |     squared : bool, default=False
131 |         Whether to return the squared distance.
132 | 
133 |     Returns
134 |     -------
135 |     np.ndarray
136 |         The pairwise Mahalanobis distance between the two input arrays,
137 |         of shape `(cov_inv.shape[0], x.shape[0], y.shape[0])`.
138 | 
139 |     Examples
140 |     --------
141 |     >>> import numpy as np
142 |     >>> from skmatter.metrics import pairwise_mahalanobis_distances
143 |     >>> iv = np.array([[1, 0.5, 0.5], [0.5, 1, 0.5], [0.5, 0.5, 1]])
144 |     >>> X = np.array([[1, 0, 0], [0, 2, 0], [2, 0, 0]])
145 |     >>> Y = np.array([[0, 1, 0]])
146 |     >>> pairwise_mahalanobis_distances(X, Y, iv)
147 |     array([[[1.        ],
148 |             [1.        ],
149 |             [1.73205081]]])
150 |     """
151 | 
152 |     def _mahalanobis(
153 |         cell: np.ndarray, X: np.ndarray, Y: np.ndarray, cov_inv: np.ndarray
154 |     ):
155 |         XY = np.concatenate([x - Y for x in X])
156 |         if cell is not None:
157 |             XY -= np.round(XY / cell) * cell
158 | 
159 |         return np.sum(XY * np.transpose(cov_inv @ XY.T, (0, 2, 1)), axis=-1).reshape(
160 |             (cov_inv.shape[0], X.shape[0], Y.shape[0])
161 |         )
162 | 
163 |     _check_dimension(X, cell_length)
164 |     X, Y = check_pairwise_arrays(X, Y)
165 |     if len(cov_inv.shape) == 2:
166 |         cov_inv = cov_inv[np.newaxis, :, :]
167 |     dists = _mahalanobis(cell_length, X, Y, cov_inv)
168 |     if not squared:
169 |         dists **= 0.5
170 |     return dists
171 | 
172 | 
173 | def _check_dimension(X, cell_length):
174 |     if (cell_length is not None) and (X.shape[1] != len(cell_length)):
175 |         raise ValueError("Cell dimension does not match the data dimension.")
176 | 
--------------------------------------------------------------------------------
/src/skmatter/model_selection/__init__.py:
--------------------------------------------------------------------------------
1 | """Functions for model selection."""
2 | 
3 | from ._split import train_test_split
4 | 
5 | __all__ = ["train_test_split"]
6 | 
--------------------------------------------------------------------------------
/src/skmatter/model_selection/_split.py:
--------------------------------------------------------------------------------
1 | import sklearn.model_selection
2 | from sklearn.utils import indexable
3 | from sklearn.utils.validation import _num_samples
4 | 
5 | 
6 | def train_test_split(*arrays, **options):
7 |     """Extended version of the sklearn train test split supporting overlapping train and
8 |     test sets.
9 | 
10 |     See `sklearn.model_selection.train_test_split (external link)
11 |     `_.
12 | 
13 |     Parameters
14 |     ----------
15 |     *arrays : sequence of indexables with same length / shape[0]
16 |         Allowed inputs are lists, numpy arrays, scipy-sparse matrices or pandas
17 |         dataframes.
18 |     test_size : float or int, default=None
19 |         If float, should be between 0.0 and 1.0 and represent the proportion of the
20 |         dataset to include in the test split. If int, represents the absolute number of
21 |         test samples. If :obj:`None`, the value is set to the complement of the train
22 |         size. If ``train_size`` is also None, it will be set to 0.25.
23 |     train_size : float or int, default=None
24 |         If float, should be between 0.0 and 1.0 and represent the proportion of the
25 |         dataset to include in the train split. If int, represents the absolute number of
26 |         train samples. If :obj:`None`, the value is automatically set to the complement
27 |         of the test size.
28 |     random_state : int or :class:`numpy.random.RandomState` instance, default=None
29 |         Controls the shuffling applied to the data before applying the split. Pass an
30 |         int for reproducible output across multiple function calls. See `random state
31 |         glossary from sklearn (external link)
32 |         `_
33 |     shuffle : bool, default=True
34 |         Whether or not to shuffle the data before splitting. If shuffle=False then
35 |         stratify must be :obj:`None`.
36 |     stratify : array-like, default=None
37 |         If not :obj:`None`, data is split in a stratified fashion, using this as the
38 |         class labels.
39 |     train_test_overlap : bool, default=False
40 |         If :obj:`True` and ``train_size`` and ``test_size`` are both not :obj:`None`,
41 |         the train and test sets may overlap.
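42 | 
43 |     Returns
44 |     -------
45 |     splitting : list, length=2 * len(arrays)
46 |         List containing train-test split of inputs.
47 | 
48 |     Examples
49 |     --------
50 |     A minimal sketch of an overlapping split (``train_test_overlap`` takes
51 |     effect only when both ``train_size`` and ``test_size`` are given):
52 | 
53 |     >>> import numpy as np
54 |     >>> from skmatter.model_selection import train_test_split
55 |     >>> X = np.arange(10).reshape(5, 2)
56 |     >>> X_train, X_test = train_test_split(
57 |     ...     X, train_size=0.8, test_size=0.4, train_test_overlap=True
58 |     ... )
59 |     >>> len(X_train), len(X_test)
60 |     (4, 2)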
61 |     """ # NoQa: E501
62 |     train_test_overlap = options.pop("train_test_overlap", False)
63 |     test_size = options.get("test_size", None)
64 |     train_size = options.get("train_size", None)
65 | 
66 |     if train_test_overlap and train_size is not None and test_size is not None:
67 |         # checks from sklearn
68 |         arrays = indexable(*arrays)
69 |         n_samples = _num_samples(arrays[0])
70 | 
71 |         if test_size == 1.0 or test_size == n_samples:
72 |             test_sets = arrays
73 |         else:
74 |             options["train_size"] = None
75 |             test_sets = sklearn.model_selection.train_test_split(*arrays, **options)[
76 |                 1::2
77 |             ]
78 |             options["train_size"] = train_size
79 | 
80 |         if train_size == 1.0 or train_size == n_samples:
81 |             train_sets = arrays
82 |         else:
83 |             options["test_size"] = None
84 |             train_sets = sklearn.model_selection.train_test_split(*arrays, **options)[
85 |                 ::2
86 |             ]
87 |             options["test_size"] = test_size
88 | 
89 |         train_test_sets = []
90 |         for i in range(len(train_sets)):
91 |             train_test_sets += [train_sets[i], test_sets[i]]
92 |         return train_test_sets
93 |     else:
94 |         return sklearn.model_selection.train_test_split(*arrays, **options)
95 | 
--------------------------------------------------------------------------------
/src/skmatter/neighbors/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | The module implements the sparse kernel density estimator.
3 | 
4 | A large dataset can be generated during molecular dynamics sampling. The
5 | distribution of the sampled data reflects the (free) energetic stability of molecular
6 | patterns. The KDE model can be used to characterize the probability distribution, and
7 | thus to identify the stable patterns in the system. However, the computational
8 | cost of KDE is `O(N^2)` where `N` is the number of sampled points, which is very
9 | expensive. Here we offer a sparse implementation of the KDE model with a
10 | `O(MN)` computational cost, where `M` is the number of grid points generated from the
11 | sampled data.
12 | 
13 | The following class is available:
14 | 
15 | * :ref:`sparse-kde-api` computes the kernel density estimator based on a set of grid
16 |   points generated from the sampled data.
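17 | 
18 | A schematic usage sketch (the constructor arguments are abbreviated here; see
19 | :ref:`sparse-kde-api` for the exact signature -- the grid points are typically
20 | selected from the samples, e.g. by FPS):
21 | 
22 | >>> import numpy as np
23 | >>> from skmatter.neighbors import SparseKDE
24 | >>> from skmatter.sample_selection import FPS
25 | >>> samples = np.random.RandomState(0).normal(size=(1000, 2))
26 | >>> grid = samples[FPS(n_to_select=50, initialize=0).fit(samples).selected_idx_]
27 | >>> kde = SparseKDE(samples, None).fit(grid)  # assumed: (descriptors, weights)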
28 | 
29 | """
30 | 
31 | from ._sparsekde import SparseKDE
32 | 
33 | __all__ = ["SparseKDE"]
34 | 
--------------------------------------------------------------------------------
/src/skmatter/preprocessing/__init__.py:
--------------------------------------------------------------------------------
1 | """Scaling, centering and normalization methods."""
2 | 
3 | from ._data import (
4 |     KernelNormalizer,
5 |     SparseKernelCenterer,
6 |     StandardFlexibleScaler,
7 | )
8 | 
9 | __all__ = ["StandardFlexibleScaler", "KernelNormalizer", "SparseKernelCenterer"]
10 | 
--------------------------------------------------------------------------------
/src/skmatter/sample_selection/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | The :mod:`skmatter.sample_selection` module includes FPS and CUR selection, each
3 | with the optional PCov-flavor.
4 | """
5 | 
6 | from ._base import (
7 |     CUR,
8 |     FPS,
9 |     DirectionalConvexHull,
10 |     PCovCUR,
11 |     PCovFPS,
12 | )
13 | from ._voronoi_fps import VoronoiFPS
14 | 
15 | __all__ = ["PCovFPS", "PCovCUR", "FPS", "CUR", "DirectionalConvexHull", "VoronoiFPS"]
16 | 
--------------------------------------------------------------------------------
/src/skmatter/utils/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | The :mod:`skmatter.utils` module includes functions which are
3 | used by multiple packages
4 | """
5 | 
6 | from ._orthogonalizers import (
7 |     X_orthogonalizer,
8 |     Y_feature_orthogonalizer,
9 |     Y_sample_orthogonalizer,
10 | )
11 | 
12 | from ._pcovc_utils import check_cl_fit
13 | 
14 | from ._pcovr_utils import (
15 |     check_krr_fit,
16 |     check_lr_fit,
17 |     pcovr_covariance,
18 |     pcovr_kernel,
19 | )
20 | 
21 | from ._progress_bar import (
22 |     get_progress_bar,
23 |     no_progress_bar,
24 | )
25 | 
26 | from ._sparsekde import (
27 |     effdim,
28 |     oas,
29 | )
30 | 
31 | __all__ = [
32 |     "get_progress_bar",
33 |     "no_progress_bar",
34 |     "pcovr_covariance",
35 |     "pcovr_kernel",
36 |     "check_krr_fit",
37 |     "check_lr_fit",
38 |     "X_orthogonalizer",
39 |     "Y_sample_orthogonalizer",
40 |     "Y_feature_orthogonalizer",
41 |     "effdim",
42 |     "oas",
43 | ]
44 | 
--------------------------------------------------------------------------------
/src/skmatter/utils/_orthogonalizers.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """Necessary orthogonalizers for the CUR decomposition subselection method.
3 | 
4 | Authors: Rose K. Cersonsky
5 |          Michele Ceriotti
6 | """
7 | 
8 | import warnings
9 | 
10 | import numpy as np
11 | 
12 | 
13 | def X_orthogonalizer(x1, c=None, x2=None, tol=1e-12, copy=False):
14 |     """Orthogonalizes a feature matrix by the given columns.
15 | 
16 |     Can be used to orthogonalize by samples by calling `X = X_orthogonalizer(X.T,
17 |     row_index).T`. After orthogonalization, each column of X will contain only what is
18 |     orthogonal to X[:, c] or x2.
19 | 20 | Parameters 21 | ---------- 22 | x1: numpy.ndarray of shape (n x m) 23 | feature matrix to orthogonalize 24 | c: int, less than m, default=None 25 | index of the column to orthogonalize by 26 | x2: numpy.ndarray of shape (n x a), default=x1[:, c] 27 | a separate set of columns to orthogonalize with respect to 28 | Note: the orthogonalizer will work column-by-column in column-index order 29 | """ 30 | if x2 is None and c is not None: 31 | cols = x1[:, [c]] 32 | elif x2.shape[0] == x1.shape[0]: 33 | cols = np.reshape(x2, (x1.shape[0], -1)) 34 | else: 35 | raise ValueError( 36 | "You can only orthogonalize a matrix using a vector with the same number " 37 | f"of rows. Matrix X has {x1.shape[0]} rows, whereas the orthogonalizing " 38 | f"matrix has {x2.shape[0]} rows." 39 | ) 40 | 41 | if copy: 42 | xnew = x1.copy() 43 | else: 44 | xnew = x1 45 | 46 | for i in range(cols.shape[-1]): 47 | col = cols[:, [i]] 48 | 49 | if np.linalg.norm(col) < tol: 50 | warnings.warn("Column vector contains only zeros.", stacklevel=1) 51 | else: 52 | col = np.divide(col, np.linalg.norm(col, axis=0)) 53 | 54 | xnew -= (col @ (col.T @ xnew)).astype(xnew.dtype) 55 | 56 | return xnew 57 | 58 | 59 | def Y_feature_orthogonalizer(y, X, tol=1e-12, copy=True): 60 | r"""Orthogonalizes a property matrix given the selected features in 61 | :math:`\mathbf{X}`. 62 | 63 | .. math:: 64 | \mathbf{Y} \leftarrow \mathbf{Y} - 65 | \mathbf{X} \left(\mathbf{X}^T\mathbf{X}\right)^{-1}\mathbf{X}^T \mathbf{Y} 66 | 67 | Parameters 68 | ---------- 69 | y : numpy.ndarray of shape (n_samples x n_properties) 70 | property matrix 71 | X : numpy.ndarray of shape (n_samples x n_features) 72 | feature matrix 73 | tol: float 74 | cutoff for small eigenvalues to send to np.linalg.pinv 75 | copy: bool 76 | whether to return a copy of y or edit in-place, default=True 77 | """ 78 | v = np.linalg.pinv(np.matmul(X.T, X), rcond=tol) 79 | v = np.matmul(X, v) 80 | v = np.matmul(v, X.T) 81 | 82 | if copy: 83 | return y.copy() - np.matmul(v, y) 84 | else: 85 | y -= np.matmul(v, y) 86 | return y 87 | 88 | 89 | def Y_sample_orthogonalizer(y, X, y_ref, X_ref, tol=1e-12, copy=True): 90 | r"""Orthogonalizes a matrix of targets :math:`{\mathbf{Y}}` given a reference 91 | feature matrix :math:`{\mathbf{X}_r}` and reference target matrix 92 | :math:`{\mathbf{Y}_r}`: 93 | 94 | .. 
math::
95 |         \mathbf{Y} \leftarrow \mathbf{Y} -
96 |         \mathbf{X} \left(\mathbf{X}_{\mathbf{r}}^T
97 |         \mathbf{X}_{\mathbf{r}}\right)^{-1}\mathbf{X}_{\mathbf{r}}^T
98 |         \mathbf{Y}_{\mathbf{r}}
99 | 
100 |     Parameters
101 |     ----------
102 |     y : numpy.ndarray of shape (n_samples x n_properties)
103 |         property matrix
104 |     X : numpy.ndarray of shape (n_samples x n_features)
105 |         feature matrix
106 |     y_ref : numpy.ndarray of shape (n_ref x n_properties)
107 |         reference property matrix
108 |     X_ref : numpy.ndarray of shape (n_ref x n_features)
109 |         reference feature matrix
110 |     tol: float
111 |         cutoff for small eigenvalues to send to np.linalg.pinv
112 |     copy: bool
113 |         whether to return a copy of y or edit in-place, default=True
114 |     """
115 |     y_frag = (X @ (np.linalg.lstsq(X_ref, y_ref, rcond=tol)[0])).reshape(y.shape)
116 | 
117 |     if copy:
118 |         return y.copy() - y_frag
119 |     else:
120 |         y -= y_frag
121 |         return y
122 | 
--------------------------------------------------------------------------------
/src/skmatter/utils/_pcovc_utils.py:
--------------------------------------------------------------------------------
1 | from copy import deepcopy
2 | 
3 | import numpy as np
4 | from sklearn import clone
5 | from sklearn.exceptions import NotFittedError
6 | from sklearn.utils.validation import check_is_fitted, validate_data
7 | 
8 | 
9 | def check_cl_fit(classifier, X, y):
10 |     """
11 |     Checks that a (linear) classifier is fitted, and if not,
12 |     fits it with the provided data.
13 | 
14 |     Parameters
15 |     ----------
16 |     classifier : object
17 |         sklearn-style classifier
18 |     X : array-like
19 |         Feature matrix with which to fit the classifier if it is not already fitted
20 |     y : array-like
21 |         Target values with which to fit the classifier if it is not already fitted
22 | 
23 |     Returns
24 |     -------
25 |     fitted_classifier : object
26 |         The fitted classifier. If input classifier was already fitted and compatible
27 |         with the data, returns a deep copy. Otherwise returns a newly fitted classifier.
28 | 
29 |     Raises
30 |     ------
31 |     ValueError
32 |         If the fitted classifier's coefficients have a shape incompatible with the
33 |         number of features in X or the number of classes in y.
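34 | 
35 |     Examples
36 |     --------
37 |     A minimal sketch with an unfitted classifier (it is cloned and fitted on
38 |     the provided data):
39 | 
40 |     >>> import numpy as np
41 |     >>> from sklearn.linear_model import LogisticRegression
42 |     >>> from skmatter.utils import check_cl_fit
43 |     >>> X = np.array([[0.0], [1.0], [2.0], [3.0]])
44 |     >>> y = np.array([0, 0, 1, 1])
45 |     >>> check_cl_fit(LogisticRegression(), X, y).coef_.shape
46 |     (1, 1)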
47 |     """
48 |     try:
49 |         check_is_fitted(classifier)
50 |         fitted_classifier = deepcopy(classifier)
51 | 
52 |         # Check compatibility with X
53 |         validate_data(fitted_classifier, X, y, reset=False, multi_output=True)
54 | 
55 |         # Check compatibility with the number of features in X and the number of
56 |         # classes in y
57 |         n_classes = len(np.unique(y))
58 | 
59 |         if n_classes == 2:
60 |             if fitted_classifier.coef_.shape[0] != 1:
61 |                 raise ValueError(
62 |                     "For binary classification, expected classifier coefficients "
63 |                     "to have shape (1, "
64 |                     f"{X.shape[1]}) but got shape "
65 |                     f"{fitted_classifier.coef_.shape}"
66 |                 )
67 |         else:
68 |             if fitted_classifier.coef_.shape[0] != n_classes:
69 |                 raise ValueError(
70 |                     "For multiclass classification, expected classifier coefficients "
71 |                     "to have shape "
72 |                     f"({n_classes}, {X.shape[1]}) but got shape "
73 |                     f"{fitted_classifier.coef_.shape}"
74 |                 )
75 | 
76 |     except NotFittedError:
77 |         fitted_classifier = clone(classifier)
78 |         fitted_classifier.fit(X, y)
79 | 
80 |     return fitted_classifier
81 | 
--------------------------------------------------------------------------------
/src/skmatter/utils/_progress_bar.py:
--------------------------------------------------------------------------------
1 | def get_progress_bar():
2 |     """Returns the appropriate version of ``tqdm``, as determined by ``tqdm.auto``.
3 | 
4 |     If ``tqdm`` is not installed, an :py:class:`ImportError` is raised.
5 |     """
6 |     try:
7 |         from tqdm.auto import tqdm
8 | 
9 |         return tqdm
10 |     except ImportError:
11 |         raise ImportError(
12 |             "tqdm must be installed to use a progress bar. Either install tqdm or "
13 |             "re-run with progress_bar = False"
14 |         )
15 | 
16 | 
17 | def no_progress_bar(x):
18 |     """Identity function, same as ``lambda x: x``. It returns ``x``."""
19 |     return x
20 | 
--------------------------------------------------------------------------------
/src/skmatter/utils/_sparsekde.py:
--------------------------------------------------------------------------------
1 | """The file holds utility functions and classes for the sparse KDE."""
2 | 
3 | import numpy as np
4 | 
5 | 
6 | def effdim(cov):
7 |     """
8 |     Calculate the effective dimension of a covariance matrix based on Shannon entropy.
9 | 
10 |     Parameters
11 |     ----------
12 |     cov : numpy.ndarray
13 |         The covariance matrix.
14 | 
15 |     Returns
16 |     -------
17 |     float
18 |         The effective dimension of the covariance matrix.
19 | 
20 |     Examples
21 |     --------
22 |     >>> import numpy as np
23 |     >>> from skmatter.utils import effdim
24 |     >>> cov = np.array([[25, 15, -5], [15, 18, 0], [-5, 0, 11]], dtype=np.float64)
25 |     >>> print(round(effdim(cov), 3))
26 |     2.214
27 | 
28 |     References
29 |     ----------
30 |     https://ieeexplore.ieee.org/document/7098875
31 |     """
32 |     eigval = np.linalg.eigvals(cov)
33 |     if (lowest_eigval := np.min(eigval)) <= -np.max(cov.shape) * np.finfo(
34 |         cov.dtype
35 |     ).eps:
36 |         raise np.linalg.LinAlgError(
37 |             "Matrix is not positive definite. "
38 |             f"Lowest eigenvalue {lowest_eigval} is "
39 |             "below the numerical threshold."
40 | ) 41 | eigval[eigval < 0.0] = 0.0 42 | eigval /= sum(eigval) 43 | eigval *= np.log(eigval) 44 | 45 | return np.exp(-sum(eigval)) 46 | 47 | 48 | def oas(cov: np.ndarray, n: float, D: int) -> np.ndarray: 49 | """ 50 | Oracle approximating shrinkage (OAS) estimator 51 | 52 | Parameters 53 | ---------- 54 | cov : numpy.ndarray 55 | A covariance matrix 56 | n : float 57 | The local population 58 | D : int 59 | Dimension 60 | 61 | Examples 62 | -------- 63 | >>> import numpy as np 64 | >>> from skmatter.utils import oas 65 | >>> cov = np.array([[0.5, 1.0], [0.7, 0.4]]) 66 | >>> oas(cov, 10, 2) 67 | array([[0.48903924, 0.78078484], 68 | [0.54654939, 0.41096076]]) 69 | 70 | Returns 71 | ------- 72 | np.ndarray 73 | Covariance matrix 74 | """ 75 | tr = np.trace(cov) 76 | tr2 = tr**2 77 | tr_cov2 = np.trace(cov**2) 78 | phi = ((1 - 2 / D) * tr_cov2 + tr2) / ((n + 1 - 2 / D) * tr_cov2 - tr2 / D) 79 | 80 | return (1 - phi) * cov + phi * np.eye(D) * tr / D 81 | -------------------------------------------------------------------------------- /tests/.gitignore: -------------------------------------------------------------------------------- 1 | .coverage 2 | coverage.xml 3 | -------------------------------------------------------------------------------- /tests/test_check_estimators.py: -------------------------------------------------------------------------------- 1 | from sklearn.utils.estimator_checks import parametrize_with_checks 2 | 3 | from skmatter.decomposition import KernelPCovR, PCovC, PCovR 4 | from skmatter.feature_selection import CUR as fCUR 5 | from skmatter.feature_selection import FPS as fFPS 6 | from skmatter.feature_selection import PCovCUR as fPCovCUR 7 | from skmatter.feature_selection import PCovFPS as fPCovFPS 8 | from skmatter.linear_model import Ridge2FoldCV # OrthogonalRegression, 9 | from skmatter.preprocessing import KernelNormalizer, StandardFlexibleScaler 10 | 11 | 12 | @parametrize_with_checks( 13 | [ 14 | KernelPCovR(mixing=0.5), 15 | PCovR(mixing=0.5), 16 | PCovC(mixing=0.5), 17 | fCUR(), 18 | fFPS(), 19 | fPCovCUR(), 20 | fPCovFPS(), 21 | Ridge2FoldCV(), 22 | KernelNormalizer(), 23 | StandardFlexibleScaler(), 24 | ] 25 | ) 26 | def test_sklearn_compatible_estimator(estimator, check): 27 | """Test of the estimators are compatible with sklearn.""" 28 | check(estimator) 29 | -------------------------------------------------------------------------------- /tests/test_clustering.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import numpy as np 4 | 5 | from skmatter.clustering import QuickShift 6 | 7 | 8 | class QuickShiftTests(unittest.TestCase): 9 | @classmethod 10 | def setUpClass(cls) -> None: 11 | cls.points = np.array( 12 | [ 13 | [-1.72779275, -1.32763554], 14 | [-4.44991964, -2.13474901], 15 | [0.54817734, -2.43319467], 16 | [3.19881307, -0.49547222], 17 | [-1.1335991, 2.33478428], 18 | [0.55437388, 0.18745963], 19 | ] 20 | ) 21 | cls.cuts = np.array( 22 | [6.99485011, 8.80292681, 7.68486852, 9.5115009, 8.07736919, 6.22057056] 23 | ) 24 | cls.weights = np.array( 25 | [ 26 | -3.94008092, 27 | -12.68095664, 28 | -7.07512499, 29 | -9.03064023, 30 | -8.26529849, 31 | -2.61132267, 32 | ] 33 | ) 34 | cls.qs_labels_ = np.array([0, 0, 0, 5, 5, 5]) 35 | cls.qs_cluster_centers_idx_ = np.array([0, 5]) 36 | cls.gabriel_labels_ = np.array([5, 5, 5, 5, 5, 5]) 37 | cls.gabriel_cluster_centers_idx_ = np.array([5]) 38 | cls.cell = [3, 3] 39 | cls.gabriel_shell = 2 40 | 41 | def test_fit_qs(self): 42 | model = 
QuickShift(dist_cutoff_sq=self.cuts) 43 | model.fit(self.points, samples_weight=self.weights) 44 | self.assertTrue(np.all(model.labels_ == self.qs_labels_)) 45 | self.assertTrue( 46 | np.all(model.cluster_centers_idx_ == self.qs_cluster_centers_idx_) 47 | ) 48 | 49 | def test_fit_garbriel(self): 50 | model = QuickShift(gabriel_shell=self.gabriel_shell) 51 | model.fit(self.points, samples_weight=self.weights) 52 | self.assertTrue(np.all(model.labels_ == self.gabriel_labels_)) 53 | self.assertTrue( 54 | np.all(model.cluster_centers_idx_ == self.gabriel_cluster_centers_idx_) 55 | ) 56 | 57 | def test_dimension_check(self): 58 | model = QuickShift(self.cuts, metric_params={"cell_length": self.cell}) 59 | self.assertRaises(ValueError, model.fit, np.array([[2]])) 60 | -------------------------------------------------------------------------------- /tests/test_datasets.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import numpy as np 4 | 5 | from skmatter.datasets import ( 6 | load_csd_1000r, 7 | load_degenerate_CH4_manifold, 8 | load_hbond_dataset, 9 | load_nice_dataset, 10 | load_roy_dataset, 11 | load_who_dataset, 12 | ) 13 | 14 | 15 | class NICEDatasetTests(unittest.TestCase): 16 | @classmethod 17 | def setUpClass(cls): 18 | cls.nice_data = load_nice_dataset() 19 | 20 | def test_load_nice_data(self): 21 | # test if representations and properties have commensurate shape 22 | self.assertTrue( 23 | self.nice_data.data.X.shape[0] == self.nice_data.data.y.shape[0] 24 | ) 25 | self.assertTrue(self.nice_data.data.X.shape[0] == 500) 26 | self.assertTrue(self.nice_data.data.X.shape[1] == 160) 27 | self.assertTrue(len(self.nice_data.data.X.shape) == 2) 28 | 29 | def test_load_nice_data_descr(self): 30 | self.nice_data.DESCR 31 | 32 | 33 | class DegenerateCH4Tests(unittest.TestCase): 34 | @classmethod 35 | def setUpClass(cls): 36 | cls.degenerate_CH4_manifold = load_degenerate_CH4_manifold() 37 | 38 | def test_load_degenerate_CH4_manifold_power_spectrum_shape(self): 39 | # test if representations have correct shape 40 | self.assertTrue( 41 | self.degenerate_CH4_manifold.data.SOAP_power_spectrum.shape == (162, 12) 42 | ) 43 | 44 | def test_load_degenerate_CH4_manifold_bispectrum_shape(self): 45 | self.assertTrue( 46 | self.degenerate_CH4_manifold.data.SOAP_bispectrum.shape == (162, 12) 47 | ) 48 | 49 | def test_load_degenerate_CH4_manifold_access_descr(self): 50 | self.degenerate_CH4_manifold.DESCR 51 | 52 | 53 | class CSDTests(unittest.TestCase): 54 | @classmethod 55 | def setUpClass(cls): 56 | cls.csd = load_csd_1000r() 57 | 58 | def test_load_csd_1000r_shape(self): 59 | # test if representations and properties have commensurate shape 60 | self.assertTrue(self.csd.data.X.shape[0] == self.csd.data.y.shape[0]) 61 | 62 | def test_load_csd_1000r_access_descr(self): 63 | self.csd.DESCR 64 | 65 | 66 | class WHOTests(unittest.TestCase): 67 | @classmethod 68 | def setUpClass(cls): 69 | cls.size = 24240 70 | cls.shape = (2020, 12) 71 | cls.value = 5.00977993011475 72 | try: 73 | import pandas as pd # NoQa: F401 74 | 75 | cls.has_pandas = True 76 | cls.who = load_who_dataset() 77 | except ImportError: 78 | cls.has_pandas = False 79 | 80 | def test_load_dataset_without_pandas(self): 81 | """Check if the correct exception occurs when pandas isn't present.""" 82 | with unittest.mock.patch.dict("sys.modules", {"pandas": None}): 83 | with self.assertRaises(ImportError) as cm: 84 | _ = load_who_dataset() 85 | self.assertEqual(str(cm.exception), 
"load_who_dataset requires pandas.") 86 | 87 | def test_dataset_size_and_shape(self): 88 | """ 89 | Check if the correct number of datapoints are present in the dataset. 90 | Also check if the size of the dataset is correct. 91 | """ 92 | if self.has_pandas is True: 93 | self.assertEqual(self.who["data"].size, self.size) 94 | self.assertEqual(self.who["data"].shape, self.shape) 95 | 96 | def test_datapoint_value(self): 97 | """Check if the value of a datapoint at a certain location is correct.""" 98 | if self.has_pandas is True: 99 | self.assertTrue( 100 | np.allclose( 101 | self.who["data"]["SE.XPD.TOTL.GD.ZS"][1924], self.value, rtol=1e-6 102 | ) 103 | ) 104 | 105 | 106 | class ROYTests(unittest.TestCase): 107 | @classmethod 108 | def setUpClass(cls): 109 | cls.size = 264 110 | cls.shape = (264, 32) 111 | cls.roy = load_roy_dataset() 112 | 113 | def test_dataset_content(self): 114 | """Check if the correct number of datapoints are present in the dataset. 115 | 116 | Also check if the size of the dataset is correct. 117 | """ 118 | self.assertEqual(len(self.roy["structure_types"]), self.size) 119 | self.assertEqual(self.roy["features"].shape, self.shape) 120 | self.assertEqual(len(self.roy["energies"]), self.size) 121 | 122 | 123 | class HBondTests(unittest.TestCase): 124 | @classmethod 125 | def setUpClass(cls): 126 | cls.size = 27233 127 | cls.shape = (27233, 3) 128 | cls.hbond = load_hbond_dataset() 129 | 130 | def test_dataset_size_and_shape(self): 131 | """ 132 | Check if the correct number of datapoints are present in the dataset. 133 | Also check if the size of the dataset is correct. 134 | """ 135 | self.assertEqual(self.hbond["descriptors"].shape, self.shape) 136 | self.assertEqual(self.hbond["weights"].size, self.size) 137 | 138 | 139 | if __name__ == "__main__": 140 | unittest.main() 141 | -------------------------------------------------------------------------------- /tests/test_feature_pcov_cur.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import numpy as np 4 | from sklearn.datasets import load_diabetes as get_dataset 5 | 6 | from skmatter.feature_selection import PCovCUR 7 | 8 | 9 | class TestPCovCUR(unittest.TestCase): 10 | def setUp(self): 11 | self.X, self.y = get_dataset(return_X_y=True) 12 | self.idx = [2, 8, 3, 4, 1, 7, 5, 9, 6] 13 | 14 | def test_known(self): 15 | """Check that the model returns a known set of indices.""" 16 | selector = PCovCUR(n_to_select=9) 17 | selector.fit(self.X, self.y) 18 | 19 | self.assertTrue(np.allclose(selector.selected_idx_, self.idx)) 20 | 21 | def test_restart(self): 22 | """Check that the model can be restarted with a new instance.""" 23 | selector = PCovCUR(n_to_select=1) 24 | selector.fit(self.X, self.y) 25 | 26 | for i in range(len(self.idx) - 2): 27 | selector.n_to_select += 1 28 | selector.fit(self.X, self.y, warm_start=True) 29 | self.assertEqual(selector.selected_idx_[i], self.idx[i]) 30 | 31 | def test_non_it(self): 32 | """Check that the model can be run non-iteratively.""" 33 | self.idx = [2, 8, 3, 6, 7, 9, 1, 0, 5] 34 | selector = PCovCUR(n_to_select=9, recompute_every=0) 35 | selector.fit(self.X, self.y) 36 | 37 | self.assertTrue(np.allclose(selector.selected_idx_, self.idx)) 38 | 39 | 40 | if __name__ == "__main__": 41 | unittest.main(verbosity=2) 42 | -------------------------------------------------------------------------------- /tests/test_feature_pcov_fps.py: -------------------------------------------------------------------------------- 1 | import 
unittest 2 | 3 | from sklearn.datasets import load_diabetes as get_dataset 4 | 5 | from skmatter.feature_selection import PCovFPS 6 | 7 | 8 | class TestPCovFPS(unittest.TestCase): 9 | def setUp(self): 10 | self.X, self.y = get_dataset(return_X_y=True) 11 | self.idx = [0, 2, 6, 7, 1, 3, 4] 12 | 13 | def test_restart(self): 14 | """Check that the model can be restarted with a new number of features and 15 | `warm_start`. 16 | """ 17 | selector = PCovFPS(n_to_select=1, initialize=self.idx[0]) 18 | selector.fit(self.X, y=self.y) 19 | 20 | for i in range(2, len(self.idx)): 21 | selector.n_to_select = i 22 | selector.fit(self.X, y=self.y, warm_start=True) 23 | self.assertEqual(selector.selected_idx_[i - 1], self.idx[i - 1]) 24 | 25 | def test_no_mixing_1(self): 26 | """Check that the model throws an error when mixing = 1.0.""" 27 | selector = PCovFPS(n_to_select=1, mixing=1.0) 28 | with self.assertRaises(ValueError) as cm: 29 | selector.fit(self.X, y=self.y) 30 | self.assertEqual( 31 | str(cm.exception), 32 | "Mixing = 1.0 corresponds to traditional FPS. Please use the FPS class.", 33 | ) 34 | 35 | 36 | if __name__ == "__main__": 37 | unittest.main(verbosity=2) 38 | -------------------------------------------------------------------------------- /tests/test_feature_simple_cur.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import numpy as np 4 | from sklearn import exceptions 5 | 6 | from skmatter.datasets import load_csd_1000r as load 7 | from skmatter.feature_selection import CUR, FPS 8 | 9 | 10 | class TestCUR(unittest.TestCase): 11 | def setUp(self): 12 | self.X, _ = load(return_X_y=True) 13 | self.X = FPS(n_to_select=10).fit(self.X).transform(self.X) 14 | 15 | def test_bad_transform(self): 16 | selector = CUR(n_to_select=2) 17 | with self.assertRaises(exceptions.NotFittedError): 18 | _ = selector.transform(self.X) 19 | 20 | def test_restart(self): 21 | """Check that the model can be restarted with a new instance.""" 22 | ref_selector = CUR(n_to_select=self.X.shape[-1] - 3).fit(X=self.X) 23 | ref_idx = ref_selector.selected_idx_ 24 | 25 | selector = CUR(n_to_select=1) 26 | selector.fit(self.X) 27 | 28 | for i in range(self.X.shape[-1] - 3): 29 | selector.n_to_select += 1 30 | selector.fit(self.X, warm_start=True) 31 | self.assertEqual(selector.selected_idx_[i], ref_idx[i]) 32 | 33 | def test_non_it(self): 34 | """Check that the model can be run non-iteratively.""" 35 | C = self.X.T @ self.X 36 | _, UC = np.linalg.eigh(C) 37 | ref_idx = np.argsort(-(UC[:, -1] ** 2.0))[:-1] 38 | 39 | selector = CUR(n_to_select=self.X.shape[-1] - 1, recompute_every=0) 40 | selector.fit(self.X) 41 | 42 | self.assertTrue(np.allclose(selector.selected_idx_, ref_idx)) 43 | 44 | 45 | if __name__ == "__main__": 46 | unittest.main(verbosity=2) 47 | -------------------------------------------------------------------------------- /tests/test_feature_simple_fps.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import numpy as np 4 | from sklearn.datasets import load_diabetes as get_dataset 5 | from sklearn.utils.validation import NotFittedError 6 | 7 | from skmatter.feature_selection import FPS 8 | 9 | 10 | class TestFPS(unittest.TestCase): 11 | def setUp(self): 12 | self.X, _ = get_dataset(return_X_y=True) 13 | self.idx = [0, 6, 1, 2, 4, 9, 3] 14 | 15 | def test_restart(self): 16 | """ 17 | Check that the model can be restarted with a new number of 18 | features and `warm_start` 19 | """ 20 | 
selector = FPS(n_to_select=1, initialize=self.idx[0]) 21 | selector.fit(self.X) 22 | 23 | for i in range(2, len(self.idx)): 24 | selector.n_to_select = i 25 | selector.fit(self.X, warm_start=True) 26 | self.assertEqual(selector.selected_idx_[i - 1], self.idx[i - 1]) 27 | 28 | def test_initialize(self): 29 | """Check that the model can be initialized in all applicable manners and throws 30 | an error otherwise. 31 | """ 32 | for initialize in [self.idx[0], "random"]: 33 | with self.subTest(initialize=initialize): 34 | selector = FPS(n_to_select=1, initialize=initialize) 35 | selector.fit(self.X) 36 | 37 | initialize = self.idx[:4] 38 | with self.subTest(initialize=initialize): 39 | selector = FPS(n_to_select=len(self.idx) - 1, initialize=initialize) 40 | selector.fit(self.X) 41 | for i in range(4): 42 | self.assertEqual(selector.selected_idx_[i], self.idx[i]) 43 | 44 | initialize = np.array(self.idx[:4]) 45 | with self.subTest(initialize=initialize): 46 | selector = FPS(n_to_select=len(self.idx) - 1, initialize=initialize) 47 | selector.fit(self.X) 48 | for i in range(4): 49 | self.assertEqual(selector.selected_idx_[i], self.idx[i]) 50 | 51 | initialize = np.array([1, 5, 3, 0.25]) 52 | with self.subTest(initialize=initialize): 53 | with self.assertRaises(ValueError) as cm: 54 | selector = FPS(n_to_select=len(self.idx) - 1, initialize=initialize) 55 | selector.fit(self.X) 56 | self.assertEqual( 57 | str(cm.exception), "Invalid value of the initialize parameter" 58 | ) 59 | 60 | initialize = np.array([[1, 5, 3], [2, 4, 6]]) 61 | with self.subTest(initialize=initialize): 62 | with self.assertRaises(ValueError) as cm: 63 | selector = FPS(n_to_select=len(self.idx) - 1, initialize=initialize) 64 | selector.fit(self.X) 65 | self.assertEqual( 66 | str(cm.exception), "Invalid value of the initialize parameter" 67 | ) 68 | 69 | with self.assertRaises(ValueError) as cm: 70 | selector = FPS(n_to_select=1, initialize="bad") 71 | selector.fit(self.X) 72 | self.assertEqual(str(cm.exception), "Invalid value of the initialize parameter") 73 | 74 | def test_get_distances(self): 75 | """Check that the hausdorff distances are returnable after fitting.""" 76 | selector = FPS(n_to_select=7) 77 | selector.fit(self.X) 78 | d = selector.get_select_distance() 79 | 80 | dist_grad = d[1:-1] - d[2:] 81 | self.assertTrue(all(dist_grad > 0)) 82 | 83 | with self.assertRaises(NotFittedError): 84 | selector = FPS(n_to_select=7) 85 | _ = selector.get_select_distance() 86 | 87 | 88 | if __name__ == "__main__": 89 | unittest.main(verbosity=2) 90 | -------------------------------------------------------------------------------- /tests/test_greedy_selector.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import numpy as np 4 | from sklearn.datasets import load_diabetes as get_dataset 5 | from sklearn.exceptions import NotFittedError 6 | 7 | from skmatter._selection import GreedySelector 8 | 9 | 10 | class GreedyTester(GreedySelector): 11 | def __init__( 12 | self, n_to_select=None, score_threshold=None, selection_type="feature", **kwargs 13 | ): 14 | super().__init__( 15 | selection_type=selection_type, 16 | n_to_select=n_to_select, 17 | score_threshold=score_threshold, 18 | **kwargs, 19 | ) 20 | 21 | def score(self, X, y=None): 22 | scores = np.linalg.norm(X, axis=0) 23 | scores[self.selected_idx_] = 0.0 24 | return scores 25 | 26 | 27 | class TestGreedy(unittest.TestCase): 28 | def setUp(self): 29 | self.X, _ = get_dataset(return_X_y=True) 30 | 31 | def 
test_bad_type(self):
32 |         with self.assertRaises(
33 |             ValueError, msg="Only feature and sample selection supported."
34 |         ):
35 |             _ = GreedyTester(selection_type="bad").fit(self.X)
36 | 
37 |     def test_score_threshold(self):
38 |         selector = GreedyTester(score_threshold=200, n_to_select=7)
39 |         with self.assertWarns(
40 |             Warning, msg="Score threshold of 200 reached. Terminating search at 6 / 7."
41 |         ):
42 |             selector.fit(self.X)
43 | 
44 |     def test_score_threshold_and_full(self):
45 |         with self.assertRaises(ValueError) as cm:
46 |             _ = GreedyTester(score_threshold=20, full=True, n_to_select=12).fit(self.X)
47 |         self.assertEqual(
48 |             str(cm.exception),
49 |             "You cannot specify both `score_threshold` and `full=True`.",
50 |         )
51 | 
52 |     def test_bad_score_threshold_type(self):
53 |         with self.assertRaises(ValueError) as cm:
54 |             _ = GreedyTester(score_threshold_type="bad").fit(self.X)
55 |         self.assertEqual(
56 |             str(cm.exception),
57 |             "invalid score_threshold_type, expected one of 'relative' or 'absolute'",
58 |         )
59 | 
60 |     def test_bad_warm_start(self):
61 |         selector = GreedyTester()
62 |         with self.assertRaises(ValueError) as cm:
63 |             selector.fit(self.X, warm_start=True)
64 |         self.assertEqual(
65 |             str(cm.exception),
66 |             "Cannot fit with warm_start=True without having been previously "
67 |             "initialized",
68 |         )
69 | 
70 |     def test_bad_y(self):
71 |         self.X, self.Y = get_dataset(return_X_y=True)
72 |         Y = self.Y[:2]
73 |         selector = GreedyTester(n_to_select=2)
74 |         with self.assertRaises(ValueError):
75 |             selector.fit(X=self.X, y=Y)
76 | 
77 |     def test_bad_transform(self):
78 |         selector = GreedyTester(n_to_select=2)
79 |         selector.fit(self.X)
80 |         with self.assertRaises(ValueError) as cm:
81 |             _ = selector.transform(self.X[:, :3])
82 |         self.assertEqual(
83 |             str(cm.exception),
84 |             "X has 3 features, but GreedyTester is expecting 10 features as input.",
85 |         )
86 | 
87 |     def test_no_nfeatures(self):
88 |         selector = GreedyTester()
89 |         selector.fit(self.X)
90 |         self.assertEqual(len(selector.selected_idx_), self.X.shape[1] // 2)
91 | 
92 |     def test_decimal_nfeatures(self):
93 |         selector = GreedyTester(n_to_select=0.2)
94 |         selector.fit(self.X)
95 |         self.assertEqual(len(selector.selected_idx_), int(self.X.shape[1] * 0.2))
96 | 
97 |     def test_bad_nfeatures(self):
98 |         for nf in [1.2, "1", 20]:
99 |             with self.subTest(n_features=nf):
100 |                 selector = GreedyTester(n_to_select=nf)
101 |                 with self.assertRaises(ValueError) as cm:
102 |                     selector.fit(self.X)
103 |                 self.assertEqual(
104 |                     str(cm.exception),
105 |                     (
106 |                         "n_to_select must be either None, an integer in "
107 |                         "[1, n_features] representing the absolute number "
108 |                         "of features, or a float in (0, 1] representing a "
109 |                         f"percentage of features to select. Got {nf} "
110 |                         f"features and an input with {self.X.shape[1]} feature."
111 | ), 112 | ) 113 | 114 | def test_not_fitted(self): 115 | with self.assertRaises(NotFittedError): 116 | selector = GreedyTester() 117 | _ = selector._get_support_mask() 118 | 119 | def test_fitted(self): 120 | selector = GreedyTester() 121 | selector.fit(self.X) 122 | _ = selector._get_support_mask() 123 | 124 | Xr = selector.transform(self.X) 125 | self.assertEqual(Xr.shape[1], self.X.shape[1] // 2) 126 | 127 | def test_size_input(self): 128 | X = np.array([1, 2, 3, 4, 5]).reshape(-1, 1) 129 | selector_sample = GreedyTester(selection_type="sample") 130 | selector_feature = GreedyTester(selection_type="feature") 131 | with self.assertRaises(ValueError) as cm: 132 | selector_feature.fit(X) 133 | self.assertEqual( 134 | str(cm.exception), 135 | f"Found array with 1 feature(s) (shape={X.shape}) while a minimum of 2 is " 136 | "required by GreedyTester.", 137 | ) 138 | 139 | X = X.reshape(1, -1) 140 | 141 | with self.assertRaises(ValueError) as cm: 142 | selector_sample.fit(X) 143 | self.assertEqual( 144 | str(cm.exception), 145 | f"Found array with 1 sample(s) (shape={X.shape}) while a minimum of 2 is " 146 | "required by GreedyTester.", 147 | ) 148 | 149 | 150 | if __name__ == "__main__": 151 | unittest.main(verbosity=2) 152 | -------------------------------------------------------------------------------- /tests/test_kernel_normalizer.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import numpy as np 4 | import sklearn 5 | 6 | from skmatter.preprocessing import KernelNormalizer 7 | 8 | 9 | class KernelTests(unittest.TestCase): 10 | def __init__(self, *args, **kwargs): 11 | super().__init__(*args, **kwargs) 12 | self.random_state = np.random.RandomState(0) 13 | 14 | def test_sample_weights(self): 15 | """Checks that sample weights of one are equal to the unweighted case and 16 | that nonuniform weights are different from the unweighted case. 17 | """ 18 | K = self.random_state.uniform(0, 100, size=(3, 3)) 19 | equal_wts = np.ones(len(K)) 20 | nonequal_wts = self.random_state.uniform(0, 100, size=(len(K),)) 21 | model = KernelNormalizer() 22 | weighted_model = KernelNormalizer() 23 | K_unweighted = model.fit_transform(K) 24 | K_equal_weighted = weighted_model.fit_transform(K, sample_weight=equal_wts) 25 | self.assertTrue((np.isclose(K_unweighted, K_equal_weighted, atol=1e-12)).all()) 26 | K_nonequal_weighted = weighted_model.fit_transform( 27 | K, sample_weight=nonequal_wts 28 | ) 29 | self.assertFalse( 30 | (np.isclose(K_unweighted, K_nonequal_weighted, atol=1e-12)).all() 31 | ) 32 | 33 | def test_invalid_sample_weights(self): 34 | """Checks that weights must be 1D array with the same length as the number of 35 | samples. 36 | """ 37 | K = self.random_state.uniform(0, 100, size=(3, 3)) 38 | wts_len = np.ones(len(K) + 1) 39 | wts_dim = np.ones((len(K), 2)) 40 | model = KernelNormalizer() 41 | with self.assertRaises(ValueError): 42 | model.fit_transform(K, sample_weight=wts_len) 43 | with self.assertRaises(ValueError): 44 | model.fit_transform(K, sample_weight=wts_dim) 45 | 46 | def test_ValueError(self): 47 | """Checks that a non-square matrix cannot be normalized.""" 48 | K = self.random_state.uniform(0, 100, size=(3, 4)) 49 | model = KernelNormalizer() 50 | with self.assertRaises(ValueError): 51 | model.fit(K) 52 | 53 | def test_reference_ValueError(self): 54 | """Checks that it is impossible to normalize a matrix with a non-coincident 55 | size with the reference. 
56 | """ 57 | K = self.random_state.uniform(0, 100, size=(3, 3)) 58 | K_2 = self.random_state.uniform(0, 100, size=(2, 2)) 59 | model = KernelNormalizer() 60 | model = model.fit(K) 61 | with self.assertRaises(ValueError): 62 | model.transform(K_2) 63 | 64 | def test_NotFittedError_transform(self): 65 | """Checks that an error is returned when trying to use the transform function 66 | before the fit function. 67 | """ 68 | K = self.random_state.uniform(0, 100, size=(3, 3)) 69 | model = KernelNormalizer() 70 | with self.assertRaises(sklearn.exceptions.NotFittedError): 71 | model.transform(K) 72 | 73 | def test_fit_transform(self): 74 | """Checks that the kernel is correctly normalized. 75 | 76 | Compare with the value calculated directly from the equation. 77 | """ 78 | K = self.random_state.uniform(0, 100, size=(3, 3)) 79 | model = KernelNormalizer() 80 | Ktr = model.fit_transform(K) 81 | Kc = K - K.mean(axis=0) - K.mean(axis=1)[:, np.newaxis] + K.mean() 82 | Kc /= np.trace(Kc) / Kc.shape[0] 83 | 84 | self.assertTrue((np.isclose(Ktr, Kc, atol=1e-12)).all()) 85 | 86 | def test_center_only(self): 87 | """Checks that the kernel is correctly centered, 88 | but not normalized. 89 | Compare with the value calculated 90 | directly from the equation. 91 | """ 92 | K = self.random_state.uniform(0, 100, size=(3, 3)) 93 | model = KernelNormalizer(with_center=True, with_trace=False) 94 | Ktr = model.fit_transform(K) 95 | Kc = K - K.mean(axis=0) - K.mean(axis=1)[:, np.newaxis] + K.mean() 96 | 97 | self.assertTrue((np.isclose(Ktr, Kc, atol=1e-12)).all()) 98 | 99 | def test_trace_only(self): 100 | """Checks that the kernel is correctly normalized, 101 | but not centered. 102 | Compare with the value calculated 103 | directly from the equation. 104 | """ 105 | K = self.random_state.uniform(0, 100, size=(3, 3)) 106 | model = KernelNormalizer(with_center=False, with_trace=True) 107 | Ktr = model.fit_transform(K) 108 | Kc = K.copy() 109 | Kc /= np.trace(Kc) / Kc.shape[0] 110 | 111 | self.assertTrue((np.isclose(Ktr, Kc, atol=1e-12)).all()) 112 | 113 | def test_no_preprocessing(self): 114 | """Checks that the kernel is unchanged 115 | if no preprocessing is specified. 
116 | """ 117 | K = self.random_state.uniform(0, 100, size=(3, 3)) 118 | model = KernelNormalizer(with_center=False, with_trace=False) 119 | Ktr = model.fit_transform(K) 120 | Kc = K.copy() 121 | self.assertTrue((np.isclose(Ktr, Kc, atol=1e-12)).all()) 122 | 123 | 124 | if __name__ == "__main__": 125 | unittest.main() 126 | -------------------------------------------------------------------------------- /tests/test_model_selection.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import sklearn.model_selection 4 | from sklearn.datasets import load_iris 5 | 6 | import skmatter.model_selection 7 | 8 | 9 | class SplitTests(unittest.TestCase): 10 | @classmethod 11 | def setUpClass(cls): 12 | cls.X = load_iris().data[:10] 13 | cls.seed = 0x5F3759DF 14 | 15 | def test_train_test_splits(self): 16 | # see if train_test_split of skmatter agrees with the one of sklearn 17 | sklearn_outputs = sklearn.model_selection.train_test_split( 18 | self.X, random_state=self.seed 19 | ) 20 | skmatter_outputs = skmatter.model_selection.train_test_split( 21 | self.X, random_state=self.seed 22 | ) 23 | for i in range(len(skmatter_outputs)): 24 | self.assertTrue((sklearn_outputs[i] == skmatter_outputs[i]).all()) 25 | 26 | def test_train_test_splits_train_test_overlap(self): 27 | # tests that a test/train split which necessitates overlap returns the right 28 | # number of points in each set 29 | X_train, X_test = skmatter.model_selection.train_test_split( 30 | self.X, 31 | train_size=0.8, 32 | test_size=0.8, 33 | train_test_overlap=True, 34 | random_state=self.seed, 35 | ) 36 | self.assertTrue(len(X_train) == len(X_test) == int(0.8 * self.X.shape[0])) 37 | 38 | def test_train_test_splits_train_test_overlap_full_test_set(self): 39 | # tests that the entire dataset can be used as the testing set 40 | X_train, X_test = skmatter.model_selection.train_test_split( 41 | self.X, 42 | train_size=0.8, 43 | test_size=1.0, 44 | train_test_overlap=True, 45 | random_state=self.seed, 46 | ) 47 | self.assertTrue((self.X == X_test).all()) 48 | 49 | def test_train_test_splits_train_test_overlap_full_train_test_set(self): 50 | # tests that the full dataset can be "split" to both train and test set 51 | X_train, X_test = skmatter.model_selection.train_test_split( 52 | self.X, 53 | train_size=1.0, 54 | test_size=1.0, 55 | train_test_overlap=True, 56 | random_state=self.seed, 57 | ) 58 | self.assertTrue((X_train == X_test).all()) 59 | 60 | 61 | if __name__ == "__main__": 62 | unittest.main() 63 | -------------------------------------------------------------------------------- /tests/test_neighbors.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import numpy as np 4 | 5 | from skmatter.feature_selection import FPS 6 | from skmatter.neighbors import SparseKDE 7 | from skmatter.neighbors._sparsekde import _covariance 8 | from skmatter.utils import effdim, oas 9 | 10 | 11 | class SparseKDETests(unittest.TestCase): 12 | @classmethod 13 | def setUpClass(cls) -> None: 14 | np.random.seed(0) 15 | cls.n_samples_per_cov = 10000 16 | cls.samples = np.concatenate( 17 | [ 18 | np.random.multivariate_normal( 19 | [0, 0], [[1, 0.5], [0.5, 1]], cls.n_samples_per_cov 20 | ), 21 | np.random.multivariate_normal( 22 | [4, 4], [[1, 0.5], [0.5, 0.5]], cls.n_samples_per_cov 23 | ), 24 | ] 25 | ) 26 | cls.sample_results = np.array( 27 | [[4.56393465, 4.20566218], [0.73562454, 1.11116178]] 28 | ) 29 | cls.selector = 
FPS(n_to_select=int(np.sqrt(2 * cls.n_samples_per_cov)))
30 |         cls.grids = cls.selector.fit_transform(cls.samples.T).T
31 |         cls.expect_score_fp = -759.831
32 |         cls.expect_score_fs = -781.567
33 | 
34 |         cls.cell = np.array([4, 4])
35 |         cls.expect_score_periodic = -456.744
36 | 
37 |     def test_sparse_kde(self):
38 |         estimator = SparseKDE(self.samples, None, fpoints=0.5)
39 |         estimator.fit(self.grids)
40 |         self.assertTrue(round(estimator.score(self.grids), 3) == self.expect_score_fp)
41 |         self.assertTrue(np.allclose(estimator.sample(2), self.sample_results))
42 | 
43 |     def test_sparse_kde_fs(self):
44 |         estimator = SparseKDE(self.samples, None, fspread=0.5)
45 |         estimator.fit(self.grids)
46 |         self.assertTrue(round(estimator.score(self.grids), 3) == self.expect_score_fs)
47 | 
48 |     def test_sparse_kde_periodic(self):
49 |         estimator = SparseKDE(
50 |             self.samples,
51 |             None,
52 |             metric_params={"cell_length": self.cell},
53 |             fpoints=0.5,
54 |         )
55 |         estimator.fit(self.grids)
56 |         self.assertTrue(
57 |             round(estimator.score(self.grids), 3) == self.expect_score_periodic
58 |         )
59 | 
60 |     def test_dimension_check(self):
61 |         estimator = SparseKDE(
62 |             self.samples, None, metric_params={"cell_length": self.cell}, fpoints=0.5
63 |         )
64 |         self.assertRaises(ValueError, estimator.fit, np.array([[4]]))
65 | 
66 |     def test_fs_fp_incompatibility(self):
67 |         estimator = SparseKDE(
68 |             self.samples,
69 |             None,
70 |             metric_params={"cell_length": self.cell},
71 |             fspread=2,
72 |             fpoints=0.5,
73 |         )
74 |         # fspread takes precedence: setting it disables fpoints (reset to -1)
75 |         self.assertTrue(estimator.fpoints == -1)
76 | 
77 | 
78 | class CovarianceTests(unittest.TestCase):
79 |     @classmethod
80 |     def setUpClass(cls):
81 |         cls.X = np.array([[1, 2], [3, 3], [4, 6]])
82 |         cls.expected_cov = np.array(
83 |             [[2.33333333, 2.83333333], [2.83333333, 4.33333333]]
84 |         )
85 |         cls.expected_cov_periodic = np.array(
86 |             [[1.12597216, 0.45645371], [0.45645371, 0.82318948]]
87 |         )
88 |         cls.cell = np.array([3, 3])
89 | 
90 |     def test_covariance(self):
91 |         cov = _covariance(self.X, np.full(len(self.X), 1 / len(self.X)), None)
92 |         self.assertTrue(np.allclose(cov, self.expected_cov))
93 | 
94 |     def test_covariance_periodic(self):
95 |         cov = _covariance(self.X, np.full(len(self.X), 1 / len(self.X)), self.cell)
96 |         self.assertTrue(np.allclose(cov, self.expected_cov_periodic))
97 | 
98 | 
99 | class EffdimTests(unittest.TestCase):
100 |     @classmethod
101 |     def setUpClass(cls):
102 |         cls.cov = np.array([[1, 1, 0], [1, 1.5, 0], [0, 0, 1]], dtype=np.float64)
103 |         cls.expected_effdim = 2.24909102090124
104 | 
105 |     def test_effdim(self):
106 |         self.assertTrue(np.allclose(effdim(self.cov), self.expected_effdim))
107 | 
108 | 
109 | class OASTests(unittest.TestCase):
110 |     @classmethod
111 |     def setUpClass(cls):
112 |         cls.cov = np.array([[0.5, 1.0], [0.7, 0.4]])
113 |         cls.n = 10
114 |         cls.D = 2
115 |         cls.expected_oas = np.array(
116 |             [[0.48903924, 0.78078484], [0.54654939, 0.41096076]]
117 |         )
118 | 
119 |     def test_oas(self):
120 |         self.assertTrue(np.allclose(oas(self.cov, self.n, self.D), self.expected_oas))
121 | 
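# --- Illustrative sketch (not part of the test suite): the SparseKDE
# workflow exercised above, on arbitrary data. Grid points are selected with
# FPS, the estimator is fitted on them, and the fit is scored and sampled.
#
#     import numpy as np
#     from skmatter.feature_selection import FPS
#     from skmatter.neighbors import SparseKDE
#
#     samples = np.random.default_rng(0).normal(size=(1000, 2))
#     grids = FPS(n_to_select=40).fit_transform(samples.T).T
#     kde = SparseKDE(samples, None, fpoints=0.5)
#     kde.fit(grids)
#     log_like = kde.score(grids)  # log-likelihood of the grid points
#     draws = kde.sample(2)        # draw two new points from the density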
--------------------------------------------------------------------------------
/tests/test_pcovr_distances.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | 
3 | import numpy as np
4 | import scipy
5 | from sklearn.datasets import load_diabetes as get_dataset
6 | 
7 | from skmatter.utils import pcovr_covariance, pcovr_kernel
8 | 
9 | 
10 | class CovarianceTest(unittest.TestCase):
11 |     def __init__(self, *args, **kwargs):
12 |         super().__init__(*args, **kwargs)
13 |         self.X, self.Y = get_dataset(return_X_y=True)
14 | 
15 |     def test_alphas(self):
16 |         C_X = self.X.T @ self.X
17 | 
18 |         C_inv = np.linalg.pinv(C_X, rcond=1e-12)
19 |         C_isqrt = np.real(scipy.linalg.sqrtm(C_inv))
20 | 
21 |         # parentheses speed up calculation greatly
22 |         C_Y = C_isqrt @ (self.X.T @ self.Y)
23 |         C_Y = C_Y.reshape((C_X.shape[0], -1))
24 |         C_Y = np.real(C_Y)
25 |         C_Y = C_Y @ C_Y.T
26 | 
27 |         for alpha in [0.0, 0.5, 1.0]:
28 |             with self.subTest(alpha=alpha):
29 |                 C = pcovr_covariance(alpha, X=self.X, Y=self.Y, rcond=1e-6)
30 |                 self.assertTrue(np.allclose(C, alpha * C_X + (1 - alpha) * C_Y))
31 | 
32 |     def test_no_return_isqrt(self):
33 |         with self.assertRaises(ValueError):
34 |             _, _ = pcovr_covariance(0.5, self.X, self.Y, return_isqrt=False)
35 | 
36 |     def test_inverse_covariance(self):
37 |         rcond = 1e-12
38 |         rng = np.random.default_rng(0)
39 | 
40 |         # Make some random data where the last feature
41 |         # is a linear combination of the other features.
42 |         # This gives us a covariance with a zero eigenvalue
43 |         # that should be dropped (via rcond).
44 |         # Hence, the inverse square root covariance
45 |         # should be identical between the "full"
46 |         # computation (eigh) and the approximate
47 |         # computation that takes the top n_features-1
48 |         # singular values (randomized svd).
49 |         X = rng.random((10, 5))
50 |         Y = rng.random(10)
51 |         x = rng.random(5)
52 |         Xx = np.column_stack((X, np.sum(X * x, axis=1)))
53 |         Xx -= np.mean(Xx, axis=0)
54 | 
55 |         C_inv = np.linalg.pinv(Xx.T @ Xx, rcond=rcond)
56 |         C_isqrt = np.real(scipy.linalg.sqrtm(C_inv))
57 | 
58 |         _, C_isqrt_eigh = pcovr_covariance(0.5, Xx, Y, return_isqrt=True, rcond=rcond)
59 |         _, C_isqrt_svd = pcovr_covariance(
60 |             0.5, Xx, Y, return_isqrt=True, rank=min(Xx.shape) - 1, rcond=rcond
61 |         )
62 | 
63 |         for C, C_type in zip([C_isqrt_eigh, C_isqrt_svd], ["eigh", "svd"]):
64 |             with self.subTest(C_isqrt_type=C_type):
65 |                 self.assertTrue(np.allclose(C_isqrt, C))
66 | 
67 | 
68 | class KernelTest(unittest.TestCase):
69 |     def __init__(self, *args, **kwargs):
70 |         super().__init__(*args, **kwargs)
71 |         self.X, self.Y = get_dataset(return_X_y=True)
72 | 
73 |     def test_alphas(self):
74 |         K_X = self.X @ self.X.T
75 |         K_Y = self.Y @ self.Y.T
76 | 
77 |         for alpha in [0.0, 0.5, 1.0]:
78 |             with self.subTest(alpha=alpha):
79 |                 K = pcovr_kernel(alpha, self.X, self.Y)
80 |                 self.assertTrue(np.allclose(K, alpha * K_X + (1 - alpha) * K_Y))
81 | 
82 | 
83 | if __name__ == "__main__":
84 |     unittest.main(verbosity=2)
85 | 
--------------------------------------------------------------------------------
/tests/test_progress_bar.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | 
3 | from skmatter.utils import get_progress_bar
4 | 
5 | 
6 | class PBarTest(unittest.TestCase):
7 |     def test_no_tqdm(self):
8 |         """Check that the model cannot use a progress bar when tqdm is not installed."""
9 |         import sys
10 | 
11 |         sys.modules["tqdm"] = None
12 | 
13 |         with self.assertRaises(ImportError) as cm:
14 |             _ = get_progress_bar()
15 |         self.assertEqual(
16 |             str(cm.exception),
17 |             "tqdm must be installed to use a progress bar. 
Either install tqdm or " 18 | "re-run with progress_bar = False", 19 | ) 20 | 21 | 22 | if __name__ == "__main__": 23 | unittest.main(verbosity=2) 24 | -------------------------------------------------------------------------------- /tests/test_sample_pcov_cur.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import numpy as np 4 | from sklearn.datasets import load_diabetes as get_dataset 5 | 6 | from skmatter.sample_selection import PCovCUR 7 | 8 | 9 | EPSILON = 1e-6 10 | 11 | 12 | class TestPCovCUR(unittest.TestCase): 13 | def setUp(self): 14 | self.X, self.y = get_dataset(return_X_y=True) 15 | self.X = self.X[:, :4] 16 | self.idx = [256, 304, 41, 408, 311, 364, 152, 78, 359, 102] 17 | 18 | def test_known(self): 19 | """Check that the model returns a known set of indices.""" 20 | selector = PCovCUR(n_to_select=10, mixing=0.5) 21 | selector.fit(self.X, self.y) 22 | 23 | self.assertTrue(np.allclose(selector.selected_idx_, self.idx)) 24 | 25 | def test_restart(self): 26 | """Check that the model can be restarted with a new instance.""" 27 | selector = PCovCUR(n_to_select=1, mixing=0.5) 28 | selector.fit(self.X, self.y) 29 | 30 | for i in range(len(self.idx) - 2): 31 | selector.n_to_select += 1 32 | selector.fit(self.X, self.y, warm_start=True) 33 | self.assertEqual(selector.selected_idx_[i], self.idx[i]) 34 | 35 | self.assertLessEqual( 36 | np.linalg.norm(selector.X_current_[self.idx[i]]), EPSILON 37 | ) 38 | 39 | for j in range(self.X.shape[0]): 40 | self.assertLessEqual( 41 | np.dot(selector.X_current_[self.idx[i]], selector.X_current_[j]), 42 | EPSILON, 43 | ) 44 | 45 | def test_non_it(self): 46 | """Check that the model can be run non-iteratively.""" 47 | self.idx = [256, 32, 138, 290, 362, 141, 359, 428, 254, 9] 48 | selector = PCovCUR(n_to_select=10, recompute_every=0) 49 | selector.fit(self.X, self.y) 50 | 51 | self.assertTrue(np.allclose(selector.selected_idx_, self.idx)) 52 | 53 | def test_multiple_k(self): 54 | """Check that the model can be run with multiple k's.""" 55 | for k in list(set(np.logspace(0, np.log10(min(self.X.shape)), 4, dtype=int))): 56 | selector = PCovCUR(n_to_select=10, k=k) 57 | selector.fit(self.X, self.y) 58 | 59 | 60 | if __name__ == "__main__": 61 | unittest.main(verbosity=2) 62 | -------------------------------------------------------------------------------- /tests/test_sample_pcov_fps.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from sklearn.datasets import load_diabetes as get_dataset 4 | 5 | from skmatter.sample_selection import PCovFPS 6 | 7 | 8 | class TestPCovFPS(unittest.TestCase): 9 | def setUp(self): 10 | self.X, self.y = get_dataset(return_X_y=True) 11 | self.idx = [0, 256, 156, 324, 349, 77, 113, 441, 426, 51] 12 | 13 | def test_restart(self): 14 | """Check that the model can be restarted with a new number of samples and 15 | `warm_start`. 
16 | """ 17 | selector = PCovFPS(n_to_select=1, initialize=self.idx[0]) 18 | selector.fit(self.X, y=self.y) 19 | 20 | for i in range(2, len(self.idx)): 21 | selector.n_to_select = i 22 | selector.fit(self.X, y=self.y, warm_start=True) 23 | self.assertEqual(selector.selected_idx_[i - 1], self.idx[i - 1]) 24 | 25 | def test_no_mixing_1(self): 26 | """Check that the model throws an error when mixing = 1.0.""" 27 | selector = PCovFPS(n_to_select=1, mixing=1.0) 28 | with self.assertRaises(ValueError) as cm: 29 | selector.fit(self.X, y=self.y) 30 | self.assertEqual( 31 | str(cm.exception), 32 | "Mixing = 1.0 corresponds to traditional FPS. Please use the FPS class.", 33 | ) 34 | 35 | 36 | if __name__ == "__main__": 37 | unittest.main(verbosity=2) 38 | -------------------------------------------------------------------------------- /tests/test_sample_simple_cur.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import numpy as np 4 | from sklearn.datasets import fetch_california_housing as load 5 | 6 | from skmatter.sample_selection import CUR, FPS 7 | 8 | 9 | class TestCUR(unittest.TestCase): 10 | def setUp(self): 11 | self.X, _ = load(return_X_y=True) 12 | self.X = self.X[FPS(n_to_select=100).fit(self.X).selected_idx_] 13 | self.n_select = min(20, min(self.X.shape) // 2) 14 | 15 | def test_sample_transform(self): 16 | """ 17 | Check that an error is raised when the transform function is used, 18 | because sklearn does not support well transformers that change the number 19 | of samples with other classes like Pipeline 20 | """ 21 | selector = CUR(n_to_select=1) 22 | selector.fit(self.X) 23 | with self.assertRaises(ValueError) as error: 24 | _ = selector.transform(self.X) 25 | 26 | self.assertTrue( 27 | "Transform is not currently supported for sample selection." 28 | == str(error.exception) 29 | ) 30 | 31 | def test_restart(self): 32 | """Check that the model can be restarted with a new instance""" 33 | ref_selector = CUR(n_to_select=self.n_select) 34 | ref_idx = ref_selector.fit(self.X).selected_idx_ 35 | 36 | selector = CUR(n_to_select=1) 37 | selector.fit(self.X) 38 | 39 | for i in range(len(ref_idx) - 2): 40 | selector.n_to_select += 1 41 | selector.fit(self.X, warm_start=True) 42 | self.assertEqual(selector.selected_idx_[i], ref_idx[i]) 43 | 44 | def test_non_it(self): 45 | """Check that the model can be run non-iteratively.""" 46 | K = self.X @ self.X.T 47 | _, UK = np.linalg.eigh(K) 48 | ref_idx = np.argsort(-(UK[:, -1] ** 2.0))[: self.n_select] 49 | 50 | selector = CUR(n_to_select=len(ref_idx), recompute_every=0) 51 | selector.fit(self.X) 52 | 53 | self.assertTrue(np.allclose(selector.selected_idx_, ref_idx)) 54 | 55 | 56 | if __name__ == "__main__": 57 | unittest.main(verbosity=2) 58 | -------------------------------------------------------------------------------- /tests/test_sample_simple_fps.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import numpy as np 4 | from sklearn.datasets import load_diabetes as get_dataset 5 | from sklearn.utils.validation import NotFittedError 6 | 7 | from skmatter.sample_selection import FPS 8 | 9 | 10 | class TestFPS(unittest.TestCase): 11 | def setUp(self): 12 | self.X, _ = get_dataset(return_X_y=True) 13 | self.idx = [0, 123, 441, 187, 117, 276, 261, 281, 251, 193] 14 | 15 | def test_restart(self): 16 | """Checks that the model can be restarted with a new number of samples and 17 | `warm_start`. 
18 | """ 19 | selector = FPS(n_to_select=1, initialize=self.idx[0]) 20 | selector.fit(self.X) 21 | 22 | for i in range(2, len(self.idx)): 23 | selector.n_to_select = i 24 | selector.fit(self.X, warm_start=True) 25 | self.assertEqual(selector.selected_idx_[i - 1], self.idx[i - 1]) 26 | 27 | def test_initialize(self): 28 | """Checks that the model can be initialized in all applicable manners and throws 29 | an error otherwise. 30 | """ 31 | for initialize in [self.idx[0], "random"]: 32 | with self.subTest(initialize=initialize): 33 | selector = FPS(n_to_select=1, initialize=initialize) 34 | selector.fit(self.X) 35 | 36 | initialize = self.idx[:4] 37 | with self.subTest(initialize=initialize): 38 | selector = FPS(n_to_select=len(self.idx) - 1, initialize=initialize) 39 | selector.fit(self.X) 40 | for i in range(4): 41 | self.assertEqual(selector.selected_idx_[i], self.idx[i]) 42 | 43 | initialize = np.array(self.idx[:4]) 44 | with self.subTest(initialize=initialize): 45 | selector = FPS(n_to_select=len(self.idx) - 1, initialize=initialize) 46 | selector.fit(self.X) 47 | for i in range(4): 48 | self.assertEqual(selector.selected_idx_[i], self.idx[i]) 49 | 50 | initialize = np.array([1, 5, 3, 0.25]) 51 | with self.subTest(initialize=initialize): 52 | with self.assertRaises(ValueError) as cm: 53 | selector = FPS(n_to_select=len(self.idx) - 1, initialize=initialize) 54 | selector.fit(self.X) 55 | self.assertEqual( 56 | str(cm.exception), "Invalid value of the initialize parameter" 57 | ) 58 | 59 | initialize = np.array([[1, 5, 3], [2, 4, 6]]) 60 | with self.subTest(initialize=initialize): 61 | with self.assertRaises(ValueError) as cm: 62 | selector = FPS(n_to_select=len(self.idx) - 1, initialize=initialize) 63 | selector.fit(self.X) 64 | self.assertEqual( 65 | str(cm.exception), "Invalid value of the initialize parameter" 66 | ) 67 | 68 | with self.assertRaises(ValueError) as cm: 69 | selector = FPS(n_to_select=1, initialize="bad") 70 | selector.fit(self.X) 71 | self.assertEqual(str(cm.exception), "Invalid value of the initialize parameter") 72 | 73 | def test_get_distances(self): 74 | """Checks that the hausdorff distances are returnable after fitting.""" 75 | selector = FPS(n_to_select=1) 76 | selector.fit(self.X) 77 | _ = selector.get_select_distance() 78 | 79 | with self.assertRaises(NotFittedError): 80 | selector = FPS(n_to_select=1) 81 | _ = selector.get_select_distance() 82 | 83 | def test_threshold(self): 84 | selector = FPS( 85 | n_to_select=10, 86 | score_threshold=5e-2, 87 | score_threshold_type="absolute", 88 | ) 89 | selector.fit(self.X) 90 | self.assertEqual(len(selector.selected_idx_), 6) 91 | self.assertEqual(selector.selected_idx_.tolist(), self.idx[:6]) 92 | 93 | selector = FPS( 94 | n_to_select=10, 95 | score_threshold=0.4, 96 | score_threshold_type="relative", 97 | ) 98 | selector.fit(self.X) 99 | self.assertEqual(len(selector.selected_idx_), 5) 100 | self.assertEqual(selector.selected_idx_.tolist(), self.idx[:5]) 101 | 102 | 103 | if __name__ == "__main__": 104 | unittest.main(verbosity=2) 105 | -------------------------------------------------------------------------------- /tests/test_voronoi_fps.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import numpy as np 4 | from sklearn.exceptions import NotFittedError 5 | from test_sample_simple_fps import TestFPS 6 | 7 | from skmatter.sample_selection import FPS, VoronoiFPS 8 | 9 | 10 | class TestVoronoiFPS(TestFPS): 11 | def setUp(self): 12 | super().setUp() 13 | 
14 |     def test_restart(self):
15 |         """Checks that the model can be restarted with a new number of
16 |         samples and `warm_start`
17 |         """
18 |         selector = VoronoiFPS(n_to_select=1, initialize=self.idx[0])
19 |         selector.fit(self.X)
20 | 
21 |         for i in range(2, len(self.idx)):
22 |             selector.n_to_select = i
23 |             selector.fit(self.X, warm_start=True)
24 |             self.assertEqual(selector.selected_idx_[i - 1], self.idx[i - 1])
25 | 
26 |     def test_initialize(self):
27 |         """Checks that the model can be initialized in all applicable manners
28 |         and throws an error otherwise
29 |         """
30 |         for initialize in [self.idx[0], "random"]:
31 |             with self.subTest(initialize=initialize):
32 |                 selector = VoronoiFPS(n_to_select=1, initialize=initialize)
33 |                 selector.fit(self.X)
34 | 
35 |         with self.assertRaises(ValueError) as cm:
36 |             selector = VoronoiFPS(n_to_select=1, initialize="bad")
37 |             selector.fit(self.X)
38 |         self.assertEqual(str(cm.exception), "Invalid value of the initialize parameter")
39 | 
40 |     def test_switching_point(self):
41 |         """Check the switching-point calculation performed in the
42 |         _init_greedy_search function
43 |         """
44 |         selector = VoronoiFPS(n_to_select=1)
45 |         selector.fit(self.X)
46 |         self.assertTrue(1 > selector.full_fraction)
47 | 
48 |         selector = VoronoiFPS(n_to_select=1, full_fraction=0.5)
49 |         selector.fit(self.X)
50 |         self.assertEqual(selector.full_fraction, 0.5)
51 | 
52 |         with self.subTest(name="bad_ntrial"):
53 |             with self.assertRaises(ValueError) as cm:
54 |                 selector = VoronoiFPS(n_to_select=1, n_trial_calculation=0)
55 |                 selector.fit(self.X)
56 |             self.assertEqual(
57 |                 str(cm.exception),
58 |                 "Number of trial calculation should be more or equal to 1",
59 |             )
60 | 
61 |         with self.subTest(name="float_ntrial"):
62 |             with self.assertRaises(TypeError) as cm:
63 |                 selector = VoronoiFPS(n_to_select=1, n_trial_calculation=0.3)
64 |                 selector.fit(self.X)
65 |             self.assertEqual(
66 |                 str(cm.exception), "Number of trial calculation should be integer"
67 |             )
68 | 
69 |         with self.subTest(name="large_ff"):
70 |             with self.assertRaises(ValueError) as cm:
71 |                 selector = VoronoiFPS(n_to_select=1, full_fraction=1.1)
72 |                 selector.fit(self.X)
73 |             self.assertEqual(
74 |                 str(cm.exception),
75 |                 "Switching point should be real and more than 0 and less than 1. "
76 |                 f"Received {selector.full_fraction}",
77 |             )
78 | 
79 |         with self.subTest(name="string_ff"):
80 |             with self.assertRaises(ValueError) as cm:
81 |                 selector = VoronoiFPS(n_to_select=1, full_fraction="STRING")
82 |                 selector.fit(self.X)
83 |             self.assertEqual(
84 |                 str(cm.exception),
85 |                 "Switching point should be real and more than 0 and less than 1. "
86 |                 f"Received {selector.full_fraction}",
87 |             )
88 | 
89 |     def test_get_distances(self):
90 |         """Checks that the hausdorff distances are returnable after fitting"""
91 |         selector = VoronoiFPS(n_to_select=1)
92 |         selector.fit(self.X)
93 |         _ = selector.get_select_distance()
94 | 
95 |         with self.assertRaises(NotFittedError):
96 |             selector = VoronoiFPS(n_to_select=1)
97 |             _ = selector.get_select_distance()
98 | 
99 |     def test_comparison(self):
100 |         """Checks that Voronoi FPS, which computes far fewer distances,
101 |         selects the same points as its plain FPS counterpart.
102 | """ 103 | vselector = VoronoiFPS(n_to_select=self.X.shape[0] - 1) 104 | vselector.fit(self.X) 105 | 106 | selector = FPS(n_to_select=self.X.shape[0] - 1) 107 | selector.fit(self.X) 108 | 109 | self.assertTrue(np.allclose(vselector.selected_idx_, selector.selected_idx_)) 110 | 111 | def test_nothing_updated_points(self): 112 | """Checks that in the case where we have no points to update, the code 113 | still works fine 114 | """ 115 | X = np.array([[1, 1], [4, 4], [10, 10], [100, 100]]) 116 | selector = VoronoiFPS(n_to_select=3, initialize=0) 117 | try: 118 | selector.fit(X) 119 | f = 1 120 | except Exception: 121 | f = 0 122 | self.assertEqual(f, 1) 123 | 124 | self.assertEqual( 125 | len(np.where(selector.vlocation_of_idx == (selector.n_selected_ - 2))[0]), 1 126 | ) 127 | 128 | def test_calculate_dSL(self): 129 | selector = VoronoiFPS(n_to_select=3) 130 | selector.fit(self.X) 131 | 132 | active_points = np.where( 133 | selector.dSL_[selector.vlocation_of_idx] < selector.hausdorff_ 134 | )[0] 135 | 136 | ap = selector._get_active(self.X, selector.selected_idx_[-1]) 137 | 138 | self.assertTrue( 139 | np.allclose( 140 | active_points, 141 | ap, 142 | ) 143 | ) 144 | 145 | selector = VoronoiFPS(n_to_select=1) 146 | 147 | ap = selector._get_active(self.X, 0) 148 | 149 | self.assertTrue( 150 | np.allclose( 151 | np.arange(self.X.shape[0]), 152 | ap, 153 | ) 154 | ) 155 | 156 | def test_score(self): 157 | """Check that function score return hausdorff distance""" 158 | selector = VoronoiFPS(n_to_select=3, initialize=0) 159 | selector.fit(self.X) 160 | 161 | self.assertTrue( 162 | np.allclose( 163 | selector.hausdorff_, 164 | selector.score(self.X, selector.selected_idx_[-1]), 165 | ) 166 | ) 167 | 168 | 169 | if __name__ == "__main__": 170 | unittest.main(verbosity=2) 171 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = 3 | lint 4 | build 5 | tests 6 | 7 | lint_folders = 8 | "{toxinidir}/src" \ 9 | "{toxinidir}/tests" \ 10 | "{toxinidir}/docs/src/" \ 11 | "{toxinidir}/examples" 12 | 13 | 14 | [testenv:build] 15 | description = Builds the package and checks integrity 16 | 17 | usedevelop = true 18 | deps = 19 | build 20 | check-manifest 21 | twine 22 | allowlist_externals = bash 23 | commands_pre = 24 | bash -c "if [ -e {toxinidir}/dist/*tar.gz ]; then unlink {toxinidir}/dist/*.whl; fi" 25 | bash -c "if [ -e {toxinidir}/dist/*tar.gz ]; then unlink {toxinidir}/dist/*.tar.gz; fi" 26 | commands = 27 | python -m build 28 | twine check dist/*.tar.gz dist/*.whl 29 | check-manifest {toxinidir} 30 | 31 | [testenv:tests] 32 | description = Runs the tests 33 | usedevelop = true 34 | changedir = tests 35 | deps = 36 | ase 37 | parameterized 38 | pytest 39 | pytest-cov 40 | tqdm 41 | 42 | commands = 43 | # Run unit tests 44 | pytest {posargs} 45 | 46 | # Run documentation tests 47 | pytest --doctest-modules --pyargs skmatter {posargs} 48 | 49 | [testenv:lint] 50 | description = Checks the code and doc for programmatic and stylistic errors 51 | skip_install = true 52 | deps = 53 | black 54 | blackdoc 55 | ruff 56 | isort 57 | sphinx-lint 58 | commands = 59 | ruff check {[tox]lint_folders} 60 | black --check --diff {[tox]lint_folders} 61 | blackdoc --check --diff {[tox]lint_folders} 62 | isort --check-only --diff {[tox]lint_folders} 63 | sphinx-lint --enable all --max-line-length 88 \ 64 | -i "{toxinidir}/docs/src/examples" \ 65 | {[tox]lint_folders} 
"{toxinidir}/README.rst" 66 | 67 | [testenv:{format,format-unsafe}] 68 | description = 69 | format: Formats files in working directory. 70 | format-unsafe: Formats files in working directory. Fixes more linter errors 71 | but might alter code logic. Result of this formatting should 72 | be double checked. 73 | skip_install = true 74 | deps = 75 | ruff 76 | black 77 | blackdoc 78 | isort 79 | commands = 80 | format: ruff check --fix {[tox]lint_folders} 81 | format-unsafe: ruff check --fix --unsafe-fixes {[tox]lint_folders} 82 | black {[tox]lint_folders} 83 | blackdoc {[tox]lint_folders} 84 | isort {[tox]lint_folders} 85 | 86 | [testenv:docs] 87 | description = Builds the documentation 88 | usedevelop = true 89 | deps = 90 | -r docs/requirements.txt 91 | # The documentation runs "examples" to produce outputs via sphinx-gallery. 92 | extras = examples 93 | commands = 94 | sphinx-build {posargs:-E} -W -b html docs/src docs/build/html 95 | --------------------------------------------------------------------------------