├── .codecov.yml ├── .github ├── PULL_REQUEST_TEMPLATE.md ├── dependabot.yml └── workflows │ ├── build.yml │ ├── docs.yml │ ├── documentation-links.yml │ ├── lint.yml │ ├── release.yml │ └── tests.yml ├── .gitignore ├── .readthedocs.yaml ├── CHANGELOG ├── CODE_OF_CONDUCT.md ├── LICENSE ├── MANIFEST.in ├── README.rst ├── docs ├── requirements.txt └── src │ ├── bibliography.rst │ ├── changelog.rst │ ├── conf.py │ ├── contributing.rst │ ├── getting-started.rst │ ├── index.rst │ ├── installation.rst │ ├── references │ ├── VoronoiFPS-Schematic.pdf │ ├── clustering.rst │ ├── datasets.rst │ ├── decomposition.rst │ ├── index.rst │ ├── linear_models.rst │ ├── metrics.rst │ ├── neighbors.rst │ ├── preprocessing.rst │ ├── selection.rst │ └── utils.rst │ └── tutorials.rst ├── examples ├── README.rst ├── neighbors │ ├── README.rst │ ├── pamm.py │ └── sparse-kde.py ├── pcovc │ ├── PCovC_Comparison.py │ ├── PCovC_Hyperparameters.py │ └── README.rst ├── pcovr │ ├── PCovR-WHODataset.py │ ├── PCovR.py │ ├── PCovR_Regressors.py │ ├── PCovR_Scaling.py │ └── README.rst ├── reconstruction │ ├── PlotGFRE.py │ ├── PlotLFRE.py │ ├── PlotPointwiseGFRE.py │ └── README.rst ├── regression │ ├── OrthogonalRegressionNonAnalytic.py │ ├── README.rst │ └── Ridge2FoldCVRegularization.py └── selection │ ├── FeatureSelection-WHODataset.py │ ├── FeatureSelection.py │ ├── GCH-ROY.py │ ├── README.rst │ └── Selectors-Pipelines.py ├── pyproject.toml ├── src └── skmatter │ ├── __init__.py │ ├── _selection.py │ ├── clustering │ ├── __init__.py │ └── _quick_shift.py │ ├── datasets │ ├── __init__.py │ ├── _base.py │ ├── data │ │ ├── beran_roy_properties.npz │ │ ├── csd-1000r.npz │ │ ├── degenerate_CH4_manifold.npz │ │ ├── h2o-blyp-piglet.npz │ │ ├── nice_dataset.npz │ │ └── who_dataset.csv │ └── descr │ │ ├── csd-1000r.rst │ │ ├── degenerate_CH4_manifold.rst │ │ ├── h2o-blyp-piglet.rst │ │ ├── nice_dataset.rst │ │ └── who_dataset.rst │ ├── decomposition │ ├── __init__.py │ ├── _kernel_pcovr.py │ ├── _pcov.py │ ├── _pcovc.py │ └── _pcovr.py │ ├── feature_selection │ ├── __init__.py │ └── _base.py │ ├── linear_model │ ├── __init__.py │ ├── _base.py │ └── _ridge.py │ ├── metrics │ ├── __init__.py │ ├── _pairwise.py │ ├── _prediction_rigidities.py │ └── _reconstruction_measures.py │ ├── model_selection │ ├── __init__.py │ └── _split.py │ ├── neighbors │ ├── __init__.py │ └── _sparsekde.py │ ├── preprocessing │ ├── __init__.py │ └── _data.py │ ├── sample_selection │ ├── __init__.py │ ├── _base.py │ └── _voronoi_fps.py │ └── utils │ ├── __init__.py │ ├── _orthogonalizers.py │ ├── _pcovc_utils.py │ ├── _pcovr_utils.py │ ├── _progress_bar.py │ └── _sparsekde.py ├── tests ├── .gitignore ├── test_check_estimators.py ├── test_clustering.py ├── test_datasets.py ├── test_dch.py ├── test_feature_pcov_cur.py ├── test_feature_pcov_fps.py ├── test_feature_simple_cur.py ├── test_feature_simple_fps.py ├── test_greedy_selector.py ├── test_kernel_normalizer.py ├── test_kernel_pcovr.py ├── test_linear_model.py ├── test_metrics.py ├── test_model_selection.py ├── test_neighbors.py ├── test_orthogonalizers.py ├── test_pcovc.py ├── test_pcovr.py ├── test_pcovr_distances.py ├── test_progress_bar.py ├── test_sample_pcov_cur.py ├── test_sample_pcov_fps.py ├── test_sample_simple_cur.py ├── test_sample_simple_fps.py ├── test_sparse_kernel_centerer.py ├── test_standard_flexible_scaler.py └── test_voronoi_fps.py └── tox.ini /.codecov.yml: -------------------------------------------------------------------------------- 1 | coverage: 2 | ignore: 3 | - tests/.* 4 | 
status: 5 | project: 6 | default: 7 | target: 95% 8 | patch: 9 | default: 10 | target: 95% 11 | 12 | comment: false 13 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Contributor (creator of PR) checklist 5 | ------------------------------------- 6 | - [ ] Tests updated (for new features and bugfixes)? 7 | - [ ] Documentation updated (for new features)? 8 | - [ ] Issue referenced (for PRs that solve an issue)? 9 | 10 | For Reviewer 11 | ------------ 12 | - [ ] CHANGELOG updated if important change? 13 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "github-actions" 4 | directory: "/" 5 | schedule: 6 | interval: monthly 7 | open-pull-requests-limit: 1 8 | groups: 9 | action-dependencies: 10 | patterns: 11 | - "*" # A wildcard to create one PR for all dependencies in the ecosystem 12 | -------------------------------------------------------------------------------- /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | # This workflow builds and checks the package for release 2 | name: Build 3 | 4 | on: 5 | pull_request: 6 | branches: [main] 7 | 8 | jobs: 9 | build: 10 | runs-on: ubuntu-latest 11 | 12 | steps: 13 | - uses: actions/checkout@v4 14 | 15 | - name: Set up Python 16 | uses: actions/setup-python@v5 17 | with: 18 | python-version: "3.13" 19 | 20 | - name: install tests dependencies 21 | run: python -m pip install tox 22 | 23 | - name: Test build integrity 24 | run: tox -e build 25 | -------------------------------------------------------------------------------- /.github/workflows/docs.yml: -------------------------------------------------------------------------------- 1 | name: Documentation 2 | 3 | on: 4 | push: 5 | branches: [main] 6 | tags: ["*"] 7 | pull_request: 8 | # Check all PR 9 | 10 | jobs: 11 | build: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - uses: actions/checkout@v4 15 | 16 | - name: setup Python 17 | uses: actions/setup-python@v5 18 | with: 19 | python-version: "3.13" 20 | 21 | - name: install tests dependencies 22 | run: python -m pip install tox 23 | 24 | - name: build documentation 25 | run: tox -e docs 26 | -------------------------------------------------------------------------------- /.github/workflows/documentation-links.yml: -------------------------------------------------------------------------------- 1 | name: readthedocs/actions 2 | 3 | on: 4 | pull_request_target: 5 | types: 6 | - opened 7 | 8 | permissions: 9 | pull-requests: write 10 | 11 | jobs: 12 | documentation-links: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - uses: readthedocs/actions/preview@v1 16 | with: 17 | project-slug: scikit-matter 18 | -------------------------------------------------------------------------------- /.github/workflows/lint.yml: -------------------------------------------------------------------------------- 1 | name: Lint 2 | 3 | on: 4 | pull_request: 5 | branches: [main] 6 | 7 | jobs: 8 | lint: 9 | runs-on: ubuntu-latest 10 | 11 | steps: 12 | - uses: actions/checkout@v4 13 | 14 | - name: Set up Python 15 | uses: actions/setup-python@v5 16 | with: 17 | python-version: "3.13" 18 | 19 | - name: install tests dependencies 20 | run: python -m pip install 
tox 21 | 22 | - name: Lint the code 23 | run: tox -e lint 24 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Release 2 | 3 | on: 4 | push: 5 | tags: ["*"] 6 | 7 | jobs: 8 | build: 9 | name: Build distribution 10 | runs-on: ubuntu-latest 11 | environment: 12 | name: pypi 13 | url: https://pypi.org/project/skmatter 14 | permissions: 15 | id-token: write 16 | contents: write 17 | 18 | steps: 19 | - uses: actions/checkout@v4 20 | with: 21 | fetch-depth: 0 22 | - name: setup Python 23 | uses: actions/setup-python@v5 24 | with: 25 | python-version: "3.13" 26 | - run: python -m pip install tox 27 | - name: Build package 28 | run: tox -e build 29 | - name: Publish distribution to PyPI 30 | if: startsWith(github.ref, 'refs/tags/v') 31 | uses: pypa/gh-action-pypi-publish@release/v1 32 | - name: Publish to GitHub release 33 | if: startsWith(github.ref, 'refs/tags/v') 34 | uses: softprops/action-gh-release@v2 35 | with: 36 | files: | 37 | dist/*.tar.gz 38 | dist/*.whl 39 | prerelease: ${{ contains(github.ref, '-rc') }} 40 | env: 41 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 42 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: 4 | push: 5 | branches: [main] 6 | pull_request: 7 | # Check all PR 8 | 9 | jobs: 10 | tests: 11 | runs-on: ${{ matrix.os }} 12 | strategy: 13 | matrix: 14 | os: [ubuntu-latest, macos-latest, windows-latest] 15 | python-version: ["3.10", "3.13"] 16 | 17 | steps: 18 | - uses: actions/checkout@v4 19 | 20 | - name: Set up Python ${{ matrix.python-version }} 21 | uses: actions/setup-python@v5 22 | with: 23 | python-version: ${{ matrix.python-version }} 24 | 25 | - name: install tests dependencies 26 | run: python -m pip install tox coverage[toml] 27 | 28 | - name: run Python tests 29 | run: | 30 | tox -e tests 31 | coverage xml 32 | 33 | - name: upload to codecov.io 34 | uses: codecov/codecov-action@v5 35 | with: 36 | fail_ci_if_error: true 37 | files: ./tests/coverage.xml 38 | token: ${{ secrets.CODECOV_TOKEN }} 39 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.coverage* 2 | *.pyc 3 | *.ipynb_checkpoints* 4 | __pycache__ 5 | *.egg-info 6 | *.swp 7 | *.swo 8 | *DS_Store 9 | 10 | .tox/ 11 | build/ 12 | dist/ 13 | docs/src/examples 14 | sg_execution_times.rst 15 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yaml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Set the version of Python and other tools we need 9 | build: 10 | os: ubuntu-lts-latest 11 | tools: 12 | python: "3.13" 13 | 14 | # Build documentation in the docs/ directory with Sphinx 15 | sphinx: 16 | configuration: docs/src/conf.py 17 | 18 | # Optionally build your docs in additional formats such as PDF 19 | formats: 20 | - pdf 21 | 22 | python: 23 | install: 24 | - requirements: docs/requirements.txt 25 | - method: pip 26 | path: . 
27 |     extra_requirements:
28 |       # The documentation runs "examples" to produce outputs via sphinx-gallery.
29 |       - examples
30 | 
-------------------------------------------------------------------------------- /CHANGELOG:
1 | CHANGELOG file
2 | --------------
3 | 
4 | The rules for the CHANGELOG file:
5 | 
6 | - entries are sorted newest-first
7 | - summarize sets of changes (do not reproduce every git log comment here)
8 | - do not ever delete anything
9 | - keep the format consistent (88 char width, Y/M/D date format) and do not use tabs but
10 |   spaces for formatting
11 | 
12 | .. inclusion-marker-changelog-start
13 | 
14 | 0.3.0 (XXXX/XX/XX)
15 | ------------------
16 | - Add ``_BasePCov`` class (#248)
17 | - Add ``PCovC`` class that inherits shared functionality from ``_BasePCov`` (#248)
18 | - Add ``PCovC`` testing suite and examples (#248)
19 | - Modify ``PCovR`` to inherit shared functionality from ``_BasePCov`` (#248)
20 | - Update to sklearn >= 1.6.0 and scipy >= 1.15.0 (#239)
21 | - Fix a moved function import from scipy and bump the scipy dependency to 1.15.0 (#236)
22 | - Fix rendering issues for `SparseKDE` and `QuickShift` (#236)
23 | - Update ``FPS`` to allow a numpy array of ints as an initialize parameter (#145)
24 | - Supported Python versions now range from 3.9 to 3.12
25 | - Update ``skmatter.datasets`` submodule to support sklearn 1.5.0 (#229)
26 | - Add `SparseKDE` class (#222)
27 | - Add `QuickShift` class (#222)
28 | - Add an example on how to conduct the PAMM algorithm with `SparseKDE` and `QuickShift`
29 |   (#222)
30 | - Add H2O-BLYP-Piglet dataset (#222)
31 | - Add two distance metrics that support the periodic boundary condition,
32 |   `periodic_pairwise_euclidean_distances` and `pairwise_mahalanobis_distances` (#222)
33 | 
34 | 0.2.0 (2023/08/24)
35 | ------------------
36 | - Add this ``CHANGELOG`` file (#198)
37 | - Update example of WHO feature selection (#212)
38 | - Rename ``RidgeRegression2FoldCV`` -> ``Ridge2FoldCV`` (#211)
39 | - Add metrics for prediction rigidity (#209)
40 | - Overhaul of documentation page (#200 to #204)
41 | - Rename and add member variables (#197)
42 | - Fix/check estimator (#196)
43 | - Fix small typo in ``PCovR`` class documentation (#194)
44 | - Resolve issue of missing function call section in WHO dataset docs (#181, #192)
45 | - Speed up tests (#190)
46 | - Remove kernel optimization from WHO example (#189)
47 | - Ignore rendered examples for linting (#188)
48 | - Add more info on documentation landing pages & ``CODE_OF_CONDUCT`` (#186)
49 | - Add contributors pictures to ``README``, show pip install instructions in docs (#185)
50 | - Add linting and tests for docstring and documentation code (#184)
51 | - Restructure requirements (#171, #183)
52 | - Update ``README.md`` to show banners (#176)
53 | - Modernize package infrastructure (#172)
54 | - Add an example of GCH for molecular materials (#171)
55 | - Port examples to ``sphinx_gallery`` (#170)
56 | 
57 | 0.1.4 (2023/03/14)
58 | ------------------
59 | - documentation formatting fixes for math and datasets (#161, #163)
60 | - changing the way the distance to the convex hull is computed in the
61 |   ``DirectionalConvexHull`` due to numerical issues with the old method (#165)
62 | 
63 | 0.1.3 (2023/03/02)
64 | ------------------
65 | - Refactor ``scikit-cosmo`` to ``scikit-matter`` (#157, #151)
66 | - Deprecation warning was added to link to renamed package (#154)
67 | - dropped Python `<3.8` support, because we are now using ``scikit-learn`` version
68 |   `>=1.1.0` (#139, #146, #152)
69 | - WHO dataset and examples were added (#149)
70 | - nice dataset was added (#143)
71 | - overhaul of documentation (#142, #150)
72 | - added ``DirectionalConvexHull`` class (#140)
73 | - added test_precomputed_regression function to ``KPCovR`` (#136)
74 | - other bugfixes (#141, #148)
75 | 
76 | 
77 | 0.1.2 (2022/07/04)
78 | ------------------
79 | - fixed a bug in the orthonormalization step of ``PCov-CUR`` (#118)
80 | - users can now initialize ``FPS`` with a list of selected points, allowing them to
81 |   restart the selection from the middle (#116)
82 | - KPCovR is now able to use a pre-fitted regressor in the same way that ``PCovR`` can
83 |   (#113)
84 | 
85 | 0.1.1 (2021/11/30)
86 | ------------------
87 | - fixed a bug in the ``orthonormalization`` step of ``PCov-CUR`` (#118)
88 | - users can now initialize ``FPS`` with a list of selected points, allowing them to
89 |   restart the selection from the middle (#116)
90 | - KPCovR is now able to use a pre-fitted regressor in the same way that ``PCovR`` can (#113)
91 | 
92 | 0.1.0 (2021/05/12)
93 | ------------------
94 | - first release out of the lab
95 | 
96 | .. inclusion-marker-changelog-end
97 | 
-------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md:
1 | # Contributor Covenant Code of Conduct
2 | 
3 | ## Our Pledge
4 | 
5 | We as members, contributors, and leaders pledge to make participation in our
6 | community a harassment-free experience for everyone, regardless of age, body
7 | size, visible or invisible disability, ethnicity, sex characteristics, gender
8 | identity and expression, level of experience, education, socio-economic status,
9 | nationality, personal appearance, race, religion, or sexual identity
10 | and orientation.
11 | 
12 | We pledge to act and interact in ways that contribute to an open, welcoming,
13 | diverse, inclusive, and healthy community.
14 | 
15 | ## Our Standards
16 | 
17 | Examples of behavior that contributes to a positive environment for our
18 | community include:
19 | 
20 | * Demonstrating empathy and kindness toward other people
21 | * Being respectful of differing opinions, viewpoints, and experiences
22 | * Giving and gracefully accepting constructive feedback
23 | * Accepting responsibility and apologizing to those affected by our mistakes,
24 |   and learning from the experience
25 | * Focusing on what is best not just for us as individuals, but for the
26 |   overall community
27 | 
28 | Examples of unacceptable behavior include:
29 | 
30 | * The use of sexualized language or imagery, and sexual attention or
31 |   advances of any kind
32 | * Trolling, insulting or derogatory comments, and personal or political attacks
33 | * Public or private harassment
34 | * Publishing others' private information, such as a physical or email
35 |   address, without their explicit permission
36 | * Other conduct which could reasonably be considered inappropriate in a
37 |   professional setting
38 | 
39 | ## Enforcement Responsibilities
40 | 
41 | Community leaders are responsible for clarifying and enforcing our standards of
42 | acceptable behavior and will take appropriate and fair corrective action in
43 | response to any behavior that they deem inappropriate, threatening, offensive,
44 | or harmful.
45 | 46 | Community leaders have the right and responsibility to remove, edit, or reject 47 | comments, commits, code, wiki edits, issues, and other contributions that are 48 | not aligned to this Code of Conduct, and will communicate reasons for moderation 49 | decisions when appropriate. 50 | 51 | ## Scope 52 | 53 | This Code of Conduct applies within all community spaces, and also applies when 54 | an individual is officially representing the community in public spaces. 55 | Examples of representing our community include using an official e-mail address, 56 | posting via an official social media account, or acting as an appointed 57 | representative at an online or offline event. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported to the community leaders responsible for enforcement at 63 | philip.loche@epfl.ch. 64 | All complaints will be reviewed and investigated promptly and fairly. 65 | 66 | All community leaders are obligated to respect the privacy and security of the 67 | reporter of any incident. 68 | 69 | ## Enforcement Guidelines 70 | 71 | Community leaders will follow these Community Impact Guidelines in determining 72 | the consequences for any action they deem in violation of this Code of Conduct: 73 | 74 | ### 1. Correction 75 | 76 | **Community Impact**: Use of inappropriate language or other behavior deemed 77 | unprofessional or unwelcome in the community. 78 | 79 | **Consequence**: A private, written warning from community leaders, providing 80 | clarity around the nature of the violation and an explanation of why the 81 | behavior was inappropriate. A public apology may be requested. 82 | 83 | ### 2. Warning 84 | 85 | **Community Impact**: A violation through a single incident or series 86 | of actions. 87 | 88 | **Consequence**: A warning with consequences for continued behavior. No 89 | interaction with the people involved, including unsolicited interaction with 90 | those enforcing the Code of Conduct, for a specified period of time. This 91 | includes avoiding interactions in community spaces as well as external channels 92 | like social media. Violating these terms may lead to a temporary or 93 | permanent ban. 94 | 95 | ### 3. Temporary Ban 96 | 97 | **Community Impact**: A serious violation of community standards, including 98 | sustained inappropriate behavior. 99 | 100 | **Consequence**: A temporary ban from any sort of interaction or public 101 | communication with the community for a specified period of time. No public or 102 | private interaction with the people involved, including unsolicited interaction 103 | with those enforcing the Code of Conduct, is allowed during this period. 104 | Violating these terms may lead to a permanent ban. 105 | 106 | ### 4. Permanent Ban 107 | 108 | **Community Impact**: Demonstrating a pattern of violation of community 109 | standards, including sustained inappropriate behavior, harassment of an 110 | individual, or aggression toward or disparagement of classes of individuals. 111 | 112 | **Consequence**: A permanent ban from any sort of public interaction within 113 | the community. 114 | 115 | ## Attribution 116 | 117 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 118 | version 2.0, available at 119 | https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. 120 | 121 | Community Impact Guidelines were inspired by [Mozilla's code of conduct 122 | enforcement ladder](https://github.com/mozilla/diversity). 
123 | 124 | [homepage]: https://www.contributor-covenant.org 125 | 126 | For answers to common questions about this code of conduct, see the FAQ at 127 | https://www.contributor-covenant.org/faq. Translations are available at 128 | https://www.contributor-covenant.org/translations. 129 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2020 the sklearn-matter contributors 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | graft src 2 | 3 | include LICENSE 4 | include README.rst 5 | 6 | prune docs 7 | prune examples 8 | prune tests 9 | prune .github 10 | prune .tox 11 | 12 | exclude CHANGELOG 13 | exclude CODE_OF_CONDUCT.md 14 | exclude .gitignore 15 | exclude .codecov.yml 16 | exclude .readthedocs.yaml 17 | exclude tox.ini 18 | 19 | global-exclude *.py[cod] __pycache__/* *.so *.dylib 20 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | scikit-matter 2 | ============= 3 | |tests| |codecov| |pypi| |conda| |docs| |doi| 4 | 5 | A collection of ``scikit-learn`` compatible utilities that implement methods born out of 6 | the materials science and chemistry communities. 7 | 8 | For details, tutorials, and examples, please have a look at our `documentation`_. 9 | 10 | .. _`documentation`: https://scikit-matter.readthedocs.io 11 | 12 | .. marker-installation 13 | 14 | Installation 15 | ------------ 16 | You can install *scikit-matter* either via pip using 17 | 18 | .. code-block:: bash 19 | 20 | pip install skmatter 21 | 22 | or conda 23 | 24 | .. 
code-block:: bash
25 | 
26 |     conda install -c conda-forge skmatter
27 | 
28 | You can then ``import skmatter`` and use scikit-matter in your projects!
29 | 
30 | .. marker-ci-tests
31 | 
32 | Tests
33 | -----
34 | We are testing our code for Python 3.10 and 3.13 on the latest versions of Ubuntu,
35 | macOS and Windows.
36 | 
37 | .. marker-issues
38 | 
39 | Having problems or ideas?
40 | -------------------------
41 | Having a problem with scikit-matter? Please let us know by `submitting an issue
42 | <https://github.com/scikit-learn-contrib/scikit-matter/issues>`_.
43 | 
44 | Submit new features or bug fixes through a `pull request
45 | <https://github.com/scikit-learn-contrib/scikit-matter/pulls>`_.
46 | 
47 | .. marker-contributing
48 | 
49 | Call for Contributions
50 | ----------------------
51 | We always welcome new contributors. If you want to help us, take a look at our
52 | `contribution guidelines`_; afterwards, you may start with an open issue marked as
53 | `good first issue`_.
54 | 
55 | Writing code is not the only way to contribute to the project. You can also:
56 | 
57 | * review `pull requests`_
58 | * help us stay on top of new and old `issues`_
59 | * develop `examples and tutorials`_
60 | * maintain and `improve our documentation`_
61 | * contribute `new datasets`_
62 | 
63 | .. _`contribution guidelines`: https://scikit-matter.readthedocs.io/en/latest/contributing.html
64 | .. _`good first issue`: https://github.com/scikit-learn-contrib/scikit-matter/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22
65 | .. _`pull requests`: https://github.com/scikit-learn-contrib/scikit-matter/pulls
66 | .. _`issues`: https://github.com/scikit-learn-contrib/scikit-matter/issues
67 | .. _`improve our documentation`: https://scikit-matter.readthedocs.io/en/latest/contributing.html#contributing-to-the-documentation
68 | .. _`examples and tutorials`: https://scikit-matter.readthedocs.io/en/latest/contributing.html#contributing-new-examples
69 | .. _`new datasets`: https://scikit-matter.readthedocs.io/en/latest/contributing.html#contributing-datasets
70 | 
71 | .. marker-citing
72 | 
73 | Citing scikit-matter
74 | --------------------
75 | If you use *scikit-matter* for your work, please cite:
76 | 
77 | Goscinski A, Principe VP, Fraux G et al. scikit-matter:
78 | A Suite of Generalisable Machine Learning Methods Born out of Chemistry
79 | and Materials Science. Open Res Europe 2023, 3:81.
80 | `10.12688/openreseurope.15789.2`_
81 | 
82 | .. _`10.12688/openreseurope.15789.2`: https://doi.org/10.12688/openreseurope.15789.2
83 | 
84 | .. marker-contributors
85 | 
86 | Contributors
87 | ------------
88 | Thanks go to all the people who make scikit-matter possible:
89 | 
90 | .. image:: https://contrib.rocks/image?repo=scikit-learn-contrib/scikit-matter
91 |    :target: https://github.com/scikit-learn-contrib/scikit-matter/graphs/contributors
92 | 
93 | .. |tests| image:: https://github.com/scikit-learn-contrib/scikit-matter/workflows/Tests/badge.svg
94 |    :alt: GitHub Actions Tests Job Status
95 |    :target: action_
96 | 
97 | .. |codecov| image:: https://codecov.io/gh/scikit-learn-contrib/scikit-matter/branch/main/graph/badge.svg?token=UZJPJG34SM
98 |    :alt: Code coverage
99 |    :target: https://codecov.io/gh/scikit-learn-contrib/scikit-matter/
100 | 
101 | .. |docs| image:: https://img.shields.io/badge/documentation-latest-success
102 |    :alt: Documentation
103 |    :target: documentation_
104 | 
105 | .. |pypi| image:: https://img.shields.io/pypi/v/skmatter.svg
106 |    :alt: Latest PyPI version
107 |    :target: https://pypi.org/project/skmatter
108 | 
109 | .. |conda| image:: https://anaconda.org/conda-forge/skmatter/badges/version.svg
110 |    :alt: Latest conda version
111 |    :target: https://anaconda.org/conda-forge/skmatter
112 | 
113 | .. |doi| image:: https://img.shields.io/badge/DOI-10.12688-blue
114 |    :alt: ORE Paper
115 |    :target: `10.12688/openreseurope.15789.2`_
116 | 
117 | .. _`action`: https://github.com/scikit-learn-contrib/scikit-matter/actions?query=branch%3Amain
-------------------------------------------------------------------------------- /docs/requirements.txt:
1 | sphinx
2 | sphinx-gallery
3 | sphinx-toggleprompt
4 | pydata-sphinx-theme
5 | tomli
-------------------------------------------------------------------------------- /docs/src/bibliography.rst:
1 | References
2 | ############
3 | 
4 | .. [deJong1992]
5 |     S. de Jong, H.A.L. Kiers,
6 |     "Principal covariates regression: Part I. Theory", Chemom. Intell. Lab. Syst. 14
7 |     (1992) 155-164. https://doi.org/10.1016/0169-7439(92)80100-I
8 | 
9 | .. [Gasparotto2014]
10 |     Piero Gasparotto, Michele Ceriotti,
11 |     "Recognizing molecular patterns by machine learning: An agnostic structural
12 |     definition of the hydrogen bond", J. Chem. Phys., 141 (17): 174110.
13 |     https://doi.org/10.1063/1.4900655.
14 | 
15 | .. [Imbalzano2018]
16 |     Giulio Imbalzano, Andrea Anelli, Daniele Giofré, Sinja Klees, Jörg Behler, and
17 |     Michele Ceriotti, "Automatic selection of atomic fingerprints and reference
18 |     configurations for machine-learning potentials." The Journal of Chemical Physics
19 |     148, 24 (2018): 241730. https://aip.scitation.org/doi/10.1063/1.5024611.
20 | 
21 | .. [Ceriotti2019]
22 |     Michele Ceriotti, Lyndon Emsley, Federico Paruzzo, Albert Hofstetter, Félix Musil,
23 |     Sandip De, Edgar A. Engel, and Andrea Anelli. "Chemical Shifts in Molecular Solids
24 |     by Machine Learning Datasets", Materials Cloud Archive 2019.0023/v2 (2019).
25 |     https://doi.org/10.24435/materialscloud:2019.0023/v2.
26 | 
27 | .. [Helfrecht2020]
28 |     Benjamin A. Helfrecht, Rose K. Cersonsky, Guillaume Fraux, and Michele Ceriotti,
29 |     "Structure-property maps with Kernel principal covariates regression." 2020 Mach.
30 |     Learn.: Sci. Technol. 1 045021.
31 |     https://iopscience.iop.org/article/10.1088/2632-2153/aba9ef.
32 | 
33 | .. [Pozdnyakov2020]
34 |     Pozdnyakov, S. N., Willatt, M. J., Bartók, A. P., Ortner, C., Csányi, G., &
35 |     Ceriotti, M. (2020). "Incompleteness of Atomic Structure Representations." Physical
36 |     Review Letters, 125(16). https://doi.org/10.1103/physrevlett.125.166001
37 | 
38 | .. [Goscinski2021]
39 |     Alexander Goscinski, Guillaume Fraux, Giulio Imbalzano, and Michele Ceriotti, "The
40 |     role of feature space in atomistic learning." 2021 Mach. Learn.: Sci. Technol. 2
41 |     025028. https://iopscience.iop.org/article/10.1088/2632-2153/abdaf7.
42 | 
43 | .. [Cersonsky2021]
44 |     Rose K. Cersonsky, Benjamin A. Helfrecht, Edgar A. Engel, Sergei Kliavinek, and
45 |     Michele Ceriotti, "Improving Sample and Feature Selection with Principal Covariates
46 |     Regression." 2021 Mach. Learn.: Sci. Technol. 2 035038.
47 |     https://iopscience.iop.org/article/10.1088/2632-2153/abfe7c.
48 | 
49 | .. [Jorgensen2025]
50 |     Christian Jorgensen, Arthur Y. Lin, Rhushil Vasavada, and Rose K. Cersonsky,
51 |     "Interpretable Visualizations of Data Spaces for Classification Problems",
52 |     2025. arXiv:2503.05861.
53 |     https://doi.org/10.48550/arXiv.2503.05861.
54 | 
-------------------------------------------------------------------------------- /docs/src/changelog.rst:
1 | Changelog
2 | =========
3 | 
4 | .. include:: ../../CHANGELOG
5 |    :start-after: inclusion-marker-changelog-start
6 |    :end-before: inclusion-marker-changelog-end
-------------------------------------------------------------------------------- /docs/src/getting-started.rst:
1 | Getting started
2 | ===============
3 | 
4 | This guide illustrates the main functionalities that ``scikit-matter`` provides. It
5 | assumes a very basic working knowledge of how ``scikit-learn`` works. Please refer to
6 | our :ref:`installation` instructions for installing ``scikit-matter``.
7 | 
8 | For a detailed explanation of the functionalities, please look at the
9 | :ref:`selection-api`.
10 | 
11 | .. _getting_started-selection:
12 | 
13 | Feature and Sample Selection
14 | ----------------------------
15 | 
16 | .. automodule:: skmatter._selection
17 |    :noindex:
18 | 
19 | Notebook Examples
20 | ^^^^^^^^^^^^^^^^^
21 | 
22 | .. include:: examples/selection/index.rst
23 |    :start-line: 4
24 | 
25 | 
26 | .. _getting_started-reconstruction:
27 | 
28 | Metrics
29 | -------
30 | 
31 | .. automodule:: skmatter.metrics
32 |    :noindex:
33 | 
34 | Notebook Examples
35 | ^^^^^^^^^^^^^^^^^
36 | 
37 | .. include:: examples/reconstruction/index.rst
38 |    :start-line: 4
39 | 
40 | .. _getting_started-hybrid:
41 | 
42 | Hybrid Mapping Techniques
43 | -------------------------
44 | 
45 | .. automodule:: skmatter.decomposition
46 |    :noindex:
47 | 
48 | Notebook Examples
49 | ^^^^^^^^^^^^^^^^^
50 | 
51 | .. include:: examples/pcovr/index.rst
52 |    :start-line: 4
53 | .. include:: examples/pcovc/index.rst
54 |    :start-line: 4
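
Minimal Example
^^^^^^^^^^^^^^^

As a quick, self-contained sketch of the hybrid mappers (the dataset, scaling, and
hyperparameters here are purely illustrative, not a recommendation):

.. code-block:: python

    from sklearn.datasets import load_diabetes
    from sklearn.preprocessing import StandardScaler

    from skmatter.decomposition import PCovR

    X, y = load_diabetes(return_X_y=True)
    X = StandardScaler().fit_transform(X)
    y = (y - y.mean()) / y.std()

    # mixing interpolates between a PCA-like loss (1.0) and a regression-like loss (0.0)
    pcovr = PCovR(mixing=0.5, n_components=2)
    pcovr.fit(X, y)

    T = pcovr.transform(X)  # latent-space projection of the samples
    y_pred = pcovr.predict(X)  # regression prediction from the same model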
-------------------------------------------------------------------------------- /docs/src/index.rst:
1 | .. automodule:: skmatter
2 | 
3 | .. raw:: html
4 | 
5 |     <!-- opening markup of the HTML card grid -->
12 | 
13 | .. only:: html
14 | 
15 |   :ref:`getting_started-selection`
16 | 
17 |   .. image:: /examples/selection/images/thumb/sphx_glr_FeatureSelection-WHODataset_thumb.png
18 |     :alt:
19 | 
20 | .. raw:: html
21 | 
22 |     <!-- card-body markup -->
23 |     Supervised and unsupervised selection methods based on
24 |     CUR matrix decomposition and Farthest Point Sampling.
26 |     <!-- closing markup of the first card and opening markup of the second -->
33 | 
34 | .. only:: html
35 | 
36 |   :ref:`getting_started-hybrid`
37 | 
38 |   .. image:: /examples/pcovr/images/thumb/sphx_glr_PCovR_thumb.png
39 |     :alt:
40 | 
41 | .. raw:: html
42 | 
43 |     <!-- card-body markup -->
44 |     PCovR and PCovC combine a PCA-like and an LR-like loss to determine
45 |     the decomposition matrix that projects features into the latent space.
46 |     <!-- closing markup of the second card and opening markup of the third -->
53 | 
54 | .. only:: html
55 | 
56 |   :ref:`getting_started-reconstruction`
57 | 
58 |   .. image:: /examples/reconstruction/images/thumb/sphx_glr_PlotLFRE_thumb.png
59 |     :alt:
60 | 
61 | .. raw:: html
62 | 
63 |     <!-- card-body markup -->
64 |     Error measures for quantifying the linearly
65 |     decodable information capacity between features.
66 |     <!-- closing markup of the card grid -->
72 | 73 | .. include:: ../../README.rst 74 | :start-after: marker-issues 75 | :end-before: marker-contributing 76 | 77 | .. include:: ../../README.rst 78 | :start-after: marker-citing 79 | :end-before: marker-contributors 80 | 81 | If you would like to contribute to scikit-matter, check out our :ref:`contributing` 82 | page! 83 | 84 | .. toctree:: 85 | :hidden: 86 | 87 | getting-started 88 | installation 89 | references/index 90 | tutorials 91 | contributing 92 | changelog 93 | bibliography 94 | -------------------------------------------------------------------------------- /docs/src/installation.rst: -------------------------------------------------------------------------------- 1 | .. _installation: 2 | 3 | .. include:: ../../README.rst 4 | :start-after: marker-installation 5 | :end-before: marker-ci-tests 6 | 7 | Install from source 8 | ------------------- 9 | 10 | For development purposes you should clone the repository and install the current 11 | development version from the source code 12 | 13 | .. code-block:: bash 14 | 15 | git clone https://github.com/lab-cosmo/scikit-matter 16 | cd scikit-matter 17 | pip install . 18 | 19 | Alternatively, if you don't have special privileges, install 20 | the package using the ``--user`` flag: 21 | 22 | .. code-block:: bash 23 | 24 | pip install . --user 25 | 26 | You're ready to import skmatter from your code! Have a look at the :ref:`api-reference` 27 | for how to use the code. 28 | -------------------------------------------------------------------------------- /docs/src/references/VoronoiFPS-Schematic.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/scikit-matter/77fb4eac5ba9ebe7c1c17bf6b82d41c02dd57787/docs/src/references/VoronoiFPS-Schematic.pdf -------------------------------------------------------------------------------- /docs/src/references/clustering.rst: -------------------------------------------------------------------------------- 1 | Clustering 2 | ========== 3 | 4 | .. automodule:: skmatter.clustering 5 | 6 | .. _quick-shift-api: 7 | 8 | Quick Shift 9 | ------------ 10 | 11 | .. autoclass:: skmatter.clustering.QuickShift 12 | -------------------------------------------------------------------------------- /docs/src/references/datasets.rst: -------------------------------------------------------------------------------- 1 | Datasets 2 | ======== 3 | 4 | .. include:: ../../../src/skmatter/datasets/descr/csd-1000r.rst 5 | 6 | .. include:: ../../../src/skmatter/datasets/descr/degenerate_CH4_manifold.rst 7 | 8 | .. include:: ../../../src/skmatter/datasets/descr/h2o-blyp-piglet.rst 9 | 10 | .. include:: ../../../src/skmatter/datasets/descr/nice_dataset.rst 11 | 12 | .. include:: ../../../src/skmatter/datasets/descr/who_dataset.rst 13 | -------------------------------------------------------------------------------- /docs/src/references/decomposition.rst: -------------------------------------------------------------------------------- 1 | Hybrid Mapping Techniques 2 | ========================= 3 | 4 | .. _PCovR-api: 5 | 6 | PCovR 7 | ----- 8 | 9 | .. autoclass:: skmatter.decomposition.PCovR 10 | :show-inheritance: 11 | :special-members: 12 | 13 | .. automethod:: fit 14 | 15 | .. automethod:: _fit_feature_space 16 | .. automethod:: _fit_sample_space 17 | 18 | .. automethod:: transform 19 | .. automethod:: predict 20 | .. automethod:: inverse_transform 21 | .. automethod:: score 22 | 23 | .. _PCovC-api: 24 | 25 | PCovC 26 | ----- 27 | 28 | .. 
autoclass:: skmatter.decomposition.PCovC 29 | :show-inheritance: 30 | :special-members: 31 | 32 | .. automethod:: fit 33 | 34 | .. automethod:: _fit_feature_space 35 | .. automethod:: _fit_sample_space 36 | 37 | .. automethod:: transform 38 | .. automethod:: predict 39 | .. automethod:: inverse_transform 40 | .. automethod:: decision_function 41 | .. automethod:: score 42 | 43 | .. _KPCovR-api: 44 | 45 | Kernel PCovR 46 | ------------ 47 | 48 | .. autoclass:: skmatter.decomposition.KernelPCovR 49 | :show-inheritance: 50 | :special-members: 51 | 52 | .. automethod:: fit 53 | .. automethod:: transform 54 | .. automethod:: predict 55 | .. automethod:: inverse_transform 56 | .. automethod:: score 57 | -------------------------------------------------------------------------------- /docs/src/references/index.rst: -------------------------------------------------------------------------------- 1 | .. _api-reference: 2 | 3 | API Reference 4 | ============= 5 | 6 | .. toctree:: 7 | :maxdepth: 1 8 | :caption: Contents: 9 | 10 | preprocessing 11 | selection 12 | linear_models 13 | clustering 14 | decomposition 15 | metrics 16 | neighbors 17 | datasets 18 | utils 19 | -------------------------------------------------------------------------------- /docs/src/references/linear_models.rst: -------------------------------------------------------------------------------- 1 | Linear Models 2 | ============= 3 | 4 | Orthogonal Regression 5 | --------------------- 6 | 7 | .. autoclass:: skmatter.linear_model.OrthogonalRegression 8 | 9 | Ridge Regression with Two-fold Cross Validation 10 | ----------------------------------------------- 11 | 12 | .. autoclass:: skmatter.linear_model.Ridge2FoldCV 13 | 14 | PCovR 15 | ----- 16 | 17 | Principal Covariates Regression is a linear model, see :ref:`PCovR-api`. 18 | -------------------------------------------------------------------------------- /docs/src/references/metrics.rst: -------------------------------------------------------------------------------- 1 | Metrics 2 | ======= 3 | 4 | .. automodule:: skmatter.metrics 5 | 6 | .. _GRE-api: 7 | 8 | Global Reconstruction Error 9 | --------------------------- 10 | 11 | .. autofunction:: skmatter.metrics.pointwise_global_reconstruction_error 12 | .. autofunction:: skmatter.metrics.global_reconstruction_error 13 | 14 | .. _GRD-api: 15 | 16 | Global Reconstruction Distortion 17 | -------------------------------- 18 | 19 | .. autofunction:: skmatter.metrics.pointwise_global_reconstruction_distortion 20 | .. autofunction:: skmatter.metrics.global_reconstruction_distortion 21 | 22 | .. _LRE-api: 23 | 24 | Local Reconstruction Error 25 | -------------------------- 26 | 27 | .. autofunction:: skmatter.metrics.pointwise_local_reconstruction_error 28 | .. autofunction:: skmatter.metrics.local_reconstruction_error 29 | 30 | .. _LPR-api: 31 | 32 | Local Prediction Rigidity 33 | ------------------------- 34 | 35 | .. autofunction:: skmatter.metrics.local_prediction_rigidity 36 | 37 | .. _CPR-api: 38 | 39 | Component-wise Prediction Rigidity 40 | ---------------------------------- 41 | 42 | .. autofunction:: skmatter.metrics.componentwise_prediction_rigidity 43 | 44 | 45 | .. _pairwise-euclidian-api: 46 | 47 | Pairwise Euclidean Distances 48 | ---------------------------- 49 | 50 | .. autofunction:: skmatter.metrics.periodic_pairwise_euclidean_distances 51 | 52 | .. _pairwise-mahalanobis-api: 53 | 54 | Pairwise Mahalanobis Distance 55 | ----------------------------- 56 | 57 | .. 
autofunction:: skmatter.metrics.pairwise_mahalanobis_distances
58 | 
-------------------------------------------------------------------------------- /docs/src/references/neighbors.rst:
1 | Neighbors
2 | =========
3 | 
4 | .. automodule:: skmatter.neighbors
5 | 
6 | .. _sparse-kde-api:
7 | 
8 | Sparse Kernel Density Estimation
9 | --------------------------------
10 | 
11 | .. autoclass:: skmatter.neighbors.SparseKDE
12 |    :show-inheritance:
13 | 
14 |    .. automethod:: fit
15 |    .. automethod:: score_samples
16 |    .. automethod:: score
-------------------------------------------------------------------------------- /docs/src/references/preprocessing.rst:
1 | Preprocessing
2 | =============
3 | 
4 | .. automodule:: skmatter.preprocessing
5 | 
6 | KernelNormalizer
7 | ----------------
8 | 
9 | .. autoclass:: skmatter.preprocessing.KernelNormalizer
10 |    :members:
11 |    :undoc-members:
12 |    :inherited-members:
13 | 
14 | 
15 | SparseKernelCenterer
16 | --------------------
17 | 
18 | .. autoclass:: skmatter.preprocessing.SparseKernelCenterer
19 |    :members:
20 |    :undoc-members:
21 |    :inherited-members:
22 | 
23 | StandardFlexibleScaler
24 | ----------------------
25 | 
26 | .. autoclass:: skmatter.preprocessing.StandardFlexibleScaler
27 |    :members:
28 |    :undoc-members:
29 |    :inherited-members:
-------------------------------------------------------------------------------- /docs/src/references/selection.rst:
1 | .. _selection-api:
2 | 
3 | Feature and Sample Selection
4 | ============================
5 | 
6 | .. automodule:: skmatter._selection
7 | 
8 | .. _CUR-api:
9 | 
10 | CUR
11 | ---
12 | 
13 | CUR decomposition begins by approximating a matrix :math:`{\mathbf{X}}` using a subset
14 | of columns and rows
15 | 
16 | .. math::
17 |     \mathbf{\hat{X}} \approx \mathbf{X}_\mathbf{c} \left(\mathbf{X}_\mathbf{c}^-
18 |     \mathbf{X} \mathbf{X}_\mathbf{r}^-\right) \mathbf{X}_\mathbf{r}.
19 | 
20 | These subsets of rows and columns, denoted :math:`\mathbf{X}_\mathbf{r}` and
21 | :math:`\mathbf{X}_\mathbf{c}`, respectively, can be determined by iterative maximization
22 | of a leverage score :math:`\pi`, representative of the relative importance of each
23 | column or row. From here on, we will call selection methods derived from the
24 | CUR decomposition "CUR" as a shorthand for "CUR-derived selection". In each iteration of
25 | CUR, we select the column or row that maximizes :math:`\pi` and orthogonalize the
26 | remaining columns or rows. These steps are iterated until a sufficient number of
27 | features has been selected. This iterative approach, albeit comparatively time
28 | consuming, is the most deterministic and efficient route to reducing the number of
29 | features needed to approximate :math:`\mathbf{X}`, compared to selecting all
30 | features in a single iteration based upon their relative :math:`\pi` importance.
31 | 
32 | The feature and sample selection versions of CUR differ only in the computation of
33 | :math:`\pi`. In sample selection, :math:`\pi` is computed from the left singular
34 | vectors, whereas in feature selection it is computed from the right singular
35 | vectors.
36 | 
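For illustration, a minimal sketch of the feature-selection variant (the random data
and the number of selections here are arbitrary):

.. code-block:: python

    import numpy as np

    from skmatter.feature_selection import CUR

    X = np.random.default_rng(0).normal(size=(100, 20))

    # iteratively pick the five highest-leverage columns, orthogonalizing in between
    selector = CUR(n_to_select=5)
    selector.fit(X)

    X_c = selector.transform(X)  # the selected columns of X, shape (100, 5)
    print(selector.selected_idx_)
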
37 | .. autoclass:: skmatter.feature_selection.CUR
38 |    :members:
39 |    :private-members: _compute_pi
40 |    :undoc-members:
41 |    :inherited-members:
42 | 
43 | .. autoclass:: skmatter.sample_selection.CUR
44 |    :members:
45 |    :private-members: _compute_pi
46 |    :undoc-members:
47 |    :inherited-members:
48 | 
49 | .. _PCov-CUR-api:
50 | 
51 | PCov-CUR
52 | --------
53 | 
54 | PCov-CUR extends upon CUR by using augmented right or left singular vectors inspired by
55 | Principal Covariates Regression, as demonstrated in [Cersonsky2021]_. These methods
56 | employ the modified kernel and covariance matrices introduced in :ref:`PCovR-api` and
57 | available via the Utility Classes.
58 | 
59 | Again, the feature and sample selection versions of PCov-CUR differ only in the
60 | computation of :math:`\pi`.
61 | 
62 | .. autoclass:: skmatter.feature_selection.PCovCUR
63 |    :members:
64 |    :private-members: _compute_pi
65 |    :undoc-members:
66 |    :inherited-members:
67 | 
68 | .. autoclass:: skmatter.sample_selection.PCovCUR
69 |    :members:
70 |    :private-members: _compute_pi
71 |    :undoc-members:
72 |    :inherited-members:
73 | 
74 | .. _FPS-api:
75 | 
76 | Farthest Point-Sampling (FPS)
77 | -----------------------------
78 | 
79 | Farthest Point Sampling is a common selection technique intended to exploit the
80 | diversity of the input space.
81 | 
82 | In FPS, the selection of the first point is made at random or by a separate metric. Each
83 | subsequent selection is made to maximize the Hausdorff distance, i.e., the minimum
84 | distance between a point and all previous selections. It is common to use the Euclidean
85 | distance; however, other distance metrics may be employed.
86 | 
87 | Similar to CUR, the feature and sample selection versions of FPS differ only in the way
88 | distance is computed (feature selection does so column-wise, sample selection does so
89 | row-wise), and are built off of the same base class.
90 | 
91 | These selectors can be instantiated using :py:class:`skmatter.feature_selection.FPS` and
92 | :py:class:`skmatter.sample_selection.FPS`.
93 | 
94 | .. autoclass:: skmatter.feature_selection.FPS
95 |    :members:
96 |    :undoc-members:
97 |    :inherited-members:
98 | 
99 | .. autoclass:: skmatter.sample_selection.FPS
100 |    :members:
101 |    :undoc-members:
102 |    :inherited-members:
103 | 
104 | .. _PCov-FPS-api:
105 | 
106 | PCov-FPS
107 | --------
108 | 
109 | PCov-FPS extends upon FPS much like PCov-CUR does to CUR. Instead of using the Euclidean
110 | distance solely in the space of :math:`\mathbf{X}`, we use a combined distance in terms
111 | of :math:`\mathbf{X}` and :math:`\mathbf{y}`.
112 | 
113 | .. autoclass:: skmatter.feature_selection.PCovFPS
114 |    :members:
115 |    :undoc-members:
116 |    :inherited-members:
117 | 
118 | .. autoclass:: skmatter.sample_selection.PCovFPS
119 |    :members:
120 |    :undoc-members:
121 |    :inherited-members:
122 | 
123 | .. _Voronoi-FPS-api:
124 | 
125 | Voronoi FPS
126 | -----------
127 | 
128 | .. autoclass:: skmatter.sample_selection.VoronoiFPS
129 |    :members:
130 |    :undoc-members:
131 |    :inherited-members:
132 | 
133 | 
134 | When *Not* to Use Voronoi FPS
135 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
136 | 
137 | In many cases, this algorithm may not improve efficiency. For example, for
138 | simple metrics (such as Euclidean distance), Voronoi FPS will likely not accelerate, and
139 | may even decelerate, computations compared to FPS.
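
As a rough usage sketch (random data; ``n_to_select`` chosen arbitrarily), the two
sample selectors are used interchangeably and should produce the same selections:

.. code-block:: python

    import numpy as np

    from skmatter.sample_selection import FPS, VoronoiFPS

    X = np.random.default_rng(0).normal(size=(1000, 4))

    # plain FPS updates the Hausdorff distances against every sample per iteration
    fps = FPS(initialize=0, n_to_select=100).fit(X)

    # Voronoi FPS tracks Voronoi cells to skip most of those distance updates
    voronoi_fps = VoronoiFPS(initialize=0, n_to_select=100).fit(X)

    print(fps.selected_idx_)
    print(voronoi_fps.selected_idx_)
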
140 | The sweet spot for Voronoi FPS is when the number of selected samples is already large
141 | enough to divide the space into meaningful Voronoi polyhedra, but still small compared
142 | to the total number of samples; outside this regime, the cost of bookkeeping
143 | significantly degrades its speed relative to FPS.
144 | 
145 | .. _DCH-api:
146 | 
147 | Directional Convex Hull (DCH)
148 | -----------------------------
149 | 
150 | .. autoclass:: skmatter.sample_selection.DirectionalConvexHull
151 |    :members:
152 |    :undoc-members:
153 |    :inherited-members:
-------------------------------------------------------------------------------- /docs/src/references/utils.rst:
1 | Utility Classes
2 | ===============
3 | 
4 | .. _PCovR_dist-api:
5 | 
6 | Modified Gram Matrix :math:`\mathbf{\tilde{K}}`
7 | -----------------------------------------------
8 | 
9 | .. autofunction:: skmatter.utils.pcovr_kernel
10 | 
11 | 
12 | Modified Covariance Matrix :math:`\mathbf{\tilde{C}}`
13 | -----------------------------------------------------
14 | 
15 | .. autofunction:: skmatter.utils.pcovr_covariance
16 | 
17 | Orthogonalizers for CUR
18 | -----------------------
19 | 
20 | When computing non-iterative CUR, it is necessary to orthogonalize the input matrices
21 | after each selection. For this, we have supplied a feature and a sample orthogonalizer
22 | for feature and sample selection.
23 | 
24 | .. autofunction:: skmatter.utils.X_orthogonalizer
25 | .. autofunction:: skmatter.utils.Y_feature_orthogonalizer
26 | .. autofunction:: skmatter.utils.Y_sample_orthogonalizer
27 | 
28 | 
29 | Random Partitioning with Overlaps
30 | ---------------------------------
31 | 
32 | .. autofunction:: skmatter.model_selection.train_test_split
33 | 
34 | 
35 | Effective Dimension of Covariance Matrix
36 | ----------------------------------------
37 | 
38 | .. autofunction:: skmatter.utils.effdim
39 | 
40 | Oracle Approximating Shrinkage
41 | ------------------------------
42 | 
43 | .. autofunction:: skmatter.utils.oas
-------------------------------------------------------------------------------- /docs/src/tutorials.rst:
1 | .. include:: ../../examples/README.rst
2 | 
3 | .. toctree::
4 | 
5 |     examples/pcovr/index
6 |     examples/pcovc/index
7 |     examples/selection/index
8 |     examples/regression/index
9 |     examples/reconstruction/index
10 |     examples/neighbors/index
-------------------------------------------------------------------------------- /examples/README.rst:
1 | Examples
2 | ========
3 | 
4 | For a thorough tutorial of the methods introduced in ``scikit-matter``, we
5 | suggest you check out the pedagogic notebooks in our companion project
6 | `kernel-tutorials <https://github.com/lab-cosmo/kernel-tutorials>`_.
7 | 
8 | To run the examples locally, install ``scikit-matter`` with the ``examples``
9 | optional dependencies.
10 | 
11 | .. code-block:: bash
12 | 
13 |     pip install skmatter[examples]
-------------------------------------------------------------------------------- /examples/neighbors/README.rst:
1 | Neighbors
2 | =========
-------------------------------------------------------------------------------- /examples/neighbors/sparse-kde.py:
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 | 
4 | """
5 | Sparse KDE examples
6 | ===================
7 | 
8 | Example for the usage of the :class:`skmatter.neighbors.SparseKDE` class. This class is
9 | specifically designed for conducting probabilistic analysis of molecular motifs
10 | (`PAMM <https://doi.org/10.1063/1.4900655>`_),
11 | which is quite useful for analyzing motifs like H-bonds, coordination polyhedra, and
12 | protein secondary structure.
13 | 
14 | Here we show how to use the sparse KDE model to fit a probability distribution to
15 | sampled data and how to use PAMM to analyze H-bonds.
16 | 
17 | We start from a simple system, which consists of three 2D Gaussians. Our task is to
18 | estimate the parameters of these Gaussians from our sampled data.
19 | 
20 | Here we first sample from these three Gaussians.
21 | """
22 | 
23 | 
24 | # %%
25 | import time
26 | 
27 | import matplotlib.pyplot as plt
28 | import numpy as np
29 | from scipy.stats import gaussian_kde
30 | 
31 | from skmatter.feature_selection import FPS
32 | from skmatter.neighbors import SparseKDE
33 | 
34 | 
35 | # %%
36 | means = np.array([[0, 0], [4, 4], [6, -2]])
37 | covariances = np.array(
38 |     [[[1, 0.5], [0.5, 1]], [[1, 0.5], [0.5, 0.5]], [[1, -0.5], [-0.5, 1]]]
39 | )
40 | N_SAMPLES = 100_000
41 | samples = np.concatenate(
42 |     [
43 |         np.random.multivariate_normal(means[0], covariances[0], N_SAMPLES),
44 |         np.random.multivariate_normal(means[1], covariances[1], N_SAMPLES),
45 |         np.random.multivariate_normal(means[2], covariances[2], N_SAMPLES),
46 |     ]
47 | )
48 | 
49 | # %%
50 | # We can visualize the sampled data:
51 | #
52 | #
53 | 
54 | # %%
55 | fig, ax = plt.subplots()
56 | ax.scatter(samples[:, 0], samples[:, 1], alpha=0.05, s=1)
57 | ax.scatter(means[:, 0], means[:, 1], marker="+", color="red", s=100)
58 | ax.set_xlabel("x")
59 | ax.set_ylabel("y")
60 | plt.show()
61 | 
62 | # %%
63 | # Sparse KDE requires a discretization of the sample space. Here, we use
64 | # the FPS method to generate grid points in the sample space:
65 | #
66 | #
67 | 
68 | # %%
69 | start1 = time.time()
70 | selector = FPS(n_to_select=int(np.sqrt(3 * N_SAMPLES)))
71 | grids = selector.fit_transform(samples.T).T
72 | end1 = time.time()
73 | fig, ax = plt.subplots()
74 | ax.scatter(samples[:, 0], samples[:, 1], alpha=0.05, s=1)
75 | ax.scatter(means[:, 0], means[:, 1], marker="+", color="red", s=100)
76 | ax.scatter(grids[:, 0], grids[:, 1], color="orange", s=1)
77 | ax.set_xlabel("x")
78 | ax.set_ylabel("y")
79 | plt.show()
80 | 
81 | # %%
82 | # Now we can fit the sparse KDE (this usually takes tens of seconds):
83 | #
84 | #
85 | 
86 | # %%
87 | start2 = time.time()
88 | estimator = SparseKDE(samples, None, fpoints=0.5)
89 | estimator.fit(grids)
90 | end2 = time.time()
91 | 
92 | # %%
93 | # We can compare the fit with the original distribution by plotting both.
94 | #
95 | # For convenience, we create a class for the Gaussian mixture model to help us plot
96 | # the result.
97 | 
98 | 
99 | # %%
100 | class GaussianMixtureModel:
101 |     """A reference Gaussian mixture whose density can be evaluated point-wise."""
102 | 
103 |     def __init__(
104 |         self,
105 |         weights: np.ndarray,
106 |         means: np.ndarray,
107 |         covariances: np.ndarray,
108 |         period: np.ndarray = None,
109 |     ):
110 |         self.weights = weights
111 |         self.means = means
112 |         self.covariances = covariances
113 |         self.period = period
114 |         self.dimension = self.means.shape[1]
115 |         self.cov_inv = np.linalg.inv(self.covariances)
116 |         self.cov_det = np.linalg.det(self.covariances)
117 |         self.norm = 1 / np.sqrt((2 * np.pi) ** self.dimension * self.cov_det)
118 | 
119 |     def __call__(self, x: np.ndarray, i: int = None):
120 |         if len(x.shape) == 1:
121 |             x = x[np.newaxis, :]
122 |         if self.period is not None:
123 |             # displacement vectors to the means under periodic boundary conditions
124 |             xij = rij(self.period, x, self.means)
125 |         else:
126 |             xij = x - self.means
127 |         p = (
128 |             self.weights
129 |             * self.norm
130 |             * np.exp(
131 |                 -0.5 * (xij[:, np.newaxis, :] @ self.cov_inv @ xij[:, :, np.newaxis])
132 |             ).reshape(-1)
133 |         )
134 |         sum_p = np.sum(p)
135 |         if i is None:
136 |             return sum_p
137 | 
138 |         return np.sum(p[i]) / sum_p
139 | 
140 | 
141 | # %%
142 | def rij(period: np.ndarray, xi: np.ndarray, xj: np.ndarray) -> np.ndarray:
143 |     """Get the position vectors between two points. PBC are taken into account."""
144 |     xij = xi - xj
145 |     if period is not None:
146 |         xij -= np.round(xij / period) * period
147 | 
148 |     return xij
149 | 
150 | 
151 | # %%
152 | # The original model that we want to fit:
153 | original_model = GaussianMixtureModel(np.full(3, 1 / 3), means, covariances)
154 | # The fitted model:
155 | fitted_model = GaussianMixtureModel(
156 |     estimator._sample_weights, estimator._grids, estimator.bandwidth_
157 | )
158 | 
159 | # To plot the probability density contours, we need to create a grid of points:
160 | x, y = np.meshgrid(np.linspace(-6, 12, 100), np.linspace(-8, 8))
161 | points = np.concatenate(np.stack([x, y], axis=-1))
162 | probs = np.array([original_model(point) for point in points])
163 | fitted_probs = np.array([fitted_model(point) for point in points])
164 | 
165 | fig, ax = plt.subplots()
166 | ct1 = ax.contour(x, y, probs.reshape(x.shape), colors="blue")
167 | ct2 = ax.contour(x, y, fitted_probs.reshape(x.shape), colors="orange")
168 | h1, _ = ct1.legend_elements()
169 | h2, _ = ct2.legend_elements()
170 | ax.legend(
171 |     [h1[0], h2[0]],
172 |     ["original", "fitted"],
173 | )
174 | ax.set_xlabel("x")
175 | ax.set_ylabel("y")
176 | plt.show()
177 | 
178 | # %%
179 | # The performance of the probability density estimation can be characterized by the
180 | # Mean Integrated Squared Error (MISE), defined as
181 | # :math:`\text{MISE}=\text{E}[\int (\hat{P}(\textbf{x})-P(\textbf{x}))^2 d\textbf{x}]`.
182 | # Below, we approximate the integral by a Riemann sum over the plotting grid.
183 | 
184 | # %%
185 | ISE = np.sum((probs - fitted_probs) ** 2 * (x[0][1] - x[0][0]) * (y[1][0] - y[0][0]))
186 | print(f"Time sparse-kde: {end2 - start2} s")
187 | print(f"ISE = {ISE:.2e}")
188 | 
189 | # %%
190 | # We can compare the result with scipy's ``gaussian_kde``. (This usually takes
191 | # several minutes to run.)
192 | 
193 | # %%
194 | data = np.vstack([x.ravel(), y.ravel()])
195 | start = time.time()
196 | kde = gaussian_kde(samples.T)
197 | scipy_probs = kde(data).T
198 | end = time.time()
199 | print(f"Time scipy: {end - start} s")
200 | ISE_kde = np.sum(
201 |     (probs - scipy_probs) ** 2 * (x[0][1] - x[0][0]) * (y[1][0] - y[0][0])
202 | )
203 | print(f"ISE_kde = {ISE_kde:.2e}")
204 | 
205 | # %%
Even though 206 | # we have not specified the number of Gaussians, it still performs well. This 207 | # allows us to fit data distributions automatically, at comparable quality, in a much 208 | # shorter time than scipy. 209 | -------------------------------------------------------------------------------- /examples/pcovc/PCovC_Comparison.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | """ 5 | Comparing PCovC with PCA and LDA 6 | ================================ 7 | """ 8 | # %% 9 | # 10 | 11 | import matplotlib.pyplot as plt 12 | import numpy as np 13 | from sklearn.datasets import load_breast_cancer 14 | from sklearn.decomposition import PCA 15 | from sklearn.discriminant_analysis import LinearDiscriminantAnalysis 16 | from sklearn.linear_model import LogisticRegressionCV 17 | from sklearn.preprocessing import StandardScaler 18 | 19 | from skmatter.decomposition import PCovC 20 | 21 | 22 | plt.rcParams["image.cmap"] = "tab10" 23 | plt.rcParams["scatter.edgecolors"] = "k" 24 | 25 | random_state = 0 26 | 27 | # %% 28 | # 29 | # For this, we will use the :func:`sklearn.datasets.load_breast_cancer` dataset from 30 | # ``sklearn``. 31 | 32 | X, y = load_breast_cancer(return_X_y=True) 33 | 34 | scaler = StandardScaler() 35 | X_scaled = scaler.fit_transform(X) 36 | 37 | # %% 38 | # 39 | # PCA 40 | # --- 41 | # 42 | 43 | pca = PCA(n_components=2) 44 | 45 | pca.fit(X_scaled, y) 46 | T_pca = pca.transform(X_scaled) 47 | 48 | fig, ax = plt.subplots() 49 | scatter = ax.scatter(T_pca[:, 0], T_pca[:, 1], c=y) 50 | ax.set(xlabel="PC$_1$", ylabel="PC$_2$") 51 | ax.legend( 52 | scatter.legend_elements()[0][::-1], 53 | load_breast_cancer().target_names[::-1], 54 | loc="upper right", 55 | title="Classes", 56 | ) 57 | 58 | # %% 59 | # 60 | # LDA 61 | # --- 62 | # 63 | 64 | lda = LinearDiscriminantAnalysis(n_components=1) 65 | lda.fit(X_scaled, y) 66 | 67 | T_lda = lda.transform(X_scaled) 68 | 69 | fig, ax = plt.subplots() 70 | ax.scatter(T_lda[:], np.zeros(len(T_lda[:])), c=y) 71 | ax.set(xlabel="LDA$_1$") 72 | 73 | # %% 74 | # 75 | # PCovC 76 | # ------------------- 77 | # 78 | # Below, we see the map produced 79 | # by a PCovC model with :math:`\alpha` = 0.5 and a logistic 80 | # regression classifier.
81 | 82 | mixing = 0.5 83 | 84 | pcovc = PCovC( 85 | mixing=mixing, 86 | n_components=2, 87 | random_state=random_state, 88 | classifier=LogisticRegressionCV(), 89 | ) 90 | pcovc.fit(X_scaled, y) 91 | 92 | T_pcovc = pcovc.transform(X_scaled) 93 | 94 | fig, ax = plt.subplots() 95 | ax.scatter(T_pcovc[:, 0], T_pcovc[:, 1], c=y) 96 | ax.set(xlabel="PCov$_1$", ylabel="PCov$_2$") 97 | 98 | # %% 99 | # 100 | # A side-by-side comparison of the 101 | # three maps (PCA, LDA, and PCovC): 102 | 103 | fig, axs = plt.subplots(1, 3, figsize=(18, 5)) 104 | axs[0].scatter(T_pca[:, 0], T_pca[:, 1], c=y) 105 | axs[0].set_title("PCA") 106 | axs[1].scatter(T_lda, np.zeros(len(T_lda)), c=y) 107 | axs[1].set_title("LDA") 108 | axs[2].scatter(T_pcovc[:, 0], T_pcovc[:, 1], c=y) 109 | axs[2].set_title("PCovC") 110 | plt.show() 111 | -------------------------------------------------------------------------------- /examples/pcovc/PCovC_Hyperparameters.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | """ 5 | PCovC Hyperparameter Tuning 6 | =========================== 7 | """ 8 | # %% 9 | # 10 | 11 | import matplotlib.pyplot as plt 12 | from matplotlib.colors import LinearSegmentedColormap 13 | from sklearn.datasets import load_iris 14 | from sklearn.decomposition import PCA 15 | from sklearn.inspection import DecisionBoundaryDisplay 16 | from sklearn.linear_model import LogisticRegressionCV, Perceptron, RidgeClassifierCV 17 | from sklearn.preprocessing import StandardScaler 18 | from sklearn.svm import LinearSVC 19 | 20 | from skmatter.decomposition import PCovC 21 | 22 | 23 | plt.rcParams["image.cmap"] = "tab10" 24 | plt.rcParams["scatter.edgecolors"] = "k" 25 | 26 | random_state = 10 27 | n_components = 2 28 | 29 | # %% 30 | # 31 | # For this, we will use the :func:`sklearn.datasets.load_iris` dataset from 32 | # ``sklearn``. 33 | 34 | X, y = load_iris(return_X_y=True) 35 | 36 | scaler = StandardScaler() 37 | X_scaled = scaler.fit_transform(X) 38 | 39 | # %% 40 | # 41 | # PCA 42 | # --- 43 | # 44 | 45 | pca = PCA(n_components=n_components) 46 | 47 | pca.fit(X_scaled, y) 48 | T_pca = pca.transform(X_scaled) 49 | 50 | fig, axis = plt.subplots() 51 | scatter = axis.scatter(T_pca[:, 0], T_pca[:, 1], c=y) 52 | axis.set(xlabel="PC$_1$", ylabel="PC$_2$") 53 | axis.legend( 54 | scatter.legend_elements()[0], 55 | load_iris().target_names, 56 | loc="lower right", 57 | title="Classes", 58 | ) 59 | 60 | # %% 61 | # 62 | # Effect of Mixing Parameter :math:`\alpha` on PCovC Map 63 | # ------------------------------------------------------ 64 | # 65 | # Below, we see how different :math:`\alpha` values for our PCovC model 66 | # result in varying class distinctions between setosa, versicolor, 67 | # and virginica on the PCovC map. 
68 | 69 | n_mixing = 5 70 | mixing_params = [0, 0.25, 0.50, 0.75, 1] 71 | 72 | fig, axs = plt.subplots(1, n_mixing, figsize=(4 * n_mixing, 4), sharey="row") 73 | 74 | for i, mixing in enumerate(mixing_params): 75 | 76 | 77 | pcovc = PCovC( 78 | mixing=mixing, 79 | n_components=n_components, 80 | random_state=random_state, 81 | classifier=LogisticRegressionCV(), 82 | ) 83 | 84 | pcovc.fit(X_scaled, y) 85 | T = pcovc.transform(X_scaled) 86 | 87 | axs[i].set_xticks([]) 88 | axs[i].set_yticks([]) 89 | 90 | axs[i].set_title(r"$\alpha=$" + str(mixing)) 91 | axs[i].set_xlabel("PCov$_1$") 92 | axs[i].scatter(T[:, 0], T[:, 1], c=y) 93 | 94 | axs[0].set_ylabel("PCov$_2$") 95 | 96 | fig.subplots_adjust(wspace=0) 97 | 98 | # %% 99 | # 100 | # Effect of PCovC Classifier on PCovC Map and Decision Boundaries 101 | # --------------------------------------------------------------- 102 | # 103 | # Here, we see how a PCovC model (:math:`\alpha` = 0.5) fitted with 104 | # different classifiers produces varying PCovC maps. In addition, 105 | # we see the varying decision boundaries produced by the 106 | # respective PCovC classifiers. 107 | 108 | mixing = 0.5 109 | fig, axs = plt.subplots(1, 4, figsize=(16, 4)) 110 | 111 | models = { 112 | RidgeClassifierCV(): "Ridge Classification", 113 | LogisticRegressionCV(random_state=random_state): "Logistic Regression", 114 | LinearSVC(random_state=random_state): "Support Vector Classification", 115 | Perceptron(random_state=random_state): "Single-Layer Perceptron", 116 | } 117 | 118 | for i, (model, title) in enumerate(models.items()): 119 | 120 | 121 | pcovc = PCovC( 122 | mixing=mixing, 123 | n_components=n_components, 124 | random_state=random_state, 125 | classifier=model, 126 | ) 127 | 128 | pcovc.fit(X_scaled, y) 129 | T = pcovc.transform(X_scaled) 130 | 131 | graph = axs[i] 132 | graph.set_title(title) 133 | 134 | DecisionBoundaryDisplay.from_estimator( 135 | estimator=pcovc.classifier_, 136 | X=T, 137 | ax=graph, 138 | response_method="predict", 139 | grid_resolution=1000, 140 | ) 141 | 142 | scatter = graph.scatter(T[:, 0], T[:, 1], c=y) 143 | 144 | graph.set_xlabel("PCov$_1$") 145 | graph.set_xticks([]) 146 | graph.set_yticks([]) 147 | 148 | axs[0].set_ylabel("PCov$_2$") 149 | axs[0].legend( 150 | scatter.legend_elements()[0], 151 | load_iris().target_names, 152 | loc="lower right", 153 | title="Classes", 154 | fontsize=8, 155 | ) 156 | 157 | fig.subplots_adjust(wspace=0.04) 158 | plt.show() 159 | -------------------------------------------------------------------------------- /examples/pcovc/README.rst: -------------------------------------------------------------------------------- 1 | PCovC 2 | ===== 3 | -------------------------------------------------------------------------------- /examples/pcovr/PCovR.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | """ 5 | Construct a PCovR Map 6 | ===================== 7 | """ 8 | # %% 9 | # 10 | 11 | 12 | import numpy as np 13 | from matplotlib import cm 14 | from matplotlib import pyplot as plt 15 | from sklearn.datasets import load_diabetes 16 | from sklearn.kernel_ridge import KernelRidge 17 | from sklearn.linear_model import Ridge 18 | from sklearn.preprocessing import StandardScaler 19 | 20 | from skmatter.decomposition import KernelPCovR, PCovR 21 | 22 | 23 | cmapX = cm.plasma 24 | cmapy = cm.Greys 25 | 26 | # %% 27 | # 28 | # For this, we will use the
:func:`sklearn.datasets.load_diabetes` dataset from 29 | # ``sklearn``. 30 | 31 | X, y = load_diabetes(return_X_y=True) 32 | y = y.reshape(X.shape[0], -1) 33 | 34 | X_scaler = StandardScaler() 35 | X_scaled = X_scaler.fit_transform(X) 36 | 37 | y_scaler = StandardScaler() 38 | y_scaled = y_scaler.fit_transform(y) 39 | 40 | # %% 41 | # 42 | # Computing a simple PCovR and making a fancy plot of the results 43 | # --------------------------------------------------------------- 44 | 45 | mixing = 0.5 46 | pcovr = PCovR( 47 | mixing=mixing, 48 | regressor=Ridge(alpha=1e-8, fit_intercept=False, tol=1e-12), 49 | n_components=2, 50 | ) 51 | pcovr.fit(X_scaled, y_scaled) 52 | T = pcovr.transform(X_scaled) 53 | yp = y_scaler.inverse_transform(pcovr.predict(X_scaled).reshape(-1, 1)) 54 | 55 | fig, ((axT, axy), (caxT, caxy)) = plt.subplots( 56 | 2, 2, figsize=(8, 5), gridspec_kw=dict(height_ratios=(1, 0.1)) 57 | ) 58 | 59 | scatT = axT.scatter(T[:, 0], T[:, 1], s=50, alpha=0.8, c=y, cmap=cmapX, edgecolor="k") 60 | axT.set_xlabel(r"$PC_1$") 61 | axT.set_ylabel(r"$PC_2$") 62 | fig.colorbar(scatT, cax=caxT, label="y", orientation="horizontal") 63 | 64 | scaty = axy.scatter(y, yp, s=50, alpha=0.8, c=np.abs(y - yp), cmap=cmapy, edgecolor="k") 65 | axy.plot(axy.get_xlim(), axy.get_xlim(), "r--") 66 | fig.suptitle(r"$\alpha=$" + str(mixing)) 67 | 68 | axy.set_xlabel(r"True $y$") 69 | axy.set_ylabel(r"Predicted $y$") 70 | fig.colorbar( 71 | scaty, cax=caxy, label="Error in y", orientation="horizontal" 72 | ) 73 | 74 | fig.tight_layout() 75 | 76 | # %% 77 | # 78 | # Surveying many Mixing Parameters 79 | # -------------------------------- 80 | 81 | n_alpha = 5 82 | 83 | fig, axes = plt.subplots(2, n_alpha, figsize=(4 * n_alpha, 10), sharey="row") 84 | 85 | for i, mixing in enumerate(np.linspace(0, 1, n_alpha)): 86 | pcovr = PCovR( 87 | mixing=mixing, 88 | regressor=Ridge(alpha=1e-8, fit_intercept=False, tol=1e-12), 89 | n_components=2, 90 | ) 91 | pcovr.fit(X_scaled, y_scaled) 92 | T = pcovr.transform(X_scaled) 93 | yp = y_scaler.inverse_transform(pcovr.predict(X_scaled).reshape(-1, 1)) 94 | 95 | axes[0, i].scatter( 96 | T[:, 0], T[:, 1], s=50, alpha=0.8, c=y, cmap=cmapX, edgecolor="k" 97 | ) 98 | axes[0, i].set_title(r"$\alpha=$" + str(mixing)) 99 | axes[0, i].set_xlabel(r"$PC_1$") 100 | axes[0, i].set_xticks([]) 101 | axes[0, i].set_yticks([]) 102 | 103 | axes[1, i].scatter( 104 | y, yp, s=50, alpha=0.8, c=np.abs(y - yp), cmap=cmapy, edgecolor="k" 105 | ) 106 | axes[1, i].set_title(r"$\alpha=$" + str(mixing)) 107 | axes[1, i].set_xlabel("y") 108 | 109 | axes[0, 0].set_ylabel(r"$PC_2$") 110 | axes[1, 0].set_ylabel("Predicted y") 111 | 112 | fig.subplots_adjust(wspace=0, hspace=0.25) 113 | plt.show() 114 | 115 | # %% 116 | # 117 | # Construct a Kernel PCovR Map 118 | # ============================ 119 | # 120 | # Moving from PCovR to KernelPCovR is much like moving from PCA to KernelPCA in 121 | # ``sklearn``. Like KernelPCA, KernelPCovR can compute any pairwise kernel supported by 122 | # ``sklearn`` or operate on a precomputed kernel.
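| # As a minimal sketch of the precomputed route (assuming the ``precomputed``
| # option mirrors its behavior in ``KernelPCA``):
| #
| # .. code-block:: python
| #
| #     from sklearn.metrics.pairwise import rbf_kernel
| #
| #     K = rbf_kernel(X_scaled, gamma=0.1)
| #     kpcovr_pre = KernelPCovR(
| #         mixing=0.5,
| #         regressor=KernelRidge(alpha=1e-8, kernel="precomputed"),
| #         kernel="precomputed",
| #         n_components=2,
| #     )
| #     kpcovr_pre.fit(K, y_scaled)
| #     T_pre = kpcovr_pre.transform(K)
| #
| # Below, we instead let ``KernelPCovR`` build its RBF kernel internally.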
123 | 124 | 125 | mixing = 0.5 126 | kpcovr = KernelPCovR( 127 | mixing=mixing, 128 | regressor=KernelRidge( 129 | alpha=1e-8, 130 | kernel="rbf", 131 | gamma=0.1, 132 | ), 133 | kernel="rbf", 134 | gamma=0.1, 135 | n_components=2, 136 | ) 137 | kpcovr.fit(X_scaled, y_scaled) 138 | T = kpcovr.transform(X_scaled) 139 | yp = y_scaler.inverse_transform(kpcovr.predict(X_scaled).reshape(-1, 1)) 140 | 141 | fig, ((axT, axy), (caxT, caxy)) = plt.subplots( 142 | 2, 2, figsize=(8, 5), gridspec_kw=dict(height_ratios=(1, 0.1)) 143 | ) 144 | 145 | scatT = axT.scatter(T[:, 0], T[:, 1], s=50, alpha=0.8, c=y, cmap=cmapX, edgecolor="k") 146 | axT.set_xlabel(r"$PC_1$") 147 | axT.set_ylabel(r"$PC_2$") 148 | fig.colorbar(scatT, cax=caxT, label="y", orientation="horizontal") 149 | 150 | scaty = axy.scatter(y, yp, s=50, alpha=0.8, c=np.abs(y - yp), cmap=cmapy, edgecolor="k") 151 | axy.plot(axy.get_xlim(), axy.get_xlim(), "r--") 152 | fig.suptitle(r"$\alpha=$" + str(mixing)) 153 | 154 | axy.set_xlabel(r"$y$") 155 | axy.set_ylabel(r"Predicted $y$") 156 | fig.colorbar(scaty, cax=caxy, label="Error in y", orientation="horizontal") 157 | 158 | fig.tight_layout() 159 | 160 | # %% 161 | # 162 | # As you can see, the regression error has decreased considerably compared to the 163 | # linear case, meaning that the map on the left correlates better with the target 164 | # values. 165 | # 166 | # Note on KernelPCovR for Atoms, Molecules, and Structures 167 | # -------------------------------------------------------- 168 | # 169 | # When applying this to datasets involving collections of atoms and their atomic 170 | # descriptors, it is important to consider the nature of the property you are learning 171 | # and the samples you are comparing before constructing a kernel -- for example, whether 172 | # the analysis is to be based on whole structures or individual atomic environments. 173 | # For more detail, see Appendix C of 174 | # `Helfrecht 2020 `_ or, 175 | # regarding kernels involving gradients, 176 | # `Musil 2021 `_. 177 | -------------------------------------------------------------------------------- /examples/pcovr/PCovR_Regressors.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | """ 4 | Choosing Different Regressors for PCovR 5 | ======================================= 6 | """ 7 | # %% 8 | # 9 | import time 10 | 11 | from matplotlib import pyplot as plt 12 | from sklearn.datasets import load_diabetes 13 | from sklearn.linear_model import Ridge 14 | from sklearn.preprocessing import StandardScaler 15 | 16 | from skmatter.decomposition import PCovR 17 | 18 | 19 | # %% 20 | # 21 | # For this, we will use the :func:`sklearn.datasets.load_diabetes` dataset from 22 | # ``sklearn``. 23 | 24 | mixing = 0.5 25 | 26 | X, y = load_diabetes(return_X_y=True) 27 | 28 | X_scaler = StandardScaler() 29 | X_scaled = X_scaler.fit_transform(X) 30 | 31 | y_scaler = StandardScaler() 32 | y_scaled = y_scaler.fit_transform(y.reshape(-1, 1)) 33 | 34 | 35 | # %% 36 | # 37 | # Use the default regressor in PCovR 38 | # ---------------------------------- 39 | # 40 | # When there is no regressor supplied, PCovR uses 41 | # ``sklearn.linear_model.Ridge(alpha=1e-6, fit_intercept=False, tol=1e-12)``.
42 | 43 | pcovr1 = PCovR(mixing=mixing, n_components=2) 44 | 45 | t0 = time.perf_counter() 46 | pcovr1.fit(X_scaled, y_scaled) 47 | t1 = time.perf_counter() 48 | 49 | print(f"Regressor is {pcovr1.regressor_} and fit took {1e3 * (t1 - t0):0.2} ms.") 50 | 51 | 52 | # %% 53 | # 54 | # Use a fitted regressor 55 | # ---------------------- 56 | # 57 | # You can pass a fitted regressor to ``PCovR`` to rely on the predetermined regression 58 | # parameters. Currently, scikit-matter supports the ``scikit-learn`` classes 59 | # :class:`LinearModel `, :class:`Ridge 60 | # `, and :class:`RidgeCV `, 61 | # with plans to support any regressor with similar architecture in the future. 62 | 63 | regressor = Ridge(alpha=1e-6, fit_intercept=False, tol=1e-12) 64 | 65 | t0 = time.perf_counter() 66 | regressor.fit(X_scaled, y_scaled) 67 | t1 = time.perf_counter() 68 | 69 | print(f"Fit took {1e3 * (t1 - t0):0.2} ms.") 70 | 71 | 72 | # %% 73 | # 74 | 75 | pcovr2 = PCovR(mixing=mixing, n_components=2, regressor=regressor) 76 | 77 | t0 = time.perf_counter() 78 | pcovr2.fit(X_scaled, y_scaled) 79 | t1 = time.perf_counter() 80 | 81 | print(f"Regressor is {pcovr2.regressor_} and fit took {1e3 * (t1 - t0):0.2} ms.") 82 | 83 | # %% 84 | # 85 | # Use a pre-predicted y 86 | # --------------------- 87 | # 88 | # With ``regressor='precomputed'``, you can pass a regression output :math:`\hat{Y}` and 89 | # optional regression weights :math:`W` to PCovR. If ``W=None``, then PCovR will 90 | # determine :math:`W` as the least-squares solution between :math:`X` and 91 | # :math:`\hat{Y}`. 92 | 93 | regressor = Ridge(alpha=1e-6, fit_intercept=False, tol=1e-12) 94 | 95 | t0 = time.perf_counter() 96 | regressor.fit(X_scaled, y_scaled) 97 | t1 = time.perf_counter() 98 | 99 | print(f"Fit took {1e3 * (t1 - t0):0.2} ms.") 100 | 101 | W = regressor.coef_ 102 | 103 | # %% 104 | # 105 | 106 | pcovr3 = PCovR(mixing=mixing, n_components=2, regressor="precomputed") 107 | 108 | t0 = time.perf_counter() 109 | pcovr3.fit(X_scaled, y_scaled, W=W) 110 | t1 = time.perf_counter() 111 | 112 | print(f"Fit took {1e3 * (t1 - t0):0.2} ms.") 113 | 114 | # %% 115 | # 116 | # Comparing Results 117 | # ----------------- 118 | # 119 | # Because we used the same regressor in all three models, they will yield the same 120 | # result. 121 | 122 | fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(12, 4), sharex=True, sharey=True) 123 | 124 | ax1.scatter(*pcovr1.transform(X_scaled).T, c=y) 125 | ax2.scatter(*pcovr2.transform(X_scaled).T, c=y) 126 | ax3.scatter(*pcovr3.transform(X_scaled).T, c=y) 127 | 128 | ax1.set_ylabel("PCov$_2$") 129 | ax1.set_xlabel("PCov$_1$") 130 | ax2.set_xlabel("PCov$_1$") 131 | ax3.set_xlabel("PCov$_1$") 132 | 133 | ax1.set_title("Default Regressor") 134 | ax2.set_title("Pre-fit Regressor") 135 | ax3.set_title("Precomputed Regression Result") 136 | 137 | fig.show() 138 | 139 | # %% 140 | # 141 | # As you can imagine, these three options have different use cases -- if you 142 | # are working with a large dataset, you should always pre-fit to save on time!
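| # %%
| #
| # As a final sanity check that all three routes produced the same projection
| # (a sketch; each latent coordinate is defined only up to a sign, so we
| # compare absolute values):
|
| import numpy as np
|
| T1 = np.abs(pcovr1.transform(X_scaled))
| T2 = np.abs(pcovr2.transform(X_scaled))
| T3 = np.abs(pcovr3.transform(X_scaled))
| print(np.allclose(T1, T2), np.allclose(T2, T3))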
143 | -------------------------------------------------------------------------------- /examples/pcovr/PCovR_Scaling.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | """ 4 | The Importance of Data Scaling in PCovR / KernelPCovR 5 | ===================================================== 6 | """ 7 | # %% 8 | # 9 | 10 | import numpy as np 11 | from matplotlib import pyplot as plt 12 | from sklearn.datasets import load_diabetes 13 | from sklearn.preprocessing import StandardScaler 14 | 15 | from skmatter.decomposition import PCovR 16 | 17 | 18 | # %% 19 | # 20 | # In PCovR and KernelPCovR, we combine multiple aspects of the dataset, primarily 21 | # the features and targets. As such, the results largely depend on the relative 22 | # contributions of each aspect to the 23 | # mixed model. 24 | 25 | X, y = load_diabetes(return_X_y=True) 26 | 27 | # %% 28 | # 29 | # Take the diabetes dataset from ``sklearn``. In raw form, the magnitudes of the 30 | # features and targets are: 31 | 32 | print( 33 | "Norm of the features: %0.2f \nNorm of the targets: %0.2f" 34 | % (np.linalg.norm(X), np.linalg.norm(y)) 35 | ) 36 | 37 | # %% 38 | # 39 | # For the diabetes dataset, we can use the ``StandardScaler`` class from sklearn, 40 | # as the features and targets are independent. 41 | 42 | x_scaler = StandardScaler() 43 | y_scaler = StandardScaler() 44 | 45 | X_scaled = x_scaler.fit_transform(X) 46 | y_scaled = y_scaler.fit_transform(y.reshape(-1, 1)) 47 | 48 | # %% 49 | # 50 | # Looking at the results at ``mixing=0.5``, we see an especially large difference in the 51 | # latent-space projections. 52 | 53 | 54 | pcovr_unscaled = PCovR(mixing=0.5, n_components=4).fit(X, y) 55 | T_unscaled = pcovr_unscaled.transform(X) 56 | Yp_unscaled = pcovr_unscaled.predict(X) 57 | 58 | pcovr_scaled = PCovR(mixing=0.5, n_components=4).fit(X_scaled, y_scaled) 59 | T_scaled = pcovr_scaled.transform(X_scaled) 60 | Yp_scaled = y_scaler.inverse_transform(pcovr_scaled.predict(X_scaled)) 61 | 62 | fig, ((ax1_T, ax2_T), (ax1_Y, ax2_Y)) = plt.subplots(2, 2, figsize=(8, 10)) 63 | 64 | ax1_T.scatter(T_unscaled[:, 0], T_unscaled[:, 1], c=y, cmap="plasma", ec="k") 65 | ax1_T.set_xlabel("PCov1") 66 | ax1_T.set_ylabel("PCov2") 67 | ax1_T.set_title("Latent Projection\nWithout Scaling") 68 | 69 | ax2_T.scatter(T_scaled[:, 0], T_scaled[:, 1], c=y, cmap="plasma", ec="k") 70 | ax2_T.set_xlabel("PCov1") 71 | ax2_T.set_ylabel("PCov2") 72 | ax2_T.set_title("Latent Projection\nWith Scaling") 73 | 74 | ax1_Y.scatter(Yp_unscaled, y, c=np.abs(y - Yp_unscaled), cmap="bone_r", ec="k") 75 | ax1_Y.plot(ax1_Y.get_xlim(), ax1_Y.get_xlim(), "r--") 76 | ax1_Y.set_xlabel("Predicted Y, unscaled") 77 | ax1_Y.set_ylabel("True Y, unscaled") 78 | ax1_Y.set_title("Regression\nWithout Scaling") 79 | 80 | ax2_Y.scatter( 81 | Yp_scaled, y, c=np.abs(y.ravel() - Yp_scaled.ravel()), cmap="bone_r", ec="k" 82 | ) 83 | ax2_Y.plot(ax2_Y.get_xlim(), ax2_Y.get_xlim(), "r--") 84 | ax2_Y.set_xlabel("Predicted Y, unscaled") 85 | ax2_Y.set_ylabel("True Y, unscaled") 86 | ax2_Y.set_title("Regression\nWith Scaling") 87 | 88 | fig.subplots_adjust(hspace=0.5, wspace=0.3) 89 | 90 | # %% 91 | # 92 | # Also, we see that when the datasets are unscaled, the total loss (loss in recreating 93 | # the original dataset and regression loss) does not vary with ``mixing``, as expected.
94 | # Typically, the regression loss should *gradually* increase with ``mixing`` 95 | # (and vice-versa for the loss in reconstructing the original features). When the 96 | # inputs are not scaled, however, only in the case of ``mixing`` = 0 or 1 will the 97 | # losses drastically change, depending on which component is dominating the model. 98 | # Here, because the features dominate the model, this jump occurs as ``mixing`` goes to 99 | # 0. With the scaled inputs, there is still a jump when ``mixing > 0`` due to the change 100 | # in matrix rank. 101 | 102 | mixings = np.linspace(0, 1, 21) 103 | losses_unscaled = np.zeros((2, len(mixings))) 104 | losses_scaled = np.zeros((2, len(mixings))) 105 | 106 | nc = 4 107 | 108 | for mi, mixing in enumerate(mixings): 109 | pcovr_unscaled = PCovR(mixing=mixing, n_components=nc).fit(X, y) 110 | t_unscaled = pcovr_unscaled.transform(X) 111 | yp_unscaled = pcovr_unscaled.predict(T=t_unscaled) 112 | xr_unscaled = pcovr_unscaled.inverse_transform(t_unscaled) 113 | losses_unscaled[:, mi] = ( 114 | np.linalg.norm(xr_unscaled - X) ** 2.0 / np.linalg.norm(X) ** 2, 115 | np.linalg.norm(yp_unscaled - y) ** 2.0 / np.linalg.norm(y) ** 2, 116 | ) 117 | 118 | pcovr_scaled = PCovR(mixing=mixing, n_components=nc).fit(X_scaled, y_scaled) 119 | t_scaled = pcovr_scaled.transform(X_scaled) 120 | yp_scaled = pcovr_scaled.predict(T=t_scaled) 121 | xr_scaled = pcovr_scaled.inverse_transform(t_scaled) 122 | losses_scaled[:, mi] = ( 123 | np.linalg.norm(xr_scaled - X_scaled) ** 2.0 / np.linalg.norm(X_scaled) ** 2, 124 | np.linalg.norm(yp_scaled - y_scaled) ** 2.0 / np.linalg.norm(y_scaled) ** 2, 125 | ) 126 | 127 | fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(8, 4), sharey=True, sharex=True) 128 | ax1.plot(mixings, losses_unscaled[0], marker="o", label=r"$\ell_{X}$") 129 | ax1.plot(mixings, losses_unscaled[1], marker="o", label=r"$\ell_{Y}$") 130 | ax1.plot(mixings, np.sum(losses_unscaled, axis=0), marker="o", label=r"$\ell$") 131 | ax1.legend(fontsize=12) 132 | ax1.set_title("With Inputs Unscaled") 133 | ax1.set_xlabel(r"Mixing parameter $\alpha$") 134 | ax1.set_ylabel(r"Loss $\ell$") 135 | 136 | ax2.plot(mixings, losses_scaled[0], marker="o", label=r"$\ell_{X}$") 137 | ax2.plot(mixings, losses_scaled[1], marker="o", label=r"$\ell_{Y}$") 138 | ax2.plot(mixings, np.sum(losses_scaled, axis=0), marker="o", label=r"$\ell$") 139 | ax2.legend(fontsize=12) 140 | ax2.set_title("With Inputs Scaled") 141 | ax2.set_xlabel(r"Mixing parameter $\alpha$") 142 | ax2.set_ylabel(r"Loss $\ell$") 143 | 144 | fig.show() 145 | 146 | # %% 147 | # 148 | # **Note**: When the relative magnitude of the features or targets is important, such 149 | # as in :func:`skmatter.datasets.load_csd_1000r`, one should use the 150 | # :class:`skmatter.preprocessing.StandardFlexibleScaler`.
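| # %%
| #
| # A minimal sketch of that alternative: with ``column_wise=False``, the scaler
| # normalizes by a single global variance, preserving the relative magnitudes of
| # the columns.
|
| from skmatter.preprocessing import StandardFlexibleScaler
|
| X_sfs = StandardFlexibleScaler(column_wise=False).fit_transform(X)
| print("Norm of flexibly scaled features: %0.2f" % np.linalg.norm(X_sfs))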
151 | -------------------------------------------------------------------------------- /examples/pcovr/README.rst: -------------------------------------------------------------------------------- 1 | PCovR and KernelPCovR 2 | ===================== 3 | -------------------------------------------------------------------------------- /examples/reconstruction/PlotGFRE.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | """ 5 | Global Feature Reconstruction Error (GFRE) and Distortion (GFRD) 6 | ================================================================ 7 | Example for the usage of the :func:`skmatter.metrics.global_reconstruction_error` as 8 | the global feature reconstruction error (GFRE) and 9 | :func:`skmatter.metrics.global_reconstruction_distortion` as the global feature 10 | reconstruction distortion (GFRD). We apply the global reconstruction measures on the 11 | degenerate CH4 manifold dataset. This dataset was specifically constructed to be 12 | representable by 4-body features (bispectrum) but not by 3-body features (power 13 | spectrum). In other words, the dataset contains environments which are different but 14 | have the same 3-body features. For more details about the dataset please refer to 15 | `Pozdnyakov 2020 `_. 16 | 17 | The ``skmatter`` dataset already contains the 3- and 4-body features computed with 18 | `librascal `_ so we can load them and compare 19 | them with the GFRE/GFRD. 20 | """ 21 | # %% 22 | # 23 | 24 | import matplotlib as mpl 25 | import matplotlib.pyplot as plt 26 | import numpy as np 27 | 28 | from skmatter.datasets import load_degenerate_CH4_manifold 29 | from skmatter.metrics import ( 30 | global_reconstruction_distortion, 31 | global_reconstruction_error, 32 | ) 33 | 34 | 35 | mpl.rc("font", size=20) 36 | 37 | # load features 38 | degenerate_manifold = load_degenerate_CH4_manifold() 39 | power_spectrum_features = degenerate_manifold.data.SOAP_power_spectrum 40 | bispectrum_features = degenerate_manifold.data.SOAP_bispectrum 41 | 42 | # %% 43 | # 44 | 45 | gfre_matrix = np.zeros((2, 2)) 46 | print("Computing GFRE...") 47 | 48 | 49 | # reconstruction error of power spectrum features using power spectrum features 50 | gfre_matrix[0, 0] = global_reconstruction_error( 51 | power_spectrum_features, power_spectrum_features 52 | ) 53 | 54 | # reconstruction error of bispectrum features using power spectrum features 55 | gfre_matrix[0, 1] = global_reconstruction_error( 56 | power_spectrum_features, bispectrum_features 57 | ) 58 | 59 | 60 | # reconstruction error of power spectrum features using bispectrum features 61 | gfre_matrix[1, 0] = global_reconstruction_error( 62 | bispectrum_features, power_spectrum_features 63 | ) 64 | 65 | # reconstruction error of bispectrum features using bispectrum features 66 | gfre_matrix[1, 1] = global_reconstruction_error( 67 | bispectrum_features, bispectrum_features 68 | ) 69 | 70 | print("Computing GFRE finished.") 71 | 72 | 73 | # %% 74 | # 75 | 76 | 77 | gfrd_matrix = np.zeros((2, 2)) 78 | print("Computing GFRD...") 79 | 80 | 81 | # reconstruction distortion of power spectrum features using power spectrum features 82 | gfrd_matrix[0, 0] = global_reconstruction_distortion( 83 | power_spectrum_features, power_spectrum_features 84 | ) 85 | 86 | # reconstruction distortion of power spectrum features using bispectrum features 87 | gfrd_matrix[0, 1] = global_reconstruction_distortion( 88 | power_spectrum_features, bispectrum_features 89 | ) 90 | 91
| # reconstruction distortion of bispectrum features using power spectrum features 92 | gfrd_matrix[1, 0] = global_reconstruction_distortion( 93 | bispectrum_features, power_spectrum_features 94 | ) 95 | 96 | 97 | # reconstruction distortion of bispectrum features using bispectrum features 98 | gfrd_matrix[1, 1] = global_reconstruction_distortion( 99 | bispectrum_features, bispectrum_features 100 | ) 101 | 102 | print("Computing GFRD finished.") 103 | 104 | 105 | # %% 106 | # 107 | 108 | 109 | fig, (axGFRE, axGFRD, cbar_ax) = plt.subplots( 110 | 1, 111 | 3, 112 | figsize=(10, 4), 113 | gridspec_kw=dict(width_ratios=(1, 1, 0.2)), 114 | ) 115 | 116 | 117 | pcm1 = axGFRE.imshow(gfre_matrix, vmin=0, vmax=0.25) 118 | axGFRE.set_ylabel("F") 119 | axGFRE.set_xlabel("F'") 120 | axGFRE.set_title("GFRE(F, F')") 121 | 122 | axGFRE.set_xticks([0, 1]) 123 | axGFRE.set_xticklabels(["3-body", "4-body"]) 124 | axGFRE.set_yticks([0, 1]) 125 | axGFRE.set_yticklabels(["3-body", "4-body"]) 126 | 127 | pcm2 = axGFRD.imshow(gfrd_matrix, vmin=0, vmax=0.25) 128 | axGFRD.set_xlabel("F'") 129 | axGFRD.set_title("GFRD(F, F')") 130 | 131 | axGFRD.set_xticks([0, 1]) 132 | axGFRD.set_xticklabels(["3-body", "4-body"]) 133 | axGFRD.set_yticks([0, 1]) 134 | axGFRD.set_yticklabels(["", ""]) 135 | 136 | cbar = fig.colorbar(pcm2, cax=cbar_ax, label="GFRE or GFRD") 137 | plt.show() 138 | 139 | # %% 140 | # 141 | # It can be seen that the reconstruction error of the 4-body features from the 3-body 142 | # features, shown in the upper right corner of the left plot, is large, indicating that 143 | # the dataset contains 4-body information that cannot be well linearly reconstructed 144 | # using 3-body information. This is expected, since the dataset was specifically 145 | # designed for this purpose (for more information please read 146 | # `Pozdnyakov 2020 `_). On the other 147 | # hand, the 3-body features can be perfectly reconstructed from the 4-body features, 148 | # as seen in the lower left corner of the left plot. However, this reconstruction 149 | # distorts the 4-body features significantly, as seen in the lower left corner of the 150 | # right plot, which is typical behaviour of higher-order features and can also be 151 | # observed for polynomial kernel features. 152 | -------------------------------------------------------------------------------- /examples/reconstruction/PlotLFRE.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | """ 4 | Pointwise Local Reconstruction Error 5 | ==================================== 6 | Example for the usage of the 7 | :func:`skmatter.metrics.pointwise_local_reconstruction_error` as the pointwise local 8 | feature reconstruction error (LFRE) on the degenerate CH4 manifold. We apply the local 9 | reconstruction measure on the degenerate CH4 manifold dataset. This dataset was 10 | specifically constructed to be representable by 4-body features (bispectrum) but not 11 | by 3-body features (power spectrum). In other words, the dataset contains environments 12 | which are different but have the same 3-body features. For more details about the 13 | dataset please refer to `Pozdnyakov 2020 14 | `_. 15 | 16 | The skmatter dataset already contains the 3- and 4-body features computed with `librascal 17 | `_ so we can load them and compare them with the 18 | LFRE.
19 | """ 20 | # %% 21 | # 22 | 23 | 24 | import matplotlib as mpl 25 | import matplotlib.pyplot as plt 26 | import numpy as np 27 | 28 | from skmatter.datasets import load_degenerate_CH4_manifold 29 | from skmatter.metrics import pointwise_local_reconstruction_error 30 | 31 | 32 | mpl.rc("font", size=20) 33 | 34 | 35 | # load features 36 | degenerate_manifold = load_degenerate_CH4_manifold() 37 | power_spectrum_features = degenerate_manifold.data.SOAP_power_spectrum 38 | bispectrum_features = degenerate_manifold.data.SOAP_bispectrum 39 | 40 | print(degenerate_manifold.DESCR) 41 | 42 | 43 | # %% 44 | # 45 | 46 | 47 | n_local_points = 20 48 | 49 | print("Computing pointwise LFRE...") 50 | 51 | # %% 52 | 53 | # local reconstruction error of power spectrum features using bispectrum features 54 | power_spectrum_to_bispectrum_pointwise_lfre = pointwise_local_reconstruction_error( 55 | power_spectrum_features, 56 | bispectrum_features, 57 | n_local_points, 58 | train_idx=np.arange(0, len(power_spectrum_features), 2), 59 | test_idx=np.arange(0, len(power_spectrum_features)), 60 | estimator=None, 61 | n_jobs=4, 62 | ) 63 | 64 | # local reconstruction error of bispectrum features using power spectrum features 65 | bispectrum_to_power_spectrum_pointwise_lfre = pointwise_local_reconstruction_error( 66 | bispectrum_features, 67 | power_spectrum_features, 68 | n_local_points, 69 | train_idx=np.arange(0, len(power_spectrum_features), 2), 70 | test_idx=np.arange(0, len(power_spectrum_features)), 71 | estimator=None, 72 | n_jobs=4, 73 | ) 74 | 75 | print("Computing pointwise LFRE finished.") 76 | 77 | print( 78 | "LFRE(3-body, 4-body) = ", 79 | np.linalg.norm(power_spectrum_to_bispectrum_pointwise_lfre) 80 | / np.sqrt(len(power_spectrum_to_bispectrum_pointwise_lfre)), 81 | ) 82 | 83 | print( 84 | "LFRE(4-body, 3-body) = ", 85 | np.linalg.norm(bispectrum_to_power_spectrum_pointwise_lfre) 86 | / np.sqrt(len(power_spectrum_to_bispectrum_pointwise_lfre)), 87 | ) 88 | 89 | 90 | # %% 91 | # 92 | 93 | 94 | fig, (ax34, ax43) = plt.subplots( 95 | 1, 2, constrained_layout=True, figsize=(16, 7.5), sharey="row", sharex=True 96 | ) 97 | 98 | vmax = 0.5 99 | 100 | X, Y = np.meshgrid(np.linspace(0.7, 0.9, 9), np.linspace(-0.1, 0.1, 9)) 101 | pcm = ax34.contourf( 102 | X, 103 | Y, 104 | power_spectrum_to_bispectrum_pointwise_lfre[81:].reshape(9, 9).T, 105 | vmin=0, 106 | vmax=vmax, 107 | ) 108 | 109 | ax43.contourf( 110 | X, 111 | Y, 112 | bispectrum_to_power_spectrum_pointwise_lfre[81:].reshape(9, 9).T, 113 | vmin=0, 114 | vmax=vmax, 115 | ) 116 | 117 | ax34.axhline(y=0, color="red", linewidth=5) 118 | ax43.axhline(y=0, color="red", linewidth=5) 119 | ax34.set_ylabel(r"v/$\pi$") 120 | ax34.set_xlabel(r"u/$\pi$") 121 | ax43.set_xlabel(r"u/$\pi$") 122 | 123 | ax34.set_title(r"$X^-$ LFRE(3-body, 4-body)") 124 | ax43.set_title(r"$X^-$ LFRE(4-body, 3-body)") 125 | 126 | cbar = fig.colorbar(pcm, ax=[ax34, ax43], label="LFRE", location="bottom") 127 | 128 | plt.show() 129 | 130 | # %% 131 | # 132 | # The environments span a manifold which is described by the coordinates :math:`v/\pi` 133 | # and :math:`u/\pi` (please refer to 134 | # `Pozdnyakov 2020 `_ for a concrete 135 | # understanding of the manifold). The LFRE is presented for each environment in the 136 | # manifold in the two contour plots. It can be seen that the reconstruction error 137 | # of 4-body features using 3-body features (the left plot) is most significant along the 138 | # degenerate line (the horizontal red line). 
This agrees with the fact that the 3-body 139 | # features remain the same on the degenerate line and can therefore not reconstruct the 140 | # 4-body features. On the other hand, the 4-body features can perfectly reconstruct the 141 | # 3-body features, as seen in the right plot. 142 | -------------------------------------------------------------------------------- /examples/reconstruction/PlotPointwiseGFRE.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | """ 5 | Pointwise GFRE applied on RKHS features 6 | ======================================= 7 | Example for the usage of the 8 | :func:`skmatter.metrics.pointwise_global_reconstruction_error` as the pointwise global 9 | feature reconstruction error (pointwise GFRE). We apply the pointwise global feature 10 | reconstruction error on the degenerate CH4 manifold dataset containing 3- and 4-body 11 | features computed with `librascal `_. We will 12 | show that using reproducing kernel Hilbert space (RKHS) features can improve the quality 13 | of the reconstruction, with the downside of being less general. 14 | """ 15 | 16 | # %% 17 | # 18 | 19 | 20 | import matplotlib as mpl 21 | import matplotlib.pyplot as plt 22 | import numpy as np 23 | from sklearn.model_selection import train_test_split 24 | from sklearn.preprocessing import KernelCenterer 25 | 26 | from skmatter.datasets import load_degenerate_CH4_manifold 27 | from skmatter.metrics import ( 28 | global_reconstruction_error, 29 | pointwise_global_reconstruction_error, 30 | ) 31 | from skmatter.preprocessing import StandardFlexibleScaler 32 | 33 | 34 | mpl.rc("font", size=20) 35 | 36 | # load features 37 | degenerate_manifold = load_degenerate_CH4_manifold() 38 | power_spectrum_features = degenerate_manifold.data.SOAP_power_spectrum 39 | bispectrum_features = degenerate_manifold.data.SOAP_bispectrum 40 | 41 | # %% 42 | # 43 | # We compare 3-body features with their mapping to the reproducing kernel Hilbert space 44 | # (RKHS) projected to the sample space using the nonlinear radial basis function (RBF) 45 | # kernel 46 | # 47 | # .. math:: 48 | # k^{\textrm{RBF}}(\mathbf{x},\mathbf{x}') = 49 | # \exp(-\gamma \|\mathbf{x}-\mathbf{x}'\|^2),\quad \gamma\in\mathbb{R}_+ 50 | # 51 | # The projected RKHS features are computed using the eigendecomposition of the 52 | # positive-definite kernel matrix :math:`K` 53 | # 54 | # ..
math:: 55 | # K = ADA^T = AD^{\frac12}(AD^{\frac12})^T = \Phi\Phi^T 56 | 57 | 58 | def compute_standardized_rbf_rkhs_features(features, gamma): 59 | """Compute the standardized RBF RKHS features.""" 60 | # standardize features 61 | features = StandardFlexibleScaler().fit_transform(features) 62 | 63 | # compute the pairwise squared distances \|x - x'\|^2 64 | squared_distance = ( 65 | np.sum(features**2, axis=1)[:, np.newaxis] 66 | + np.sum(features**2, axis=1)[np.newaxis, :] 67 | - 2 * features.dot(features.T) 68 | ) 69 | # compute the RBF kernel 70 | kernel = np.exp(-gamma * squared_distance) 71 | 72 | # center kernel 73 | kernel = KernelCenterer().fit_transform(kernel) 74 | 75 | # compute D and A 76 | D, A = np.linalg.eigh(kernel) 77 | 78 | # retain features associated with an eigenvalue above 1e-9 for denoising 79 | select_idx = np.where(D > 1e-9)[0] 80 | 81 | # compute rkhs features 82 | rbf_rkhs_features = A[:, select_idx] @ np.diag(np.sqrt(D[select_idx])) 83 | 84 | # standardize the RKHS features; 85 | # this step could be omitted since it is done by the reconstruction measure by 86 | # default 87 | standardized_rbf_rkhs_features = StandardFlexibleScaler().fit_transform( 88 | rbf_rkhs_features 89 | ) 90 | return standardized_rbf_rkhs_features 91 | 92 | 93 | gamma = 1 94 | rbf_power_spectrum_features = compute_standardized_rbf_rkhs_features( 95 | power_spectrum_features, gamma=gamma 96 | ) 97 | 98 | # %% 99 | # 100 | 101 | # split into train and test idx 102 | idx = np.arange(len(power_spectrum_features)) 103 | 104 | train_idx, test_idx = train_test_split(idx, random_state=42) 105 | 106 | print("Computing pointwise GFRE...") 107 | 108 | # pointwise global reconstruction error of bispectrum features using power spectrum 109 | # features 110 | power_spectrum_to_bispectrum_pointwise_gfre = pointwise_global_reconstruction_error( 111 | power_spectrum_features, bispectrum_features, train_idx=train_idx, test_idx=test_idx 112 | ) 113 | 114 | # pointwise global reconstruction error of bispectrum features using power spectrum 115 | # features mapped to the RKHS 116 | power_spectrum_rbf_to_bispectrum_pointwise_gfre = pointwise_global_reconstruction_error( 117 | rbf_power_spectrum_features, 118 | bispectrum_features, 119 | train_idx=train_idx, 120 | test_idx=test_idx, 121 | ) 122 | 123 | print("Computing pointwise GFRE finished.") 124 | 125 | print("Computing GFRE...") 126 | 127 | # global reconstruction error of bispectrum features using power spectrum features 128 | power_spectrum_to_bispectrum_gfre = global_reconstruction_error( 129 | power_spectrum_features, bispectrum_features, train_idx=train_idx, test_idx=test_idx 130 | ) 131 | 132 | # global reconstruction error of bispectrum features using power spectrum features 133 | # mapped to the RKHS 134 | power_spectrum_rbf_to_bispectrum_gfre = global_reconstruction_error( 135 | rbf_power_spectrum_features, 136 | bispectrum_features, 137 | train_idx=train_idx, 138 | test_idx=test_idx, 139 | ) 140 | 141 | print("Computing GFRE finished.") 142 | 143 | 144 | # %% 145 | # 146 | 147 | fig, axes = plt.subplots(1, 1, figsize=(12, 7)) 148 | 149 | bins = np.linspace(0, 0.5, 10) 150 | axes.hist( 151 | power_spectrum_to_bispectrum_pointwise_gfre, 152 | bins, 153 | alpha=0.5, 154 | label="pointwise GFRE(3-body, 4-body)", 155 | ) 156 | axes.hist( 157 | power_spectrum_rbf_to_bispectrum_pointwise_gfre, 158 | bins, 159 | color="r", 160 | alpha=0.5, 161 | label="pointwise GFRE(3-body RBF, 4-body)", 162 | ) 163 | axes.axvline( 164 | power_spectrum_to_bispectrum_gfre, 165 | color="darkblue", 166 |
label="GFRE(3-body, 4-body)", 167 | linewidth=4, 168 | ) 169 | axes.axvline( 170 | power_spectrum_rbf_to_bispectrum_gfre, 171 | color="darkred", 172 | label="GFRE(3-body RBF RKHS, 4-body)", 173 | linewidth=4, 174 | ) 175 | axes.set_title(f"3-body vs 4-body RBF gamma={gamma} comparison") 176 | axes.set_xlabel("pointwise GFRE") 177 | axes.set_ylabel("number of samples") 178 | axes.legend(fontsize=13) 179 | plt.show() 180 | 181 | 182 | # %% 183 | # 184 | 185 | 186 | print("GFRE(3-body, 4-body) =", power_spectrum_to_bispectrum_gfre) 187 | print("GFRE(3-body RBF RKHS, 4-body) = ", power_spectrum_rbf_to_bispectrum_gfre) 188 | 189 | # %% 190 | # 191 | # It can be seen that RBF RKHS features improve the linear reconstruction of the 192 | # 4-body features (~0.22 in contrast to ~0.19) while also spreading the error for 193 | # individual samples across a wider span of [0, 0.45] in contrast to [0.17, 0.32]. 194 | # This indicates that the reconstruction using the RBF RKHS is less generally 195 | # applicable but instead specific to this dataset 196 | -------------------------------------------------------------------------------- /examples/reconstruction/README.rst: -------------------------------------------------------------------------------- 1 | Feature Reconstruction Measures 2 | =============================== 3 | -------------------------------------------------------------------------------- /examples/regression/README.rst: -------------------------------------------------------------------------------- 1 | Regression 2 | ========== 3 | -------------------------------------------------------------------------------- /examples/selection/FeatureSelection.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | """ 4 | PCovR-Inspired Feature Selection 5 | ================================ 6 | """ 7 | # %% 8 | # 9 | import numpy as np 10 | from matplotlib import cm 11 | from matplotlib import pyplot as plt 12 | from sklearn.linear_model import RidgeCV 13 | from sklearn.preprocessing import StandardScaler 14 | 15 | from skmatter.datasets import load_csd_1000r 16 | from skmatter.feature_selection import CUR, FPS, PCovCUR, PCovFPS 17 | from skmatter.preprocessing import StandardFlexibleScaler 18 | 19 | 20 | cmap = cm.brg 21 | 22 | # %% 23 | # 24 | # For this, we will use the provided CSD dataset, which has 100 features to select from. 25 | 26 | X, y = load_csd_1000r(return_X_y=True) 27 | X = StandardFlexibleScaler(column_wise=False).fit_transform(X) 28 | y = StandardScaler().fit_transform(y.reshape(X.shape[0], -1)) 29 | 30 | 31 | # %% 32 | # 33 | 34 | n = X.shape[-1] // 2 35 | lr = RidgeCV(cv=2, alphas=np.logspace(-10, 1), fit_intercept=False) 36 | 37 | # %% 38 | # 39 | # Feature Selection with CUR + PCovR 40 | # ---------------------------------- 41 | # 42 | # First, let's demonstrate CUR feature selection, and show the ten features chosen with 43 | # a mixing parameter of 0.0, 0.5, and 1.0 perform. 
44 | 45 | for m in np.arange(0, 1.01, 0.5, dtype=np.float32): 46 | if m < 1.0: 47 | idx = PCovCUR(mixing=m, n_to_select=n).fit(X, y).selected_idx_ 48 | else: 49 | idx = CUR(n_to_select=n).fit(X, y).selected_idx_ 50 | 51 | plt.loglog( 52 | range(1, n + 1), 53 | np.array( 54 | [ 55 | lr.fit(X[:, idx[: ni + 1]], y).score(X[:, idx[: ni + 1]], y) 56 | for ni in range(n) 57 | ] 58 | ), 59 | label=m, 60 | c=cmap(m), 61 | marker="o", 62 | ) 63 | 64 | plt.xlabel("Number of Features Selected") 65 | plt.ylabel(r"$R^2$") 66 | plt.legend(title="Mixing \nParameter") 67 | plt.show() 68 | 69 | # %% 70 | # 71 | # Non-iterative feature selection with CUR + PCovR 72 | # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 73 | # 74 | # Computing a non-iterative CUR is more efficient, although it can result in poorer 75 | # performance for larger datasets. You can also use a greater number of 76 | # eigenvectors to compute the feature importance by varying ``k``; for optimal 77 | # results, ``k`` should not exceed the number of targets. 78 | 79 | m = 0.0 80 | 81 | idx = PCovCUR(mixing=m, n_to_select=n).fit(X, y).selected_idx_ 82 | idx_non_it = PCovCUR(mixing=m, recompute_every=0, n_to_select=n).fit(X, y).selected_idx_ 83 | 84 | plt.loglog( 85 | range(1, n + 1), 86 | np.array( 87 | [ 88 | lr.fit(X[:, idx[: ni + 1]], y).score(X[:, idx[: ni + 1]], y) 89 | for ni in range(n) 90 | ] 91 | ), 92 | label="Iterative", 93 | marker="o", 94 | ) 95 | plt.loglog( 96 | range(1, n + 1), 97 | np.array( 98 | [ 99 | lr.fit(X[:, idx_non_it[: ni + 1]], y).score(X[:, idx_non_it[: ni + 1]], y) 100 | for ni in range(n) 101 | ] 102 | ), 103 | label="Non-Iterative", 104 | marker="s", 105 | ) 106 | 107 | plt.xlabel("Number of Features Selected") 108 | plt.ylabel(r"$R^2$") 109 | plt.legend() 110 | plt.show() 111 | 112 | # %% 113 | # 114 | # Feature Selection with FPS + PCovR 115 | # ---------------------------------- 116 | # 117 | # Next, let's look at FPS. We'll choose the first index from CUR at m = 0, which is 46.
118 | 119 | 120 | for m in np.arange(0, 1.01, 0.5, dtype=np.float32): 121 | if m < 1.0: 122 | idx = PCovFPS(mixing=m, n_to_select=n, initialize=46).fit(X, y).selected_idx_ 123 | else: 124 | idx = FPS(n_to_select=n, initialize=46).fit(X, y).selected_idx_ 125 | 126 | plt.loglog( 127 | range(1, n + 1), 128 | np.array( 129 | [ 130 | lr.fit(X[:, idx[: ni + 1]], y).score(X[:, idx[: ni + 1]], y) 131 | for ni in range(n) 132 | ] 133 | ), 134 | label=m, 135 | c=cmap(m), 136 | marker="o", 137 | ) 138 | 139 | plt.xlabel("Number of Features Selected") 140 | plt.ylabel(r"$R^2$") 141 | plt.legend(title="Mixing \nParameter") 142 | plt.show() 143 | -------------------------------------------------------------------------------- /examples/selection/README.rst: -------------------------------------------------------------------------------- 1 | Feature and Sample Selection 2 | ============================ 3 | -------------------------------------------------------------------------------- /examples/selection/Selectors-Pipelines.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | """ 5 | Using scikit-matter selectors with scikit-learn pipelines 6 | ========================================================= 7 | """ 8 | 9 | # %% 10 | # 11 | 12 | 13 | import numpy as np 14 | from matplotlib import pyplot as plt 15 | from sklearn.datasets import load_diabetes 16 | from sklearn.linear_model import RidgeCV 17 | from sklearn.model_selection import train_test_split 18 | from sklearn.pipeline import Pipeline 19 | from sklearn.preprocessing import StandardScaler 20 | 21 | from skmatter.feature_selection import CUR, FPS 22 | 23 | 24 | # %% 25 | # 26 | # Simple integration of scikit-matter selectors 27 | # --------------------------------------------- 28 | # 29 | # This example shows how to use FPS to subselect features before training a RidgeCV. 30 | 31 | 32 | scaler = StandardScaler() 33 | selector = FPS(n_to_select=4) 34 | ridge = RidgeCV(cv=2, alphas=np.logspace(-8, 2, 10)) 35 | 36 | X, y = load_diabetes(return_X_y=True) 37 | 38 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) 39 | 40 | pipe = Pipeline([("scaler", scaler), ("selector", selector), ("ridge", ridge)]) 41 | pipe.fit(X_train.copy(), y_train.copy()) 42 | 43 | plt.scatter(y_test, pipe.predict(X_test)) 44 | plt.gca().set_aspect("equal") 45 | plt.plot(plt.xlim(), plt.xlim(), "r--") 46 | plt.xlabel("True Values") 47 | plt.ylabel("Predicted Values") 48 | plt.show() 49 | 50 | 51 | # %% 52 | # 53 | # Stacking selectors one after another 54 | # ------------------------------------ 55 | # 56 | # This example shows how to apply an FPS selector followed by a CUR selector 57 | # to subselect features before training a RidgeCV.
58 | 59 | 60 | scaler = StandardScaler() 61 | fps = FPS(n_to_select=8) 62 | cur = CUR(n_to_select=4) 63 | ridge = RidgeCV(cv=2, alphas=np.logspace(-8, 2, 10)) 64 | 65 | X, y = load_diabetes(return_X_y=True) 66 | 67 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) 68 | 69 | pipe = Pipeline( 70 | [("scaler", scaler), ("selector1", fps), ("selector2", cur), ("ridge", ridge)] 71 | ) 72 | pipe.fit(X_train.copy(), y_train.copy()) 73 | 74 | plt.scatter(y_test, pipe.predict(X_test)) 75 | plt.gca().set_aspect("equal") 76 | plt.plot(plt.xlim(), plt.xlim(), "r--") 77 | plt.xlabel("True Values") 78 | plt.ylabel("Predicted Values") 79 | plt.show() 80 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ 3 | "setuptools", 4 | "wheel", 5 | ] 6 | build-backend = "setuptools.build_meta" 7 | 8 | [project] 9 | name = "skmatter" 10 | description = "A collection of scikit-learn compatible utilities that implement methods born out of the materials science and chemistry communities." 11 | authors = [ 12 | {name = "Rose K. Cersonsky", email="rose.cersonsky@wisc.edu"}, 13 | {name = "Guillaume Fraux"}, 14 | {name = "Sergei Kliavinek"}, 15 | {name = "Alexander Goscinski"}, 16 | {name = "Benjamin A. Helfrecht"}, 17 | {name = "Victor P. Principe"}, 18 | {name = "Philip Loche"}, 19 | {name = "Michele Ceriotti"} 20 | ] 21 | readme = "README.rst" 22 | requires-python = ">=3.10" 23 | license = {text = "BSD-3-Clause"} 24 | classifiers = [ 25 | "Development Status :: 4 - Beta", 26 | "Environment :: Console", 27 | "Intended Audience :: Science/Research", 28 | "License :: OSI Approved :: BSD License", 29 | "Natural Language :: English", 30 | "Operating System :: POSIX", 31 | "Operating System :: MacOS :: MacOS X", 32 | "Operating System :: Microsoft :: Windows", 33 | "Programming Language :: Python :: 3", 34 | "Programming Language :: Python :: 3.10", 35 | "Programming Language :: Python :: 3.11", 36 | "Programming Language :: Python :: 3.12", 37 | "Programming Language :: Python :: 3.13", 38 | "Topic :: Scientific/Engineering", 39 | ] 40 | dependencies = [ 41 | "scikit-learn >= 1.6.0", 42 | "scipy >= 1.15.0", # explicit to adhere to scikit-learn dependencies 43 | ] 44 | dynamic = ["version"] 45 | 46 | [project.optional-dependencies] 47 | examples = [ 48 | "matplotlib", 49 | "pandas", 50 | "tqdm", 51 | ] 52 | 53 | [project.urls] 54 | homepage = "http://scikit-matter.readthedocs.io" 55 | documentation = "http://scikit-matter.readthedocs.io" 56 | repository = "https://github.com/scikit-learn-contrib/scikit-matter" 57 | issues = "https://github.com/scikit-learn-contrib/scikit-matter/issues" 58 | changelog = "http://scikit-matter.readthedocs.io/en/latest/changelog.html" 59 | 60 | [tool.setuptools.packages.find] 61 | where = ["src"] 62 | 63 | [tool.setuptools.dynamic] 64 | version = {attr = "skmatter.__version__"} 65 | 66 | [tool.coverage.run] 67 | branch = true 68 | data_file = 'tests/.coverage' 69 | 70 | [tool.coverage.report] 71 | include = [ 72 | "src/skmatter/*" 73 | ] 74 | 75 | [tool.coverage.xml] 76 | output = 'tests/coverage.xml' 77 | 78 | [tool.isort] 79 | skip = "__init__.py" 80 | profile = "black" 81 | line_length = 88 82 | indent = 4 83 | include_trailing_comma = true 84 | lines_after_imports = 2 85 | known_first_party = "skmatter" 86 | 87 | [tool.pytest.ini_options] 88 | testpaths = ["tests"] 89 | addopts = [ 90 | "--cov", 91 |
"--cov-append", 92 | "--cov-report=", 93 | "--import-mode=append", 94 | ] 95 | 96 | [tool.ruff] 97 | exclude = ["docs/src/examples/"] 98 | lint.ignore = [ 99 | "F401", 100 | "E203", 101 | "D100", 102 | "D101", 103 | "D102", 104 | "D205", 105 | "D400", 106 | "D401", 107 | ] 108 | line-length = 88 109 | lint.select = [ 110 | "D", 111 | "E", 112 | "F", 113 | "W", 114 | ] 115 | 116 | [tool.ruff.lint.pydocstyle] 117 | convention = "numpy" 118 | 119 | [tool.ruff.lint.per-file-ignores] 120 | "examples/**" = [ 121 | "D205", 122 | "D400", 123 | ] 124 | -------------------------------------------------------------------------------- /src/skmatter/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | scikit-matter 3 | ============= 4 | 5 | scikit-matter is a toolbox of methods developed in the computational chemical and 6 | materials science community, following the `scikit-learn `_ API and 7 | coding guidelines to promote usability and interoperability with existing workflows. 8 | """ 9 | 10 | __version__ = "0.3.0-dev" 11 | -------------------------------------------------------------------------------- /src/skmatter/clustering/__init__.py: -------------------------------------------------------------------------------- 1 | r""" 2 | The module implements the quick shift clustering algorithm, which is used in 3 | probabilistic analysis of molecular motifs (PAMM). See `Gasparotto and Ceriotti 4 | `_ for more details. 5 | """ 6 | 7 | from ._quick_shift import QuickShift 8 | 9 | __all__ = [ 10 | "QuickShift", 11 | ] 12 | -------------------------------------------------------------------------------- /src/skmatter/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | """Datasets used for example and testing.""" 2 | 3 | from ._base import ( 4 | load_csd_1000r, 5 | load_degenerate_CH4_manifold, 6 | load_hbond_dataset, 7 | load_nice_dataset, 8 | load_roy_dataset, 9 | load_who_dataset, 10 | ) 11 | 12 | 13 | __all__ = [ 14 | "load_degenerate_CH4_manifold", 15 | "load_csd_1000r", 16 | "load_hbond_dataset", 17 | "load_nice_dataset", 18 | "load_roy_dataset", 19 | "load_who_dataset", 20 | ] 21 | -------------------------------------------------------------------------------- /src/skmatter/datasets/_base.py: -------------------------------------------------------------------------------- 1 | from os.path import dirname, join 2 | 3 | import numpy as np 4 | import sklearn 5 | 6 | 7 | if sklearn.__version__ >= "1.5.0": 8 | from sklearn.utils._optional_dependencies import check_pandas_support 9 | else: 10 | from sklearn.utils import check_pandas_support 11 | 12 | from sklearn.utils import Bunch 13 | 14 | 15 | def load_nice_dataset(): 16 | """Load and returns NICE dataset. 17 | 18 | Returns 19 | ------- 20 | nice_data : sklearn.utils.Bunch 21 | Dictionary-like object, with the following attributes: 22 | data : `sklearn.utils.Bunch` -- 23 | contains the keys ``X`` and ``y``. 24 | Structural NICE features and energies, respectively. 25 | DESCR: `str` -- 26 | The full description of the dataset. 
27 | """ 28 | module_path = dirname(__file__) 29 | target_filename = join(module_path, "data", "nice_dataset.npz") 30 | raw_data = np.load(target_filename) 31 | data = Bunch( 32 | X=raw_data["structural_features"], 33 | y=raw_data["energies"], 34 | ) 35 | with open(join(module_path, "descr", "nice_dataset.rst")) as rst_file: 36 | fdescr = rst_file.read() 37 | return Bunch(data=data, DESCR=fdescr) 38 | 39 | 40 | def load_degenerate_CH4_manifold(): 41 | """Load and return the degenerate manifold dataset. 42 | 43 | Returns 44 | ------- 45 | degenerate_CH4_manifold_data : sklearn.utils.Bunch 46 | Dictionary-like object, with the following attributes: 47 | 48 | data : `sklearn.utils.Bunch` -- 49 | contains the keys ``SOAP_power_spectrum`` and ``SOAP_bispectrum``. 50 | Two representations of the carbon environments of the 51 | degenerate manifold dataset. 52 | 53 | DESCR: `str` -- 54 | The full description of the dataset. 55 | """ 56 | module_path = dirname(__file__) 57 | target_filename = join(module_path, "data", "degenerate_CH4_manifold.npz") 58 | raw_data = np.load(target_filename) 59 | data = Bunch( 60 | SOAP_power_spectrum=raw_data["SOAP_power_spectrum"], 61 | SOAP_bispectrum=raw_data["SOAP_bispectrum"], 62 | ) 63 | with open(join(module_path, "descr", "degenerate_CH4_manifold.rst")) as rst_file: 64 | fdescr = rst_file.read() 65 | 66 | return Bunch(data=data, DESCR=fdescr) 67 | 68 | 69 | def load_csd_1000r(return_X_y=False): 70 | """Load and return the minimal CSD dataset. 71 | 72 | Returns 73 | ------- 74 | csd1000r : sklearn.utils.Bunch 75 | Dictionary-like object, with the following attributes: 76 | 77 | data : `sklearn.utils.Bunch` -- 78 | contains the keys ``X`` and ``Y``, corresponding to the 79 | FPS-reduced SOAP vectors and local NMR chemical shielding, respectively, 80 | for 100 selected environments of the CSD-1000r dataset. 81 | 82 | DESCR: `str` -- 83 | The full description of the dataset. 84 | """ 85 | module_path = dirname(__file__) 86 | target_filename = join(module_path, "data", "csd-1000r.npz") 87 | raw_data = np.load(target_filename) 88 | if not return_X_y: 89 | data = Bunch( 90 | X=raw_data["X"], 91 | y=raw_data["Y"], 92 | ) 93 | with open(join(module_path, "descr", "csd-1000r.rst")) as rst_file: 94 | fdescr = rst_file.read() 95 | 96 | return Bunch(data=data, DESCR=fdescr) 97 | else: 98 | return raw_data["X"], raw_data["Y"] 99 | 100 | 101 | def load_who_dataset(): 102 | """Load and returns WHO dataset. 103 | 104 | Returns 105 | ------- 106 | who_dataset : sklearn.utils.Bunch 107 | Dictionary-like object, with the following attributes: 108 | data : `pandas.core.frame.DataFrame` -- the WHO dataset 109 | as a Pandas dataframe. 110 | DESCR: `str` -- The full description of the dataset. 111 | """ 112 | module_path = dirname(__file__) 113 | target_filename = join(module_path, "data", "who_dataset.csv") 114 | pd = check_pandas_support("load_who_dataset") 115 | raw_data = pd.read_csv(target_filename) 116 | with open(join(module_path, "descr", "who_dataset.rst")) as rst_file: 117 | fdescr = rst_file.read() 118 | return Bunch(data=raw_data, DESCR=fdescr) 119 | 120 | 121 | def load_roy_dataset(): 122 | """Load and returns the ROY dataset, which contains densities, 123 | energies and SOAP-derived descriptors for 264 structures of polymorphs of ROY, 124 | from [Beran et Al, Chemical Science (2022)](https://doi.org/10.1039/D1SC06074K) 125 | Each structure is labeled as "Known" or "Unknown". 
138 | 
139 |     Returns
140 |     -------
141 |     roy_dataset : sklearn.utils.Bunch
142 |         Dictionary-like object, with the following attributes:
143 |             densities : `np.array` -- the densities of the structures
144 |             structure_types : `np.array` -- the types of the structures
145 |             features : `np.array` -- SOAP-derived descriptors for the structures
146 |             energies : `np.array` -- energies of the structures
147 |     """
148 |     module_path = dirname(__file__)
149 |     target_properties = join(module_path, "data", "beran_roy_properties.npz")
150 |     properties = np.load(target_properties)
151 | 
152 |     return Bunch(
153 |         densities=properties["densities"],
154 |         energies=properties["energies"],
155 |         structure_types=properties["structure_types"],
156 |         features=properties["feats"],
157 |     )
158 | 
159 | 
160 | def load_hbond_dataset():
161 |     """Load and return the hydrogen bond dataset, which contains
162 |     a set of 3D descriptors for 27233 hydrogen bonds and corresponding
163 |     weights, from `Gasparotto et al., The Journal of Chemical Physics
164 |     <https://doi.org/10.1063/1.4900655>`_.
165 | 
166 |     Returns
167 |     -------
168 |     hbond_dataset : sklearn.utils.Bunch
169 |         Dictionary-like object, with the following attributes:
170 |             descriptors : `numpy.ndarray` -- the descriptors of the hydrogen bond dataset
171 |             weights : `numpy.ndarray` -- the weights of each sample in the dataset
172 |     """
173 |     module_path = dirname(__file__)
174 |     target_filename = join(module_path, "data", "h2o-blyp-piglet.npz")
175 |     raw_data = np.load(target_filename)
176 | 
177 |     with open(join(module_path, "descr", "h2o-blyp-piglet.rst")) as rst_file:
178 |         fdescr = rst_file.read()
179 | 
180 |     return Bunch(
181 |         descriptors=raw_data["descriptors"],
182 |         weights=raw_data["weights"],
183 |         DESCR=fdescr,
184 |     )
185 | 
--------------------------------------------------------------------------------
/src/skmatter/datasets/data/beran_roy_properties.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/scikit-matter/77fb4eac5ba9ebe7c1c17bf6b82d41c02dd57787/src/skmatter/datasets/data/beran_roy_properties.npz
--------------------------------------------------------------------------------
/src/skmatter/datasets/data/csd-1000r.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/scikit-matter/77fb4eac5ba9ebe7c1c17bf6b82d41c02dd57787/src/skmatter/datasets/data/csd-1000r.npz
--------------------------------------------------------------------------------
/src/skmatter/datasets/data/degenerate_CH4_manifold.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/scikit-matter/77fb4eac5ba9ebe7c1c17bf6b82d41c02dd57787/src/skmatter/datasets/data/degenerate_CH4_manifold.npz
--------------------------------------------------------------------------------
/src/skmatter/datasets/data/h2o-blyp-piglet.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/scikit-matter/77fb4eac5ba9ebe7c1c17bf6b82d41c02dd57787/src/skmatter/datasets/data/h2o-blyp-piglet.npz
--------------------------------------------------------------------------------
/src/skmatter/datasets/data/nice_dataset.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/scikit-matter/77fb4eac5ba9ebe7c1c17bf6b82d41c02dd57787/src/skmatter/datasets/data/nice_dataset.npz -------------------------------------------------------------------------------- /src/skmatter/datasets/descr/csd-1000r.rst: -------------------------------------------------------------------------------- 1 | .. _csd: 2 | 3 | CSD-1000R 4 | ######### 5 | 6 | This dataset, intended for model testing, contains the SOAP power spectrum features and 7 | local NMR chemical shieldings for 100 environments selected from CSD-1000r, originally 8 | published in [Ceriotti2019]_. 9 | 10 | Function Call 11 | ------------- 12 | 13 | .. function:: skmatter.datasets.load_csd_1000r 14 | 15 | Data Set Characteristics 16 | ------------------------ 17 | 18 | :Number of Instances: Each representation 100 19 | 20 | :Number of Features: Each representation 100 21 | 22 | The representations were computed with [C1]_ using the hyperparameters: 23 | 24 | :rascal hyperparameters: 25 | 26 | +---------------------------+------------+ 27 | | key | value | 28 | +===========================+============+ 29 | | interaction_cutoff: | 3.5 | 30 | +---------------------------+------------+ 31 | | max_radial: | 6 | 32 | +---------------------------+------------+ 33 | | max_angular: | 6 | 34 | +---------------------------+------------+ 35 | | gaussian_sigma_constant: | 0.4 | 36 | +---------------------------+------------+ 37 | | gaussian_sigma_type: | "Constant"| 38 | +---------------------------+------------+ 39 | | cutoff_smooth_width: | 0.5 | 40 | +---------------------------+------------+ 41 | | normalize: | True | 42 | +---------------------------+------------+ 43 | 44 | Of the 2'520 resulting features, 100 were selected via FPS using [C2]_. 45 | 46 | Chemical Properties 47 | ------------------- 48 | 49 | The CSD-1000R dataset consists of 100 atomic environments selected from crystal 50 | structures in the Cambridge Structural Database (CSD) [C3]_. These environments 51 | represent a diverse set of chemical compositions and bonding types, including: 52 | 53 | - Metals, metalloids, and non-metals 54 | - Covalent, ionic, and metallic bonding environments 55 | - Various coordination numbers and geometries 56 | 57 | The dataset captures local chemical environments relevant for modeling properties 58 | such as nuclear magnetic resonance (NMR) chemical shieldings, aiding in the 59 | understanding of structure-property relationships in materials chemistry. 60 | 61 | For more detailed chemical information, users can refer to the original Cambridge 62 | Structural Database [C3]_ or the publication by Ceriotti et al. (2019) [C4]_. 63 | 64 | References 65 | ---------- 66 | 67 | .. [C1] https://github.com/lab-cosmo/librascal commit ade202a6 68 | .. [C2] https://github.com/lab-cosmo/scikit-matter commit 4ed1d92 69 | .. [C3] https://www.ccdc.cam.ac.uk/structures/ 70 | .. [C4] https://www.nature.com/articles/s41597-019-0224-1 71 | 72 | Reference Code 73 | -------------- 74 | 75 | .. 
code-block:: python
76 | 
77 |     # NOTE: imports added for completeness -- the original script assumed
78 |     # ASE for I/O and librascal (see [C1]_) for the SOAP features
79 |     import numpy as np
80 |     from ase.io import read
81 |     from rascal.representations import SphericalInvariants as SOAP
82 | 
83 |     from skmatter.preprocessing import StandardFlexibleScaler
84 |     from skmatter.sample_selection import FPS
85 | 
86 |     # read all of the frames and book-keep the centers and species
87 |     filename = "/path/to/CSD-1000R.xyz"
88 |     frames = np.asarray(
89 |         read(filename, ":"),
90 |         dtype=object,
91 |     )
92 | 
93 |     n_centers = np.array([len(frame) for frame in frames])
94 |     center_idx = np.array([i for i, f in enumerate(frames) for _ in f])
95 |     n_env_accum = np.zeros(len(frames) + 1, dtype=int)
96 |     n_env_accum[1:] = np.cumsum(n_centers)
97 | 
98 |     numbers = np.concatenate([frame.numbers for frame in frames])
99 | 
100 |     # compute radial soap vectors as first pass
101 |     hypers = dict(
102 |         soap_type="PowerSpectrum",
103 |         interaction_cutoff=2.5,
104 |         max_radial=6,
105 |         max_angular=0,
106 |         gaussian_sigma_type="Constant",
107 |         gaussian_sigma_constant=0.4,
108 |         cutoff_smooth_width=0.5,
109 |         normalize=False,
110 |         global_species=[1, 6, 7, 8],
111 |         expansion_by_species_method="user defined",
112 |     )
113 |     soap = SOAP(**hypers)
114 | 
115 |     X_raw = StandardFlexibleScaler(column_wise=False).fit_transform(
116 |         soap.transform(frames).get_features(soap)
117 |     )
118 | 
119 |     # rank the environments in terms of diversity
120 |     n_samples = 500
121 |     i_selected = FPS(n_to_select=n_samples, initialize=0).fit(X_raw).selected_idx_
122 | 
123 |     # book-keep which frames these samples belong in
124 |     f_selected = center_idx[i_selected]
125 |     reduced_f_selected = list(sorted(set(f_selected)))
126 |     frames_selected = frames[f_selected].copy()
127 |     ci_selected = i_selected - n_env_accum[f_selected]
128 | 
129 |     properties_select = [
130 |         frames[fi].arrays["CS_local"][ci] for fi, ci in zip(f_selected, ci_selected)
131 |     ]
132 | 
--------------------------------------------------------------------------------
/src/skmatter/datasets/descr/degenerate_CH4_manifold.rst:
--------------------------------------------------------------------------------
1 | .. _degenerate_manifold:
2 | 
3 | Degenerate CH4 manifold
4 | #######################
5 | 
6 | The dataset contains two representations (SOAP power spectrum and bispectrum) of the
7 | two manifolds spanned by the carbon atoms of two times 81 methane structures. In the
8 | SOAP power spectrum representation the two manifolds intersect, creating a degenerate
9 | manifold/line along which the representation remains the same. In contrast, for
10 | higher body-order representations such as the (SOAP) bispectrum, the carbon atoms
11 | can be uniquely represented and do not create a degenerate manifold. Following the
12 | naming convention of [Pozdnyakov2020]_, for each representation the first 81 samples
13 | correspond to the X minus manifold and the second 81 samples contain the X plus
14 | manifold.
15 | 
16 | Function Call
17 | -------------
18 | 
19 | .. function:: skmatter.datasets.load_degenerate_CH4_manifold
20 | 
21 | Data Set Characteristics
22 | ------------------------
23 | 
24 | :Number of Instances: Each representation 162
25 | 
26 | :Number of Features: Each representation 12
27 | 
28 | The representations were computed with [D1]_ using the hyperparameters:
29 | 
30 | :rascal hyperparameters:
31 | 
32 | +---------------------------+------------+
33 | | key                       | value      |
34 | +===========================+============+
35 | | radial_basis:             | "GTO"      |
36 | +---------------------------+------------+
37 | | interaction_cutoff:       | 4          |
38 | +---------------------------+------------+
39 | | max_radial:               | 2          |
40 | +---------------------------+------------+
41 | | max_angular:              | 2          |
42 | +---------------------------+------------+
43 | | gaussian_sigma_constant:  | 0.5        |
44 | +---------------------------+------------+
45 | | gaussian_sigma_type:      | "Constant" |
46 | +---------------------------+------------+
47 | | cutoff_smooth_width:      | 0.5        |
48 | +---------------------------+------------+
49 | | normalize:                | False      |
50 | +---------------------------+------------+
51 | 
52 | The SOAP bispectrum features were in addition reduced to 12 features with principal
53 | component analysis (PCA) [D2]_.
54 | 
55 | References
56 | ----------
57 | 
58 | .. [D1] https://github.com/lab-cosmo/librascal commit 8d9ad7a
59 | .. [D2] https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html
60 | 
--------------------------------------------------------------------------------
/src/skmatter/datasets/descr/h2o-blyp-piglet.rst:
--------------------------------------------------------------------------------
1 | .. _water:
2 | 
3 | H2O-BLYP-Piglet
4 | ###############
5 | 
6 | This dataset contains 27233 hydrogen bond descriptors and corresponding weights from a
7 | trajectory of a classical simulation performed with a BLYP exchange-correlation
8 | functional and a DZVP basis set. The simulation box contained 64 water molecules. This
9 | dataset was originally published in
10 | [Gasparotto2014]_.
11 | 
12 | Function Call
13 | -------------
14 | 
15 | .. function:: skmatter.datasets.load_hbond_dataset
16 | 
17 | Data Set Characteristics
18 | ------------------------
19 | 
20 | :Number of Instances: 27233
21 | 
22 | :Number of Features: 3
23 | 
24 | Reference
25 | ---------
26 | 
27 | [1] https://github.com/lab-cosmo/pamm/tree/master/examples/water
28 | 
29 | Reference Code
30 | --------------
31 | 
32 | [2] https://github.com/GardevoirX/pypamm/blob/master/tutorials/water/tutorial.ipynb
33 | 
34 | [3] https://github.com/lab-cosmo/pamm/blob/master/examples/water/README
35 | 
--------------------------------------------------------------------------------
/src/skmatter/datasets/descr/nice_dataset.rst:
--------------------------------------------------------------------------------
1 | .. _nice-dataset:
2 | 
3 | NICE dataset
4 | ############
5 | 
6 | This is a toy dataset containing NICE [1, 4] (N-body Iterative Contraction of
7 | Equivariants) features for the first 500 configurations of the dataset [2, 3] with
8 | randomly displaced methane configurations.
9 | 
10 | Function Call
11 | -------------
12 | 
13 | .. function:: skmatter.datasets.load_nice_dataset
14 | 
15 | Data Set Characteristics
16 | ------------------------
17 | 
18 | :Number of Instances: 500
19 | 
20 | :Number of Features: 160
21 | 
22 | The representations were computed with the NICE package [4] using the following
23 | definition of the NICE calculator:
24 | 
25 | .. code-block:: python
26 | 
27 |     StandardSequence(
28 |         [
29 |             StandardBlock(
30 |                 ThresholdExpansioner(num_expand=150),
31 |                 CovariantsPurifierBoth(max_take=10),
32 |                 IndividualLambdaPCAsBoth(n_components=50),
33 |                 ThresholdExpansioner(num_expand=300, mode="invariants"),
34 |                 InvariantsPurifier(max_take=50),
35 |                 InvariantsPCA(n_components=30),
36 |             ),
37 |             StandardBlock(
38 |                 ThresholdExpansioner(num_expand=150),
39 |                 CovariantsPurifierBoth(max_take=10),
40 |                 IndividualLambdaPCAsBoth(n_components=50),
41 |                 ThresholdExpansioner(num_expand=300, mode="invariants"),
42 |                 InvariantsPurifier(max_take=50),
43 |                 InvariantsPCA(n_components=20),
44 |             ),
45 |             StandardBlock(
46 |                 None,
47 |                 None,
48 |                 None,
49 |                 ThresholdExpansioner(num_expand=300, mode="invariants"),
50 |                 InvariantsPurifier(max_take=50),
51 |                 InvariantsPCA(n_components=20),
52 |             ),
53 |         ],
54 |         initial_scaler=InitialScaler(mode="signal integral", individually=True),
55 |     )
56 | 
57 | 
58 | References
59 | ----------
60 | 
61 | [1] Jigyasa Nigam, Sergey Pozdnyakov, and Michele Ceriotti. "Recursive evaluation and
62 |     iterative contraction of N-body equivariant features." The Journal of Chemical
63 |     Physics 153.12 (2020): 121101.
64 | 
65 | [2] Incompleteness of Atomic Structure Representations
66 |     Sergey N. Pozdnyakov, Michael J. Willatt, Albert P. Bartók, Christoph Ortner,
67 |     Gábor Csányi, and Michele Ceriotti
68 | 
69 | [3] https://archive.materialscloud.org/record/2020.110
70 | 
71 | Reference Code
72 | --------------
73 | 
74 | [4] https://github.com/lab-cosmo/nice
75 | 
--------------------------------------------------------------------------------
/src/skmatter/datasets/descr/who_dataset.rst:
--------------------------------------------------------------------------------
1 | .. _who:
2 | 
3 | WHO dataset
4 | ###########
5 | 
6 | ``who_dataset.csv`` is a compilation of multiple publicly-available datasets
7 | through data.worldbank.org. Specifically, the following versioned datasets are used:
8 | 
9 | - NY.GDP.PCAP.CD (v2_4770383) [1]_
10 | - SE.XPD.TOTL.GD.ZS (v2_4773094) [2]_
11 | - SH.DYN.AIDS.ZS (v2_4770518) [3]_
12 | - SH.IMM.IDPT (v2_4770682) [4]_
13 | - SH.IMM.MEAS (v2_4774112) [5]_
14 | - SH.TBS.INCD (v2_4770775) [6]_
15 | - SH.XPD.CHEX.GD.ZS (v2_4771258) [7]_
16 | - SN.ITK.DEFC.ZS (v2_4771336) [8]_
17 | - SP.DYN.LE00.IN (v2_4770556) [9]_
18 | - SP.POP.TOTL (v2_4770385) [10]_
19 | 
20 | where the corresponding file names are ``API_{dataset}_DS2_excel_en_{version}.xls``.
21 | 
22 | This dataset, intended only for demonstration, contains 2020 country-year pairings and
23 | the corresponding values above.
24 | Function Call
25 | -------------
26 | 
27 | .. function:: skmatter.datasets.load_who_dataset
28 | 
29 | Data Set Characteristics
30 | ------------------------
31 | 
32 | :Number of Instances: 2020
33 | 
34 | :Number of Features: 10
35 | 
36 | References
37 | ----------
38 | 
39 | .. [1] https://data.worldbank.org/indicator/NY.GDP.PCAP.CD
40 | .. [2] https://data.worldbank.org/indicator/SE.XPD.TOTL.GD.ZS
41 | .. [3] https://data.worldbank.org/indicator/SH.DYN.AIDS.ZS
42 | .. [4] https://data.worldbank.org/indicator/SH.IMM.IDPT
43 | .. [5] https://data.worldbank.org/indicator/SH.IMM.MEAS
44 | .. [6] https://data.worldbank.org/indicator/SH.TBS.INCD
45 | .. [7] https://data.worldbank.org/indicator/SH.XPD.CHEX.GD.ZS
46 | .. [8] https://data.worldbank.org/indicator/SN.ITK.DEFC.ZS
47 | .. [9] https://data.worldbank.org/indicator/SP.DYN.LE00.IN
48 | .. 
[10] https://data.worldbank.org/indicator/SP.POP.TOTL 49 | 50 | 51 | Reference Code 52 | -------------- 53 | 54 | The following script is compiled, where the datasets have been placed in a 55 | folder named ``who_data``: 56 | 57 | .. code-block:: python 58 | 59 | import os 60 | import pandas as pd 61 | import numpy as np 62 | 63 | files = os.listdir("who_data/") 64 | indicators = [f[4 : f[4:].index("_") + 4] for f in files] 65 | indicator_codes = {} 66 | data_dict = {} 67 | entries = [] 68 | 69 | for file in files: 70 | data = pd.read_excel( 71 | "who_data/" + file, 72 | header=3, 73 | sheet_name="Data", 74 | index_col=0, 75 | ) 76 | 77 | indicator = data["Indicator Code"].values[0] 78 | indicator_codes[indicator] = data["Indicator Name"].values[0] 79 | 80 | for index in data.index: 81 | for year in range(1900, 2022): 82 | if str(year) in data.loc[index] and not np.isnan( 83 | data.loc[index].loc[str(year)] 84 | ): 85 | if (index, year) not in data_dict: 86 | data_dict[(index, year)] = np.nan * np.ones(len(indicators)) 87 | data_dict[(index, year)][indicators.index(indicator)] = data.loc[ 88 | index 89 | ].loc[str(year)] 90 | 91 | with open("who_data.csv", "w") as outf: 92 | outf.write("Country,Year," + ",".join(indicators) + "\n") 93 | for key, data in data_dict.items(): 94 | if np.count_nonzero(~np.isnan(np.array(data, dtype=float))) == len( 95 | indicators 96 | ): 97 | outf.write( 98 | "{},{},{}\n".format( 99 | key[0].replace(",", " "), 100 | key[1], 101 | ",".join([str(d) for d in data]), 102 | ) 103 | ) 104 | -------------------------------------------------------------------------------- /src/skmatter/decomposition/__init__.py: -------------------------------------------------------------------------------- 1 | r""" 2 | Often, one wants to construct new ML features from their current representation 3 | in order to compress data or visualise trends in the dataset. In the archetypal 4 | method for this dimensionality reduction, principal components analysis (PCA), 5 | features are transformed into the latent space which best preserves the 6 | variance of the original data. 7 | 8 | This module provides the Principal Covariates 9 | Regression (PCovR), as introduced by [deJong1992]_, which is a modification to PCA 10 | that incorporates target information, such that the resulting embedding could 11 | be tuned using a mixing parameter α to improve performance in regression tasks 12 | (:math:`\alpha = 0` corresponding to linear regression and :math:`\alpha = 1` 13 | corresponding to PCA). Also provided is Principal Covariates Classification (PCovC), 14 | proposed in [Jorgensen2025]_, which can similarly be used for classification problems. 15 | 16 | [Helfrecht2020]_ introduced the non-linear version of PCovR, 17 | Kernel Principal Covariates Regression (KPCovR), where the mixing parameter α 18 | now interpolates between kernel ridge regression (:math:`\alpha = 0`) and 19 | kernel principal components analysis (KPCA, :math:`\alpha = 1`). 20 | 21 | The module includes: 22 | 23 | * :ref:`PCovR-api` the standard Principal Covariates Regression. Utilises a 24 | combination between a PCA-like and an LR-like loss, and therefore attempts to find 25 | a low-dimensional projection of the feature vectors that simultaneously minimises 26 | information loss and error in predicting the target properties using only the 27 | latent space vectors :math:`\mathbf{T}`. 28 | * :ref:`PCovC-api` the standard Principal Covariates Classification, proposed in 29 | [Jorgensen2025]_. 
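30 | * :ref:`KPCovR-api` the Kernel Principal Covariates Regression.
31 |   A kernel-based variation on the
32 |   original PCovR method, proposed in [Helfrecht2020]_.
33 | 
34 | A minimal usage sketch of PCovR on random data (for illustration only; in
35 | practice the inputs are typically centered and scaled first, e.g. with
36 | :class:`skmatter.preprocessing.StandardFlexibleScaler`):
37 | 
38 | >>> import numpy as np
39 | >>> from skmatter.decomposition import PCovR
40 | >>> X = np.random.RandomState(0).normal(size=(20, 4))
41 | >>> y = X @ np.array([1.0, -1.0, 0.5, 0.0])
42 | >>> T = PCovR(mixing=0.5, n_components=2).fit_transform(X, y)
43 | >>> T.shape
44 | (20, 2)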
45 | """
46 | 
47 | from ._pcov import _BasePCov
48 | 
49 | from ._pcovr import PCovR
50 | from ._pcovc import PCovC
51 | 
52 | from ._kernel_pcovr import KernelPCovR
53 | 
54 | __all__ = [
55 |     "_BasePCov",
56 |     "PCovR",
57 |     "PCovC",
58 |     "KernelPCovR",
59 | ]
60 | 
--------------------------------------------------------------------------------
/src/skmatter/feature_selection/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | The :mod:`skmatter.feature_selection` module includes FPS and CUR selection, each
3 | with the optional PCov-flavor.
4 | """
5 | 
6 | from ._base import (
7 |     CUR,
8 |     FPS,
9 |     PCovCUR,
10 |     PCovFPS,
11 | )
12 | 
13 | __all__ = ["PCovFPS", "PCovCUR", "FPS", "CUR"]
14 | 
--------------------------------------------------------------------------------
/src/skmatter/linear_model/__init__.py:
--------------------------------------------------------------------------------
1 | """Classes for building linear models."""
2 | 
3 | from ._base import OrthogonalRegression
4 | from ._ridge import Ridge2FoldCV
5 | 
6 | __all__ = ["OrthogonalRegression", "Ridge2FoldCV"]
7 | 
--------------------------------------------------------------------------------
/src/skmatter/linear_model/_base.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from scipy.linalg import orthogonal_procrustes
3 | from sklearn.base import BaseEstimator, MultiOutputMixin, RegressorMixin
4 | from sklearn.linear_model import LinearRegression
5 | from sklearn.utils import check_array, check_X_y
6 | from sklearn.utils.validation import check_is_fitted
7 | 
8 | 
9 | class OrthogonalRegression(MultiOutputMixin, RegressorMixin, BaseEstimator):
10 |     r"""Orthogonal regression by solving the Procrustes problem
11 | 
12 |     Linear regression with the additional constraint that the weight matrix
13 |     must be an orthogonal matrix/projection. It minimizes the Procrustes
14 |     problem:
15 | 
16 |     .. math::
17 | 
18 |         \min_\Omega \|y - X\Omega\|_F \quad\mathrm{subject\ to}\quad \Omega^T\Omega=I
19 | 
20 |     Parameters
21 |     ----------
22 |     use_orthogonal_projector : bool, default=True
23 |         Controls if orthogonal projectors are used to predict y fitting on X.
24 |         If this parameter is set to False, X and y are padded with zeros to the
25 |         larger number of features of X and y. The projection method is similar
26 |         to the procedure in the computation of the GFRD in the first version of
27 |         Ref. [Goscinski2021]_. The method has been adapted to obtain a full
28 |         weight matrix.
29 | 
30 |         The projection can introduce nonanalytic behavior with respect to
31 |         changes in dimensions of X for cases where X n_features > y n_targets.
32 |         See ``examples/OrthogonalRegressionNonAnalytic_no-doc.ipynb``
33 | 
34 |     linear_estimator : object implementing fit/predict, default=None
35 |         The linear estimator is used when `use_orthogonal_projector`
36 |         is set to True, to compute the projection matrix
37 | 
38 |     Attributes
39 |     ----------
40 |     max_components_ : int
41 |         The source X and target y are padded with zeros to match in feature/target
42 |         dimension, when `use_orthogonal_projector` is set to False. This attribute
43 |         is set to the maximum of the feature and target dimension.
44 | 
45 |     coef_ : numpy.ndarray of shape (n_features,) or (n_targets, n_features) or (max_components, max_components)
46 |         Weight matrix. The shape (max_components, max_components) is used if
47 |         `use_orthogonal_projector` is set to False.
48 |     """ # NoQa: E501
49 | 
50 |     def __init__(self, use_orthogonal_projector=True, linear_estimator=None):
51 |         self.use_orthogonal_projector = use_orthogonal_projector
52 |         self.linear_estimator = linear_estimator
53 | 
54 |     def fit(self, X, y):
55 |         """
56 |         Parameters
57 |         ----------
58 |         X : numpy.ndarray of shape (n_samples, n_features)
59 |             Training data, where ``n_samples`` is the number of samples and
60 |             ``n_features`` is the number of features.
61 |         y : numpy.ndarray of shape (n_samples, n_targets)
62 |             Training data, where ``n_samples`` is the number of samples and
63 |             ``n_targets`` is the number of target properties.
64 |         """
65 |         X, y = check_X_y(
66 |             X,
67 |             y,
68 |             y_numeric=True,
69 |             ensure_min_features=1,
70 |             ensure_min_samples=1,
71 |             multi_output=True,
72 |         )
73 | 
74 |         self.n_samples_in_, self.n_features_in_ = X.shape
75 |         if self.use_orthogonal_projector:
76 |             # check estimator
77 |             linear_estimator = (
78 |                 LinearRegression()
79 |                 if self.linear_estimator is None
80 |                 else self.linear_estimator
81 |             )
82 |             # compute orthogonal projectors
83 |             linear_estimator.fit(X, y)
84 |             coef = np.reshape(linear_estimator.coef_.T, (X.shape[1], -1))
85 |             U, _, Vt = np.linalg.svd(coef, full_matrices=False)
86 | 
87 |             # compute weights by solving the Procrustes problem
88 |             self.coef_ = (
89 |                 U
90 |                 @ orthogonal_procrustes(X @ U, y.reshape(X.shape[0], -1) @ Vt.T)[0]
91 |                 @ Vt
92 |             ).T
93 |         else:
94 |             self.max_components_ = max(X.shape[1], y.shape[1])
95 |             X = np.pad(X, [(0, 0), (0, self.max_components_ - X.shape[1])])
96 |             y = np.pad(y, [(0, 0), (0, self.max_components_ - y.shape[1])])
97 |             self.coef_ = orthogonal_procrustes(X, y)[0].T
98 | 
99 |         return self
100 | 
101 |     def predict(self, X):
102 |         """
103 |         Parameters
104 |         ----------
105 |         X : numpy.ndarray of shape (n_samples, n_features)
106 |             Samples to predict, where n_samples is the number of samples and
107 |             n_features is the number of features.
108 |         """
109 |         X = check_array(X, ensure_min_features=1, ensure_min_samples=1)
110 |         check_is_fitted(self, ["coef_"])
111 | 
112 |         if not self.use_orthogonal_projector:
113 |             X = np.pad(X, [(0, 0), (0, self.max_components_ - X.shape[1])])
114 |         return X @ self.coef_.T
115 | 
--------------------------------------------------------------------------------
/src/skmatter/metrics/__init__.py:
--------------------------------------------------------------------------------
1 | """Set of metrics that can be used for an enhanced understanding of your machine
2 | learning model.
3 | 
4 | First are the easily-interpretable error measures of the relative information
5 | capacity of feature space `F` with respect to feature space `F'`. The methods
6 | return a value between 0 and 1, where 0 means that `F` and `F'` are completely
7 | distinct in terms of linearly-decodable information, and where 1 means that `F'`
8 | is contained in `F`. All methods are implemented as the root mean-square error
9 | for the regression of the feature matrix `X_F'` (or sometimes called `Y` in the
10 | doc) from `X_F` (or sometimes called `X` in the doc) for transformations with
11 | different constraints (linear, orthogonal, locally-linear). By default a custom
12 | 2-fold cross-validation :py:class:`skmatter.linear_model.Ridge2FoldCV`
13 | is used to ensure the generalization of the transformation and efficiency of the
14 | computation, since we deal with a multi-target regression problem. Methods were
15 | applied to compare different forms of featurizations through different
16 | hyperparameters and induced metrics and kernels [Goscinski2021]_.
17 | 
18 | These reconstruction measures are available:
19 | 
20 | * :ref:`GRE-api` (GRE) computes the amount of linearly-decodable information
21 |   recovered through a global linear reconstruction.
22 | * :ref:`GRD-api` (GRD) computes the amount of distortion contained in a global
23 |   linear reconstruction.
24 | * :ref:`LRE-api` (LRE) computes the amount of decodable information recovered
25 |   through a local linear reconstruction for the k-nearest neighborhood of each
26 |   sample.
27 | 
28 | Next, we offer a set of prediction rigidity metrics, which can be used to
29 | quantify the robustness of the local or component-wise predictions that the
30 | machine learning model has been trained to make, based on the training dataset
31 | composition.
32 | 
33 | These prediction rigidities are available:
34 | 
35 | * :ref:`LPR-api` (LPR) computes the local prediction rigidity of a linear or
36 |   kernel model.
37 | * :ref:`CPR-api` (CPR) computes the component-wise prediction rigidity of a
38 |   linear or kernel model.
39 | 
40 | There are also two distance metrics compatible with the periodic boundary conditions
41 | available.
42 | 
43 | .. note::
44 |     Currently only rectangular cells are supported.
45 |     Cell format: [side_length_1, ..., side_length_n]
46 | 
47 | * :ref:`pairwise-euclidian-api` computes the euclidean distance between two sets
48 |   of points. It is compatible with the periodic boundary conditions.
49 |   If the cell length is not provided, it will fall back to the ``scikit-learn`` version
50 |   of the euclidean distance :func:`sklearn.metrics.pairwise.euclidean_distances`.
51 | * :ref:`pairwise-mahalanobis-api` computes the Mahalanobis distance between two sets
52 |   of points. It is compatible with the periodic boundary conditions.
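53 | 
54 | A minimal sketch of a reconstruction measure (here the GRE; ``X_prime`` is
55 | a column subset of ``X``, so it is linearly decodable from ``X`` and the
56 | error is small -- see :ref:`GRE-api` for the full signature):
57 | 
58 | >>> import numpy as np
59 | >>> from skmatter.metrics import global_reconstruction_error
60 | >>> X = np.random.RandomState(0).normal(size=(100, 5))
61 | >>> X_prime = X[:, :3]  # candidate feature space F'
62 | >>> gre = global_reconstruction_error(X, X_prime)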
63 | """
64 | 
65 | from ._reconstruction_measures import (
66 |     check_global_reconstruction_measures_input,
67 |     check_local_reconstruction_measures_input,
68 |     global_reconstruction_distortion,
69 |     global_reconstruction_error,
70 |     local_reconstruction_error,
71 |     pointwise_global_reconstruction_distortion,
72 |     pointwise_global_reconstruction_error,
73 |     pointwise_local_reconstruction_error,
74 | )
75 | 
76 | from ._prediction_rigidities import (
77 |     local_prediction_rigidity,
78 |     componentwise_prediction_rigidity,
79 | )
80 | 
81 | from ._pairwise import (
82 |     periodic_pairwise_euclidean_distances,
83 |     pairwise_mahalanobis_distances,
84 | )
85 | 
86 | __all__ = [
87 |     "pointwise_global_reconstruction_error",
88 |     "global_reconstruction_error",
89 |     "pointwise_global_reconstruction_distortion",
90 |     "global_reconstruction_distortion",
91 |     "pointwise_local_reconstruction_error",
92 |     "local_reconstruction_error",
93 |     "check_global_reconstruction_measures_input",
94 |     "check_local_reconstruction_measures_input",
95 |     "local_prediction_rigidity",
96 |     "componentwise_prediction_rigidity",
97 |     "periodic_pairwise_euclidean_distances",
98 |     "pairwise_mahalanobis_distances",
99 | ]
100 | 
101 | DIST_METRICS = {
102 |     "periodic_euclidean": periodic_pairwise_euclidean_distances,
103 | }
104 | 
--------------------------------------------------------------------------------
/src/skmatter/metrics/_pairwise.py:
--------------------------------------------------------------------------------
1 | from typing import Union
2 | 
3 | import numpy as np
4 | from sklearn.metrics.pairwise import _euclidean_distances, check_pairwise_arrays
5 | 
6 | 
7 | def periodic_pairwise_euclidean_distances(
8 |     X,
9 |     Y=None,
10 |     *,
11 |     squared=False,
12 |     cell_length=None,
13 | ):
14 |     r"""
15 |     Compute the pairwise distance matrix between each pair from a vector array X and Y.
16 | 
17 |     .. math::
18 |         d_{i, j} = \sqrt{\sum_{k=1}^n (x_{i, k} - y_{j, k})^2}
19 | 
20 |     For efficiency reasons, the euclidean distance between a pair of row
21 |     vector x and y is computed as::
22 | 
23 |         dist(x, y) = sqrt(dot(x, x) - 2 * dot(x, y) + dot(y, y))
24 | 
25 |     This formulation has two advantages over other ways of computing distances. First,
26 |     it is computationally efficient when dealing with sparse data. Second, if one
27 |     argument varies but the other remains unchanged, then `dot(x, x)` and/or `dot(y, y)`
28 |     can be pre-computed.
29 | 
30 |     However, this is not the most precise way of doing this computation, because this
31 |     equation potentially suffers from "catastrophic cancellation". Also, the distance
32 |     matrix returned by this function may not be exactly symmetric as required by, e.g.,
33 |     ``scipy.spatial.distance`` functions.
34 | 
35 |     Read more in the :ref:`User Guide `.
36 | 
37 |     Parameters
38 |     ----------
39 |     X : {array-like, sparse matrix} of shape (n_samples_X, n_components)
40 |         An array where each row is a sample and each column is a component.
41 |     Y : {array-like, sparse matrix} of shape (n_samples_Y, n_components), \
42 |             default=None
43 |         An array where each row is a sample and each column is a component.
44 |         If `None`, method uses `Y=X`.
45 |     squared : bool, default=False
46 |         If ``True``, return the squared distances.
47 |     cell_length : array-like of shape (n_components,), default=None
48 |         The side length of rectangular cell used for periodic boundary conditions.
49 |         `None` for non-periodic boundary conditions.
50 | 
51 |         .. note::
52 |             Only side lengths of rectangular cells are supported.
53 |             Cell format: `[side_length_1, ..., side_length_n]`
54 | 
55 |     Returns
56 |     -------
57 |     distances : ndarray of shape (n_samples_X, n_samples_Y)
58 |         Returns the distances between the row vectors of `X`
59 |         and the row vectors of `Y`.
60 | 
61 |     Examples
62 |     --------
63 |     >>> import numpy as np
64 |     >>> from skmatter.metrics import periodic_pairwise_euclidean_distances
65 |     >>> X = np.array([[0, 1], [1, 1]])
66 |     >>> origin = np.array([[0, 0]])
67 |     >>> # distance between rows of X
68 |     >>> periodic_pairwise_euclidean_distances(X, X)
69 |     array([[0., 1.],
70 |            [1., 0.]])
71 |     >>> # get distance to origin
72 |     >>> periodic_pairwise_euclidean_distances(X, origin, cell_length=[0.5, 0.7])
73 |     array([[0.3],
74 |            [0.3]])
75 |     """
76 |     _check_dimension(X, cell_length)
77 |     X, Y = check_pairwise_arrays(X, Y)
78 | 
79 |     if cell_length is None:
80 |         return _euclidean_distances(X, Y, squared=squared)
81 |     else:
82 |         return _periodic_euclidean_distances(X, Y, squared=squared, cell=cell_length)
83 | 
84 | 
85 | def _periodic_euclidean_distances(X, Y=None, *, squared=False, cell=None):
86 |     X, Y = np.array(X).astype(float), np.array(Y).astype(float)
87 |     XY = np.concatenate([x - Y for x in X])
88 |     XY -= np.round(XY / cell) * cell
89 |     distance = np.linalg.norm(XY, axis=1).reshape(X.shape[0], Y.shape[0])
90 |     if squared:
91 |         distance **= 2
92 |     return distance
93 | 
94 | 
95 | def pairwise_mahalanobis_distances(
96 |     X: np.ndarray,
97 |     Y: np.ndarray,
98 |     cov_inv: np.ndarray,
99 |     cell_length: Union[np.ndarray, None] = None,
100 |     squared: bool = False,
101 | ):
102 |     r"""
103 |     Calculate the pairwise Mahalanobis distance between two arrays.
104 | 
105 |     This metric is used for calculating the distances between observations from Gaussian
106 |     distributions. It is defined as:
107 | 
108 |     .. math::
109 |         d_{\Sigma}(x, y)^2 = (x - y)^T \Sigma^{-1} (x - y)
110 | 
111 |     where :math:`\Sigma` is the covariance matrix, :math:`x` and :math:`y` are
112 |     observations from the same distribution.
113 | 
114 |     Parameters
115 |     ----------
116 |     X : numpy.ndarray of shape (n_samples_X, n_components)
117 |         An array where each row is a sample and each column is a component.
118 |     Y : np.ndarray of shape (n_samples_Y, n_components)
119 |         An array where each row is a sample and each column is a component.
120 |     cov_inv : np.ndarray
121 |         The inverse covariance matrix of shape (n_components, n_components).
122 |     cell_length : np.ndarray, optional, default=None
123 |         The cell size for periodic boundary conditions.
124 |         None for non-periodic boundary conditions.
125 | 
126 |         .. note::
127 |             Only rectangular cells are supported.
128 |             Cell format: `[side_length_1, ..., side_length_n]`
129 | 
130 |     squared : bool, default=False
131 |         Whether to return the squared distance.
132 | 
133 |     Returns
134 |     -------
135 |     np.ndarray
136 |         The pairwise Mahalanobis distance between the two input arrays,
137 |         of shape `(cov_inv.shape[0], x.shape[0], y.shape[0])`.
138 | 
139 |     Examples
140 |     --------
141 |     >>> import numpy as np
142 |     >>> from skmatter.metrics import pairwise_mahalanobis_distances
143 |     >>> iv = np.array([[1, 0.5, 0.5], [0.5, 1, 0.5], [0.5, 0.5, 1]])
144 |     >>> X = np.array([[1, 0, 0], [0, 2, 0], [2, 0, 0]])
145 |     >>> Y = np.array([[0, 1, 0]])
146 |     >>> pairwise_mahalanobis_distances(X, Y, iv)
147 |     array([[[1.        ],
148 |             [1.        ],
149 |             [1.73205081]]])
150 |     """
151 | 
152 |     def _mahalanobis(
153 |         cell: np.ndarray, X: np.ndarray, Y: np.ndarray, cov_inv: np.ndarray
154 |     ):
155 |         XY = np.concatenate([x - Y for x in X])
156 |         if cell is not None:
157 |             XY -= np.round(XY / cell) * cell
158 | 
159 |         return np.sum(XY * np.transpose(cov_inv @ XY.T, (0, 2, 1)), axis=-1).reshape(
160 |             (cov_inv.shape[0], X.shape[0], Y.shape[0])
161 |         )
162 | 
163 |     _check_dimension(X, cell_length)
164 |     X, Y = check_pairwise_arrays(X, Y)
165 |     if len(cov_inv.shape) == 2:
166 |         cov_inv = cov_inv[np.newaxis, :, :]
167 |     dists = _mahalanobis(cell_length, X, Y, cov_inv)
168 |     if not squared:
169 |         dists **= 0.5
170 |     return dists
171 | 
172 | 
173 | def _check_dimension(X, cell_length):
174 |     if (cell_length is not None) and (X.shape[1] != len(cell_length)):
175 |         raise ValueError("Cell dimension does not match the data dimension.")
176 | 
--------------------------------------------------------------------------------
/src/skmatter/model_selection/__init__.py:
--------------------------------------------------------------------------------
1 | """Functions for model selection."""
2 | 
3 | from ._split import train_test_split
4 | 
5 | __all__ = ["train_test_split"]
6 | 
--------------------------------------------------------------------------------
/src/skmatter/model_selection/_split.py:
--------------------------------------------------------------------------------
1 | import sklearn.model_selection
2 | from sklearn.utils import indexable
3 | from sklearn.utils.validation import _num_samples
4 | 
5 | 
6 | def train_test_split(*arrays, **options):
7 |     """Extended version of the sklearn train test split supporting overlapping train and
8 |     test sets.
9 | 
10 |     See `sklearn.model_selection.train_test_split (external link)
11 |     `_.
12 | 
13 |     Parameters
14 |     ----------
15 |     *arrays : sequence of indexables with same length / shape[0]
16 |         Allowed inputs are lists, numpy arrays, scipy-sparse matrices or pandas
17 |         dataframes.
18 |     test_size : float or int, default=None
19 |         If float, should be between 0.0 and 1.0 and represent the proportion of the
20 |         dataset to include in the test split. If int, represents the absolute number of
21 |         test samples. If :obj:`None`, the value is set to the complement of the train
22 |         size. If ``train_size`` is also None, it will be set to 0.25.
23 |     train_size : float or int, default=None
24 |         If float, should be between 0.0 and 1.0 and represent the proportion of the
25 |         dataset to include in the train split. If int, represents the absolute number of
26 |         train samples. If :obj:`None`, the value is automatically set to the complement
27 |         of the test size.
28 |     random_state : int or :class:`numpy.random.RandomState` instance, default=None
29 |         Controls the shuffling applied to the data before applying the split. Pass an
30 |         int for reproducible output across multiple function calls. See `random state
31 |         glossary from sklearn (external link)
32 |         `_
33 |     shuffle : bool, default=True
34 |         Whether or not to shuffle the data before splitting. If shuffle=False then
35 |         stratify must be :obj:`None`.
36 |     stratify : array-like, default=None
37 |         If not :obj:`None`, data is split in a stratified fashion, using this as the
38 |         class labels.
39 |     train_test_overlap : bool, default=False
40 |         If :obj:`True` and ``train_size`` and ``test_size`` are both not :obj:`None`,
41 |         the train and test sets may overlap.
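42 | 
43 |     Returns
44 |     -------
45 |     splitting : list, length=2 * len(arrays)
46 |         List containing train-test split of inputs.
47 | 
48 |     Examples
49 |     --------
50 |     A minimal sketch of an overlapping split (``train_test_overlap`` takes
51 |     effect only when both ``train_size`` and ``test_size`` are given):
52 | 
53 |     >>> import numpy as np
54 |     >>> from skmatter.model_selection import train_test_split
55 |     >>> X = np.arange(10).reshape(5, 2)
56 |     >>> X_train, X_test = train_test_split(
57 |     ...     X, train_size=0.8, test_size=0.4, train_test_overlap=True
58 |     ... )
59 |     >>> len(X_train), len(X_test)
60 |     (4, 2)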
61 |     """ # NoQa: E501
62 |     train_test_overlap = options.pop("train_test_overlap", False)
63 |     test_size = options.get("test_size", None)
64 |     train_size = options.get("train_size", None)
65 | 
66 |     if train_test_overlap and train_size is not None and test_size is not None:
67 |         # checks from sklearn
68 |         arrays = indexable(*arrays)
69 |         n_samples = _num_samples(arrays[0])
70 | 
71 |         if test_size == 1.0 or test_size == n_samples:
72 |             test_sets = arrays
73 |         else:
74 |             options["train_size"] = None
75 |             test_sets = sklearn.model_selection.train_test_split(*arrays, **options)[
76 |                 1::2
77 |             ]
78 |             options["train_size"] = train_size
79 | 
80 |         if train_size == 1.0 or train_size == n_samples:
81 |             train_sets = arrays
82 |         else:
83 |             options["test_size"] = None
84 |             train_sets = sklearn.model_selection.train_test_split(*arrays, **options)[
85 |                 ::2
86 |             ]
87 |             options["test_size"] = test_size
88 | 
89 |         train_test_sets = []
90 |         for i in range(len(train_sets)):
91 |             train_test_sets += [train_sets[i], test_sets[i]]
92 |         return train_test_sets
93 |     else:
94 |         return sklearn.model_selection.train_test_split(*arrays, **options)
95 | 
--------------------------------------------------------------------------------
/src/skmatter/neighbors/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | The module implements the sparse kernel density estimator.
3 | 
4 | A large dataset can be generated during molecular dynamics sampling. The
5 | distribution of the sampled data reflects the (free) energetic stability of molecular
6 | patterns. The KDE model can be used to characterize the probability distribution, and
7 | thus to identify the stable patterns in the system. However, the computational
8 | cost of KDE is `O(N^2)` where `N` is the number of sampled points, which is very
9 | expensive. Here we offer a sparse implementation of the KDE model with a
10 | `O(MN)` computational cost, where `M` is the number of grid points generated from the
11 | sampled data.
12 | 
13 | The following class is available:
14 | 
15 | * :ref:`sparse-kde-api` computes the kernel density estimator based on a set of grid
16 |   points generated from the sampled data.
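17 | 
18 | A schematic usage sketch (the constructor arguments are abbreviated here; see
19 | :ref:`sparse-kde-api` for the exact signature -- the grid points are typically
20 | selected from the samples, e.g. by FPS):
21 | 
22 | >>> import numpy as np
23 | >>> from skmatter.neighbors import SparseKDE
24 | >>> from skmatter.sample_selection import FPS
25 | >>> samples = np.random.RandomState(0).normal(size=(1000, 2))
26 | >>> grid = samples[FPS(n_to_select=50, initialize=0).fit(samples).selected_idx_]
27 | >>> kde = SparseKDE(samples, None).fit(grid)  # assumed: (descriptors, weights)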
28 | 
29 | """
30 | 
31 | from ._sparsekde import SparseKDE
32 | 
33 | __all__ = ["SparseKDE"]
34 | 
--------------------------------------------------------------------------------
/src/skmatter/preprocessing/__init__.py:
--------------------------------------------------------------------------------
1 | """Scaling, centering and normalization methods."""
2 | 
3 | from ._data import (
4 |     KernelNormalizer,
5 |     SparseKernelCenterer,
6 |     StandardFlexibleScaler,
7 | )
8 | 
9 | __all__ = ["StandardFlexibleScaler", "KernelNormalizer", "SparseKernelCenterer"]
10 | 
--------------------------------------------------------------------------------
/src/skmatter/sample_selection/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | The :mod:`skmatter.sample_selection` module includes FPS and CUR selection, each
3 | with the optional PCov-flavor.
4 | """
5 | 
6 | from ._base import (
7 |     CUR,
8 |     FPS,
9 |     DirectionalConvexHull,
10 |     PCovCUR,
11 |     PCovFPS,
12 | )
13 | from ._voronoi_fps import VoronoiFPS
14 | 
15 | __all__ = ["PCovFPS", "PCovCUR", "FPS", "CUR", "DirectionalConvexHull", "VoronoiFPS"]
16 | 
--------------------------------------------------------------------------------
/src/skmatter/utils/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | The :mod:`skmatter.utils` module includes functions which are
3 | used by multiple packages
4 | """
5 | 
6 | from ._orthogonalizers import (
7 |     X_orthogonalizer,
8 |     Y_feature_orthogonalizer,
9 |     Y_sample_orthogonalizer,
10 | )
11 | 
12 | from ._pcovc_utils import check_cl_fit
13 | 
14 | from ._pcovr_utils import (
15 |     check_krr_fit,
16 |     check_lr_fit,
17 |     pcovr_covariance,
18 |     pcovr_kernel,
19 | )
20 | 
21 | from ._progress_bar import (
22 |     get_progress_bar,
23 |     no_progress_bar,
24 | )
25 | 
26 | from ._sparsekde import (
27 |     effdim,
28 |     oas,
29 | )
30 | 
31 | __all__ = [
32 |     "get_progress_bar",
33 |     "no_progress_bar",
34 |     "pcovr_covariance",
35 |     "pcovr_kernel",
36 |     "check_krr_fit",
37 |     "check_lr_fit",
38 |     "X_orthogonalizer",
39 |     "Y_sample_orthogonalizer",
40 |     "Y_feature_orthogonalizer",
41 |     "effdim",
42 |     "oas",
43 | ]
44 | 
--------------------------------------------------------------------------------
/src/skmatter/utils/_orthogonalizers.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """Necessary orthogonalizers for the CUR decomposition subselection method.
3 | 
4 | Authors: Rose K. Cersonsky
5 |          Michele Ceriotti
6 | """
7 | 
8 | import warnings
9 | 
10 | import numpy as np
11 | 
12 | 
13 | def X_orthogonalizer(x1, c=None, x2=None, tol=1e-12, copy=False):
14 |     """Orthogonalizes a feature matrix by the given columns.
15 | 
16 |     Can be used to orthogonalize by samples by calling `X = X_orthogonalizer(X.T,
17 |     row_index).T`. After orthogonalization, each column of X will contain only what is
18 |     orthogonal to X[:, c] or x2.
19 | 20 | Parameters 21 | ---------- 22 | x1: numpy.ndarray of shape (n x m) 23 | feature matrix to orthogonalize 24 | c: int, less than m, default=None 25 | index of the column to orthogonalize by 26 | x2: numpy.ndarray of shape (n x a), default=x1[:, c] 27 | a separate set of columns to orthogonalize with respect to 28 | Note: the orthogonalizer will work column-by-column in column-index order 29 | """ 30 | if x2 is None and c is not None: 31 | cols = x1[:, [c]] 32 | elif x2.shape[0] == x1.shape[0]: 33 | cols = np.reshape(x2, (x1.shape[0], -1)) 34 | else: 35 | raise ValueError( 36 | "You can only orthogonalize a matrix using a vector with the same number " 37 | f"of rows. Matrix X has {x1.shape[0]} rows, whereas the orthogonalizing " 38 | f"matrix has {x2.shape[0]} rows." 39 | ) 40 | 41 | if copy: 42 | xnew = x1.copy() 43 | else: 44 | xnew = x1 45 | 46 | for i in range(cols.shape[-1]): 47 | col = cols[:, [i]] 48 | 49 | if np.linalg.norm(col) < tol: 50 | warnings.warn("Column vector contains only zeros.", stacklevel=1) 51 | else: 52 | col = np.divide(col, np.linalg.norm(col, axis=0)) 53 | 54 | xnew -= (col @ (col.T @ xnew)).astype(xnew.dtype) 55 | 56 | return xnew 57 | 58 | 59 | def Y_feature_orthogonalizer(y, X, tol=1e-12, copy=True): 60 | r"""Orthogonalizes a property matrix given the selected features in 61 | :math:`\mathbf{X}`. 62 | 63 | .. math:: 64 | \mathbf{Y} \leftarrow \mathbf{Y} - 65 | \mathbf{X} \left(\mathbf{X}^T\mathbf{X}\right)^{-1}\mathbf{X}^T \mathbf{Y} 66 | 67 | Parameters 68 | ---------- 69 | y : numpy.ndarray of shape (n_samples x n_properties) 70 | property matrix 71 | X : numpy.ndarray of shape (n_samples x n_features) 72 | feature matrix 73 | tol: float 74 | cutoff for small eigenvalues to send to np.linalg.pinv 75 | copy: bool 76 | whether to return a copy of y or edit in-place, default=True 77 | """ 78 | v = np.linalg.pinv(np.matmul(X.T, X), rcond=tol) 79 | v = np.matmul(X, v) 80 | v = np.matmul(v, X.T) 81 | 82 | if copy: 83 | return y.copy() - np.matmul(v, y) 84 | else: 85 | y -= np.matmul(v, y) 86 | return y 87 | 88 | 89 | def Y_sample_orthogonalizer(y, X, y_ref, X_ref, tol=1e-12, copy=True): 90 | r"""Orthogonalizes a matrix of targets :math:`{\mathbf{Y}}` given a reference 91 | feature matrix :math:`{\mathbf{X}_r}` and reference target matrix 92 | :math:`{\mathbf{Y}_r}`: 93 | 94 | .. 
math::
95 |         \mathbf{Y} \leftarrow \mathbf{Y} -
96 |         \mathbf{X} \left(\mathbf{X}_{\mathbf{r}}^T
97 |         \mathbf{X}_{\mathbf{r}}\right)^{-1}\mathbf{X}_{\mathbf{r}}^T
98 |         \mathbf{Y}_{\mathbf{r}}
99 | 
100 |     Parameters
101 |     ----------
102 |     y : numpy.ndarray of shape (n_samples x n_properties)
103 |         property matrix
104 |     X : numpy.ndarray of shape (n_samples x n_features)
105 |         feature matrix
106 |     y_ref : numpy.ndarray of shape (n_ref x n_properties)
107 |         reference property matrix
108 |     X_ref : numpy.ndarray of shape (n_ref x n_features)
109 |         reference feature matrix
110 |     tol: float
111 |         cutoff for small eigenvalues to send to np.linalg.pinv
112 |     copy: bool
113 |         whether to return a copy of y or edit in-place, default=True
114 |     """
115 |     y_frag = (X @ (np.linalg.lstsq(X_ref, y_ref, rcond=tol)[0])).reshape(y.shape)
116 | 
117 |     if copy:
118 |         return y.copy() - y_frag
119 |     else:
120 |         y -= y_frag
121 |         return y
122 | 
--------------------------------------------------------------------------------
/src/skmatter/utils/_pcovc_utils.py:
--------------------------------------------------------------------------------
1 | from copy import deepcopy
2 | 
3 | import numpy as np
4 | from sklearn import clone
5 | from sklearn.exceptions import NotFittedError
6 | from sklearn.utils.validation import check_is_fitted, validate_data
7 | 
8 | 
9 | def check_cl_fit(classifier, X, y):
10 |     """
11 |     Checks that a (linear) classifier is fitted, and if not,
12 |     fits it with the provided data.
13 | 
14 |     Parameters
15 |     ----------
16 |     classifier : object
17 |         sklearn-style classifier
18 |     X : array-like
19 |         Feature matrix with which to fit the classifier if it is not already fitted
20 |     y : array-like
21 |         Target values with which to fit the classifier if it is not already fitted
22 | 
23 |     Returns
24 |     -------
25 |     fitted_classifier : object
26 |         The fitted classifier. If input classifier was already fitted and compatible
27 |         with the data, returns a deep copy. Otherwise returns a newly fitted classifier.
28 | 
29 |     Raises
30 |     ------
31 |     ValueError
32 |         If the fitted classifier's coefficients have a shape incompatible with the
33 |         number of features in X or the number of classes in y.
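34 | 
35 |     Examples
36 |     --------
37 |     A minimal sketch with an unfitted classifier (it is cloned and fitted on
38 |     the provided data):
39 | 
40 |     >>> import numpy as np
41 |     >>> from sklearn.linear_model import LogisticRegression
42 |     >>> from skmatter.utils import check_cl_fit
43 |     >>> X = np.array([[0.0], [1.0], [2.0], [3.0]])
44 |     >>> y = np.array([0, 0, 1, 1])
45 |     >>> check_cl_fit(LogisticRegression(), X, y).coef_.shape
46 |     (1, 1)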
47 |     """
48 |     try:
49 |         check_is_fitted(classifier)
50 |         fitted_classifier = deepcopy(classifier)
51 | 
52 |         # Check compatibility with X
53 |         validate_data(fitted_classifier, X, y, reset=False, multi_output=True)
54 | 
55 |         # Check compatibility with the number of features in X and the number of
56 |         # classes in y
57 |         n_classes = len(np.unique(y))
58 | 
59 |         if n_classes == 2:
60 |             if fitted_classifier.coef_.shape[0] != 1:
61 |                 raise ValueError(
62 |                     "For binary classification, expected classifier coefficients "
63 |                     "to have shape (1, "
64 |                     f"{X.shape[1]}) but got shape "
65 |                     f"{fitted_classifier.coef_.shape}"
66 |                 )
67 |         else:
68 |             if fitted_classifier.coef_.shape[0] != n_classes:
69 |                 raise ValueError(
70 |                     "For multiclass classification, expected classifier coefficients "
71 |                     "to have shape "
72 |                     f"({n_classes}, {X.shape[1]}) but got shape "
73 |                     f"{fitted_classifier.coef_.shape}"
74 |                 )
75 | 
76 |     except NotFittedError:
77 |         fitted_classifier = clone(classifier)
78 |         fitted_classifier.fit(X, y)
79 | 
80 |     return fitted_classifier
81 | 
--------------------------------------------------------------------------------
/src/skmatter/utils/_progress_bar.py:
--------------------------------------------------------------------------------
1 | def get_progress_bar():
2 |     """Returns the appropriate version of ``tqdm``, as determined by ``tqdm.auto``.
3 | 
4 |     If ``tqdm`` is not installed, an :py:class:`ImportError` is raised.
5 |     """
6 |     try:
7 |         from tqdm.auto import tqdm
8 | 
9 |         return tqdm
10 |     except ImportError:
11 |         raise ImportError(
12 |             "tqdm must be installed to use a progress bar. Either install tqdm or "
13 |             "re-run with progress_bar = False"
14 |         )
15 | 
16 | 
17 | def no_progress_bar(x):
18 |     """Identity function, same as ``lambda x: x``. It returns ``x``."""
19 |     return x
20 | 
--------------------------------------------------------------------------------
/src/skmatter/utils/_sparsekde.py:
--------------------------------------------------------------------------------
1 | """The file holds utility functions and classes for the sparse KDE."""
2 | 
3 | import numpy as np
4 | 
5 | 
6 | def effdim(cov):
7 |     """
8 |     Calculate the effective dimension of a covariance matrix based on Shannon entropy.
9 | 
10 |     Parameters
11 |     ----------
12 |     cov : numpy.ndarray
13 |         The covariance matrix.
14 | 
15 |     Returns
16 |     -------
17 |     float
18 |         The effective dimension of the covariance matrix.
19 | 
20 |     Examples
21 |     --------
22 |     >>> import numpy as np
23 |     >>> from skmatter.utils import effdim
24 |     >>> cov = np.array([[25, 15, -5], [15, 18, 0], [-5, 0, 11]], dtype=np.float64)
25 |     >>> print(round(effdim(cov), 3))
26 |     2.214
27 | 
28 |     References
29 |     ----------
30 |     https://ieeexplore.ieee.org/document/7098875
31 |     """
32 |     eigval = np.linalg.eigvals(cov)
33 |     if (lowest_eigval := np.min(eigval)) <= -np.max(cov.shape) * np.finfo(
34 |         cov.dtype
35 |     ).eps:
36 |         raise np.linalg.LinAlgError(
37 |             "Matrix is not positive definite. "
38 |             f"Lowest eigenvalue {lowest_eigval} is "
39 |             "below the numerical threshold."
40 | ) 41 | eigval[eigval < 0.0] = 0.0 42 | eigval /= sum(eigval) 43 | eigval *= np.log(eigval) 44 | 45 | return np.exp(-sum(eigval)) 46 | 47 | 48 | def oas(cov: np.ndarray, n: float, D: int) -> np.ndarray: 49 | """ 50 | Oracle approximating shrinkage (OAS) estimator 51 | 52 | Parameters 53 | ---------- 54 | cov : numpy.ndarray 55 | A covariance matrix 56 | n : float 57 | The local population 58 | D : int 59 | Dimension 60 | 61 | Examples 62 | -------- 63 | >>> import numpy as np 64 | >>> from skmatter.utils import oas 65 | >>> cov = np.array([[0.5, 1.0], [0.7, 0.4]]) 66 | >>> oas(cov, 10, 2) 67 | array([[0.48903924, 0.78078484], 68 | [0.54654939, 0.41096076]]) 69 | 70 | Returns 71 | ------- 72 | np.ndarray 73 | Covariance matrix 74 | """ 75 | tr = np.trace(cov) 76 | tr2 = tr**2 77 | tr_cov2 = np.trace(cov**2) 78 | phi = ((1 - 2 / D) * tr_cov2 + tr2) / ((n + 1 - 2 / D) * tr_cov2 - tr2 / D) 79 | 80 | return (1 - phi) * cov + phi * np.eye(D) * tr / D 81 | -------------------------------------------------------------------------------- /tests/.gitignore: -------------------------------------------------------------------------------- 1 | .coverage 2 | coverage.xml 3 | -------------------------------------------------------------------------------- /tests/test_check_estimators.py: -------------------------------------------------------------------------------- 1 | from sklearn.utils.estimator_checks import parametrize_with_checks 2 | 3 | from skmatter.decomposition import KernelPCovR, PCovC, PCovR 4 | from skmatter.feature_selection import CUR as fCUR 5 | from skmatter.feature_selection import FPS as fFPS 6 | from skmatter.feature_selection import PCovCUR as fPCovCUR 7 | from skmatter.feature_selection import PCovFPS as fPCovFPS 8 | from skmatter.linear_model import Ridge2FoldCV # OrthogonalRegression, 9 | from skmatter.preprocessing import KernelNormalizer, StandardFlexibleScaler 10 | 11 | 12 | @parametrize_with_checks( 13 | [ 14 | KernelPCovR(mixing=0.5), 15 | PCovR(mixing=0.5), 16 | PCovC(mixing=0.5), 17 | fCUR(), 18 | fFPS(), 19 | fPCovCUR(), 20 | fPCovFPS(), 21 | Ridge2FoldCV(), 22 | KernelNormalizer(), 23 | StandardFlexibleScaler(), 24 | ] 25 | ) 26 | def test_sklearn_compatible_estimator(estimator, check): 27 | """Test of the estimators are compatible with sklearn.""" 28 | check(estimator) 29 | -------------------------------------------------------------------------------- /tests/test_clustering.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import numpy as np 4 | 5 | from skmatter.clustering import QuickShift 6 | 7 | 8 | class QuickShiftTests(unittest.TestCase): 9 | @classmethod 10 | def setUpClass(cls) -> None: 11 | cls.points = np.array( 12 | [ 13 | [-1.72779275, -1.32763554], 14 | [-4.44991964, -2.13474901], 15 | [0.54817734, -2.43319467], 16 | [3.19881307, -0.49547222], 17 | [-1.1335991, 2.33478428], 18 | [0.55437388, 0.18745963], 19 | ] 20 | ) 21 | cls.cuts = np.array( 22 | [6.99485011, 8.80292681, 7.68486852, 9.5115009, 8.07736919, 6.22057056] 23 | ) 24 | cls.weights = np.array( 25 | [ 26 | -3.94008092, 27 | -12.68095664, 28 | -7.07512499, 29 | -9.03064023, 30 | -8.26529849, 31 | -2.61132267, 32 | ] 33 | ) 34 | cls.qs_labels_ = np.array([0, 0, 0, 5, 5, 5]) 35 | cls.qs_cluster_centers_idx_ = np.array([0, 5]) 36 | cls.gabriel_labels_ = np.array([5, 5, 5, 5, 5, 5]) 37 | cls.gabriel_cluster_centers_idx_ = np.array([5]) 38 | cls.cell = [3, 3] 39 | cls.gabriel_shell = 2 40 | 41 | def test_fit_qs(self): 42 | model = 
QuickShift(dist_cutoff_sq=self.cuts) 43 | model.fit(self.points, samples_weight=self.weights) 44 | self.assertTrue(np.all(model.labels_ == self.qs_labels_)) 45 | self.assertTrue( 46 | np.all(model.cluster_centers_idx_ == self.qs_cluster_centers_idx_) 47 | ) 48 | 49 | def test_fit_garbriel(self): 50 | model = QuickShift(gabriel_shell=self.gabriel_shell) 51 | model.fit(self.points, samples_weight=self.weights) 52 | self.assertTrue(np.all(model.labels_ == self.gabriel_labels_)) 53 | self.assertTrue( 54 | np.all(model.cluster_centers_idx_ == self.gabriel_cluster_centers_idx_) 55 | ) 56 | 57 | def test_dimension_check(self): 58 | model = QuickShift(self.cuts, metric_params={"cell_length": self.cell}) 59 | self.assertRaises(ValueError, model.fit, np.array([[2]])) 60 | -------------------------------------------------------------------------------- /tests/test_datasets.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import numpy as np 4 | 5 | from skmatter.datasets import ( 6 | load_csd_1000r, 7 | load_degenerate_CH4_manifold, 8 | load_hbond_dataset, 9 | load_nice_dataset, 10 | load_roy_dataset, 11 | load_who_dataset, 12 | ) 13 | 14 | 15 | class NICEDatasetTests(unittest.TestCase): 16 | @classmethod 17 | def setUpClass(cls): 18 | cls.nice_data = load_nice_dataset() 19 | 20 | def test_load_nice_data(self): 21 | # test if representations and properties have commensurate shape 22 | self.assertTrue( 23 | self.nice_data.data.X.shape[0] == self.nice_data.data.y.shape[0] 24 | ) 25 | self.assertTrue(self.nice_data.data.X.shape[0] == 500) 26 | self.assertTrue(self.nice_data.data.X.shape[1] == 160) 27 | self.assertTrue(len(self.nice_data.data.X.shape) == 2) 28 | 29 | def test_load_nice_data_descr(self): 30 | self.nice_data.DESCR 31 | 32 | 33 | class DegenerateCH4Tests(unittest.TestCase): 34 | @classmethod 35 | def setUpClass(cls): 36 | cls.degenerate_CH4_manifold = load_degenerate_CH4_manifold() 37 | 38 | def test_load_degenerate_CH4_manifold_power_spectrum_shape(self): 39 | # test if representations have correct shape 40 | self.assertTrue( 41 | self.degenerate_CH4_manifold.data.SOAP_power_spectrum.shape == (162, 12) 42 | ) 43 | 44 | def test_load_degenerate_CH4_manifold_bispectrum_shape(self): 45 | self.assertTrue( 46 | self.degenerate_CH4_manifold.data.SOAP_bispectrum.shape == (162, 12) 47 | ) 48 | 49 | def test_load_degenerate_CH4_manifold_access_descr(self): 50 | self.degenerate_CH4_manifold.DESCR 51 | 52 | 53 | class CSDTests(unittest.TestCase): 54 | @classmethod 55 | def setUpClass(cls): 56 | cls.csd = load_csd_1000r() 57 | 58 | def test_load_csd_1000r_shape(self): 59 | # test if representations and properties have commensurate shape 60 | self.assertTrue(self.csd.data.X.shape[0] == self.csd.data.y.shape[0]) 61 | 62 | def test_load_csd_1000r_access_descr(self): 63 | self.csd.DESCR 64 | 65 | 66 | class WHOTests(unittest.TestCase): 67 | @classmethod 68 | def setUpClass(cls): 69 | cls.size = 24240 70 | cls.shape = (2020, 12) 71 | cls.value = 5.00977993011475 72 | try: 73 | import pandas as pd # NoQa: F401 74 | 75 | cls.has_pandas = True 76 | cls.who = load_who_dataset() 77 | except ImportError: 78 | cls.has_pandas = False 79 | 80 | def test_load_dataset_without_pandas(self): 81 | """Check if the correct exception occurs when pandas isn't present.""" 82 | with unittest.mock.patch.dict("sys.modules", {"pandas": None}): 83 | with self.assertRaises(ImportError) as cm: 84 | _ = load_who_dataset() 85 | self.assertEqual(str(cm.exception), 
"load_who_dataset requires pandas.") 86 | 87 | def test_dataset_size_and_shape(self): 88 | """ 89 | Check if the correct number of datapoints are present in the dataset. 90 | Also check if the size of the dataset is correct. 91 | """ 92 | if self.has_pandas is True: 93 | self.assertEqual(self.who["data"].size, self.size) 94 | self.assertEqual(self.who["data"].shape, self.shape) 95 | 96 | def test_datapoint_value(self): 97 | """Check if the value of a datapoint at a certain location is correct.""" 98 | if self.has_pandas is True: 99 | self.assertTrue( 100 | np.allclose( 101 | self.who["data"]["SE.XPD.TOTL.GD.ZS"][1924], self.value, rtol=1e-6 102 | ) 103 | ) 104 | 105 | 106 | class ROYTests(unittest.TestCase): 107 | @classmethod 108 | def setUpClass(cls): 109 | cls.size = 264 110 | cls.shape = (264, 32) 111 | cls.roy = load_roy_dataset() 112 | 113 | def test_dataset_content(self): 114 | """Check if the correct number of datapoints are present in the dataset. 115 | 116 | Also check if the size of the dataset is correct. 117 | """ 118 | self.assertEqual(len(self.roy["structure_types"]), self.size) 119 | self.assertEqual(self.roy["features"].shape, self.shape) 120 | self.assertEqual(len(self.roy["energies"]), self.size) 121 | 122 | 123 | class HBondTests(unittest.TestCase): 124 | @classmethod 125 | def setUpClass(cls): 126 | cls.size = 27233 127 | cls.shape = (27233, 3) 128 | cls.hbond = load_hbond_dataset() 129 | 130 | def test_dataset_size_and_shape(self): 131 | """ 132 | Check if the correct number of datapoints are present in the dataset. 133 | Also check if the size of the dataset is correct. 134 | """ 135 | self.assertEqual(self.hbond["descriptors"].shape, self.shape) 136 | self.assertEqual(self.hbond["weights"].size, self.size) 137 | 138 | 139 | if __name__ == "__main__": 140 | unittest.main() 141 | -------------------------------------------------------------------------------- /tests/test_feature_pcov_cur.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import numpy as np 4 | from sklearn.datasets import load_diabetes as get_dataset 5 | 6 | from skmatter.feature_selection import PCovCUR 7 | 8 | 9 | class TestPCovCUR(unittest.TestCase): 10 | def setUp(self): 11 | self.X, self.y = get_dataset(return_X_y=True) 12 | self.idx = [2, 8, 3, 4, 1, 7, 5, 9, 6] 13 | 14 | def test_known(self): 15 | """Check that the model returns a known set of indices.""" 16 | selector = PCovCUR(n_to_select=9) 17 | selector.fit(self.X, self.y) 18 | 19 | self.assertTrue(np.allclose(selector.selected_idx_, self.idx)) 20 | 21 | def test_restart(self): 22 | """Check that the model can be restarted with a new instance.""" 23 | selector = PCovCUR(n_to_select=1) 24 | selector.fit(self.X, self.y) 25 | 26 | for i in range(len(self.idx) - 2): 27 | selector.n_to_select += 1 28 | selector.fit(self.X, self.y, warm_start=True) 29 | self.assertEqual(selector.selected_idx_[i], self.idx[i]) 30 | 31 | def test_non_it(self): 32 | """Check that the model can be run non-iteratively.""" 33 | self.idx = [2, 8, 3, 6, 7, 9, 1, 0, 5] 34 | selector = PCovCUR(n_to_select=9, recompute_every=0) 35 | selector.fit(self.X, self.y) 36 | 37 | self.assertTrue(np.allclose(selector.selected_idx_, self.idx)) 38 | 39 | 40 | if __name__ == "__main__": 41 | unittest.main(verbosity=2) 42 | -------------------------------------------------------------------------------- /tests/test_feature_pcov_fps.py: -------------------------------------------------------------------------------- 1 | import 
unittest 2 | 3 | from sklearn.datasets import load_diabetes as get_dataset 4 | 5 | from skmatter.feature_selection import PCovFPS 6 | 7 | 8 | class TestPCovFPS(unittest.TestCase): 9 | def setUp(self): 10 | self.X, self.y = get_dataset(return_X_y=True) 11 | self.idx = [0, 2, 6, 7, 1, 3, 4] 12 | 13 | def test_restart(self): 14 | """Check that the model can be restarted with a new number of features and 15 | `warm_start`. 16 | """ 17 | selector = PCovFPS(n_to_select=1, initialize=self.idx[0]) 18 | selector.fit(self.X, y=self.y) 19 | 20 | for i in range(2, len(self.idx)): 21 | selector.n_to_select = i 22 | selector.fit(self.X, y=self.y, warm_start=True) 23 | self.assertEqual(selector.selected_idx_[i - 1], self.idx[i - 1]) 24 | 25 | def test_no_mixing_1(self): 26 | """Check that the model throws an error when mixing = 1.0.""" 27 | selector = PCovFPS(n_to_select=1, mixing=1.0) 28 | with self.assertRaises(ValueError) as cm: 29 | selector.fit(self.X, y=self.y) 30 | self.assertEqual( 31 | str(cm.exception), 32 | "Mixing = 1.0 corresponds to traditional FPS. Please use the FPS class.", 33 | ) 34 | 35 | 36 | if __name__ == "__main__": 37 | unittest.main(verbosity=2) 38 | -------------------------------------------------------------------------------- /tests/test_feature_simple_cur.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import numpy as np 4 | from sklearn import exceptions 5 | 6 | from skmatter.datasets import load_csd_1000r as load 7 | from skmatter.feature_selection import CUR, FPS 8 | 9 | 10 | class TestCUR(unittest.TestCase): 11 | def setUp(self): 12 | self.X, _ = load(return_X_y=True) 13 | self.X = FPS(n_to_select=10).fit(self.X).transform(self.X) 14 | 15 | def test_bad_transform(self): 16 | selector = CUR(n_to_select=2) 17 | with self.assertRaises(exceptions.NotFittedError): 18 | _ = selector.transform(self.X) 19 | 20 | def test_restart(self): 21 | """Check that the model can be restarted with a new instance.""" 22 | ref_selector = CUR(n_to_select=self.X.shape[-1] - 3).fit(X=self.X) 23 | ref_idx = ref_selector.selected_idx_ 24 | 25 | selector = CUR(n_to_select=1) 26 | selector.fit(self.X) 27 | 28 | for i in range(self.X.shape[-1] - 3): 29 | selector.n_to_select += 1 30 | selector.fit(self.X, warm_start=True) 31 | self.assertEqual(selector.selected_idx_[i], ref_idx[i]) 32 | 33 | def test_non_it(self): 34 | """Check that the model can be run non-iteratively.""" 35 | C = self.X.T @ self.X 36 | _, UC = np.linalg.eigh(C) 37 | ref_idx = np.argsort(-(UC[:, -1] ** 2.0))[:-1] 38 | 39 | selector = CUR(n_to_select=self.X.shape[-1] - 1, recompute_every=0) 40 | selector.fit(self.X) 41 | 42 | self.assertTrue(np.allclose(selector.selected_idx_, ref_idx)) 43 | 44 | 45 | if __name__ == "__main__": 46 | unittest.main(verbosity=2) 47 | -------------------------------------------------------------------------------- /tests/test_feature_simple_fps.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import numpy as np 4 | from sklearn.datasets import load_diabetes as get_dataset 5 | from sklearn.utils.validation import NotFittedError 6 | 7 | from skmatter.feature_selection import FPS 8 | 9 | 10 | class TestFPS(unittest.TestCase): 11 | def setUp(self): 12 | self.X, _ = get_dataset(return_X_y=True) 13 | self.idx = [0, 6, 1, 2, 4, 9, 3] 14 | 15 | def test_restart(self): 16 | """ 17 | Check that the model can be restarted with a new number of 18 | features and `warm_start` 19 | """ 20 | 
selector = FPS(n_to_select=1, initialize=self.idx[0]) 21 | selector.fit(self.X) 22 | 23 | for i in range(2, len(self.idx)): 24 | selector.n_to_select = i 25 | selector.fit(self.X, warm_start=True) 26 | self.assertEqual(selector.selected_idx_[i - 1], self.idx[i - 1]) 27 | 28 | def test_initialize(self): 29 | """Check that the model can be initialized in all applicable manners and throws 30 | an error otherwise. 31 | """ 32 | for initialize in [self.idx[0], "random"]: 33 | with self.subTest(initialize=initialize): 34 | selector = FPS(n_to_select=1, initialize=initialize) 35 | selector.fit(self.X) 36 | 37 | initialize = self.idx[:4] 38 | with self.subTest(initialize=initialize): 39 | selector = FPS(n_to_select=len(self.idx) - 1, initialize=initialize) 40 | selector.fit(self.X) 41 | for i in range(4): 42 | self.assertEqual(selector.selected_idx_[i], self.idx[i]) 43 | 44 | initialize = np.array(self.idx[:4]) 45 | with self.subTest(initialize=initialize): 46 | selector = FPS(n_to_select=len(self.idx) - 1, initialize=initialize) 47 | selector.fit(self.X) 48 | for i in range(4): 49 | self.assertEqual(selector.selected_idx_[i], self.idx[i]) 50 | 51 | initialize = np.array([1, 5, 3, 0.25]) 52 | with self.subTest(initialize=initialize): 53 | with self.assertRaises(ValueError) as cm: 54 | selector = FPS(n_to_select=len(self.idx) - 1, initialize=initialize) 55 | selector.fit(self.X) 56 | self.assertEqual( 57 | str(cm.exception), "Invalid value of the initialize parameter" 58 | ) 59 | 60 | initialize = np.array([[1, 5, 3], [2, 4, 6]]) 61 | with self.subTest(initialize=initialize): 62 | with self.assertRaises(ValueError) as cm: 63 | selector = FPS(n_to_select=len(self.idx) - 1, initialize=initialize) 64 | selector.fit(self.X) 65 | self.assertEqual( 66 | str(cm.exception), "Invalid value of the initialize parameter" 67 | ) 68 | 69 | with self.assertRaises(ValueError) as cm: 70 | selector = FPS(n_to_select=1, initialize="bad") 71 | selector.fit(self.X) 72 | self.assertEqual(str(cm.exception), "Invalid value of the initialize parameter") 73 | 74 | def test_get_distances(self): 75 | """Check that the hausdorff distances are returnable after fitting.""" 76 | selector = FPS(n_to_select=7) 77 | selector.fit(self.X) 78 | d = selector.get_select_distance() 79 | 80 | dist_grad = d[1:-1] - d[2:] 81 | self.assertTrue(all(dist_grad > 0)) 82 | 83 | with self.assertRaises(NotFittedError): 84 | selector = FPS(n_to_select=7) 85 | _ = selector.get_select_distance() 86 | 87 | 88 | if __name__ == "__main__": 89 | unittest.main(verbosity=2) 90 | -------------------------------------------------------------------------------- /tests/test_greedy_selector.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import numpy as np 4 | from sklearn.datasets import load_diabetes as get_dataset 5 | from sklearn.exceptions import NotFittedError 6 | 7 | from skmatter._selection import GreedySelector 8 | 9 | 10 | class GreedyTester(GreedySelector): 11 | def __init__( 12 | self, n_to_select=None, score_threshold=None, selection_type="feature", **kwargs 13 | ): 14 | super().__init__( 15 | selection_type=selection_type, 16 | n_to_select=n_to_select, 17 | score_threshold=score_threshold, 18 | **kwargs, 19 | ) 20 | 21 | def score(self, X, y=None): 22 | scores = np.linalg.norm(X, axis=0) 23 | scores[self.selected_idx_] = 0.0 24 | return scores 25 | 26 | 27 | class TestGreedy(unittest.TestCase): 28 | def setUp(self): 29 | self.X, _ = get_dataset(return_X_y=True) 30 | 31 | def 
test_bad_type(self):
32 |         with self.assertRaises(
33 |             ValueError, msg="Only feature and sample selection supported."
34 |         ):
35 |             _ = GreedyTester(selection_type="bad").fit(self.X)
36 | 
37 |     def test_score_threshold(self):
38 |         selector = GreedyTester(score_threshold=200, n_to_select=7)
39 |         with self.assertWarns(
40 |             Warning, msg="Score threshold of 200 reached. Terminating search at 6 / 7."
41 |         ):
42 |             selector.fit(self.X)
43 | 
44 |     def test_score_threshold_and_full(self):
45 |         with self.assertRaises(ValueError) as cm:
46 |             _ = GreedyTester(score_threshold=20, full=True, n_to_select=12).fit(self.X)
47 |         self.assertEqual(
48 |             str(cm.exception),
49 |             "You cannot specify both `score_threshold` and `full=True`.",
50 |         )
51 | 
52 |     def test_bad_score_threshold_type(self):
53 |         with self.assertRaises(ValueError) as cm:
54 |             _ = GreedyTester(score_threshold_type="bad").fit(self.X)
55 |         self.assertEqual(
56 |             str(cm.exception),
57 |             "invalid score_threshold_type, expected one of 'relative' or 'absolute'",
58 |         )
59 | 
60 |     def test_bad_warm_start(self):
61 |         selector = GreedyTester()
62 |         with self.assertRaises(ValueError) as cm:
63 |             selector.fit(self.X, warm_start=True)
64 |         self.assertEqual(
65 |             str(cm.exception),
66 |             "Cannot fit with warm_start=True without having been previously "
67 |             "initialized",
68 |         )
69 | 
70 |     def test_bad_y(self):
71 |         self.X, self.Y = get_dataset(return_X_y=True)
72 |         Y = self.Y[:2]
73 |         selector = GreedyTester(n_to_select=2)
74 |         with self.assertRaises(ValueError):
75 |             selector.fit(X=self.X, y=Y)
76 | 
77 |     def test_bad_transform(self):
78 |         selector = GreedyTester(n_to_select=2)
79 |         selector.fit(self.X)
80 |         with self.assertRaises(ValueError) as cm:
81 |             _ = selector.transform(self.X[:, :3])
82 |         self.assertEqual(
83 |             str(cm.exception),
84 |             "X has 3 features, but GreedyTester is expecting 10 features as input.",
85 |         )
86 | 
87 |     def test_no_nfeatures(self):
88 |         selector = GreedyTester()
89 |         selector.fit(self.X)
90 |         self.assertEqual(len(selector.selected_idx_), self.X.shape[1] // 2)
91 | 
92 |     def test_decimal_nfeatures(self):
93 |         selector = GreedyTester(n_to_select=0.2)
94 |         selector.fit(self.X)
95 |         self.assertEqual(len(selector.selected_idx_), int(self.X.shape[1] * 0.2))
96 | 
97 |     def test_bad_nfeatures(self):
98 |         for nf in [1.2, "1", 20]:
99 |             with self.subTest(n_features=nf):
100 |                 selector = GreedyTester(n_to_select=nf)
101 |                 with self.assertRaises(ValueError) as cm:
102 |                     selector.fit(self.X)
103 |                 self.assertEqual(
104 |                     str(cm.exception),
105 |                     (
106 |                         "n_to_select must be either None, an integer in "
107 |                         "[1, n_features] representing the absolute number "
108 |                         "of features, or a float in (0, 1] representing a "
109 |                         f"percentage of features to select. Got {nf} "
110 |                         f"features and an input with {self.X.shape[1]} feature."
111 | ), 112 | ) 113 | 114 | def test_not_fitted(self): 115 | with self.assertRaises(NotFittedError): 116 | selector = GreedyTester() 117 | _ = selector._get_support_mask() 118 | 119 | def test_fitted(self): 120 | selector = GreedyTester() 121 | selector.fit(self.X) 122 | _ = selector._get_support_mask() 123 | 124 | Xr = selector.transform(self.X) 125 | self.assertEqual(Xr.shape[1], self.X.shape[1] // 2) 126 | 127 | def test_size_input(self): 128 | X = np.array([1, 2, 3, 4, 5]).reshape(-1, 1) 129 | selector_sample = GreedyTester(selection_type="sample") 130 | selector_feature = GreedyTester(selection_type="feature") 131 | with self.assertRaises(ValueError) as cm: 132 | selector_feature.fit(X) 133 | self.assertEqual( 134 | str(cm.exception), 135 | f"Found array with 1 feature(s) (shape={X.shape}) while a minimum of 2 is " 136 | "required by GreedyTester.", 137 | ) 138 | 139 | X = X.reshape(1, -1) 140 | 141 | with self.assertRaises(ValueError) as cm: 142 | selector_sample.fit(X) 143 | self.assertEqual( 144 | str(cm.exception), 145 | f"Found array with 1 sample(s) (shape={X.shape}) while a minimum of 2 is " 146 | "required by GreedyTester.", 147 | ) 148 | 149 | 150 | if __name__ == "__main__": 151 | unittest.main(verbosity=2) 152 | -------------------------------------------------------------------------------- /tests/test_kernel_normalizer.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import numpy as np 4 | import sklearn 5 | 6 | from skmatter.preprocessing import KernelNormalizer 7 | 8 | 9 | class KernelTests(unittest.TestCase): 10 | def __init__(self, *args, **kwargs): 11 | super().__init__(*args, **kwargs) 12 | self.random_state = np.random.RandomState(0) 13 | 14 | def test_sample_weights(self): 15 | """Checks that sample weights of one are equal to the unweighted case and 16 | that nonuniform weights are different from the unweighted case. 17 | """ 18 | K = self.random_state.uniform(0, 100, size=(3, 3)) 19 | equal_wts = np.ones(len(K)) 20 | nonequal_wts = self.random_state.uniform(0, 100, size=(len(K),)) 21 | model = KernelNormalizer() 22 | weighted_model = KernelNormalizer() 23 | K_unweighted = model.fit_transform(K) 24 | K_equal_weighted = weighted_model.fit_transform(K, sample_weight=equal_wts) 25 | self.assertTrue((np.isclose(K_unweighted, K_equal_weighted, atol=1e-12)).all()) 26 | K_nonequal_weighted = weighted_model.fit_transform( 27 | K, sample_weight=nonequal_wts 28 | ) 29 | self.assertFalse( 30 | (np.isclose(K_unweighted, K_nonequal_weighted, atol=1e-12)).all() 31 | ) 32 | 33 | def test_invalid_sample_weights(self): 34 | """Checks that weights must be 1D array with the same length as the number of 35 | samples. 36 | """ 37 | K = self.random_state.uniform(0, 100, size=(3, 3)) 38 | wts_len = np.ones(len(K) + 1) 39 | wts_dim = np.ones((len(K), 2)) 40 | model = KernelNormalizer() 41 | with self.assertRaises(ValueError): 42 | model.fit_transform(K, sample_weight=wts_len) 43 | with self.assertRaises(ValueError): 44 | model.fit_transform(K, sample_weight=wts_dim) 45 | 46 | def test_ValueError(self): 47 | """Checks that a non-square matrix cannot be normalized.""" 48 | K = self.random_state.uniform(0, 100, size=(3, 4)) 49 | model = KernelNormalizer() 50 | with self.assertRaises(ValueError): 51 | model.fit(K) 52 | 53 | def test_reference_ValueError(self): 54 | """Checks that it is impossible to normalize a matrix with a non-coincident 55 | size with the reference. 
56 | """ 57 | K = self.random_state.uniform(0, 100, size=(3, 3)) 58 | K_2 = self.random_state.uniform(0, 100, size=(2, 2)) 59 | model = KernelNormalizer() 60 | model = model.fit(K) 61 | with self.assertRaises(ValueError): 62 | model.transform(K_2) 63 | 64 | def test_NotFittedError_transform(self): 65 | """Checks that an error is returned when trying to use the transform function 66 | before the fit function. 67 | """ 68 | K = self.random_state.uniform(0, 100, size=(3, 3)) 69 | model = KernelNormalizer() 70 | with self.assertRaises(sklearn.exceptions.NotFittedError): 71 | model.transform(K) 72 | 73 | def test_fit_transform(self): 74 | """Checks that the kernel is correctly normalized. 75 | 76 | Compare with the value calculated directly from the equation. 77 | """ 78 | K = self.random_state.uniform(0, 100, size=(3, 3)) 79 | model = KernelNormalizer() 80 | Ktr = model.fit_transform(K) 81 | Kc = K - K.mean(axis=0) - K.mean(axis=1)[:, np.newaxis] + K.mean() 82 | Kc /= np.trace(Kc) / Kc.shape[0] 83 | 84 | self.assertTrue((np.isclose(Ktr, Kc, atol=1e-12)).all()) 85 | 86 | def test_center_only(self): 87 | """Checks that the kernel is correctly centered, 88 | but not normalized. 89 | Compare with the value calculated 90 | directly from the equation. 91 | """ 92 | K = self.random_state.uniform(0, 100, size=(3, 3)) 93 | model = KernelNormalizer(with_center=True, with_trace=False) 94 | Ktr = model.fit_transform(K) 95 | Kc = K - K.mean(axis=0) - K.mean(axis=1)[:, np.newaxis] + K.mean() 96 | 97 | self.assertTrue((np.isclose(Ktr, Kc, atol=1e-12)).all()) 98 | 99 | def test_trace_only(self): 100 | """Checks that the kernel is correctly normalized, 101 | but not centered. 102 | Compare with the value calculated 103 | directly from the equation. 104 | """ 105 | K = self.random_state.uniform(0, 100, size=(3, 3)) 106 | model = KernelNormalizer(with_center=False, with_trace=True) 107 | Ktr = model.fit_transform(K) 108 | Kc = K.copy() 109 | Kc /= np.trace(Kc) / Kc.shape[0] 110 | 111 | self.assertTrue((np.isclose(Ktr, Kc, atol=1e-12)).all()) 112 | 113 | def test_no_preprocessing(self): 114 | """Checks that the kernel is unchanged 115 | if no preprocessing is specified. 
116 | """ 117 | K = self.random_state.uniform(0, 100, size=(3, 3)) 118 | model = KernelNormalizer(with_center=False, with_trace=False) 119 | Ktr = model.fit_transform(K) 120 | Kc = K.copy() 121 | self.assertTrue((np.isclose(Ktr, Kc, atol=1e-12)).all()) 122 | 123 | 124 | if __name__ == "__main__": 125 | unittest.main() 126 | -------------------------------------------------------------------------------- /tests/test_model_selection.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import sklearn.model_selection 4 | from sklearn.datasets import load_iris 5 | 6 | import skmatter.model_selection 7 | 8 | 9 | class SplitTests(unittest.TestCase): 10 | @classmethod 11 | def setUpClass(cls): 12 | cls.X = load_iris().data[:10] 13 | cls.seed = 0x5F3759DF 14 | 15 | def test_train_test_splits(self): 16 | # see if train_test_split of skmatter agrees with the one of sklearn 17 | sklearn_outputs = sklearn.model_selection.train_test_split( 18 | self.X, random_state=self.seed 19 | ) 20 | skmatter_outputs = skmatter.model_selection.train_test_split( 21 | self.X, random_state=self.seed 22 | ) 23 | for i in range(len(skmatter_outputs)): 24 | self.assertTrue((sklearn_outputs[i] == skmatter_outputs[i]).all()) 25 | 26 | def test_train_test_splits_train_test_overlap(self): 27 | # tests that a test/train split which necessitates overlap returns the right 28 | # number of points in each set 29 | X_train, X_test = skmatter.model_selection.train_test_split( 30 | self.X, 31 | train_size=0.8, 32 | test_size=0.8, 33 | train_test_overlap=True, 34 | random_state=self.seed, 35 | ) 36 | self.assertTrue(len(X_train) == len(X_test) == int(0.8 * self.X.shape[0])) 37 | 38 | def test_train_test_splits_train_test_overlap_full_test_set(self): 39 | # tests that the entire dataset can be used as the testing set 40 | X_train, X_test = skmatter.model_selection.train_test_split( 41 | self.X, 42 | train_size=0.8, 43 | test_size=1.0, 44 | train_test_overlap=True, 45 | random_state=self.seed, 46 | ) 47 | self.assertTrue((self.X == X_test).all()) 48 | 49 | def test_train_test_splits_train_test_overlap_full_train_test_set(self): 50 | # tests that the full dataset can be "split" to both train and test set 51 | X_train, X_test = skmatter.model_selection.train_test_split( 52 | self.X, 53 | train_size=1.0, 54 | test_size=1.0, 55 | train_test_overlap=True, 56 | random_state=self.seed, 57 | ) 58 | self.assertTrue((X_train == X_test).all()) 59 | 60 | 61 | if __name__ == "__main__": 62 | unittest.main() 63 | -------------------------------------------------------------------------------- /tests/test_neighbors.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import numpy as np 4 | 5 | from skmatter.feature_selection import FPS 6 | from skmatter.neighbors import SparseKDE 7 | from skmatter.neighbors._sparsekde import _covariance 8 | from skmatter.utils import effdim, oas 9 | 10 | 11 | class SparseKDETests(unittest.TestCase): 12 | @classmethod 13 | def setUpClass(cls) -> None: 14 | np.random.seed(0) 15 | cls.n_samples_per_cov = 10000 16 | cls.samples = np.concatenate( 17 | [ 18 | np.random.multivariate_normal( 19 | [0, 0], [[1, 0.5], [0.5, 1]], cls.n_samples_per_cov 20 | ), 21 | np.random.multivariate_normal( 22 | [4, 4], [[1, 0.5], [0.5, 0.5]], cls.n_samples_per_cov 23 | ), 24 | ] 25 | ) 26 | cls.sample_results = np.array( 27 | [[4.56393465, 4.20566218], [0.73562454, 1.11116178]] 28 | ) 29 | cls.selector = 
FPS(n_to_select=int(np.sqrt(2 * cls.n_samples_per_cov)))
30 |         cls.grids = cls.selector.fit_transform(cls.samples.T).T
31 |         cls.expect_score_fp = -759.831
32 |         cls.expect_score_fs = -781.567
33 | 
34 |         cls.cell = np.array([4, 4])
35 |         cls.expect_score_periodic = -456.744
36 | 
37 |     def test_sparse_kde(self):
38 |         estimator = SparseKDE(self.samples, None, fpoints=0.5)
39 |         estimator.fit(self.grids)
40 |         self.assertTrue(round(estimator.score(self.grids), 3) == self.expect_score_fp)
41 |         self.assertTrue(np.allclose(estimator.sample(2), self.sample_results))
42 | 
43 |     def test_sparse_kde_fs(self):
44 |         estimator = SparseKDE(self.samples, None, fspread=0.5)
45 |         estimator.fit(self.grids)
46 |         self.assertTrue(round(estimator.score(self.grids), 3) == self.expect_score_fs)
47 | 
48 |     def test_sparse_kde_periodic(self):
49 |         estimator = SparseKDE(
50 |             self.samples,
51 |             None,
52 |             metric_params={"cell_length": self.cell},
53 |             fpoints=0.5,
54 |         )
55 |         estimator.fit(self.grids)
56 |         self.assertTrue(
57 |             round(estimator.score(self.grids), 3) == self.expect_score_periodic
58 |         )
59 | 
60 |     def test_dimension_check(self):
61 |         estimator = SparseKDE(
62 |             self.samples, None, metric_params={"cell_length": self.cell}, fpoints=0.5
63 |         )
64 |         self.assertRaises(ValueError, estimator.fit, np.array([[4]]))
65 | 
66 |     def test_fs_fp_incompatibility(self):
67 |         estimator = SparseKDE(
68 |             self.samples,
69 |             None,
70 |             metric_params={"cell_length": self.cell},
71 |             fspread=2,
72 |             fpoints=0.5,
73 |         )
74 |         # fspread takes precedence: setting it disables fpoints (reset to -1)
75 |         self.assertTrue(estimator.fpoints == -1)
76 | 
77 | 
78 | class CovarianceTests(unittest.TestCase):
79 |     @classmethod
80 |     def setUpClass(cls):
81 |         cls.X = np.array([[1, 2], [3, 3], [4, 6]])
82 |         cls.expected_cov = np.array(
83 |             [[2.33333333, 2.83333333], [2.83333333, 4.33333333]]
84 |         )
85 |         cls.expected_cov_periodic = np.array(
86 |             [[1.12597216, 0.45645371], [0.45645371, 0.82318948]]
87 |         )
88 |         cls.cell = np.array([3, 3])
89 | 
90 |     def test_covariance(self):
91 |         cov = _covariance(self.X, np.full(len(self.X), 1 / len(self.X)), None)
92 |         self.assertTrue(np.allclose(cov, self.expected_cov))
93 | 
94 |     def test_covariance_periodic(self):
95 |         cov = _covariance(self.X, np.full(len(self.X), 1 / len(self.X)), self.cell)
96 |         self.assertTrue(np.allclose(cov, self.expected_cov_periodic))
97 | 
98 | 
99 | class EffdimTests(unittest.TestCase):
100 |     @classmethod
101 |     def setUpClass(cls):
102 |         cls.cov = np.array([[1, 1, 0], [1, 1.5, 0], [0, 0, 1]], dtype=np.float64)
103 |         cls.expected_effdim = 2.24909102090124
104 | 
105 |     def test_effdim(self):
106 |         self.assertTrue(np.allclose(effdim(self.cov), self.expected_effdim))
107 | 
108 | 
109 | class OASTests(unittest.TestCase):
110 |     @classmethod
111 |     def setUpClass(cls):
112 |         cls.cov = np.array([[0.5, 1.0], [0.7, 0.4]])
113 |         cls.n = 10
114 |         cls.D = 2
115 |         cls.expected_oas = np.array(
116 |             [[0.48903924, 0.78078484], [0.54654939, 0.41096076]]
117 |         )
118 | 
119 |     def test_oas(self):
120 |         self.assertTrue(np.allclose(oas(self.cov, self.n, self.D), self.expected_oas))
121 | 
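# --- Illustrative sketch (not part of the test suite): the SparseKDE
# workflow exercised above, on arbitrary data. Grid points are selected with
# FPS, the estimator is fitted on them, and the fit is scored and sampled.
#
#     import numpy as np
#     from skmatter.feature_selection import FPS
#     from skmatter.neighbors import SparseKDE
#
#     samples = np.random.default_rng(0).normal(size=(1000, 2))
#     grids = FPS(n_to_select=40).fit_transform(samples.T).T
#     kde = SparseKDE(samples, None, fpoints=0.5)
#     kde.fit(grids)
#     log_like = kde.score(grids)  # log-likelihood of the grid points
#     draws = kde.sample(2)        # draw two new points from the density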
--------------------------------------------------------------------------------
/tests/test_pcovr_distances.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | 
3 | import numpy as np
4 | import scipy
5 | from sklearn.datasets import load_diabetes as get_dataset
6 | 
7 | from skmatter.utils import pcovr_covariance, pcovr_kernel
8 | 
9 | 
10 | class CovarianceTest(unittest.TestCase):
11 |     def __init__(self, *args, **kwargs):
12 |         super().__init__(*args, **kwargs)
13 |         self.X, self.Y = get_dataset(return_X_y=True)
14 | 
15 |     def test_alphas(self):
16 |         C_X = self.X.T @ self.X
17 | 
18 |         C_inv = np.linalg.pinv(C_X, rcond=1e-12)
19 |         C_isqrt = np.real(scipy.linalg.sqrtm(C_inv))
20 | 
21 |         # parentheses speed up calculation greatly
22 |         C_Y = C_isqrt @ (self.X.T @ self.Y)
23 |         C_Y = C_Y.reshape((C_X.shape[0], -1))
24 |         C_Y = np.real(C_Y)
25 |         C_Y = C_Y @ C_Y.T
26 | 
27 |         for alpha in [0.0, 0.5, 1.0]:
28 |             with self.subTest(alpha=alpha):
29 |                 C = pcovr_covariance(alpha, X=self.X, Y=self.Y, rcond=1e-6)
30 |                 self.assertTrue(np.allclose(C, alpha * C_X + (1 - alpha) * C_Y))
31 | 
32 |     def test_no_return_isqrt(self):
33 |         with self.assertRaises(ValueError):
34 |             _, _ = pcovr_covariance(0.5, self.X, self.Y, return_isqrt=False)
35 | 
36 |     def test_inverse_covariance(self):
37 |         rcond = 1e-12
38 |         rng = np.random.default_rng(0)
39 | 
40 |         # Make some random data where the last feature
41 |         # is a linear combination of the other features.
42 |         # This gives us a covariance with a zero eigenvalue
43 |         # that should be dropped (via rcond).
44 |         # Hence, the inverse square root covariance
45 |         # should be identical between the "full"
46 |         # computation (eigh) and the approximate
47 |         # computation that takes the top n_features-1
48 |         # singular values (randomized svd).
49 |         X = rng.random((10, 5))
50 |         Y = rng.random(10)
51 |         x = rng.random(5)
52 |         Xx = np.column_stack((X, np.sum(X * x, axis=1)))
53 |         Xx -= np.mean(Xx, axis=0)
54 | 
55 |         C_inv = np.linalg.pinv(Xx.T @ Xx, rcond=rcond)
56 |         C_isqrt = np.real(scipy.linalg.sqrtm(C_inv))
57 | 
58 |         _, C_isqrt_eigh = pcovr_covariance(0.5, Xx, Y, return_isqrt=True, rcond=rcond)
59 |         _, C_isqrt_svd = pcovr_covariance(
60 |             0.5, Xx, Y, return_isqrt=True, rank=min(Xx.shape) - 1, rcond=rcond
61 |         )
62 | 
63 |         for C, C_type in zip([C_isqrt_eigh, C_isqrt_svd], ["eigh", "svd"]):
64 |             with self.subTest(C_isqrt_type=C_type):
65 |                 self.assertTrue(np.allclose(C_isqrt, C))
66 | 
67 | 
68 | class KernelTest(unittest.TestCase):
69 |     def __init__(self, *args, **kwargs):
70 |         super().__init__(*args, **kwargs)
71 |         self.X, self.Y = get_dataset(return_X_y=True)
72 | 
73 |     def test_alphas(self):
74 |         K_X = self.X @ self.X.T
75 |         K_Y = self.Y @ self.Y.T
76 | 
77 |         for alpha in [0.0, 0.5, 1.0]:
78 |             with self.subTest(alpha=alpha):
79 |                 K = pcovr_kernel(alpha, self.X, self.Y)
80 |                 self.assertTrue(np.allclose(K, alpha * K_X + (1 - alpha) * K_Y))
81 | 
82 | 
83 | if __name__ == "__main__":
84 |     unittest.main(verbosity=2)
85 | 
--------------------------------------------------------------------------------
/tests/test_progress_bar.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | 
3 | from skmatter.utils import get_progress_bar
4 | 
5 | 
6 | class PBarTest(unittest.TestCase):
7 |     def test_no_tqdm(self):
8 |         """Check that the model cannot use a progress bar when tqdm is not installed."""
9 |         import sys
10 | 
11 |         sys.modules["tqdm"] = None
12 | 
13 |         with self.assertRaises(ImportError) as cm:
14 |             _ = get_progress_bar()
15 |         self.assertEqual(
16 |             str(cm.exception),
17 |             "tqdm must be installed to use a progress bar. 
Either install tqdm or " 18 | "re-run with progress_bar = False", 19 | ) 20 | 21 | 22 | if __name__ == "__main__": 23 | unittest.main(verbosity=2) 24 | -------------------------------------------------------------------------------- /tests/test_sample_pcov_cur.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import numpy as np 4 | from sklearn.datasets import load_diabetes as get_dataset 5 | 6 | from skmatter.sample_selection import PCovCUR 7 | 8 | 9 | EPSILON = 1e-6 10 | 11 | 12 | class TestPCovCUR(unittest.TestCase): 13 | def setUp(self): 14 | self.X, self.y = get_dataset(return_X_y=True) 15 | self.X = self.X[:, :4] 16 | self.idx = [256, 304, 41, 408, 311, 364, 152, 78, 359, 102] 17 | 18 | def test_known(self): 19 | """Check that the model returns a known set of indices.""" 20 | selector = PCovCUR(n_to_select=10, mixing=0.5) 21 | selector.fit(self.X, self.y) 22 | 23 | self.assertTrue(np.allclose(selector.selected_idx_, self.idx)) 24 | 25 | def test_restart(self): 26 | """Check that the model can be restarted with a new instance.""" 27 | selector = PCovCUR(n_to_select=1, mixing=0.5) 28 | selector.fit(self.X, self.y) 29 | 30 | for i in range(len(self.idx) - 2): 31 | selector.n_to_select += 1 32 | selector.fit(self.X, self.y, warm_start=True) 33 | self.assertEqual(selector.selected_idx_[i], self.idx[i]) 34 | 35 | self.assertLessEqual( 36 | np.linalg.norm(selector.X_current_[self.idx[i]]), EPSILON 37 | ) 38 | 39 | for j in range(self.X.shape[0]): 40 | self.assertLessEqual( 41 | np.dot(selector.X_current_[self.idx[i]], selector.X_current_[j]), 42 | EPSILON, 43 | ) 44 | 45 | def test_non_it(self): 46 | """Check that the model can be run non-iteratively.""" 47 | self.idx = [256, 32, 138, 290, 362, 141, 359, 428, 254, 9] 48 | selector = PCovCUR(n_to_select=10, recompute_every=0) 49 | selector.fit(self.X, self.y) 50 | 51 | self.assertTrue(np.allclose(selector.selected_idx_, self.idx)) 52 | 53 | def test_multiple_k(self): 54 | """Check that the model can be run with multiple k's.""" 55 | for k in list(set(np.logspace(0, np.log10(min(self.X.shape)), 4, dtype=int))): 56 | selector = PCovCUR(n_to_select=10, k=k) 57 | selector.fit(self.X, self.y) 58 | 59 | 60 | if __name__ == "__main__": 61 | unittest.main(verbosity=2) 62 | -------------------------------------------------------------------------------- /tests/test_sample_pcov_fps.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from sklearn.datasets import load_diabetes as get_dataset 4 | 5 | from skmatter.sample_selection import PCovFPS 6 | 7 | 8 | class TestPCovFPS(unittest.TestCase): 9 | def setUp(self): 10 | self.X, self.y = get_dataset(return_X_y=True) 11 | self.idx = [0, 256, 156, 324, 349, 77, 113, 441, 426, 51] 12 | 13 | def test_restart(self): 14 | """Check that the model can be restarted with a new number of samples and 15 | `warm_start`. 
16 | """ 17 | selector = PCovFPS(n_to_select=1, initialize=self.idx[0]) 18 | selector.fit(self.X, y=self.y) 19 | 20 | for i in range(2, len(self.idx)): 21 | selector.n_to_select = i 22 | selector.fit(self.X, y=self.y, warm_start=True) 23 | self.assertEqual(selector.selected_idx_[i - 1], self.idx[i - 1]) 24 | 25 | def test_no_mixing_1(self): 26 | """Check that the model throws an error when mixing = 1.0.""" 27 | selector = PCovFPS(n_to_select=1, mixing=1.0) 28 | with self.assertRaises(ValueError) as cm: 29 | selector.fit(self.X, y=self.y) 30 | self.assertEqual( 31 | str(cm.exception), 32 | "Mixing = 1.0 corresponds to traditional FPS. Please use the FPS class.", 33 | ) 34 | 35 | 36 | if __name__ == "__main__": 37 | unittest.main(verbosity=2) 38 | -------------------------------------------------------------------------------- /tests/test_sample_simple_cur.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import numpy as np 4 | from sklearn.datasets import fetch_california_housing as load 5 | 6 | from skmatter.sample_selection import CUR, FPS 7 | 8 | 9 | class TestCUR(unittest.TestCase): 10 | def setUp(self): 11 | self.X, _ = load(return_X_y=True) 12 | self.X = self.X[FPS(n_to_select=100).fit(self.X).selected_idx_] 13 | self.n_select = min(20, min(self.X.shape) // 2) 14 | 15 | def test_sample_transform(self): 16 | """ 17 | Check that an error is raised when the transform function is used, 18 | because sklearn does not support well transformers that change the number 19 | of samples with other classes like Pipeline 20 | """ 21 | selector = CUR(n_to_select=1) 22 | selector.fit(self.X) 23 | with self.assertRaises(ValueError) as error: 24 | _ = selector.transform(self.X) 25 | 26 | self.assertTrue( 27 | "Transform is not currently supported for sample selection." 28 | == str(error.exception) 29 | ) 30 | 31 | def test_restart(self): 32 | """Check that the model can be restarted with a new instance""" 33 | ref_selector = CUR(n_to_select=self.n_select) 34 | ref_idx = ref_selector.fit(self.X).selected_idx_ 35 | 36 | selector = CUR(n_to_select=1) 37 | selector.fit(self.X) 38 | 39 | for i in range(len(ref_idx) - 2): 40 | selector.n_to_select += 1 41 | selector.fit(self.X, warm_start=True) 42 | self.assertEqual(selector.selected_idx_[i], ref_idx[i]) 43 | 44 | def test_non_it(self): 45 | """Check that the model can be run non-iteratively.""" 46 | K = self.X @ self.X.T 47 | _, UK = np.linalg.eigh(K) 48 | ref_idx = np.argsort(-(UK[:, -1] ** 2.0))[: self.n_select] 49 | 50 | selector = CUR(n_to_select=len(ref_idx), recompute_every=0) 51 | selector.fit(self.X) 52 | 53 | self.assertTrue(np.allclose(selector.selected_idx_, ref_idx)) 54 | 55 | 56 | if __name__ == "__main__": 57 | unittest.main(verbosity=2) 58 | -------------------------------------------------------------------------------- /tests/test_sample_simple_fps.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import numpy as np 4 | from sklearn.datasets import load_diabetes as get_dataset 5 | from sklearn.utils.validation import NotFittedError 6 | 7 | from skmatter.sample_selection import FPS 8 | 9 | 10 | class TestFPS(unittest.TestCase): 11 | def setUp(self): 12 | self.X, _ = get_dataset(return_X_y=True) 13 | self.idx = [0, 123, 441, 187, 117, 276, 261, 281, 251, 193] 14 | 15 | def test_restart(self): 16 | """Checks that the model can be restarted with a new number of samples and 17 | `warm_start`. 
18 | """ 19 | selector = FPS(n_to_select=1, initialize=self.idx[0]) 20 | selector.fit(self.X) 21 | 22 | for i in range(2, len(self.idx)): 23 | selector.n_to_select = i 24 | selector.fit(self.X, warm_start=True) 25 | self.assertEqual(selector.selected_idx_[i - 1], self.idx[i - 1]) 26 | 27 | def test_initialize(self): 28 | """Checks that the model can be initialized in all applicable manners and throws 29 | an error otherwise. 30 | """ 31 | for initialize in [self.idx[0], "random"]: 32 | with self.subTest(initialize=initialize): 33 | selector = FPS(n_to_select=1, initialize=initialize) 34 | selector.fit(self.X) 35 | 36 | initialize = self.idx[:4] 37 | with self.subTest(initialize=initialize): 38 | selector = FPS(n_to_select=len(self.idx) - 1, initialize=initialize) 39 | selector.fit(self.X) 40 | for i in range(4): 41 | self.assertEqual(selector.selected_idx_[i], self.idx[i]) 42 | 43 | initialize = np.array(self.idx[:4]) 44 | with self.subTest(initialize=initialize): 45 | selector = FPS(n_to_select=len(self.idx) - 1, initialize=initialize) 46 | selector.fit(self.X) 47 | for i in range(4): 48 | self.assertEqual(selector.selected_idx_[i], self.idx[i]) 49 | 50 | initialize = np.array([1, 5, 3, 0.25]) 51 | with self.subTest(initialize=initialize): 52 | with self.assertRaises(ValueError) as cm: 53 | selector = FPS(n_to_select=len(self.idx) - 1, initialize=initialize) 54 | selector.fit(self.X) 55 | self.assertEqual( 56 | str(cm.exception), "Invalid value of the initialize parameter" 57 | ) 58 | 59 | initialize = np.array([[1, 5, 3], [2, 4, 6]]) 60 | with self.subTest(initialize=initialize): 61 | with self.assertRaises(ValueError) as cm: 62 | selector = FPS(n_to_select=len(self.idx) - 1, initialize=initialize) 63 | selector.fit(self.X) 64 | self.assertEqual( 65 | str(cm.exception), "Invalid value of the initialize parameter" 66 | ) 67 | 68 | with self.assertRaises(ValueError) as cm: 69 | selector = FPS(n_to_select=1, initialize="bad") 70 | selector.fit(self.X) 71 | self.assertEqual(str(cm.exception), "Invalid value of the initialize parameter") 72 | 73 | def test_get_distances(self): 74 | """Checks that the hausdorff distances are returnable after fitting.""" 75 | selector = FPS(n_to_select=1) 76 | selector.fit(self.X) 77 | _ = selector.get_select_distance() 78 | 79 | with self.assertRaises(NotFittedError): 80 | selector = FPS(n_to_select=1) 81 | _ = selector.get_select_distance() 82 | 83 | def test_threshold(self): 84 | selector = FPS( 85 | n_to_select=10, 86 | score_threshold=5e-2, 87 | score_threshold_type="absolute", 88 | ) 89 | selector.fit(self.X) 90 | self.assertEqual(len(selector.selected_idx_), 6) 91 | self.assertEqual(selector.selected_idx_.tolist(), self.idx[:6]) 92 | 93 | selector = FPS( 94 | n_to_select=10, 95 | score_threshold=0.4, 96 | score_threshold_type="relative", 97 | ) 98 | selector.fit(self.X) 99 | self.assertEqual(len(selector.selected_idx_), 5) 100 | self.assertEqual(selector.selected_idx_.tolist(), self.idx[:5]) 101 | 102 | 103 | if __name__ == "__main__": 104 | unittest.main(verbosity=2) 105 | -------------------------------------------------------------------------------- /tests/test_voronoi_fps.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import numpy as np 4 | from sklearn.exceptions import NotFittedError 5 | from test_sample_simple_fps import TestFPS 6 | 7 | from skmatter.sample_selection import FPS, VoronoiFPS 8 | 9 | 10 | class TestVoronoiFPS(TestFPS): 11 | def setUp(self): 12 | super().setUp() 13 | 
14 |     def test_restart(self):
15 |         """Checks that the model can be restarted with a new number of
16 |         samples and `warm_start`
17 |         """
18 |         selector = VoronoiFPS(n_to_select=1, initialize=self.idx[0])
19 |         selector.fit(self.X)
20 | 
21 |         for i in range(2, len(self.idx)):
22 |             selector.n_to_select = i
23 |             selector.fit(self.X, warm_start=True)
24 |             self.assertEqual(selector.selected_idx_[i - 1], self.idx[i - 1])
25 | 
26 |     def test_initialize(self):
27 |         """Checks that the model can be initialized in all applicable manners
28 |         and throws an error otherwise
29 |         """
30 |         for initialize in [self.idx[0], "random"]:
31 |             with self.subTest(initialize=initialize):
32 |                 selector = VoronoiFPS(n_to_select=1, initialize=initialize)
33 |                 selector.fit(self.X)
34 | 
35 |         with self.assertRaises(ValueError) as cm:
36 |             selector = VoronoiFPS(n_to_select=1, initialize="bad")
37 |             selector.fit(self.X)
38 |         self.assertEqual(str(cm.exception), "Invalid value of the initialize parameter")
39 | 
40 |     def test_switching_point(self):
41 |         """Check the switching-point calculation performed in the
42 |         _init_greedy_search function
43 |         """
44 |         selector = VoronoiFPS(n_to_select=1)
45 |         selector.fit(self.X)
46 |         self.assertTrue(1 > selector.full_fraction)
47 | 
48 |         selector = VoronoiFPS(n_to_select=1, full_fraction=0.5)
49 |         selector.fit(self.X)
50 |         self.assertEqual(selector.full_fraction, 0.5)
51 | 
52 |         with self.subTest(name="bad_ntrial"):
53 |             with self.assertRaises(ValueError) as cm:
54 |                 selector = VoronoiFPS(n_to_select=1, n_trial_calculation=0)
55 |                 selector.fit(self.X)
56 |             self.assertEqual(
57 |                 str(cm.exception),
58 |                 "Number of trial calculation should be more or equal to 1",
59 |             )
60 | 
61 |         with self.subTest(name="float_ntrial"):
62 |             with self.assertRaises(TypeError) as cm:
63 |                 selector = VoronoiFPS(n_to_select=1, n_trial_calculation=0.3)
64 |                 selector.fit(self.X)
65 |             self.assertEqual(
66 |                 str(cm.exception), "Number of trial calculation should be integer"
67 |             )
68 | 
69 |         with self.subTest(name="large_ff"):
70 |             with self.assertRaises(ValueError) as cm:
71 |                 selector = VoronoiFPS(n_to_select=1, full_fraction=1.1)
72 |                 selector.fit(self.X)
73 |             self.assertEqual(
74 |                 str(cm.exception),
75 |                 "Switching point should be real and more than 0 and less than 1. "
76 |                 f"Received {selector.full_fraction}",
77 |             )
78 | 
79 |         with self.subTest(name="string_ff"):
80 |             with self.assertRaises(ValueError) as cm:
81 |                 selector = VoronoiFPS(n_to_select=1, full_fraction="STRING")
82 |                 selector.fit(self.X)
83 |             self.assertEqual(
84 |                 str(cm.exception),
85 |                 "Switching point should be real and more than 0 and less than 1. "
86 |                 f"Received {selector.full_fraction}",
87 |             )
88 | 
89 |     def test_get_distances(self):
90 |         """Checks that the hausdorff distances are returnable after fitting"""
91 |         selector = VoronoiFPS(n_to_select=1)
92 |         selector.fit(self.X)
93 |         _ = selector.get_select_distance()
94 | 
95 |         with self.assertRaises(NotFittedError):
96 |             selector = VoronoiFPS(n_to_select=1)
97 |             _ = selector.get_select_distance()
98 | 
99 |     def test_comparison(self):
100 |         """Checks that Voronoi FPS, which computes far fewer distances,
101 |         selects the same points as its plain FPS counterpart.
102 | """ 103 | vselector = VoronoiFPS(n_to_select=self.X.shape[0] - 1) 104 | vselector.fit(self.X) 105 | 106 | selector = FPS(n_to_select=self.X.shape[0] - 1) 107 | selector.fit(self.X) 108 | 109 | self.assertTrue(np.allclose(vselector.selected_idx_, selector.selected_idx_)) 110 | 111 | def test_nothing_updated_points(self): 112 | """Checks that in the case where we have no points to update, the code 113 | still works fine 114 | """ 115 | X = np.array([[1, 1], [4, 4], [10, 10], [100, 100]]) 116 | selector = VoronoiFPS(n_to_select=3, initialize=0) 117 | try: 118 | selector.fit(X) 119 | f = 1 120 | except Exception: 121 | f = 0 122 | self.assertEqual(f, 1) 123 | 124 | self.assertEqual( 125 | len(np.where(selector.vlocation_of_idx == (selector.n_selected_ - 2))[0]), 1 126 | ) 127 | 128 | def test_calculate_dSL(self): 129 | selector = VoronoiFPS(n_to_select=3) 130 | selector.fit(self.X) 131 | 132 | active_points = np.where( 133 | selector.dSL_[selector.vlocation_of_idx] < selector.hausdorff_ 134 | )[0] 135 | 136 | ap = selector._get_active(self.X, selector.selected_idx_[-1]) 137 | 138 | self.assertTrue( 139 | np.allclose( 140 | active_points, 141 | ap, 142 | ) 143 | ) 144 | 145 | selector = VoronoiFPS(n_to_select=1) 146 | 147 | ap = selector._get_active(self.X, 0) 148 | 149 | self.assertTrue( 150 | np.allclose( 151 | np.arange(self.X.shape[0]), 152 | ap, 153 | ) 154 | ) 155 | 156 | def test_score(self): 157 | """Check that function score return hausdorff distance""" 158 | selector = VoronoiFPS(n_to_select=3, initialize=0) 159 | selector.fit(self.X) 160 | 161 | self.assertTrue( 162 | np.allclose( 163 | selector.hausdorff_, 164 | selector.score(self.X, selector.selected_idx_[-1]), 165 | ) 166 | ) 167 | 168 | 169 | if __name__ == "__main__": 170 | unittest.main(verbosity=2) 171 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = 3 | lint 4 | build 5 | tests 6 | 7 | lint_folders = 8 | "{toxinidir}/src" \ 9 | "{toxinidir}/tests" \ 10 | "{toxinidir}/docs/src/" \ 11 | "{toxinidir}/examples" 12 | 13 | 14 | [testenv:build] 15 | description = Builds the package and checks integrity 16 | 17 | usedevelop = true 18 | deps = 19 | build 20 | check-manifest 21 | twine 22 | allowlist_externals = bash 23 | commands_pre = 24 | bash -c "if [ -e {toxinidir}/dist/*tar.gz ]; then unlink {toxinidir}/dist/*.whl; fi" 25 | bash -c "if [ -e {toxinidir}/dist/*tar.gz ]; then unlink {toxinidir}/dist/*.tar.gz; fi" 26 | commands = 27 | python -m build 28 | twine check dist/*.tar.gz dist/*.whl 29 | check-manifest {toxinidir} 30 | 31 | [testenv:tests] 32 | description = Runs the tests 33 | usedevelop = true 34 | changedir = tests 35 | deps = 36 | ase 37 | parameterized 38 | pytest 39 | pytest-cov 40 | tqdm 41 | 42 | commands = 43 | # Run unit tests 44 | pytest {posargs} 45 | 46 | # Run documentation tests 47 | pytest --doctest-modules --pyargs skmatter {posargs} 48 | 49 | [testenv:lint] 50 | description = Checks the code and doc for programmatic and stylistic errors 51 | skip_install = true 52 | deps = 53 | black 54 | blackdoc 55 | ruff 56 | isort 57 | sphinx-lint 58 | commands = 59 | ruff check {[tox]lint_folders} 60 | black --check --diff {[tox]lint_folders} 61 | blackdoc --check --diff {[tox]lint_folders} 62 | isort --check-only --diff {[tox]lint_folders} 63 | sphinx-lint --enable all --max-line-length 88 \ 64 | -i "{toxinidir}/docs/src/examples" \ 65 | {[tox]lint_folders} 
"{toxinidir}/README.rst" 66 | 67 | [testenv:{format,format-unsafe}] 68 | description = 69 | format: Formats files in working directory. 70 | format-unsafe: Formats files in working directory. Fixes more linter errors 71 | but might alter code logic. Result of this formatting should 72 | be double checked. 73 | skip_install = true 74 | deps = 75 | ruff 76 | black 77 | blackdoc 78 | isort 79 | commands = 80 | format: ruff check --fix {[tox]lint_folders} 81 | format-unsafe: ruff check --fix --unsafe-fixes {[tox]lint_folders} 82 | black {[tox]lint_folders} 83 | blackdoc {[tox]lint_folders} 84 | isort {[tox]lint_folders} 85 | 86 | [testenv:docs] 87 | description = Builds the documentation 88 | usedevelop = true 89 | deps = 90 | -r docs/requirements.txt 91 | # The documentation runs "examples" to produce outputs via sphinx-gallery. 92 | extras = examples 93 | commands = 94 | sphinx-build {posargs:-E} -W -b html docs/src docs/build/html 95 | --------------------------------------------------------------------------------