├── tests ├── __init__.py ├── requirements.txt ├── test_utilities.py └── test_transformer.py ├── releasenotes └── notes │ ├── initial-40d81acd45b0cca8.yaml │ ├── problem-submission-label-68c3670bd04b5c25.yaml │ ├── SelectFromQuadraticModel-b051274f784d6fbf.yaml │ ├── fixed-column-f05e10ff94adfaef.yaml │ ├── Update-the-transformers.py-file-to-work-with-the-NL-solver-0cf03a2e1b7a33d6.yaml │ ├── Updated-README-to-NL-e02d2ae2ae9d6705.yaml │ ├── test_transformer.py-7793a6957af9735a.yaml │ └── correlation-8eaf77b85eeed9ba.yaml ├── requirements.txt ├── setup.py ├── dwave ├── __init__.py └── plugins │ ├── __init__.py │ └── sklearn │ ├── __init__.py │ ├── utilities.py │ └── transformers.py ├── pyproject.toml ├── .gitignore ├── .circleci └── config.yml ├── README.md └── LICENSE /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/requirements.txt: -------------------------------------------------------------------------------- 1 | coverage 2 | codecov 3 | 4 | parameterized==0.9.0 -------------------------------------------------------------------------------- /releasenotes/notes/initial-40d81acd45b0cca8.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | prelude: > 3 | Initial release of ``dwave-scikit-learn-plugin``. 4 | -------------------------------------------------------------------------------- /releasenotes/notes/problem-submission-label-68c3670bd04b5c25.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | features: 3 | - Apply a label to the problems submitted to Leap. 
4 | -------------------------------------------------------------------------------- /releasenotes/notes/SelectFromQuadraticModel-b051274f784d6fbf.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | features: 3 | - Add ``SelectFromQuadraticModel`` class for feature selection. 4 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | dimod==0.12.20 2 | dwave-optimization==0.6.2 3 | dwave-system==1.32.0 4 | 5 | numpy==2.0.2 6 | scikit-learn==1.6.1 7 | 8 | reno==3.5.0 9 | -------------------------------------------------------------------------------- /releasenotes/notes/fixed-column-f05e10ff94adfaef.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | fixes: 3 | - Fix ``SelectFromQuadraticModel`` to handle the case when ``X`` has a column with all equal values. 4 | -------------------------------------------------------------------------------- /releasenotes/notes/Update-the-transformers.py-file-to-work-with-the-NL-solver-0cf03a2e1b7a33d6.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | upgrade: 3 | - | 4 | Update the entire file to work with the NL solver 5 | -------------------------------------------------------------------------------- /releasenotes/notes/Updated-README-to-NL-e02d2ae2ae9d6705.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | other: 3 | - Updated README.md to include usage instructions and explanations for the new nonlinear solver backend. 
4 | -------------------------------------------------------------------------------- /releasenotes/notes/test_transformer.py-7793a6957af9735a.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | issues: 3 | - | 4 | ResourceWarnings occur on some tests. Some tests run several times, but ResourceWarnings only occur on some of the runs. 5 | "/usr/lib/python3.11/concurrent/futures/thread.py:85: ResourceWarning: unclosed file <_io.BufferedRandom name=8> 6 | del work_item 7 | ResourceWarning: Enable tracemalloc to get the object allocation traceback" 8 | upgrade: 9 | - | 10 | Updated to test the nonlinear version of transformers.py 11 | 12 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 D-Wave Systems Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from setuptools import setup 16 | 17 | setup() 18 | -------------------------------------------------------------------------------- /dwave/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 D-Wave Systems Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import pkgutil 16 | __path__ = pkgutil.extend_path(__path__, __name__) 17 | -------------------------------------------------------------------------------- /dwave/plugins/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 D-Wave Systems Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import pkgutil 16 | __path__ = pkgutil.extend_path(__path__, __name__) 17 | -------------------------------------------------------------------------------- /dwave/plugins/sklearn/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 D-Wave Systems Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | __version__ = '0.2.0' 16 | 17 | from dwave.plugins.sklearn.transformers import * 18 | -------------------------------------------------------------------------------- /releasenotes/notes/correlation-8eaf77b85eeed9ba.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | features: 3 | - | 4 | Add ``dwave.plugins.sklearn.utilities.cov`` function. A drop-in replacement 5 | for ``numpy.cov`` that is modified to avoid unnecessary memory usage when 6 | working with ``numpy.memmap`` arrays. 7 | - | 8 | Add ``dwave.plugins.sklearn.utilities.corrcoef`` function. A drop-in replacement 9 | for ``numpy.corrcoef`` that is modified to avoid unnecessary memory usage when 10 | working with ``numpy.memmap`` arrays. 11 | - | 12 | Add ``dwave.plugins.sklearn.utilities.dot_2d`` function. A drop-in replacement 13 | for ``numpy.dot`` for 2d arrays that is modified to avoid unnecessary memory usage when 14 | working with ``numpy.memmap`` arrays. 15 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ 3 | "setuptools>=77.0.3", # PEP 639 4 | ] 5 | build-backend = "setuptools.build_meta" 6 | 7 | [project] 8 | name = "dwave-scikit-learn-plugin" 9 | dynamic = ["version"] 10 | authors = [ 11 | {name = "D-Wave Inc.", email = "tools@dwavesys.com"}, 12 | ] 13 | description = "A plugin to scikit-learn for quantum-classical hybrid solving." 
14 | license = "Apache-2.0" 15 | license-files = ["LICENSE"] 16 | classifiers = [ 17 | "Operating System :: OS Independent", 18 | "Programming Language :: Python", 19 | "Programming Language :: Python :: 3", 20 | ] 21 | requires-python = ">= 3.9" 22 | dependencies = [ 23 | "dimod>=0.12.20", 24 | "dwave-optimization>=0.6.2", 25 | "dwave-system>=1.32.0", 26 | "numpy>=2.0.2", 27 | "scikit-learn>=1.6.1", 28 | ] 29 | 30 | [project.readme] 31 | file = "README.md" 32 | content-type = "text/markdown" 33 | 34 | [project.urls] 35 | Homepage = "https://github.com/dwavesystems/dwave-scikit-learn-plugin" 36 | Download = "https://github.com/dwavesystems/dwave-scikit-learn-plugin/releases" 37 | 38 | [tool.setuptools.packages.find] 39 | include = ["dwave.*"] 40 | 41 | [tool.setuptools.dynamic] 42 | version = {attr = "dwave.plugins.sklearn.__version__"} 43 | 44 | [tool.coverage.run] 45 | omit = ["tests/*"] 46 | source = ["dwave/plugins/sklearn"] 47 | 48 | [tool.coverage.report] 49 | include_namespace_packages = true 50 | exclude_lines = [ 51 | "pragma: no cover", 52 | "raise NotImplementedError", 53 | "if __name__ == .__main__.:", 54 | ] 55 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # vscode 2 | .vscode/ 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | pip-wheel-metadata/ 27 | share/python-wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | MANIFEST 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .nox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | *.py,cover 54 | .hypothesis/ 55 | .pytest_cache/ 56 | 57 | # Translations 58 | *.mo 59 | *.pot 60 | 61 | # Django stuff: 62 | *.log 63 | local_settings.py 64 | db.sqlite3 65 | db.sqlite3-journal 66 | 67 | # Flask stuff: 68 | instance/ 69 | .webassets-cache 70 | 71 | # Scrapy stuff: 72 | .scrapy 73 | 74 | # Sphinx documentation 75 | docs/_build/ 76 | 77 | # PyBuilder 78 | target/ 79 | 80 | # Jupyter Notebook 81 | .ipynb_checkpoints 82 | 83 | # IPython 84 | profile_default/ 85 | ipython_config.py 86 | 87 | # pyenv 88 | .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 98 | __pypackages__/ 99 | 100 | # Celery stuff 101 | celerybeat-schedule 102 | celerybeat.pid 103 | 104 | # SageMath parsed files 105 | *.sage.py 106 | 107 | # Environments 108 | .env 109 | .venv 110 | env/ 111 | venv/ 112 | ENV/ 113 | env.bak/ 114 | venv.bak/ 115 | 116 | # Spyder project settings 117 | .spyderproject 118 | .spyproject 119 | 120 | # Rope project settings 121 | .ropeproject 122 | 123 | # mkdocs documentation 124 | /site 125 | 126 | # mypy 127 | .mypy_cache/ 128 | .dmypy.json 129 | dmypy.json 130 | 131 | # Pyre type checker 132 | .pyre/ 133 | -------------------------------------------------------------------------------- /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | version: 2.1 2 | 3 | orbs: 4 | ocean: dwave/ocean@1 5 | windows: circleci/windows@5.0 6 | 7 | environment: 8 | PIP_PROGRESS_BAR: 'off' 9 | 10 | jobs: 11 | test-linux: 12 | parameters: 13 | python-version: 14 | type: string 15 | 16 | docker: 17 | - image: python:<< parameters.python-version >> 18 | 19 | steps: 20 | - checkout 21 | 22 | - ocean/pip-install: 23 | requirements: tests/requirements.txt 24 | cache: true 25 | 26 | - ocean/pip-install: 27 | requirements: requirements.txt 28 | packages: . 29 | cache: false 30 | 31 | - ocean/coverage-run-unittest 32 | 33 | test-macos: 34 | parameters: 35 | python-version: 36 | type: string 37 | 38 | executor: 39 | name: ocean/macos 40 | xcode: "16.2.0" 41 | 42 | steps: 43 | - checkout 44 | 45 | - ocean/brew-install-pyenv: 46 | update-homebrew: true 47 | 48 | - ocean/pyenv-install-python: 49 | python-version: << parameters.python-version >> 50 | cache: true 51 | 52 | - ocean/pip-install: 53 | requirements: tests/requirements.txt 54 | cache: true 55 | 56 | - ocean/pip-install: 57 | requirements: requirements.txt 58 | packages: . 
59 | cache: false 60 | 61 | - ocean/coverage-run-unittest: 62 | upload-coverage: true 63 | 64 | test-windows: 65 | parameters: 66 | python-version: 67 | type: string 68 | 69 | executor: 70 | name: windows/default 71 | 72 | steps: 73 | - checkout 74 | 75 | - ocean/nuget-install-python: 76 | python-version: << parameters.python-version >> 77 | cache: true 78 | 79 | - ocean/pip-install: 80 | requirements: tests/requirements.txt 81 | cache: false 82 | 83 | - ocean/pip-install: 84 | requirements: requirements.txt 85 | packages: . 86 | cache: false 87 | 88 | - ocean/coverage-run-unittest 89 | 90 | deploy: 91 | docker: 92 | - image: python:3.12 93 | 94 | steps: 95 | - checkout 96 | 97 | - run: 98 | name: create virtualenv 99 | command: | 100 | python -m venv env 101 | 102 | - run: 103 | name: build sdist and bdist 104 | command: | 105 | . env/bin/activate 106 | pip install -U pip setuptools wheel 107 | python setup.py sdist 108 | python setup.py bdist_wheel 109 | 110 | - run: 111 | name: upload 112 | command: | 113 | . 
env/bin/activate 114 | pip install twine 115 | twine check dist/* 116 | twine upload -u "$PYPI_USERNAME" -p "$PYPI_PASSWORD" --skip-existing ./dist/* 117 | 118 | workflows: 119 | version: 2 120 | 121 | test: 122 | jobs: 123 | - test-linux: 124 | matrix: 125 | parameters: 126 | python-version: &python-versions ["3.9.21", "3.10.16", "3.11.11", "3.12.8", "3.13.1"] 127 | - test-macos: 128 | matrix: 129 | parameters: 130 | python-version: *python-versions 131 | - test-windows: 132 | matrix: 133 | parameters: 134 | # note: limit to versions available via nuget 135 | python-version: &python-versions-windows ["3.9.13", "3.10.11", "3.11.9", "3.12.8", "3.13.1"] 136 | 137 | deploy: 138 | jobs: 139 | - deploy: 140 | filters: &on-tag-push 141 | tags: 142 | only: /^[0-9]+(\.[0-9]+)*((\.dev|rc)([0-9]+)?)?$/ 143 | branches: 144 | ignore: /.*/ 145 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![PyPI](https://img.shields.io/pypi/v/dwave-scikit-learn-plugin.svg)](https://pypi.python.org/pypi/dwave-scikit-learn-plugin) 2 | [![CircleCI](https://dl.circleci.com/status-badge/img/gh/dwavesystems/dwave-scikit-learn-plugin/tree/main.svg?style=svg)](https://dl.circleci.com/status-badge/redirect/gh/dwavesystems/dwave-scikit-learn-plugin) 3 | 4 | # D-Wave `scikit-learn` Plugin 5 | 6 | This package provides a [scikit-learn](https://scikit-learn.org/) transformer for 7 | [feature selection](https://en.wikipedia.org/wiki/Feature_selection) using a 8 | quantum-classical [hybrid solver](https://docs.ocean.dwavesys.com/en/stable/concepts/hybrid.html). 9 | 10 | This plugin makes use of a Leap™ quantum-classical hybrid solver. Developers can get started by 11 | [signing up](https://cloud.dwavesys.com/leap/signup/) for the Leap quantum cloud service for free. 
12 | Those seeking a more collaborative approach and assistance with building a production application can 13 | reach out to D-Wave [directly](https://www.dwavesys.com/solutions-and-products/professional-services/) and also explore the feature selection [offering](https://aws.amazon.com/marketplace/pp/prodview-bsrc3yuwgjbo4) in AWS Marketplace. 14 | 15 | The package's main class, `SelectFromQuadraticModel`, can be used in any existing `sklearn` pipeline. 16 | For an introduction to hybrid methods for feature selection, see the [Feature Selection for CQM](https://github.com/dwave-examples/feature-selection-cqm). 17 | 18 | ## Examples 19 | 20 | ### Basic Usage 21 | 22 | A minimal example of using the plugin to select 20 of 30 features of an `sklearn` dataset: 23 | 24 | ```python 25 | >>> from sklearn.datasets import load_breast_cancer 26 | >>> from dwave.plugins.sklearn import SelectFromQuadraticModel 27 | ... 28 | >>> X, y = load_breast_cancer(return_X_y=True) 29 | >>> X.shape 30 | (569, 30) 31 | >>> # solver can also be equal to "cqm" 32 | >>> X_new = SelectFromQuadraticModel(num_features=20, solver="nl").fit_transform(X, y) 33 | >>> X_new.shape 34 | (569, 20) 35 | ``` 36 | 37 | For large problems, the default runtime may be insufficient. You can use the CQM solver's [`time_limit`](https://docs.dwavequantum.com/en/latest/industrial_optimization/solver_cqm_parameters.html#time-limit) or Nonlinear (NL) solver's 38 | [`time_limit`](https://docs.dwavequantum.com/en/latest/industrial_optimization/solver_nl_parameters.html#time-limit) 39 | method to find the minimum accepted runtime for your problem; alternatively, simply submit as above 40 | and check the returned error message for the required runtime. 41 | 42 | The feature selector can be re-instantiated with a longer time limit. 
43 | 44 | ```python 45 | >>> # solver can also be equal to "nl" 46 | >>> X_new = SelectFromQuadraticModel(num_features=20, time_limit=200, solver="cqm").fit_transform(X, y) 47 | ``` 48 | 49 | ### Tuning 50 | 51 | You can use `SelectFromQuadraticModel` with scikit-learn's 52 | [hyper-parameter optimizers](https://scikit-learn.org/stable/modules/classes.html#hyper-parameter-optimizers). 53 | 54 | For example, the number of features can be tuned using a grid search. **Please note that this will 55 | submit many problems to the hybrid solver.** 56 | 57 | ```python 58 | >>> import numpy as np 59 | ... 60 | >>> from sklearn.datasets import load_breast_cancer 61 | >>> from sklearn.ensemble import RandomForestClassifier 62 | >>> from sklearn.model_selection import GridSearchCV 63 | >>> from sklearn.pipeline import Pipeline 64 | >>> from dwave.plugins.sklearn import SelectFromQuadraticModel 65 | ... 66 | >>> X, y = load_breast_cancer(return_X_y=True) 67 | ... 68 | >>> num_features = X.shape[1] 69 | >>> searchspace = np.linspace(1, num_features, num=5, dtype=int, endpoint=True) 70 | ... 71 | >>> # solver can also be equal to "cqm" 72 | >>> pipe = Pipeline([ 73 | >>> ('feature_selection', SelectFromQuadraticModel(solver="nl")), 74 | >>> ('classification', RandomForestClassifier()) 75 | >>> ]) 76 | ... 77 | >>> clf = GridSearchCV(pipe, param_grid=dict(feature_selection__num_features=searchspace)) 78 | >>> search = clf.fit(X, y) 79 | >>> print(search.best_params_) 80 | {'feature_selection__num_features': 22} 81 | ``` 82 | 83 | ## Installation 84 | 85 | To install the core package: 86 | 87 | ```bash 88 | pip install dwave-scikit-learn-plugin 89 | ``` 90 | 91 | ## License 92 | 93 | Released under the Apache License 2.0 94 | 95 | ## Contributing 96 | 97 | Ocean's [contributing guide](https://docs.ocean.dwavesys.com/en/stable/contributing.html) 98 | has guidelines for contributing to Ocean packages. 
99 | 100 | ### Release Notes 101 | 102 | **dwave-scikit-learn-plugin** makes use of [reno](https://docs.openstack.org/reno/) to manage its 103 | release notes. 104 | 105 | When making a contribution to **dwave-scikit-learn-plugin** that will affect users, create a new 106 | release note file by running 107 | 108 | ```bash 109 | reno new your-short-descriptor-here 110 | ``` 111 | 112 | You can then edit the file created under ``releasenotes/notes/``. 113 | Remove any sections not relevant to your changes. 114 | Commit the file along with your changes. 115 | -------------------------------------------------------------------------------- /tests/test_utilities.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 D-Wave Systems Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # Some tests are adapted from NumPy under the following license. 16 | 17 | # Copyright (c) 2005-2022, NumPy Developers. 18 | # All rights reserved. 19 | # 20 | # Redistribution and use in source and binary forms, with or without 21 | # modification, are permitted provided that the following conditions are 22 | # met: 23 | # 24 | # * Redistributions of source code must retain the above copyright 25 | # notice, this list of conditions and the following disclaimer. 
26 | # 27 | # * Redistributions in binary form must reproduce the above 28 | # copyright notice, this list of conditions and the following 29 | # disclaimer in the documentation and/or other materials provided 30 | # with the distribution. 31 | # 32 | # * Neither the name of the NumPy Developers nor the names of any 33 | # contributors may be used to endorse or promote products derived 34 | # from this software without specific prior written permission. 35 | # 36 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 37 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 38 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 39 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 40 | # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 41 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 42 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 43 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 44 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 45 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 46 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 47 | 48 | import os.path 49 | import tempfile 50 | import unittest 51 | 52 | import numpy as np 53 | 54 | from dwave.plugins.sklearn.utilities import corrcoef, cov, dot_2d 55 | 56 | 57 | class TestCorrCoef(unittest.TestCase): 58 | def test_agreement(self): 59 | rng = np.random.default_rng(42) 60 | X = rng.uniform(size=(100, 100)) 61 | for rowvar in (True, False): 62 | with self.subTest(rowvar=rowvar): 63 | np.testing.assert_array_equal( 64 | corrcoef(X, rowvar=rowvar), np.corrcoef(X, rowvar=rowvar)) 65 | 66 | def test_memmap(self): 67 | # Smoketest for memmap. 
68 | # There isn't really a nice way to test memory usage but it's useful to 69 | # have this test present for manual testing 70 | rng = np.random.default_rng(42) 71 | 72 | size = (1_000, 100) 73 | # size = (25_000, 100_000) # The max size we want to support 74 | 75 | with tempfile.TemporaryFile() as fX: 76 | with tempfile.NamedTemporaryFile() as fout: 77 | X = np.memmap(fX, "float64", mode="w+", shape=size) 78 | X[:, :10] = rng.uniform(size=(X.shape[0], 10)) # so we don't get stddev = 0 79 | X[:, 10:] = 1 80 | out = np.memmap(fout, "float64", mode="w+", shape=(X.shape[0], X.shape[0])) 81 | 82 | corrcoef(X, rowvar=True, out=out, copy=False) 83 | 84 | # the following tests are adapted from NumPy 85 | 86 | def test_non_array(self): 87 | np.testing.assert_almost_equal( 88 | corrcoef([[0, 1, 0], [1, 0, 1]]), [[1., -1.], [-1., 1.]]) 89 | 90 | def test_simple(self): 91 | A = np.array( 92 | [[0.15391142, 0.18045767, 0.14197213], 93 | [0.70461506, 0.96474128, 0.27906989], 94 | [0.9297531, 0.32296769, 0.19267156]]) 95 | res1 = np.array( 96 | [[1., 0.9379533, -0.04931983], 97 | [0.9379533, 1., 0.30007991], 98 | [-0.04931983, 0.30007991, 1.]]) 99 | tgt1 = corrcoef(A) 100 | np.testing.assert_almost_equal(tgt1, res1) 101 | self.assertTrue(np.all(np.abs(tgt1) <= 1.0)) 102 | 103 | def test_complex(self): 104 | x = np.array([[1, 2, 3], [1j, 2j, 3j]]) 105 | res = corrcoef(x) 106 | tgt = np.array([[1., -1.j], [1.j, 1.]]) 107 | np.testing.assert_allclose(res, tgt) 108 | self.assertTrue(np.all(np.abs(res) <= 1.0)) 109 | 110 | 111 | class TestCov(unittest.TestCase): 112 | def test_agreement(self): 113 | rng = np.random.default_rng(42) 114 | X = rng.uniform(size=(10, 20)) 115 | for rowvar in (True, False): 116 | with self.subTest(rowvar=rowvar): 117 | np.testing.assert_array_equal(cov(X, rowvar=rowvar), np.cov(X, rowvar=rowvar)) 118 | 119 | def test_memmap(self): 120 | # Smoketest for memmap. 
121 | # There isn't really a nice way to test memory usage but it's useful to 122 | # have this test present for manual testing 123 | size = (1_000, 100) 124 | # size = (25_000, 100_000) # The max size we want to support 125 | 126 | with tempfile.TemporaryFile() as fX: 127 | with tempfile.NamedTemporaryFile() as fout: 128 | X = np.memmap(fX, "float64", mode="w+", shape=size) 129 | X[:] = 1 130 | out = np.memmap(fout, "float64", mode="w+", shape=(X.shape[0], X.shape[0])) 131 | 132 | cov(X, rowvar=True, out=out, copy=False) 133 | 134 | # the following tests are adapted from NumPy 135 | 136 | def test_basic(self): 137 | x1 = np.array([[0, 2], [1, 1], [2, 0]]).T 138 | res1 = np.array([[1., -1.], [-1., 1.]]) 139 | np.testing.assert_allclose(cov(x1), res1) 140 | 141 | def test_complex(self): 142 | x = np.array([[1, 2, 3], [1j, 2j, 3j]]) 143 | res = np.array([[1., -1.j], [1.j, 1.]]) 144 | np.testing.assert_allclose(cov(x), res) 145 | 146 | def test_1D_rowvar(self): 147 | x3 = np.array([0.3942, 0.5969, 0.7730, 0.9918, 0.7964]) 148 | np.testing.assert_allclose(cov(x3), cov(x3, rowvar=False)) 149 | 150 | 151 | class TestDot2D(unittest.TestCase): 152 | def test_agreement(self): 153 | rng = np.random.default_rng(42) 154 | X = rng.uniform(size=(10, 20)) 155 | Y = rng.uniform(size=(20, 100)) 156 | np.testing.assert_array_equal(dot_2d(X, Y), np.dot(X, Y)) 157 | 158 | def test_chunksize(self): 159 | # make sure that chunk sizes that don't align with the total number 160 | # of rows still work 161 | rng = np.random.default_rng(42) 162 | X = rng.uniform(size=(10, 20)) 163 | Y = rng.uniform(size=(20, 15)) 164 | np.testing.assert_array_almost_equal(dot_2d(X, Y, chunksize=86), np.dot(X, Y)) 165 | np.testing.assert_array_almost_equal(dot_2d(X, Y, chunksize=365), np.dot(X, Y)) 166 | 167 | def test_memmap(self): 168 | # Smoketest for memmap. 
169 | # There isn't really a nice way to test memory usage but it's useful to 170 | # have this test present for manual testing 171 | size = (1_000, 100) 172 | # size = (25_000, 100_000) # The max size we want to support 173 | 174 | with tempfile.TemporaryFile() as fX: 175 | with tempfile.NamedTemporaryFile() as fout: 176 | X = np.memmap(fX, "float64", mode="w+", shape=size) 177 | X[:] = 1 178 | out = np.memmap(fout, "float64", mode="w+", shape=(X.shape[0], X.shape[0])) 179 | 180 | dot_2d(X, X.T, out=out) 181 | -------------------------------------------------------------------------------- /dwave/plugins/sklearn/utilities.py: -------------------------------------------------------------------------------- 1 | # The following traversal code is adapted from NumPy's implementation. 2 | 3 | # Copyright (c) 2005-2022, NumPy Developers. 4 | # All rights reserved. 5 | # 6 | # Redistribution and use in source and binary forms, with or without 7 | # modification, are permitted provided that the following conditions are 8 | # met: 9 | # 10 | # * Redistributions of source code must retain the above copyright 11 | # notice, this list of conditions and the following disclaimer. 12 | # 13 | # * Redistributions in binary form must reproduce the above 14 | # copyright notice, this list of conditions and the following 15 | # disclaimer in the documentation and/or other materials provided 16 | # with the distribution. 17 | # 18 | # * Neither the name of the NumPy Developers nor the names of any 19 | # contributors may be used to endorse or promote products derived 20 | # from this software without specific prior written permission. 21 | # 22 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 23 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 24 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 25 | # A PARTICULAR PURPOSE ARE DISCLAIMED. 
# IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

# Modifications are licensed under the Apache 2.0 Software license.

# Copyright 2023 D-Wave Systems Inc.
#
#    Licensed under the Apache License, Version 2.0 (the "License");
#    you may not use this file except in compliance with the License.
#    You may obtain a copy of the License at
#
#        http://www.apache.org/licenses/LICENSE-2.0
#
#    Unless required by applicable law or agreed to in writing, software
#    distributed under the License is distributed on an "AS IS" BASIS,
#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#    See the License for the specific language governing permissions and
#    limitations under the License.

import typing

import numpy as np
import numpy.typing as npt

__all__ = ["corrcoef", "cov", "dot_2d"]


def corrcoef(x: npt.ArrayLike, *,
             out: typing.Optional[np.ndarray] = None,
             rowvar: bool = True,
             copy: bool = True,
             ) -> np.ndarray:
    """A drop-in replacement for :func:`numpy.corrcoef`.

    This method is modified to avoid unnecessary memory usage when working with
    :class:`numpy.memmap` arrays.
    It does not support the full range of arguments accepted by
    :func:`numpy.corrcoef`.

    Additionally, in the case that a row of ``x`` is fixed, this method
    will return a correlation value of 0 rather than :class:`numpy.nan`.

    Args:
        x: See :func:`numpy.corrcoef`.

        out: Output argument. This must be the exact kind that would be returned
            if it was not used.

        rowvar: See :func:`numpy.corrcoef`.

        copy: If ``True``, ``x`` is not modified by this function.

    Returns:
        See :func:`numpy.corrcoef`. The covariance matrix produced by
        :func:`cov` is normalized in place, so when ``out`` is given the
        returned array is ``out``.

    Raises:
        ValueError: Propagated from :func:`cov`, e.g. when ``copy=True`` is
            combined with a memmap input.

    """
    c = cov(x, out=out, rowvar=rowvar, copy=copy)
    try:
        d = np.diag(c)
    except ValueError:
        # scalar covariance
        # nan if incorrect value (nan, inf, 0), 1 otherwise
        return c / c
    stddev = np.sqrt(d.real)

    # the places that stddev == 0 are exactly the places that the columns
    # are fixed. We can safely ignore those when dividing; the skipped
    # entries keep their covariance value, which is 0 for a fixed row.
    np.divide(c, stddev[:, None], out=c, where=stddev[:, None] != 0)
    np.divide(c, stddev[None, :], out=c, where=stddev[None, :] != 0)

    # Clip real and imaginary parts to [-1, 1].  This does not guarantee
    # abs(a[i,j]) <= 1 for complex arrays, but is the best we can do without
    # excessive work.
    np.clip(c.real, -1, 1, out=c.real)
    if np.iscomplexobj(c):
        np.clip(c.imag, -1, 1, out=c.imag)

    return c


def cov(m: npt.ArrayLike, *,
        out: typing.Optional[np.ndarray] = None,
        rowvar: bool = True,
        copy: bool = True,
        ) -> np.ndarray:
    """A drop-in replacement for :func:`numpy.cov`.

    This method is modified to avoid unnecessary memory usage when working with
    :class:`numpy.memmap` arrays.
    It does not support the full range of arguments accepted by
    :func:`numpy.cov`.

    Args:
        m: See :func:`numpy.cov`.

        out: Output argument. This must be the exact kind that would be returned
            if it was not used.

        rowvar: See :func:`numpy.cov`.

        copy: If ``True``, ``m`` is not modified by this function. If
            ``False``, ``m`` may be centered (mean-subtracted) in place.

    Returns:
        See :func:`numpy.cov`.

    Raises:
        ValueError: If ``copy=True`` and ``m`` has a ``flush`` attribute
            (i.e. behaves like a memmap), or if the input does not have
            exactly 2 dimensions.

    """
    # we want to modify X, so if copy=True we make a copy and re-call
    if copy:
        if hasattr(m, "flush"):
            # we could do a lot of fiddling here, but it's easier to just
            # disallow this case and rely on the user making a modifiable
            # X
            raise ValueError("memmap arrays cannot be copied easily")

        return cov(np.array(m), rowvar=rowvar, copy=False, out=out)

    # handle array-like
    if isinstance(m, np.memmap):
        X = m
    else:
        X = np.atleast_2d(np.asarray(m, dtype=np.result_type(m, np.float64)))

    if X.ndim != 2:
        raise ValueError("X must have 2 dimensions")

    if not rowvar and X.shape[0] != 1:
        X = X.T

    # Get the product of frequencies and weights
    avg = np.average(X, axis=1)

    # Determine the normalization
    fact = max(X.shape[1] - 1, 0)

    # center every row in place
    X -= avg[:, None]

    if hasattr(m, "flush"):
        X.flush()

    X_T = X.T

    out = dot_2d(X, X_T.conj(), out=out)
    out *= np.true_divide(1, fact)

    if hasattr(out, "flush"):
        out.flush()

    return out


def dot_2d(a: npt.ArrayLike, b: npt.ArrayLike, *,
           out: typing.Optional[np.ndarray] = None,
           chunksize: int = int(1e+9),
           ) -> np.ndarray:
    """A drop-in replacement for :func:`numpy.dot` for 2d arrays.

    This method is modified to avoid unnecessary memory usage when working with
    :class:`numpy.memmap` arrays.

    Args:
        a: See :func:`numpy.dot`. ``a.ndim`` must be 2.
        b: See :func:`numpy.dot`. ``b.ndim`` must be 2.
        out: See :func:`numpy.dot`. Must have shape
            ``(a.shape[0], b.shape[1])``.
        chunksize: The number of bytes that should be created by each step
            of the multiplication. This is used to keep the total memory
            usage low when multiplying :class:`numpy.memmap` arrays.
            At least one row of the result is produced per step.

    Returns:
        See :func:`numpy.dot`.

    Raises:
        ValueError: If ``a`` or ``b`` is not 2d, if their inner dimensions
            do not match, or if ``out`` does not have the expected shape.

    """
    if not isinstance(a, np.memmap):
        a = np.asarray(a)
    if not isinstance(b, np.memmap):
        b = np.asarray(b)

    if a.ndim != 2:
        raise ValueError("a must be a 2d array")
    if b.ndim != 2:
        raise ValueError("b must be a 2d array")
    if a.shape[1] != b.shape[0]:
        raise ValueError(
            f"shapes {a.shape} and {b.shape} not aligned: "
            f"{a.shape[1]} (dim 1) != {b.shape[0]} (dim 0)"
        )

    expected_shape = (a.shape[0], b.shape[1])

    if out is None:
        out = np.empty(expected_shape, dtype=np.result_type(a, b))
    elif out.shape != expected_shape:
        # comparing whole shapes also rejects an ``out`` that is not 2d,
        # rather than failing with an IndexError on ``out.shape[1]``
        raise ValueError(f"out must be a ({a.shape[0]}, {b.shape[1]}) array")

    if out.size == 0:
        # Nothing to compute. Returning here also avoids dividing by a row
        # size of zero bytes below when the result has no columns.
        return out

    is_memmap = hasattr(out, "flush")

    # number of result rows whose combined size fits in ``chunksize`` bytes
    num_rows = max(chunksize // (out.dtype.itemsize * out.shape[1]), 1)
    for start in range(0, out.shape[0], num_rows):
        np.dot(a[start:start+num_rows, :], b, out=out[start:start+num_rows, :])

        if is_memmap:
            # push each finished chunk to disk before computing the next one
            out.flush()

    return out


# NOTE: utilities.py ends above. The remaining comment lines are the next
# entry of this repository listing (the start of /LICENSE); they are kept
# here, commented out, so that none of the listing's text is lost.
# --------------------------------------------------------------------------------
# /LICENSE:
# --------------------------------------------------------------------------------
#                                  Apache License
#                            Version 2.0, January 2004
#                         http://www.apache.org/licenses/
#
#    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
#
#    1. Definitions.
#
#       "License" shall mean the terms and conditions for use, reproduction,
#       and distribution as defined by Sections 1 through 9 of this document.
#
#       "Licensor" shall mean the copyright owner or entity authorized by
#       the copyright owner that is granting the License.
#
#       "Legal Entity" shall mean the union of the acting entity and all
#       other entities that control, are controlled by, or are under common
#       control with that entity.
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /tests/test_transformer.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 D-Wave Systems Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
import concurrent.futures
import unittest
import unittest.mock
import warnings
import tempfile
from parameterized import parameterized

import dimod
import numpy as np

from dwave.optimization import Model
from dwave.cloud.exceptions import ConfigFileError, SolverAuthenticationError
from dwave.system import LeapHybridNLSampler
from dwave.system import LeapHybridCQMSampler
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

from dwave.plugins.sklearn.transformers import SelectFromQuadraticModel
from dwave.plugins.sklearn.utilities import corrcoef

# State that ``MockNL.sample`` writes into every decision variable. Tests
# that exercise the "nl" solver rebind this module-level name (via
# ``global``) before fitting.
FEASIBLE_NL_SOLUTION = []


class MockCQM(dimod.ExactCQMSolver):
    """Stand-in for ``LeapHybridCQMSampler`` backed by dimod's exact solver.

    ``time_limit`` and ``label`` are accepted and ignored.
    """

    def sample_cqm(self, cqm: dimod.CQM, *, time_limit: float, label: str) -> dimod.SampleSet:
        return super().sample_cqm(cqm)

    def min_time_limit(self, cqm):
        return 1


class MockNL:
    """Stand-in for ``LeapHybridNLSampler`` that installs a canned state."""

    def sample(self, nl: Model, *, time_limit: float, label: str):
        # make room for exactly one state, then set state 0 of every
        # decision to the canned solution
        nl.states.resize(1)

        for decision in nl.iter_decisions():
            decision.set_state(0, FEASIBLE_NL_SOLUTION)

        # a fresh future; nothing in this mock ever sets its result
        return concurrent.futures.Future()


@unittest.mock.patch("dwave.plugins.sklearn.transformers.LeapHybridNLSampler", MockNL)
@unittest.mock.patch("dwave.plugins.sklearn.transformers.LeapHybridCQMSampler", MockCQM)
class TestSelectFromQuadraticModel(unittest.TestCase):
    """Tests for ``SelectFromQuadraticModel`` with both samplers mocked out."""

    @classmethod
    def setUpClass(cls):
        rng = np.random.default_rng(138984)
        cls.X = rng.uniform(-10, 10, size=(100, 9))
        cls.y = np.asarray(rng.uniform(0, 1, size=100) > 0.5, dtype=int)

    @parameterized.expand([
        (0.1, 30, "cqm"),
        (0.1, 15, "cqm"),
        (0.1, 30, "nl"),
        (0.1, 15, "nl"),
    ])
    def test_init_good(self, alpha, time_limit, solver):
        a = SelectFromQuadraticModel(solver=solver)

        b = SelectFromQuadraticModel(alpha=alpha, solver=solver)

        c = SelectFromQuadraticModel(alpha=alpha, time_limit=time_limit, solver=solver)

        d = SelectFromQuadraticModel(time_limit=time_limit, solver=solver)

        self.assertIsInstance(a, SelectFromQuadraticModel)
        self.assertIsInstance(b, SelectFromQuadraticModel)
        self.assertIsInstance(c, SelectFromQuadraticModel)
        self.assertIsInstance(d, SelectFromQuadraticModel)

        self.assertEqual(a.alpha, 0.5)
        self.assertEqual(b.alpha, 0.1)
        self.assertEqual(c.alpha, 0.1)
        self.assertEqual(d.alpha, 0.5)

        self.assertIsNone(a.time_limit)
        self.assertIsNone(b.time_limit)
        self.assertEqual(c.time_limit, time_limit)
        self.assertEqual(d.time_limit, time_limit)

        self.assertIsInstance(
            SelectFromQuadraticModel(alpha=0), SelectFromQuadraticModel
        )

    @parameterized.expand([
        (-10, "cqm"),
        (10, "cqm"),
        (-10, "nl"),
        (10, "nl"),
    ])
    def test_init_bad(self, alpha, solver):
        # the out-of-range values are supplied by the parameter list above,
        # so a single check per case is enough
        self.assertRaises(ValueError, SelectFromQuadraticModel, alpha=alpha, solver=solver)

    @parameterized.expand([
        (7, "cqm"),
        (5, "cqm"),
        (7, "nl"),
        (5, "nl"),
    ])
    def test_fit(self, num_features, solver):
        global FEASIBLE_NL_SOLUTION
        # canned NL states with exactly ``num_features`` ones
        if num_features == 7:
            FEASIBLE_NL_SOLUTION = [1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0]
        elif num_features == 5:
            FEASIBLE_NL_SOLUTION = [1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0]

        selector = SelectFromQuadraticModel(num_features=num_features, solver=solver)

        # test default numpy

        selector.fit(self.X, self.y)
        self.assertEqual(sum(selector._mask), num_features)

        try:
            self.X[:, selector._mask]
        except Exception as e:
            self.fail(e)

        # test non-default numpy

        selector.fit(self.X, self.y, num_features=num_features, solver=solver)
        self.assertEqual(sum(selector._mask), num_features)

        try:
            self.X[:, selector._mask]
        except Exception as e:
            self.fail(e)

    @parameterized.expand([
        (7, "cqm"),
        (7, "nl"),
    ])
    def test_fit_transform(self, num_features, solver):
        global FEASIBLE_NL_SOLUTION
        # five ones: matches the ``num_features - 2`` override passed below
        FEASIBLE_NL_SOLUTION = [1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0]

        selector = SelectFromQuadraticModel(num_features=num_features, solver=solver)

        # test numpy without fit
        x = selector.fit_transform(self.X, self.y, num_features=num_features-2, solver=solver)

        self.assertEqual(x.shape[1], num_features-2)

        x_from_fit = self.X[:, selector._mask]
        np.testing.assert_array_equal(x, x_from_fit)

    @parameterized.expand([
        (2, "cqm"),
        (2, "nl"),
    ])
    def test_pipeline(self, num_features, solver):
        global FEASIBLE_NL_SOLUTION
        FEASIBLE_NL_SOLUTION = [1.0, 1.0, 0.0, 0.0]

        X, y = load_iris(return_X_y=True)

        clf = Pipeline([
            ('feature_selection', SelectFromQuadraticModel(num_features=num_features, solver=solver)),
            ('classification', RandomForestClassifier())
        ])
        clf.fit(X, y)

        clf.predict(X)

    def test_alpha_0(self):
        cqm = SelectFromQuadraticModel.correlation(self.X, self.y, num_features=3, alpha=0, solver="cqm")
        self.assertTrue(not any(cqm.objective.linear.values()))

        X = np.atleast_2d(np.asarray(self.X))
        y = np.asarray(self.y)

        with tempfile.TemporaryFile() as fX, tempfile.TemporaryFile() as fout:
            # we make a copy of X because we'll be modifying it in-place within
            # some of the functions
            X_copy = np.memmap(fX, X.dtype, mode="w+", shape=(X.shape[0], X.shape[1] + 1))
            X_copy[:, :-1] = X
            X_copy[:, -1] = y

            # make the matrix that will hold the correlations
            correlations = np.memmap(
                fout,
                dtype=np.result_type(X, y),
                mode="w+",
                shape=(X_copy.shape[1], X_copy.shape[1]),
            )

            # main calculation. It modifies X_copy in-place
            corrcoef(X_copy, out=correlations, rowvar=False, copy=False)

            # we don't care about the direction of correlation in terms of
            # the penalty/quality
            np.absolute(correlations, out=correlations)

            label_corr = np.array(correlations[:-1, -1])
            expected_linear = np.zeros(X.shape[1])
            expected_linear += (-1.0 * label_corr * 0 * 3)
            # compare the whole array with 0; reducing it with ``.all()``
            # first would only test a single boolean
            self.assertTrue(np.allclose(expected_linear, 0))

    @parameterized.expand([
        (3, 1, "cqm"),
        (3, 1, "nl"),
    ])
    def test_alpha_1(self, num_features, alpha, solver):
        global FEASIBLE_NL_SOLUTION
        FEASIBLE_NL_SOLUTION = [1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]

        rng = np.random.default_rng(42)

        y = rng.uniform(size=1000)

        # make the first three columns exactly match the test data
        X = rng.uniform(size=(1000, 10))
        X[:, 0] = X[:, 1] = X[:, 2] = y

        selector = SelectFromQuadraticModel(num_features=num_features, alpha=alpha, solver=solver).fit(X, y)

        # with alpha=1, we should see that only the quality matters, so the
        # first three should be selected despite being perfectly correlated
        self.assertTrue(selector._get_support_mask()[0:3].all())
        self.assertFalse(selector._get_support_mask()[3:].any())

    @parameterized.expand([
        (1, "cqm"),
        (1, "nl"),
    ])
    def test_xy_shape(self, num_features, solver):
        with self.assertRaises(ValueError):
            SelectFromQuadraticModel(num_features=num_features, solver=solver).fit([[0, 1]], [1, 2])

    def test_repr(self):
        repr(SelectFromQuadraticModel(solver="cqm"))
        repr(SelectFromQuadraticModel(solver="nl"))

    @parameterized.expand([
        (2, "cqm"),
        (2, "nl"),
    ])
    def test_gridsearch(self, num_features, solver):
        global FEASIBLE_NL_SOLUTION
        FEASIBLE_NL_SOLUTION = [1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0]
        rng = np.random.default_rng(42)
        X = rng.uniform(-10, 10, size=(100, 9))
        y = np.asarray(rng.uniform(0, 1, size=100) > 0.5, dtype=int)

        pipe = Pipeline([
            ('feature_selection', SelectFromQuadraticModel(num_features=num_features, solver=solver)),
            ('classification', RandomForestClassifier())
        ])

        clf = GridSearchCV(pipe,
                           param_grid=dict(
                               feature_selection__num_features=[num_features+1],
                               feature_selection__alpha=[0, .5]))
        clf.fit(X, y)

    @parameterized.expand([
        (2, "cqm"),
        (2, "nl"),
    ])
    def test_one_row(self, num_features, solver):
        X = [[-7.85717866, 1.93442648, 8.85760003]]
        y = [1]

        with self.assertRaises(ValueError):
            SelectFromQuadraticModel(num_features=num_features, solver=solver).fit(X, y)

    def test_fixed_column(self):
        global FEASIBLE_NL_SOLUTION
        FEASIBLE_NL_SOLUTION = [0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0]
        X = np.copy(self.X)

        # fix two of the columns
        X[:, 1] = 0
        X[:, 5] = 1

        cqm = SelectFromQuadraticModel.correlation(X, self.y, alpha=0.5, num_features=5, solver="cqm")
        fitted = SelectFromQuadraticModel(alpha=0.5, num_features=5, solver="cqm").fit(X, self.y)

        # in this case the linear bias for those two columns should be 0
        self.assertEqual(cqm.objective.linear[1], 0)
        self.assertEqual(cqm.objective.linear[5], 0)

        # as should the quadratic biases
        self.assertEqual(cqm.objective.degree(1), 0)
        self.assertEqual(cqm.objective.degree(5), 0)

        selected = SelectFromQuadraticModel(alpha=0.5, num_features=5, solver="nl").fit(X, self.y)

        # Check that the variables corresponding to constant columns are not present
        # NOTE(review): this compares two scalars (``mask.all()`` of each
        # selector), not the masks element-wise, so it does not inspect
        # columns 1 and 5 individually. Left unchanged because the "nl" mask
        # comes from the canned FEASIBLE_NL_SOLUTION above -- confirm the
        # intended check before tightening it.
        self.assertEqual(selected._mask.all(), fitted._mask.all())


class TestIntegration(unittest.TestCase):
    """End-to-end test; skipped when the hybrid samplers cannot be constructed."""

    @classmethod
    def setUpClass(cls):
        try:
            LeapHybridNLSampler()
            LeapHybridCQMSampler()
        except (ConfigFileError, SolverAuthenticationError, ValueError):
            raise unittest.SkipTest("no hybrid solver available")

    def test_pipeline(self):
        X, y = load_iris(return_X_y=True)

        clf = Pipeline([
            ('feature_selection', SelectFromQuadraticModel(num_features=2)),
            ('classification', RandomForestClassifier())
        ])
        clf.fit(X, y)

        clf.predict(X)


# NOTE: tests/test_transformer.py ends above. The commented separator below
# is the repository listing's boundary marker for the next file, followed by
# that file's own license header; both are kept so no listing text is lost.
# --------------------------------------------------------------------------------
# /dwave/plugins/sklearn/transformers.py:
# --------------------------------------------------------------------------------
# Copyright 2023 D-Wave Systems Inc.
#
#    Licensed under the Apache License, Version 2.0 (the "License");
#    you may not use this file except in compliance with the License.
#    You may obtain a copy of the License at
#
#        http://www.apache.org/licenses/LICENSE-2.0
#
#    Unless required by applicable law or agreed to in writing, software
#    distributed under the License is distributed on an "AS IS" BASIS,
#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#    See the License for the specific language governing permissions and
#    limitations under the License.
from __future__ import annotations

import itertools
import logging
import tempfile
import typing
import warnings

import dimod
import numpy as np
import numpy.typing as npt

from dwave.cloud.exceptions import ConfigFileError, SolverAuthenticationError
from dwave.system import LeapHybridCQMSampler, LeapHybridNLSampler
from dwave.optimization import Model

from sklearn.base import BaseEstimator
from sklearn.feature_selection import SelectorMixin
from sklearn.utils.validation import check_is_fitted

from dwave.plugins.sklearn.utilities import corrcoef

__all__ = ["SelectFromQuadraticModel"]


class SelectFromQuadraticModel(SelectorMixin, BaseEstimator):
    """Select features using a quadratic optimization problem solved on a hybrid solver.

    Args:
        alpha:
            Hyperparameter between 0 and 1 that controls the relative weight of
            the relevance and redundancy terms.
            ``alpha=0`` places no weight on the quality of the features,
            therefore the features will be selected as to minimize the
            redundancy without any consideration to quality.
            ``alpha=1`` places the maximum weight on the quality of the features,
            and therefore will be equivalent to using
            :class:`sklearn.feature_selection.SelectKBest`.
        num_features:
            The number of features to select.
        time_limit:
            The time limit for the run on the hybrid solver.
        solver:
            Identifier of the solver backend, ``"cqm"`` by default.
            ``"nl"`` is presumably the nonlinear-model alternative
            (TODO confirm against ``fit``). The value is stored without
            validation by the constructor.
    """

    # Feature-scoring methods accepted by the constructor's ``method`` argument.
    ACCEPTED_METHODS = [
        "correlation",
        # "mutual information",  # todo
        ]

    def __init__(
        self,
        *,
        alpha: float = .5,
        method: str = "correlation",  # undocumented until there is another supported
        num_features: int = 10,
        time_limit: typing.Optional[float] = None,
        solver: str = "cqm",
    ):
        """Validate and store the hyperparameters.

        Raises:
            ValueError: If ``alpha`` is outside ``[0, 1]``, ``method`` is not
                in :attr:`ACCEPTED_METHODS`, or ``num_features`` is not
                positive.
        """
        if not 0 <= alpha <= 1:
            raise ValueError(f"alpha must be between 0 and 1, given {alpha}")

        if method not in self.ACCEPTED_METHODS:
            raise ValueError(
                f"method must be one of {self.ACCEPTED_METHODS}, given {method}"
            )

        if num_features <= 0:
            raise ValueError(f"num_features must be a positive integer, given {num_features}")

        self.alpha = alpha
        self.method = method
        self.num_features = num_features
        self.time_limit = time_limit  # check this lazily
        self.solver = solver

    def __sklearn_is_fitted__(self) -> bool:
        """Report whether the selector has been fitted, i.e. ``_mask`` exists."""
        # used by `check_is_fitted()`
        try:
            self._mask
        except AttributeError:
            return False

        return True

    def _get_support_mask(self) -> np.ndarray[typing.Any, np.dtype[np.bool_]]:
        """Get the boolean mask indicating which features are selected

        Returns:
          boolean array of shape [# input features]. An element is True iff its
          corresponding feature is selected for retention.

        Raises:
          RuntimeError: If ``_mask`` is missing when it is read. Note that
            ``check_is_fitted`` runs first, so an unfitted selector is
            normally reported by that call instead (scikit-learn documents
            it as raising ``NotFittedError``).
        """
        check_is_fitted(self)

        try:
            return self._mask
        except AttributeError:
            raise RuntimeError("fit hasn't been run yet")

    @staticmethod
    def _create_cqm_model(
        correlations: np.memmap,
        X: npt.ArrayLike,
        alpha: float,
        num_features: int,
        strict: bool,
    ) -> dimod.ConstrainedQuadraticModel:
        """Build a constrained quadratic model (CQM) for feature selection.

        This method is based on maximizing influence and feature independence as
        measured by correlation [Milne et al.]_.

        Args:
            correlations:
                Correlation matrix of features. Its diagonal is overwritten
                in place by this method. The last row/column is excluded
                from the objective; it presumably holds the correlations
                with the label (TODO confirm against the caller).
            X:
                Feature vectors formatted as a numerical 2D array-like.
                Only ``X.shape[1]`` is read here.
            alpha:
                Hyperparameter between 0 and 1 that controls the relative weight of
                the relevance and redundancy terms.
                ``alpha=0`` places no weight on the quality of the features,
                therefore the features will be selected as to minimize the
                redundancy without any consideration to quality.
                ``alpha=1`` places the maximum weight on the quality of the features,
                and therefore will be equivalent to using
                :class:`sklearn.feature_selection.SelectKBest`.
            num_features:
                The number of features to select.
            strict:
                If ``False`` the constraint on the number of selected features
                is ``<=`` rather than ``==``.

        Returns:
            A constrained quadratic model (CQM)
        """

        # our objective
        # we multiply by 2 because the matrix is symmetric
        np.fill_diagonal(correlations, correlations[:, -1] * (-2 * alpha * num_features))

        # one binary variable per column of X
        cqm = dimod.ConstrainedQuadraticModel()
        cqm.add_variables(dimod.BINARY, X.shape[1])

        # add the k-hot constraint
        cqm.add_constraint(
            ((v, 1) for v in cqm.variables),
            '==' if strict else '<=',
            num_features,
            label=f"{num_features}-hot",
        )

        # Note: the full symmetric matrix (with both upper- and lower-diagonal
        # entries for each correlation coefficient) is retained for consistency with
        # the original formulation from Milne et al.
        # Zero entries are skipped when the objective terms are generated.
        it = np.nditer(correlations[:-1, :-1], flags=['multi_index'], op_flags=[['readonly']])
        cqm.set_objective((*it.multi_index, x) for x in it if x)

        return cqm

    @staticmethod
    def _create_nl_model(
        correlations: np.memmap,
        X: npt.ArrayLike,
        alpha: float,
        num_features: int,
        strict: bool,
    ) -> Model:
        """Build a nonlinear (NL) model for feature selection.

        This method is based on maximizing influence and feature independence as
        measured by correlation [Milne et al.]_.

        Args:
            correlations:
                Correlation matrix of features
            X:
                Feature vectors formatted as a numerical 2D array-like.
            alpha:
                Hyperparameter between 0 and 1 that controls the relative weight of
                the relevance and redundancy terms.
                ``alpha=0`` places no weight on the quality of the features,
                therefore the features will be selected as to minimize the
                redundancy without any consideration to quality.
                ``alpha=1`` places the maximum weight on the quality of the features,
                and therefore will be equivalent to using
                :class:`sklearn.feature_selection.SelectKBest`.
            num_features:
                The number of features to select.
            strict:
                If ``False`` the constraint on the number of selected features
                is ``<=`` rather than ``==``.

        Returns:
            A nonlinear model, the binary list, and a ndarray
        """

        # initialize model, create binary list, make constant
        nl = Model()
        total_num_features=X.shape[1]

        x_binary = nl.binary(total_num_features)
        var_features = nl.constant(num_features)
        feat_corr = correlations[:-1,:-1]

        # take last element in every row
        label_corr = np.array(correlations[:-1,-1])

        # Make a constant node in order to splice and use in objective
        nl_corr = nl.constant(feat_corr)

        # extract upper triangle, excluding diagonal.
Flatten into 1D array 228 | C = np.triu(nl_corr, k=1).flatten() 229 | 230 | # generate all column and row indices 231 | quad_col = np.tile(np.arange(total_num_features), total_num_features) 232 | quad_row = np.tile(np.arange(total_num_features), 233 | (total_num_features,1)).flatten('F') 234 | 235 | # extract indices where correlation value not equal to zero 236 | 237 | # j index 238 | q2 = quad_col[C != 0] 239 | # i index 240 | q1 = quad_row[C != 0] 241 | 242 | # extract values at position (i,j) where not equal to zero 243 | q3 = C[C != 0] 244 | 245 | # 1D numpy array initialized to size of num_rows with 0 in every position 246 | linear = np.zeros(len(feat_corr[0])) 247 | 248 | # numpy will automatically go element-by-element in the arrays 249 | linear += nl.constant(-1.0 * label_corr * alpha * num_features) 250 | 251 | # if must choose exact number of desired features 252 | if strict: 253 | nl.add_constraint(x_binary.sum() == var_features) 254 | else: 255 | nl.add_constraint(x_binary.sum() <= var_features) 256 | 257 | nl.minimize(nl.constant(2.0) * nl.quadratic_model(x_binary, quadratic=(q3, [q1, q2]), linear=linear)) 258 | return nl 259 | 260 | @typing.overload 261 | def correlation(X: npt.ArrayLike, y: npt.ArrayLike, *, solver: Literal["cqm"], **kwargs) -> dimod.ConstrainedQuadraticModel: ... 262 | 263 | @typing.overload 264 | def correlation(X: npt.ArrayLike, y: npt.ArrayLike, *, solver: Literal["nl"], **kwargs) -> Model: ... 265 | 266 | @staticmethod 267 | def correlation( 268 | X: npt.ArrayLike, 269 | y: npt.ArrayLike, 270 | *, 271 | alpha: float, 272 | num_features: int, 273 | strict: bool = True, 274 | solver: str, 275 | ) -> Union[dimod.ConstrainedQuadraticModel, Model]: 276 | """Build a model for feature selection. 277 | 278 | This method is based on maximizing influence and feature independence as 279 | measured by correlation [Milne et al.]_. 280 | 281 | Args: 282 | X: 283 | Feature vectors formatted as a numerical 2D array-like. 
284 | y: 285 | Class labels formatted as a numerical 1D array-like. 286 | alpha: 287 | Hyperparameter between 0 and 1 that controls the relative weight of 288 | the relevance and redundancy terms. 289 | ``alpha=0`` places no weight on the quality of the features, 290 | therefore the features will be selected as to minimize the 291 | redundancy without any consideration to quality. 292 | ``alpha=1`` places the maximum weight on the quality of the features, 293 | and therefore will be equivalent to using 294 | :class:`sklearn.feature_selection.SelectKBest`. 295 | num_features: 296 | The number of features to select. 297 | strict: 298 | If ``False`` the constraint on the number of selected features 299 | is ``<=`` rather than ``==``. 300 | solver: 301 | String containing either "cqm" or "nl" to decide which solver creation method to use. Defaults to "cqm" 302 | 303 | Returns: 304 | A constrained quadratic model or a nonlinear model. 305 | 306 | .. [Milne et al.] Milne, Andrew, Maxwell Rounds, and Phil Goddard. 2017. "Optimal Feature 307 | Selection in Credit Scoring and Classification Using a Quantum Annealer." 308 | 1QBit; White Paper. 
309 | https://1qbit.com/whitepaper/optimal-feature-selection-in-credit-scoring-classification-using-quantum-annealer 310 | """ 311 | 312 | X = np.atleast_2d(np.asarray(X)) 313 | y = np.asarray(y) 314 | 315 | if X.ndim != 2: 316 | raise ValueError("X must be a 2-dimensional array-like") 317 | 318 | if y.ndim != 1: 319 | raise ValueError("y must be a 1-dimensional array-like") 320 | 321 | if y.shape[0] != X.shape[0]: 322 | raise ValueError(f"requires: X.shape[0] == y.shape[0] but {X.shape[0]} != {y.shape[0]}") 323 | 324 | if not 0 <= alpha <= 1: 325 | raise ValueError(f"alpha must be between 0 and 1, given {alpha}") 326 | 327 | if num_features <= 0: 328 | raise ValueError(f"num_features must be a positive integer, given {num_features}") 329 | 330 | if X.shape[0] <= 1: 331 | raise ValueError("X must have at least two rows") 332 | 333 | with tempfile.TemporaryFile() as fX, tempfile.TemporaryFile() as fout: 334 | # we make a copy of X because we'll be modifying it in-place within 335 | # some of the functions 336 | X_copy = np.memmap(fX, X.dtype, mode="w+", shape=(X.shape[0], X.shape[1] + 1)) 337 | X_copy[:, :-1] = X 338 | X_copy[:, -1] = y 339 | 340 | # make the matrix that will hold the correlations 341 | correlations = np.memmap( 342 | fout, 343 | dtype=np.result_type(X, y), 344 | mode="w+", 345 | shape=(X_copy.shape[1], X_copy.shape[1]), 346 | ) 347 | 348 | # main calculation. 
It modifies X_copy in-place 349 | corrcoef(X_copy, out=correlations, rowvar=False, copy=False) 350 | 351 | # we don't care about the direction of correlation in terms of 352 | # the penalty/quality 353 | np.absolute(correlations, out=correlations) 354 | 355 | if (solver == "cqm"): 356 | return SelectFromQuadraticModel._create_cqm_model(correlations=correlations, X=X, alpha=alpha, num_features=num_features, strict=strict) 357 | elif (solver == "nl"): 358 | return SelectFromQuadraticModel._create_nl_model(correlations=correlations, X=X, alpha=alpha, num_features=num_features, strict=strict) 359 | raise ValueError(f"Solver parameter must be equal to 'nl' or 'cqm'. Received solver parameter: {solver}") 360 | 361 | @staticmethod 362 | def correlation_cqm(X: npt.ArrayLike, 363 | y: npt.ArrayLike, 364 | *, 365 | alpha: float, 366 | num_features: int, 367 | strict: bool = True, 368 | ) -> dimod.ConstrainedQuadraticModel: 369 | return SelectFromQuadraticModel.correlation(X=X, y=y, num_features=num_features, alpha=alpha, solver="cqm") 370 | 371 | @staticmethod 372 | def correlation_nl(X: npt.ArrayLike, 373 | y: npt.ArrayLike, 374 | *, 375 | alpha: float, 376 | num_features: int, 377 | strict: bool = True, 378 | ) -> dimod.ConstrainedQuadraticModel: 379 | return SelectFromQuadraticModel.correlation(X=X, y=y, num_features=num_features, alpha=alpha, solver="nl") 380 | 381 | def fit( 382 | self, 383 | X: npt.ArrayLike, 384 | y: npt.ArrayLike, 385 | *, 386 | alpha: typing.Optional[float] = None, 387 | num_features: typing.Optional[int] = None, 388 | time_limit: typing.Optional[float] = None, 389 | solver: typing.Optional[str] = None, 390 | ) -> SelectFromQuadraticModel: 391 | """Select the features to keep. 392 | 393 | Args: 394 | X: 395 | Feature vectors formatted as a numerical 2D array-like. 396 | y: 397 | Class labels formatted as a numerical 1D array-like. 
398 | alpha: 399 | Hyperparameter between 0 and 1 that controls the relative weight of 400 | the relevance and redundancy terms. 401 | ``alpha=0`` places no weight on the quality of the features, 402 | therefore the features will be selected as to minimize the 403 | redundancy without any consideration to quality. 404 | ``alpha=1`` places the maximum weight on the quality of the features, 405 | and therefore will be equivalent to using 406 | :class:`sklearn.feature_selection.SelectKBest`. 407 | num_features: 408 | The number of features to select. 409 | Defaults to the value provided to the constructor. 410 | time_limit: 411 | The time limit for the run on the hybrid solver. 412 | Defaults to the value provided to the constructor. 413 | 414 | Returns: 415 | This instance of `SelectFromQuadraticModel`. 416 | """ 417 | X = np.atleast_2d(np.asarray(X)) 418 | if X.ndim != 2: 419 | raise ValueError("X must be a 2-dimensional array-like") 420 | 421 | # y is checked by the correlation method function 422 | 423 | if alpha is None: 424 | alpha = self.alpha 425 | # alpha is checked by the correlation method function 426 | 427 | if num_features is None: 428 | num_features = self.num_features 429 | # num_features is checked by the correlation method function 430 | 431 | if solver is None: 432 | solver = self.solver 433 | 434 | # if we already have fewer features than requested, just return 435 | if num_features >= X.shape[1]: 436 | self._mask = np.ones(X.shape[1], dtype=bool) 437 | return self 438 | 439 | if self.method == "correlation": 440 | model = self.correlation(X, y, num_features=num_features, alpha=alpha, solver=solver) 441 | 442 | else: 443 | raise ValueError(f"only methods {self.acceptable_methods} are implemented") 444 | 445 | try: 446 | if solver == "cqm": 447 | sampler = LeapHybridCQMSampler() 448 | sampleset = sampler.sample_cqm(model, time_limit=self.time_limit, 449 | label=f"{self.__module__}.{type(self).__qualname__}") 450 | 451 | filtered = 
sampleset.filter(lambda d: d.is_feasible) 452 | 453 | if len(filtered) == 0: 454 | raise RuntimeError("no feasible solutions found by the hybrid solver") 455 | 456 | lowest = filtered.first.sample 457 | 458 | self._mask = np.fromiter((lowest[v] for v in model.variables), 459 | count=model.num_variables(), dtype=bool) 460 | 461 | elif solver == "nl": 462 | sampler = LeapHybridNLSampler() 463 | 464 | # time_limit is checked by the LeapHybridNLSampler 465 | _ = sampler.sample(model, time_limit=self.time_limit, label='scikit-learn Plug-In: NL') 466 | 467 | # Get the index position of chosen features 468 | # Example Given (e.g.) of 6 features to choose 3 469 | with model.lock(): 470 | selected = next(model.iter_decisions()).state(0) 471 | 472 | self._mask = np.asarray(selected, dtype=bool) # e.g. [False, True, False, False, True, True, False] 473 | 474 | except (ConfigFileError, SolverAuthenticationError) as e: 475 | raise RuntimeError( 476 | f"""Instantiation of a Leap hybrid solver failed with an {e} error. 477 | 478 | See https://docs.ocean.dwavesys.com/en/stable/overview/sapi.html for configuring 479 | access to Leap’s solvers. 480 | """ 481 | ) 482 | 483 | return self 484 | 485 | def unfit(self): 486 | """Undo a previously executed ``fit`` method.""" 487 | del self._mask 488 | --------------------------------------------------------------------------------