├── docs
├── documentation
│ ├── ahr.rst
│ ├── ann.rst
│ ├── hr.rst
│ ├── sklearn.rst
│ ├── auto_examples
│ │ ├── plot_lle_digits.py.md5
│ │ ├── plot_regression.py.md5
│ │ ├── plot_classification.py.md5
│ │ ├── plot_nearest_centroid.py.md5
│ │ ├── plot_nca_classification.py.md5
│ │ ├── plot_nca_dim_reduction.py.md5
│ │ ├── plot_multioutput_face_completion.py.md5
│ │ ├── auto_examples_jupyter.zip
│ │ ├── auto_examples_python.zip
│ │ ├── images
│ │ │ ├── sphx_glr_plot_lle_digits_001.png
│ │ │ ├── sphx_glr_plot_lle_digits_002.png
│ │ │ ├── sphx_glr_plot_lle_digits_003.png
│ │ │ ├── sphx_glr_plot_lle_digits_004.png
│ │ │ ├── sphx_glr_plot_lle_digits_005.png
│ │ │ ├── sphx_glr_plot_lle_digits_006.png
│ │ │ ├── sphx_glr_plot_lle_digits_007.png
│ │ │ ├── sphx_glr_plot_lle_digits_008.png
│ │ │ ├── sphx_glr_plot_lle_digits_009.png
│ │ │ ├── sphx_glr_plot_lle_digits_010.png
│ │ │ ├── sphx_glr_plot_lle_digits_011.png
│ │ │ ├── sphx_glr_plot_lle_digits_012.png
│ │ │ ├── sphx_glr_plot_lle_digits_013.png
│ │ │ ├── sphx_glr_plot_lle_digits_014.png
│ │ │ ├── sphx_glr_plot_lle_digits_015.png
│ │ │ ├── sphx_glr_plot_lle_digits_016.png
│ │ │ ├── sphx_glr_plot_regression_001.png
│ │ │ ├── sphx_glr_plot_classification_001.png
│ │ │ ├── sphx_glr_plot_classification_002.png
│ │ │ ├── sphx_glr_plot_nearest_centroid_001.png
│ │ │ ├── sphx_glr_plot_nearest_centroid_002.png
│ │ │ ├── sphx_glr_plot_nca_classification_001.png
│ │ │ ├── sphx_glr_plot_nca_classification_002.png
│ │ │ ├── sphx_glr_plot_nca_classification_003.png
│ │ │ ├── sphx_glr_plot_nca_classification_004.png
│ │ │ ├── sphx_glr_plot_nca_classification_005.png
│ │ │ ├── sphx_glr_plot_nca_classification_006.png
│ │ │ ├── sphx_glr_plot_nca_dim_reduction_001.png
│ │ │ ├── sphx_glr_plot_nca_dim_reduction_002.png
│ │ │ ├── sphx_glr_plot_nca_dim_reduction_003.png
│ │ │ ├── thumb
│ │ │ │ ├── sphx_glr_plot_lle_digits_thumb.png
│ │ │ │ ├── sphx_glr_plot_regression_thumb.png
│ │ │ │ ├── sphx_glr_plot_classification_thumb.png
│ │ │ │ ├── sphx_glr_plot_nearest_centroid_thumb.png
│ │ │ │ ├── sphx_glr_plot_nca_classification_thumb.png
│ │ │ │ ├── sphx_glr_plot_nca_dim_reduction_thumb.png
│ │ │ │ └── sphx_glr_plot_multioutput_face_completion_thumb.png
│ │ │ └── sphx_glr_plot_multioutput_face_completion_001.png
│ │ ├── sg_execution_times.rst
│ │ ├── plot_regression.py
│ │ ├── plot_classification.py
│ │ ├── plot_nearest_centroid.py
│ │ ├── plot_regression.ipynb
│ │ ├── plot_classification.ipynb
│ │ ├── plot_nearest_centroid.ipynb
│ │ ├── plot_multioutput_face_completion.py
│ │ ├── plot_regression.rst
│ │ ├── plot_classification.rst
│ │ ├── plot_nca_dim_reduction.py
│ │ ├── plot_multioutput_face_completion.ipynb
│ │ ├── plot_nearest_centroid.rst
│ │ └── plot_nca_dim_reduction.ipynb
│ ├── auto_examples_hr
│ │ ├── auto_examples_hr_python.zip
│ │ ├── auto_examples_hr_jupyter.zip
│ │ ├── images
│ │ │ └── thumb
│ │ │ │ ├── sphx_glr_pipelines_thumb.png
│ │ │ │ └── sphx_glr_olivetti_faces_thumb.png
│ │ ├── index.rst
│ │ ├── pipelines.py
│ │ ├── olivetti_faces.py
│ │ ├── pipelines.ipynb
│ │ ├── pipelines.rst
│ │ ├── olivetti_faces.ipynb
│ │ └── olivetti_faces.rst
│ ├── auto_examples_ahr
│ │ ├── auto_examples_ahr_jupyter.zip
│ │ ├── auto_examples_ahr_python.zip
│ │ ├── images
│ │ │ └── thumb
│ │ │ │ ├── sphx_glr_reusing_index_thumb.png
│ │ │ │ └── sphx_glr_high_dim_gaussian_thumb.png
│ │ ├── reusing_index.py
│ │ ├── high_dim_gaussian.py
│ │ ├── index.rst
│ │ ├── reusing_index.ipynb
│ │ ├── high_dim_gaussian.ipynb
│ │ ├── reusing_index.rst
│ │ └── high_dim_gaussian.rst
│ ├── auto_examples_ann
│ │ ├── auto_examples_ann_jupyter.zip
│ │ ├── auto_examples_ann_python.zip
│ │ ├── images
│ │ │ └── thumb
│ │ │ │ └── sphx_glr_word_embeddings_thumb.png
│ │ ├── index.rst
│ │ └── word_embeddings.py
│ ├── examples.rst
│ ├── user_guide.rst
│ ├── history.rst
│ ├── nearestneighbors.rst
│ ├── documentation.rst
│ └── reduction.rst
├── Makefile
├── make.bat
├── getting_started
│ ├── installation.rst
│ └── example.rst
├── github_link.py
├── index.rst
└── changelog.md
├── skhubness
├── utils
│ ├── __init__.py
│ ├── tests
│ │ ├── __init__.py
│ │ └── test_io.py
│ ├── check.py
│ ├── multiprocessing.py
│ └── io.py
├── data
│ ├── tests
│ │ ├── __init__.py
│ │ └── test_load_datasets.py
│ ├── __init__.py
│ ├── dexter
│ │ ├── ABOUT
│ │ └── dexter_train.labels
│ └── load_dataset.py
├── analysis
│ ├── tests
│ │ └── __init__.py
│ └── __init__.py
├── neighbors
│ ├── tests
│ │ ├── __init__.py
│ │ └── test_neighbors.py
│ ├── __init__.py
│ └── approximate_neighbors.py
├── reduction
│ ├── tests
│ │ ├── __init__.py
│ │ ├── test_local_scaling.py
│ │ └── test_hubness_reduction.py
│ ├── _base.py
│ └── __init__.py
└── __init__.py
├── .flake8
├── paper
└── arxiv
│ └── scikit-hubness_arxiv_v1.pdf
├── MANIFEST.in
├── requirements.txt
├── requirements-win.txt
├── examples
├── approximate_neighbors
│ ├── README.rst
│ └── word_embeddings.py
├── approximate_hub_red
│ ├── README.rst
│ ├── reusing_index.py
│ └── high_dim_gaussian.py
├── hubness_reduction
│ ├── README.rst
│ ├── pipelines.py
│ └── olivetti_faces.py
└── sklearn
│ ├── README.rst
│ ├── plot_regression.py
│ ├── plot_classification.py
│ ├── plot_nearest_centroid.py
│ ├── plot_multioutput_face_completion.py
│ └── plot_nca_dim_reduction.py
├── requirements-rtd.txt
├── .coveragerc
├── .readthedocs.yml
├── pyproject.toml
├── WARRANTY.txt
├── scripts
├── install-puffinn.sh
└── install-ngt.sh
├── LICENSE.txt
├── .github
└── workflows
│ └── scikit-hubness_ci.yml
├── .gitignore
├── setup.cfg
└── CODE_OF_CONDUCT.md
/docs/documentation/ahr.rst:
--------------------------------------------------------------------------------
1 | .. include:: auto_examples_ahr/index.rst
2 |
--------------------------------------------------------------------------------
/docs/documentation/ann.rst:
--------------------------------------------------------------------------------
1 | .. include:: auto_examples_ann/index.rst
2 |
--------------------------------------------------------------------------------
/docs/documentation/hr.rst:
--------------------------------------------------------------------------------
1 | .. include:: auto_examples_hr/index.rst
2 |
--------------------------------------------------------------------------------
/docs/documentation/sklearn.rst:
--------------------------------------------------------------------------------
1 | .. include:: auto_examples/index.rst
2 |
--------------------------------------------------------------------------------
/skhubness/utils/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: BSD-3-Clause
2 |
--------------------------------------------------------------------------------
/skhubness/data/tests/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: BSD-3-Clause
2 |
--------------------------------------------------------------------------------
/skhubness/utils/tests/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: BSD-3-Clause
2 |
--------------------------------------------------------------------------------
/skhubness/analysis/tests/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: BSD-3-Clause
2 |
--------------------------------------------------------------------------------
/skhubness/neighbors/tests/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: BSD-3-Clause
2 |
--------------------------------------------------------------------------------
/skhubness/reduction/tests/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: BSD-3-Clause
2 |
--------------------------------------------------------------------------------
/docs/documentation/auto_examples/plot_lle_digits.py.md5:
--------------------------------------------------------------------------------
1 | af9f3e15361795b55a753e531924945c
--------------------------------------------------------------------------------
/docs/documentation/auto_examples/plot_regression.py.md5:
--------------------------------------------------------------------------------
1 | bb057885f6f41ce374c6001535beee34
--------------------------------------------------------------------------------
/.flake8:
--------------------------------------------------------------------------------
1 | [flake8]
2 | max-line-length = 120
3 | exclude = *puffinn/include/external/ffht*
4 |
--------------------------------------------------------------------------------
/docs/documentation/auto_examples/plot_classification.py.md5:
--------------------------------------------------------------------------------
1 | 828dcf172d84ae7101cc889cf5c7c9f8
--------------------------------------------------------------------------------
/docs/documentation/auto_examples/plot_nearest_centroid.py.md5:
--------------------------------------------------------------------------------
1 | f82e0922b095569b290ff698edf738ae
--------------------------------------------------------------------------------
/docs/documentation/auto_examples/plot_nca_classification.py.md5:
--------------------------------------------------------------------------------
1 | 92a38f10df6d7ae167988498b8907ef5
--------------------------------------------------------------------------------
/docs/documentation/auto_examples/plot_nca_dim_reduction.py.md5:
--------------------------------------------------------------------------------
1 | f825086405653531d6cb420f49988d4c
--------------------------------------------------------------------------------
/docs/documentation/auto_examples/plot_multioutput_face_completion.py.md5:
--------------------------------------------------------------------------------
1 | dfd8de51f7a147dc438a3fbf7fcd091a
--------------------------------------------------------------------------------
/paper/arxiv/scikit-hubness_arxiv_v1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/paper/arxiv/scikit-hubness_arxiv_v1.pdf
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include *.txt
2 | include *.md
3 | recursive-include docs *
4 | recursive-include skhubness *
5 | include skhubness/data/dexter/*
6 |
--------------------------------------------------------------------------------
/docs/documentation/auto_examples/auto_examples_jupyter.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples/auto_examples_jupyter.zip
--------------------------------------------------------------------------------
/docs/documentation/auto_examples/auto_examples_python.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples/auto_examples_python.zip
--------------------------------------------------------------------------------
/docs/documentation/auto_examples_hr/auto_examples_hr_python.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples_hr/auto_examples_hr_python.zip
--------------------------------------------------------------------------------
/docs/documentation/auto_examples_ahr/auto_examples_ahr_jupyter.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples_ahr/auto_examples_ahr_jupyter.zip
--------------------------------------------------------------------------------
/docs/documentation/auto_examples_ahr/auto_examples_ahr_python.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples_ahr/auto_examples_ahr_python.zip
--------------------------------------------------------------------------------
/docs/documentation/auto_examples_ann/auto_examples_ann_jupyter.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples_ann/auto_examples_ann_jupyter.zip
--------------------------------------------------------------------------------
/docs/documentation/auto_examples_ann/auto_examples_ann_python.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples_ann/auto_examples_ann_python.zip
--------------------------------------------------------------------------------
/docs/documentation/auto_examples_hr/auto_examples_hr_jupyter.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples_hr/auto_examples_hr_jupyter.zip
--------------------------------------------------------------------------------
/docs/documentation/auto_examples/images/sphx_glr_plot_lle_digits_001.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples/images/sphx_glr_plot_lle_digits_001.png
--------------------------------------------------------------------------------
/docs/documentation/auto_examples/images/sphx_glr_plot_lle_digits_002.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples/images/sphx_glr_plot_lle_digits_002.png
--------------------------------------------------------------------------------
/docs/documentation/auto_examples/images/sphx_glr_plot_lle_digits_003.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples/images/sphx_glr_plot_lle_digits_003.png
--------------------------------------------------------------------------------
/docs/documentation/auto_examples/images/sphx_glr_plot_lle_digits_004.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples/images/sphx_glr_plot_lle_digits_004.png
--------------------------------------------------------------------------------
/docs/documentation/auto_examples/images/sphx_glr_plot_lle_digits_005.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples/images/sphx_glr_plot_lle_digits_005.png
--------------------------------------------------------------------------------
/docs/documentation/auto_examples/images/sphx_glr_plot_lle_digits_006.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples/images/sphx_glr_plot_lle_digits_006.png
--------------------------------------------------------------------------------
/docs/documentation/auto_examples/images/sphx_glr_plot_lle_digits_007.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples/images/sphx_glr_plot_lle_digits_007.png
--------------------------------------------------------------------------------
/docs/documentation/auto_examples/images/sphx_glr_plot_lle_digits_008.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples/images/sphx_glr_plot_lle_digits_008.png
--------------------------------------------------------------------------------
/docs/documentation/auto_examples/images/sphx_glr_plot_lle_digits_009.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples/images/sphx_glr_plot_lle_digits_009.png
--------------------------------------------------------------------------------
/docs/documentation/auto_examples/images/sphx_glr_plot_lle_digits_010.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples/images/sphx_glr_plot_lle_digits_010.png
--------------------------------------------------------------------------------
/docs/documentation/auto_examples/images/sphx_glr_plot_lle_digits_011.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples/images/sphx_glr_plot_lle_digits_011.png
--------------------------------------------------------------------------------
/docs/documentation/auto_examples/images/sphx_glr_plot_lle_digits_012.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples/images/sphx_glr_plot_lle_digits_012.png
--------------------------------------------------------------------------------
/docs/documentation/auto_examples/images/sphx_glr_plot_lle_digits_013.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples/images/sphx_glr_plot_lle_digits_013.png
--------------------------------------------------------------------------------
/docs/documentation/auto_examples/images/sphx_glr_plot_lle_digits_014.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples/images/sphx_glr_plot_lle_digits_014.png
--------------------------------------------------------------------------------
/docs/documentation/auto_examples/images/sphx_glr_plot_lle_digits_015.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples/images/sphx_glr_plot_lle_digits_015.png
--------------------------------------------------------------------------------
/docs/documentation/auto_examples/images/sphx_glr_plot_lle_digits_016.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples/images/sphx_glr_plot_lle_digits_016.png
--------------------------------------------------------------------------------
/docs/documentation/auto_examples/images/sphx_glr_plot_regression_001.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples/images/sphx_glr_plot_regression_001.png
--------------------------------------------------------------------------------
/docs/documentation/auto_examples/images/sphx_glr_plot_classification_001.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples/images/sphx_glr_plot_classification_001.png
--------------------------------------------------------------------------------
/docs/documentation/auto_examples/images/sphx_glr_plot_classification_002.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples/images/sphx_glr_plot_classification_002.png
--------------------------------------------------------------------------------
/docs/documentation/auto_examples/images/sphx_glr_plot_nearest_centroid_001.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples/images/sphx_glr_plot_nearest_centroid_001.png
--------------------------------------------------------------------------------
/docs/documentation/auto_examples/images/sphx_glr_plot_nearest_centroid_002.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples/images/sphx_glr_plot_nearest_centroid_002.png
--------------------------------------------------------------------------------
/docs/documentation/auto_examples_hr/images/thumb/sphx_glr_pipelines_thumb.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples_hr/images/thumb/sphx_glr_pipelines_thumb.png
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy
2 | scipy>=1.2
3 | scikit-learn
4 | pandas
5 | joblib>=0.12
6 | tqdm
7 | numba
8 | annoy
9 | nmslib
10 | ngt>=1.8
11 | pytest
12 | pytest-cov
13 | codecov
14 | nose
15 | flake8
16 |
--------------------------------------------------------------------------------
/docs/documentation/auto_examples/images/sphx_glr_plot_nca_classification_001.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples/images/sphx_glr_plot_nca_classification_001.png
--------------------------------------------------------------------------------
/docs/documentation/auto_examples/images/sphx_glr_plot_nca_classification_002.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples/images/sphx_glr_plot_nca_classification_002.png
--------------------------------------------------------------------------------
/docs/documentation/auto_examples/images/sphx_glr_plot_nca_classification_003.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples/images/sphx_glr_plot_nca_classification_003.png
--------------------------------------------------------------------------------
/docs/documentation/auto_examples/images/sphx_glr_plot_nca_classification_004.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples/images/sphx_glr_plot_nca_classification_004.png
--------------------------------------------------------------------------------
/docs/documentation/auto_examples/images/sphx_glr_plot_nca_classification_005.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples/images/sphx_glr_plot_nca_classification_005.png
--------------------------------------------------------------------------------
/docs/documentation/auto_examples/images/sphx_glr_plot_nca_classification_006.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples/images/sphx_glr_plot_nca_classification_006.png
--------------------------------------------------------------------------------
/docs/documentation/auto_examples/images/sphx_glr_plot_nca_dim_reduction_001.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples/images/sphx_glr_plot_nca_dim_reduction_001.png
--------------------------------------------------------------------------------
/docs/documentation/auto_examples/images/sphx_glr_plot_nca_dim_reduction_002.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples/images/sphx_glr_plot_nca_dim_reduction_002.png
--------------------------------------------------------------------------------
/docs/documentation/auto_examples/images/sphx_glr_plot_nca_dim_reduction_003.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples/images/sphx_glr_plot_nca_dim_reduction_003.png
--------------------------------------------------------------------------------
/docs/documentation/auto_examples/images/thumb/sphx_glr_plot_lle_digits_thumb.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples/images/thumb/sphx_glr_plot_lle_digits_thumb.png
--------------------------------------------------------------------------------
/docs/documentation/auto_examples/images/thumb/sphx_glr_plot_regression_thumb.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples/images/thumb/sphx_glr_plot_regression_thumb.png
--------------------------------------------------------------------------------
/docs/documentation/auto_examples_ahr/images/thumb/sphx_glr_reusing_index_thumb.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples_ahr/images/thumb/sphx_glr_reusing_index_thumb.png
--------------------------------------------------------------------------------
/docs/documentation/auto_examples_hr/images/thumb/sphx_glr_olivetti_faces_thumb.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples_hr/images/thumb/sphx_glr_olivetti_faces_thumb.png
--------------------------------------------------------------------------------
/docs/documentation/auto_examples/images/thumb/sphx_glr_plot_classification_thumb.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples/images/thumb/sphx_glr_plot_classification_thumb.png
--------------------------------------------------------------------------------
/docs/documentation/auto_examples/images/thumb/sphx_glr_plot_nearest_centroid_thumb.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples/images/thumb/sphx_glr_plot_nearest_centroid_thumb.png
--------------------------------------------------------------------------------
/docs/documentation/auto_examples_ahr/images/thumb/sphx_glr_high_dim_gaussian_thumb.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples_ahr/images/thumb/sphx_glr_high_dim_gaussian_thumb.png
--------------------------------------------------------------------------------
/docs/documentation/auto_examples_ann/images/thumb/sphx_glr_word_embeddings_thumb.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples_ann/images/thumb/sphx_glr_word_embeddings_thumb.png
--------------------------------------------------------------------------------
/docs/documentation/auto_examples/images/thumb/sphx_glr_plot_nca_classification_thumb.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples/images/thumb/sphx_glr_plot_nca_classification_thumb.png
--------------------------------------------------------------------------------
/docs/documentation/auto_examples/images/thumb/sphx_glr_plot_nca_dim_reduction_thumb.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples/images/thumb/sphx_glr_plot_nca_dim_reduction_thumb.png
--------------------------------------------------------------------------------
/requirements-win.txt:
--------------------------------------------------------------------------------
1 | numpy
2 | scipy>=1.2
3 | scikit-learn
4 | pandas
5 | joblib>=0.12
6 | tqdm
7 | nmslib
8 | annoy
9 | # ngt # DOES NOT support Windows
10 | pytest
11 | pytest-cov
12 | codecov
13 | nose
14 | flake8
15 |
--------------------------------------------------------------------------------
/skhubness/data/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: BSD-3-Clause
2 | """
3 | The :mod:`skhubness.data` package provides example data sets.
4 | """
5 | from .load_dataset import load_dexter
6 |
7 | __all__ = ['load_dexter']
8 |
--------------------------------------------------------------------------------
/docs/documentation/auto_examples/images/sphx_glr_plot_multioutput_face_completion_001.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples/images/sphx_glr_plot_multioutput_face_completion_001.png
--------------------------------------------------------------------------------
/docs/documentation/auto_examples/images/thumb/sphx_glr_plot_multioutput_face_completion_thumb.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples/images/thumb/sphx_glr_plot_multioutput_face_completion_thumb.png
--------------------------------------------------------------------------------
/examples/approximate_neighbors/README.rst:
--------------------------------------------------------------------------------
1 | ============================================
2 | Example: Approximate nearest neighbor search
3 | ============================================
4 |
5 | This example shows how to perform approximate nearest neighbor search.
6 |
--------------------------------------------------------------------------------
/examples/approximate_hub_red/README.rst:
--------------------------------------------------------------------------------
1 | ========================================
2 | Example: Approximate hubness reduction
3 | ========================================
4 |
5 | These examples show how to combine approximate nearest neighbor search and hubness reduction.
6 |
--------------------------------------------------------------------------------
/examples/hubness_reduction/README.rst:
--------------------------------------------------------------------------------
1 | ============================================
2 | Example: Hubness reduction
3 | ============================================
4 |
5 | These examples show how to perform hubness reduction in kNN classification
6 | in (nested) cross-validation and pipelines.
7 |
--------------------------------------------------------------------------------
/skhubness/data/dexter/ABOUT:
--------------------------------------------------------------------------------
1 | DEXTER is a text classification problem in a bag-of-word representation. This
2 | is a two-class classification problem with sparse continuous input variables.
3 | This dataset is one of five datasets of the NIPS 2003 feature selection
4 | challenge.
5 |
6 | http://archive.ics.uci.edu/ml/datasets/Dexter
7 |
--------------------------------------------------------------------------------
/skhubness/analysis/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # SPDX-License-Identifier: BSD-3-Clause
3 |
4 | """
5 | The :mod:`skhubness.analysis` package provides methods for measuring hubness.
6 | """
7 | from .estimation import Hubness, VALID_HUBNESS_MEASURES
8 |
9 | __all__ = [
10 | "Hubness",
11 | "VALID_HUBNESS_MEASURES",
12 | ]
13 |
--------------------------------------------------------------------------------
/requirements-rtd.txt:
--------------------------------------------------------------------------------
1 | numpy
2 | scipy>=1.2
3 | scikit-learn
4 | pandas
5 | joblib>=0.12
6 | tqdm
7 | pytest
8 | pytest-cov
9 | codecov
10 | nose
11 | flake8
12 | git+https://github.com/readthedocs/readthedocs-sphinx-search@master # TODO update to PyPI when it becomes available
13 | sphinx>=2.1
14 | sphinx-automodapi
15 | sphinx-gallery
16 | sphinx-pdj-theme
17 | mock
18 | graphviz
19 | numpydoc
20 |
--------------------------------------------------------------------------------
/docs/documentation/examples.rst:
--------------------------------------------------------------------------------
1 | ==========
2 | Examples
3 | ==========
4 |
5 | In this section, we provide usage examples for ``skhubness``.
6 |
7 | .. toctree::
8 | :maxdepth: 2
9 | :caption: Contents:
10 |
11 | Example: Hubness reduction
12 | Example: Approximate nearest neighbor search
13 | Example: Approximate hubness reduction
14 | Example: From sklearn to skhubness
15 |
--------------------------------------------------------------------------------
/skhubness/data/tests/test_load_datasets.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: BSD-3-Clause
2 |
3 | from skhubness.data import load_dexter
4 |
5 |
def test_load_dexter():
    # The dexter data set ships with 300 samples and 20 000 features (see ABOUT file).
    expected_shape = (300, 20_000)
    X, y = load_dexter()
    assert X.shape == expected_shape, f'Wrong shape: X.shape = {X.shape}, should be (300, 20_000).'
    assert y.shape == (expected_shape[0], ), f'Wrong shape: y.shape = {y.shape}, should be (300, ).'
12 |
--------------------------------------------------------------------------------
/.coveragerc:
--------------------------------------------------------------------------------
1 | [run]
2 | omit =
3 | setup.py
4 | branch = True
5 | parallel = True
6 | concurrency = multiprocessing
7 |
8 | [report]
9 | exclude_lines =
10 | pragma: no cover
11 | def __repr__
12 | raise AssertionError
13 | raise NotImplementedError
14 | raise ValueError
15 | raise TypeError
16 | warnings.warn
17 | only on win32
18 | sys.platform == 'win32'
19 | except ImportError
20 | ModuleNotFoundError
21 | if __name__ == .__main__.:
--------------------------------------------------------------------------------
/docs/documentation/user_guide.rst:
--------------------------------------------------------------------------------
1 | ==========
2 | User guide
3 | ==========
4 |
5 | Welcome to ``scikit-hubness``!
6 | Here we describe the core functionality of the package
7 | (hubness analysis, hubness reduction, neighbor search),
8 | and provide several usage examples.
9 |
10 |
11 | .. toctree::
12 | :maxdepth: 2
13 | :caption: Contents:
14 |
15 | Core concepts
16 | Hubness analysis
17 | Hubness reduction
18 | Nearest neighbors
19 | Examples
20 |
--------------------------------------------------------------------------------
/skhubness/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # SPDX-License-Identifier: BSD-3-Clause
3 |
4 | """ Python package for nearest neighbor retrieval in high-dimensional space."""
5 |
6 | __version__ = '0.30.0a1'
7 |
8 | from . import analysis
9 | from . import data
10 | from .analysis.estimation import Hubness
11 | from . import neighbors
12 | from . import reduction
13 | from . import utils
14 |
15 |
16 | __all__ = [
17 | "analysis",
18 | "data",
19 | "neighbors",
20 | "reduction",
21 | "utils",
22 | "Hubness",
23 | ]
24 |
--------------------------------------------------------------------------------
/skhubness/utils/check.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # SPDX-License-Identifier: BSD-3-Clause
3 | # Author: Roman Feldbauer
4 | import numpy as np
5 |
6 | __all__ = [
7 | "check_n_candidates",
8 | ]
9 |
10 |
def check_n_candidates(n_candidates):
    """ Validate the number of candidate neighbors.

    Parameters
    ----------
    n_candidates : int
        Requested number of candidate neighbors; must be a positive integer.

    Returns
    -------
    n_candidates : int
        The validated value, unchanged.

    Raises
    ------
    TypeError
        If ``n_candidates`` is not of an integer type.
    ValueError
        If ``n_candidates`` is not positive.
    """
    # The type check must come first: for a non-integer value (e.g. a float),
    # formatting with ':d' in the ValueError message below would itself raise
    # a confusing "Unknown format code" ValueError.
    if not np.issubdtype(type(n_candidates), np.integer):
        raise TypeError(f"n_neighbors does not take {type(n_candidates)} value, enter integer value")
    if n_candidates <= 0:
        raise ValueError(f"Expected n_neighbors > 0. Got {n_candidates:d}")
    return n_candidates
18 |
--------------------------------------------------------------------------------
/skhubness/reduction/_base.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # SPDX-License-Identifier: BSD-3-Clause
3 |
4 | from abc import ABC, abstractmethod
5 |
6 | from sklearn.base import BaseEstimator
7 |
8 |
class HubnessReduction(BaseEstimator, ABC):
    """ Base class for hubness reduction in a sparse neighbors graph.

    Concrete subclasses implement :meth:`fit` and :meth:`transform`.
    Inheriting from scikit-learn's ``BaseEstimator`` gives subclasses the
    standard parameter handling (``get_params``/``set_params``).
    """
    @abstractmethod
    def __init__(self, **kwargs):
        # TODO whether to include/exclude self distances, or let the user decide...
        pass

    @abstractmethod
    def fit(self, X, y=None, **kwargs):
        """ Learn hubness-reduction parameters.

        NOTE(review): per the class docstring, X is presumably a sparse
        k-neighbors graph — confirm against concrete subclasses.
        """
        pass

    @abstractmethod
    def transform(self, X, y=None, **kwargs):
        """ Apply hubness reduction.

        NOTE(review): presumably returns a hubness-reduced neighbors graph —
        confirm against concrete subclasses.
        """
        pass
23 |
--------------------------------------------------------------------------------
/skhubness/utils/tests/test_io.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # SPDX-License-Identifier: BSD-3-Clause
3 | # Author: Roman Feldbauer
4 | import os
5 | import platform
6 | import pytest
7 | from skhubness.utils.io import create_tempfile_preferably_in_dir
8 |
9 |
@pytest.mark.parametrize('directory', [None, '/does/not/exist/kluawev'])
@pytest.mark.parametrize('persistent', [True, False])
def test_tempfile(directory, persistent):
    # Must always return the temp file's path as a string,
    # even when the preferred directory does not exist.
    path = create_tempfile_preferably_in_dir(directory=directory, persistent=persistent)
    assert isinstance(path, str)
    on_windows = platform.system() == 'Windows'
    if persistent and not on_windows:  # locked by running process on Windows
        os.remove(path)
17 |
--------------------------------------------------------------------------------
/.readthedocs.yml:
--------------------------------------------------------------------------------
1 | # .readthedocs.yml
2 | # Read the Docs configuration file
3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
4 |
5 | # Required
6 | version: 2
7 |
8 | # Build documentation in the docs/ directory with Sphinx
9 | sphinx:
10 | configuration: docs/conf.py
11 |
12 | # Build documentation with MkDocs
13 | #mkdocs:
14 | # configuration: mkdocs.yml
15 |
16 | # Optionally build your docs in additional formats such as PDF and ePub
17 | formats: all
18 |
19 | # Optionally set the version of Python and requirements required to build your docs
20 | python:
21 | version: 3.7
22 | install:
23 | - requirements: requirements-rtd.txt
24 | system_packages: false
25 |
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line, and also
5 | # from the environment for the first two.
6 | SPHINXOPTS ?=
7 | SPHINXBUILD ?= sphinx-build
8 | SOURCEDIR = .
9 | BUILDDIR = _build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 |
--------------------------------------------------------------------------------
/skhubness/reduction/__init__.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
# SPDX-License-Identifier: BSD-3-Clause

"""
The :mod:`skhubness.reduction` package provides methods for hubness reduction.
"""

from ._mutual_proximity import MutualProximity
from ._local_scaling import LocalScaling
from ._dis_sim import DisSimLocal

#: Supported hubness reduction algorithms (short identifiers)
hubness_algorithms = [
    "mp",
    "ls",
    "dsl",
]
#: Long names of the supported hubness reduction algorithms,
#: parallel to :data:`hubness_algorithms`
hubness_algorithms_long = [
    "mutual_proximity",
    "local_scaling",
    "dis_sim_local",
]


__all__ = [
    "LocalScaling",
    "MutualProximity",
    "DisSimLocal",
    "hubness_algorithms",
    # Exported for consistency with ``hubness_algorithms``,
    # which it parallels element-by-element.
    "hubness_algorithms_long",
]
31 |
--------------------------------------------------------------------------------
/examples/sklearn/README.rst:
--------------------------------------------------------------------------------
1 | ================================================
2 | scikit-learn examples adapted for scikit-hubness
3 | ================================================
4 |
5 | Examples concerning using :mod:`skhubness.neighbors`
6 | as drop-in replacement for :mod:`sklearn.neighbors`.
7 |
8 | These examples are taken from scikit-learn and demonstrate the ease of transition
9 | from ``sklearn.neighbors`` to ``skhubness.neighbors``.
10 | You will find that many examples require no more than modifying an import line,
11 | and/or adding one argument when instantiating an estimator.
12 |
13 | Note, that these examples are not intended to demonstrate improved learning performance
14 | due to hubness reduction (the data are rather low-dimensional).
15 |
--------------------------------------------------------------------------------
/skhubness/neighbors/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # SPDX-License-Identifier: BSD-3-Clause
3 | """
4 | The :mod:`skhubness.neighbors` package provides wrappers for various
5 | approximate nearest neighbor packages. These are compatible with the
6 | scikit-learn `KNeighborsTransformer`.
7 | """
8 | from ._annoy import AnnoyTransformer, LegacyRandomProjectionTree
9 | from ._nmslib import NMSlibTransformer, LegacyHNSW
10 | from ._puffinn import PuffinnTransformer, LegacyPuffinn
11 | from ._ngt import NGTTransformer, LegacyNNG
12 |
13 |
14 | __all__ = [
15 | "AnnoyTransformer",
16 | "LegacyHNSW",
17 | "LegacyNNG",
18 | "LegacyPuffinn",
19 | "LegacyRandomProjectionTree",
20 | "NGTTransformer",
21 | "NMSlibTransformer",
22 | "PuffinnTransformer",
23 | ]
24 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools", "wheel", "pybind11"]
3 |
4 | [tool.black]
5 | line-length = 88
6 | target_version = ['py38', 'py39', 'py310']
7 | experimental_string_processing = true
8 | exclude = """
9 | /(
10 | \\.eggs # exclude a few common directories in the
11 | | \\.git # root of the project
12 | | \\.mypy_cache
13 | | \\.vscode
14 | | examples
15 | | build
16 | | dist
17 | | doc/tutorial
18 | | doc/_build
19 | | doc/auto_examples
20 | )/
21 | """
22 |
23 | [tool.coverage.run]
24 | omit = ["setup.py", ]
25 | branch = true
26 | parallel = true
27 | concurrency = ["multiprocessing", ]
28 |
29 | [tool.coverage.report]
30 | exclude_lines = [
31 | "pragma: no cover",
32 | "def __repr__",
33 | "raise AssertionError",
34 | "only on win32",
35 | "sys.platform == 'win32'",
36 | ]
37 |
--------------------------------------------------------------------------------
/WARRANTY.txt:
--------------------------------------------------------------------------------
1 | THIS SOURCE CODE IS SUPPLIED “AS IS” WITHOUT WAR-
2 | RANTY OF ANY KIND, AND ITS AUTHOR AND THE JOURNAL OF
3 | MACHINE LEARNING RESEARCH (JMLR) AND JMLR’S PUBLISH-
4 | ERS AND DISTRIBUTORS, DISCLAIM ANY AND ALL WARRANTIES,
5 | INCLUDING BUT NOT LIMITED TO ANY IMPLIED WARRANTIES
6 | OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PUR-
7 | POSE, AND ANY WARRANTIES OR NON INFRINGEMENT. THE USER
8 | ASSUMES ALL LIABILITY AND RESPONSIBILITY FOR USE OF THIS
9 | SOURCE CODE, AND NEITHER THE AUTHOR NOR JMLR, NOR
10 | JMLR’S PUBLISHERS AND DISTRIBUTORS, WILL BE LIABLE FOR
11 | DAMAGES OF ANY KIND RESULTING FROM ITS USE. Without lim-
12 | iting the generality of the foregoing, neither the author, nor JMLR, nor
13 | JMLR’s publishers and distributors, warrant that the Source Code will be
14 | error-free, will operate without interruption, or will meet the needs of the
15 | user
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation
6 |
7 | if "%SPHINXBUILD%" == "" (
8 | set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=.
11 | set BUILDDIR=_build
12 |
13 | if "%1" == "" goto help
14 |
15 | %SPHINXBUILD% >NUL 2>NUL
16 | if errorlevel 9009 (
17 | echo.
18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19 | echo.installed, then set the SPHINXBUILD environment variable to point
20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you
21 | echo.may add the Sphinx directory to PATH.
22 | echo.
23 | echo.If you don't have Sphinx installed, grab it from
24 | echo.http://sphinx-doc.org/
25 | exit /b 1
26 | )
27 |
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 |
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 |
34 | :end
35 | popd
36 |
--------------------------------------------------------------------------------
/skhubness/utils/multiprocessing.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # SPDX-License-Identifier: BSD-3-Clause
3 | # Author: Roman Feldbauer
4 | from multiprocessing import cpu_count
5 |
6 | __all__ = [
7 | "validate_n_jobs",
8 | ]
9 |
10 |
def register_parallel_pytest_cov():
    """ Enable pytest-cov coverage collection in subprocesses, if available.

    Silently does nothing when ``pytest_cov`` is not installed.
    """
    try:
        from pytest_cov.embed import cleanup_on_sigterm
    except ImportError:
        return
    cleanup_on_sigterm()
18 |
19 |
def validate_n_jobs(n_jobs):
    """ Handle special integers and non-integer `n_jobs` values. """
    # ``None`` means sequential execution.
    if n_jobs is None:
        return 1
    # ``-1`` means "use all local CPU cores".
    if n_jobs == -1:
        return cpu_count()
    # Anything else non-positive is invalid.
    if n_jobs == 0 or n_jobs < -1:
        raise ValueError(f"Number of parallel processes 'n_jobs' must be "
                         f"a positive integer, or ``-1`` to use all local"
                         f" CPU cores. Was {n_jobs} instead.")
    return n_jobs
31 |
--------------------------------------------------------------------------------
/docs/documentation/auto_examples/sg_execution_times.rst:
--------------------------------------------------------------------------------
1 |
2 | :orphan:
3 |
4 | .. _sphx_glr_documentation_auto_examples_sg_execution_times:
5 |
6 | Computation times
7 | =================
8 | **00:25.940** total execution time for **documentation_auto_examples** files:
9 |
10 | - **00:25.940**: :ref:`sphx_glr_documentation_auto_examples_plot_classification.py` (``plot_classification.py``)
11 | - **00:00.000**: :ref:`sphx_glr_documentation_auto_examples_plot_lle_digits.py` (``plot_lle_digits.py``)
12 | - **00:00.000**: :ref:`sphx_glr_documentation_auto_examples_plot_multioutput_face_completion.py` (``plot_multioutput_face_completion.py``)
13 | - **00:00.000**: :ref:`sphx_glr_documentation_auto_examples_plot_nca_classification.py` (``plot_nca_classification.py``)
14 | - **00:00.000**: :ref:`sphx_glr_documentation_auto_examples_plot_nca_dim_reduction.py` (``plot_nca_dim_reduction.py``)
15 | - **00:00.000**: :ref:`sphx_glr_documentation_auto_examples_plot_nearest_centroid.py` (``plot_nearest_centroid.py``)
16 | - **00:00.000**: :ref:`sphx_glr_documentation_auto_examples_plot_regression.py` (``plot_regression.py``)
17 |
--------------------------------------------------------------------------------
/skhubness/data/load_dataset.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: BSD-3-Clause
2 |
3 | import os
4 | import numpy as np
5 |
6 | __all__ = ["load_dexter"]
7 |
8 |
def load_dexter() -> (np.ndarray, np.ndarray):
    """Load the example data set (dexter).

    DEXTER is a two-class text classification problem in a sparse
    bag-of-words representation (300 samples, 20 000 features),
    shipped with this package under ``skhubness/data/dexter``.

    Returns
    -------
    X, y : ndarray, ndarray
        Vector data, and class labels
    """
    n = 300
    dim = 20_000

    # Locate the packaged data files next to this module.
    dexter_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "dexter")
    dexter_labels = os.path.join(dexter_path, "dexter_train.labels")
    dexter_vectors = os.path.join(dexter_path, "dexter_train.data")

    # Read class labels
    y = np.loadtxt(dexter_labels)

    # Read data: each line holds whitespace-separated "col:val" pairs
    # with 1-based column indices; densify into an (n, dim) array.
    X = np.zeros((n, dim))
    with open(dexter_vectors, mode="r") as fid:
        # Iterate the file lazily instead of readlines(), and let
        # enumerate track the row instead of a manual counter.
        for row, line in enumerate(fid):
            for word in line.strip().split():
                col, val = word.split(":")
                # numpy tuple indexing instead of chained X[row][...]
                X[row, int(col) - 1] = int(val)

    return X, y
39 |
--------------------------------------------------------------------------------
/scripts/install-puffinn.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # Build external dependencies that cannot successfully install via pip or conda
3 | # If you use this file as template, don't forget to `chmod a+x newfile`
4 |
5 | set -e
6 |
7 | # Check for the operating system and install puffinn
8 | if [[ $(uname) == "Darwin" ]]; then
9 | echo "Running under Mac OS X..."
10 | git clone https://github.com/puffinn/puffinn.git
11 | cd puffinn
12 | python3 setup.py build
13 | pip install .
14 | cd ..
15 | rm -r puffinn
16 |
17 | elif [[ $(uname -s) == Linux* ]]; then
18 | echo "Running under Linux..."
19 | # Trying to install puffinn from cache,
20 | # and only build if this fails.
21 | # pip install puffinn || (\
22 | # git clone https://github.com/puffinn/puffinn.git;\
23 | # cd puffinn;\
24 | # python3 setup.py build;\
25 | # pip install . ;\
26 | # cd ..)
27 | git clone https://github.com/puffinn/puffinn.git
28 | cd puffinn
29 | python3 setup.py build
30 | pip install .
31 | cd ..
32 | rm -r puffinn
33 |
34 | elif [[ $(uname -s) == MINGW32_NT* ]]; then
35 | echo "Running under Win x86-32"
36 | echo "Nothing to build."
37 |
38 | elif [[ $(uname -s) == MINGW64_NT* ]]; then
39 | echo "Running under Win x86-64"
40 | echo "Nothing to build."
41 |
42 | elif [[ $(uname -s) == CYGWIN* ]]; then
43 | echo "Running under Cygwin"
44 | echo "Nothing to build."
45 |
46 | fi
47 |
--------------------------------------------------------------------------------
/skhubness/utils/io.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # SPDX-License-Identifier: BSD-3-Clause
3 | # Author: Roman Feldbauer
4 | import logging
5 | from tempfile import mkstemp, NamedTemporaryFile
6 |
7 | __all__ = [
8 | "create_tempfile_preferably_in_dir",
9 | "validate_verbose",
10 | ]
11 |
12 |
def create_tempfile_preferably_in_dir(suffix=None, prefix=None, directory=None, persistent: bool = False):
    """ Create a temporary file with precedence for directory if possible, in TMP otherwise.
    For example, this is useful to try to save into /dev/shm.

    Parameters
    ----------
    suffix, prefix : str, optional
        Passed through to the tempfile factory.
    directory : str, optional
        Preferred directory; falls back to the default TMP directory
        (with a logged warning) if it does not exist.
    persistent : bool, default = False
        If True, use ``mkstemp`` so the file remains on disk.
        If False, use ``NamedTemporaryFile``.
        NOTE(review): in the non-persistent case only the path is returned and
        the file object goes out of scope, so the file may already be deleted
        when the caller uses the path — confirm intended semantics.

    Returns
    -------
    path : str
        Path of the created temporary file.
    """
    import os

    temp_file = mkstemp if persistent else NamedTemporaryFile
    try:
        handle = temp_file(suffix=suffix, prefix=prefix, dir=directory)
        warn = False
    except FileNotFoundError:
        handle = temp_file(suffix=suffix, prefix=prefix, dir=None)
        warn = True

    # Extract the path (as string)
    try:
        path = handle.name
    except AttributeError:
        # mkstemp returns (fd, path); close the OS-level file descriptor
        # here, otherwise it leaks (the caller only ever sees the path).
        fd, path = handle
        os.close(fd)

    if warn:
        logging.warning(f"Could not create temp file in {directory}. "
                        f"Instead, the path is {path}.")
    return path
35 |
36 |
def validate_verbose(verbose):
    """ Handle special values for verbose parameter.

    ``None`` and negative values are normalized to 0 (silent);
    any other value is returned unchanged.
    """
    if verbose is None or verbose < 0:
        return 0
    return verbose
44 |
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | BSD 3-Clause License
2 |
3 | Copyright (c) 2018-2019, the scikit-hubness developers
4 | All rights reserved.
5 |
6 | Redistribution and use in source and binary forms, with or without
7 | modification, are permitted provided that the following conditions are met:
8 |
9 | 1. Redistributions of source code must retain the above copyright notice, this
10 | list of conditions and the following disclaimer.
11 |
12 | 2. Redistributions in binary form must reproduce the above copyright notice,
13 | this list of conditions and the following disclaimer in the documentation
14 | and/or other materials provided with the distribution.
15 |
16 | 3. Neither the name of the copyright holder nor the names of its
17 | contributors may be used to endorse or promote products derived from
18 | this software without specific prior written permission.
19 |
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 |
--------------------------------------------------------------------------------
/docs/documentation/history.rst:
--------------------------------------------------------------------------------
1 | ==============================
2 | History of ``scikit-hubness``
3 | ==============================
4 |
5 | ``scikit-hubness`` builds upon previous software: the Hub-Toolbox.
6 | The original `Hub-Toolbox <https://github.com/OFAI/hub-toolbox-matlab>`_
7 | was written for Matlab, and released in parallel
8 | with the release of the first hubness reduction methods in
9 | `JMLR <https://www.jmlr.org/papers/v13/schnitzer12a.html>`_.
10 | In essence, it comprises methods to reduce hubness in distance matrices.
11 |
12 | The `Hub-Toolbox for Python3 <https://github.com/OFAI/hub-toolbox-python3>`_
13 | is a port from Matlab to Python,
14 | which over the years got several extensions and additional functionality,
15 | such as more hubness reduction methods (Localized Centering, DisSimLocal, mp-dissim, etc.),
16 | approximate hubness reduction, and more.
17 | The software was developed by hubness researchers for hubness research.
18 |
19 | The new ``scikit-hubness`` package is rewritten from scratch with a different goal in mind:
20 | Providing easy-to-use neighborhood-based data mining methods (classification, regression, etc.)
21 | with transparent hubness reduction.
22 | Building upon scikit-learn's ``neighbors`` package, we provide a drop-in replacement
23 | called ``skhubness.neighbors``, which offers all the functionality of ``sklearn.neighbors``,
24 | but adds additional functionality (approximate nearest neighbor search, hubness reduction).
25 |
26 | This way, we think that machine learning researchers and practitioners
27 | (many of which will be fluent in scikit-learn)
28 | can quickly and effectively employ ``scikit-hubness`` in their existing workflows,
29 | and improve learning in their high-dimensional data.
30 |
--------------------------------------------------------------------------------
/docs/documentation/auto_examples_ann/index.rst:
--------------------------------------------------------------------------------
1 | :orphan:
2 |
3 |
4 |
5 | .. _sphx_glr_documentation_auto_examples_ann:
6 |
7 | ============================================
8 | Example: Approximate nearest neighbor search
9 | ============================================
10 |
11 | This example shows how to perform approximate nearest neighbor search.
12 |
13 |
14 |
15 | .. raw:: html
16 |
17 |
18 |
19 | .. only:: html
20 |
21 | .. figure:: /documentation/auto_examples_ann/images/thumb/sphx_glr_word_embeddings_thumb.png
22 |
23 | :ref:`sphx_glr_documentation_auto_examples_ann_word_embeddings.py`
24 |
25 | .. raw:: html
26 |
27 |
28 |
29 |
30 | .. toctree::
31 | :hidden:
32 |
33 | /documentation/auto_examples_ann/word_embeddings
34 | .. raw:: html
35 |
36 |
37 |
38 |
39 |
40 | .. only :: html
41 |
42 | .. container:: sphx-glr-footer
43 | :class: sphx-glr-footer-gallery
44 |
45 |
46 | .. container:: sphx-glr-download
47 |
48 | :download:`Download all examples in Python source code: auto_examples_ann_python.zip </home/user/feldbauer/PycharmProjects/hubness/docs/documentation/auto_examples_ann/auto_examples_ann_python.zip>`
49 |
50 |
51 |
52 | .. container:: sphx-glr-download
53 |
54 | :download:`Download all examples in Jupyter notebooks: auto_examples_ann_jupyter.zip </home/user/feldbauer/PycharmProjects/hubness/docs/documentation/auto_examples_ann/auto_examples_ann_jupyter.zip>`
55 |
56 |
57 | .. only:: html
58 |
59 | .. rst-class:: sphx-glr-signature
60 |
61 | `Gallery generated by Sphinx-Gallery <https://sphinx-gallery.github.io>`_
62 |
--------------------------------------------------------------------------------
/docs/documentation/nearestneighbors.rst:
--------------------------------------------------------------------------------
1 | ========================================================
2 | Nearest neighbors
3 | ========================================================
4 |
5 | The :mod:`skhubness.neighbors` subpackage provides several neighbors-based learning methods.
6 | It is designed as a drop-in replacement for scikit-learn's ``neighbors``.
7 | The package provides all functionality from ``sklearn.neighbors``,
8 | and adds support for transparent hubness reduction, where applicable, including
9 |
10 | - classification (e.g. :class:`KNeighborsClassifier `),
11 | - regression (e.g. :class:`RadiusNeighborsRegressor `),
12 | - unsupervised learning (e.g. :class:`NearestNeighbors `),
13 | - outlier detection (:class:`LocalOutlierFactor `), and
14 | - kNN graphs (:meth:`kneighbors_graph `).
15 |
16 | In addition, scikit-hubness provides approximate nearest neighbor (ANN) search,
17 | in order to support large data sets with millions of data objects and more.
18 | A list of currently provided ANN methods is available
19 | :ref:`here `.
20 |
21 | Hubness reduction and ANN search can be used independently or in conjunction,
22 | the latter yielding `approximate hubness reduction`.
23 | User of scikit-learn will find that only minor modification of their code
24 | is required to enable one or both of the above.
25 | We describe how to do so :ref:`here `.
26 |
27 | For general information and details about nearest neighbors,
28 | we refer to the excellent scikit-learn
29 | `User Guide on Nearest Neighbors <https://scikit-learn.org/stable/modules/neighbors.html>`__.
30 |
--------------------------------------------------------------------------------
/docs/documentation/documentation.rst:
--------------------------------------------------------------------------------
1 | =================
2 | API Documentation
3 | =================
4 |
5 | This is the API documentation for ``scikit-hubness``.
6 |
7 | .. _data_ref:
8 |
9 | Analysis: :mod:`skhubness.analysis`
10 | ===================================
11 |
12 | .. automodule:: skhubness.analysis
13 | :no-members:
14 | :no-inherited-members:
15 |
16 | .. currentmodule:: skhubness
17 |
18 | .. autosummary::
19 | :nosignatures:
20 | :toctree: _autosummary
21 |
22 | analysis.Hubness
23 | analysis.VALID_HUBNESS_MEASURES
24 |
25 |
26 | Neighbors: :mod:`skhubness.neighbors`
27 | =====================================
28 |
29 | .. automodule:: skhubness.neighbors
30 | :no-members:
31 | :no-inherited-members:
32 |
33 | .. currentmodule:: skhubness
34 |
35 | .. autosummary::
36 | :nosignatures:
37 | :toctree: _autosummary
38 |
39 | neighbors.BallTree
40 | neighbors.DistanceMetric
41 | neighbors.KDTree
42 | neighbors.LegacyHNSW
43 | neighbors.KNeighborsClassifier
44 | neighbors.KNeighborsRegressor
45 | neighbors.LegacyFalconn
46 | neighbors.NearestCentroid
47 | neighbors.NearestNeighbors
48 | neighbors.LegacyNNG
49 | neighbors.LegacyPuffinn
50 | neighbors.RadiusNeighborsClassifier
51 | neighbors.RadiusNeighborsRegressor
52 | neighbors.LegacyRandomProjectionTree
53 | neighbors.kneighbors_graph
54 | neighbors.radius_neighbors_graph
55 | neighbors.KernelDensity
56 | neighbors.LocalOutlierFactor
57 | neighbors.NeighborhoodComponentsAnalysis
58 |
59 |
60 | Reduction: :mod:`skhubness.reduction`
61 | =====================================
62 |
63 | .. automodule:: skhubness.reduction
64 | :no-members:
65 | :no-inherited-members:
66 |
67 | .. currentmodule:: skhubness
68 |
69 | .. autosummary::
70 | :nosignatures:
71 | :toctree: _autosummary
72 |
73 | reduction.MutualProximity
74 | reduction.LocalScaling
75 | reduction.DisSimLocal
76 | reduction.hubness_algorithms
77 |
--------------------------------------------------------------------------------
/examples/approximate_hub_red/reusing_index.py:
--------------------------------------------------------------------------------
1 | """
2 | ========================================
3 | Example: Reusing index structures
4 | ========================================
5 |
6 | This example shows how to reuse index structures. If you want to first estimate hubness,
7 | and then perform kNN, you can avoid recomputing the ANN index structure, which can be
8 | costly.
9 | """
10 | from sklearn.datasets import make_classification
11 | from sklearn.model_selection import train_test_split
12 |
13 | from skhubness.analysis import LegacyHubness
14 | from skhubness.neighbors import KNeighborsClassifier
15 |
16 | X, y = make_classification(n_samples=100_000,
17 | n_features=500,
18 | n_informative=400,
19 | random_state=543)
20 |
21 | X_train, X_test, y_train, y_test = train_test_split(X, y,
22 | test_size=0.01,
23 | stratify=y,
24 | shuffle=True,
25 | random_state=2346)
26 |
27 | # Approximate hubness estimation: Creates LSH index and computes local scaling factors
28 | hub = LegacyHubness(k=10,
29 | return_value='robinhood',
30 | algorithm='falconn_lsh',
31 | hubness='ls',
32 | random_state=2345,
33 | shuffle_equal=False,
34 | verbose=1)
35 | hub.fit(X_train)
36 |
37 | robin_hood = hub.score(X_test)
38 | print(f'Hubness (Robin Hood): {robin_hood}:.4f')
39 | # 0.9060
40 |
41 | # Approximate hubness reduction for classification: Reuse index & factors
42 | knn = KNeighborsClassifier(n_neighbor=10,
43 | algorithm='falconn_lsh',
44 | hubness='ls',
45 | n_jobs=1)
46 |
47 | knn.fit(hub.nn_index_, y_train) # REUSE INDEX HERE
48 | acc = knn.score(X_test, y_test)
49 | print(f'Test accuracy: {acc:.3f}')
50 | # 0.959
51 |
--------------------------------------------------------------------------------
/docs/documentation/auto_examples_ahr/reusing_index.py:
--------------------------------------------------------------------------------
1 | """
2 | ========================================
3 | Example: Reusing index structures
4 | ========================================
5 |
6 | This example shows how to reuse index structures. If you want to first estimate hubness,
7 | and then perform kNN, you can avoid recomputing the ANN index structure, which can be
8 | costly.
9 | """
10 | from sklearn.datasets import make_classification
11 | from sklearn.model_selection import train_test_split
12 |
13 | from skhubness.analysis import LegacyHubness
14 | from skhubness.neighbors import KNeighborsClassifier
15 |
16 | X, y = make_classification(n_samples=100_000,
17 | n_features=500,
18 | n_informative=400,
19 | random_state=543)
20 |
21 | X_train, X_test, y_train, y_test = train_test_split(X, y,
22 | test_size=0.01,
23 | stratify=y,
24 | shuffle=True,
25 | random_state=2346)
26 |
27 | # Approximate hubness estimation: Creates LSH index and computes local scaling factors
28 | hub = LegacyHubness(k=10,
29 | return_value='robinhood',
30 | algorithm='falconn_lsh',
31 | hubness='ls',
32 | random_state=2345,
33 | shuffle_equal=False,
34 | verbose=1)
35 | hub.fit(X_train)
36 |
37 | robin_hood = hub.score(X_test)
38 | print(f'Hubness (Robin Hood): {robin_hood}:.4f')
39 | # 0.9060
40 |
41 | # Approximate hubness reduction for classification: Reuse index & factors
42 | knn = KNeighborsClassifier(n_neighbor=10,
43 | algorithm='falconn_lsh',
44 | hubness='ls',
45 | n_jobs=1)
46 |
47 | knn.fit(hub.nn_index_, y_train) # REUSE INDEX HERE
48 | acc = knn.score(X_test, y_test)
49 | print(f'Test accuracy: {acc:.3f}')
50 | # 0.959
51 |
--------------------------------------------------------------------------------
/examples/approximate_hub_red/high_dim_gaussian.py:
--------------------------------------------------------------------------------
1 | """
2 | ========================================
3 | Example: Approximate hubness reduction
4 | ========================================
5 |
6 | This example shows how to combine approximate nearest neighbor search and hubness reduction
7 | in order to perform approximate hubness reduction for large data sets.
8 | """
9 | from sklearn.datasets import make_classification
10 | from sklearn.metrics import accuracy_score
11 | from sklearn.model_selection import train_test_split
12 |
13 | from skhubness.analysis import LegacyHubness
14 | from skhubness.neighbors import KNeighborsClassifier
15 |
16 | # High-dimensional artificial data
17 | X, y = make_classification(n_samples=1_000_000,
18 | n_features=500,
19 | n_informative=400,
20 | random_state=543)
21 |
22 | X_train, X_test, y_train, y_test = train_test_split(X, y,
23 | test_size=10_000,
24 | stratify=y,
25 | shuffle=True,
26 | random_state=2346)
27 |
28 | # Approximate hubness estimation
29 | hub = LegacyHubness(k=10,
30 | return_value='robinhood',
31 | algorithm='hnsw',
32 | random_state=2345,
33 | shuffle_equal=False,
34 | n_jobs=-1,
35 | verbose=2)
36 | hub.fit(X_train)
37 | robin_hood = hub.score(X_test)
38 | print(f'Hubness (Robin Hood): {robin_hood:.3f}')
39 | # 0.944
40 |
41 | # Approximate hubness reduction for classification
42 | knn = KNeighborsClassifier(n_neighbor=10,
43 | algorithm='hnsw',
44 | hubness='ls',
45 | n_jobs=-1,
46 | verbose=2)
47 |
48 | knn.fit(X_train, y_train)
49 | y_pred = knn.predict(X_test)
50 | acc = accuracy_score(y_test, y_pred)
51 | print(f'Test accuracy: {acc:.3f}')
52 | # Test accuracy: 0.987
53 |
--------------------------------------------------------------------------------
/docs/documentation/auto_examples_ahr/high_dim_gaussian.py:
--------------------------------------------------------------------------------
1 | """
2 | ========================================
3 | Example: Approximate hubness reduction
4 | ========================================
5 |
6 | This example shows how to combine approximate nearest neighbor search and hubness reduction
7 | in order to perform approximate hubness reduction for large data sets.
8 | """
9 | from sklearn.datasets import make_classification
10 | from sklearn.metrics import accuracy_score
11 | from sklearn.model_selection import train_test_split
12 |
13 | from skhubness.analysis import LegacyHubness
14 | from skhubness.neighbors import KNeighborsClassifier
15 |
16 | # High-dimensional artificial data
17 | X, y = make_classification(n_samples=1_000_000,
18 | n_features=500,
19 | n_informative=400,
20 | random_state=543)
21 |
22 | X_train, X_test, y_train, y_test = train_test_split(X, y,
23 | test_size=10_000,
24 | stratify=y,
25 | shuffle=True,
26 | random_state=2346)
27 |
28 | # Approximate hubness estimation
29 | hub = LegacyHubness(k=10,
30 | return_value='robinhood',
31 | algorithm='hnsw',
32 | random_state=2345,
33 | shuffle_equal=False,
34 | n_jobs=-1,
35 | verbose=2)
36 | hub.fit(X_train)
37 | robin_hood = hub.score(X_test)
38 | print(f'LegacyHubness (Robin Hood): {robin_hood:.3f}')
39 | # 0.944
40 |
41 | # Approximate hubness reduction for classification
42 | knn = KNeighborsClassifier(n_neighbor=10,
43 | algorithm='hnsw',
44 | hubness='ls',
45 | n_jobs=-1,
46 | verbose=2)
47 |
48 | knn.fit(X_train, y_train)
49 | y_pred = knn.predict(X_test)
50 | acc = accuracy_score(y_test, y_pred)
51 | print(f'Test accuracy: {acc:.3f}')
52 | # Test accuracy: 0.987
53 |
--------------------------------------------------------------------------------
/examples/sklearn/plot_regression.py:
--------------------------------------------------------------------------------
1 | """
2 | ============================
3 | Nearest Neighbors regression
4 | ============================
5 |
6 | Demonstrate the resolution of a regression problem
7 | using a k-Nearest Neighbor and the interpolation of the
8 | target using both barycenter and constant weights.
9 |
10 | Hubness reduction of this low-dimensional dataset
11 | shows only small effects.
12 |
13 | Adapted from ``_
14 | """
15 | print(__doc__)
16 |
17 | # Author: Alexandre Gramfort
18 | # Fabian Pedregosa
19 | #
20 | # License: BSD 3 clause (C) INRIA
21 |
22 |
23 | # #############################################################################
24 | # Generate sample data
25 | import numpy as np
26 | import matplotlib.pyplot as plt
27 | from skhubness.neighbors import KNeighborsRegressor
28 |
29 | np.random.seed(0)
30 | X = np.sort(5 * np.random.rand(40, 1), axis=0)
31 | T = np.linspace(0, 5, 500)[:, np.newaxis]
32 | y = np.sin(X).ravel()
33 |
34 | # Add noise to targets
35 | y[::5] += 1 * (0.5 - np.random.rand(8))
36 |
37 | # #############################################################################
38 | # Fit regression model
39 | n_neighbors = 5
40 |
41 | f = plt.figure()
42 | for i, weights in enumerate(['uniform', 'distance']):
43 | for j, hubness in enumerate([None, 'local_scaling']):
44 | knn = KNeighborsRegressor(n_neighbors,
45 | algorithm_params={'n_candidates': 39},
46 | weights=weights,
47 | hubness=hubness)
48 | y_ = knn.fit(X, y).predict(T)
49 |
50 | plt.subplot(2, 2, i * 2 + j + 1)
51 | f.set_figheight(15)
52 | f.set_figwidth(15)
53 | plt.scatter(X, y, c='k', label='data')
54 | plt.plot(T, y_, c='g', label='prediction')
55 | plt.axis('tight')
56 | plt.legend()
57 | plt.title(f"KNeighborsRegressor (k = {n_neighbors}, weights = '{weights}', hubness = '{hubness}')")
58 |
59 | plt.tight_layout()
60 | plt.show()
--------------------------------------------------------------------------------
/docs/documentation/auto_examples/plot_regression.py:
--------------------------------------------------------------------------------
1 | """
2 | ============================
3 | Nearest Neighbors regression
4 | ============================
5 |
6 | Demonstrate the resolution of a regression problem
7 | using a k-Nearest Neighbor and the interpolation of the
8 | target using both barycenter and constant weights.
9 |
10 | Hubness reduction of this low-dimensional dataset
11 | shows only small effects.
12 |
13 | Adapted from ``_
14 | """
15 | print(__doc__)
16 |
17 | # Author: Alexandre Gramfort
18 | # Fabian Pedregosa
19 | #
20 | # License: BSD 3 clause (C) INRIA
21 |
22 |
23 | # #############################################################################
24 | # Generate sample data
25 | import numpy as np
26 | import matplotlib.pyplot as plt
27 | from skhubness.neighbors import KNeighborsRegressor
28 |
29 | np.random.seed(0)
30 | X = np.sort(5 * np.random.rand(40, 1), axis=0)
31 | T = np.linspace(0, 5, 500)[:, np.newaxis]
32 | y = np.sin(X).ravel()
33 |
34 | # Add noise to targets
35 | y[::5] += 1 * (0.5 - np.random.rand(8))
36 |
37 | # #############################################################################
38 | # Fit regression model
39 | n_neighbors = 5
40 |
41 | f = plt.figure()
42 | for i, weights in enumerate(['uniform', 'distance']):
43 | for j, hubness in enumerate([None, 'local_scaling']):
44 | knn = KNeighborsRegressor(n_neighbors,
45 | algorithm_params={'n_candidates': 39},
46 | weights=weights,
47 | hubness=hubness)
48 | y_ = knn.fit(X, y).predict(T)
49 |
50 | plt.subplot(2, 2, i * 2 + j + 1)
51 | f.set_figheight(15)
52 | f.set_figwidth(15)
53 | plt.scatter(X, y, c='k', label='data')
54 | plt.plot(T, y_, c='g', label='prediction')
55 | plt.axis('tight')
56 | plt.legend()
57 | plt.title(f"KNeighborsRegressor (k = {n_neighbors}, weights = '{weights}', hubness = '{hubness}')")
58 |
59 | plt.tight_layout()
60 | plt.show()
--------------------------------------------------------------------------------
/scripts/install-ngt.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # Build external dependencies that cannot successfully install via pip or conda
3 | # If you use this file as template, don't forget to `chmod a+x newfile`
4 |
5 | set -e
6 |
7 | # Check for the operating system and install NGT
8 | if [[ $(uname) == "Darwin" ]]; then
9 | # Test exit status of `command -v` directly: the previous `[[ $(command ngt ...) ]]`
10 | # form tested captured (empty) output as a string and was therefore always false.
11 | if command -v ngt > /dev/null 2>&1 && command -v ngtq > /dev/null 2>&1 && command -v ngtqg > /dev/null 2>&1; then
12 | # This only checks for available ngt commands. Does not currently check the version.
13 | # To update NGT, this must be adapted.
14 | echo "NGT already installed"
15 | else
16 | echo "Installing NGT under Mac OS X..."
17 | /usr/bin/ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)"
18 | brew install cmake
19 | brew install gcc@9
20 | export CXX=/usr/local/bin/g++-9
21 | export CC=/usr/local/bin/gcc-9
22 | pushd /tmp/
23 | git clone https://github.com/yahoojapan/NGT
24 | cd NGT/
25 | mkdir build
26 | cd build/
27 | cmake ..
28 | make
29 | sudo make install
30 | cd ../python
31 | pip install .
32 | popd
33 | rm -r /tmp/NGT
34 | fi
35 |
36 | elif [[ $(uname -s) == Linux* ]]; then
37 | if command -v ngt > /dev/null 2>&1 && command -v ngtq > /dev/null 2>&1 && command -v ngtqg > /dev/null 2>&1; then
38 | # This only checks for available ngt commands. Does not currently check the version.
39 | # To update NGT, this must be adapted.
40 | echo "NGT already installed"
41 | else
42 | echo "Installing NGT under Linux..."
43 | pushd /tmp/
44 | git clone https://github.com/yahoojapan/NGT
45 | cd NGT/
46 | mkdir build
47 | cd build/
48 | cmake ..
49 | make
50 | sudo make install
51 | sudo ldconfig /usr/local/lib/
52 | cd ../python
53 | pip install .
54 | popd
55 | rm -r /tmp/NGT
56 | fi
57 |
58 | elif [[ $(uname -s) == MINGW32_NT* ]]; then
59 | echo "NGT not available under Win x86-32"
60 |
61 | elif [[ $(uname -s) == MINGW64_NT* ]]; then
62 | echo "NGT not available under Win x86-64"
63 |
64 | elif [[ $(uname -s) == CYGWIN* ]]; then
65 | echo "NGT not available under Cygwin"
66 |
67 | fi
68 |
--------------------------------------------------------------------------------
/.github/workflows/scikit-hubness_ci.yml:
--------------------------------------------------------------------------------
1 | # This workflow will install Python dependencies, run tests and lint with a single version of Python
2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
3 |
4 | name: scikit-hubness CI
5 |
6 | on:
7 | push:
8 | branches: [ main ]
9 | pull_request:
10 | branches: [ main ]
11 |
12 | jobs:
13 | build:
14 | runs-on: ${{ matrix.os }}
15 | strategy:
16 | fail-fast: false
17 | matrix:
18 | os: [ ubuntu-latest, macos-latest, windows-latest ]
19 | python: [ "3.8" , "3.9", "3.10" ]
20 | exclude:
21 | # Building nmslib from source fails on Windows: issue #102
22 | - os: windows-latest
23 | python: "3.9"
24 | - os: windows-latest
25 | python: "3.10"
26 |
27 | steps:
28 | - uses: actions/checkout@v2
29 | - name: Set up Python
30 | uses: actions/setup-python@v2
31 | with:
32 | python-version: ${{ matrix.python }}
33 | - name: Install dependencies
34 | run: |
35 | python3 -m pip install --upgrade pip
36 | python3 -m pip install setuptools wheel pybind11
37 | - name: Install ANN packages with special care
38 | run: |
39 | scripts/install-ngt.sh
40 | scripts/install-puffinn.sh
41 | - name: Install scikit-hubness
42 | run: |
43 | echo "Running on platform.system()=$(python -c 'import platform; print(platform.system())')"
44 | python3 -m pip install .[ann,tests]
45 | - name: Lint with flake8
46 | run: |
47 | # stop the build if there are Python syntax errors or undefined names
48 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
49 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
50 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
51 | - name: Test with pytest
52 | run: |
53 | pytest --cov=skhubness --cov-append
54 | - name: Test coverage
55 | run: coverage html
56 | - name: Codecov
57 | run: codecov
58 |
--------------------------------------------------------------------------------
/examples/sklearn/plot_classification.py:
--------------------------------------------------------------------------------
1 | """
2 | ================================
3 | Nearest Neighbors Classification
4 | ================================
5 | Sample usage of Nearest Neighbors classification.
6 | It will plot the decision boundaries for each class.
7 |
8 | Adapted from ``_
9 | """
10 |
11 | import numpy as np
12 | import matplotlib.pyplot as plt
13 | from matplotlib.colors import ListedColormap
14 | from sklearn import datasets
15 | from skhubness.neighbors import KNeighborsClassifier
16 |
17 | n_neighbors = 15
18 |
19 | # import some data to play with
20 | iris = datasets.load_iris()
21 |
22 | # we only take the first two features. We could avoid this ugly
23 | # slicing by using a two-dim dataset
24 | X = iris.data[:, :2]
25 | y = iris.target
26 |
27 | h = .02 # step size in the mesh
28 |
29 | # Create color maps
30 | cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
31 | cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])
32 |
33 | for hubness in [None, 'mutual_proximity']:
34 | # we create an instance of Neighbours Classifier and fit the data.
35 | clf = KNeighborsClassifier(n_neighbors,
36 | hubness=hubness,
37 | weights='distance')
38 | clf.fit(X, y)
39 |
40 | # Plot the decision boundary. For that, we will assign a color to each
41 | # point in the mesh [x_min, x_max]x[y_min, y_max].
42 | x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
43 | y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
44 | xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
45 | np.arange(y_min, y_max, h))
46 | Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
47 |
48 | # Put the result into a color plot
49 | Z = Z.reshape(xx.shape)
50 | plt.figure()
51 | plt.pcolormesh(xx, yy, Z, cmap=cmap_light)
52 |
53 | # Plot also the training points
54 | plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold,
55 | edgecolor='k', s=20)
56 | plt.xlim(xx.min(), xx.max())
57 | plt.ylim(yy.min(), yy.max())
58 | plt.title("3-Class classification (k = %i, hubness = '%s')"
59 | % (n_neighbors, hubness))
60 |
61 | plt.show()
62 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Manually added
2 | dist_prev/
3 | notebooks/
4 | examples/playground
5 | new_test_pypi_release.bash
6 | new_pypi_release.bash
7 | coverage.html/*
8 | _autosummary/
9 | .idea/
10 | venv/
11 | codemeta.json
12 | generate_joss_metadata.rb
13 | *__pycache__*
14 | *egg-info*
15 |
16 | # From github
17 | # Byte-compiled / optimized / DLL files
18 | __pycache__/
19 | *.py[cod]
20 | *$py.class
21 |
22 | # C extensions
23 | *.so
24 |
25 | # Distribution / packaging
26 | .Python
27 | build/
28 | develop-eggs/
29 | dist/
30 | downloads/
31 | eggs/
32 | .eggs/
33 | lib/
34 | lib64/
35 | parts/
36 | sdist/
37 | var/
38 | wheels/
39 | share/python-wheels/
40 | *.egg-info/
41 | .installed.cfg
42 | *.egg
43 | MANIFEST
44 |
45 | # PyInstaller
46 | # Usually these files are written by a python script from a template
47 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
48 | *.manifest
49 | *.spec
50 |
51 | # Installer logs
52 | pip-log.txt
53 | pip-delete-this-directory.txt
54 |
55 | # Unit test / coverage reports
56 | htmlcov/
57 | .tox/
58 | .nox/
59 | .coverage
60 | .coverage.*
61 | .cache
62 | nosetests.xml
63 | coverage.xml
64 | *.cover
65 | .hypothesis/
66 | .pytest_cache/
67 |
68 | # Translations
69 | *.mo
70 | *.pot
71 |
72 | # Django stuff:
73 | *.log
74 | local_settings.py
75 | db.sqlite3
76 |
77 | # Flask stuff:
78 | instance/
79 | .webassets-cache
80 |
81 | # Scrapy stuff:
82 | .scrapy
83 |
84 | # Sphinx documentation
85 | docs/_build/
86 |
87 | # PyBuilder
88 | target/
89 |
90 | # Jupyter Notebook
91 | .ipynb_checkpoints
92 |
93 | # IPython
94 | profile_default/
95 | ipython_config.py
96 |
97 | # pyenv
98 | .python-version
99 |
100 | # celery beat schedule file
101 | celerybeat-schedule
102 |
103 | # SageMath parsed files
104 | *.sage.py
105 |
106 | # Environments
107 | .env
108 | .venv
109 | env/
110 | venv/
111 | ENV/
112 | env.bak/
113 | venv.bak/
114 |
115 | # Spyder project settings
116 | .spyderproject
117 | .spyproject
118 |
119 | # Rope project settings
120 | .ropeproject
121 |
122 | # mkdocs documentation
123 | /site
124 |
125 | # mypy
126 | .mypy_cache/
127 | .dmypy.json
128 | dmypy.json
129 |
130 | # Pyre type checker
131 | .pyre/
132 |
--------------------------------------------------------------------------------
/docs/documentation/auto_examples/plot_classification.py:
--------------------------------------------------------------------------------
1 | """
2 | ================================
3 | Nearest Neighbors Classification
4 | ================================
5 | Sample usage of Nearest Neighbors classification.
6 | It will plot the decision boundaries for each class.
7 |
8 | Adapted from ``_
9 | """
10 |
11 | import numpy as np
12 | import matplotlib.pyplot as plt
13 | from matplotlib.colors import ListedColormap
14 | from sklearn import datasets
15 | from skhubness.neighbors import KNeighborsClassifier
16 |
17 | n_neighbors = 15
18 |
19 | # import some data to play with
20 | iris = datasets.load_iris()
21 |
22 | # we only take the first two features. We could avoid this ugly
23 | # slicing by using a two-dim dataset
24 | X = iris.data[:, :2]
25 | y = iris.target
26 |
27 | h = .02 # step size in the mesh
28 |
29 | # Create color maps
30 | cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
31 | cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])
32 |
33 | for hubness in [None, 'mutual_proximity']:
34 | # we create an instance of Neighbours Classifier and fit the data.
35 | clf = KNeighborsClassifier(n_neighbors,
36 | hubness=hubness,
37 | weights='distance')
38 | clf.fit(X, y)
39 |
40 | # Plot the decision boundary. For that, we will assign a color to each
41 | # point in the mesh [x_min, x_max]x[y_min, y_max].
42 | x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
43 | y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
44 | xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
45 | np.arange(y_min, y_max, h))
46 | Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
47 |
48 | # Put the result into a color plot
49 | Z = Z.reshape(xx.shape)
50 | plt.figure()
51 | plt.pcolormesh(xx, yy, Z, cmap=cmap_light)
52 |
53 | # Plot also the training points
54 | plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold,
55 | edgecolor='k', s=20)
56 | plt.xlim(xx.min(), xx.max())
57 | plt.ylim(yy.min(), yy.max())
58 | plt.title("3-Class classification (k = %i, hubness = '%s')"
59 | % (n_neighbors, hubness))
60 |
61 | plt.show()
62 |
--------------------------------------------------------------------------------
/docs/documentation/auto_examples_hr/index.rst:
--------------------------------------------------------------------------------
1 | :orphan:
2 |
3 |
4 |
5 | .. _sphx_glr_documentation_auto_examples_hr:
6 |
7 | ============================================
8 | Example: Hubness reduction
9 | ============================================
10 |
11 | These examples show how to perform hubness reduction in kNN classification
12 | in (nested) cross-validation and pipelines.
13 |
14 |
15 |
16 | .. raw:: html
17 |
18 |
19 |
20 | .. only:: html
21 |
22 | .. figure:: /documentation/auto_examples_hr/images/thumb/sphx_glr_pipelines_thumb.png
23 |
24 | :ref:`sphx_glr_documentation_auto_examples_hr_pipelines.py`
25 |
26 | .. raw:: html
27 |
28 |
29 |
30 |
31 | .. toctree::
32 | :hidden:
33 |
34 | /documentation/auto_examples_hr/pipelines
35 |
36 | .. raw:: html
37 |
38 |
39 |
40 | .. only:: html
41 |
42 | .. figure:: /documentation/auto_examples_hr/images/thumb/sphx_glr_olivetti_faces_thumb.png
43 |
44 | :ref:`sphx_glr_documentation_auto_examples_hr_olivetti_faces.py`
45 |
46 | .. raw:: html
47 |
48 |
49 |
50 |
51 | .. toctree::
52 | :hidden:
53 |
54 | /documentation/auto_examples_hr/olivetti_faces
55 | .. raw:: html
56 |
57 |
58 |
59 |
60 |
61 | .. only :: html
62 |
63 | .. container:: sphx-glr-footer
64 | :class: sphx-glr-footer-gallery
65 |
66 |
67 | .. container:: sphx-glr-download
68 |
69 | :download:`Download all examples in Python source code: auto_examples_hr_python.zip </home/user/feldbauer/PycharmProjects/hubness/docs/documentation/auto_examples_hr/auto_examples_hr_python.zip>`
70 |
71 |
72 |
73 | .. container:: sphx-glr-download
74 |
75 | :download:`Download all examples in Jupyter notebooks: auto_examples_hr_jupyter.zip </home/user/feldbauer/PycharmProjects/hubness/docs/documentation/auto_examples_hr/auto_examples_hr_jupyter.zip>`
76 |
77 |
78 | .. only:: html
79 |
80 | .. rst-class:: sphx-glr-signature
81 |
82 | `Gallery generated by Sphinx-Gallery `_
83 |
--------------------------------------------------------------------------------
/docs/documentation/auto_examples_ahr/index.rst:
--------------------------------------------------------------------------------
1 | :orphan:
2 |
3 |
4 |
5 | .. _sphx_glr_documentation_auto_examples_ahr:
6 |
7 | ========================================
8 | Example: Approximate hubness reduction
9 | ========================================
10 |
11 | These examples show how to combine approximate nearest neighbor search and hubness reduction.
12 |
13 |
14 |
15 | .. raw:: html
16 |
17 |
18 |
19 | .. only:: html
20 |
21 | .. figure:: /documentation/auto_examples_ahr/images/thumb/sphx_glr_reusing_index_thumb.png
22 |
23 | :ref:`sphx_glr_documentation_auto_examples_ahr_reusing_index.py`
24 |
25 | .. raw:: html
26 |
27 |
28 |
29 |
30 | .. toctree::
31 | :hidden:
32 |
33 | /documentation/auto_examples_ahr/reusing_index
34 |
35 | .. raw:: html
36 |
37 |
38 |
39 | .. only:: html
40 |
41 | .. figure:: /documentation/auto_examples_ahr/images/thumb/sphx_glr_high_dim_gaussian_thumb.png
42 |
43 | :ref:`sphx_glr_documentation_auto_examples_ahr_high_dim_gaussian.py`
44 |
45 | .. raw:: html
46 |
47 |
48 |
49 |
50 | .. toctree::
51 | :hidden:
52 |
53 | /documentation/auto_examples_ahr/high_dim_gaussian
54 | .. raw:: html
55 |
56 |
57 |
58 |
59 |
60 | .. only :: html
61 |
62 | .. container:: sphx-glr-footer
63 | :class: sphx-glr-footer-gallery
64 |
65 |
66 | .. container:: sphx-glr-download
67 |
68 | :download:`Download all examples in Python source code: auto_examples_ahr_python.zip </home/user/feldbauer/PycharmProjects/hubness/docs/documentation/auto_examples_ahr/auto_examples_ahr_python.zip>`
69 |
70 |
71 |
72 | .. container:: sphx-glr-download
73 |
74 | :download:`Download all examples in Jupyter notebooks: auto_examples_ahr_jupyter.zip </home/user/feldbauer/PycharmProjects/hubness/docs/documentation/auto_examples_ahr/auto_examples_ahr_jupyter.zip>`
75 |
76 |
77 | .. only:: html
78 |
79 | .. rst-class:: sphx-glr-signature
80 |
81 | `Gallery generated by Sphinx-Gallery `_
82 |
--------------------------------------------------------------------------------
/examples/sklearn/plot_nearest_centroid.py:
--------------------------------------------------------------------------------
1 | """
2 | ===============================
3 | Nearest Centroid Classification
4 | ===============================
5 |
6 | Sample usage of Nearest Centroid classification.
7 | It will plot the decision boundaries for each class.
8 |
9 | Note that no hubness reduction is currently implemented for centroids.
10 | However, `hubness.neighbors` retains all the features of `sklearn.neighbors`,
11 | in order to act as a full drop-in replacement.
12 |
13 | Adapted from ``_
14 | """
15 | print(__doc__)
16 |
17 | import numpy as np
18 | import matplotlib.pyplot as plt
19 | from matplotlib.colors import ListedColormap
20 | from sklearn import datasets
21 | from skhubness.neighbors import NearestCentroid
22 |
23 | n_neighbors = 15
24 |
25 | # import some data to play with
26 | iris = datasets.load_iris()
27 | # we only take the first two features. We could avoid this ugly
28 | # slicing by using a two-dim dataset
29 | X = iris.data[:, :2]
30 | y = iris.target
31 |
32 | h = .02 # step size in the mesh
33 |
34 | # Create color maps
35 | cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
36 | cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])
37 |
38 | for shrinkage in [None, .2]:
39 | # we create an instance of Neighbours Classifier and fit the data.
40 | clf = NearestCentroid(shrink_threshold=shrinkage)
41 | clf.fit(X, y)
42 | y_pred = clf.predict(X)
43 | print(shrinkage, np.mean(y == y_pred))
44 | # Plot the decision boundary. For that, we will assign a color to each
45 | # point in the mesh [x_min, x_max]x[y_min, y_max].
46 | x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
47 | y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
48 | xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
49 | np.arange(y_min, y_max, h))
50 | Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
51 |
52 | # Put the result into a color plot
53 | Z = Z.reshape(xx.shape)
54 | plt.figure()
55 | plt.pcolormesh(xx, yy, Z, cmap=cmap_light)
56 |
57 | # Plot also the training points
58 | plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold,
59 | edgecolor='k', s=20)
60 | plt.title("3-Class classification (shrink_threshold=%r)"
61 | % shrinkage)
62 | plt.axis('tight')
63 |
64 | plt.show()
65 |
--------------------------------------------------------------------------------
/examples/hubness_reduction/pipelines.py:
--------------------------------------------------------------------------------
1 | """
2 | ========================================
3 | Example: skhubness in Pipelines
4 | ========================================
5 |
6 | Estimators from scikit-hubness can - of course - be used in a scikit-learn ``Pipeline``.
7 | In this example, we select the best hubness reduction method and several other
8 | hyperparameters in grid search w.r.t. to classification performance.
9 | """
10 | from sklearn.datasets import make_classification
11 | from sklearn.decomposition import PCA
12 | from sklearn.model_selection import StratifiedKFold, train_test_split, GridSearchCV
13 | from sklearn.pipeline import Pipeline
14 | from sklearn.preprocessing import StandardScaler
15 |
16 | from skhubness.neighbors import KNeighborsClassifier
17 |
18 | # Not so high-dimensional data
19 | X, y = make_classification(n_samples=1_000,
20 | n_features=50,
21 | n_informative=20,
22 | n_classes=2,
23 | random_state=3453)
24 |
25 | X, X_test, y, y_test = train_test_split(X, y,
26 | test_size=100,
27 | stratify=y,
28 | shuffle=True,
29 | random_state=124)
30 |
31 | # Pipeline of standardization, dimensionality reduction, and kNN classification
32 | pipe = Pipeline([('scale', StandardScaler(with_mean=True, with_std=True)),
33 | ('pca', PCA(n_components=20, random_state=1213)),
34 | ('knn', KNeighborsClassifier(n_neighbors=10, algorithm='lsh', hubness='mp'))])
35 |
36 | # Exhaustive search for best algorithms and hyperparameters
37 | param_grid = {'pca__n_components': [10, 20, 30],
38 | 'knn__n_neighbors': [5, 10, 20],
39 | 'knn__algorithm': ['auto', 'hnsw', 'lsh', 'falconn_lsh', 'nng', 'rptree'],
40 | 'knn__hubness': [None, 'mp', 'ls', 'dsl']}
41 | cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1354)
42 | search = GridSearchCV(pipe, param_grid, n_jobs=5, cv=cv, verbose=1)
43 | search.fit(X, y)
44 |
45 | # Performance on hold-out data
46 | acc = search.score(y_test, y_test)
47 | print(acc)
48 | # 0.79
49 |
50 | print(search.best_params_)
51 | # {'knn__algorithm': 'auto',
52 | # 'knn__hubness': 'dsl',
53 | # 'knn__n_neighbors': 20,
54 | # 'pca__n_components': 30}
55 |
--------------------------------------------------------------------------------
/docs/documentation/auto_examples/plot_nearest_centroid.py:
--------------------------------------------------------------------------------
1 | """
2 | ===============================
3 | Nearest Centroid Classification
4 | ===============================
5 |
6 | Sample usage of Nearest Centroid classification.
7 | It will plot the decision boundaries for each class.
8 |
9 | Note that no hubness reduction is currently implemented for centroids.
10 | However, `hubness.neighbors` retains all the features of `sklearn.neighbors`,
11 | in order to act as a full drop-in replacement.
12 |
13 | Adapted from ``_
14 | """
15 | print(__doc__)
16 |
17 | import numpy as np
18 | import matplotlib.pyplot as plt
19 | from matplotlib.colors import ListedColormap
20 | from sklearn import datasets
21 | from skhubness.neighbors import NearestCentroid
22 |
23 | n_neighbors = 15
24 |
25 | # import some data to play with
26 | iris = datasets.load_iris()
27 | # we only take the first two features. We could avoid this ugly
28 | # slicing by using a two-dim dataset
29 | X = iris.data[:, :2]
30 | y = iris.target
31 |
32 | h = .02 # step size in the mesh
33 |
34 | # Create color maps
35 | cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
36 | cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])
37 |
38 | for shrinkage in [None, .2]:
39 | # we create an instance of Neighbours Classifier and fit the data.
40 | clf = NearestCentroid(shrink_threshold=shrinkage)
41 | clf.fit(X, y)
42 | y_pred = clf.predict(X)
43 | print(shrinkage, np.mean(y == y_pred))
44 | # Plot the decision boundary. For that, we will assign a color to each
45 | # point in the mesh [x_min, x_max]x[y_min, y_max].
46 | x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
47 | y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
48 | xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
49 | np.arange(y_min, y_max, h))
50 | Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
51 |
52 | # Put the result into a color plot
53 | Z = Z.reshape(xx.shape)
54 | plt.figure()
55 | plt.pcolormesh(xx, yy, Z, cmap=cmap_light)
56 |
57 | # Plot also the training points
58 | plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold,
59 | edgecolor='k', s=20)
60 | plt.title("3-Class classification (shrink_threshold=%r)"
61 | % shrinkage)
62 | plt.axis('tight')
63 |
64 | plt.show()
65 |
--------------------------------------------------------------------------------
/docs/documentation/auto_examples_hr/pipelines.py:
--------------------------------------------------------------------------------
1 | """
2 | ========================================
3 | Example: skhubness in Pipelines
4 | ========================================
5 |
6 | Estimators from scikit-hubness can - of course - be used in a scikit-learn ``Pipeline``.
7 | In this example, we select the best hubness reduction method and several other
8 | hyperparameters in grid search w.r.t. to classification performance.
9 | """
10 | from sklearn.datasets import make_classification
11 | from sklearn.decomposition import PCA
12 | from sklearn.model_selection import StratifiedKFold, train_test_split, GridSearchCV
13 | from sklearn.pipeline import Pipeline
14 | from sklearn.preprocessing import StandardScaler
15 |
16 | from skhubness.neighbors import KNeighborsClassifier
17 |
18 | # Not so high-dimensional data
19 | X, y = make_classification(n_samples=1_000,
20 | n_features=50,
21 | n_informative=20,
22 | n_classes=2,
23 | random_state=3453)
24 |
25 | X, X_test, y, y_test = train_test_split(X, y,
26 | test_size=100,
27 | stratify=y,
28 | shuffle=True,
29 | random_state=124)
30 |
31 | # Pipeline of standardization, dimensionality reduction, and kNN classification
32 | pipe = Pipeline([('scale', StandardScaler(with_mean=True, with_std=True)),
33 | ('pca', PCA(n_components=20, random_state=1213)),
34 | ('knn', KNeighborsClassifier(n_neighbors=10, algorithm='lsh', hubness='mp'))])
35 |
36 | # Exhaustive search for best algorithms and hyperparameters
37 | param_grid = {'pca__n_components': [10, 20, 30],
38 | 'knn__n_neighbors': [5, 10, 20],
39 | 'knn__algorithm': ['auto', 'hnsw', 'lsh', 'falconn_lsh', 'nng', 'rptree'],
40 | 'knn__hubness': [None, 'mp', 'ls', 'dsl']}
41 | cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1354)
42 | search = GridSearchCV(pipe, param_grid, n_jobs=5, cv=cv, verbose=1)
43 | search.fit(X, y)
44 |
45 | # Performance on hold-out data
46 | acc = search.score(X_test, y_test)
47 | print(acc)
48 | # 0.79
49 |
50 | print(search.best_params_)
51 | # {'knn__algorithm': 'auto',
52 | # 'knn__hubness': 'dsl',
53 | # 'knn__n_neighbors': 20,
54 | # 'pca__n_components': 30}
55 |
--------------------------------------------------------------------------------
/skhubness/data/dexter/dexter_train.labels:
--------------------------------------------------------------------------------
1 | 1
2 | -1
3 | 1
4 | -1
5 | 1
6 | -1
7 | 1
8 | -1
9 | 1
10 | 1
11 | 1
12 | 1
13 | -1
14 | 1
15 | 1
16 | 1
17 | -1
18 | 1
19 | -1
20 | -1
21 | 1
22 | -1
23 | 1
24 | 1
25 | 1
26 | 1
27 | 1
28 | -1
29 | -1
30 | -1
31 | 1
32 | -1
33 | -1
34 | 1
35 | 1
36 | 1
37 | 1
38 | -1
39 | 1
40 | -1
41 | -1
42 | -1
43 | -1
44 | 1
45 | -1
46 | -1
47 | -1
48 | -1
49 | -1
50 | 1
51 | -1
52 | -1
53 | 1
54 | -1
55 | -1
56 | -1
57 | 1
58 | 1
59 | 1
60 | 1
61 | 1
62 | -1
63 | -1
64 | -1
65 | -1
66 | -1
67 | 1
68 | -1
69 | 1
70 | -1
71 | 1
72 | -1
73 | -1
74 | -1
75 | 1
76 | 1
77 | 1
78 | 1
79 | 1
80 | -1
81 | -1
82 | -1
83 | -1
84 | -1
85 | 1
86 | 1
87 | 1
88 | 1
89 | -1
90 | -1
91 | -1
92 | -1
93 | 1
94 | -1
95 | 1
96 | -1
97 | -1
98 | 1
99 | 1
100 | -1
101 | 1
102 | 1
103 | -1
104 | -1
105 | 1
106 | 1
107 | 1
108 | 1
109 | -1
110 | -1
111 | -1
112 | 1
113 | 1
114 | -1
115 | 1
116 | 1
117 | -1
118 | -1
119 | 1
120 | 1
121 | -1
122 | 1
123 | -1
124 | -1
125 | 1
126 | 1
127 | 1
128 | -1
129 | -1
130 | 1
131 | 1
132 | 1
133 | -1
134 | -1
135 | 1
136 | 1
137 | -1
138 | -1
139 | 1
140 | -1
141 | 1
142 | 1
143 | 1
144 | -1
145 | -1
146 | -1
147 | 1
148 | 1
149 | -1
150 | -1
151 | 1
152 | -1
153 | 1
154 | -1
155 | 1
156 | -1
157 | -1
158 | 1
159 | 1
160 | -1
161 | 1
162 | -1
163 | 1
164 | -1
165 | -1
166 | 1
167 | -1
168 | 1
169 | 1
170 | -1
171 | 1
172 | -1
173 | 1
174 | -1
175 | -1
176 | -1
177 | 1
178 | -1
179 | 1
180 | 1
181 | 1
182 | 1
183 | -1
184 | -1
185 | 1
186 | -1
187 | 1
188 | 1
189 | 1
190 | -1
191 | -1
192 | 1
193 | -1
194 | -1
195 | 1
196 | -1
197 | -1
198 | -1
199 | 1
200 | -1
201 | -1
202 | 1
203 | 1
204 | -1
205 | 1
206 | -1
207 | 1
208 | 1
209 | -1
210 | 1
211 | 1
212 | -1
213 | -1
214 | -1
215 | 1
216 | -1
217 | -1
218 | 1
219 | 1
220 | -1
221 | 1
222 | -1
223 | -1
224 | -1
225 | -1
226 | 1
227 | 1
228 | 1
229 | 1
230 | 1
231 | 1
232 | 1
233 | -1
234 | -1
235 | 1
236 | -1
237 | -1
238 | 1
239 | 1
240 | -1
241 | 1
242 | 1
243 | -1
244 | -1
245 | -1
246 | 1
247 | 1
248 | 1
249 | -1
250 | 1
251 | 1
252 | -1
253 | 1
254 | -1
255 | -1
256 | -1
257 | -1
258 | 1
259 | -1
260 | 1
261 | 1
262 | -1
263 | -1
264 | 1
265 | 1
266 | -1
267 | -1
268 | 1
269 | 1
270 | 1
271 | -1
272 | -1
273 | -1
274 | -1
275 | 1
276 | 1
277 | 1
278 | 1
279 | 1
280 | -1
281 | -1
282 | 1
283 | 1
284 | -1
285 | -1
286 | 1
287 | 1
288 | -1
289 | 1
290 | -1
291 | -1
292 | 1
293 | 1
294 | 1
295 | -1
296 | -1
297 | -1
298 | -1
299 | 1
300 | -1
301 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | name = scikit-hubness
3 | version = attr: skhubness.__version__
4 | author = Roman Feldbauer
5 | author_email = sci@feldbauer.org
6 | maintainer = Roman Feldbauer
7 | maintainer_email = sci@feldbauer.org
8 | url = https://github.com/VarIr/scikit-hubness
9 | description = Hubness reduction and analysis tools
10 | long_description = file: README.md
11 | long_description_content_type = text/markdown
12 | # This includes the license file(s) in the wheel.
13 | # https://wheel.readthedocs.io/en/stable/user_guide.html#including-license-files-in-the-generated-wheel-file
14 | license_files = LICENSE.txt
15 | platform = any
16 | keywords =
17 | machine-learning
18 | high-dimensional-data
19 | hubness
20 | nearest-neighbor
21 | data-science
22 | data-mining
23 | artificial-intelligence
24 |
25 | # https://pypi.org/classifiers/
26 | classifiers =
27 | Development Status :: 4 - Beta
28 | Environment :: Console
29 | Intended Audience :: Developers
30 | Intended Audience :: Science/Research
31 | License :: OSI Approved :: BSD License
32 | Operating System :: OS Independent
33 | Operating System :: POSIX :: Linux
34 | Operating System :: MacOS :: MacOS X
35 | Operating System :: Microsoft :: Windows
36 | Programming Language :: Python
37 | Programming Language :: Python :: 3.8
38 | Programming Language :: Python :: 3.9
39 | Programming Language :: Python :: 3.10
40 | Topic :: Software Development :: Libraries :: Python Modules
41 | Topic :: Scientific/Engineering :: Artificial Intelligence
42 |
43 | project_urls =
44 | Bug Tracker = https://github.com/VarIr/scikit-hubness/issues
45 |     Changelog = https://github.com/VarIr/scikit-hubness/blob/master/docs/changelog.md
46 | Documentation = https://scikit-hubness.readthedocs.io
47 | Say Thanks! = https://saythanks.io/to/VarIr
48 | Source = https://github.com/VarIr/scikit-hubness
49 |
50 | [options]
51 | zip_safe = false
52 | include_package_data = true
53 | python_requires = >= 3.8
54 | packages = find:
55 | test_suite = tests
56 | install_requires =
57 | numpy # These packages will be installed by pip.
58 | scipy >= 1.2 # For comparison with requirements.txt see also:
59 | scikit-learn >= 0.22 # https://packaging.python.org/en/latest/requirements.html
60 | tqdm
61 | joblib >= 0.12
62 | numba
63 |
64 | [options.extras_require]
65 | ann =
66 | annoy
67 | ngt; platform_system == "Linux" or platform_system == "Darwin"
68 | nmslib
69 | tests =
70 | codecov
71 | flake8
72 | pytest
73 | pytest-cov
74 |
75 | [options.package_data]
76 | * = *.data, *.labels
77 |
--------------------------------------------------------------------------------
/skhubness/reduction/tests/test_local_scaling.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: BSD-3-Clause
2 | import warnings
3 |
4 | import pytest
5 | from sklearn.datasets import make_classification
6 | from sklearn.neighbors import NearestNeighbors
7 | from sklearn.utils._testing import assert_array_almost_equal
8 | from sklearn.utils._testing import assert_array_equal
9 | from sklearn.utils._testing import assert_raises
10 |
11 | from skhubness.reduction import LocalScaling
12 | from skhubness.reduction.tests.reference_algorithms import ReferenceLocalScaling
13 |
14 | LS_METHODS = [
15 | "standard",
16 | "nicdm",
17 | ]
18 |
19 |
20 | @pytest.mark.parametrize("method", LS_METHODS)
21 | @pytest.mark.parametrize("verbose", [0, 1])
22 | def test_fit_sorted(method, verbose):
23 | # TODO add LocalScaling class tests
24 | X, y = make_classification()
25 | nn = NearestNeighbors()
26 | nn.fit(X, y)
27 | neigh_dist, neigh_ind = nn.kneighbors()
28 |
29 | ls = ReferenceLocalScaling(method=method, verbose=verbose)
30 |
31 | nd_sorted, ni_sorted = ls.fit(
32 | neigh_dist, neigh_ind, X, assume_sorted=True,
33 | ).transform(
34 | neigh_dist, neigh_ind, X, assume_sorted=True,
35 | )
36 | nd_unsort, ni_unsort = ls.fit(
37 | neigh_dist, neigh_ind, X, assume_sorted=False,
38 | ).transform(
39 | neigh_dist, neigh_ind, X, assume_sorted=False,
40 | )
41 |
42 | assert_array_almost_equal(nd_sorted, nd_unsort)
43 | assert_array_equal(ni_sorted, ni_unsort)
44 |
45 |
46 | @pytest.mark.parametrize("method", ["invalid", None])
47 | @pytest.mark.parametrize("LocalScalingClass", [ReferenceLocalScaling, LocalScaling])
48 | def test_invalid_method(method, LocalScalingClass):
49 | X, y = make_classification(n_samples=10, )
50 | nn = NearestNeighbors(n_neighbors=6)
51 | nn.fit(X, y)
52 | neigh_dist, neigh_ind = nn.kneighbors()
53 | neigh_graph = nn.kneighbors_graph(mode="distance")
54 |
55 | ls = LocalScalingClass(method=method)
56 | if isinstance(ls, LocalScaling):
57 | kwargs = {"X": neigh_graph}
58 | else:
59 | kwargs = {"neigh_dist": neigh_dist, "neigh_ind": neigh_ind, "X": X, "assume_sorted": True}
60 | with assert_raises(ValueError):
61 | ls.fit(**kwargs).transform(**kwargs)
62 |
63 |
64 | @pytest.mark.parametrize("k", [0, 1, 5, 6])
65 | def test_local_scaling_various_k_values(k):
66 | X, y = make_classification(n_samples=10)
67 | nn = NearestNeighbors(n_neighbors=5)
68 | graph = nn.fit(X).kneighbors_graph(X, mode="distance")
69 | ls = LocalScaling(k=k)
70 | if 1 <= k < 5:
71 | with warnings.catch_warnings():
72 | warnings.simplefilter("error")
73 | ls.fit(graph)
74 | else:
75 | with pytest.raises(ValueError, match="n_neighbors"):
76 | ls.fit(graph)
77 |
--------------------------------------------------------------------------------
/docs/getting_started/installation.rst:
--------------------------------------------------------------------------------
1 | ============
2 | Installation
3 | ============
4 |
5 | Installation from PyPI
6 | ======================
7 |
8 | The current release of ``scikit-hubness`` can be installed from PyPI:
9 |
10 | .. code-block:: bash
11 |
12 | pip install scikit-hubness
13 |
14 |
15 | Dependencies
16 | ============
17 |
18 | All strict dependencies of ``scikit-hubness`` are automatically installed
19 | by ``pip``. Some optional dependencies (certain ANN libraries) may not
20 | yet be available from PyPI. If you require one of these libraries,
21 | please refer to the library's documentation for building instructions.
22 | For example, at the time of writing, ``puffinn`` was not available on PyPI.
23 | Building and installing is straightforward:
24 |
25 | .. code-block:: bash
26 |
27 | git clone https://github.com/puffinn/puffinn.git
28 | cd puffinn
29 | python3 setup.py build
30 | pip install .
31 |
32 |
33 | Installation from source
34 | ========================
35 |
36 | You can always grab the latest version of ``scikit-hubness`` directly from GitHub:
37 |
38 | .. code-block:: bash
39 |
40 | cd install_dir
41 | git clone git@github.com:VarIr/scikit-hubness.git
42 | cd scikit-hubness
43 | pip install -e .
44 |
45 | This is the recommended approach, if you want to contribute to the development of ``scikit-hubness``.
46 |
47 |
48 | Supported platforms
49 | ===================
50 |
51 | ``scikit-hubness`` currently supports all major operating systems:
52 |
53 | - Linux
54 | - MacOS X
55 | - Windows
56 |
57 | Note that not all approximate nearest neighbor algorithms used in ``scikit-hubness``
58 | are available on all platforms.
59 | This is because we rely on third-party libraries, which in some cases are not
60 | available for all platforms.
61 | The table below indicates which libraries and
62 | algorithms are currently supported on your operating system.
63 | All exact nearest neighbor algorithms (as provided by scikit-learn) are available on all platforms.
64 |
65 | +---------+-------------+-------+-------+---------+
66 | | library | algorithm | Linux | MacOS | Windows |
67 | +---------+-------------+-------+-------+---------+
68 | | nmslib | hnsw | x | x | x |
69 | +---------+-------------+-------+-------+---------+
70 | | annoy | rptree | x | x | x |
71 | +---------+-------------+-------+-------+---------+
72 | | ngtpy | nng | x | x | |
73 | +---------+-------------+-------+-------+---------+
74 | | falconn | falconn_lsh | x | x | |
75 | +---------+-------------+-------+-------+---------+
76 | | puffinn | lsh | x | x | |
77 | +---------+-------------+-------+-------+---------+
78 | | sklearn | (all exact) | x | x | x |
79 | +---------+-------------+-------+-------+---------+
--------------------------------------------------------------------------------
/skhubness/neighbors/approximate_neighbors.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: BSD-3-Clause
2 |
3 | from abc import ABC, abstractmethod
4 | from multiprocessing import cpu_count
5 | from typing import Union, Tuple
6 | import warnings
7 | import numpy as np
8 |
9 |
10 | class ApproximateNearestNeighbor(ABC):
11 | """ Abstract base class for approximate nearest neighbor search methods.
12 |
13 | Parameters
14 | ----------
15 | n_candidates: int, default = 5
16 | Number of neighbors to retrieve
17 | metric: str, default = 'euclidean'
18 | Distance metric, allowed are "angular", "euclidean", "manhattan", "hamming", "dot"
19 | n_jobs: int, default = 1
20 | Number of parallel jobs
21 | verbose: int, default = 0
22 | Verbosity level. If verbose > 0, show tqdm progress bar on indexing and querying.
23 | """
24 | def __init__(self, n_candidates: int = 5, metric: str = 'sqeuclidean',
25 | n_jobs: int = 1, verbose: int = 0, *args, **kwargs):
26 | self.n_candidates = n_candidates
27 | self.metric = metric
28 | if n_jobs is None:
29 | n_jobs = 1
30 | elif n_jobs == -1:
31 | n_jobs = cpu_count()
32 | self.n_jobs = n_jobs
33 | self.verbose = verbose
34 |
35 | @abstractmethod
36 | def fit(self, X, y=None):
37 | """ Setup ANN index from training data.
38 |
39 | Parameters
40 | ----------
41 | X: np.array
42 | Data to be indexed
43 | y: any
44 | Ignored
45 | """
46 | pass # pragma: no cover
47 |
48 | @abstractmethod
49 | def kneighbors(self, X=None, n_candidates=None, return_distance=True) -> Union[Tuple[np.array, np.array], np.array]:
50 | """ Retrieve k nearest neighbors.
51 |
52 | Parameters
53 | ----------
54 | X: np.array or None, optional, default = None
55 | Query objects. If None, search among the indexed objects.
56 | n_candidates: int or None, optional, default = None
57 | Number of neighbors to retrieve.
58 | If None, use the value passed during construction.
59 | return_distance: bool, default = True
60 | If return_distance, will return distances and indices to neighbors.
61 | Else, only return the indices.
62 | """
63 | pass # pragma: no cover
64 |
65 |
66 | class UnavailableANN(ApproximateNearestNeighbor):
67 | """ Placeholder for ANN methods that are not available on specific platforms. """
68 | def __init__(self, *args, **kwargs):
69 | super().__init__(*args, **kwargs)
70 | warnings.warn("The chosen approximate nearest neighbor method is not supported on your platform.")
71 |
72 | def fit(self, X, y=None):
73 | pass
74 |
75 | def kneighbors(self, X=None, n_candidates=None, return_distance=True):
76 | pass
77 |
--------------------------------------------------------------------------------
/skhubness/reduction/tests/test_hubness_reduction.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: BSD-3-Clause
2 |
3 | from itertools import product
4 | import pytest
5 | from sklearn.datasets import make_classification
6 | from sklearn.utils._testing import assert_array_equal
7 | from sklearn.neighbors import NearestNeighbors
8 |
9 | from skhubness.analysis import Hubness
10 | from skhubness.data import load_dexter
11 | from skhubness.reduction import LocalScaling, MutualProximity, DisSimLocal
12 | from skhubness.reduction.tests.reference_algorithms import ReferenceNoHubnessReduction
13 |
14 |
15 | HUBNESS_REDUCTION = (
16 | LocalScaling, MutualProximity, DisSimLocal,
17 | )
18 | MP_PARAMS = tuple({"method": method} for method in ["normal", "empiric"])
19 | LS_PARAMS = tuple({"method": method} for method in ["standard", "nicdm"])
20 | HUBNESS_REDUCTION_WITH_PARAMS = ((
21 | *product([MutualProximity], MP_PARAMS),
22 | *product([LocalScaling], LS_PARAMS),
23 | (DisSimLocal, {}),
24 | ))
25 |
26 |
27 | @pytest.mark.parametrize("hubness_param", HUBNESS_REDUCTION_WITH_PARAMS)
28 | @pytest.mark.parametrize("metric", ["sqeuclidean", "euclidean", "cosine"])
29 | def test_neighbors_dexter(hubness_param, metric):
30 | HubnessReduction, param = hubness_param
31 | if HubnessReduction is MutualProximity and param.get("method") == "normal":
32 | pytest.skip("MP normal does not improve dexter")
33 | if HubnessReduction is DisSimLocal and metric != "sqeuclidean":
34 | pytest.skip("DisSimLocal works only with squared Euclidean distances")
35 | X, y = load_dexter()
36 |
37 | # Hubness in standard spaces
38 | hub = Hubness(k=10, metric=metric)
39 | hub.fit(X)
40 | k_skew_orig = hub.score()
41 |
42 | # Hubness in secondary distance spaces (after hub. red.)
43 | nn = NearestNeighbors(n_neighbors=50, metric=metric)
44 | graph = nn.fit(X).kneighbors_graph(mode="distance")
45 | hub_red = HubnessReduction(method=param.get("method"))
46 | if HubnessReduction is DisSimLocal:
47 |         # TODO check_sorted="full" fails here for unknown reasons (SIGSEGV during debug)
48 | graph = hub_red.fit_transform(graph, vectors=X, check_sorted=False)
49 | else:
50 | graph = hub_red.fit(graph).transform(graph)
51 | hub = Hubness(k=10, metric="precomputed")
52 | hub.fit(graph)
53 | k_skew_hr = hub.score()
54 |
55 | assert k_skew_hr < k_skew_orig * 8/10,\
56 | f"k-occurrence skewness was not reduced by at least 20% for dexter with {HubnessReduction}"
57 |
58 |
59 | def test_same_indices():
60 | X, y = make_classification()
61 | nn = NearestNeighbors()
62 | nn.fit(X, y)
63 | neigh_dist, neigh_ind = nn.kneighbors()
64 | hr = ReferenceNoHubnessReduction()
65 | _, neigh_ind_hr = hr.fit_transform(neigh_dist, neigh_ind, X, return_distance=True)
66 |     neigh_ind_hr_no_dist = hr.fit_transform(neigh_dist, neigh_ind, X, return_distance=False)
67 |     assert_array_equal(neigh_ind, neigh_ind_hr)
68 |     assert_array_equal(neigh_ind_hr, neigh_ind_hr_no_dist)
69 |
--------------------------------------------------------------------------------
/docs/documentation/auto_examples_ahr/reusing_index.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {
7 | "collapsed": false
8 | },
9 | "outputs": [],
10 | "source": [
11 | "%matplotlib inline"
12 | ]
13 | },
14 | {
15 | "cell_type": "markdown",
16 | "metadata": {},
17 | "source": [
18 | "\n========================================\nExample: Reusing index structures\n========================================\n\nThis example shows how to reuse index structures. If you want to first estimate hubness,\nand then perform kNN, you can avoid recomputing the ANN index structure, which can be\ncostly.\n"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": null,
24 | "metadata": {
25 | "collapsed": false
26 | },
27 | "outputs": [],
28 | "source": [
   29 |         "from sklearn.datasets import make_classification\nfrom sklearn.model_selection import train_test_split\n\nfrom skhubness.analysis import LegacyHubness\nfrom skhubness.neighbors import KNeighborsClassifier\n\nX, y = make_classification(n_samples=100_000,\n                           n_features=500,\n                           n_informative=400,\n                           random_state=543)\n\nX_train, X_test, y_train, y_test = train_test_split(X, y,\n                                                    test_size=0.01,\n                                                    stratify=y,\n                                                    shuffle=True,\n                                                    random_state=2346)\n\n# Approximate hubness estimation: Creates LSH index and computes local scaling factors\nhub = LegacyHubness(k=10,\n              return_value='robinhood',\n              algorithm='falconn_lsh',\n              hubness='ls',\n              random_state=2345,\n              shuffle_equal=False,\n              verbose=1)\nhub.fit(X_train)\n\nrobin_hood = hub.score(X_test)\nprint(f'LegacyHubness (Robin Hood): {robin_hood:.4f}')\n# 0.9060\n\n# Approximate hubness reduction for classification: Reuse index & factors\nknn = KNeighborsClassifier(n_neighbors=10,\n                           algorithm='falconn_lsh',\n                           hubness='ls',\n                           n_jobs=1)\n\nknn.fit(hub.nn_index_, y_train)  # REUSE INDEX HERE\nacc = knn.score(X_test, y_test)\nprint(f'Test accuracy: {acc:.3f}')\n# 0.959"
30 | ]
31 | }
32 | ],
33 | "metadata": {
34 | "kernelspec": {
35 | "display_name": "Python 3",
36 | "language": "python",
37 | "name": "python3"
38 | },
39 | "language_info": {
40 | "codemirror_mode": {
41 | "name": "ipython",
42 | "version": 3
43 | },
44 | "file_extension": ".py",
45 | "mimetype": "text/x-python",
46 | "name": "python",
47 | "nbconvert_exporter": "python",
48 | "pygments_lexer": "ipython3",
49 | "version": "3.7.4"
50 | }
51 | },
52 | "nbformat": 4,
53 | "nbformat_minor": 0
54 | }
--------------------------------------------------------------------------------
/examples/hubness_reduction/olivetti_faces.py:
--------------------------------------------------------------------------------
1 | """
2 | =================================
3 | Face recognition (Olivetti faces)
4 | =================================
5 |
6 | This dataset contains a set of face images taken between April 1992
7 | and April 1994 at AT&T Laboratories Cambridge.
8 | Image data is typically embedded in very high-dimensional spaces,
9 | which might be prone to hubness.
10 | """
11 | import numpy as np
12 | from sklearn.datasets import fetch_olivetti_faces
13 | from sklearn.model_selection import cross_val_score, StratifiedKFold, RandomizedSearchCV
14 |
15 | from skhubness import LegacyHubness
16 | from skhubness.neighbors import KNeighborsClassifier
17 |
18 | # Fetch data and have a look
19 | d = fetch_olivetti_faces()
20 | X, y = d['data'], d['target']
21 | print(f'Data shape: {X.shape}')
22 | print(f'Label shape: {y.shape}')
23 | # (400, 4096)
24 | # (400,)
25 |
26 | # The data is embedded in a high-dimensional space.
27 | # Is there hubness, and can we reduce it?
28 | for hubness in [None, 'dsl', 'ls', 'mp']:
29 | hub = LegacyHubness(k=10, hubness=hubness, return_value='k_skewness')
30 | hub.fit(X)
31 | score = hub.score()
32 | print(f'Hubness (10-skew): {score:.3f} with hubness reduction: {hubness}')
33 | # Hubness (10-skew): 1.972 with hubness reduction: None
34 | # Hubness (10-skew): 1.526 with hubness reduction: dsl
35 | # Hubness (10-skew): 0.943 with hubness reduction: ls
36 | # Hubness (10-skew): 0.184 with hubness reduction: mp
37 |
38 | # There is some hubness, and all hubness reduction methods can reduce it (to varying degree)
39 | # Let's assess the best kNN strategy and its estimated performance.
40 | cv_perf = StratifiedKFold(n_splits=5, shuffle=True, random_state=7263)
41 | cv_select = StratifiedKFold(n_splits=5, shuffle=True, random_state=32634)
42 |
43 | knn = KNeighborsClassifier(algorithm_params={'n_candidates': 100})
44 |
45 | # specify parameters and distributions to sample from
46 | param_dist = {"n_neighbors": np.arange(1, 26),
47 | "weights": ['uniform', 'distance'],
48 | "hubness": [None, 'dsl', 'ls', 'mp']}
49 |
50 | # Inner cross-validation to select best hyperparameters (incl hubness reduction method)
51 | search = RandomizedSearchCV(estimator=knn,
52 | param_distributions=param_dist,
53 | n_iter=100,
54 | cv=cv_select,
55 | random_state=2345,
56 | verbose=1)
57 |
58 | # Outer cross-validation to estimate performance
59 | score = cross_val_score(search, X, y, cv=cv_perf, verbose=1)
60 | print(f'Scores: {score}')
61 | print(f'Mean acc = {score.mean():.3f} +/- {score.std():.3f}')
62 |
63 | # Select model that maximizes accuracy
64 | search.fit(X, y)
65 |
66 | # The best model's parameters
67 | print(search.best_params_)
68 |
69 | # Does it correspond to the results of hubness reduction above?
70 | # Scores: [0.95 0.9625 1. 0.95 0.925 ]
71 | # Mean acc = 0.957 +/- 0.024
72 | # {'weights': 'distance', 'n_neighbors': 23, 'hubness': 'mp'}
73 |
--------------------------------------------------------------------------------
/docs/documentation/auto_examples_hr/olivetti_faces.py:
--------------------------------------------------------------------------------
1 | """
2 | =================================
3 | Face recognition (Olivetti faces)
4 | =================================
5 |
6 | This dataset contains a set of face images taken between April 1992
7 | and April 1994 at AT&T Laboratories Cambridge.
8 | Image data is typically embedded in very high-dimensional spaces,
9 | which might be prone to hubness.
10 | """
11 | import numpy as np
12 | from sklearn.datasets import fetch_olivetti_faces
13 | from sklearn.model_selection import cross_val_score, StratifiedKFold, RandomizedSearchCV
14 |
15 | from skhubness import LegacyHubness
16 | from skhubness.neighbors import KNeighborsClassifier
17 |
18 | # Fetch data and have a look
19 | d = fetch_olivetti_faces()
20 | X, y = d['data'], d['target']
21 | print(f'Data shape: {X.shape}')
22 | print(f'Label shape: {y.shape}')
23 | # (400, 4096)
24 | # (400,)
25 |
26 | # The data is embedded in a high-dimensional space.
27 | # Is there hubness, and can we reduce it?
28 | for hubness in [None, 'dsl', 'ls', 'mp']:
29 | hub = LegacyHubness(k=10, hubness=hubness, return_value='k_skewness')
30 | hub.fit(X)
31 | score = hub.score()
32 | print(f'Hubness (10-skew): {score:.3f} with hubness reduction: {hubness}')
33 | # Hubness (10-skew): 1.972 with hubness reduction: None
34 | # Hubness (10-skew): 1.526 with hubness reduction: dsl
35 | # Hubness (10-skew): 0.943 with hubness reduction: ls
36 | # Hubness (10-skew): 0.184 with hubness reduction: mp
37 |
38 | # There is some hubness, and all hubness reduction methods can reduce it (to varying degree)
39 | # Let's assess the best kNN strategy and its estimated performance.
40 | cv_perf = StratifiedKFold(n_splits=5, shuffle=True, random_state=7263)
41 | cv_select = StratifiedKFold(n_splits=5, shuffle=True, random_state=32634)
42 |
43 | knn = KNeighborsClassifier(algorithm_params={'n_candidates': 100})
44 |
45 | # specify parameters and distributions to sample from
46 | param_dist = {"n_neighbors": np.arange(1, 26),
47 | "weights": ['uniform', 'distance'],
48 | "hubness": [None, 'dsl', 'ls', 'mp']}
49 |
50 | # Inner cross-validation to select best hyperparameters (incl hubness reduction method)
51 | search = RandomizedSearchCV(estimator=knn,
52 | param_distributions=param_dist,
53 | n_iter=100,
54 | cv=cv_select,
55 | random_state=2345,
56 | verbose=1)
57 |
58 | # Outer cross-validation to estimate performance
59 | score = cross_val_score(search, X, y, cv=cv_perf, verbose=1)
60 | print(f'Scores: {score}')
61 | print(f'Mean acc = {score.mean():.3f} +/- {score.std():.3f}')
62 |
63 | # Select model that maximizes accuracy
64 | search.fit(X, y)
65 |
66 | # The best model's parameters
67 | print(search.best_params_)
68 |
69 | # Does it correspond to the results of hubness reduction above?
70 | # Scores: [0.95 0.9625 1. 0.95 0.925 ]
71 | # Mean acc = 0.957 +/- 0.024
72 | # {'weights': 'distance', 'n_neighbors': 23, 'hubness': 'mp'}
73 |
--------------------------------------------------------------------------------
/docs/documentation/auto_examples/plot_regression.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {
7 | "collapsed": false
8 | },
9 | "outputs": [],
10 | "source": [
11 | "%matplotlib inline"
12 | ]
13 | },
14 | {
15 | "cell_type": "markdown",
16 | "metadata": {},
17 | "source": [
   18 |         "\n# Nearest Neighbors regression\n\n\nDemonstrate the resolution of a regression problem\nusing a k-Nearest Neighbor and the interpolation of the\ntarget using both barycenter and constant weights.\n\nHubness reduction of this low-dimensional dataset\nshows only small effects.\n\nAdapted from https://scikit-learn.org/stable/auto_examples/neighbors/plot_regression.html\n"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": null,
24 | "metadata": {
25 | "collapsed": false
26 | },
27 | "outputs": [],
28 | "source": [
29 | "print(__doc__)\n\n# Author: Alexandre Gramfort \n# Fabian Pedregosa \n#\n# License: BSD 3 clause (C) INRIA\n\n\n# #############################################################################\n# Generate sample data\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom skhubness.neighbors import KNeighborsRegressor\n\nnp.random.seed(0)\nX = np.sort(5 * np.random.rand(40, 1), axis=0)\nT = np.linspace(0, 5, 500)[:, np.newaxis]\ny = np.sin(X).ravel()\n\n# Add noise to targets\ny[::5] += 1 * (0.5 - np.random.rand(8))\n\n# #############################################################################\n# Fit regression model\nn_neighbors = 5\n\nf = plt.figure()\nfor i, weights in enumerate(['uniform', 'distance']):\n for j, hubness in enumerate([None, 'local_scaling']):\n knn = KNeighborsRegressor(n_neighbors,\n algorithm_params={'n_candidates': 39},\n weights=weights,\n hubness=hubness)\n y_ = knn.fit(X, y).predict(T)\n\n plt.subplot(2, 2, i * 2 + j + 1)\n f.set_figheight(15)\n f.set_figwidth(15)\n plt.scatter(X, y, c='k', label='data')\n plt.plot(T, y_, c='g', label='prediction')\n plt.axis('tight')\n plt.legend()\n plt.title(f\"KNeighborsRegressor (k = {n_neighbors}, weights = '{weights}', hubness = '{hubness}')\")\n\nplt.tight_layout()\nplt.show()"
30 | ]
31 | }
32 | ],
33 | "metadata": {
34 | "kernelspec": {
35 | "display_name": "Python 3",
36 | "language": "python",
37 | "name": "python3"
38 | },
39 | "language_info": {
40 | "codemirror_mode": {
41 | "name": "ipython",
42 | "version": 3
43 | },
44 | "file_extension": ".py",
45 | "mimetype": "text/x-python",
46 | "name": "python",
47 | "nbconvert_exporter": "python",
48 | "pygments_lexer": "ipython3",
49 | "version": "3.7.4"
50 | }
51 | },
52 | "nbformat": 4,
53 | "nbformat_minor": 0
54 | }
--------------------------------------------------------------------------------
/docs/documentation/auto_examples_ahr/high_dim_gaussian.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {
7 | "collapsed": false
8 | },
9 | "outputs": [],
10 | "source": [
11 | "%matplotlib inline"
12 | ]
13 | },
14 | {
15 | "cell_type": "markdown",
16 | "metadata": {},
17 | "source": [
18 | "\n========================================\nExample: Approximate hubness reduction\n========================================\n\nThis example shows how to combine approximate nearest neighbor search and hubness reduction\nin order to perform approximate hubness reduction for large data sets.\n"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": null,
24 | "metadata": {
25 | "collapsed": false
26 | },
27 | "outputs": [],
28 | "source": [
29 | "from sklearn.datasets import make_classification\nfrom sklearn.metrics import accuracy_score\nfrom sklearn.model_selection import train_test_split\n\nfrom skhubness.analysis import LegacyHubness\nfrom skhubness.neighbors import KNeighborsClassifier\n\n# High-dimensional artificial data\nX, y = make_classification(n_samples=1_000_000,\n n_features=500,\n n_informative=400,\n random_state=543)\n\nX_train, X_test, y_train, y_test = train_test_split(X, y,\n test_size=10_000,\n stratify=y,\n shuffle=True,\n random_state=2346)\n\n# Approximate hubness estimation\nhub = LegacyHubness(k=10,\n return_value='robinhood',\n algorithm='hnsw',\n random_state=2345,\n shuffle_equal=False,\n n_jobs=-1,\n verbose=2)\nhub.fit(X_train)\nrobin_hood = hub.score(X_test)\nprint(f'LegacyHubness (Robin Hood): {robin_hood:.3f}')\n# 0.944\n\n# Approximate hubness reduction for classification\nknn = KNeighborsClassifier(n_neighbors=10,\n algorithm='hnsw',\n hubness='ls',\n n_jobs=-1,\n verbose=2)\n\nknn.fit(X_train, y_train)\ny_pred = knn.predict(X_test)\nacc = accuracy_score(y_test, y_pred)\nprint(f'Test accuracy: {acc:.3f}')\n# Test accuracy: 0.987"
30 | ]
31 | }
32 | ],
33 | "metadata": {
34 | "kernelspec": {
35 | "display_name": "Python 3",
36 | "language": "python",
37 | "name": "python3"
38 | },
39 | "language_info": {
40 | "codemirror_mode": {
41 | "name": "ipython",
42 | "version": 3
43 | },
44 | "file_extension": ".py",
45 | "mimetype": "text/x-python",
46 | "name": "python",
47 | "nbconvert_exporter": "python",
48 | "pygments_lexer": "ipython3",
49 | "version": "3.7.4"
50 | }
51 | },
52 | "nbformat": 4,
53 | "nbformat_minor": 0
54 | }
--------------------------------------------------------------------------------
/docs/documentation/auto_examples/plot_classification.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {
7 | "collapsed": false
8 | },
9 | "outputs": [],
10 | "source": [
11 | "%matplotlib inline"
12 | ]
13 | },
14 | {
15 | "cell_type": "markdown",
16 | "metadata": {},
17 | "source": [
18 | "\n# Nearest Neighbors Classification\n\nSample usage of Nearest Neighbors classification.\nIt will plot the decision boundaries for each class.\n\nAdapted from ``_\n"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": null,
24 | "metadata": {
25 | "collapsed": false
26 | },
27 | "outputs": [],
28 | "source": [
29 | "import numpy as np\nimport matplotlib.pyplot as plt\nfrom matplotlib.colors import ListedColormap\nfrom sklearn import datasets\nfrom skhubness.neighbors import KNeighborsClassifier\n\nn_neighbors = 15\n\n# import some data to play with\niris = datasets.load_iris()\n\n# we only take the first two features. We could avoid this ugly\n# slicing by using a two-dim dataset\nX = iris.data[:, :2]\ny = iris.target\n\nh = .02 # step size in the mesh\n\n# Create color maps\ncmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])\ncmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])\n\nfor hubness in [None, 'mutual_proximity']:\n # we create an instance of Neighbours Classifier and fit the data.\n clf = KNeighborsClassifier(n_neighbors,\n hubness=hubness,\n weights='distance')\n clf.fit(X, y)\n\n # Plot the decision boundary. For that, we will assign a color to each\n # point in the mesh [x_min, x_max]x[y_min, y_max].\n x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1\n y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1\n xx, yy = np.meshgrid(np.arange(x_min, x_max, h),\n np.arange(y_min, y_max, h))\n Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])\n\n # Put the result into a color plot\n Z = Z.reshape(xx.shape)\n plt.figure()\n plt.pcolormesh(xx, yy, Z, cmap=cmap_light)\n\n # Plot also the training points\n plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold,\n edgecolor='k', s=20)\n plt.xlim(xx.min(), xx.max())\n plt.ylim(yy.min(), yy.max())\n plt.title(\"3-Class classification (k = %i, hubness = '%s')\"\n % (n_neighbors, hubness))\n\nplt.show()"
30 | ]
31 | }
32 | ],
33 | "metadata": {
34 | "kernelspec": {
35 | "display_name": "Python 3",
36 | "language": "python",
37 | "name": "python3"
38 | },
39 | "language_info": {
40 | "codemirror_mode": {
41 | "name": "ipython",
42 | "version": 3
43 | },
44 | "file_extension": ".py",
45 | "mimetype": "text/x-python",
46 | "name": "python",
47 | "nbconvert_exporter": "python",
48 | "pygments_lexer": "ipython3",
49 | "version": "3.7.4"
50 | }
51 | },
52 | "nbformat": 4,
53 | "nbformat_minor": 0
54 | }
--------------------------------------------------------------------------------
/docs/github_link.py:
--------------------------------------------------------------------------------
1 | # from https://github.com/scikit-learn/scikit-learn/blob/master/doc/sphinxext/github_link.py
2 |
3 | from operator import attrgetter
4 | import inspect
5 | import subprocess
6 | import os
7 | import sys
8 | from functools import partial
9 |
10 | REVISION_CMD = 'git rev-parse --short HEAD'
11 |
12 |
def _get_git_revision():
    """Return the abbreviated git hash of HEAD, or None if git cannot be run.

    Runs ``git rev-parse --short HEAD`` (see ``REVISION_CMD``) in the current
    working directory and decodes the output as UTF-8.
    """
    try:
        output = subprocess.check_output(REVISION_CMD.split())
    except (subprocess.CalledProcessError, OSError):
        # git missing, or not inside a repository — links simply get disabled
        print('Failed to execute git to get revision')
        return None
    return output.strip().decode('utf-8')
20 |
21 |
22 | def _linkcode_resolve(domain, info, package, url_fmt, revision):
23 | """Determine a link to online source for a class/method/function
24 |
25 | This is called by sphinx.ext.linkcode
26 |
27 | An example with a long-untouched module that everyone has
28 | >>> _linkcode_resolve('py', {'module': 'tty',
29 | ... 'fullname': 'setraw'},
30 | ... package='tty',
31 | ... url_fmt='http://hg.python.org/cpython/file/'
32 | ... '{revision}/Lib/{package}/{path}#L{lineno}',
33 | ... revision='xxxx')
34 | 'http://hg.python.org/cpython/file/xxxx/Lib/tty/tty.py#L18'
35 | """
36 |
37 | if revision is None:
38 | return
39 | if domain not in ('py', 'pyx'):
40 | return
41 | if not info.get('module') or not info.get('fullname'):
42 | return
43 |
44 | class_name = info['fullname'].split('.')[0]
45 | if type(class_name) != str:
46 | # Python 2 only
47 | class_name = class_name.encode('utf-8')
48 | module = __import__(info['module'], fromlist=[class_name])
49 | obj = attrgetter(info['fullname'])(module)
50 |
51 | try:
52 | fn = inspect.getsourcefile(obj)
53 | except Exception:
54 | fn = None
55 | if not fn:
56 | try:
57 | fn = inspect.getsourcefile(sys.modules[obj.__module__])
58 | except Exception:
59 | fn = None
60 | if not fn:
61 | return
62 | # Work-around: disable links to imported packages (e.g. scikit-learn)
63 | if '/site-packages/' in fn:
64 | return
65 |
66 | fn = os.path.relpath(fn,
67 | start=os.path.dirname(__import__(package).__file__))
68 | try:
69 | lineno = inspect.getsourcelines(obj)[1]
70 | except Exception:
71 | lineno = ''
72 | return url_fmt.format(revision=revision, package=package,
73 | path=fn, lineno=lineno)
74 |
75 |
def make_linkcode_resolve(package, url_fmt):
    """Returns a linkcode_resolve function for the given URL format

    revision is a git commit reference (hash or name)

    package is the name of the root module of the package

    url_fmt is along the lines of ('https://github.com/USER/PROJECT/'
                                   'blob/{revision}/{package}/'
                                   '{path}#L{lineno}')
    """
    # Capture the current git revision once, at setup time; Sphinx then calls
    # the returned partial as linkcode_resolve(domain, info).
    return partial(_linkcode_resolve,
                   revision=_get_git_revision(),
                   package=package,
                   url_fmt=url_fmt)
90 |
--------------------------------------------------------------------------------
/docs/documentation/auto_examples_ahr/reusing_index.rst:
--------------------------------------------------------------------------------
1 | .. note::
2 | :class: sphx-glr-download-link-note
3 |
4 | Click :ref:`here <sphx_glr_download_documentation_auto_examples_ahr_reusing_index.py>` to download the full example code
5 | .. rst-class:: sphx-glr-example-title
6 |
7 | .. _sphx_glr_documentation_auto_examples_ahr_reusing_index.py:
8 |
9 |
10 | ========================================
11 | Example: Reusing index structures
12 | ========================================
13 |
14 | This example shows how to reuse index structures. If you want to first estimate hubness,
15 | and then perform kNN, you can avoid recomputing the ANN index structure, which can be
16 | costly.
17 |
18 |
19 | .. code-block:: default
20 |
21 | from sklearn.datasets import make_classification
22 | from sklearn.model_selection import train_test_split
23 |
24 | from skhubness.analysis import LegacyHubness
25 | from skhubness.neighbors import KNeighborsClassifier
26 |
27 | X, y = make_classification(n_samples=100_000,
28 | n_features=500,
29 | n_informative=400,
30 | random_state=543)
31 |
32 | X_train, X_test, y_train, y_test = train_test_split(X, y,
33 | test_size=0.01,
34 | stratify=y,
35 | shuffle=True,
36 | random_state=2346)
37 |
38 | # Approximate hubness estimation: Creates LSH index and computes local scaling factors
39 | hub = LegacyHubness(k=10,
40 | return_value='robinhood',
41 | algorithm='falconn_lsh',
42 | hubness='ls',
43 | random_state=2345,
44 | shuffle_equal=False,
45 | verbose=1)
46 | hub.fit(X_train)
47 |
48 | robin_hood = hub.score(X_test)
49 | print(f'Hubness (Robin Hood): {robin_hood:.4f}')
50 | # 0.9060
51 |
52 | # Approximate hubness reduction for classification: Reuse index & factors
53 | knn = KNeighborsClassifier(n_neighbors=10,
54 | algorithm='falconn_lsh',
55 | hubness='ls',
56 | n_jobs=1)
57 |
58 | knn.fit(hub.nn_index_, y_train) # REUSE INDEX HERE
59 | acc = knn.score(X_test, y_test)
60 | print(f'Test accuracy: {acc:.3f}')
61 | # 0.959
62 |
63 |
64 | .. rst-class:: sphx-glr-timing
65 |
66 | **Total running time of the script:** ( 0 minutes 0.000 seconds)
67 |
68 |
69 | .. _sphx_glr_download_documentation_auto_examples_ahr_reusing_index.py:
70 |
71 |
72 | .. only :: html
73 |
74 | .. container:: sphx-glr-footer
75 | :class: sphx-glr-footer-example
76 |
77 |
78 |
79 | .. container:: sphx-glr-download
80 |
81 | :download:`Download Python source code: reusing_index.py `
82 |
83 |
84 |
85 | .. container:: sphx-glr-download
86 |
87 | :download:`Download Jupyter notebook: reusing_index.ipynb `
88 |
89 |
90 | .. only:: html
91 |
92 | .. rst-class:: sphx-glr-signature
93 |
94 | `Gallery generated by Sphinx-Gallery <https://sphinx-gallery.github.io>`_
95 |
--------------------------------------------------------------------------------
/docs/documentation/auto_examples/plot_nearest_centroid.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {
7 | "collapsed": false
8 | },
9 | "outputs": [],
10 | "source": [
11 | "%matplotlib inline"
12 | ]
13 | },
14 | {
15 | "cell_type": "markdown",
16 | "metadata": {},
17 | "source": [
18 | "\n# Nearest Centroid Classification\n\n\nSample usage of Nearest Centroid classification.\nIt will plot the decision boundaries for each class.\n\nNote that no hubness reduction is currently implemented for centroids.\nHowever, `hubness.neighbors` retains all the features of `sklearn.neighbors`,\nin order to act as a full drop-in replacement.\n\nAdapted from ``_\n"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": null,
24 | "metadata": {
25 | "collapsed": false
26 | },
27 | "outputs": [],
28 | "source": [
29 | "print(__doc__)\n\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom matplotlib.colors import ListedColormap\nfrom sklearn import datasets\nfrom skhubness.neighbors import NearestCentroid\n\nn_neighbors = 15\n\n# import some data to play with\niris = datasets.load_iris()\n# we only take the first two features. We could avoid this ugly\n# slicing by using a two-dim dataset\nX = iris.data[:, :2]\ny = iris.target\n\nh = .02 # step size in the mesh\n\n# Create color maps\ncmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])\ncmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])\n\nfor shrinkage in [None, .2]:\n # we create an instance of Neighbours Classifier and fit the data.\n clf = NearestCentroid(shrink_threshold=shrinkage)\n clf.fit(X, y)\n y_pred = clf.predict(X)\n print(shrinkage, np.mean(y == y_pred))\n # Plot the decision boundary. For that, we will assign a color to each\n # point in the mesh [x_min, x_max]x[y_min, y_max].\n x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1\n y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1\n xx, yy = np.meshgrid(np.arange(x_min, x_max, h),\n np.arange(y_min, y_max, h))\n Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])\n\n # Put the result into a color plot\n Z = Z.reshape(xx.shape)\n plt.figure()\n plt.pcolormesh(xx, yy, Z, cmap=cmap_light)\n\n # Plot also the training points\n plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold,\n edgecolor='k', s=20)\n plt.title(\"3-Class classification (shrink_threshold=%r)\"\n % shrinkage)\n plt.axis('tight')\n\nplt.show()"
30 | ]
31 | }
32 | ],
33 | "metadata": {
34 | "kernelspec": {
35 | "display_name": "Python 3",
36 | "language": "python",
37 | "name": "python3"
38 | },
39 | "language_info": {
40 | "codemirror_mode": {
41 | "name": "ipython",
42 | "version": 3
43 | },
44 | "file_extension": ".py",
45 | "mimetype": "text/x-python",
46 | "name": "python",
47 | "nbconvert_exporter": "python",
48 | "pygments_lexer": "ipython3",
49 | "version": "3.7.4"
50 | }
51 | },
52 | "nbformat": 4,
53 | "nbformat_minor": 0
54 | }
--------------------------------------------------------------------------------
/docs/documentation/auto_examples_ahr/high_dim_gaussian.rst:
--------------------------------------------------------------------------------
1 | .. note::
2 | :class: sphx-glr-download-link-note
3 |
4 | Click :ref:`here <sphx_glr_download_documentation_auto_examples_ahr_high_dim_gaussian.py>` to download the full example code
5 | .. rst-class:: sphx-glr-example-title
6 |
7 | .. _sphx_glr_documentation_auto_examples_ahr_high_dim_gaussian.py:
8 |
9 |
10 | ========================================
11 | Example: Approximate hubness reduction
12 | ========================================
13 |
14 | This example shows how to combine approximate nearest neighbor search and hubness reduction
15 | in order to perform approximate hubness reduction for large data sets.
16 |
17 |
18 | .. code-block:: default
19 |
20 | from sklearn.datasets import make_classification
21 | from sklearn.metrics import accuracy_score
22 | from sklearn.model_selection import train_test_split
23 |
24 | from skhubness.analysis import LegacyHubness
25 | from skhubness.neighbors import KNeighborsClassifier
26 |
27 | # High-dimensional artificial data
28 | X, y = make_classification(n_samples=1_000_000,
29 | n_features=500,
30 | n_informative=400,
31 | random_state=543)
32 |
33 | X_train, X_test, y_train, y_test = train_test_split(X, y,
34 | test_size=10_000,
35 | stratify=y,
36 | shuffle=True,
37 | random_state=2346)
38 |
39 | # Approximate hubness estimation
40 | hub = LegacyHubness(k=10,
41 | return_value='robinhood',
42 | algorithm='hnsw',
43 | random_state=2345,
44 | shuffle_equal=False,
45 | n_jobs=-1,
46 | verbose=2)
47 | hub.fit(X_train)
48 | robin_hood = hub.score(X_test)
49 | print(f'Hubness (Robin Hood): {robin_hood:.3f}')
50 | # 0.944
51 |
52 | # Approximate hubness reduction for classification
53 | knn = KNeighborsClassifier(n_neighbors=10,
54 | algorithm='hnsw',
55 | hubness='ls',
56 | n_jobs=-1,
57 | verbose=2)
58 |
59 | knn.fit(X_train, y_train)
60 | y_pred = knn.predict(X_test)
61 | acc = accuracy_score(y_test, y_pred)
62 | print(f'Test accuracy: {acc:.3f}')
63 | # Test accuracy: 0.987
64 |
65 |
66 | .. rst-class:: sphx-glr-timing
67 |
68 | **Total running time of the script:** ( 0 minutes 0.000 seconds)
69 |
70 |
71 | .. _sphx_glr_download_documentation_auto_examples_ahr_high_dim_gaussian.py:
72 |
73 |
74 | .. only :: html
75 |
76 | .. container:: sphx-glr-footer
77 | :class: sphx-glr-footer-example
78 |
79 |
80 |
81 | .. container:: sphx-glr-download
82 |
83 | :download:`Download Python source code: high_dim_gaussian.py `
84 |
85 |
86 |
87 | .. container:: sphx-glr-download
88 |
89 | :download:`Download Jupyter notebook: high_dim_gaussian.ipynb `
90 |
91 |
92 | .. only:: html
93 |
94 | .. rst-class:: sphx-glr-signature
95 |
96 | `Gallery generated by Sphinx-Gallery <https://sphinx-gallery.github.io>`_
97 |
--------------------------------------------------------------------------------
/docs/documentation/auto_examples_hr/pipelines.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {
7 | "collapsed": false
8 | },
9 | "outputs": [],
10 | "source": [
11 | "%matplotlib inline"
12 | ]
13 | },
14 | {
15 | "cell_type": "markdown",
16 | "metadata": {},
17 | "source": [
18 | "\n========================================\nExample: skhubness in Pipelines\n========================================\n\nEstimators from scikit-hubness can - of course - be used in a scikit-learn ``Pipeline``.\nIn this example, we select the best hubness reduction method and several other\nhyperparameters in grid search w.r.t. to classification performance.\n"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": null,
24 | "metadata": {
25 | "collapsed": false
26 | },
27 | "outputs": [],
28 | "source": [
29 | "from sklearn.datasets import make_classification\nfrom sklearn.decomposition import PCA\nfrom sklearn.model_selection import StratifiedKFold, train_test_split, GridSearchCV\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.preprocessing import StandardScaler\n\nfrom skhubness.neighbors import KNeighborsClassifier\n\n# Not so high-dimensional data\nX, y = make_classification(n_samples=1_000,\n n_features=50,\n n_informative=20,\n n_classes=2,\n random_state=3453)\n\nX, X_test, y, y_test = train_test_split(X, y,\n test_size=100,\n stratify=y,\n shuffle=True,\n random_state=124)\n\n# Pipeline of standardization, dimensionality reduction, and kNN classification\npipe = Pipeline([('scale', StandardScaler(with_mean=True, with_std=True)),\n ('pca', PCA(n_components=20, random_state=1213)),\n ('knn', KNeighborsClassifier(n_neighbors=10, algorithm='lsh', hubness='mp'))])\n\n# Exhaustive search for best algorithms and hyperparameters\nparam_grid = {'pca__n_components': [10, 20, 30],\n 'knn__n_neighbors': [5, 10, 20],\n 'knn__algorithm': ['auto', 'hnsw', 'lsh', 'falconn_lsh', 'nng', 'rptree'],\n 'knn__hubness': [None, 'mp', 'ls', 'dsl']}\ncv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1354)\nsearch = GridSearchCV(pipe, param_grid, n_jobs=5, cv=cv, verbose=1)\nsearch.fit(X, y)\n\n# Performance on hold-out data\nacc = search.score(X_test, y_test)\nprint(acc)\n# 0.79\n\nprint(search.best_params_)\n# {'knn__algorithm': 'auto',\n# 'knn__hubness': 'dsl',\n# 'knn__n_neighbors': 20,\n# 'pca__n_components': 30}"
30 | ]
31 | }
32 | ],
33 | "metadata": {
34 | "kernelspec": {
35 | "display_name": "Python 3",
36 | "language": "python",
37 | "name": "python3"
38 | },
39 | "language_info": {
40 | "codemirror_mode": {
41 | "name": "ipython",
42 | "version": 3
43 | },
44 | "file_extension": ".py",
45 | "mimetype": "text/x-python",
46 | "name": "python",
47 | "nbconvert_exporter": "python",
48 | "pygments_lexer": "ipython3",
49 | "version": "3.7.4"
50 | }
51 | },
52 | "nbformat": 4,
53 | "nbformat_minor": 0
54 | }
--------------------------------------------------------------------------------
/docs/getting_started/example.rst:
--------------------------------------------------------------------------------
1 | ===================
2 | Quick start example
3 | ===================
4 |
5 | Users of ``scikit-hubness`` typically want to
6 |
7 | 1. analyse, whether their data show hubness
8 | 2. reduce hubness
9 | 3. perform learning (classification, regression, ...)
10 |
11 | The following example shows all these steps for an example dataset
12 | from the text domain (dexter).
13 | Please make sure you have installed ``scikit-hubness``
14 | (`installation instructions `_).
15 |
16 | First, we load the dataset and inspect its size.
17 |
18 | .. code-block:: python
19 |
20 | from skhubness.data import load_dexter
21 | X, y = load_dexter()
22 | print(f'X.shape = {X.shape}, y.shape={y.shape}')
23 |
24 | Dexter is embedded in a high-dimensional space,
25 | and could, thus, be prone to hubness.
26 | Therefore, we assess the actual degree of hubness.
27 |
28 | .. code-block:: python
29 |
30 | from skhubness import LegacyHubness
31 | hub = LegacyHubness(k=10, metric='cosine')
32 | hub.fit(X)
33 | k_skew = hub.score()
34 | print(f'Skewness = {k_skew:.3f}')
35 |
36 | As a rule-of-thumb, skewness > 1.2 indicates significant hubness.
37 | Additional hubness indices are available, for example:
38 |
39 | .. code-block:: python
40 |
41 | print(f'Robin hood index: {hub.robinhood_index:.3f}')
42 | print(f'Antihub occurrence: {hub.antihub_occurrence:.3f}')
43 | print(f'Hub occurrence: {hub.hub_occurrence:.3f}')
44 |
45 | There is considerable hubness in dexter.
46 | Let's see, whether hubness reduction can improve
47 | kNN classification performance.
48 |
49 | .. code-block:: python
50 |
51 | from sklearn.model_selection import cross_val_score
52 | from skhubness.neighbors import KNeighborsClassifier
53 |
54 | # vanilla kNN
55 | knn_standard = KNeighborsClassifier(n_neighbors=5,
56 | metric='cosine')
57 | acc_standard = cross_val_score(knn_standard, X, y, cv=5)
58 |
59 | # kNN with hubness reduction (mutual proximity)
60 | knn_mp = KNeighborsClassifier(n_neighbors=5,
61 | metric='cosine',
62 | hubness='mutual_proximity')
63 | acc_mp = cross_val_score(knn_mp, X, y, cv=5)
64 |
65 | print(f'Accuracy (vanilla kNN): {acc_standard.mean():.3f}')
66 | print(f'Accuracy (kNN with hubness reduction): {acc_mp.mean():.3f}')
67 |
68 |
69 | Accuracy was considerably improved by mutual proximity (MP).
70 | But did MP actually reduce hubness?
71 |
72 | .. code-block:: python
73 |
74 | hub_mp = LegacyHubness(k=10, metric='cosine',
75 | hubness='mutual_proximity')
76 | hub_mp.fit(X)
77 | k_skew_mp = hub_mp.score()
78 | print(f'Skewness after MP: {k_skew_mp:.3f} '
79 | f'(reduction of {k_skew - k_skew_mp:.3f})')
80 | print(f'Robin hood: {hub_mp.robinhood_index:.3f} '
81 | f'(reduction of {hub.robinhood_index - hub_mp.robinhood_index:.3f})')
82 |
83 | Yes!
84 |
85 | The neighbor graph can also be created directly,
86 | with or without hubness reduction:
87 |
88 | .. code-block:: python
89 |
90 | from skhubness.neighbors import kneighbors_graph
91 | neighbor_graph = kneighbors_graph(X,
92 | n_neighbors=5,
93 | hubness='mutual_proximity')
94 |
95 | You may want to precompute the graph like this,
96 | in order to avoid computing it repeatedly for subsequent hubness estimation and learning.
97 |
--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
1 | .. scikit-hubness documentation master file, created by
2 | sphinx-quickstart on Mon Jul 8 13:54:25 2019.
3 | You can adapt this file completely to your liking, but it should at least
4 | contain the root `toctree` directive.
5 |
6 | `scikit-hubness`: high-dimensional data mining
7 | ================================================
8 |
9 | ``scikit-hubness`` is a Python package for analysis of hubness
10 | in high-dimensional data. It provides hubness reduction and
11 | approximate nearest neighbor search via a drop-in replacement for
12 | `sklearn.neighbors `_.
13 |
14 | .. toctree::
15 | :maxdepth: 1
16 | :hidden:
17 | :caption: Getting Started
18 |
19 | Installation
20 | Quick start example
21 |
22 | .. toctree::
23 | :maxdepth: 3
24 | :hidden:
25 | :caption: Documentation
26 |
27 | User Guide
28 | scikit-hubness API
29 | History
30 |
31 | .. toctree::
32 | :maxdepth: 2
33 | :titlesonly:
34 | :hidden:
35 | :caption: Development
36 |
37 | Contributing
38 | Github Repository
39 | What's new (Changelog)
40 |
41 |
42 | `Getting started `_
43 | -------------------------------------------------------
44 |
45 | Get started with ``scikit-hubness`` in a breeze.
46 | Find how to `install the package `_ and
47 | see all core functionality applied in a single `quick start example `_.
48 |
49 |
50 | `User Guide `_
51 | -----------------------------------------------
52 |
53 | The `User Guide `_ introduces the main concepts of ``scikit-hubness``.
54 | It explains, how to analyze your data sets for hubness,
55 | and how to use the package to lift this *curse of dimensionality*.
56 | You will also find examples how to use ``skhubness.neighbors``
57 | for approximate nearest neighbor search (with or without hubness reduction).
58 |
59 |
60 | `API Documentation `_
61 | --------------------------------------------------------
62 |
63 | The `API Documentation `_ provides detailed information
64 | of the implemented methods.
65 | This information includes method descriptions, parameters, references, examples, etc.
66 | Find all the information about specific modules and functions of ``scikit-hubness`` in this section.
67 |
68 |
69 | `History `_
70 | ----------------------------------------
71 |
72 | A `brief history `_ of the package,
73 | and how it relates to the ``Hub-Toolbox``'es.
74 |
75 |
76 | `Development `_
77 | -----------------------------------------------
78 |
79 | There are several possibilities to `contribute `_
80 | to this free open source software. We highly appreciate all input from the community,
81 | be it bug reports or code contributions.
82 |
83 | Source code, issue tracking, discussion, and continuous integration appear on
84 | our `GitHub page `_.
85 |
86 |
87 | `What's new `_
88 | --------------------------------
89 |
90 | To see what's new in the latest version of ``scikit-hubness``,
91 | have a look at the `changelog `_.
92 |
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Contributor Covenant Code of Conduct
2 |
3 | ## Our Pledge
4 |
5 | In the interest of fostering an open and welcoming environment, we as
6 | contributors and maintainers pledge to making participation in our project and
7 | our community a harassment-free experience for everyone, regardless of age, body
8 | size, disability, ethnicity, sex characteristics, gender identity and expression,
9 | level of experience, education, socio-economic status, nationality, personal
10 | appearance, race, religion, or sexual identity and orientation.
11 |
12 | ## Our Standards
13 |
14 | Examples of behavior that contributes to creating a positive environment
15 | include:
16 |
17 | * Using welcoming and inclusive language
18 | * Being respectful of differing viewpoints and experiences
19 | * Gracefully accepting constructive criticism
20 | * Focusing on what is best for the community
21 | * Showing empathy towards other community members
22 |
23 | Examples of unacceptable behavior by participants include:
24 |
25 | * The use of sexualized language or imagery and unwelcome sexual attention or
26 | advances
27 | * Trolling, insulting/derogatory comments, and personal or political attacks
28 | * Public or private harassment
29 | * Publishing others' private information, such as a physical or electronic
30 | address, without explicit permission
31 | * Other conduct which could reasonably be considered inappropriate in a
32 | professional setting
33 |
34 | ## Our Responsibilities
35 |
36 | Project maintainers are responsible for clarifying the standards of acceptable
37 | behavior and are expected to take appropriate and fair corrective action in
38 | response to any instances of unacceptable behavior.
39 |
40 | Project maintainers have the right and responsibility to remove, edit, or
41 | reject comments, commits, code, wiki edits, issues, and other contributions
42 | that are not aligned to this Code of Conduct, or to ban temporarily or
43 | permanently any contributor for other behaviors that they deem inappropriate,
44 | threatening, offensive, or harmful.
45 |
46 | ## Scope
47 |
48 | This Code of Conduct applies both within project spaces and in public spaces
49 | when an individual is representing the project or its community. Examples of
50 | representing a project or community include using an official project e-mail
51 | address, posting via an official social media account, or acting as an appointed
52 | representative at an online or offline event. Representation of a project may be
53 | further defined and clarified by project maintainers.
54 |
55 | ## Enforcement
56 |
57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
58 | reported by contacting the project team at sci@feldbauer.org. All
59 | complaints will be reviewed and investigated and will result in a response that
60 | is deemed necessary and appropriate to the circumstances. The project team is
61 | obligated to maintain confidentiality with regard to the reporter of an incident.
62 | Further details of specific enforcement policies may be posted separately.
63 |
64 | Project maintainers who do not follow or enforce the Code of Conduct in good
65 | faith may face temporary or permanent repercussions as determined by other
66 | members of the project's leadership.
67 |
68 | ## Attribution
69 |
70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
72 |
73 | [homepage]: https://www.contributor-covenant.org
74 |
75 | For answers to common questions about this code of conduct, see
76 | https://www.contributor-covenant.org/faq
77 |
--------------------------------------------------------------------------------
/docs/documentation/auto_examples_hr/pipelines.rst:
--------------------------------------------------------------------------------
1 | .. note::
2 | :class: sphx-glr-download-link-note
3 |
4 | Click :ref:`here <sphx_glr_download_documentation_auto_examples_hr_pipelines.py>` to download the full example code
5 | .. rst-class:: sphx-glr-example-title
6 |
7 | .. _sphx_glr_documentation_auto_examples_hr_pipelines.py:
8 |
9 |
10 | ========================================
11 | Example: skhubness in Pipelines
12 | ========================================
13 |
14 | Estimators from scikit-hubness can - of course - be used in a scikit-learn ``Pipeline``.
15 | In this example, we select the best hubness reduction method and several other
16 | hyperparameters in grid search w.r.t. to classification performance.
17 |
18 |
19 | .. code-block:: default
20 |
21 | from sklearn.datasets import make_classification
22 | from sklearn.decomposition import PCA
23 | from sklearn.model_selection import StratifiedKFold, train_test_split, GridSearchCV
24 | from sklearn.pipeline import Pipeline
25 | from sklearn.preprocessing import StandardScaler
26 |
27 | from skhubness.neighbors import KNeighborsClassifier
28 |
29 | # Not so high-dimensional data
30 | X, y = make_classification(n_samples=1_000,
31 | n_features=50,
32 | n_informative=20,
33 | n_classes=2,
34 | random_state=3453)
35 |
36 | X, X_test, y, y_test = train_test_split(X, y,
37 | test_size=100,
38 | stratify=y,
39 | shuffle=True,
40 | random_state=124)
41 |
42 | # Pipeline of standardization, dimensionality reduction, and kNN classification
43 | pipe = Pipeline([('scale', StandardScaler(with_mean=True, with_std=True)),
44 | ('pca', PCA(n_components=20, random_state=1213)),
45 | ('knn', KNeighborsClassifier(n_neighbors=10, algorithm='lsh', hubness='mp'))])
46 |
47 | # Exhaustive search for best algorithms and hyperparameters
48 | param_grid = {'pca__n_components': [10, 20, 30],
49 | 'knn__n_neighbors': [5, 10, 20],
50 | 'knn__algorithm': ['auto', 'hnsw', 'lsh', 'falconn_lsh', 'nng', 'rptree'],
51 | 'knn__hubness': [None, 'mp', 'ls', 'dsl']}
52 | cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1354)
53 | search = GridSearchCV(pipe, param_grid, n_jobs=5, cv=cv, verbose=1)
54 | search.fit(X, y)
55 |
56 | # Performance on hold-out data
57 |     acc = search.score(X_test, y_test)
58 | print(acc)
59 | # 0.79
60 |
61 | print(search.best_params_)
62 | # {'knn__algorithm': 'auto',
63 | # 'knn__hubness': 'dsl',
64 | # 'knn__n_neighbors': 20,
65 | # 'pca__n_components': 30}
66 |
67 |
68 | .. rst-class:: sphx-glr-timing
69 |
70 | **Total running time of the script:** ( 0 minutes 0.000 seconds)
71 |
72 |
73 | .. _sphx_glr_download_documentation_auto_examples_hr_pipelines.py:
74 |
75 |
76 | .. only :: html
77 |
78 | .. container:: sphx-glr-footer
79 | :class: sphx-glr-footer-example
80 |
81 |
82 |
83 | .. container:: sphx-glr-download
84 |
85 | :download:`Download Python source code: pipelines.py `
86 |
87 |
88 |
89 | .. container:: sphx-glr-download
90 |
91 | :download:`Download Jupyter notebook: pipelines.ipynb `
92 |
93 |
94 | .. only:: html
95 |
96 | .. rst-class:: sphx-glr-signature
97 |
98 | `Gallery generated by Sphinx-Gallery `_
99 |
--------------------------------------------------------------------------------
/skhubness/neighbors/tests/test_neighbors.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | import pytest
4 |
5 | import numpy as np
6 | from scipy.sparse import csr_matrix
7 | from sklearn.datasets import make_classification
8 | from sklearn.metrics import accuracy_score
9 | from sklearn.model_selection import train_test_split
10 | from sklearn.neighbors import KNeighborsTransformer, KNeighborsClassifier
11 |
12 | from skhubness.neighbors import AnnoyTransformer, NGTTransformer, NMSlibTransformer, PuffinnTransformer
13 |
14 |
15 | @pytest.mark.parametrize("n_neighbors", [1, 5, 10])
16 | @pytest.mark.parametrize("metric", [None, "euclidean", "cosine"])
17 | @pytest.mark.parametrize("ApproximateNNTransformer",
18 | [AnnoyTransformer, NGTTransformer, NMSlibTransformer, PuffinnTransformer])
19 | def test_ann_transformers_similar_to_exact_transformer(ApproximateNNTransformer, n_neighbors, metric):
20 | if sys.platform == "win32" and issubclass(ApproximateNNTransformer, (NGTTransformer, PuffinnTransformer)):
21 | pytest.skip(f"{ApproximateNNTransformer.__name__} is not available on Windows.")
22 | knn_metric = metric
23 | ann_metric = metric
24 | if issubclass(ApproximateNNTransformer, PuffinnTransformer) and metric in ["euclidean", "cosine"]:
25 | pytest.skip(f"{ApproximateNNTransformer.__name__} does not support metric={metric}")
26 | if issubclass(ApproximateNNTransformer, AnnoyTransformer) and metric == "cosine":
27 | ann_metric = "angular"
28 | n_samples = 100
29 | X, y = make_classification(
30 | n_samples=n_samples,
31 | random_state=123,
32 | )
33 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=456, shuffle=True, stratify=y)
34 |
35 |     # Exact kNN graph for comparison
36 | kwargs = {}
37 | if knn_metric is not None:
38 | kwargs["metric"] = knn_metric
39 | knn = KNeighborsTransformer(n_neighbors=n_neighbors, **kwargs)
40 | graph_train = knn.fit_transform(X_train, y_train)
41 | knn_graph: csr_matrix = knn.transform(X_test)
42 | knn_clf = KNeighborsClassifier(n_neighbors=n_neighbors, metric="precomputed")
43 | y_pred_knn = knn_clf.fit(graph_train, y_train).predict(knn_graph)
44 | knn_acc = accuracy_score(y_true=y_test, y_pred=y_pred_knn)
45 |
46 | # ANN graph
47 | kwargs = {}
48 | if ann_metric is not None:
49 | kwargs["metric"] = ann_metric
50 | ann = ApproximateNNTransformer(n_neighbors=n_neighbors, **kwargs)
51 | graph_train = ann.fit_transform(X_train, y_train)
52 | ann_graph = ann.transform(X_test)
53 | ann_clf = KNeighborsClassifier(n_neighbors=n_neighbors, metric="precomputed")
54 | y_pred_ann = ann_clf.fit(graph_train, y_train).predict(ann_graph)
55 | ann_acc = accuracy_score(y_true=y_test, y_pred=y_pred_ann)
56 |
57 | # Neighbor graphs should be same class, same shape, same dtype
58 | assert ann_graph.__class__ == knn_graph.__class__
59 | assert ann_graph.shape == knn_graph.shape
60 | assert ann_graph.dtype == knn_graph.dtype
61 | assert ann_graph.nnz == knn_graph.nnz
62 | if issubclass(ApproximateNNTransformer, AnnoyTransformer):
63 | pass # Known inaccuracy
64 | elif issubclass(ApproximateNNTransformer, PuffinnTransformer) and metric is None:
65 | pass # Known inaccuracy
66 | else:
67 | np.testing.assert_array_equal(ann_graph.indices.ravel(), knn_graph.indices.ravel())
68 | np.testing.assert_array_almost_equal(ann_graph.data.ravel(), knn_graph.data.ravel())
69 | if issubclass(ApproximateNNTransformer, AnnoyTransformer) and metric == "cosine" and n_neighbors == 1:
70 | return # Known inaccurate result
71 | assert ann_acc > knn_acc or np.isclose(ann_acc, knn_acc), "ApproximateNN accuracy << exact kNN accuracy."
72 |
--------------------------------------------------------------------------------
/examples/sklearn/plot_multioutput_face_completion.py:
--------------------------------------------------------------------------------
1 | """
2 | ===================================================
3 | Face completion with multi-output estimators
4 | ===================================================
5 |
6 | This example shows the use of multi-output estimator to complete images.
7 | The goal is to predict the lower half of a face given its upper half.
8 |
9 | The first column of images shows true faces. The next columns illustrate
10 | how extremely randomized trees, linear regression, ridge regression,
11 | and k nearest neighbors with or without hubness reduction
12 | complete the lower half of those faces.
13 |
14 |
15 | Adapted from ``_
16 | """
17 | print(__doc__)
18 |
19 | import numpy as np
20 | import matplotlib.pyplot as plt
21 |
22 | from sklearn.datasets import fetch_olivetti_faces
23 | from sklearn.utils.validation import check_random_state
24 |
25 | from sklearn.ensemble import ExtraTreesRegressor
26 | from sklearn.linear_model import LinearRegression
27 | from sklearn.linear_model import RidgeCV
28 |
29 | from skhubness.neighbors import KNeighborsRegressor
30 |
31 | # Load the faces datasets
32 | data = fetch_olivetti_faces()
33 | targets = data.target
34 |
35 | data = data.images.reshape((len(data.images), -1))
36 | train = data[targets < 30]
37 | test = data[targets >= 30] # Test on independent people
38 |
39 | # Test on a subset of people
40 | n_faces = 5
41 | rng = check_random_state(4)
42 | face_ids = rng.randint(test.shape[0], size=(n_faces, ))
43 | test = test[face_ids, :]
44 |
45 | n_pixels = data.shape[1]
46 | # Upper half of the faces
47 | X_train = train[:, :(n_pixels + 1) // 2]
48 | # Lower half of the faces
49 | y_train = train[:, n_pixels // 2:]
50 | X_test = test[:, :(n_pixels + 1) // 2]
51 | y_test = test[:, n_pixels // 2:]
52 |
53 | # Fit estimators
54 | ESTIMATORS = {
55 | "Extra trees": ExtraTreesRegressor(n_estimators=10, max_features=32,
56 | random_state=0),
57 | "k-NN": KNeighborsRegressor(weights='distance'),
58 | "k-NN MP": KNeighborsRegressor(hubness='mp',
59 | hubness_params={'method': 'normal'},
60 | weights='distance'),
61 | "Linear regression": LinearRegression(),
62 | "Ridge": RidgeCV(),
63 | }
64 |
65 | y_test_predict = dict()
66 | for name, estimator in ESTIMATORS.items():
67 | estimator.fit(X_train, y_train)
68 | y_test_predict[name] = estimator.predict(X_test)
69 |
70 | # Plot the completed faces
71 | image_shape = (64, 64)
72 |
73 | n_cols = 1 + len(ESTIMATORS)
74 | plt.figure(figsize=(2. * n_cols, 2.26 * n_faces))
75 | plt.suptitle("Face completion with multi-output estimators", size=16)
76 |
77 | for i in range(n_faces):
78 | true_face = np.hstack((X_test[i], y_test[i]))
79 |
80 | if i:
81 | sub = plt.subplot(n_faces, n_cols, i * n_cols + 1)
82 | else:
83 | sub = plt.subplot(n_faces, n_cols, i * n_cols + 1,
84 | title="true faces")
85 |
86 | sub.axis("off")
87 | sub.imshow(true_face.reshape(image_shape),
88 | cmap=plt.cm.gray,
89 | interpolation="nearest")
90 |
91 | for j, est in enumerate(sorted(ESTIMATORS)):
92 | completed_face = np.hstack((X_test[i], y_test_predict[est][i]))
93 |
94 | if i:
95 | sub = plt.subplot(n_faces, n_cols, i * n_cols + 2 + j)
96 |
97 | else:
98 | sub = plt.subplot(n_faces, n_cols, i * n_cols + 2 + j,
99 | title=est)
100 |
101 | sub.axis("off")
102 | sub.imshow(completed_face.reshape(image_shape),
103 | cmap=plt.cm.gray,
104 | interpolation="nearest")
105 |
106 | plt.show()
107 |
--------------------------------------------------------------------------------
/docs/documentation/auto_examples/plot_multioutput_face_completion.py:
--------------------------------------------------------------------------------
1 | """
2 | ===================================================
3 | Face completion with multi-output estimators
4 | ===================================================
5 |
6 | This example shows the use of multi-output estimator to complete images.
7 | The goal is to predict the lower half of a face given its upper half.
8 |
9 | The first column of images shows true faces. The next columns illustrate
10 | how extremely randomized trees, linear regression, ridge regression,
11 | and k nearest neighbors with or without hubness reduction
12 | complete the lower half of those faces.
13 |
14 |
15 | Adapted from ``_
16 | """
17 | print(__doc__)
18 |
19 | import numpy as np
20 | import matplotlib.pyplot as plt
21 |
22 | from sklearn.datasets import fetch_olivetti_faces
23 | from sklearn.utils.validation import check_random_state
24 |
25 | from sklearn.ensemble import ExtraTreesRegressor
26 | from sklearn.linear_model import LinearRegression
27 | from sklearn.linear_model import RidgeCV
28 |
29 | from skhubness.neighbors import KNeighborsRegressor
30 |
31 | # Load the faces datasets
32 | data = fetch_olivetti_faces()
33 | targets = data.target
34 |
35 | data = data.images.reshape((len(data.images), -1))
36 | train = data[targets < 30]
37 | test = data[targets >= 30] # Test on independent people
38 |
39 | # Test on a subset of people
40 | n_faces = 5
41 | rng = check_random_state(4)
42 | face_ids = rng.randint(test.shape[0], size=(n_faces, ))
43 | test = test[face_ids, :]
44 |
45 | n_pixels = data.shape[1]
46 | # Upper half of the faces
47 | X_train = train[:, :(n_pixels + 1) // 2]
48 | # Lower half of the faces
49 | y_train = train[:, n_pixels // 2:]
50 | X_test = test[:, :(n_pixels + 1) // 2]
51 | y_test = test[:, n_pixels // 2:]
52 |
53 | # Fit estimators
54 | ESTIMATORS = {
55 | "Extra trees": ExtraTreesRegressor(n_estimators=10, max_features=32,
56 | random_state=0),
57 | "k-NN": KNeighborsRegressor(weights='distance'),
58 | "k-NN MP": KNeighborsRegressor(hubness='mp',
59 | hubness_params={'method': 'normal'},
60 | weights='distance'),
61 | "Linear regression": LinearRegression(),
62 | "Ridge": RidgeCV(),
63 | }
64 |
65 | y_test_predict = dict()
66 | for name, estimator in ESTIMATORS.items():
67 | estimator.fit(X_train, y_train)
68 | y_test_predict[name] = estimator.predict(X_test)
69 |
70 | # Plot the completed faces
71 | image_shape = (64, 64)
72 |
73 | n_cols = 1 + len(ESTIMATORS)
74 | plt.figure(figsize=(2. * n_cols, 2.26 * n_faces))
75 | plt.suptitle("Face completion with multi-output estimators", size=16)
76 |
77 | for i in range(n_faces):
78 | true_face = np.hstack((X_test[i], y_test[i]))
79 |
80 | if i:
81 | sub = plt.subplot(n_faces, n_cols, i * n_cols + 1)
82 | else:
83 | sub = plt.subplot(n_faces, n_cols, i * n_cols + 1,
84 | title="true faces")
85 |
86 | sub.axis("off")
87 | sub.imshow(true_face.reshape(image_shape),
88 | cmap=plt.cm.gray,
89 | interpolation="nearest")
90 |
91 | for j, est in enumerate(sorted(ESTIMATORS)):
92 | completed_face = np.hstack((X_test[i], y_test_predict[est][i]))
93 |
94 | if i:
95 | sub = plt.subplot(n_faces, n_cols, i * n_cols + 2 + j)
96 |
97 | else:
98 | sub = plt.subplot(n_faces, n_cols, i * n_cols + 2 + j,
99 | title=est)
100 |
101 | sub.axis("off")
102 | sub.imshow(completed_face.reshape(image_shape),
103 | cmap=plt.cm.gray,
104 | interpolation="nearest")
105 |
106 | plt.show()
107 |
--------------------------------------------------------------------------------
/docs/documentation/auto_examples_hr/olivetti_faces.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {
7 | "collapsed": false
8 | },
9 | "outputs": [],
10 | "source": [
11 | "%matplotlib inline"
12 | ]
13 | },
14 | {
15 | "cell_type": "markdown",
16 | "metadata": {},
17 | "source": [
18 | "\n=================================\nFace recognition (Olivetti faces)\n=================================\n\nThis dataset contains a set of face images taken between April 1992\nand April 1994 at AT&T Laboratories Cambridge.\nImage data is typically embedded in very high-dimensional spaces,\nwhich might be prone to hubness.\n"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": null,
24 | "metadata": {
25 | "collapsed": false
26 | },
27 | "outputs": [],
28 | "source": [
29 |         "import numpy as np\nfrom sklearn.datasets import olivetti_faces\nfrom sklearn.model_selection import cross_val_score, StratifiedKFold, RandomizedSearchCV\n\nfrom skhubness import LegacyHubness\nfrom skhubness.neighbors import KNeighborsClassifier\n\n# Fetch data and have a look\nd = olivetti_faces.fetch_olivetti_faces()\nX, y = d['data'], d['target']\nprint(f'Data shape: {X.shape}')\nprint(f'Label shape: {y.shape}')\n# (400, 4096)\n# (400,)\n\n# The data is embedded in a high-dimensional space.\n# Is there hubness, and can we reduce it?\nfor hubness in [None, 'dsl', 'ls', 'mp']:\n    hub = LegacyHubness(k=10, hubness=hubness, return_value='k_skewness')\n    hub.fit(X)\n    score = hub.score()\n    print(f'LegacyHubness (10-skew): {score:.3f} with hubness reduction: {hubness}')\n# LegacyHubness (10-skew): 1.972 with hubness reduction: None\n# LegacyHubness (10-skew): 1.526 with hubness reduction: dsl\n# LegacyHubness (10-skew): 0.943 with hubness reduction: ls\n# LegacyHubness (10-skew): 0.184 with hubness reduction: mp\n\n# There is some hubness, and all hubness reduction methods can reduce it (to varying degree)\n# Let's assess the best kNN strategy and its estimated performance.\ncv_perf = StratifiedKFold(n_splits=5, shuffle=True, random_state=7263)\ncv_select = StratifiedKFold(n_splits=5, shuffle=True, random_state=32634)\n\nknn = KNeighborsClassifier(algorithm_params={'n_candidates': 100})\n\n# specify parameters and distributions to sample from\nparam_dist = {\"n_neighbors\": np.arange(1, 26),\n              \"weights\": ['uniform', 'distance'],\n              \"hubness\": [None, 'dsl', 'ls', 'mp']}\n\n# Inner cross-validation to select best hyperparameters (incl hubness reduction method)\nsearch = RandomizedSearchCV(estimator=knn,\n                            param_distributions=param_dist,\n                            n_iter=100,\n                            cv=cv_select,\n                            random_state=2345,\n                            verbose=1)\n\n# Outer cross-validation to estimate performance\nscore = cross_val_score(search, X, y, cv=cv_perf, verbose=1)\nprint(f'Scores: {score}')\nprint(f'Mean acc = {score.mean():.3f} +/- {score.std():.3f}')\n\n# Select model that maximizes accuracy\nsearch.fit(X, y)\n\n# The best model's parameters\nprint(search.best_params_)\n\n# Does it correspond to the results of hubness reduction above?\n# Scores: [0.95 0.9625 1. 0.95 0.925 ]\n# Mean acc = 0.957 +/- 0.024\n# {'weights': 'distance', 'n_neighbors': 23, 'hubness': 'mp'}"
30 | ]
31 | }
32 | ],
33 | "metadata": {
34 | "kernelspec": {
35 | "display_name": "Python 3",
36 | "language": "python",
37 | "name": "python3"
38 | },
39 | "language_info": {
40 | "codemirror_mode": {
41 | "name": "ipython",
42 | "version": 3
43 | },
44 | "file_extension": ".py",
45 | "mimetype": "text/x-python",
46 | "name": "python",
47 | "nbconvert_exporter": "python",
48 | "pygments_lexer": "ipython3",
49 | "version": "3.7.4"
50 | }
51 | },
52 | "nbformat": 4,
53 | "nbformat_minor": 0
54 | }
--------------------------------------------------------------------------------
/docs/documentation/auto_examples/plot_regression.rst:
--------------------------------------------------------------------------------
1 | .. note::
2 | :class: sphx-glr-download-link-note
3 |
4 | Click :ref:`here ` to download the full example code
5 | .. rst-class:: sphx-glr-example-title
6 |
7 | .. _sphx_glr_documentation_auto_examples_plot_regression.py:
8 |
9 |
10 | ============================
11 | Nearest Neighbors regression
12 | ============================
13 |
14 | Demonstrate the resolution of a regression problem
15 | using a k-Nearest Neighbor and the interpolation of the
16 | target using both barycenter and constant weights.
17 |
18 | Hubness reduction of this low-dimensional dataset
19 | shows only small effects.
20 |
21 | Adapted from ``_
22 |
23 |
24 |
25 | .. image:: /documentation/auto_examples/images/sphx_glr_plot_regression_001.png
26 | :class: sphx-glr-single-img
27 |
28 |
29 | .. rst-class:: sphx-glr-script-out
30 |
31 | Out:
32 |
33 | .. code-block:: none
34 |
35 |
36 | /home/user/feldbauer/PycharmProjects/hubness/examples/sklearn/plot_regression.py:60: UserWarning: Matplotlib is currently using agg, which is a non-GUI backend, so cannot show the figure.
37 | plt.show()
38 |
39 |
40 |
41 |
42 |
43 | |
44 |
45 |
46 | .. code-block:: default
47 |
48 | print(__doc__)
49 |
50 | # Author: Alexandre Gramfort
51 | # Fabian Pedregosa
52 | #
53 | # License: BSD 3 clause (C) INRIA
54 |
55 |
56 | # #############################################################################
57 | # Generate sample data
58 | import numpy as np
59 | import matplotlib.pyplot as plt
60 | from skhubness.neighbors import KNeighborsRegressor
61 |
62 | np.random.seed(0)
63 | X = np.sort(5 * np.random.rand(40, 1), axis=0)
64 | T = np.linspace(0, 5, 500)[:, np.newaxis]
65 | y = np.sin(X).ravel()
66 |
67 | # Add noise to targets
68 | y[::5] += 1 * (0.5 - np.random.rand(8))
69 |
70 | # #############################################################################
71 | # Fit regression model
72 | n_neighbors = 5
73 |
74 | f = plt.figure()
75 | for i, weights in enumerate(['uniform', 'distance']):
76 | for j, hubness in enumerate([None, 'local_scaling']):
77 | knn = KNeighborsRegressor(n_neighbors,
78 | algorithm_params={'n_candidates': 39},
79 | weights=weights,
80 | hubness=hubness)
81 | y_ = knn.fit(X, y).predict(T)
82 |
83 | plt.subplot(2, 2, i * 2 + j + 1)
84 | f.set_figheight(15)
85 | f.set_figwidth(15)
86 | plt.scatter(X, y, c='k', label='data')
87 | plt.plot(T, y_, c='g', label='prediction')
88 | plt.axis('tight')
89 | plt.legend()
90 | plt.title(f"KNeighborsRegressor (k = {n_neighbors}, weights = '{weights}', hubness = '{hubness}')")
91 |
92 | plt.tight_layout()
93 | plt.show()
94 |
95 | .. rst-class:: sphx-glr-timing
96 |
97 | **Total running time of the script:** ( 0 minutes 0.737 seconds)
98 |
99 |
100 | .. _sphx_glr_download_documentation_auto_examples_plot_regression.py:
101 |
102 |
103 | .. only :: html
104 |
105 | .. container:: sphx-glr-footer
106 | :class: sphx-glr-footer-example
107 |
108 |
109 |
110 | .. container:: sphx-glr-download
111 |
112 | :download:`Download Python source code: plot_regression.py `
113 |
114 |
115 |
116 | .. container:: sphx-glr-download
117 |
118 | :download:`Download Jupyter notebook: plot_regression.ipynb `
119 |
120 |
121 | .. only:: html
122 |
123 | .. rst-class:: sphx-glr-signature
124 |
125 | `Gallery generated by Sphinx-Gallery `_
126 |
--------------------------------------------------------------------------------
/docs/changelog.md:
--------------------------------------------------------------------------------
1 | # Changelog
2 |
3 | ## [Next release]
4 | ...
5 |
6 | ### Added or enhanced
7 | - Lower memory footprint for sparse targets in multilabel classification
8 | (previously converted to dense arrays) #61
9 |
10 | ### Fixes
11 | - Hubness estimation could fail when ANN does not return enough neighbors #59
12 | - Heuristic to choose memory for Puffinn LSH.
13 |
14 | ### Maintenance
15 | - Switch to modern Python packaging with `pyproject.toml` and `setup.cfg`
16 | - Switch to Github Actions, dropping Travis CI and AppVeyor
17 |
18 |
19 | ## [0.21.2] - 2020-01-14
20 |
21 | This is a maintenance release due to the publication in the
22 | Journal of Open Source Software.
23 |
24 |
25 | ## [0.21.1] - 2019-12-10
26 |
27 | This is a bugfix release due to the recent update of scikit-learn to v0.22.
28 |
29 | ### Fixes
30 | - Require scikit-learn v0.21.3.
31 |
32 | Until the necessary adaptions for v0.22 are completed,
33 | scikit-hubness will require scikit-learn v0.21.3.
34 |
35 |
36 | ## [0.21.0] - 2019-11-25
37 |
38 | This is the first major release of scikit-hubness.
39 |
40 | ### Added
41 | - Enable ONNG provided by NGT (optimized ANNG). Pass ``optimize=True`` to ``LegacyNNG``.
42 | - User Guide: Description of all subpackages and common usage scenarios.
43 | - Examples: Various usage examples
44 | - Several tests
45 | - Classes inheriting from ``SupervisedIntegerMixin`` can be fit with an
46 | ``ApproximateNearestNeighbor`` or ``NearestNeighbors`` instance,
47 | thus reuse precomputed indices.
48 |
49 | ### Changes
50 | - Use argument ``algorithm='nng'`` for ANNG/ONNG provided by NGT instead of ``'onng'``.
51 | Also set ``optimize=True`` in order to use ONNG.
52 |
53 | ### Fixes
54 | - DisSimLocal would previously fail when invoked as ``hubness='dis_sim_local'``.
55 | - Hubness reduction would previously ignore ``verbose`` arguments under certain circumstances.
56 | - ``HNSW`` would previously ignore ``n_jobs`` on index creation.
57 | - Fix installation instructions for puffinn.
58 |
59 | ## [0.21.0a9] - 2019-10-30
60 | ### Added
61 | - General structure for docs
62 | - Enable NGT OpenMP support on MacOS (in addition to Linux)
63 | - Enable Puffinn LSH also on MacOS
64 |
65 | ### Fixes
66 | - Correct mutual proximity (empiric) calculation
67 | - Better handling of optional packages (ANN libraries)
68 |
69 | ### Maintenance
70 | - streamlined CI builds
71 | - several minor code improvements
72 |
73 | ### New contributors
74 | - Silvan David Peter
75 |
76 |
77 | ## [0.21.0a8] - 2019-09-12
78 | ### Added
79 | - Approximate nearest neighbor search
80 | * LSH by an additional provider, [`puffinn`](https://github.com/puffinn/puffinn) (Linux only, atm)
81 | * ANNG provided by [`ngtpy`](https://github.com/yahoojapan/NGT/) (Linux, MacOS)
82 | * Random projection forests provided by [`annoy`](https://github.com/spotify/annoy) (Linux, MacOS, Windows)
83 |
84 | ### Fixes
85 | - Several minor issues
86 | - Several documentation issues
87 |
88 |
89 | ## [0.21.0a7] - 2019-07-17
90 |
91 | The first alpha release of `scikit-hubness` to appear in this changelog.
92 | It already contains the following features:
93 |
94 | - Hubness estimation (exact or approximate)
95 | - Hubness reduction (exact or approximate)
96 | * Mutual proximity
97 | * Local scaling
98 | * DisSim Local
99 | - Approximate nearest neighbor search
100 | * HNSW provided by [nmslib](https://github.com/nmslib/nmslib)
101 | * LSH provided by [falconn](https://github.com/FALCONN-LIB/FALCONN)
102 |
103 | [Next release]: https://github.com/VarIr/scikit-hubness/compare/v0.21.2...HEAD
104 | [0.21.2]: https://github.com/VarIr/scikit-hubness/releases/tag/v0.21.2
105 | [0.21.1]: https://github.com/VarIr/scikit-hubness/releases/tag/v0.21.1
106 | [0.21.0]: https://github.com/VarIr/scikit-hubness/releases/tag/v0.21.0
107 | [0.21.0a9]: https://github.com/VarIr/scikit-hubness/releases/tag/v0.21.0-alpha.9
108 | [0.21.0a8]: https://github.com/VarIr/scikit-hubness/releases/tag/v0.21.0-alpha.8
109 | [0.21.0a7]: https://github.com/VarIr/scikit-hubness/releases/tag/v0.21.0-alpha.7
110 |
111 | [//]: # "Sections: Added, Fixed, Changed, Removed"
112 |
--------------------------------------------------------------------------------
/docs/documentation/reduction.rst:
--------------------------------------------------------------------------------
1 | =================
2 | Hubness reduction
3 | =================
4 |
5 | The :mod:`skhubness.reduction` subpackage provides several hubness reduction methods.
6 | Currently, the supported methods are
7 |
8 | - Mutual proximity (independent Gaussian distance distribution),
9 | provided by :class:`MutualProximity ` with ``method='normal'`` (default),
10 | - Mutual proximity (empiric distance distribution),
11 | provided by :class:`MutualProximity ` with ``method='empiric'``,
12 | - Local scaling,
13 | provided by :class:`LocalScaling ` with ``method='standard'`` (default),
14 | - Non-iterative contextual dissimilarity measure,
15 | provided by :class:`LocalScaling ` with ``method='nicdm'``,
16 | - DisSim Local,
17 | provided by :class:`DisSimLocal `,
18 |
19 | which represent the most successful hubness reduction methods as identified in
20 | our paper "A comprehensive empirical comparison of hubness reduction in high-dimensional spaces",
21 | KAIS (2019), `DOI `__.
22 | This survey paper also comes with an overview of how the individual methods work.
23 |
24 | There are two ways to perform hubness reduction in scikit-hubness:
25 |
26 | - Implicitly, using the classes in :mod:`skhubness.neighbors`
27 | (see :ref:`User Guide: Nearest neighbors `),
28 | - Explicitly, using the classes in :mod:`skhubness.reduction`.
29 |
30 | The former is the common approach, if you simply want to improve your learning task
31 | by hubness reduction. Most examples here also do so.
32 | The latter may, however, be more useful for researchers, who would like to
33 | investigate the hubness phenomenon itself.
34 |
35 | All hubness reducers inherit from a common base class
36 | :class:`HubnessReduction `.
37 | This abstract class defines two important methods:
38 | :meth:`fit ` and
39 | :meth:`transform `,
40 | thus allowing to transform previously unseen data after the initial fit.
41 | Most hubness reduction methods do not operate on vector data,
42 | but manipulate pre-computed distances, in order to obtain `secondary distances`.
43 | Therefore, ``fit`` and ``transform`` take neighbor graphs as input, instead of vectors.
44 | Have a look at their signatures:
45 |
46 | .. code-block:: Python3
47 |
48 | @abstractmethod
49 | def fit(self, neigh_dist, neigh_ind, X, assume_sorted, *args, **kwargs):
50 | pass # pragma: no cover
51 |
52 | @abstractmethod
53 | def transform(self, neigh_dist, neigh_ind, X, assume_sorted, return_distance=True):
54 | pass # pragma: no cover
55 |
56 | The arguments ``neigh_dist`` and ``neigh_ind`` are two arrays representing the nearest neighbor graph
57 | with shape ``(n_indexed, n_neighbors)`` during fit, and
58 | shape ``(n_query, n_neighbors)`` during transform.
59 | The i-th row in each array corresponds to the i-th object in the data set.
60 | The j-th column in ``neigh_ind`` contains the index of one of the k-nearest neighbors among the indexed objects,
61 | while the j-th column in ``neigh_dist`` contains the corresponding distance.
62 | Note that this is the same format as obtained by scikit-learn's ``kneighbors(return_distance=True)``
63 | method.
64 |
65 | This way, the user has full flexibility on how to calculate primary distances (Euclidean, cosine, KL divergence, etc).
66 | :class:`DisSimLocal ` (DSL) is the exception to this rule,
67 | because it is formulated specifically for Euclidean distances.
68 | DSL, therefore, also requires the training vectors in ``fit(..., X=X_train)``,
69 | and the test set vectors in ``transform(..., X=X_test)``.
70 | Argument ``X`` is ignored in the other hubness reduction methods.
71 |
72 | When the neighbor graph is already sorted (lowest to highest distance),
73 | ``assume_sorted=True`` should be set, so that hubness reduction methods
74 | will not sort the arrays again, thus saving computational time.
75 |
76 | Hubness reduction methods transform the primary distance graph,
77 | and return secondary distances.
78 | Note that for efficiency reasons, the returned arrays are not sorted.
79 | Please make sure to sort the arrays, if downstream tasks assume sorted arrays.
80 |
--------------------------------------------------------------------------------
/docs/documentation/auto_examples/plot_classification.rst:
--------------------------------------------------------------------------------
1 | .. note::
2 | :class: sphx-glr-download-link-note
3 |
4 | Click :ref:`here ` to download the full example code
5 | .. rst-class:: sphx-glr-example-title
6 |
7 | .. _sphx_glr_documentation_auto_examples_plot_classification.py:
8 |
9 |
10 | ================================
11 | Nearest Neighbors Classification
12 | ================================
13 | Sample usage of Nearest Neighbors classification.
14 | It will plot the decision boundaries for each class.
15 |
16 | Adapted from ``_
17 |
18 |
19 |
20 | .. rst-class:: sphx-glr-horizontal
21 |
22 |
23 | *
24 |
25 | .. image:: /documentation/auto_examples/images/sphx_glr_plot_classification_001.png
26 | :class: sphx-glr-multi-img
27 |
28 | *
29 |
30 | .. image:: /documentation/auto_examples/images/sphx_glr_plot_classification_002.png
31 | :class: sphx-glr-multi-img
32 |
33 |
34 | .. rst-class:: sphx-glr-script-out
35 |
36 | Out:
37 |
38 | .. code-block:: none
39 |
40 | /home/user/feldbauer/PycharmProjects/hubness/examples/sklearn/plot_classification.py:61: UserWarning: Matplotlib is currently using agg, which is a non-GUI backend, so cannot show the figure.
41 | plt.show()
42 |
43 |
44 |
45 |
46 |
47 | |
48 |
49 |
50 | .. code-block:: default
51 |
52 |
53 | import numpy as np
54 | import matplotlib.pyplot as plt
55 | from matplotlib.colors import ListedColormap
56 | from sklearn import datasets
57 | from skhubness.neighbors import KNeighborsClassifier
58 |
59 | n_neighbors = 15
60 |
61 | # import some data to play with
62 | iris = datasets.load_iris()
63 |
64 | # we only take the first two features. We could avoid this ugly
65 | # slicing by using a two-dim dataset
66 | X = iris.data[:, :2]
67 | y = iris.target
68 |
69 | h = .02 # step size in the mesh
70 |
71 | # Create color maps
72 | cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
73 | cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])
74 |
75 | for hubness in [None, 'mutual_proximity']:
76 | # we create an instance of Neighbours Classifier and fit the data.
77 | clf = KNeighborsClassifier(n_neighbors,
78 | hubness=hubness,
79 | weights='distance')
80 | clf.fit(X, y)
81 |
82 | # Plot the decision boundary. For that, we will assign a color to each
83 | # point in the mesh [x_min, x_max]x[y_min, y_max].
84 | x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
85 | y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
86 | xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
87 | np.arange(y_min, y_max, h))
88 | Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
89 |
90 | # Put the result into a color plot
91 | Z = Z.reshape(xx.shape)
92 | plt.figure()
93 | plt.pcolormesh(xx, yy, Z, cmap=cmap_light)
94 |
95 | # Plot also the training points
96 | plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold,
97 | edgecolor='k', s=20)
98 | plt.xlim(xx.min(), xx.max())
99 | plt.ylim(yy.min(), yy.max())
100 | plt.title("3-Class classification (k = %i, hubness = '%s')"
101 | % (n_neighbors, hubness))
102 |
103 | plt.show()
104 |
105 |
106 | .. rst-class:: sphx-glr-timing
107 |
108 | **Total running time of the script:** ( 0 minutes 25.940 seconds)
109 |
110 |
111 | .. _sphx_glr_download_documentation_auto_examples_plot_classification.py:
112 |
113 |
114 | .. only :: html
115 |
116 | .. container:: sphx-glr-footer
117 | :class: sphx-glr-footer-example
118 |
119 |
120 |
121 | .. container:: sphx-glr-download
122 |
123 | :download:`Download Python source code: plot_classification.py `
124 |
125 |
126 |
127 | .. container:: sphx-glr-download
128 |
129 | :download:`Download Jupyter notebook: plot_classification.ipynb `
130 |
131 |
132 | .. only:: html
133 |
134 | .. rst-class:: sphx-glr-signature
135 |
136 | `Gallery generated by Sphinx-Gallery `_
137 |
--------------------------------------------------------------------------------
/examples/sklearn/plot_nca_dim_reduction.py:
--------------------------------------------------------------------------------
1 | """
2 | ==============================================================
3 | Dimensionality Reduction with Neighborhood Components Analysis
4 | ==============================================================
5 |
6 | Sample usage of Neighborhood Components Analysis for dimensionality reduction.
7 |
8 | This example compares different (linear) dimensionality reduction methods
9 | applied on the Digits data set. The data set contains images of digits from
10 | 0 to 9 with approximately 180 samples of each class. Each image is of
11 | dimension 8x8 = 64, and is reduced to a two-dimensional data point.
12 |
13 | Principal Component Analysis (PCA) applied to this data identifies the
14 | combination of attributes (principal components, or directions in the
15 | feature space) that account for the most variance in the data. Here we
16 | plot the different samples on the 2 first principal components.
17 |
18 | Linear Discriminant Analysis (LDA) tries to identify attributes that
19 | account for the most variance *between classes*. In particular,
20 | LDA, in contrast to PCA, is a supervised method, using known class labels.
21 |
22 | Neighborhood Components Analysis (NCA) tries to find a feature space such
23 | that a stochastic nearest neighbor algorithm will give the best accuracy.
24 | Like LDA, it is a supervised method.
25 |
26 | One can see that NCA enforces a clustering of the data that is visually
27 | meaningful despite the large reduction in dimension.
28 |
29 | Adapted from `<https://scikit-learn.org/stable/auto_examples/neighbors/plot_nca_dim_reduction.html>`_
30 | """
31 | # License: BSD 3 clause
32 |
33 | import numpy as np
34 | import matplotlib.pyplot as plt
35 | from sklearn import datasets
36 | from sklearn.model_selection import train_test_split
37 | from sklearn.decomposition import PCA
38 | from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
39 | from sklearn.pipeline import make_pipeline
40 | from sklearn.preprocessing import StandardScaler
41 |
42 | from skhubness.neighbors import (KNeighborsClassifier,
43 | NeighborhoodComponentsAnalysis)
44 |
45 | print(__doc__)
46 |
47 | n_neighbors = 3
48 | random_state = 0
49 |
50 | # Load Digits dataset
51 | digits = datasets.load_digits()
52 | X, y = digits.data, digits.target
53 |
54 | # Split into train/test
55 | X_train, X_test, y_train, y_test = \
56 | train_test_split(X, y, test_size=0.5, stratify=y,
57 | random_state=random_state)
58 |
59 | dim = len(X[0])
60 | n_classes = len(np.unique(y))
61 |
62 | # Reduce dimension to 2 with PCA
63 | pca = make_pipeline(StandardScaler(),
64 | PCA(n_components=2, random_state=random_state))
65 |
66 | # Reduce dimension to 2 with LinearDiscriminantAnalysis
67 | lda = make_pipeline(StandardScaler(),
68 | LinearDiscriminantAnalysis(n_components=2))
69 |
70 | # Reduce dimension to 2 with NeighborhoodComponentsAnalysis
71 | nca = make_pipeline(StandardScaler(),
72 | NeighborhoodComponentsAnalysis(n_components=2,
73 | random_state=random_state))
74 |
75 | # Use a nearest neighbor classifier to evaluate the methods
76 | knn = KNeighborsClassifier(n_neighbors=n_neighbors)
77 |
78 | # Make a list of the methods to be compared
79 | dim_reduction_methods = [('PCA', pca), ('LDA', lda), ('NCA', nca)]
80 |
81 | # plt.figure()
82 | for i, (name, model) in enumerate(dim_reduction_methods):
83 | plt.figure()
84 | # plt.subplot(1, 3, i + 1, aspect=1)
85 |
86 | # Fit the method's model
87 | model.fit(X_train, y_train)
88 |
89 | # Fit a nearest neighbor classifier on the embedded training set
90 | knn.fit(model.transform(X_train), y_train)
91 |
92 | # Compute the nearest neighbor accuracy on the embedded test set
93 | acc_knn = knn.score(model.transform(X_test), y_test)
94 |
95 | # Embed the data set in 2 dimensions using the fitted model
96 | X_embedded = model.transform(X)
97 |
98 | # Plot the projected points and show the evaluation score
99 | plt.scatter(X_embedded[:, 0], X_embedded[:, 1], c=y, s=30, cmap='Set1')
100 | plt.title("{}, KNN (k={})\nTest accuracy = {:.2f}".format(name,
101 | n_neighbors,
102 | acc_knn))
103 | plt.show()
104 |
--------------------------------------------------------------------------------
/docs/documentation/auto_examples/plot_nca_dim_reduction.py:
--------------------------------------------------------------------------------
1 | """
2 | ==============================================================
3 | Dimensionality Reduction with Neighborhood Components Analysis
4 | ==============================================================
5 |
6 | Sample usage of Neighborhood Components Analysis for dimensionality reduction.
7 |
8 | This example compares different (linear) dimensionality reduction methods
9 | applied on the Digits data set. The data set contains images of digits from
10 | 0 to 9 with approximately 180 samples of each class. Each image is of
11 | dimension 8x8 = 64, and is reduced to a two-dimensional data point.
12 |
13 | Principal Component Analysis (PCA) applied to this data identifies the
14 | combination of attributes (principal components, or directions in the
15 | feature space) that account for the most variance in the data. Here we
16 | plot the different samples on the 2 first principal components.
17 |
18 | Linear Discriminant Analysis (LDA) tries to identify attributes that
19 | account for the most variance *between classes*. In particular,
20 | LDA, in contrast to PCA, is a supervised method, using known class labels.
21 |
22 | Neighborhood Components Analysis (NCA) tries to find a feature space such
23 | that a stochastic nearest neighbor algorithm will give the best accuracy.
24 | Like LDA, it is a supervised method.
25 |
26 | One can see that NCA enforces a clustering of the data that is visually
27 | meaningful despite the large reduction in dimension.
28 |
29 | Adapted from `<https://scikit-learn.org/stable/auto_examples/neighbors/plot_nca_dim_reduction.html>`_
30 | """
31 | # License: BSD 3 clause
32 |
33 | import numpy as np
34 | import matplotlib.pyplot as plt
35 | from sklearn import datasets
36 | from sklearn.model_selection import train_test_split
37 | from sklearn.decomposition import PCA
38 | from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
39 | from sklearn.pipeline import make_pipeline
40 | from sklearn.preprocessing import StandardScaler
41 |
42 | from skhubness.neighbors import (KNeighborsClassifier,
43 | NeighborhoodComponentsAnalysis)
44 |
45 | print(__doc__)
46 |
47 | n_neighbors = 3
48 | random_state = 0
49 |
50 | # Load Digits dataset
51 | digits = datasets.load_digits()
52 | X, y = digits.data, digits.target
53 |
54 | # Split into train/test
55 | X_train, X_test, y_train, y_test = \
56 | train_test_split(X, y, test_size=0.5, stratify=y,
57 | random_state=random_state)
58 |
59 | dim = len(X[0])
60 | n_classes = len(np.unique(y))
61 |
62 | # Reduce dimension to 2 with PCA
63 | pca = make_pipeline(StandardScaler(),
64 | PCA(n_components=2, random_state=random_state))
65 |
66 | # Reduce dimension to 2 with LinearDiscriminantAnalysis
67 | lda = make_pipeline(StandardScaler(),
68 | LinearDiscriminantAnalysis(n_components=2))
69 |
70 | # Reduce dimension to 2 with NeighborhoodComponentsAnalysis
71 | nca = make_pipeline(StandardScaler(),
72 | NeighborhoodComponentsAnalysis(n_components=2,
73 | random_state=random_state))
74 |
75 | # Use a nearest neighbor classifier to evaluate the methods
76 | knn = KNeighborsClassifier(n_neighbors=n_neighbors)
77 |
78 | # Make a list of the methods to be compared
79 | dim_reduction_methods = [('PCA', pca), ('LDA', lda), ('NCA', nca)]
80 |
81 | # plt.figure()
82 | for i, (name, model) in enumerate(dim_reduction_methods):
83 | plt.figure()
84 | # plt.subplot(1, 3, i + 1, aspect=1)
85 |
86 | # Fit the method's model
87 | model.fit(X_train, y_train)
88 |
89 | # Fit a nearest neighbor classifier on the embedded training set
90 | knn.fit(model.transform(X_train), y_train)
91 |
92 | # Compute the nearest neighbor accuracy on the embedded test set
93 | acc_knn = knn.score(model.transform(X_test), y_test)
94 |
95 | # Embed the data set in 2 dimensions using the fitted model
96 | X_embedded = model.transform(X)
97 |
98 | # Plot the projected points and show the evaluation score
99 | plt.scatter(X_embedded[:, 0], X_embedded[:, 1], c=y, s=30, cmap='Set1')
100 | plt.title("{}, KNN (k={})\nTest accuracy = {:.2f}".format(name,
101 | n_neighbors,
102 | acc_knn))
103 | plt.show()
104 |
--------------------------------------------------------------------------------
/docs/documentation/auto_examples_hr/olivetti_faces.rst:
--------------------------------------------------------------------------------
1 | .. note::
2 | :class: sphx-glr-download-link-note
3 |
4 | Click :ref:`here <sphx_glr_download_documentation_auto_examples_hr_olivetti_faces.py>` to download the full example code
5 | .. rst-class:: sphx-glr-example-title
6 |
7 | .. _sphx_glr_documentation_auto_examples_hr_olivetti_faces.py:
8 |
9 |
10 | =================================
11 | Face recognition (Olivetti faces)
12 | =================================
13 |
14 | This dataset contains a set of face images taken between April 1992
15 | and April 1994 at AT&T Laboratories Cambridge.
16 | Image data is typically embedded in very high-dimensional spaces,
17 | which might be prone to hubness.
18 |
19 |
20 | .. code-block:: default
21 |
22 | import numpy as np
23 | from sklearn.datasets import olivetti_faces
24 | from sklearn.model_selection import cross_val_score, StratifiedKFold, RandomizedSearchCV
25 |
26 | from skhubness import LegacyHubness
27 | from skhubness.neighbors import KNeighborsClassifier
28 |
29 | # Fetch data and have a look
30 | d = olivetti_faces.fetch_olivetti_faces()
31 | X, y = d['data'], d['target']
32 | print(f'Data shape: {X.shape}')
33 | print(f'Label shape: {y.shape}')
34 | # (400, 4096)
35 | # (400,)
36 |
37 | # The data is embedded in a high-dimensional space.
38 | # Is there hubness, and can we reduce it?
39 | for hubness in [None, 'dsl', 'ls', 'mp']:
40 | hub = LegacyHubness(k=10, hubness=hubness, return_value='k_skewness')
41 | hub.fit(X)
42 | score = hub.score()
43 | print(f'Hubness (10-skew): {score:.3f} with hubness reduction: {hubness}')
44 | # Hubness (10-skew): 1.972 with hubness reduction: None
45 | # Hubness (10-skew): 1.526 with hubness reduction: dsl
46 | # Hubness (10-skew): 0.943 with hubness reduction: ls
47 | # Hubness (10-skew): 0.184 with hubness reduction: mp
48 |
49 | # There is some hubness, and all hubness reduction methods can reduce it (to varying degree)
50 | # Let's assess the best kNN strategy and its estimated performance.
51 | cv_perf = StratifiedKFold(n_splits=5, shuffle=True, random_state=7263)
52 | cv_select = StratifiedKFold(n_splits=5, shuffle=True, random_state=32634)
53 |
54 | knn = KNeighborsClassifier(algorithm_params={'n_candidates': 100})
55 |
56 | # specify parameters and distributions to sample from
57 | param_dist = {"n_neighbors": np.arange(1, 26),
58 | "weights": ['uniform', 'distance'],
59 | "hubness": [None, 'dsl', 'ls', 'mp']}
60 |
61 | # Inner cross-validation to select best hyperparameters (incl hubness reduction method)
62 | search = RandomizedSearchCV(estimator=knn,
63 | param_distributions=param_dist,
64 | n_iter=100,
65 | cv=cv_select,
66 | random_state=2345,
67 | verbose=1)
68 |
69 | # Outer cross-validation to estimate performance
70 | score = cross_val_score(search, X, y, cv=cv_perf, verbose=1)
71 | print(f'Scores: {score}')
72 | print(f'Mean acc = {score.mean():.3f} +/- {score.std():.3f}')
73 |
74 | # Select model that maximizes accuracy
75 | search.fit(X, y)
76 |
77 | # The best model's parameters
78 | print(search.best_params_)
79 |
80 | # Does it correspond to the results of hubness reduction above?
81 | # Scores: [0.95 0.9625 1. 0.95 0.925 ]
82 | # Mean acc = 0.957 +/- 0.024
83 | # {'weights': 'distance', 'n_neighbors': 23, 'hubness': 'mp'}
84 |
85 |
86 | .. rst-class:: sphx-glr-timing
87 |
88 | **Total running time of the script:** ( 0 minutes 0.000 seconds)
89 |
90 |
91 | .. _sphx_glr_download_documentation_auto_examples_hr_olivetti_faces.py:
92 |
93 |
94 | .. only :: html
95 |
96 | .. container:: sphx-glr-footer
97 | :class: sphx-glr-footer-example
98 |
99 |
100 |
101 | .. container:: sphx-glr-download
102 |
103 | :download:`Download Python source code: olivetti_faces.py <olivetti_faces.py>`
104 |
105 |
106 |
107 | .. container:: sphx-glr-download
108 |
109 | :download:`Download Jupyter notebook: olivetti_faces.ipynb <olivetti_faces.ipynb>`
110 |
111 |
112 | .. only:: html
113 |
114 | .. rst-class:: sphx-glr-signature
115 |
116 | `Gallery generated by Sphinx-Gallery <https://sphinx-gallery.github.io>`_
117 |
--------------------------------------------------------------------------------
/docs/documentation/auto_examples/plot_multioutput_face_completion.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {
7 | "collapsed": false
8 | },
9 | "outputs": [],
10 | "source": [
11 | "%matplotlib inline"
12 | ]
13 | },
14 | {
15 | "cell_type": "markdown",
16 | "metadata": {},
17 | "source": [
18 | "\n# Face completion with a multi-output estimators\n\n\nThis example shows the use of multi-output estimator to complete images.\nThe goal is to predict the lower half of a face given its upper half.\n\nThe first column of images shows true faces. The next columns illustrate\nhow extremely randomized trees, linear regression, ridge regression,\nand k nearest neighbors with or without hubness reduction\ncomplete the lower half of those faces.\n\n\nAdapted from ``_\n"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": null,
24 | "metadata": {
25 | "collapsed": false
26 | },
27 | "outputs": [],
28 | "source": [
29 | "print(__doc__)\n\nimport numpy as np\nimport matplotlib.pyplot as plt\n\nfrom sklearn.datasets import fetch_olivetti_faces\nfrom sklearn.utils.validation import check_random_state\n\nfrom sklearn.ensemble import ExtraTreesRegressor\nfrom sklearn.linear_model import LinearRegression\nfrom sklearn.linear_model import RidgeCV\n\nfrom skhubness.neighbors import KNeighborsRegressor\n\n# Load the faces datasets\ndata = fetch_olivetti_faces()\ntargets = data.target\n\ndata = data.images.reshape((len(data.images), -1))\ntrain = data[targets < 30]\ntest = data[targets >= 30] # Test on independent people\n\n# Test on a subset of people\nn_faces = 5\nrng = check_random_state(4)\nface_ids = rng.randint(test.shape[0], size=(n_faces, ))\ntest = test[face_ids, :]\n\nn_pixels = data.shape[1]\n# Upper half of the faces\nX_train = train[:, :(n_pixels + 1) // 2]\n# Lower half of the faces\ny_train = train[:, n_pixels // 2:]\nX_test = test[:, :(n_pixels + 1) // 2]\ny_test = test[:, n_pixels // 2:]\n\n# Fit estimators\nESTIMATORS = {\n \"Extra trees\": ExtraTreesRegressor(n_estimators=10, max_features=32,\n random_state=0),\n \"k-NN\": KNeighborsRegressor(weights='distance'),\n \"k-NN MP\": KNeighborsRegressor(hubness='mp',\n hubness_params={'method': 'normal'},\n weights='distance'),\n \"Linear regression\": LinearRegression(),\n \"Ridge\": RidgeCV(),\n}\n\ny_test_predict = dict()\nfor name, estimator in ESTIMATORS.items():\n estimator.fit(X_train, y_train)\n y_test_predict[name] = estimator.predict(X_test)\n\n# Plot the completed faces\nimage_shape = (64, 64)\n\nn_cols = 1 + len(ESTIMATORS)\nplt.figure(figsize=(2. 
* n_cols, 2.26 * n_faces))\nplt.suptitle(\"Face completion with multi-output estimators\", size=16)\n\nfor i in range(n_faces):\n true_face = np.hstack((X_test[i], y_test[i]))\n\n if i:\n sub = plt.subplot(n_faces, n_cols, i * n_cols + 1)\n else:\n sub = plt.subplot(n_faces, n_cols, i * n_cols + 1,\n title=\"true faces\")\n\n sub.axis(\"off\")\n sub.imshow(true_face.reshape(image_shape),\n cmap=plt.cm.gray,\n interpolation=\"nearest\")\n\n for j, est in enumerate(sorted(ESTIMATORS)):\n completed_face = np.hstack((X_test[i], y_test_predict[est][i]))\n\n if i:\n sub = plt.subplot(n_faces, n_cols, i * n_cols + 2 + j)\n\n else:\n sub = plt.subplot(n_faces, n_cols, i * n_cols + 2 + j,\n title=est)\n\n sub.axis(\"off\")\n sub.imshow(completed_face.reshape(image_shape),\n cmap=plt.cm.gray,\n interpolation=\"nearest\")\n\nplt.show()"
30 | ]
31 | }
32 | ],
33 | "metadata": {
34 | "kernelspec": {
35 | "display_name": "Python 3",
36 | "language": "python",
37 | "name": "python3"
38 | },
39 | "language_info": {
40 | "codemirror_mode": {
41 | "name": "ipython",
42 | "version": 3
43 | },
44 | "file_extension": ".py",
45 | "mimetype": "text/x-python",
46 | "name": "python",
47 | "nbconvert_exporter": "python",
48 | "pygments_lexer": "ipython3",
49 | "version": "3.7.4"
50 | }
51 | },
52 | "nbformat": 4,
53 | "nbformat_minor": 0
54 | }
--------------------------------------------------------------------------------
/examples/approximate_neighbors/word_embeddings.py:
--------------------------------------------------------------------------------
1 | """
2 | =============================
3 | Retrieving GLOVE word vectors
4 | =============================
5 |
6 | In this example we will retrieve similar words from
7 | GLOVE embeddings with an ANNG graph.
8 |
9 | Precomputed ground-truth nearest neighbors are available
10 | from `ANN benchmarks <https://github.com/erikbern/ann-benchmarks>`__.
11 | """
12 |
13 | # For this example, the `h5py` package is required in addition to the requirements of scikit-hubness.
14 | # You may install it from PyPI by the following command (if you're in an IPython/Jupyter environment):
15 | # !pip install h5py
16 |
17 | import numpy as np
18 | import h5py
19 | from skhubness.neighbors import NearestNeighbors
20 |
21 | # Download the dataset with the following command.
22 | # If the dataset is already available in the current working dir, you can skip this:
23 | # !wget http://ann-benchmarks.com/glove-100-angular.hdf5
24 | f = h5py.File('glove-100-angular.hdf5', 'r')
25 |
26 | # Extract the split and ground-truth
27 | X_train = f['train']
28 | X_test = f['test']
29 | neigh_true = f['neighbors']
30 | dist = f['distances']
31 |
32 | # How many objects have we got?
33 | for k in f.keys():
34 | print(f'{k}: shape = {f[k].shape}')
35 |
36 | # APPROXIMATE NEAREST NEIGHBOR SEARCH
37 | # In order to retrieve most similar words from the GLOVE embeddings,
38 | # we use the unsupervised `skhubness.neighbors.NearestNeighbors` class.
39 | # The (approximate) nearest neighbor algorithm is set to LegacyNNG by passing `algorithm='nng'`.
40 | # We can pass additional parameters to `LegacyNNG` via the `algorithm_params` dict.
41 | # Here we set `n_jobs=8` to enable parallelism.
42 | # Create the nearest neighbor index
43 | nn_plain = NearestNeighbors(n_neighbors=100,
44 | algorithm='nng',
45 | algorithm_params={'n_candidates': 1_000,
46 | 'index_dir': 'auto',
47 | 'n_jobs': 8},
48 | verbose=2,
49 | )
50 | nn_plain.fit(X_train)
51 |
52 | # Note that LegacyNNG must save its index. By setting `index_dir='auto'`,
53 | # LegacyNNG will try to save it to shared memory, if available, otherwise to $TMP.
54 | # This index is NOT removed automatically, as one will typically want to build an index once and use it often.
55 | # Retrieve nearest neighbors for each test object
56 | neigh_pred_plain = nn_plain.kneighbors(X_test,
57 | n_neighbors=100,
58 | return_distance=False)
59 |
60 | # Calculate the recall per test object
61 | recalled_plain = [np.intersect1d(neigh_true[i], neigh_pred_plain)
62 | for i in range(len(X_test))]
63 | recall_plain = np.array([recalled_plain[i].size / neigh_true.shape[1]
64 | for i in range(len(X_test))])
65 |
66 | # Statistics
67 | print(f'Mean = {recall_plain.mean():.4f}, '
68 | f'stdev = {recall_plain.std():.4f}')
69 |
70 |
71 | # ANN with HUBNESS REDUCTION
72 | # Here we set `n_candidates=1000`, so that for each query,
73 | # 1000 neighbors will be retrieved first by `LegacyNNG`,
74 | # that are subsequently refined by hubness reduction.
75 | # Hubness reduction is performed by local scaling as specified with `hubness='ls'`.
76 | # Creating the NN index with hubness reduction enabled
77 | nn = NearestNeighbors(n_neighbors=100,
78 | algorithm='nng',
79 | algorithm_params={'n_candidates': 1_000,
80 | 'n_jobs': 8},
81 | hubness='ls',
82 | verbose=2,
83 | )
84 | nn.fit(X_train)
85 |
86 | # Retrieve nearest neighbors for each test object
87 | neigh_pred = nn.kneighbors(X_test,
88 | n_neighbors=100,
89 | return_distance=False)
90 |
91 | # Measure recall per object and on average
92 | recalled = [np.intersect1d(neigh_true[i], neigh_pred)
93 | for i in range(len(X_test))]
94 | recall = np.array([recalled[i].size / neigh_true.shape[1]
95 | for i in range(len(X_test))])
96 | print(f'Mean = {recall.mean():.4f}, '
97 | f'stdev = {recall.std():.4f}')
98 |
99 | # If the second results are significantly better than the first,
100 | # this could indicate that the chosen ANN method is more prone
101 | # to hubness than exact NN, which might be an interesting research question.
102 |
--------------------------------------------------------------------------------
/docs/documentation/auto_examples_ann/word_embeddings.py:
--------------------------------------------------------------------------------
1 | """
2 | =============================
3 | Retrieving GLOVE word vectors
4 | =============================
5 |
6 | In this example we will retrieve similar words from
7 | GLOVE embeddings with an ANNG graph.
8 |
9 | Precomputed ground-truth nearest neighbors are available
10 | from `ANN benchmarks <https://github.com/erikbern/ann-benchmarks>`__.
11 | """
12 |
13 | # For this example, the `h5py` package is required in addition to the requirements of scikit-hubness.
14 | # You may install it from PyPI by the following command (if you're in an IPython/Jupyter environment):
15 | # !pip install h5py
16 |
17 | import numpy as np
18 | import h5py
19 | from skhubness.neighbors import NearestNeighbors
20 |
21 | # Download the dataset with the following command.
22 | # If the dataset is already available in the current working dir, you can skip this:
23 | # !wget http://ann-benchmarks.com/glove-100-angular.hdf5
24 | f = h5py.File('glove-100-angular.hdf5', 'r')
25 |
26 | # Extract the split and ground-truth
27 | X_train = f['train']
28 | X_test = f['test']
29 | neigh_true = f['neighbors']
30 | dist = f['distances']
31 |
32 | # How many objects have we got?
33 | for k in f.keys():
34 | print(f'{k}: shape = {f[k].shape}')
35 |
36 | # APPROXIMATE NEAREST NEIGHBOR SEARCH
37 | # In order to retrieve most similar words from the GLOVE embeddings,
38 | # we use the unsupervised `skhubness.neighbors.NearestNeighbors` class.
39 | # The (approximate) nearest neighbor algorithm is set to LegacyNNG by passing `algorithm='nng'`.
40 | # We can pass additional parameters to `LegacyNNG` via the `algorithm_params` dict.
41 | # Here we set `n_jobs=8` to enable parallelism.
42 | # Create the nearest neighbor index
43 | nn_plain = NearestNeighbors(n_neighbors=100,
44 | algorithm='nng',
45 | algorithm_params={'n_candidates': 1_000,
46 | 'index_dir': 'auto',
47 | 'n_jobs': 8},
48 | verbose=2,
49 | )
50 | nn_plain.fit(X_train)
51 |
52 | # Note that LegacyNNG must save its index. By setting `index_dir='auto'`,
53 | # LegacyNNG will try to save it to shared memory, if available, otherwise to $TMP.
54 | # This index is NOT removed automatically, as one will typically want to build an index once and use it often.
55 | # Retrieve nearest neighbors for each test object
56 | neigh_pred_plain = nn_plain.kneighbors(X_test,
57 | n_neighbors=100,
58 | return_distance=False)
59 |
60 | # Calculate the recall per test object
61 | recalled_plain = [np.intersect1d(neigh_true[i], neigh_pred_plain)
62 | for i in range(len(X_test))]
63 | recall_plain = np.array([recalled_plain[i].size / neigh_true.shape[1]
64 | for i in range(len(X_test))])
65 |
66 | # Statistics
67 | print(f'Mean = {recall_plain.mean():.4f}, '
68 | f'stdev = {recall_plain.std():.4f}')
69 |
70 |
71 | # ANN with HUBNESS REDUCTION
72 | # Here we set `n_candidates=1000`, so that for each query,
73 | # 1000 neighbors will be retrieved first by `LegacyNNG`,
74 | # that are subsequently refined by hubness reduction.
75 | # Hubness reduction is performed by local scaling as specified with `hubness='ls'`.
76 | # Creating the NN index with hubness reduction enabled
77 | nn = NearestNeighbors(n_neighbors=100,
78 | algorithm='nng',
79 | algorithm_params={'n_candidates': 1_000,
80 | 'n_jobs': 8},
81 | hubness='ls',
82 | verbose=2,
83 | )
84 | nn.fit(X_train)
85 |
86 | # Retrieve nearest neighbors for each test object
87 | neigh_pred = nn.kneighbors(X_test,
88 | n_neighbors=100,
89 | return_distance=False)
90 |
91 | # Measure recall per object and on average
92 | recalled = [np.intersect1d(neigh_true[i], neigh_pred)
93 | for i in range(len(X_test))]
94 | recall = np.array([recalled[i].size / neigh_true.shape[1]
95 | for i in range(len(X_test))])
96 | print(f'Mean = {recall.mean():.4f}, '
97 | f'stdev = {recall.std():.4f}')
98 |
99 | # If the second results are significantly better than the first,
100 | # this could indicate that the chosen ANN method is more prone
101 | # to hubness than exact NN, which might be an interesting research question.
102 |
--------------------------------------------------------------------------------
/docs/documentation/auto_examples/plot_nearest_centroid.rst:
--------------------------------------------------------------------------------
1 | .. note::
2 | :class: sphx-glr-download-link-note
3 |
4 | Click :ref:`here <sphx_glr_download_documentation_auto_examples_plot_nearest_centroid.py>` to download the full example code
5 | .. rst-class:: sphx-glr-example-title
6 |
7 | .. _sphx_glr_documentation_auto_examples_plot_nearest_centroid.py:
8 |
9 |
10 | ===============================
11 | Nearest Centroid Classification
12 | ===============================
13 |
14 | Sample usage of Nearest Centroid classification.
15 | It will plot the decision boundaries for each class.
16 |
17 | Note that no hubness reduction is currently implemented for centroids.
18 | However, `hubness.neighbors` retains all the features of `sklearn.neighbors`,
19 | in order to act as a full drop-in replacement.
20 |
21 | Adapted from `<https://scikit-learn.org/stable/auto_examples/neighbors/plot_nearest_centroid.html>`_
22 |
23 |
24 |
25 | .. rst-class:: sphx-glr-horizontal
26 |
27 |
28 | *
29 |
30 | .. image:: /documentation/auto_examples/images/sphx_glr_plot_nearest_centroid_001.png
31 | :class: sphx-glr-multi-img
32 |
33 | *
34 |
35 | .. image:: /documentation/auto_examples/images/sphx_glr_plot_nearest_centroid_002.png
36 | :class: sphx-glr-multi-img
37 |
38 |
39 | .. rst-class:: sphx-glr-script-out
40 |
41 | Out:
42 |
43 | .. code-block:: none
44 |
45 |
46 | None 0.8133333333333334
47 | 0.2 0.82
48 | /home/user/feldbauer/PycharmProjects/hubness/examples/sklearn/plot_nearest_centroid.py:64: UserWarning: Matplotlib is currently using agg, which is a non-GUI backend, so cannot show the figure.
49 | plt.show()
50 |
51 |
52 |
53 |
54 |
55 | |
56 |
57 |
58 | .. code-block:: default
59 |
60 | print(__doc__)
61 |
62 | import numpy as np
63 | import matplotlib.pyplot as plt
64 | from matplotlib.colors import ListedColormap
65 | from sklearn import datasets
66 | from skhubness.neighbors import NearestCentroid
67 |
68 | n_neighbors = 15
69 |
70 | # import some data to play with
71 | iris = datasets.load_iris()
72 | # we only take the first two features. We could avoid this ugly
73 | # slicing by using a two-dim dataset
74 | X = iris.data[:, :2]
75 | y = iris.target
76 |
77 | h = .02 # step size in the mesh
78 |
79 | # Create color maps
80 | cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
81 | cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])
82 |
83 | for shrinkage in [None, .2]:
84 | # we create an instance of Neighbours Classifier and fit the data.
85 | clf = NearestCentroid(shrink_threshold=shrinkage)
86 | clf.fit(X, y)
87 | y_pred = clf.predict(X)
88 | print(shrinkage, np.mean(y == y_pred))
89 | # Plot the decision boundary. For that, we will assign a color to each
90 | # point in the mesh [x_min, x_max]x[y_min, y_max].
91 | x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
92 | y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
93 | xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
94 | np.arange(y_min, y_max, h))
95 | Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
96 |
97 | # Put the result into a color plot
98 | Z = Z.reshape(xx.shape)
99 | plt.figure()
100 | plt.pcolormesh(xx, yy, Z, cmap=cmap_light)
101 |
102 | # Plot also the training points
103 | plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold,
104 | edgecolor='k', s=20)
105 | plt.title("3-Class classification (shrink_threshold=%r)"
106 | % shrinkage)
107 | plt.axis('tight')
108 |
109 | plt.show()
110 |
111 |
112 | .. rst-class:: sphx-glr-timing
113 |
114 | **Total running time of the script:** ( 0 minutes 0.737 seconds)
115 |
116 |
117 | .. _sphx_glr_download_documentation_auto_examples_plot_nearest_centroid.py:
118 |
119 |
120 | .. only :: html
121 |
122 | .. container:: sphx-glr-footer
123 | :class: sphx-glr-footer-example
124 |
125 |
126 |
127 | .. container:: sphx-glr-download
128 |
129 | :download:`Download Python source code: plot_nearest_centroid.py <plot_nearest_centroid.py>`
130 |
131 |
132 |
133 | .. container:: sphx-glr-download
134 |
135 | :download:`Download Jupyter notebook: plot_nearest_centroid.ipynb <plot_nearest_centroid.ipynb>`
136 |
137 |
138 | .. only:: html
139 |
140 | .. rst-class:: sphx-glr-signature
141 |
142 | `Gallery generated by Sphinx-Gallery <https://sphinx-gallery.github.io>`_
143 |
--------------------------------------------------------------------------------
/docs/documentation/auto_examples/plot_nca_dim_reduction.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {
7 | "collapsed": false
8 | },
9 | "outputs": [],
10 | "source": [
11 | "%matplotlib inline"
12 | ]
13 | },
14 | {
15 | "cell_type": "markdown",
16 | "metadata": {},
17 | "source": [
18 | "\n# Dimensionality Reduction with Neighborhood Components Analysis\n\n\nSample usage of Neighborhood Components Analysis for dimensionality reduction.\n\nThis example compares different (linear) dimensionality reduction methods\napplied on the Digits data set. The data set contains images of digits from\n0 to 9 with approximately 180 samples of each class. Each image is of\ndimension 8x8 = 64, and is reduced to a two-dimensional data point.\n\nPrincipal Component Analysis (PCA) applied to this data identifies the\ncombination of attributes (principal components, or directions in the\nfeature space) that account for the most variance in the data. Here we\nplot the different samples on the 2 first principal components.\n\nLinear Discriminant Analysis (LDA) tries to identify attributes that\naccount for the most variance *between classes*. In particular,\nLDA, in contrast to PCA, is a supervised method, using known class labels.\n\nNeighborhood Components Analysis (NCA) tries to find a feature space such\nthat a stochastic nearest neighbor algorithm will give the best accuracy.\nLike LDA, it is a supervised method.\n\nOne can see that NCA enforces a clustering of the data that is visually\nmeaningful despite the large reduction in dimension.\n\nAdapted from ``_\n"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": null,
24 | "metadata": {
25 | "collapsed": false
26 | },
27 | "outputs": [],
28 | "source": [
29 | "# License: BSD 3 clause\n\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom sklearn import datasets\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.decomposition import PCA\nfrom sklearn.discriminant_analysis import LinearDiscriminantAnalysis\nfrom sklearn.pipeline import make_pipeline\nfrom sklearn.preprocessing import StandardScaler\n\nfrom skhubness.neighbors import (KNeighborsClassifier,\n NeighborhoodComponentsAnalysis)\n\nprint(__doc__)\n\nn_neighbors = 3\nrandom_state = 0\n\n# Load Digits dataset\ndigits = datasets.load_digits()\nX, y = digits.data, digits.target\n\n# Split into train/test\nX_train, X_test, y_train, y_test = \\\n train_test_split(X, y, test_size=0.5, stratify=y,\n random_state=random_state)\n\ndim = len(X[0])\nn_classes = len(np.unique(y))\n\n# Reduce dimension to 2 with PCA\npca = make_pipeline(StandardScaler(),\n PCA(n_components=2, random_state=random_state))\n\n# Reduce dimension to 2 with LinearDiscriminantAnalysis\nlda = make_pipeline(StandardScaler(),\n LinearDiscriminantAnalysis(n_components=2))\n\n# Reduce dimension to 2 with NeighborhoodComponentAnalysis\nnca = make_pipeline(StandardScaler(),\n NeighborhoodComponentsAnalysis(n_components=2,\n random_state=random_state))\n\n# Use a nearest neighbor classifier to evaluate the methods\nknn = KNeighborsClassifier(n_neighbors=n_neighbors)\n\n# Make a list of the methods to be compared\ndim_reduction_methods = [('PCA', pca), ('LDA', lda), ('NCA', nca)]\n\n# plt.figure()\nfor i, (name, model) in enumerate(dim_reduction_methods):\n plt.figure()\n # plt.subplot(1, 3, i + 1, aspect=1)\n\n # Fit the method's model\n model.fit(X_train, y_train)\n\n # Fit a nearest neighbor classifier on the embedded training set\n knn.fit(model.transform(X_train), y_train)\n\n # Compute the nearest neighbor accuracy on the embedded test set\n acc_knn = knn.score(model.transform(X_test), y_test)\n\n # Embed the data set in 2 dimensions using the fitted model\n X_embedded = 
model.transform(X)\n\n # Plot the projected points and show the evaluation score\n plt.scatter(X_embedded[:, 0], X_embedded[:, 1], c=y, s=30, cmap='Set1')\n plt.title(\"{}, KNN (k={})\\nTest accuracy = {:.2f}\".format(name,\n n_neighbors,\n acc_knn))\nplt.show()"
30 | ]
31 | }
32 | ],
33 | "metadata": {
34 | "kernelspec": {
35 | "display_name": "Python 3",
36 | "language": "python",
37 | "name": "python3"
38 | },
39 | "language_info": {
40 | "codemirror_mode": {
41 | "name": "ipython",
42 | "version": 3
43 | },
44 | "file_extension": ".py",
45 | "mimetype": "text/x-python",
46 | "name": "python",
47 | "nbconvert_exporter": "python",
48 | "pygments_lexer": "ipython3",
49 | "version": "3.7.4"
50 | }
51 | },
52 | "nbformat": 4,
53 | "nbformat_minor": 0
54 | }
--------------------------------------------------------------------------------