├── docs ├── documentation │ ├── ahr.rst │ ├── ann.rst │ ├── hr.rst │ ├── sklearn.rst │ ├── auto_examples │ │ ├── plot_lle_digits.py.md5 │ │ ├── plot_regression.py.md5 │ │ ├── plot_classification.py.md5 │ │ ├── plot_nearest_centroid.py.md5 │ │ ├── plot_nca_classification.py.md5 │ │ ├── plot_nca_dim_reduction.py.md5 │ │ ├── plot_multioutput_face_completion.py.md5 │ │ ├── auto_examples_jupyter.zip │ │ ├── auto_examples_python.zip │ │ ├── images │ │ │ ├── sphx_glr_plot_lle_digits_001.png │ │ │ ├── sphx_glr_plot_lle_digits_002.png │ │ │ ├── sphx_glr_plot_lle_digits_003.png │ │ │ ├── sphx_glr_plot_lle_digits_004.png │ │ │ ├── sphx_glr_plot_lle_digits_005.png │ │ │ ├── sphx_glr_plot_lle_digits_006.png │ │ │ ├── sphx_glr_plot_lle_digits_007.png │ │ │ ├── sphx_glr_plot_lle_digits_008.png │ │ │ ├── sphx_glr_plot_lle_digits_009.png │ │ │ ├── sphx_glr_plot_lle_digits_010.png │ │ │ ├── sphx_glr_plot_lle_digits_011.png │ │ │ ├── sphx_glr_plot_lle_digits_012.png │ │ │ ├── sphx_glr_plot_lle_digits_013.png │ │ │ ├── sphx_glr_plot_lle_digits_014.png │ │ │ ├── sphx_glr_plot_lle_digits_015.png │ │ │ ├── sphx_glr_plot_lle_digits_016.png │ │ │ ├── sphx_glr_plot_regression_001.png │ │ │ ├── sphx_glr_plot_classification_001.png │ │ │ ├── sphx_glr_plot_classification_002.png │ │ │ ├── sphx_glr_plot_nearest_centroid_001.png │ │ │ ├── sphx_glr_plot_nearest_centroid_002.png │ │ │ ├── sphx_glr_plot_nca_classification_001.png │ │ │ ├── sphx_glr_plot_nca_classification_002.png │ │ │ ├── sphx_glr_plot_nca_classification_003.png │ │ │ ├── sphx_glr_plot_nca_classification_004.png │ │ │ ├── sphx_glr_plot_nca_classification_005.png │ │ │ ├── sphx_glr_plot_nca_classification_006.png │ │ │ ├── sphx_glr_plot_nca_dim_reduction_001.png │ │ │ ├── sphx_glr_plot_nca_dim_reduction_002.png │ │ │ ├── sphx_glr_plot_nca_dim_reduction_003.png │ │ │ ├── thumb │ │ │ │ ├── sphx_glr_plot_lle_digits_thumb.png │ │ │ │ ├── sphx_glr_plot_regression_thumb.png │ │ │ │ ├── sphx_glr_plot_classification_thumb.png │ │ │ │ ├── sphx_glr_plot_nearest_centroid_thumb.png │ │ │ │ ├── sphx_glr_plot_nca_classification_thumb.png │ │ │ │ ├── sphx_glr_plot_nca_dim_reduction_thumb.png │ │ │ │ └── sphx_glr_plot_multioutput_face_completion_thumb.png │ │ │ └── sphx_glr_plot_multioutput_face_completion_001.png │ │ ├── sg_execution_times.rst │ │ ├── plot_regression.py │ │ ├── plot_classification.py │ │ ├── plot_nearest_centroid.py │ │ ├── plot_regression.ipynb │ │ ├── plot_classification.ipynb │ │ ├── plot_nearest_centroid.ipynb │ │ ├── plot_multioutput_face_completion.py │ │ ├── plot_regression.rst │ │ ├── plot_classification.rst │ │ ├── plot_nca_dim_reduction.py │ │ ├── plot_multioutput_face_completion.ipynb │ │ ├── plot_nearest_centroid.rst │ │ └── plot_nca_dim_reduction.ipynb │ ├── auto_examples_hr │ │ ├── auto_examples_hr_python.zip │ │ ├── auto_examples_hr_jupyter.zip │ │ ├── images │ │ │ └── thumb │ │ │ │ ├── sphx_glr_pipelines_thumb.png │ │ │ │ └── sphx_glr_olivetti_faces_thumb.png │ │ ├── index.rst │ │ ├── pipelines.py │ │ ├── olivetti_faces.py │ │ ├── pipelines.ipynb │ │ ├── pipelines.rst │ │ ├── olivetti_faces.ipynb │ │ └── olivetti_faces.rst │ ├── auto_examples_ahr │ │ ├── auto_examples_ahr_jupyter.zip │ │ ├── auto_examples_ahr_python.zip │ │ ├── images │ │ │ └── thumb │ │ │ │ ├── sphx_glr_reusing_index_thumb.png │ │ │ │ └── sphx_glr_high_dim_gaussian_thumb.png │ │ ├── reusing_index.py │ │ ├── high_dim_gaussian.py │ │ ├── index.rst │ │ ├── reusing_index.ipynb │ │ ├── high_dim_gaussian.ipynb │ │ ├── reusing_index.rst │ │ └── high_dim_gaussian.rst │ ├── 
auto_examples_ann │ │ ├── auto_examples_ann_jupyter.zip │ │ ├── auto_examples_ann_python.zip │ │ ├── images │ │ │ └── thumb │ │ │ │ └── sphx_glr_word_embeddings_thumb.png │ │ ├── index.rst │ │ └── word_embeddings.py │ ├── examples.rst │ ├── user_guide.rst │ ├── history.rst │ ├── nearestneighbors.rst │ ├── documentation.rst │ └── reduction.rst ├── Makefile ├── make.bat ├── getting_started │ ├── installation.rst │ └── example.rst ├── github_link.py ├── index.rst └── changelog.md ├── skhubness ├── utils │ ├── __init__.py │ ├── tests │ │ ├── __init__.py │ │ └── test_io.py │ ├── check.py │ ├── multiprocessing.py │ └── io.py ├── data │ ├── tests │ │ ├── __init__.py │ │ └── test_load_datasets.py │ ├── __init__.py │ ├── dexter │ │ ├── ABOUT │ │ └── dexter_train.labels │ └── load_dataset.py ├── analysis │ ├── tests │ │ └── __init__.py │ └── __init__.py ├── neighbors │ ├── tests │ │ ├── __init__.py │ │ └── test_neighbors.py │ ├── __init__.py │ └── approximate_neighbors.py ├── reduction │ ├── tests │ │ ├── __init__.py │ │ ├── test_local_scaling.py │ │ └── test_hubness_reduction.py │ ├── _base.py │ └── __init__.py └── __init__.py ├── .flake8 ├── paper └── arxiv │ └── scikit-hubness_arxiv_v1.pdf ├── MANIFEST.in ├── requirements.txt ├── requirements-win.txt ├── examples ├── approximate_neighbors │ ├── README.rst │ └── word_embeddings.py ├── approximate_hub_red │ ├── README.rst │ ├── reusing_index.py │ └── high_dim_gaussian.py ├── hubness_reduction │ ├── README.rst │ ├── pipelines.py │ └── olivetti_faces.py └── sklearn │ ├── README.rst │ ├── plot_regression.py │ ├── plot_classification.py │ ├── plot_nearest_centroid.py │ ├── plot_multioutput_face_completion.py │ └── plot_nca_dim_reduction.py ├── requirements-rtd.txt ├── .coveragerc ├── .readthedocs.yml ├── pyproject.toml ├── WARRANTY.txt ├── scripts ├── install-puffinn.sh └── install-ngt.sh ├── LICENSE.txt ├── .github └── workflows │ └── scikit-hubness_ci.yml ├── .gitignore ├── setup.cfg └── CODE_OF_CONDUCT.md /docs/documentation/ahr.rst: -------------------------------------------------------------------------------- 1 | .. include:: auto_examples_ahr/index.rst 2 | -------------------------------------------------------------------------------- /docs/documentation/ann.rst: -------------------------------------------------------------------------------- 1 | .. include:: auto_examples_ann/index.rst 2 | -------------------------------------------------------------------------------- /docs/documentation/hr.rst: -------------------------------------------------------------------------------- 1 | .. include:: auto_examples_hr/index.rst 2 | -------------------------------------------------------------------------------- /docs/documentation/sklearn.rst: -------------------------------------------------------------------------------- 1 | .. 
include:: auto_examples/index.rst 2 | -------------------------------------------------------------------------------- /skhubness/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: BSD-3-Clause 2 | -------------------------------------------------------------------------------- /skhubness/data/tests/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: BSD-3-Clause 2 | -------------------------------------------------------------------------------- /skhubness/utils/tests/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: BSD-3-Clause 2 | -------------------------------------------------------------------------------- /skhubness/analysis/tests/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: BSD-3-Clause 2 | -------------------------------------------------------------------------------- /skhubness/neighbors/tests/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: BSD-3-Clause 2 | -------------------------------------------------------------------------------- /skhubness/reduction/tests/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: BSD-3-Clause 2 | -------------------------------------------------------------------------------- /docs/documentation/auto_examples/plot_lle_digits.py.md5: -------------------------------------------------------------------------------- 1 | af9f3e15361795b55a753e531924945c -------------------------------------------------------------------------------- /docs/documentation/auto_examples/plot_regression.py.md5: -------------------------------------------------------------------------------- 1 | bb057885f6f41ce374c6001535beee34 -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 120 3 | exclude = *puffinn/include/external/ffht* 4 | -------------------------------------------------------------------------------- /docs/documentation/auto_examples/plot_classification.py.md5: -------------------------------------------------------------------------------- 1 | 828dcf172d84ae7101cc889cf5c7c9f8 -------------------------------------------------------------------------------- /docs/documentation/auto_examples/plot_nearest_centroid.py.md5: -------------------------------------------------------------------------------- 1 | f82e0922b095569b290ff698edf738ae -------------------------------------------------------------------------------- /docs/documentation/auto_examples/plot_nca_classification.py.md5: -------------------------------------------------------------------------------- 1 | 92a38f10df6d7ae167988498b8907ef5 -------------------------------------------------------------------------------- /docs/documentation/auto_examples/plot_nca_dim_reduction.py.md5: -------------------------------------------------------------------------------- 1 | f825086405653531d6cb420f49988d4c -------------------------------------------------------------------------------- /docs/documentation/auto_examples/plot_multioutput_face_completion.py.md5: 
-------------------------------------------------------------------------------- 1 | dfd8de51f7a147dc438a3fbf7fcd091a -------------------------------------------------------------------------------- /paper/arxiv/scikit-hubness_arxiv_v1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/paper/arxiv/scikit-hubness_arxiv_v1.pdf -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.txt 2 | include *.md 3 | recursive-include docs * 4 | recursive-include skhubness * 5 | include skhubness/data/dexter/* 6 | -------------------------------------------------------------------------------- /docs/documentation/auto_examples/auto_examples_jupyter.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples/auto_examples_jupyter.zip -------------------------------------------------------------------------------- /docs/documentation/auto_examples/auto_examples_python.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples/auto_examples_python.zip -------------------------------------------------------------------------------- /docs/documentation/auto_examples_hr/auto_examples_hr_python.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples_hr/auto_examples_hr_python.zip -------------------------------------------------------------------------------- /docs/documentation/auto_examples_ahr/auto_examples_ahr_jupyter.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples_ahr/auto_examples_ahr_jupyter.zip -------------------------------------------------------------------------------- /docs/documentation/auto_examples_ahr/auto_examples_ahr_python.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples_ahr/auto_examples_ahr_python.zip -------------------------------------------------------------------------------- /docs/documentation/auto_examples_ann/auto_examples_ann_jupyter.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples_ann/auto_examples_ann_jupyter.zip -------------------------------------------------------------------------------- /docs/documentation/auto_examples_ann/auto_examples_ann_python.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples_ann/auto_examples_ann_python.zip -------------------------------------------------------------------------------- /docs/documentation/auto_examples_hr/auto_examples_hr_jupyter.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples_hr/auto_examples_hr_jupyter.zip 
-------------------------------------------------------------------------------- /docs/documentation/auto_examples/images/sphx_glr_plot_lle_digits_001.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples/images/sphx_glr_plot_lle_digits_001.png -------------------------------------------------------------------------------- /docs/documentation/auto_examples/images/sphx_glr_plot_lle_digits_002.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples/images/sphx_glr_plot_lle_digits_002.png -------------------------------------------------------------------------------- /docs/documentation/auto_examples/images/sphx_glr_plot_lle_digits_003.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples/images/sphx_glr_plot_lle_digits_003.png -------------------------------------------------------------------------------- /docs/documentation/auto_examples/images/sphx_glr_plot_lle_digits_004.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples/images/sphx_glr_plot_lle_digits_004.png -------------------------------------------------------------------------------- /docs/documentation/auto_examples/images/sphx_glr_plot_lle_digits_005.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples/images/sphx_glr_plot_lle_digits_005.png -------------------------------------------------------------------------------- /docs/documentation/auto_examples/images/sphx_glr_plot_lle_digits_006.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples/images/sphx_glr_plot_lle_digits_006.png -------------------------------------------------------------------------------- /docs/documentation/auto_examples/images/sphx_glr_plot_lle_digits_007.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples/images/sphx_glr_plot_lle_digits_007.png -------------------------------------------------------------------------------- /docs/documentation/auto_examples/images/sphx_glr_plot_lle_digits_008.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples/images/sphx_glr_plot_lle_digits_008.png -------------------------------------------------------------------------------- /docs/documentation/auto_examples/images/sphx_glr_plot_lle_digits_009.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples/images/sphx_glr_plot_lle_digits_009.png -------------------------------------------------------------------------------- /docs/documentation/auto_examples/images/sphx_glr_plot_lle_digits_010.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples/images/sphx_glr_plot_lle_digits_010.png -------------------------------------------------------------------------------- /docs/documentation/auto_examples/images/sphx_glr_plot_lle_digits_011.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples/images/sphx_glr_plot_lle_digits_011.png -------------------------------------------------------------------------------- /docs/documentation/auto_examples/images/sphx_glr_plot_lle_digits_012.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples/images/sphx_glr_plot_lle_digits_012.png -------------------------------------------------------------------------------- /docs/documentation/auto_examples/images/sphx_glr_plot_lle_digits_013.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples/images/sphx_glr_plot_lle_digits_013.png -------------------------------------------------------------------------------- /docs/documentation/auto_examples/images/sphx_glr_plot_lle_digits_014.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples/images/sphx_glr_plot_lle_digits_014.png -------------------------------------------------------------------------------- /docs/documentation/auto_examples/images/sphx_glr_plot_lle_digits_015.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples/images/sphx_glr_plot_lle_digits_015.png -------------------------------------------------------------------------------- /docs/documentation/auto_examples/images/sphx_glr_plot_lle_digits_016.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples/images/sphx_glr_plot_lle_digits_016.png -------------------------------------------------------------------------------- /docs/documentation/auto_examples/images/sphx_glr_plot_regression_001.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples/images/sphx_glr_plot_regression_001.png -------------------------------------------------------------------------------- /docs/documentation/auto_examples/images/sphx_glr_plot_classification_001.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples/images/sphx_glr_plot_classification_001.png -------------------------------------------------------------------------------- /docs/documentation/auto_examples/images/sphx_glr_plot_classification_002.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples/images/sphx_glr_plot_classification_002.png -------------------------------------------------------------------------------- /docs/documentation/auto_examples/images/sphx_glr_plot_nearest_centroid_001.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples/images/sphx_glr_plot_nearest_centroid_001.png -------------------------------------------------------------------------------- /docs/documentation/auto_examples/images/sphx_glr_plot_nearest_centroid_002.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples/images/sphx_glr_plot_nearest_centroid_002.png -------------------------------------------------------------------------------- /docs/documentation/auto_examples_hr/images/thumb/sphx_glr_pipelines_thumb.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples_hr/images/thumb/sphx_glr_pipelines_thumb.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | scipy>=1.2 3 | scikit-learn 4 | pandas 5 | joblib>=0.12 6 | tqdm 7 | numba 8 | annoy 9 | nmslib 10 | ngt>=1.8 11 | pytest 12 | pytest-cov 13 | codecov 14 | nose 15 | flake8 16 | -------------------------------------------------------------------------------- /docs/documentation/auto_examples/images/sphx_glr_plot_nca_classification_001.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples/images/sphx_glr_plot_nca_classification_001.png -------------------------------------------------------------------------------- /docs/documentation/auto_examples/images/sphx_glr_plot_nca_classification_002.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples/images/sphx_glr_plot_nca_classification_002.png -------------------------------------------------------------------------------- /docs/documentation/auto_examples/images/sphx_glr_plot_nca_classification_003.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples/images/sphx_glr_plot_nca_classification_003.png -------------------------------------------------------------------------------- /docs/documentation/auto_examples/images/sphx_glr_plot_nca_classification_004.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples/images/sphx_glr_plot_nca_classification_004.png -------------------------------------------------------------------------------- /docs/documentation/auto_examples/images/sphx_glr_plot_nca_classification_005.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples/images/sphx_glr_plot_nca_classification_005.png -------------------------------------------------------------------------------- /docs/documentation/auto_examples/images/sphx_glr_plot_nca_classification_006.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples/images/sphx_glr_plot_nca_classification_006.png -------------------------------------------------------------------------------- /docs/documentation/auto_examples/images/sphx_glr_plot_nca_dim_reduction_001.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples/images/sphx_glr_plot_nca_dim_reduction_001.png -------------------------------------------------------------------------------- /docs/documentation/auto_examples/images/sphx_glr_plot_nca_dim_reduction_002.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples/images/sphx_glr_plot_nca_dim_reduction_002.png -------------------------------------------------------------------------------- /docs/documentation/auto_examples/images/sphx_glr_plot_nca_dim_reduction_003.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples/images/sphx_glr_plot_nca_dim_reduction_003.png -------------------------------------------------------------------------------- /docs/documentation/auto_examples/images/thumb/sphx_glr_plot_lle_digits_thumb.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples/images/thumb/sphx_glr_plot_lle_digits_thumb.png -------------------------------------------------------------------------------- /docs/documentation/auto_examples/images/thumb/sphx_glr_plot_regression_thumb.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples/images/thumb/sphx_glr_plot_regression_thumb.png -------------------------------------------------------------------------------- /docs/documentation/auto_examples_ahr/images/thumb/sphx_glr_reusing_index_thumb.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples_ahr/images/thumb/sphx_glr_reusing_index_thumb.png -------------------------------------------------------------------------------- /docs/documentation/auto_examples_hr/images/thumb/sphx_glr_olivetti_faces_thumb.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples_hr/images/thumb/sphx_glr_olivetti_faces_thumb.png -------------------------------------------------------------------------------- /docs/documentation/auto_examples/images/thumb/sphx_glr_plot_classification_thumb.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples/images/thumb/sphx_glr_plot_classification_thumb.png -------------------------------------------------------------------------------- /docs/documentation/auto_examples/images/thumb/sphx_glr_plot_nearest_centroid_thumb.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples/images/thumb/sphx_glr_plot_nearest_centroid_thumb.png -------------------------------------------------------------------------------- /docs/documentation/auto_examples_ahr/images/thumb/sphx_glr_high_dim_gaussian_thumb.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples_ahr/images/thumb/sphx_glr_high_dim_gaussian_thumb.png -------------------------------------------------------------------------------- /docs/documentation/auto_examples_ann/images/thumb/sphx_glr_word_embeddings_thumb.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples_ann/images/thumb/sphx_glr_word_embeddings_thumb.png -------------------------------------------------------------------------------- /docs/documentation/auto_examples/images/thumb/sphx_glr_plot_nca_classification_thumb.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples/images/thumb/sphx_glr_plot_nca_classification_thumb.png -------------------------------------------------------------------------------- /docs/documentation/auto_examples/images/thumb/sphx_glr_plot_nca_dim_reduction_thumb.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples/images/thumb/sphx_glr_plot_nca_dim_reduction_thumb.png -------------------------------------------------------------------------------- /requirements-win.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | scipy>=1.2 3 | scikit-learn 4 | pandas 5 | joblib>=0.12 6 | tqdm 7 | nmslib 8 | annoy 9 | # ngt # DOES NOT support Windows 10 | pytest 11 | pytest-cov 12 | codecov 13 | nose 14 | flake8 15 | -------------------------------------------------------------------------------- /skhubness/data/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: BSD-3-Clause 2 | """ 3 | The :mod:`skhubness.data` package provides example data sets. 
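At present, this comprises the dexter text classification data set (see skhubness/data/dexter/ABOUT).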
4 | """ 5 | from .load_dataset import load_dexter 6 | 7 | __all__ = ['load_dexter'] 8 | -------------------------------------------------------------------------------- /docs/documentation/auto_examples/images/sphx_glr_plot_multioutput_face_completion_001.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples/images/sphx_glr_plot_multioutput_face_completion_001.png -------------------------------------------------------------------------------- /docs/documentation/auto_examples/images/thumb/sphx_glr_plot_multioutput_face_completion_thumb.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VarIr/scikit-hubness/HEAD/docs/documentation/auto_examples/images/thumb/sphx_glr_plot_multioutput_face_completion_thumb.png -------------------------------------------------------------------------------- /examples/approximate_neighbors/README.rst: -------------------------------------------------------------------------------- 1 | ============================================ 2 | Example: Approximate nearest neighbor search 3 | ============================================ 4 | 5 | This example shows how to perform approximate nearest neighbor search. 6 | -------------------------------------------------------------------------------- /examples/approximate_hub_red/README.rst: -------------------------------------------------------------------------------- 1 | ======================================== 2 | Example: Approximate hubness reduction 3 | ======================================== 4 | 5 | These examples show how to combine approximate nearest neighbor search and hubness reduction. 6 | -------------------------------------------------------------------------------- /examples/hubness_reduction/README.rst: -------------------------------------------------------------------------------- 1 | ============================================ 2 | Example: Hubness reduction 3 | ============================================ 4 | 5 | These examples show how to perform hubness reduction in kNN classification 6 | in (nested) cross-validation and pipelines. 7 | -------------------------------------------------------------------------------- /skhubness/data/dexter/ABOUT: -------------------------------------------------------------------------------- 1 | DEXTER is a text classification problem in a bag-of-word representation. This 2 | is a two-class classification problem with sparse continuous input variables. 3 | This dataset is one of five datasets of the NIPS 2003 feature selection 4 | challenge. 5 | 6 | http://archive.ics.uci.edu/ml/datasets/Dexter 7 | -------------------------------------------------------------------------------- /skhubness/analysis/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # SPDX-License-Identifier: BSD-3-Clause 3 | 4 | """ 5 | The :mod:`skhubness.analysis` package provides methods for measuring hubness. 
6 | """ 7 | from .estimation import Hubness, VALID_HUBNESS_MEASURES 8 | 9 | __all__ = [ 10 | "Hubness", 11 | "VALID_HUBNESS_MEASURES", 12 | ] 13 | -------------------------------------------------------------------------------- /requirements-rtd.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | scipy>=1.2 3 | scikit-learn 4 | pandas 5 | joblib>=0.12 6 | tqdm 7 | pytest 8 | pytest-cov 9 | codecov 10 | nose 11 | flake8 12 | git+https://github.com/readthedocs/readthedocs-sphinx-search@master # TODO update to PyPI when it becomes available 13 | sphinx>=2.1 14 | sphinx-automodapi 15 | sphinx-gallery 16 | sphinx-pdj-theme 17 | mock 18 | graphviz 19 | numpydoc 20 | -------------------------------------------------------------------------------- /docs/documentation/examples.rst: -------------------------------------------------------------------------------- 1 | ========== 2 | Examples 3 | ========== 4 | 5 | In this section, we provide usage examples for ``skhubness``. 6 | 7 | .. toctree:: 8 | :maxdepth: 2 9 | :caption: Contents: 10 | 11 | Example: Hubness reduction
12 | Example: Approximate nearest neighbor search 13 | Example: Approximate hubness reduction 14 | Example: From sklearn to skhubness 15 | -------------------------------------------------------------------------------- /skhubness/data/tests/test_load_datasets.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: BSD-3-Clause 2 | 3 | from skhubness.data import load_dexter 4 | 5 | 6 | def test_load_dexter(): 7 | X, y = load_dexter() 8 | n_samples = 300 9 | n_features = 20_000 10 | assert X.shape == (n_samples, n_features), f'Wrong shape: X.shape = {X.shape}, should be (300, 20_000).' 11 | assert y.shape == (n_samples, ), f'Wrong shape: y.shape = {y.shape}, should be (300, ).' 12 | -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | omit = 3 | setup.py 4 | branch = True 5 | parallel = True 6 | concurrency = multiprocessing 7 | 8 | [report] 9 | exclude_lines = 10 | pragma: no cover 11 | def __repr__ 12 | raise AssertionError 13 | raise NotImplementedError 14 | raise ValueError 15 | raise TypeError 16 | warnings.warn 17 | only on win32 18 | sys.platform == 'win32' 19 | except ImportError 20 | ModuleNotFoundError 21 | if __name__ == .__main__.: -------------------------------------------------------------------------------- /docs/documentation/user_guide.rst: -------------------------------------------------------------------------------- 1 | ========== 2 | User guide 3 | ========== 4 | 5 | Welcome to ``scikit-hubness``! 6 | Here we describe the core functionality of the package 7 | (hubness analysis, hubness reduction, neighbor search), 8 | and provide several usage examples. 9 | 10 | 11 | .. toctree:: 12 | :maxdepth: 2 13 | :caption: Contents: 14 | 15 | Core concepts 16 | Hubness analysis 17 | Hubness reduction 18 | Nearest neighbors 19 | Examples 20 | -------------------------------------------------------------------------------- /skhubness/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # SPDX-License-Identifier: BSD-3-Clause 3 | 4 | """ Python package for nearest neighbor retrieval in high-dimensional space.""" 5 | 6 | __version__ = '0.30.0a1' 7 | 8 | from . import analysis 9 | from . import data 10 | from .analysis.estimation import Hubness 11 | from . import neighbors 12 | from . import reduction 13 | from . import utils 14 | 15 | 16 | __all__ = [ 17 | "analysis", 18 | "data", 19 | "neighbors", 20 | "reduction", 21 | "utils", 22 | "Hubness", 23 | ] 24 | -------------------------------------------------------------------------------- /skhubness/utils/check.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # SPDX-License-Identifier: BSD-3-Clause 3 | # Author: Roman Feldbauer 4 | import numpy as np 5 | 6 | __all__ = [ 7 | "check_n_candidates", 8 | ] 9 | 10 | 11 | def check_n_candidates(n_candidates): 12 | # Check the n_neighbors parameter 13 | if n_candidates <= 0: 14 | raise ValueError(f"Expected n_neighbors > 0. 
Got {n_candidates:d}") 15 | if not np.issubdtype(type(n_candidates), np.integer): 16 | raise TypeError(f"n_neighbors does not take {type(n_candidates)} value, enter integer value") 17 | return n_candidates 18 | -------------------------------------------------------------------------------- /skhubness/reduction/_base.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # SPDX-License-Identifier: BSD-3-Clause 3 | 4 | from abc import ABC, abstractmethod 5 | 6 | from sklearn.base import BaseEstimator 7 | 8 | 9 | class HubnessReduction(BaseEstimator, ABC): 10 | """ Base class for hubness reduction in a sparse neighbors graph. """ 11 | @abstractmethod 12 | def __init__(self, **kwargs): 13 | # TODO whether to include/exclude self distances, or let the user decide... 14 | pass 15 | 16 | @abstractmethod 17 | def fit(self, X, y=None, **kwargs): 18 | pass 19 | 20 | @abstractmethod 21 | def transform(self, X, y=None, **kwargs): 22 | pass 23 | -------------------------------------------------------------------------------- /skhubness/utils/tests/test_io.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # SPDX-License-Identifier: BSD-3-Clause 3 | # Author: Roman Feldbauer 4 | import os 5 | import platform 6 | import pytest 7 | from skhubness.utils.io import create_tempfile_preferably_in_dir 8 | 9 | 10 | @pytest.mark.parametrize('directory', [None, '/does/not/exist/kluawev']) 11 | @pytest.mark.parametrize('persistent', [True, False]) 12 | def test_tempfile(directory, persistent): 13 | f = create_tempfile_preferably_in_dir(directory=directory, persistent=persistent) 14 | assert isinstance(f, str) 15 | if persistent and platform.system() != 'Windows': # locked by running process on Windows 16 | os.remove(f) 17 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Build documentation in the docs/ directory with Sphinx 9 | sphinx: 10 | configuration: docs/conf.py 11 | 12 | # Build documentation with MkDocs 13 | #mkdocs: 14 | # configuration: mkdocs.yml 15 | 16 | # Optionally build your docs in additional formats such as PDF and ePub 17 | formats: all 18 | 19 | # Optionally set the version of Python and requirements required to build your docs 20 | python: 21 | version: 3.7 22 | install: 23 | - requirements: requirements-rtd.txt 24 | system_packages: false 25 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
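# For example, "make html" delegates to sphinx-build and renders the documentation into _build/html.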
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /skhubness/reduction/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # SPDX-License-Identifier: BSD-3-Clause 3 | 4 | """ 5 | The :mod:`skhubness.reduction` package provides methods for hubness reduction. 6 | """ 7 | 8 | from ._mutual_proximity import MutualProximity 9 | from ._local_scaling import LocalScaling 10 | from ._dis_sim import DisSimLocal 11 | 12 | #: Supported hubness reduction algorithms 13 | hubness_algorithms = [ 14 | "mp", 15 | "ls", 16 | "dsl", 17 | ] 18 | hubness_algorithms_long = [ 19 | "mutual_proximity", 20 | "local_scaling", 21 | "dis_sim_local", 22 | ] 23 | 24 | 25 | __all__ = [ 26 | "LocalScaling", 27 | "MutualProximity", 28 | "DisSimLocal", 29 | "hubness_algorithms", 30 | ] 31 | -------------------------------------------------------------------------------- /examples/sklearn/README.rst: -------------------------------------------------------------------------------- 1 | ================================================ 2 | scikit-learn examples adapted for scikit-hubness 3 | ================================================ 4 | 5 | Examples of using :mod:`skhubness.neighbors` 6 | as a drop-in replacement for :mod:`sklearn.neighbors`. 7 | 8 | These examples are taken from scikit-learn and demonstrate the ease of transition 9 | from ``sklearn.neighbors`` to ``skhubness.neighbors``. 10 | You will find that many examples require no more than modifying an import line, 11 | and/or adding one argument when instantiating an estimator. 12 | 13 | Note that these examples are not intended to demonstrate improved learning performance 14 | due to hubness reduction (the data are rather low-dimensional). 15 | -------------------------------------------------------------------------------- /skhubness/neighbors/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # SPDX-License-Identifier: BSD-3-Clause 3 | """ 4 | The :mod:`skhubness.neighbors` package provides wrappers for various 5 | approximate nearest neighbor packages. These are compatible with the 6 | scikit-learn `KNeighborsTransformer`. 
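They can therefore be used wherever scikit-learn accepts a precomputed sparse neighbors graph, for example in pipelines.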
7 | """ 8 | from ._annoy import AnnoyTransformer, LegacyRandomProjectionTree 9 | from ._nmslib import NMSlibTransformer, LegacyHNSW 10 | from ._puffinn import PuffinnTransformer, LegacyPuffinn 11 | from ._ngt import NGTTransformer, LegacyNNG 12 | 13 | 14 | __all__ = [ 15 | "AnnoyTransformer", 16 | "LegacyHNSW", 17 | "LegacyNNG", 18 | "LegacyPuffinn", 19 | "LegacyRandomProjectionTree", 20 | "NGTTransformer", 21 | "NMSlibTransformer", 22 | "PuffinnTransformer", 23 | ] 24 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools", "wheel", "pybind11"] 3 | 4 | [tool.black] 5 | line-length = 88 6 | target_version = ['py38', 'py39', 'py310'] 7 | experimental_string_processing = true 8 | exclude = """ 9 | /( 10 | \\.eggs # exclude a few common directories in the 11 | | \\.git # root of the project 12 | | \\.mypy_cache 13 | | \\.vscode 14 | | examples 15 | | build 16 | | dist 17 | | doc/tutorial 18 | | doc/_build 19 | | doc/auto_examples 20 | )/ 21 | """ 22 | 23 | [tool.coverage.run] 24 | omit = ["setup.py", ] 25 | branch = true 26 | parallel = true 27 | concurrency = ["multiprocessing", ] 28 | 29 | [tool.coverage.report] 30 | exclude_lines = [ 31 | "pragma: no cover", 32 | "def __repr__", 33 | "raise AssertionError", 34 | "only on win32", 35 | "sys.platform == 'win32'", 36 | ] 37 | -------------------------------------------------------------------------------- /WARRANTY.txt: -------------------------------------------------------------------------------- 1 | THIS SOURCE CODE IS SUPPLIED “AS IS” WITHOUT WAR- 2 | RANTY OF ANY KIND, AND ITS AUTHOR AND THE JOURNAL OF 3 | MACHINE LEARNING RESEARCH (JMLR) AND JMLR’S PUBLISH- 4 | ERS AND DISTRIBUTORS, DISCLAIM ANY AND ALL WARRANTIES, 5 | INCLUDING BUT NOT LIMITED TO ANY IMPLIED WARRANTIES 6 | OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PUR- 7 | POSE, AND ANY WARRANTIES OR NON INFRINGEMENT. THE USER 8 | ASSUMES ALL LIABILITY AND RESPONSIBILITY FOR USE OF THIS 9 | SOURCE CODE, AND NEITHER THE AUTHOR NOR JMLR, NOR 10 | JMLR’S PUBLISHERS AND DISTRIBUTORS, WILL BE LIABLE FOR 11 | DAMAGES OF ANY KIND RESULTING FROM ITS USE. Without lim- 12 | iting the generality of the foregoing, neither the author, nor JMLR, nor 13 | JMLR’s publishers and distributors, warrant that the Source Code will be 14 | error-free, will operate without interruption, or will meet the needs of the 15 | user -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 
23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /skhubness/utils/multiprocessing.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # SPDX-License-Identifier: BSD-3-Clause 3 | # Author: Roman Feldbauer 4 | from multiprocessing import cpu_count 5 | 6 | __all__ = [ 7 | "validate_n_jobs", 8 | ] 9 | 10 | 11 | def register_parallel_pytest_cov(): 12 | try: 13 | from pytest_cov.embed import cleanup_on_sigterm 14 | except ImportError: 15 | pass 16 | else: 17 | cleanup_on_sigterm() 18 | 19 | 20 | def validate_n_jobs(n_jobs): 21 | """ Handle special integers and non-integer `n_jobs` values. """ 22 | if n_jobs is None: 23 | n_jobs = 1 24 | elif n_jobs == -1: 25 | n_jobs = cpu_count() 26 | elif n_jobs < -1 or n_jobs == 0: 27 | raise ValueError(f"Number of parallel processes 'n_jobs' must be " 28 | f"a positive integer, or ``-1`` to use all local" 29 | f" CPU cores. Was {n_jobs} instead.") 30 | return n_jobs 31 | -------------------------------------------------------------------------------- /docs/documentation/auto_examples/sg_execution_times.rst: -------------------------------------------------------------------------------- 1 | 2 | :orphan: 3 | 4 | .. _sphx_glr_documentation_auto_examples_sg_execution_times: 5 | 6 | Computation times 7 | ================= 8 | **00:25.940** total execution time for **documentation_auto_examples** files: 9 | 10 | - **00:25.940**: :ref:`sphx_glr_documentation_auto_examples_plot_classification.py` (``plot_classification.py``) 11 | - **00:00.000**: :ref:`sphx_glr_documentation_auto_examples_plot_lle_digits.py` (``plot_lle_digits.py``) 12 | - **00:00.000**: :ref:`sphx_glr_documentation_auto_examples_plot_multioutput_face_completion.py` (``plot_multioutput_face_completion.py``) 13 | - **00:00.000**: :ref:`sphx_glr_documentation_auto_examples_plot_nca_classification.py` (``plot_nca_classification.py``) 14 | - **00:00.000**: :ref:`sphx_glr_documentation_auto_examples_plot_nca_dim_reduction.py` (``plot_nca_dim_reduction.py``) 15 | - **00:00.000**: :ref:`sphx_glr_documentation_auto_examples_plot_nearest_centroid.py` (``plot_nearest_centroid.py``) 16 | - **00:00.000**: :ref:`sphx_glr_documentation_auto_examples_plot_regression.py` (``plot_regression.py``) 17 | -------------------------------------------------------------------------------- /skhubness/data/load_dataset.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: BSD-3-Clause 2 | 3 | import os 4 | import numpy as np 5 | 6 | __all__ = ["load_dexter"] 7 | 8 | 9 | def load_dexter() -> (np.ndarray, np.ndarray): 10 | """Load the example data set (dexter). 
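Dexter is a two-class text classification problem in a bag-of-words representation, with 300 samples and 20,000 sparse continuous features (see skhubness/data/dexter/ABOUT).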
11 | 12 | Returns 13 | ------- 14 | X, y : ndarray, ndarray 15 | Vector data, and class labels 16 | """ 17 | n = 300 18 | dim = 20000 19 | 20 | # Read class labels 21 | dexter_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "dexter") 22 | dexter_labels = os.path.join(dexter_path, "dexter_train.labels") 23 | dexter_vectors = os.path.join(dexter_path, "dexter_train.data") 24 | y = np.loadtxt(dexter_labels) 25 | 26 | # Read data 27 | X = np.zeros((n, dim)) 28 | with open(dexter_vectors, mode="r") as fid: 29 | data = fid.readlines() 30 | row = 0 31 | for line in data: 32 | line = line.strip().split() # line now contains pairs of dim:val 33 | for word in line: 34 | col, val = word.split(":") 35 | X[row][int(col) - 1] = int(val) 36 | row += 1 37 | 38 | return X, y 39 | -------------------------------------------------------------------------------- /scripts/install-puffinn.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Build external dependencies that cannot successfully install via pip or conda 3 | # If you use this file as template, don't forget to `chmod a+x newfile` 4 | 5 | set -e 6 | 7 | # Check for the operating system and install puffinn 8 | if [[ $(uname) == "Darwin" ]]; then 9 | echo "Running under Mac OS X..." 10 | git clone https://github.com/puffinn/puffinn.git 11 | cd puffinn 12 | python3 setup.py build 13 | pip install . 14 | cd .. 15 | rm -r puffinn 16 | 17 | elif [[ $(uname -s) == Linux* ]]; then 18 | echo "Running under Linux..." 19 | # Trying to install puffinn from cache, 20 | # and only build if this fails. 21 | # pip install puffinn || (\ 22 | # git clone https://github.com/puffinn/puffinn.git;\ 23 | # cd puffinn;\ 24 | # python3 setup.py build;\ 25 | # pip install . ;\ 26 | # cd ..) 27 | git clone https://github.com/puffinn/puffinn.git 28 | cd puffinn 29 | python3 setup.py build 30 | pip install . 31 | cd .. 32 | rm -r puffinn 33 | 34 | elif [[ $(uname -s) == MINGW32_NT* ]]; then 35 | echo "Running under Win x86-32" 36 | echo "Nothing to build." 37 | 38 | elif [[ $(uname -s) == MINGW64_NT* ]]; then 39 | echo "Running under Win x86-64" 40 | echo "Nothing to build." 41 | 42 | elif [[ $(uname -s) == CYGWIN* ]]; then 43 | echo "Running under Cygwin" 44 | echo "Nothing to build." 45 | 46 | fi 47 | -------------------------------------------------------------------------------- /skhubness/utils/io.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # SPDX-License-Identifier: BSD-3-Clause 3 | # Author: Roman Feldbauer 4 | import logging 5 | from tempfile import mkstemp, NamedTemporaryFile 6 | 7 | __all__ = [ 8 | "create_tempfile_preferably_in_dir", 9 | "validate_verbose", 10 | ] 11 | 12 | 13 | def create_tempfile_preferably_in_dir(suffix=None, prefix=None, directory=None, persistent: bool = False, ): 14 | """ Create a temporary file with precedence for directory if possible, in TMP otherwise. 15 | For example, this is useful to try to save into /dev/shm. 
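If ``directory`` does not exist, the file falls back to the default temporary directory, and a warning is logged.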
16 | """ 17 | temp_file = mkstemp if persistent else NamedTemporaryFile 18 | try: 19 | handle = temp_file(suffix=suffix, prefix=prefix, dir=directory) 20 | warn = False 21 | except FileNotFoundError: 22 | handle = temp_file(suffix=suffix, prefix=prefix, dir=None) 23 | warn = True 24 | 25 | # Extract the path (as string) 26 | try: 27 | path = handle.name 28 | except AttributeError: 29 | _, path = handle 30 | 31 | if warn: 32 | logging.warning(f"Could not create temp file in {directory}. " 33 | f"Instead, the path is {path}.") 34 | return path 35 | 36 | 37 | def validate_verbose(verbose): 38 | """ Handle special values for verbose parameter. """ 39 | if verbose is None: 40 | verbose = 0 41 | elif verbose < 0: 42 | verbose = 0 43 | return verbose 44 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2018-2019, the scikit-hubness developers 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | 3. Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /docs/documentation/history.rst: -------------------------------------------------------------------------------- 1 | ============================== 2 | History of ``scikit-hubness`` 3 | ============================== 4 | 5 | ``scikit-hubness`` builds upon previous software: the Hub-Toolbox. 6 | The original `Hub-Toolbox `_ 7 | was written for Matlab, and released in parallel 8 | with the release of the first hubness reduction methods in 9 | `JMLR `_. 10 | In essence, it comprises methods to reduce hubness in distance matrices. 11 | 12 | The `Hub-Toolbox for Python3 `_ 13 | is a port from Matlab to Python, 14 | which over the years got several extensions and additional functionality, 15 | such as more hubness reduction methods (Localized Centering, DisSimLocal, mp-dissim, etc.), 16 | approximate hubness reduction, and more. 
17 | The software was developed by hubness researchers for hubness research. 18 | 19 | The new ``scikit-hubness`` package is rewritten from scratch with a different goal in mind: 20 | Providing easy-to-use neighborhood-based data mining methods (classification, regression, etc.) 21 | with transparent hubness reduction. 22 | Building upon scikit-learn's ``neighbors`` package, we provide a drop-in replacement 23 | called ``skhubness.neighbors``, which offers all the functionality of ``sklearn.neighbors``, 24 | but adds additional functionality (approximate nearest neighbor search, hubness reduction). 25 | 26 | This way, we think that machine learning researchers and practitioners 27 | (many of which will be fluent in scikit-learn) 28 | can quickly and effectively employ ``scikit-hubness`` in their existing workflows, 29 | and improve learning in their high-dimensional data. 30 | -------------------------------------------------------------------------------- /docs/documentation/auto_examples_ann/index.rst: -------------------------------------------------------------------------------- 1 | :orphan: 2 | 3 | 4 | 5 | .. _sphx_glr_documentation_auto_examples_ann: 6 | 7 | ============================================ 8 | Example: Approximate nearest neighbor search 9 | ============================================ 10 | 11 | This example shows how to perform approximate nearest neighbor search. 12 | 13 | 14 | 15 | .. raw:: html 16 | 17 |
18 | 19 | .. only:: html 20 | 21 | .. figure:: /documentation/auto_examples_ann/images/thumb/sphx_glr_word_embeddings_thumb.png 22 | 23 | :ref:`sphx_glr_documentation_auto_examples_ann_word_embeddings.py` 24 | 25 | .. raw:: html 26 | 27 |
28 | 29 | 30 | .. toctree:: 31 | :hidden: 32 | 33 | /documentation/auto_examples_ann/word_embeddings 34 | .. raw:: html 35 | 36 |
37 | 38 | 39 | 40 | .. only :: html 41 | 42 | .. container:: sphx-glr-footer 43 | :class: sphx-glr-footer-gallery 44 | 45 | 46 | .. container:: sphx-glr-download 47 | 48 | :download:`Download all examples in Python source code: auto_examples_ann_python.zip ` 49 | 50 | 51 | 52 | .. container:: sphx-glr-download 53 | 54 | :download:`Download all examples in Jupyter notebooks: auto_examples_ann_jupyter.zip ` 55 | 56 | 57 | .. only:: html 58 | 59 | .. rst-class:: sphx-glr-signature 60 | 61 | `Gallery generated by Sphinx-Gallery `_ 62 | -------------------------------------------------------------------------------- /docs/documentation/nearestneighbors.rst: -------------------------------------------------------------------------------- 1 | ======================================================== 2 | Nearest neighbors 3 | ======================================================== 4 | 5 | The :mod:`skhubness.neighbors` subpackage provides several neighbors-based learning methods. 6 | It is designed as a drop-in replacement for scikit-learn's ``neighbors``. 7 | The package provides all functionality from ``sklearn.neighbors``, 8 | and adds support for transparent hubness reduction, where applicable, including 9 | 10 | - classification (e.g. :class:`KNeighborsClassifier `), 11 | - regression (e.g. :class:`RadiusNeighborsRegressor `), 12 | - unsupervised learning (e.g. :class:`NearestNeighbors `), 13 | - outlier detection (:class:`LocalOutlierFactor `), and 14 | - kNN graphs (:meth:`kneighbors_graph `). 15 | 16 | In addition, scikit-hubness provides approximate nearest neighbor (ANN) search, 17 | in order to support large data sets with millions of data objects and more. 18 | A list of currently provided ANN methods is available 19 | :ref:`here `. 20 | 21 | Hubness reduction and ANN search can be used independently or in conjunction, 22 | the latter yielding `approximate hubness reduction`. 23 | Users of scikit-learn will find that only minor modifications of their code 24 | are required to enable one or both of the above. 25 | We describe how to do so :ref:`here `. 26 | 27 | For general information and details about nearest neighbors, 28 | we refer to the excellent scikit-learn 29 | `User Guide on Nearest Neighbors `__. 30 | -------------------------------------------------------------------------------- /docs/documentation/documentation.rst: -------------------------------------------------------------------------------- 1 | ================= 2 | API Documentation 3 | ================= 4 | 5 | This is the API documentation for ``scikit-hubness``. 6 | 7 | .. _data_ref: 8 | 9 | Analysis: :mod:`skhubness.analysis` 10 | =================================== 11 | 12 | .. automodule:: skhubness.analysis 13 | :no-members: 14 | :no-inherited-members: 15 | 16 | .. currentmodule:: skhubness 17 | 18 | .. autosummary:: 19 | :nosignatures: 20 | :toctree: _autosummary 21 | 22 | analysis.LegacyHubness 23 | analysis.VALID_HUBNESS_MEASURES 24 | 25 | 26 | Neighbors: :mod:`skhubness.neighbors` 27 | ===================================== 28 | 29 | .. automodule:: skhubness.neighbors 30 | :no-members: 31 | :no-inherited-members: 32 | 33 | .. currentmodule:: skhubness 34 | 35 | .. 
autosummary:: 36 | :nosignatures: 37 | :toctree: _autosummary 38 | 39 | neighbors.BallTree 40 | neighbors.DistanceMetric 41 | neighbors.KDTree 42 | neighbors.LegacyHNSW 43 | neighbors.KNeighborsClassifier 44 | neighbors.KNeighborsRegressor 45 | neighbors.LegacyFalconn 46 | neighbors.NearestCentroid 47 | neighbors.NearestNeighbors 48 | neighbors.LegacyNNG 49 | neighbors.LegacyPuffinn 50 | neighbors.RadiusNeighborsClassifier 51 | neighbors.RadiusNeighborsRegressor 52 | neighbors.LegacyRandomProjectionTree 53 | neighbors.kneighbors_graph 54 | neighbors.radius_neighbors_graph 55 | neighbors.KernelDensity 56 | neighbors.LocalOutlierFactor 57 | neighbors.NeighborhoodComponentsAnalysis 58 | 59 | 60 | Reduction: :mod:`skhubness.reduction` 61 | ===================================== 62 | 63 | .. automodule:: skhubness.reduction 64 | :no-members: 65 | :no-inherited-members: 66 | 67 | .. currentmodule:: skhubness 68 | 69 | .. autosummary:: 70 | :nosignatures: 71 | :toctree: _autosummary 72 | 73 | reduction.MutualProximity 74 | reduction.LocalScaling 75 | reduction.DisSimLocal 76 | reduction.hubness_algorithms 77 | -------------------------------------------------------------------------------- /examples/approximate_hub_red/reusing_index.py: -------------------------------------------------------------------------------- 1 | """ 2 | ======================================== 3 | Example: Reusing index structures 4 | ======================================== 5 | 6 | This example shows how to reuse index structures. If you want to first estimate hubness, 7 | and then perform kNN, you can avoid recomputing the ANN index structure, which can be 8 | costly. 9 | """ 10 | from sklearn.datasets import make_classification 11 | from sklearn.model_selection import train_test_split 12 | 13 | from skhubness.analysis import LegacyHubness 14 | from skhubness.neighbors import KNeighborsClassifier 15 | 16 | X, y = make_classification(n_samples=100_000, 17 | n_features=500, 18 | n_informative=400, 19 | random_state=543) 20 | 21 | X_train, X_test, y_train, y_test = train_test_split(X, y, 22 | test_size=0.01, 23 | stratify=y, 24 | shuffle=True, 25 | random_state=2346) 26 | 27 | # Approximate hubness estimation: Creates LSH index and computes local scaling factors 28 | hub = LegacyHubness(k=10, 29 | return_value='robinhood', 30 | algorithm='falconn_lsh', 31 | hubness='ls', 32 | random_state=2345, 33 | shuffle_equal=False, 34 | verbose=1) 35 | hub.fit(X_train) 36 | 37 | robin_hood = hub.score(X_test) 38 | print(f'Hubness (Robin Hood): {robin_hood:.4f}') 39 | # 0.9060 40 | 41 | # Approximate hubness reduction for classification: Reuse index & factors 42 | knn = KNeighborsClassifier(n_neighbors=10, 43 | algorithm='falconn_lsh', 44 | hubness='ls', 45 | n_jobs=1) 46 | 47 | knn.fit(hub.nn_index_, y_train) # REUSE INDEX HERE 48 | acc = knn.score(X_test, y_test) 49 | print(f'Test accuracy: {acc:.3f}') 50 | # 0.959 51 | -------------------------------------------------------------------------------- /docs/documentation/auto_examples_ahr/reusing_index.py: -------------------------------------------------------------------------------- 1 | """ 2 | ======================================== 3 | Example: Reusing index structures 4 | ======================================== 5 | 6 | This example shows how to reuse index structures. If you want to first estimate hubness, 7 | and then perform kNN, you can avoid recomputing the ANN index structure, which can be 8 | costly. 
9 | """ 10 | from sklearn.datasets import make_classification 11 | from sklearn.model_selection import train_test_split 12 | 13 | from skhubness.analysis import LegacyHubness 14 | from skhubness.neighbors import KNeighborsClassifier 15 | 16 | X, y = make_classification(n_samples=100_000, 17 | n_features=500, 18 | n_informative=400, 19 | random_state=543) 20 | 21 | X_train, X_test, y_train, y_test = train_test_split(X, y, 22 | test_size=0.01, 23 | stratify=y, 24 | shuffle=True, 25 | random_state=2346) 26 | 27 | # Approximate hubness estimation: Creates LSH index and computes local scaling factors 28 | hub = LegacyHubness(k=10, 29 | return_value='robinhood', 30 | algorithm='falconn_lsh', 31 | hubness='ls', 32 | random_state=2345, 33 | shuffle_equal=False, 34 | verbose=1) 35 | hub.fit(X_train) 36 | 37 | robin_hood = hub.score(X_test) 38 | print(f'Hubness (Robin Hood): {robin_hood:.4f}') 39 | # 0.9060 40 | 41 | # Approximate hubness reduction for classification: Reuse index & factors 42 | knn = KNeighborsClassifier(n_neighbors=10, 43 | algorithm='falconn_lsh', 44 | hubness='ls', 45 | n_jobs=1) 46 | 47 | knn.fit(hub.nn_index_, y_train) # REUSE INDEX HERE 48 | acc = knn.score(X_test, y_test) 49 | print(f'Test accuracy: {acc:.3f}') 50 | # 0.959 51 | -------------------------------------------------------------------------------- /examples/approximate_hub_red/high_dim_gaussian.py: -------------------------------------------------------------------------------- 1 | """ 2 | ======================================== 3 | Example: Approximate hubness reduction 4 | ======================================== 5 | 6 | This example shows how to combine approximate nearest neighbor search and hubness reduction 7 | in order to perform approximate hubness reduction for large data sets. 
8 | """ 9 | from sklearn.datasets import make_classification 10 | from sklearn.metrics import accuracy_score 11 | from sklearn.model_selection import train_test_split 12 | 13 | from skhubness.analysis import LegacyHubness 14 | from skhubness.neighbors import KNeighborsClassifier 15 | 16 | # High-dimensional artificial data 17 | X, y = make_classification(n_samples=1_000_000, 18 | n_features=500, 19 | n_informative=400, 20 | random_state=543) 21 | 22 | X_train, X_test, y_train, y_test = train_test_split(X, y, 23 | test_size=10_000, 24 | stratify=y, 25 | shuffle=True, 26 | random_state=2346) 27 | 28 | # Approximate hubness estimation 29 | hub = LegacyHubness(k=10, 30 | return_value='robinhood', 31 | algorithm='hnsw', 32 | random_state=2345, 33 | shuffle_equal=False, 34 | n_jobs=-1, 35 | verbose=2) 36 | hub.fit(X_train) 37 | robin_hood = hub.score(X_test) 38 | print(f'Hubness (Robin Hood): {robin_hood:.3f}') 39 | # 0.944 40 | 41 | # Approximate hubness reduction for classification 42 | knn = KNeighborsClassifier(n_neighbors=10, 43 | algorithm='hnsw', 44 | hubness='ls', 45 | n_jobs=-1, 46 | verbose=2) 47 | 48 | knn.fit(X_train, y_train) 49 | y_pred = knn.predict(X_test) 50 | acc = accuracy_score(y_test, y_pred) 51 | print(f'Test accuracy: {acc:.3f}') 52 | # Test accuracy: 0.987 53 | -------------------------------------------------------------------------------- /docs/documentation/auto_examples_ahr/high_dim_gaussian.py: -------------------------------------------------------------------------------- 1 | """ 2 | ======================================== 3 | Example: Approximate hubness reduction 4 | ======================================== 5 | 6 | This example shows how to combine approximate nearest neighbor search and hubness reduction 7 | in order to perform approximate hubness reduction for large data sets. 
8 | """ 9 | from sklearn.datasets import make_classification 10 | from sklearn.metrics import accuracy_score 11 | from sklearn.model_selection import train_test_split 12 | 13 | from skhubness.analysis import LegacyHubness 14 | from skhubness.neighbors import KNeighborsClassifier 15 | 16 | # High-dimensional artificial data 17 | X, y = make_classification(n_samples=1_000_000, 18 | n_features=500, 19 | n_informative=400, 20 | random_state=543) 21 | 22 | X_train, X_test, y_train, y_test = train_test_split(X, y, 23 | test_size=10_000, 24 | stratify=y, 25 | shuffle=True, 26 | random_state=2346) 27 | 28 | # Approximate hubness estimation 29 | hub = LegacyHubness(k=10, 30 | return_value='robinhood', 31 | algorithm='hnsw', 32 | random_state=2345, 33 | shuffle_equal=False, 34 | n_jobs=-1, 35 | verbose=2) 36 | hub.fit(X_train) 37 | robin_hood = hub.score(X_test) 38 | print(f'Hubness (Robin Hood): {robin_hood:.3f}') 39 | # 0.944 40 | 41 | # Approximate hubness reduction for classification 42 | knn = KNeighborsClassifier(n_neighbors=10, 43 | algorithm='hnsw', 44 | hubness='ls', 45 | n_jobs=-1, 46 | verbose=2) 47 | 48 | knn.fit(X_train, y_train) 49 | y_pred = knn.predict(X_test) 50 | acc = accuracy_score(y_test, y_pred) 51 | print(f'Test accuracy: {acc:.3f}') 52 | # Test accuracy: 0.987 53 | -------------------------------------------------------------------------------- /examples/sklearn/plot_regression.py: -------------------------------------------------------------------------------- 1 | """ 2 | ============================ 3 | Nearest Neighbors regression 4 | ============================ 5 | 6 | Demonstrate the resolution of a regression problem 7 | using a k-Nearest Neighbor and the interpolation of the 8 | target using both barycenter and constant weights. 9 | 10 | Hubness reduction of this low-dimensional dataset 11 | shows only small effects. 
12 | 13 | Adapted from ``_ 14 | """ 15 | print(__doc__) 16 | 17 | # Author: Alexandre Gramfort 18 | # Fabian Pedregosa 19 | # 20 | # License: BSD 3 clause (C) INRIA 21 | 22 | 23 | # ############################################################################# 24 | # Generate sample data 25 | import numpy as np 26 | import matplotlib.pyplot as plt 27 | from skhubness.neighbors import KNeighborsRegressor 28 | 29 | np.random.seed(0) 30 | X = np.sort(5 * np.random.rand(40, 1), axis=0) 31 | T = np.linspace(0, 5, 500)[:, np.newaxis] 32 | y = np.sin(X).ravel() 33 | 34 | # Add noise to targets 35 | y[::5] += 1 * (0.5 - np.random.rand(8)) 36 | 37 | # ############################################################################# 38 | # Fit regression model 39 | n_neighbors = 5 40 | 41 | f = plt.figure() 42 | for i, weights in enumerate(['uniform', 'distance']): 43 | for j, hubness in enumerate([None, 'local_scaling']): 44 | knn = KNeighborsRegressor(n_neighbors, 45 | algorithm_params={'n_candidates': 39}, 46 | weights=weights, 47 | hubness=hubness) 48 | y_ = knn.fit(X, y).predict(T) 49 | 50 | plt.subplot(2, 2, i * 2 + j + 1) 51 | f.set_figheight(15) 52 | f.set_figwidth(15) 53 | plt.scatter(X, y, c='k', label='data') 54 | plt.plot(T, y_, c='g', label='prediction') 55 | plt.axis('tight') 56 | plt.legend() 57 | plt.title(f"KNeighborsRegressor (k = {n_neighbors}, weights = '{weights}', hubness = '{hubness}')") 58 | 59 | plt.tight_layout() 60 | plt.show() -------------------------------------------------------------------------------- /docs/documentation/auto_examples/plot_regression.py: -------------------------------------------------------------------------------- 1 | """ 2 | ============================ 3 | Nearest Neighbors regression 4 | ============================ 5 | 6 | Demonstrate the resolution of a regression problem 7 | using a k-Nearest Neighbor and the interpolation of the 8 | target using both barycenter and constant weights. 9 | 10 | Hubness reduction of this low-dimensional dataset 11 | shows only small effects. 
12 | 13 | Adapted from ``_ 14 | """ 15 | print(__doc__) 16 | 17 | # Author: Alexandre Gramfort 18 | # Fabian Pedregosa 19 | # 20 | # License: BSD 3 clause (C) INRIA 21 | 22 | 23 | # ############################################################################# 24 | # Generate sample data 25 | import numpy as np 26 | import matplotlib.pyplot as plt 27 | from skhubness.neighbors import KNeighborsRegressor 28 | 29 | np.random.seed(0) 30 | X = np.sort(5 * np.random.rand(40, 1), axis=0) 31 | T = np.linspace(0, 5, 500)[:, np.newaxis] 32 | y = np.sin(X).ravel() 33 | 34 | # Add noise to targets 35 | y[::5] += 1 * (0.5 - np.random.rand(8)) 36 | 37 | # ############################################################################# 38 | # Fit regression model 39 | n_neighbors = 5 40 | 41 | f = plt.figure() 42 | for i, weights in enumerate(['uniform', 'distance']): 43 | for j, hubness in enumerate([None, 'local_scaling']): 44 | knn = KNeighborsRegressor(n_neighbors, 45 | algorithm_params={'n_candidates': 39}, 46 | weights=weights, 47 | hubness=hubness) 48 | y_ = knn.fit(X, y).predict(T) 49 | 50 | plt.subplot(2, 2, i * 2 + j + 1) 51 | f.set_figheight(15) 52 | f.set_figwidth(15) 53 | plt.scatter(X, y, c='k', label='data') 54 | plt.plot(T, y_, c='g', label='prediction') 55 | plt.axis('tight') 56 | plt.legend() 57 | plt.title(f"KNeighborsRegressor (k = {n_neighbors}, weights = '{weights}', hubness = '{hubness}')") 58 | 59 | plt.tight_layout() 60 | plt.show() -------------------------------------------------------------------------------- /scripts/install-ngt.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Build external dependencies that cannot successfully install via pip or conda 3 | # If you use this file as a template, don't forget to `chmod a+x newfile` 4 | 5 | set -e 6 | 7 | # Check for the operating system and install NGT 8 | if [[ $(uname) == "Darwin" ]]; then 9 | if command -v ngt > /dev/null 2>&1 && command -v ngtq > /dev/null 2>&1 && command -v ngtqg > /dev/null 2>&1; then 10 | # This only checks for available ngt commands. Does not currently check the version. 11 | # To update NGT, this must be adapted. 12 | echo "NGT already installed" 13 | else 14 | echo "Installing NGT under Mac OS X..." 15 | /usr/bin/ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)" 16 | brew install cmake 17 | brew install gcc@9 18 | export CXX=/usr/local/bin/g++-9 19 | export CC=/usr/local/bin/gcc-9 20 | pushd /tmp/ 21 | git clone https://github.com/yahoojapan/NGT 22 | cd NGT/ 23 | mkdir build 24 | cd build/ 25 | cmake .. 26 | make 27 | sudo make install 28 | cd ../python 29 | pip install . 30 | popd 31 | rm -r /tmp/NGT 32 | fi 33 | 34 | elif [[ $(uname -s) == Linux* ]]; then 35 | if command -v ngt > /dev/null 2>&1 && command -v ngtq > /dev/null 2>&1 && command -v ngtqg > /dev/null 2>&1; then 36 | # This only checks for available ngt commands. Does not currently check the version. 37 | # To update NGT, this must be adapted. 38 | echo "Installing NGT already installed" 39 | else 40 | echo "Installing NGT under Linux..." 41 | pushd /tmp/ 42 | git clone https://github.com/yahoojapan/NGT 43 | cd NGT/ 44 | mkdir build 45 | cd build/ 46 | cmake .. 47 | make 48 | sudo make install 49 | sudo ldconfig /usr/local/lib/ 50 | cd ../python 51 | pip install .
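# The NGT Python bindings (ngtpy) are installed above from the cloned source
# tree; the remaining lines merely clean up the temporary build directory.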
52 | popd 53 | rm -r /tmp/NGT 54 | fi 55 | 56 | elif [[ $(uname -s) == MINGW32_NT* ]]; then 57 | echo "NGT not available under Win x86-32" 58 | 59 | elif [[ $(uname -s) == MINGW64_NT* ]]; then 60 | echo "NGT not available under Win x86-64" 61 | 62 | elif [[ $(uname -s) == CYGWIN* ]]; then 63 | echo "NGT not available under Cygwin" 64 | 65 | fi 66 | -------------------------------------------------------------------------------- /.github/workflows/scikit-hubness_ci.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a single version of Python 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: scikit-hubness CI 5 | 6 | on: 7 | push: 8 | branches: [ main ] 9 | pull_request: 10 | branches: [ main ] 11 | 12 | jobs: 13 | build: 14 | runs-on: ${{ matrix.os }} 15 | strategy: 16 | fail-fast: false 17 | matrix: 18 | os: [ ubuntu-latest, macos-latest, windows-latest ] 19 | python: [ "3.8" , "3.9", "3.10" ] 20 | exclude: 21 | # Building nmslib from source fails on Windows: issue #102 22 | - os: windows-latest 23 | python: "3.9" 24 | - os: windows-latest 25 | python: "3.10" 26 | 27 | steps: 28 | - uses: actions/checkout@v2 29 | - name: Set up Python 30 | uses: actions/setup-python@v2 31 | with: 32 | python-version: ${{ matrix.python }} 33 | - name: Install dependencies 34 | run: | 35 | python3 -m pip install --upgrade pip 36 | python3 -m pip install setuptools wheel pybind11 37 | - name: Install ANN packages with special care 38 | run: | 39 | scripts/install-ngt.sh 40 | scripts/install-puffinn.sh 41 | - name: Install scikit-hubness 42 | run: | 43 | echo "Running on platform.system()=$(python -c 'import platform; print(platform.system())')" 44 | python3 -m pip install .[ann,tests] 45 | - name: Lint with flake8 46 | run: | 47 | # stop the build if there are Python syntax errors or undefined names 48 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics --exit-zero 49 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 50 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 51 | - name: Test with pytest 52 | run: | 53 | pytest --cov=skhubness --cov-append 54 | - name: Test coverage 55 | run: coverage html 56 | - name: Codecov 57 | run: codecov 58 | -------------------------------------------------------------------------------- /examples/sklearn/plot_classification.py: -------------------------------------------------------------------------------- 1 | """ 2 | ================================ 3 | Nearest Neighbors Classification 4 | ================================ 5 | Sample usage of Nearest Neighbors classification. 6 | It will plot the decision boundaries for each class. 7 | 8 | Adapted from ``_ 9 | """ 10 | 11 | import numpy as np 12 | import matplotlib.pyplot as plt 13 | from matplotlib.colors import ListedColormap 14 | from sklearn import datasets 15 | from skhubness.neighbors import KNeighborsClassifier 16 | 17 | n_neighbors = 15 18 | 19 | # import some data to play with 20 | iris = datasets.load_iris() 21 | 22 | # we only take the first two features. 
We could avoid this ugly 23 | # slicing by using a two-dim dataset 24 | X = iris.data[:, :2] 25 | y = iris.target 26 | 27 | h = .02 # step size in the mesh 28 | 29 | # Create color maps 30 | cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF']) 31 | cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF']) 32 | 33 | for hubness in [None, 'mutual_proximity']: 34 | # we create an instance of Neighbours Classifier and fit the data. 35 | clf = KNeighborsClassifier(n_neighbors, 36 | hubness=hubness, 37 | weights='distance') 38 | clf.fit(X, y) 39 | 40 | # Plot the decision boundary. For that, we will assign a color to each 41 | # point in the mesh [x_min, x_max]x[y_min, y_max]. 42 | x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1 43 | y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1 44 | xx, yy = np.meshgrid(np.arange(x_min, x_max, h), 45 | np.arange(y_min, y_max, h)) 46 | Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]) 47 | 48 | # Put the result into a color plot 49 | Z = Z.reshape(xx.shape) 50 | plt.figure() 51 | plt.pcolormesh(xx, yy, Z, cmap=cmap_light) 52 | 53 | # Plot also the training points 54 | plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold, 55 | edgecolor='k', s=20) 56 | plt.xlim(xx.min(), xx.max()) 57 | plt.ylim(yy.min(), yy.max()) 58 | plt.title("3-Class classification (k = %i, hubness = '%s')" 59 | % (n_neighbors, hubness)) 60 | 61 | plt.show() 62 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Manually added 2 | dist_prev/ 3 | notebooks/ 4 | examples/playground 5 | new_test_pypi_release.bash 6 | new_pypi_release.bash 7 | coverage.html/* 8 | _autosummary/ 9 | .idea/ 10 | venv/ 11 | codemeta.json 12 | generate_joss_metadata.rb 13 | *__pycache__* 14 | *egg-info* 15 | 16 | # From github 17 | # Byte-compiled / optimized / DLL files 18 | __pycache__/ 19 | *.py[cod] 20 | *$py.class 21 | 22 | # C extensions 23 | *.so 24 | 25 | # Distribution / packaging 26 | .Python 27 | build/ 28 | develop-eggs/ 29 | dist/ 30 | downloads/ 31 | eggs/ 32 | .eggs/ 33 | lib/ 34 | lib64/ 35 | parts/ 36 | sdist/ 37 | var/ 38 | wheels/ 39 | share/python-wheels/ 40 | *.egg-info/ 41 | .installed.cfg 42 | *.egg 43 | MANIFEST 44 | 45 | # PyInstaller 46 | # Usually these files are written by a python script from a template 47 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
48 | *.manifest 49 | *.spec 50 | 51 | # Installer logs 52 | pip-log.txt 53 | pip-delete-this-directory.txt 54 | 55 | # Unit test / coverage reports 56 | htmlcov/ 57 | .tox/ 58 | .nox/ 59 | .coverage 60 | .coverage.* 61 | .cache 62 | nosetests.xml 63 | coverage.xml 64 | *.cover 65 | .hypothesis/ 66 | .pytest_cache/ 67 | 68 | # Translations 69 | *.mo 70 | *.pot 71 | 72 | # Django stuff: 73 | *.log 74 | local_settings.py 75 | db.sqlite3 76 | 77 | # Flask stuff: 78 | instance/ 79 | .webassets-cache 80 | 81 | # Scrapy stuff: 82 | .scrapy 83 | 84 | # Sphinx documentation 85 | docs/_build/ 86 | 87 | # PyBuilder 88 | target/ 89 | 90 | # Jupyter Notebook 91 | .ipynb_checkpoints 92 | 93 | # IPython 94 | profile_default/ 95 | ipython_config.py 96 | 97 | # pyenv 98 | .python-version 99 | 100 | # celery beat schedule file 101 | celerybeat-schedule 102 | 103 | # SageMath parsed files 104 | *.sage.py 105 | 106 | # Environments 107 | .env 108 | .venv 109 | env/ 110 | venv/ 111 | ENV/ 112 | env.bak/ 113 | venv.bak/ 114 | 115 | # Spyder project settings 116 | .spyderproject 117 | .spyproject 118 | 119 | # Rope project settings 120 | .ropeproject 121 | 122 | # mkdocs documentation 123 | /site 124 | 125 | # mypy 126 | .mypy_cache/ 127 | .dmypy.json 128 | dmypy.json 129 | 130 | # Pyre type checker 131 | .pyre/ 132 | -------------------------------------------------------------------------------- /docs/documentation/auto_examples/plot_classification.py: -------------------------------------------------------------------------------- 1 | """ 2 | ================================ 3 | Nearest Neighbors Classification 4 | ================================ 5 | Sample usage of Nearest Neighbors classification. 6 | It will plot the decision boundaries for each class. 7 | 8 | Adapted from ``_ 9 | """ 10 | 11 | import numpy as np 12 | import matplotlib.pyplot as plt 13 | from matplotlib.colors import ListedColormap 14 | from sklearn import datasets 15 | from skhubness.neighbors import KNeighborsClassifier 16 | 17 | n_neighbors = 15 18 | 19 | # import some data to play with 20 | iris = datasets.load_iris() 21 | 22 | # we only take the first two features. We could avoid this ugly 23 | # slicing by using a two-dim dataset 24 | X = iris.data[:, :2] 25 | y = iris.target 26 | 27 | h = .02 # step size in the mesh 28 | 29 | # Create color maps 30 | cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF']) 31 | cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF']) 32 | 33 | for hubness in [None, 'mutual_proximity']: 34 | # we create an instance of Neighbours Classifier and fit the data. 35 | clf = KNeighborsClassifier(n_neighbors, 36 | hubness=hubness, 37 | weights='distance') 38 | clf.fit(X, y) 39 | 40 | # Plot the decision boundary. For that, we will assign a color to each 41 | # point in the mesh [x_min, x_max]x[y_min, y_max]. 
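# A smaller mesh step size h would render the boundary more smoothly,
# at the cost of quadratically more grid points to predict.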
42 | x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1 43 | y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1 44 | xx, yy = np.meshgrid(np.arange(x_min, x_max, h), 45 | np.arange(y_min, y_max, h)) 46 | Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]) 47 | 48 | # Put the result into a color plot 49 | Z = Z.reshape(xx.shape) 50 | plt.figure() 51 | plt.pcolormesh(xx, yy, Z, cmap=cmap_light) 52 | 53 | # Plot also the training points 54 | plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold, 55 | edgecolor='k', s=20) 56 | plt.xlim(xx.min(), xx.max()) 57 | plt.ylim(yy.min(), yy.max()) 58 | plt.title("3-Class classification (k = %i, hubness = '%s')" 59 | % (n_neighbors, hubness)) 60 | 61 | plt.show() 62 | -------------------------------------------------------------------------------- /docs/documentation/auto_examples_hr/index.rst: -------------------------------------------------------------------------------- 1 | :orphan: 2 | 3 | 4 | 5 | .. _sphx_glr_documentation_auto_examples_hr: 6 | 7 | ============================================ 8 | Example: Hubness reduction 9 | ============================================ 10 | 11 | These examples show how to perform hubness reduction in kNN classification 12 | in (nested) cross-validation and pipelines. 13 | 14 | 15 | 16 | .. raw:: html 17 | 18 |
19 | 20 | .. only:: html 21 | 22 | .. figure:: /documentation/auto_examples_hr/images/thumb/sphx_glr_pipelines_thumb.png 23 | 24 | :ref:`sphx_glr_documentation_auto_examples_hr_pipelines.py` 25 | 26 | .. raw:: html 27 | 28 |
29 | 30 | 31 | .. toctree:: 32 | :hidden: 33 | 34 | /documentation/auto_examples_hr/pipelines 35 | 36 | .. raw:: html 37 | 38 |
39 | 40 | .. only:: html 41 | 42 | .. figure:: /documentation/auto_examples_hr/images/thumb/sphx_glr_olivetti_faces_thumb.png 43 | 44 | :ref:`sphx_glr_documentation_auto_examples_hr_olivetti_faces.py` 45 | 46 | .. raw:: html 47 | 48 |
49 | 50 | 51 | .. toctree:: 52 | :hidden: 53 | 54 | /documentation/auto_examples_hr/olivetti_faces 55 | .. raw:: html 56 | 57 |
58 | 59 | 60 | 61 | .. only :: html 62 | 63 | .. container:: sphx-glr-footer 64 | :class: sphx-glr-footer-gallery 65 | 66 | 67 | .. container:: sphx-glr-download 68 | 69 | :download:`Download all examples in Python source code: auto_examples_hr_python.zip ` 70 | 71 | 72 | 73 | .. container:: sphx-glr-download 74 | 75 | :download:`Download all examples in Jupyter notebooks: auto_examples_hr_jupyter.zip ` 76 | 77 | 78 | .. only:: html 79 | 80 | .. rst-class:: sphx-glr-signature 81 | 82 | `Gallery generated by Sphinx-Gallery `_ 83 | -------------------------------------------------------------------------------- /docs/documentation/auto_examples_ahr/index.rst: -------------------------------------------------------------------------------- 1 | :orphan: 2 | 3 | 4 | 5 | .. _sphx_glr_documentation_auto_examples_ahr: 6 | 7 | ======================================== 8 | Example: Approximate hubness reduction 9 | ======================================== 10 | 11 | These examples show how to combine approximate nearest neighbor search and hubness reduction. 12 | 13 | 14 | 15 | .. raw:: html 16 | 17 |
18 | 19 | .. only:: html 20 | 21 | .. figure:: /documentation/auto_examples_ahr/images/thumb/sphx_glr_reusing_index_thumb.png 22 | 23 | :ref:`sphx_glr_documentation_auto_examples_ahr_reusing_index.py` 24 | 25 | .. raw:: html 26 | 27 |
28 | 29 | 30 | .. toctree:: 31 | :hidden: 32 | 33 | /documentation/auto_examples_ahr/reusing_index 34 | 35 | .. raw:: html 36 | 37 |
38 | 39 | .. only:: html 40 | 41 | .. figure:: /documentation/auto_examples_ahr/images/thumb/sphx_glr_high_dim_gaussian_thumb.png 42 | 43 | :ref:`sphx_glr_documentation_auto_examples_ahr_high_dim_gaussian.py` 44 | 45 | .. raw:: html 46 | 47 |
48 | 49 | 50 | .. toctree:: 51 | :hidden: 52 | 53 | /documentation/auto_examples_ahr/high_dim_gaussian 54 | .. raw:: html 55 | 56 |
57 | 58 | 59 | 60 | .. only :: html 61 | 62 | .. container:: sphx-glr-footer 63 | :class: sphx-glr-footer-gallery 64 | 65 | 66 | .. container:: sphx-glr-download 67 | 68 | :download:`Download all examples in Python source code: auto_examples_ahr_python.zip ` 69 | 70 | 71 | 72 | .. container:: sphx-glr-download 73 | 74 | :download:`Download all examples in Jupyter notebooks: auto_examples_ahr_jupyter.zip ` 75 | 76 | 77 | .. only:: html 78 | 79 | .. rst-class:: sphx-glr-signature 80 | 81 | `Gallery generated by Sphinx-Gallery `_ 82 | -------------------------------------------------------------------------------- /examples/sklearn/plot_nearest_centroid.py: -------------------------------------------------------------------------------- 1 | """ 2 | =============================== 3 | Nearest Centroid Classification 4 | =============================== 5 | 6 | Sample usage of Nearest Centroid classification. 7 | It will plot the decision boundaries for each class. 8 | 9 | Note that no hubness reduction is currently implemented for centroids. 10 | However, `skhubness.neighbors` retains all the features of `sklearn.neighbors`, 11 | in order to act as a full drop-in replacement. 12 | 13 | Adapted from ``_ 14 | """ 15 | print(__doc__) 16 | 17 | import numpy as np 18 | import matplotlib.pyplot as plt 19 | from matplotlib.colors import ListedColormap 20 | from sklearn import datasets 21 | from skhubness.neighbors import NearestCentroid 22 | 23 | n_neighbors = 15 24 | 25 | # import some data to play with 26 | iris = datasets.load_iris() 27 | # we only take the first two features. We could avoid this ugly 28 | # slicing by using a two-dim dataset 29 | X = iris.data[:, :2] 30 | y = iris.target 31 | 32 | h = .02 # step size in the mesh 33 | 34 | # Create color maps 35 | cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF']) 36 | cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF']) 37 | 38 | for shrinkage in [None, .2]: 39 | # we create an instance of Neighbours Classifier and fit the data. 40 | clf = NearestCentroid(shrink_threshold=shrinkage) 41 | clf.fit(X, y) 42 | y_pred = clf.predict(X) 43 | print(shrinkage, np.mean(y == y_pred)) 44 | # Plot the decision boundary. For that, we will assign a color to each 45 | # point in the mesh [x_min, x_max]x[y_min, y_max]. 46 | x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1 47 | y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1 48 | xx, yy = np.meshgrid(np.arange(x_min, x_max, h), 49 | np.arange(y_min, y_max, h)) 50 | Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]) 51 | 52 | # Put the result into a color plot 53 | Z = Z.reshape(xx.shape) 54 | plt.figure() 55 | plt.pcolormesh(xx, yy, Z, cmap=cmap_light) 56 | 57 | # Plot also the training points 58 | plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold, 59 | edgecolor='k', s=20) 60 | plt.title("3-Class classification (shrink_threshold=%r)" 61 | % shrinkage) 62 | plt.axis('tight') 63 | 64 | plt.show() 65 | -------------------------------------------------------------------------------- /examples/hubness_reduction/pipelines.py: -------------------------------------------------------------------------------- 1 | """ 2 | ======================================== 3 | Example: skhubness in Pipelines 4 | ======================================== 5 | 6 | Estimators from scikit-hubness can - of course - be used in a scikit-learn ``Pipeline``. 7 | In this example, we select the best hubness reduction method and several other 8 | hyperparameters in grid search w.r.t. classification performance. 
9 | """ 10 | from sklearn.datasets import make_classification 11 | from sklearn.decomposition import PCA 12 | from sklearn.model_selection import StratifiedKFold, train_test_split, GridSearchCV 13 | from sklearn.pipeline import Pipeline 14 | from sklearn.preprocessing import StandardScaler 15 | 16 | from skhubness.neighbors import KNeighborsClassifier 17 | 18 | # Not so high-dimensional data 19 | X, y = make_classification(n_samples=1_000, 20 | n_features=50, 21 | n_informative=20, 22 | n_classes=2, 23 | random_state=3453) 24 | 25 | X, X_test, y, y_test = train_test_split(X, y, 26 | test_size=100, 27 | stratify=y, 28 | shuffle=True, 29 | random_state=124) 30 | 31 | # Pipeline of standardization, dimensionality reduction, and kNN classification 32 | pipe = Pipeline([('scale', StandardScaler(with_mean=True, with_std=True)), 33 | ('pca', PCA(n_components=20, random_state=1213)), 34 | ('knn', KNeighborsClassifier(n_neighbors=10, algorithm='lsh', hubness='mp'))]) 35 | 36 | # Exhaustive search for best algorithms and hyperparameters 37 | param_grid = {'pca__n_components': [10, 20, 30], 38 | 'knn__n_neighbors': [5, 10, 20], 39 | 'knn__algorithm': ['auto', 'hnsw', 'lsh', 'falconn_lsh', 'nng', 'rptree'], 40 | 'knn__hubness': [None, 'mp', 'ls', 'dsl']} 41 | cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1354) 42 | search = GridSearchCV(pipe, param_grid, n_jobs=5, cv=cv, verbose=1) 43 | search.fit(X, y) 44 | 45 | # Performance on hold-out data 46 | acc = search.score(X_test, y_test) 47 | print(acc) 48 | # 0.79 49 | 50 | print(search.best_params_) 51 | # {'knn__algorithm': 'auto', 52 | # 'knn__hubness': 'dsl', 53 | # 'knn__n_neighbors': 20, 54 | # 'pca__n_components': 30} 55 | -------------------------------------------------------------------------------- /docs/documentation/auto_examples/plot_nearest_centroid.py: -------------------------------------------------------------------------------- 1 | """ 2 | =============================== 3 | Nearest Centroid Classification 4 | =============================== 5 | 6 | Sample usage of Nearest Centroid classification. 7 | It will plot the decision boundaries for each class. 8 | 9 | Note that no hubness reduction is currently implemented for centroids. 10 | However, `skhubness.neighbors` retains all the features of `sklearn.neighbors`, 11 | in order to act as a full drop-in replacement. 12 | 13 | Adapted from ``_ 14 | """ 15 | print(__doc__) 16 | 17 | import numpy as np 18 | import matplotlib.pyplot as plt 19 | from matplotlib.colors import ListedColormap 20 | from sklearn import datasets 21 | from skhubness.neighbors import NearestCentroid 22 | 23 | n_neighbors = 15 24 | 25 | # import some data to play with 26 | iris = datasets.load_iris() 27 | # we only take the first two features. We could avoid this ugly 28 | # slicing by using a two-dim dataset 29 | X = iris.data[:, :2] 30 | y = iris.target 31 | 32 | h = .02 # step size in the mesh 33 | 34 | # Create color maps 35 | cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF']) 36 | cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF']) 37 | 38 | for shrinkage in [None, .2]: 39 | # we create an instance of Neighbours Classifier and fit the data. 40 | clf = NearestCentroid(shrink_threshold=shrinkage) 41 | clf.fit(X, y) 42 | y_pred = clf.predict(X) 43 | print(shrinkage, np.mean(y == y_pred)) 44 | # Plot the decision boundary. For that, we will assign a color to each 45 | # point in the mesh [x_min, x_max]x[y_min, y_max]. 
46 | x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1 47 | y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1 48 | xx, yy = np.meshgrid(np.arange(x_min, x_max, h), 49 | np.arange(y_min, y_max, h)) 50 | Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]) 51 | 52 | # Put the result into a color plot 53 | Z = Z.reshape(xx.shape) 54 | plt.figure() 55 | plt.pcolormesh(xx, yy, Z, cmap=cmap_light) 56 | 57 | # Plot also the training points 58 | plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold, 59 | edgecolor='k', s=20) 60 | plt.title("3-Class classification (shrink_threshold=%r)" 61 | % shrinkage) 62 | plt.axis('tight') 63 | 64 | plt.show() 65 | -------------------------------------------------------------------------------- /docs/documentation/auto_examples_hr/pipelines.py: -------------------------------------------------------------------------------- 1 | """ 2 | ======================================== 3 | Example: skhubness in Pipelines 4 | ======================================== 5 | 6 | Estimators from scikit-hubness can - of course - be used in a scikit-learn ``Pipeline``. 7 | In this example, we select the best hubness reduction method and several other 8 | hyperparameters in grid search w.r.t. classification performance. 9 | """ 10 | from sklearn.datasets import make_classification 11 | from sklearn.decomposition import PCA 12 | from sklearn.model_selection import StratifiedKFold, train_test_split, GridSearchCV 13 | from sklearn.pipeline import Pipeline 14 | from sklearn.preprocessing import StandardScaler 15 | 16 | from skhubness.neighbors import KNeighborsClassifier 17 | 18 | # Not so high-dimensional data 19 | X, y = make_classification(n_samples=1_000, 20 | n_features=50, 21 | n_informative=20, 22 | n_classes=2, 23 | random_state=3453) 24 | 25 | X, X_test, y, y_test = train_test_split(X, y, 26 | test_size=100, 27 | stratify=y, 28 | shuffle=True, 29 | random_state=124) 30 | 31 | # Pipeline of standardization, dimensionality reduction, and kNN classification 32 | pipe = Pipeline([('scale', StandardScaler(with_mean=True, with_std=True)), 33 | ('pca', PCA(n_components=20, random_state=1213)), 34 | ('knn', KNeighborsClassifier(n_neighbors=10, algorithm='lsh', hubness='mp'))]) 35 | 36 | # Exhaustive search for best algorithms and hyperparameters 37 | param_grid = {'pca__n_components': [10, 20, 30], 38 | 'knn__n_neighbors': [5, 10, 20], 39 | 'knn__algorithm': ['auto', 'hnsw', 'lsh', 'falconn_lsh', 'nng', 'rptree'], 40 | 'knn__hubness': [None, 'mp', 'ls', 'dsl']} 41 | cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1354) 42 | search = GridSearchCV(pipe, param_grid, n_jobs=5, cv=cv, verbose=1) 43 | search.fit(X, y) 44 | 45 | # Performance on hold-out data 46 | acc = search.score(X_test, y_test) 47 | print(acc) 48 | # 0.79 49 | 50 | print(search.best_params_) 51 | # {'knn__algorithm': 'auto', 52 | # 'knn__hubness': 'dsl', 53 | # 'knn__n_neighbors': 20, 54 | # 'pca__n_components': 30} 55 | -------------------------------------------------------------------------------- /skhubness/data/dexter/dexter_train.labels: -------------------------------------------------------------------------------- 1 | 1 2 | -1 3 | 1 4 | -1 5 | 1 6 | -1 7 | 1 8 | -1 9 | 1 10 | 1 11 | 1 12 | 1 13 | -1 14 | 1 15 | 1 16 | 1 17 | -1 18 | 1 19 | -1 20 | -1 21 | 1 22 | -1 23 | 1 24 | 1 25 | 1 26 | 1 27 | 1 28 | -1 29 | -1 30 | -1 31 | 1 32 | -1 33 | -1 34 | 1 35 | 1 36 | 1 37 | 1 38 | -1 39 | 1 40 | -1 41 | -1 42 | -1 43 | -1 44 | 1 45 | -1 46 | -1 47 | -1 48 | -1 49 | -1 50 | 1 51 | -1 52 | -1 53 | 1 54 | -1 
55 | -1 56 | -1 57 | 1 58 | 1 59 | 1 60 | 1 61 | 1 62 | -1 63 | -1 64 | -1 65 | -1 66 | -1 67 | 1 68 | -1 69 | 1 70 | -1 71 | 1 72 | -1 73 | -1 74 | -1 75 | 1 76 | 1 77 | 1 78 | 1 79 | 1 80 | -1 81 | -1 82 | -1 83 | -1 84 | -1 85 | 1 86 | 1 87 | 1 88 | 1 89 | -1 90 | -1 91 | -1 92 | -1 93 | 1 94 | -1 95 | 1 96 | -1 97 | -1 98 | 1 99 | 1 100 | -1 101 | 1 102 | 1 103 | -1 104 | -1 105 | 1 106 | 1 107 | 1 108 | 1 109 | -1 110 | -1 111 | -1 112 | 1 113 | 1 114 | -1 115 | 1 116 | 1 117 | -1 118 | -1 119 | 1 120 | 1 121 | -1 122 | 1 123 | -1 124 | -1 125 | 1 126 | 1 127 | 1 128 | -1 129 | -1 130 | 1 131 | 1 132 | 1 133 | -1 134 | -1 135 | 1 136 | 1 137 | -1 138 | -1 139 | 1 140 | -1 141 | 1 142 | 1 143 | 1 144 | -1 145 | -1 146 | -1 147 | 1 148 | 1 149 | -1 150 | -1 151 | 1 152 | -1 153 | 1 154 | -1 155 | 1 156 | -1 157 | -1 158 | 1 159 | 1 160 | -1 161 | 1 162 | -1 163 | 1 164 | -1 165 | -1 166 | 1 167 | -1 168 | 1 169 | 1 170 | -1 171 | 1 172 | -1 173 | 1 174 | -1 175 | -1 176 | -1 177 | 1 178 | -1 179 | 1 180 | 1 181 | 1 182 | 1 183 | -1 184 | -1 185 | 1 186 | -1 187 | 1 188 | 1 189 | 1 190 | -1 191 | -1 192 | 1 193 | -1 194 | -1 195 | 1 196 | -1 197 | -1 198 | -1 199 | 1 200 | -1 201 | -1 202 | 1 203 | 1 204 | -1 205 | 1 206 | -1 207 | 1 208 | 1 209 | -1 210 | 1 211 | 1 212 | -1 213 | -1 214 | -1 215 | 1 216 | -1 217 | -1 218 | 1 219 | 1 220 | -1 221 | 1 222 | -1 223 | -1 224 | -1 225 | -1 226 | 1 227 | 1 228 | 1 229 | 1 230 | 1 231 | 1 232 | 1 233 | -1 234 | -1 235 | 1 236 | -1 237 | -1 238 | 1 239 | 1 240 | -1 241 | 1 242 | 1 243 | -1 244 | -1 245 | -1 246 | 1 247 | 1 248 | 1 249 | -1 250 | 1 251 | 1 252 | -1 253 | 1 254 | -1 255 | -1 256 | -1 257 | -1 258 | 1 259 | -1 260 | 1 261 | 1 262 | -1 263 | -1 264 | 1 265 | 1 266 | -1 267 | -1 268 | 1 269 | 1 270 | 1 271 | -1 272 | -1 273 | -1 274 | -1 275 | 1 276 | 1 277 | 1 278 | 1 279 | 1 280 | -1 281 | -1 282 | 1 283 | 1 284 | -1 285 | -1 286 | 1 287 | 1 288 | -1 289 | 1 290 | -1 291 | -1 292 | 1 293 | 1 294 | 1 295 | -1 296 | -1 297 | -1 298 | -1 299 | 1 300 | -1 301 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = scikit-hubness 3 | version = attr: skhubness.__version__ 4 | author = Roman Feldbauer 5 | author_email = sci@feldbauer.org 6 | maintainer=Roman Feldbauer 7 | maintainer_email=sci@feldbauer.org 8 | url = https://github.com/VarIr/scikit-hubness 9 | description = Hubness reduction and analysis tools 10 | long_description = file: README.md 11 | long_description_content_type = text/markdown 12 | # This includes the license file(s) in the wheel. 
13 | # https://wheel.readthedocs.io/en/stable/user_guide.html#including-license-files-in-the-generated-wheel-file 14 | license_files = LICENSE.txt 15 | platform = any 16 | keywords = 17 | machine-learning 18 | high-dimensional-data 19 | hubness 20 | nearest-neighbor 21 | data-science 22 | data-mining 23 | artificial-intelligence 24 | 25 | # https://pypi.org/classifiers/ 26 | classifiers = 27 | Development Status :: 4 - Beta 28 | Environment :: Console 29 | Intended Audience :: Developers 30 | Intended Audience :: Science/Research 31 | License :: OSI Approved :: BSD License 32 | Operating System :: OS Independent 33 | Operating System :: POSIX :: Linux 34 | Operating System :: MacOS :: MacOS X 35 | Operating System :: Microsoft :: Windows 36 | Programming Language :: Python 37 | Programming Language :: Python :: 3.8 38 | Programming Language :: Python :: 3.9 39 | Programming Language :: Python :: 3.10 40 | Topic :: Software Development :: Libraries :: Python Modules 41 | Topic :: Scientific/Engineering :: Artificial Intelligence 42 | 43 | project_urls = 44 | Bug Tracker = https://github.com/VarIr/scikit-hubness/issues 45 | Changelog = https://github.com/VarIr/scikit-hubness/docs/changelog.md 46 | Documentation = https://scikit-hubness.readthedocs.io 47 | Say Thanks! = https://saythanks.io/to/VarIr 48 | Source = https://github.com/VarIr/scikit-hubness 49 | 50 | [options] 51 | zip_safe = false 52 | include_package_data = true 53 | python_requires = >= 3.8 54 | packages = find: 55 | test_suite = tests 56 | install_requires = 57 | numpy # These packages will be installed by pip. 58 | scipy >= 1.2 # For comparison with requirements.txt see also: 59 | scikit-learn >= 0.22 # https://packaging.python.org/en/latest/requirements.html 60 | tqdm 61 | joblib >= 0.12 62 | numba 63 | 64 | [options.extras_require] 65 | ann = 66 | annoy 67 | ngt; platform_system == "Linux" or platform_system == "Darwin" 68 | nmslib 69 | tests = 70 | codecov 71 | flake8 72 | pytest 73 | pytest-cov 74 | 75 | [options.package_data] 76 | * = *.data, *.labels 77 | -------------------------------------------------------------------------------- /skhubness/reduction/tests/test_local_scaling.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: BSD-3-Clause 2 | import warnings 3 | 4 | import pytest 5 | from sklearn.datasets import make_classification 6 | from sklearn.neighbors import NearestNeighbors 7 | from sklearn.utils._testing import assert_array_almost_equal 8 | from sklearn.utils._testing import assert_array_equal 9 | from sklearn.utils._testing import assert_raises 10 | 11 | from skhubness.reduction import LocalScaling 12 | from skhubness.reduction.tests.reference_algorithms import ReferenceLocalScaling 13 | 14 | LS_METHODS = [ 15 | "standard", 16 | "nicdm", 17 | ] 18 | 19 | 20 | @pytest.mark.parametrize("method", LS_METHODS) 21 | @pytest.mark.parametrize("verbose", [0, 1]) 22 | def test_fit_sorted(method, verbose): 23 | # TODO add LocalScaling class tests 24 | X, y = make_classification() 25 | nn = NearestNeighbors() 26 | nn.fit(X, y) 27 | neigh_dist, neigh_ind = nn.kneighbors() 28 | 29 | ls = ReferenceLocalScaling(method=method, verbose=verbose) 30 | 31 | nd_sorted, ni_sorted = ls.fit( 32 | neigh_dist, neigh_ind, X, assume_sorted=True, 33 | ).transform( 34 | neigh_dist, neigh_ind, X, assume_sorted=True, 35 | ) 36 | nd_unsort, ni_unsort = ls.fit( 37 | neigh_dist, neigh_ind, X, assume_sorted=False, 38 | ).transform( 39 | neigh_dist, neigh_ind, X, 
assume_sorted=False, 40 | ) 41 | 42 | assert_array_almost_equal(nd_sorted, nd_unsort) 43 | assert_array_equal(ni_sorted, ni_unsort) 44 | 45 | 46 | @pytest.mark.parametrize("method", ["invalid", None]) 47 | @pytest.mark.parametrize("LocalScalingClass", [ReferenceLocalScaling, LocalScaling]) 48 | def test_invalid_method(method, LocalScalingClass): 49 | X, y = make_classification(n_samples=10, ) 50 | nn = NearestNeighbors(n_neighbors=6) 51 | nn.fit(X, y) 52 | neigh_dist, neigh_ind = nn.kneighbors() 53 | neigh_graph = nn.kneighbors_graph(mode="distance") 54 | 55 | ls = LocalScalingClass(method=method) 56 | if isinstance(ls, LocalScaling): 57 | kwargs = {"X": neigh_graph} 58 | else: 59 | kwargs = {"neigh_dist": neigh_dist, "neigh_ind": neigh_ind, "X": X, "assume_sorted": True} 60 | with assert_raises(ValueError): 61 | ls.fit(**kwargs).transform(**kwargs) 62 | 63 | 64 | @pytest.mark.parametrize("k", [0, 1, 5, 6]) 65 | def test_local_scaling_various_k_values(k): 66 | X, y = make_classification(n_samples=10) 67 | nn = NearestNeighbors(n_neighbors=5) 68 | graph = nn.fit(X).kneighbors_graph(X, mode="distance") 69 | ls = LocalScaling(k=k) 70 | if 1 <= k < 5: 71 | with warnings.catch_warnings(): 72 | warnings.simplefilter("error") 73 | ls.fit(graph) 74 | else: 75 | with pytest.raises(ValueError, match="n_neighbors"): 76 | ls.fit(graph) 77 | -------------------------------------------------------------------------------- /docs/getting_started/installation.rst: -------------------------------------------------------------------------------- 1 | ============ 2 | Installation 3 | ============ 4 | 5 | Installation from PyPI 6 | ====================== 7 | 8 | The current release of ``scikit-hubness`` can be installed from PyPI: 9 | 10 | .. code-block:: bash 11 | 12 | pip install scikit-hubness 13 | 14 | 15 | Dependencies 16 | ============ 17 | 18 | All strict dependencies of ``scikit-hubness`` are automatically installed 19 | by ``pip``. Some optional dependencies (certain ANN libraries) may not 20 | yet be available from PyPI. If you require one of these libraries, 21 | please refer to the library's documentation for building instructions. 22 | For example, at the time of writing, ``puffinn`` was not available on PyPI. 23 | Building and installing is straightforward: 24 | 25 | .. code-block:: bash 26 | 27 | git clone https://github.com/puffinn/puffinn.git 28 | cd puffinn 29 | python3 setup.py build 30 | pip install . 31 | 32 | 33 | Installation from source 34 | ======================== 35 | 36 | You can always grab the latest version of ``scikit-hubness`` directly from GitHub: 37 | 38 | .. code-block:: bash 39 | 40 | cd install_dir 41 | git clone git@github.com:VarIr/scikit-hubness.git 42 | cd scikit-hubness 43 | pip install -e . 44 | 45 | This is the recommended approach if you want to contribute to the development of ``scikit-hubness``. 46 | 47 | 48 | Supported platforms 49 | =================== 50 | 51 | ``scikit-hubness`` currently supports all major operating systems: 52 | 53 | - Linux 54 | - MacOS X 55 | - Windows 56 | 57 | Note that not all approximate nearest neighbor algorithms used in ``scikit-hubness`` 58 | are available on all platforms. 59 | This is because we rely on third-party libraries, which in some cases are not 60 | available for all platforms. 61 | The table below indicates which libraries and 62 | algorithms are currently supported on your operating system. 63 | All exact nearest neighbor algorithms (as provided by scikit-learn) are available on all platforms. 
64 | 65 | +---------+-------------+-------+-------+---------+ 66 | | library | algorithm | Linux | MacOS | Windows | 67 | +---------+-------------+-------+-------+---------+ 68 | | nmslib | hnsw | x | x | x | 69 | +---------+-------------+-------+-------+---------+ 70 | | annoy | rptree | x | x | x | 71 | +---------+-------------+-------+-------+---------+ 72 | | ngtpy | nng | x | x | | 73 | +---------+-------------+-------+-------+---------+ 74 | | falconn | falconn_lsh | x | x | | 75 | +---------+-------------+-------+-------+---------+ 76 | | puffinn | lsh | x | x | | 77 | +---------+-------------+-------+-------+---------+ 78 | | sklearn | (all exact) | x | x | x | 79 | +---------+-------------+-------+-------+---------+ -------------------------------------------------------------------------------- /skhubness/neighbors/approximate_neighbors.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: BSD-3-Clause 2 | 3 | from abc import ABC, abstractmethod 4 | from multiprocessing import cpu_count 5 | from typing import Union, Tuple 6 | import warnings 7 | import numpy as np 8 | 9 | 10 | class ApproximateNearestNeighbor(ABC): 11 | """ Abstract base class for approximate nearest neighbor search methods. 12 | 13 | Parameters 14 | ---------- 15 | n_candidates: int, default = 5 16 | Number of neighbors to retrieve 17 | metric: str, default = 'sqeuclidean' 18 | Distance metric, allowed are "angular", "euclidean", "manhattan", "hamming", "dot" 19 | n_jobs: int, default = 1 20 | Number of parallel jobs 21 | verbose: int, default = 0 22 | Verbosity level. If verbose > 0, show tqdm progress bar on indexing and querying. 23 | """ 24 | def __init__(self, n_candidates: int = 5, metric: str = 'sqeuclidean', 25 | n_jobs: int = 1, verbose: int = 0, *args, **kwargs): 26 | self.n_candidates = n_candidates 27 | self.metric = metric 28 | if n_jobs is None: 29 | n_jobs = 1 30 | elif n_jobs == -1: 31 | n_jobs = cpu_count() 32 | self.n_jobs = n_jobs 33 | self.verbose = verbose 34 | 35 | @abstractmethod 36 | def fit(self, X, y=None): 37 | """ Setup ANN index from training data. 38 | 39 | Parameters 40 | ---------- 41 | X: np.array 42 | Data to be indexed 43 | y: any 44 | Ignored 45 | """ 46 | pass  # pragma: no cover 47 | 48 | @abstractmethod 49 | def kneighbors(self, X=None, n_candidates=None, return_distance=True) -> Union[Tuple[np.array, np.array], np.array]: 50 | """ Retrieve k nearest neighbors. 51 | 52 | Parameters 53 | ---------- 54 | X: np.array or None, optional, default = None 55 | Query objects. If None, search among the indexed objects. 56 | n_candidates: int or None, optional, default = None 57 | Number of neighbors to retrieve. 58 | If None, use the value passed during construction. 59 | return_distance: bool, default = True 60 | If return_distance, will return distances and indices to neighbors. 61 | Else, only return the indices. 62 | """ 63 | pass  # pragma: no cover 64 | 65 | 66 | class UnavailableANN(ApproximateNearestNeighbor): 67 | """ Placeholder for ANN methods that are not available on specific platforms. 
""" 68 | def __init__(self, *args, **kwargs): 69 | super().__init__(*args, **kwargs) 70 | warnings.warn("The chosen approximate nearest neighbor method is not supported on your platform.") 71 | 72 | def fit(self, X, y=None): 73 | pass 74 | 75 | def kneighbors(self, X=None, n_candidates=None, return_distance=True): 76 | pass 77 | -------------------------------------------------------------------------------- /skhubness/reduction/tests/test_hubness_reduction.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: BSD-3-Clause 2 | 3 | from itertools import product 4 | import pytest 5 | from sklearn.datasets import make_classification 6 | from sklearn.utils._testing import assert_array_equal 7 | from sklearn.neighbors import NearestNeighbors 8 | 9 | from skhubness.analysis import Hubness 10 | from skhubness.data import load_dexter 11 | from skhubness.reduction import LocalScaling, MutualProximity, DisSimLocal 12 | from skhubness.reduction.tests.reference_algorithms import ReferenceNoHubnessReduction 13 | 14 | 15 | HUBNESS_REDUCTION = ( 16 | LocalScaling, MutualProximity, DisSimLocal, 17 | ) 18 | MP_PARAMS = tuple({"method": method} for method in ["normal", "empiric"]) 19 | LS_PARAMS = tuple({"method": method} for method in ["standard", "nicdm"]) 20 | HUBNESS_REDUCTION_WITH_PARAMS = (( 21 | *product([MutualProximity], MP_PARAMS), 22 | *product([LocalScaling], LS_PARAMS), 23 | (DisSimLocal, {}), 24 | )) 25 | 26 | 27 | @pytest.mark.parametrize("hubness_param", HUBNESS_REDUCTION_WITH_PARAMS) 28 | @pytest.mark.parametrize("metric", ["sqeuclidean", "euclidean", "cosine"]) 29 | def test_neighbors_dexter(hubness_param, metric): 30 | HubnessReduction, param = hubness_param 31 | if HubnessReduction is MutualProximity and param.get("method") == "normal": 32 | pytest.skip("MP normal does not improve dexter") 33 | if HubnessReduction is DisSimLocal and metric != "sqeuclidean": 34 | pytest.skip("DisSimLocal works only with squared Euclidean distances") 35 | X, y = load_dexter() 36 | 37 | # Hubness in standard spaces 38 | hub = Hubness(k=10, metric=metric) 39 | hub.fit(X) 40 | k_skew_orig = hub.score() 41 | 42 | # Hubness in secondary distance spaces (after hub. red.) 
43 | nn = NearestNeighbors(n_neighbors=50, metric=metric) 44 | graph = nn.fit(X).kneighbors_graph(mode="distance") 45 | hub_red = HubnessReduction(method=param.get("method")) 46 | if HubnessReduction is DisSimLocal: 47 | # TODO check_sorted="full" fails here for unknown reasons (SIGSEGV during debug) 48 | graph = hub_red.fit_transform(graph, vectors=X, check_sorted=False) 49 | else: 50 | graph = hub_red.fit(graph).transform(graph) 51 | hub = Hubness(k=10, metric="precomputed") 52 | hub.fit(graph) 53 | k_skew_hr = hub.score() 54 | 55 | assert k_skew_hr < k_skew_orig * 8/10,\ 56 | f"k-occurrence skewness was not reduced by at least 20% for dexter with {HubnessReduction}" 57 | 58 | 59 | def test_same_indices(): 60 | X, y = make_classification() 61 | nn = NearestNeighbors() 62 | nn.fit(X, y) 63 | neigh_dist, neigh_ind = nn.kneighbors() 64 | hr = ReferenceNoHubnessReduction() 65 | _, neigh_ind_hr = hr.fit_transform(neigh_dist, neigh_ind, X, return_distance=True) 66 | neigh_ind_hr_no_dist = hr.fit_transform(neigh_dist, neigh_ind, X, return_distance=False) 67 | assert_array_equal(neigh_ind, neigh_ind_hr) 68 | assert_array_equal(neigh_ind_hr, neigh_ind_hr_no_dist) 69 | -------------------------------------------------------------------------------- /docs/documentation/auto_examples_ahr/reusing_index.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%matplotlib inline" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "\n========================================\nExample: Reusing index structures\n========================================\n\nThis example shows how to reuse index structures. 
If you want to first estimate hubness,\nand then perform kNN, you can avoid recomputing the ANN index structure, which can be\ncostly.\n" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": { 25 | "collapsed": false 26 | }, 27 | "outputs": [], 28 | "source": [ 29 | "from sklearn.datasets import make_classification\nfrom sklearn.model_selection import train_test_split\n\nfrom skhubness.analysis import LegacyHubness\nfrom skhubness.neighbors import KNeighborsClassifier\n\nX, y = make_classification(n_samples=100_000,\n                           n_features=500,\n                           n_informative=400,\n                           random_state=543)\n\nX_train, X_test, y_train, y_test = train_test_split(X, y,\n                                                    test_size=0.01,\n                                                    stratify=y,\n                                                    shuffle=True,\n                                                    random_state=2346)\n\n# Approximate hubness estimation: Creates LSH index and computes local scaling factors\nhub = LegacyHubness(k=10,\n                    return_value='robinhood',\n                    algorithm='falconn_lsh',\n                    hubness='ls',\n                    random_state=2345,\n                    shuffle_equal=False,\n                    verbose=1)\nhub.fit(X_train)\n\nrobin_hood = hub.score(X_test)\nprint(f'LegacyHubness (Robin Hood): {robin_hood:.4f}')\n# 0.9060\n\n# Approximate hubness reduction for classification: Reuse index & factors\nknn = KNeighborsClassifier(n_neighbors=10,\n                           algorithm='falconn_lsh',\n                           hubness='ls',\n                           n_jobs=1)\n\nknn.fit(hub.nn_index_, y_train)  # REUSE INDEX HERE\nacc = knn.score(X_test, y_test)\nprint(f'Test accuracy: {acc:.3f}')\n# 0.959" 30 | ] 31 | } 32 | ], 33 | "metadata": { 34 | "kernelspec": { 35 | "display_name": "Python 3", 36 | "language": "python", 37 | "name": "python3" 38 | }, 39 | "language_info": { 40 | "codemirror_mode": { 41 | "name": "ipython", 42 | "version": 3 43 | }, 44 | "file_extension": ".py", 45 | "mimetype": "text/x-python", 46 | "name": "python", 47 | "nbconvert_exporter": "python", 48 | "pygments_lexer": "ipython3", 49 | "version": "3.7.4" 50 | } 51 | }, 52 | "nbformat": 4, 53 | "nbformat_minor": 0 54 | } -------------------------------------------------------------------------------- /examples/hubness_reduction/olivetti_faces.py: -------------------------------------------------------------------------------- 1 | """ 2 | ================================= 3 | Face recognition (Olivetti faces) 4 | ================================= 5 | 6 | This dataset contains a set of face images taken between April 1992 7 | and April 1994 at AT&T Laboratories Cambridge. 8 | Image data is typically embedded in very high-dimensional spaces, 9 | which might be prone to hubness. 10 | """ 11 | import numpy as np 12 | from sklearn.datasets import olivetti_faces 13 | from sklearn.model_selection import cross_val_score, StratifiedKFold, RandomizedSearchCV 14 | 15 | from skhubness import LegacyHubness 16 | from skhubness.neighbors import KNeighborsClassifier 17 | 18 | # Fetch data and have a look 19 | d = olivetti_faces.fetch_olivetti_faces() 20 | X, y = d['data'], d['target'] 21 | print(f'Data shape: {X.shape}') 22 | print(f'Label shape: {y.shape}') 23 | # (400, 4096) 24 | # (400,) 25 | 26 | # The data is embedded in a high-dimensional space. 27 | # Is there hubness, and can we reduce it?
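# ('dsl', 'ls', and 'mp' select DisSimLocal, local scaling, and mutual proximity,
#  respectively. 'k_skewness' requests the skewness of the k-occurrence
#  distribution; values near zero indicate little hubness, and values above
#  roughly 1.2 are commonly taken to indicate significant hubness.)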
28 | for hubness in [None, 'dsl', 'ls', 'mp']: 29 | hub = LegacyHubness(k=10, hubness=hubness, return_value='k_skewness') 30 | hub.fit(X) 31 | score = hub.score() 32 | print(f'Hubness (10-skew): {score:.3f} with hubness reduction: {hubness}') 33 | # Hubness (10-skew): 1.972 with hubness reduction: None 34 | # Hubness (10-skew): 1.526 with hubness reduction: dsl 35 | # Hubness (10-skew): 0.943 with hubness reduction: ls 36 | # Hubness (10-skew): 0.184 with hubness reduction: mp 37 | 38 | # There is some hubness, and all hubness reduction methods can reduce it (to varying degrees). 39 | # Let's assess the best kNN strategy and its estimated performance. 40 | cv_perf = StratifiedKFold(n_splits=5, shuffle=True, random_state=7263) 41 | cv_select = StratifiedKFold(n_splits=5, shuffle=True, random_state=32634) 42 | 43 | knn = KNeighborsClassifier(algorithm_params={'n_candidates': 100}) 44 | 45 | # specify parameters and distributions to sample from 46 | param_dist = {"n_neighbors": np.arange(1, 26), 47 | "weights": ['uniform', 'distance'], 48 | "hubness": [None, 'dsl', 'ls', 'mp']} 49 | 50 | # Inner cross-validation to select best hyperparameters (incl. hubness reduction method) 51 | search = RandomizedSearchCV(estimator=knn, 52 | param_distributions=param_dist, 53 | n_iter=100, 54 | cv=cv_select, 55 | random_state=2345, 56 | verbose=1) 57 | 58 | # Outer cross-validation to estimate performance 59 | score = cross_val_score(search, X, y, cv=cv_perf, verbose=1) 60 | print(f'Scores: {score}') 61 | print(f'Mean acc = {score.mean():.3f} +/- {score.std():.3f}') 62 | 63 | # Select model that maximizes accuracy 64 | search.fit(X, y) 65 | 66 | # The best model's parameters 67 | print(search.best_params_) 68 | 69 | # Does it correspond to the results of hubness reduction above? 70 | # Scores: [0.95 0.9625 1. 0.95 0.925 ] 71 | # Mean acc = 0.957 +/- 0.024 72 | # {'weights': 'distance', 'n_neighbors': 23, 'hubness': 'mp'} 73 | -------------------------------------------------------------------------------- /docs/documentation/auto_examples_hr/olivetti_faces.py: -------------------------------------------------------------------------------- 1 | """ 2 | ================================= 3 | Face recognition (Olivetti faces) 4 | ================================= 5 | 6 | This dataset contains a set of face images taken between April 1992 7 | and April 1994 at AT&T Laboratories Cambridge. 8 | Image data is typically embedded in very high-dimensional spaces, 9 | which might be prone to hubness. 10 | """ 11 | import numpy as np 12 | from sklearn.datasets import olivetti_faces 13 | from sklearn.model_selection import cross_val_score, StratifiedKFold, RandomizedSearchCV 14 | 15 | from skhubness import LegacyHubness 16 | from skhubness.neighbors import KNeighborsClassifier 17 | 18 | # Fetch data and have a look 19 | d = olivetti_faces.fetch_olivetti_faces() 20 | X, y = d['data'], d['target'] 21 | print(f'Data shape: {X.shape}') 22 | print(f'Label shape: {y.shape}') 23 | # (400, 4096) 24 | # (400,) 25 | 26 | # The data is embedded in a high-dimensional space. 27 | # Is there hubness, and can we reduce it?
28 | for hubness in [None, 'dsl', 'ls', 'mp']: 29 | hub = LegacyHubness(k=10, hubness=hubness, return_value='k_skewness') 30 | hub.fit(X) 31 | score = hub.score() 32 | print(f'Hubness (10-skew): {score:.3f} with hubness reduction: {hubness}') 33 | # Hubness (10-skew): 1.972 with hubness reduction: None 34 | # Hubness (10-skew): 1.526 with hubness reduction: dsl 35 | # Hubness (10-skew): 0.943 with hubness reduction: ls 36 | # Hubness (10-skew): 0.184 with hubness reduction: mp 37 | 38 | # There is some hubness, and all hubness reduction methods can reduce it (to varying degrees). 39 | # Let's assess the best kNN strategy and its estimated performance. 40 | cv_perf = StratifiedKFold(n_splits=5, shuffle=True, random_state=7263) 41 | cv_select = StratifiedKFold(n_splits=5, shuffle=True, random_state=32634) 42 | 43 | knn = KNeighborsClassifier(algorithm_params={'n_candidates': 100}) 44 | 45 | # specify parameters and distributions to sample from 46 | param_dist = {"n_neighbors": np.arange(1, 26), 47 | "weights": ['uniform', 'distance'], 48 | "hubness": [None, 'dsl', 'ls', 'mp']} 49 | 50 | # Inner cross-validation to select best hyperparameters (incl. hubness reduction method) 51 | search = RandomizedSearchCV(estimator=knn, 52 | param_distributions=param_dist, 53 | n_iter=100, 54 | cv=cv_select, 55 | random_state=2345, 56 | verbose=1) 57 | 58 | # Outer cross-validation to estimate performance 59 | score = cross_val_score(search, X, y, cv=cv_perf, verbose=1) 60 | print(f'Scores: {score}') 61 | print(f'Mean acc = {score.mean():.3f} +/- {score.std():.3f}') 62 | 63 | # Select model that maximizes accuracy 64 | search.fit(X, y) 65 | 66 | # The best model's parameters 67 | print(search.best_params_) 68 | 69 | # Does it correspond to the results of hubness reduction above? 70 | # Scores: [0.95 0.9625 1.
0.95 0.925 ] 71 | # Mean acc = 0.957 +/- 0.024 72 | # {'weights': 'distance', 'n_neighbors': 23, 'hubness': 'mp'} 73 | -------------------------------------------------------------------------------- /docs/documentation/auto_examples/plot_regression.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%matplotlib inline" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "\n# Nearest Neighbors regression\n\n\nDemonstrate the resolution of a regression problem\nusing a k-Nearest Neighbor and the interpolation of the\ntarget using both barycenter and constant weights.\n\nHubness reduction of this low-dimensional dataset\nshows only small effects.\n\nAdapted from ``_\n" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": { 25 | "collapsed": false 26 | }, 27 | "outputs": [], 28 | "source": [ 29 | "print(__doc__)\n\n# Author: Alexandre Gramfort \n# Fabian Pedregosa \n#\n# License: BSD 3 clause (C) INRIA\n\n\n# #############################################################################\n# Generate sample data\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom skhubness.neighbors import KNeighborsRegressor\n\nnp.random.seed(0)\nX = np.sort(5 * np.random.rand(40, 1), axis=0)\nT = np.linspace(0, 5, 500)[:, np.newaxis]\ny = np.sin(X).ravel()\n\n# Add noise to targets\ny[::5] += 1 * (0.5 - np.random.rand(8))\n\n# #############################################################################\n# Fit regression model\nn_neighbors = 5\n\nf = plt.figure()\nfor i, weights in enumerate(['uniform', 'distance']):\n for j, hubness in enumerate([None, 'local_scaling']):\n knn = KNeighborsRegressor(n_neighbors,\n algorithm_params={'n_candidates': 39},\n weights=weights,\n hubness=hubness)\n y_ = knn.fit(X, y).predict(T)\n\n plt.subplot(2, 2, i * 2 + j + 1)\n f.set_figheight(15)\n f.set_figwidth(15)\n plt.scatter(X, y, c='k', label='data')\n plt.plot(T, y_, c='g', label='prediction')\n plt.axis('tight')\n plt.legend()\n plt.title(f\"KNeighborsRegressor (k = {n_neighbors}, weights = '{weights}', hubness = '{hubness}')\")\n\nplt.tight_layout()\nplt.show()" 30 | ] 31 | } 32 | ], 33 | "metadata": { 34 | "kernelspec": { 35 | "display_name": "Python 3", 36 | "language": "python", 37 | "name": "python3" 38 | }, 39 | "language_info": { 40 | "codemirror_mode": { 41 | "name": "ipython", 42 | "version": 3 43 | }, 44 | "file_extension": ".py", 45 | "mimetype": "text/x-python", 46 | "name": "python", 47 | "nbconvert_exporter": "python", 48 | "pygments_lexer": "ipython3", 49 | "version": "3.7.4" 50 | } 51 | }, 52 | "nbformat": 4, 53 | "nbformat_minor": 0 54 | } -------------------------------------------------------------------------------- /docs/documentation/auto_examples_ahr/high_dim_gaussian.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%matplotlib inline" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "\n========================================\nExample: Approximate hubness reduction\n========================================\n\nThis example shows how to combine 
approximate nearest neighbor search and hubness reduction\nin order to perform approximate hubness reduction for large data sets.\n" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": { 25 | "collapsed": false 26 | }, 27 | "outputs": [], 28 | "source": [ 29 | "from sklearn.datasets import make_classification\nfrom sklearn.metrics import accuracy_score\nfrom sklearn.model_selection import train_test_split\n\nfrom skhubness.analysis import LegacyHubness\nfrom skhubness.neighbors import KNeighborsClassifier\n\n# High-dimensional artificial data\nX, y = make_classification(n_samples=1_000_000,\n                           n_features=500,\n                           n_informative=400,\n                           random_state=543)\n\nX_train, X_test, y_train, y_test = train_test_split(X, y,\n                                                    test_size=10_000,\n                                                    stratify=y,\n                                                    shuffle=True,\n                                                    random_state=2346)\n\n# Approximate hubness estimation\nhub = LegacyHubness(k=10,\n                    return_value='robinhood',\n                    algorithm='hnsw',\n                    random_state=2345,\n                    shuffle_equal=False,\n                    n_jobs=-1,\n                    verbose=2)\nhub.fit(X_train)\nrobin_hood = hub.score(X_test)\nprint(f'LegacyHubness (Robin Hood): {robin_hood:.3f}')\n# 0.944\n\n# Approximate hubness reduction for classification\nknn = KNeighborsClassifier(n_neighbors=10,\n                           algorithm='hnsw',\n                           hubness='ls',\n                           n_jobs=-1,\n                           verbose=2)\n\nknn.fit(X_train, y_train)\ny_pred = knn.predict(X_test)\nacc = accuracy_score(y_test, y_pred)\nprint(f'Test accuracy: {acc:.3f}')\n# Test accuracy: 0.987" 30 | ] 31 | } 32 | ], 33 | "metadata": { 34 | "kernelspec": { 35 | "display_name": "Python 3", 36 | "language": "python", 37 | "name": "python3" 38 | }, 39 | "language_info": { 40 | "codemirror_mode": { 41 | "name": "ipython", 42 | "version": 3 43 | }, 44 | "file_extension": ".py", 45 | "mimetype": "text/x-python", 46 | "name": "python", 47 | "nbconvert_exporter": "python", 48 | "pygments_lexer": "ipython3", 49 | "version": "3.7.4" 50 | } 51 | }, 52 | "nbformat": 4, 53 | "nbformat_minor": 0 54 | } -------------------------------------------------------------------------------- /docs/documentation/auto_examples/plot_classification.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%matplotlib inline" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "\n# Nearest Neighbors Classification\n\nSample usage of Nearest Neighbors classification.\nIt will plot the decision boundaries for each class.\n\nAdapted from ``_\n" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": { 25 | "collapsed": false 26 | }, 27 | "outputs": [], 28 | "source": [ 29 | "import numpy as np\nimport matplotlib.pyplot as plt\nfrom matplotlib.colors import ListedColormap\nfrom sklearn import datasets\nfrom skhubness.neighbors import KNeighborsClassifier\n\nn_neighbors = 15\n\n# import some data to play with\niris = datasets.load_iris()\n\n# we only take the first two features.
We could avoid this ugly\n# slicing by using a two-dim dataset\nX = iris.data[:, :2]\ny = iris.target\n\nh = .02 # step size in the mesh\n\n# Create color maps\ncmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])\ncmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])\n\nfor hubness in [None, 'mutual_proximity']:\n # we create an instance of Neighbours Classifier and fit the data.\n clf = KNeighborsClassifier(n_neighbors,\n hubness=hubness,\n weights='distance')\n clf.fit(X, y)\n\n # Plot the decision boundary. For that, we will assign a color to each\n # point in the mesh [x_min, x_max]x[y_min, y_max].\n x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1\n y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1\n xx, yy = np.meshgrid(np.arange(x_min, x_max, h),\n np.arange(y_min, y_max, h))\n Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])\n\n # Put the result into a color plot\n Z = Z.reshape(xx.shape)\n plt.figure()\n plt.pcolormesh(xx, yy, Z, cmap=cmap_light)\n\n # Plot also the training points\n plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold,\n edgecolor='k', s=20)\n plt.xlim(xx.min(), xx.max())\n plt.ylim(yy.min(), yy.max())\n plt.title(\"3-Class classification (k = %i, hubness = '%s')\"\n % (n_neighbors, hubness))\n\nplt.show()" 30 | ] 31 | } 32 | ], 33 | "metadata": { 34 | "kernelspec": { 35 | "display_name": "Python 3", 36 | "language": "python", 37 | "name": "python3" 38 | }, 39 | "language_info": { 40 | "codemirror_mode": { 41 | "name": "ipython", 42 | "version": 3 43 | }, 44 | "file_extension": ".py", 45 | "mimetype": "text/x-python", 46 | "name": "python", 47 | "nbconvert_exporter": "python", 48 | "pygments_lexer": "ipython3", 49 | "version": "3.7.4" 50 | } 51 | }, 52 | "nbformat": 4, 53 | "nbformat_minor": 0 54 | } -------------------------------------------------------------------------------- /docs/github_link.py: -------------------------------------------------------------------------------- 1 | # from https://github.com/scikit-learn/scikit-learn/blob/master/doc/sphinxext/github_link.py 2 | 3 | from operator import attrgetter 4 | import inspect 5 | import subprocess 6 | import os 7 | import sys 8 | from functools import partial 9 | 10 | REVISION_CMD = 'git rev-parse --short HEAD' 11 | 12 | 13 | def _get_git_revision(): 14 | try: 15 | revision = subprocess.check_output(REVISION_CMD.split()).strip() 16 | except (subprocess.CalledProcessError, OSError): 17 | print('Failed to execute git to get revision') 18 | return None 19 | return revision.decode('utf-8') 20 | 21 | 22 | def _linkcode_resolve(domain, info, package, url_fmt, revision): 23 | """Determine a link to online source for a class/method/function 24 | 25 | This is called by sphinx.ext.linkcode 26 | 27 | An example with a long-untouched module that everyone has 28 | >>> _linkcode_resolve('py', {'module': 'tty', 29 | ... 'fullname': 'setraw'}, 30 | ... package='tty', 31 | ... url_fmt='http://hg.python.org/cpython/file/' 32 | ... '{revision}/Lib/{package}/{path}#L{lineno}', 33 | ... 
revision='xxxx') 34 | 'http://hg.python.org/cpython/file/xxxx/Lib/tty/tty.py#L18' 35 | """ 36 | 37 | if revision is None: 38 | return 39 | if domain not in ('py', 'pyx'): 40 | return 41 | if not info.get('module') or not info.get('fullname'): 42 | return 43 | 44 | class_name = info['fullname'].split('.')[0] 45 | if type(class_name) != str: 46 | # Python 2 only 47 | class_name = class_name.encode('utf-8') 48 | module = __import__(info['module'], fromlist=[class_name]) 49 | obj = attrgetter(info['fullname'])(module) 50 | 51 | try: 52 | fn = inspect.getsourcefile(obj) 53 | except Exception: 54 | fn = None 55 | if not fn: 56 | try: 57 | fn = inspect.getsourcefile(sys.modules[obj.__module__]) 58 | except Exception: 59 | fn = None 60 | if not fn: 61 | return 62 | # Work-around: disable links to imported packages (e.g. scikit-learn) 63 | if '/site-packages/' in fn: 64 | return 65 | 66 | fn = os.path.relpath(fn, 67 | start=os.path.dirname(__import__(package).__file__)) 68 | try: 69 | lineno = inspect.getsourcelines(obj)[1] 70 | except Exception: 71 | lineno = '' 72 | return url_fmt.format(revision=revision, package=package, 73 | path=fn, lineno=lineno) 74 | 75 | 76 | def make_linkcode_resolve(package, url_fmt): 77 | """Returns a linkcode_resolve function for the given URL format 78 | 79 | revision is a git commit reference (hash or name) 80 | 81 | package is the name of the root module of the package 82 | 83 | url_fmt is along the lines of ('https://github.com/USER/PROJECT/' 84 | 'blob/{revision}/{package}/' 85 | '{path}#L{lineno}') 86 | """ 87 | revision = _get_git_revision() 88 | return partial(_linkcode_resolve, revision=revision, package=package, 89 | url_fmt=url_fmt) 90 | -------------------------------------------------------------------------------- /docs/documentation/auto_examples_ahr/reusing_index.rst: -------------------------------------------------------------------------------- 1 | .. note:: 2 | :class: sphx-glr-download-link-note 3 | 4 | Click :ref:`here ` to download the full example code 5 | .. rst-class:: sphx-glr-example-title 6 | 7 | .. _sphx_glr_documentation_auto_examples_ahr_reusing_index.py: 8 | 9 | 10 | ======================================== 11 | Example: Reusing index structures 12 | ======================================== 13 | 14 | This example shows how to reuse index structures. If you want to first estimate hubness, 15 | and then perform kNN, you can avoid recomputing the ANN index structure, which can be 16 | costly. 17 | 18 | 19 | .. 
code-block:: default 20 | 21 | from sklearn.datasets import make_classification 22 | from sklearn.model_selection import train_test_split 23 | 24 | from skhubness.analysis import LegacyHubness 25 | from skhubness.neighbors import KNeighborsClassifier 26 | 27 | X, y = make_classification(n_samples=100_000, 28 | n_features=500, 29 | n_informative=400, 30 | random_state=543) 31 | 32 | X_train, X_test, y_train, y_test = train_test_split(X, y, 33 | test_size=0.01, 34 | stratify=y, 35 | shuffle=True, 36 | random_state=2346) 37 | 38 | # Approximate hubness estimation: Creates LSH index and computes local scaling factors 39 | hub = LegacyHubness(k=10, 40 | return_value='robinhood', 41 | algorithm='falconn_lsh', 42 | hubness='ls', 43 | random_state=2345, 44 | shuffle_equal=False, 45 | verbose=1) 46 | hub.fit(X_train) 47 | 48 | robin_hood = hub.score(X_test) 49 | print(f'Hubness (Robin Hood): {robin_hood:.4f}') 50 | # 0.9060 51 | 52 | # Approximate hubness reduction for classification: Reuse index & factors 53 | knn = KNeighborsClassifier(n_neighbors=10, 54 | algorithm='falconn_lsh', 55 | hubness='ls', 56 | n_jobs=1) 57 | 58 | knn.fit(hub.nn_index_, y_train)  # REUSE INDEX HERE 59 | acc = knn.score(X_test, y_test) 60 | print(f'Test accuracy: {acc:.3f}') 61 | # 0.959 62 | 63 | 64 | .. rst-class:: sphx-glr-timing 65 | 66 | **Total running time of the script:** ( 0 minutes 0.000 seconds) 67 | 68 | 69 | .. _sphx_glr_download_documentation_auto_examples_ahr_reusing_index.py: 70 | 71 | 72 | .. only :: html 73 | 74 | .. container:: sphx-glr-footer 75 | :class: sphx-glr-footer-example 76 | 77 | 78 | 79 | .. container:: sphx-glr-download 80 | 81 | :download:`Download Python source code: reusing_index.py ` 82 | 83 | 84 | 85 | .. container:: sphx-glr-download 86 | 87 | :download:`Download Jupyter notebook: reusing_index.ipynb ` 88 | 89 | 90 | .. only:: html 91 | 92 | .. rst-class:: sphx-glr-signature 93 | 94 | `Gallery generated by Sphinx-Gallery `_ 95 | -------------------------------------------------------------------------------- /docs/documentation/auto_examples/plot_nearest_centroid.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%matplotlib inline" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "\n# Nearest Centroid Classification\n\n\nSample usage of Nearest Centroid classification.\nIt will plot the decision boundaries for each class.\n\nNote that no hubness reduction is currently implemented for centroids.\nHowever, `skhubness.neighbors` retains all the features of `sklearn.neighbors`,\nin order to act as a full drop-in replacement.\n\nAdapted from ``_\n" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": { 25 | "collapsed": false 26 | }, 27 | "outputs": [], 28 | "source": [ 29 | "print(__doc__)\n\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom matplotlib.colors import ListedColormap\nfrom sklearn import datasets\nfrom skhubness.neighbors import NearestCentroid\n\nn_neighbors = 15\n\n# import some data to play with\niris = datasets.load_iris()\n# we only take the first two features.
We could avoid this ugly\n# slicing by using a two-dim dataset\nX = iris.data[:, :2]\ny = iris.target\n\nh = .02 # step size in the mesh\n\n# Create color maps\ncmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])\ncmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])\n\nfor shrinkage in [None, .2]:\n # we create an instance of Neighbours Classifier and fit the data.\n clf = NearestCentroid(shrink_threshold=shrinkage)\n clf.fit(X, y)\n y_pred = clf.predict(X)\n print(shrinkage, np.mean(y == y_pred))\n # Plot the decision boundary. For that, we will assign a color to each\n # point in the mesh [x_min, x_max]x[y_min, y_max].\n x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1\n y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1\n xx, yy = np.meshgrid(np.arange(x_min, x_max, h),\n np.arange(y_min, y_max, h))\n Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])\n\n # Put the result into a color plot\n Z = Z.reshape(xx.shape)\n plt.figure()\n plt.pcolormesh(xx, yy, Z, cmap=cmap_light)\n\n # Plot also the training points\n plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold,\n edgecolor='k', s=20)\n plt.title(\"3-Class classification (shrink_threshold=%r)\"\n % shrinkage)\n plt.axis('tight')\n\nplt.show()" 30 | ] 31 | } 32 | ], 33 | "metadata": { 34 | "kernelspec": { 35 | "display_name": "Python 3", 36 | "language": "python", 37 | "name": "python3" 38 | }, 39 | "language_info": { 40 | "codemirror_mode": { 41 | "name": "ipython", 42 | "version": 3 43 | }, 44 | "file_extension": ".py", 45 | "mimetype": "text/x-python", 46 | "name": "python", 47 | "nbconvert_exporter": "python", 48 | "pygments_lexer": "ipython3", 49 | "version": "3.7.4" 50 | } 51 | }, 52 | "nbformat": 4, 53 | "nbformat_minor": 0 54 | } -------------------------------------------------------------------------------- /docs/documentation/auto_examples_ahr/high_dim_gaussian.rst: -------------------------------------------------------------------------------- 1 | .. note:: 2 | :class: sphx-glr-download-link-note 3 | 4 | Click :ref:`here ` to download the full example code 5 | .. rst-class:: sphx-glr-example-title 6 | 7 | .. _sphx_glr_documentation_auto_examples_ahr_high_dim_gaussian.py: 8 | 9 | 10 | ======================================== 11 | Example: Approximate hubness reduction 12 | ======================================== 13 | 14 | This example shows how to combine approximate nearest neighbor search and hubness reduction 15 | in order to perform approximate hubness reduction for large data sets. 16 | 17 | 18 | .. 
code-block:: default 19 | 20 | from sklearn.datasets import make_classification 21 | from sklearn.metrics import accuracy_score 22 | from sklearn.model_selection import train_test_split 23 | 24 | from skhubness.analysis import LegacyHubness 25 | from skhubness.neighbors import KNeighborsClassifier 26 | 27 | # High-dimensional artificial data 28 | X, y = make_classification(n_samples=1_000_000, 29 | n_features=500, 30 | n_informative=400, 31 | random_state=543) 32 | 33 | X_train, X_test, y_train, y_test = train_test_split(X, y, 34 | test_size=10_000, 35 | stratify=y, 36 | shuffle=True, 37 | random_state=2346) 38 | 39 | # Approximate hubness estimation 40 | hub = LegacyHubness(k=10, 41 | return_value='robinhood', 42 | algorithm='hnsw', 43 | random_state=2345, 44 | shuffle_equal=False, 45 | n_jobs=-1, 46 | verbose=2) 47 | hub.fit(X_train) 48 | robin_hood = hub.score(X_test) 49 | print(f'Hubness (Robin Hood): {robin_hood:.3f}') 50 | # 0.944 51 | 52 | # Approximate hubness reduction for classification 53 | knn = KNeighborsClassifier(n_neighbors=10, 54 | algorithm='hnsw', 55 | hubness='ls', 56 | n_jobs=-1, 57 | verbose=2) 58 | 59 | knn.fit(X_train, y_train) 60 | y_pred = knn.predict(X_test) 61 | acc = accuracy_score(y_test, y_pred) 62 | print(f'Test accuracy: {acc:.3f}') 63 | # Test accuracy: 0.987 64 | 65 | 66 | .. rst-class:: sphx-glr-timing 67 | 68 | **Total running time of the script:** ( 0 minutes 0.000 seconds) 69 | 70 | 71 | .. _sphx_glr_download_documentation_auto_examples_ahr_high_dim_gaussian.py: 72 | 73 | 74 | .. only :: html 75 | 76 | .. container:: sphx-glr-footer 77 | :class: sphx-glr-footer-example 78 | 79 | 80 | 81 | .. container:: sphx-glr-download 82 | 83 | :download:`Download Python source code: high_dim_gaussian.py ` 84 | 85 | 86 | 87 | .. container:: sphx-glr-download 88 | 89 | :download:`Download Jupyter notebook: high_dim_gaussian.ipynb ` 90 | 91 | 92 | .. only:: html 93 | 94 | .. rst-class:: sphx-glr-signature 95 | 96 | `Gallery generated by Sphinx-Gallery `_ 97 | -------------------------------------------------------------------------------- /docs/documentation/auto_examples_hr/pipelines.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%matplotlib inline" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "\n========================================\nExample: skhubness in Pipelines\n========================================\n\nEstimators from scikit-hubness can - of course - be used in a scikit-learn ``Pipeline``.\nIn this example, we select the best hubness reduction method and several other\nhyperparameters in grid search w.r.t.
classification performance.\n" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": { 25 | "collapsed": false 26 | }, 27 | "outputs": [], 28 | "source": [ 29 | "from sklearn.datasets import make_classification\nfrom sklearn.decomposition import PCA\nfrom sklearn.model_selection import StratifiedKFold, train_test_split, GridSearchCV\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.preprocessing import StandardScaler\n\nfrom skhubness.neighbors import KNeighborsClassifier\n\n# Not so high-dimensional data\nX, y = make_classification(n_samples=1_000,\n                           n_features=50,\n                           n_informative=20,\n                           n_classes=2,\n                           random_state=3453)\n\nX, X_test, y, y_test = train_test_split(X, y,\n                                        test_size=100,\n                                        stratify=y,\n                                        shuffle=True,\n                                        random_state=124)\n\n# Pipeline of standardization, dimensionality reduction, and kNN classification\npipe = Pipeline([('scale', StandardScaler(with_mean=True, with_std=True)),\n                 ('pca', PCA(n_components=20, random_state=1213)),\n                 ('knn', KNeighborsClassifier(n_neighbors=10, algorithm='lsh', hubness='mp'))])\n\n# Exhaustive search for best algorithms and hyperparameters\nparam_grid = {'pca__n_components': [10, 20, 30],\n              'knn__n_neighbors': [5, 10, 20],\n              'knn__algorithm': ['auto', 'hnsw', 'lsh', 'falconn_lsh', 'nng', 'rptree'],\n              'knn__hubness': [None, 'mp', 'ls', 'dsl']}\ncv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1354)\nsearch = GridSearchCV(pipe, param_grid, n_jobs=5, cv=cv, verbose=1)\nsearch.fit(X, y)\n\n# Performance on hold-out data\nacc = search.score(X_test, y_test)\nprint(acc)\n# 0.79\n\nprint(search.best_params_)\n# {'knn__algorithm': 'auto',\n#  'knn__hubness': 'dsl',\n#  'knn__n_neighbors': 20,\n#  'pca__n_components': 30}" 30 | ] 31 | } 32 | ], 33 | "metadata": { 34 | "kernelspec": { 35 | "display_name": "Python 3", 36 | "language": "python", 37 | "name": "python3" 38 | }, 39 | "language_info": { 40 | "codemirror_mode": { 41 | "name": "ipython", 42 | "version": 3 43 | }, 44 | "file_extension": ".py", 45 | "mimetype": "text/x-python", 46 | "name": "python", 47 | "nbconvert_exporter": "python", 48 | "pygments_lexer": "ipython3", 49 | "version": "3.7.4" 50 | } 51 | }, 52 | "nbformat": 4, 53 | "nbformat_minor": 0 54 | } -------------------------------------------------------------------------------- /docs/getting_started/example.rst: -------------------------------------------------------------------------------- 1 | =================== 2 | Quick start example 3 | =================== 4 | 5 | Users of ``scikit-hubness`` typically want to 6 | 7 | 1. analyse whether their data show hubness 8 | 2. reduce hubness 9 | 3. perform learning (classification, regression, ...) 10 | 11 | The following example shows all these steps for an example dataset 12 | from the text domain (dexter). 13 | Please make sure you have installed ``scikit-hubness`` 14 | (`installation instructions `_). 15 | 16 | First, we load the dataset and inspect its size. 17 | 18 | .. code-block:: python 19 | 20 | from skhubness.data import load_dexter 21 | X, y = load_dexter() 22 | print(f'X.shape = {X.shape}, y.shape={y.shape}') 23 | 24 | Dexter is embedded in a high-dimensional space, 25 | and could thus be prone to hubness. 26 | Therefore, we assess the actual degree of hubness. 27 | 28 | ..
code-block:: python 29 | 30 | from skhubness import LegacyHubness 31 | hub = LegacyHubness(k=10, metric='cosine') 32 | hub.fit(X) 33 | k_skew = hub.score() 34 | print(f'Skewness = {k_skew:.3f}') 35 | 36 | As a rule of thumb, skewness > 1.2 indicates significant hubness. 37 | Additional hubness indices are available, for example: 38 | 39 | .. code-block:: python 40 | 41 | print(f'Robin Hood index: {hub.robinhood_index:.3f}') 42 | print(f'Antihub occurrence: {hub.antihub_occurrence:.3f}') 43 | print(f'Hub occurrence: {hub.hub_occurrence:.3f}') 44 | 45 | There is considerable hubness in dexter. 46 | Let's see whether hubness reduction can improve 47 | kNN classification performance. 48 | 49 | .. code-block:: python 50 | 51 | from sklearn.model_selection import cross_val_score 52 | from skhubness.neighbors import KNeighborsClassifier 53 | 54 | # vanilla kNN 55 | knn_standard = KNeighborsClassifier(n_neighbors=5, 56 | metric='cosine') 57 | acc_standard = cross_val_score(knn_standard, X, y, cv=5) 58 | 59 | # kNN with hubness reduction (mutual proximity) 60 | knn_mp = KNeighborsClassifier(n_neighbors=5, 61 | metric='cosine', 62 | hubness='mutual_proximity') 63 | acc_mp = cross_val_score(knn_mp, X, y, cv=5) 64 | 65 | print(f'Accuracy (vanilla kNN): {acc_standard.mean():.3f}') 66 | print(f'Accuracy (kNN with hubness reduction): {acc_mp.mean():.3f}') 67 | 68 | 69 | Accuracy was considerably improved by mutual proximity (MP). 70 | But did MP actually reduce hubness? 71 | 72 | .. code-block:: python 73 | 74 | hub_mp = LegacyHubness(k=10, metric='cosine', 75 | hubness='mutual_proximity') 76 | hub_mp.fit(X) 77 | k_skew_mp = hub_mp.score() 78 | print(f'Skewness after MP: {k_skew_mp:.3f} ' 79 | f'(reduction of {k_skew - k_skew_mp:.3f})') 80 | print(f'Robin Hood: {hub_mp.robinhood_index:.3f} ' 81 | f'(reduction of {hub.robinhood_index - hub_mp.robinhood_index:.3f})') 82 | 83 | Yes! 84 | 85 | The neighbor graph can also be created directly, 86 | with or without hubness reduction: 87 | 88 | .. code-block:: python 89 | 90 | from skhubness.neighbors import kneighbors_graph 91 | neighbor_graph = kneighbors_graph(X, 92 | n_neighbors=5, 93 | hubness='mutual_proximity') 94 | 95 | You may want to precompute the graph like this, 96 | in order to avoid computing it repeatedly for subsequent hubness estimation and learning. 97 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. scikit-hubness documentation master file, created by 2 | sphinx-quickstart on Mon Jul 8 13:54:25 2019. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | `scikit-hubness`: high-dimensional data mining 7 | ================================================ 8 | 9 | ``scikit-hubness`` is a Python package for analysis of hubness 10 | in high-dimensional data. It provides hubness reduction and 11 | approximate nearest neighbor search via a drop-in replacement for 12 | `sklearn.neighbors `_. 13 | 14 | .. toctree:: 15 | :maxdepth: 1 16 | :hidden: 17 | :caption: Getting Started 18 | 19 | Installation 20 | Quick start example 21 | 22 | .. toctree:: 23 | :maxdepth: 3 24 | :hidden: 25 | :caption: Documentation 26 | 27 | User Guide 28 | scikit-hubness API 29 | History 30 | 31 | ..
toctree:: 32 | :maxdepth: 2 33 | :titlesonly: 34 | :hidden: 35 | :caption: Development 36 | 37 | Contributing 38 | GitHub Repository 39 | What's new (Changelog) 40 | 41 | 42 | `Getting started `_ 43 | ------------------------------------------------------- 44 | 45 | Get started with ``scikit-hubness`` in a breeze. 46 | Learn how to `install the package `_ and 47 | see all core functionality applied in a single `quick start example `_. 48 | 49 | 50 | `User Guide `_ 51 | ----------------------------------------------- 52 | 53 | The `User Guide `_ introduces the main concepts of ``scikit-hubness``. 54 | It explains how to analyze your data sets for hubness, 55 | and how to use the package to lift this *curse of dimensionality*. 56 | You will also find examples of how to use ``skhubness.neighbors`` 57 | for approximate nearest neighbor search (with or without hubness reduction). 58 | 59 | 60 | `API Documentation `_ 61 | -------------------------------------------------------- 62 | 63 | The `API Documentation `_ provides detailed information 64 | on the implemented methods. 65 | This information includes method descriptions, parameters, references, examples, etc. 66 | Find all the information about specific modules and functions of ``scikit-hubness`` in this section. 67 | 68 | 69 | `History `_ 70 | ---------------------------------------- 71 | 72 | A `brief history `_ of the package, 73 | and how it relates to the ``Hub-Toolbox`` packages. 74 | 75 | 76 | `Development `_ 77 | ----------------------------------------------- 78 | 79 | There are several ways to `contribute `_ 80 | to this free open source software. We highly appreciate all input from the community, 81 | be it bug reports or code contributions. 82 | 83 | Source code, issue tracking, discussion, and continuous integration are hosted on 84 | our `GitHub page `_. 85 | 86 | 87 | `What's new `_ 88 | -------------------------------- 89 | 90 | To see what's new in the latest version of ``scikit-hubness``, 91 | have a look at the `changelog `_. 92 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation.
11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies both within project spaces and in public spaces 49 | when an individual is representing the project or its community. Examples of 50 | representing a project or community include using an official project e-mail 51 | address, posting via an official social media account, or acting as an appointed 52 | representative at an online or offline event. Representation of a project may be 53 | further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the project team at sci@feldbauer.org. All 59 | complaints will be reviewed and investigated and will result in a response that 60 | is deemed necessary and appropriate to the circumstances. The project team is 61 | obligated to maintain confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 67 | 68 | ## Attribution 69 | 70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 72 | 73 | [homepage]: https://www.contributor-covenant.org 74 | 75 | For answers to common questions about this code of conduct, see 76 | https://www.contributor-covenant.org/faq 77 | -------------------------------------------------------------------------------- /docs/documentation/auto_examples_hr/pipelines.rst: -------------------------------------------------------------------------------- 1 | .. note:: 2 | :class: sphx-glr-download-link-note 3 | 4 | Click :ref:`here ` to download the full example code 5 | .. rst-class:: sphx-glr-example-title 6 | 7 | .. 
_sphx_glr_documentation_auto_examples_hr_pipelines.py: 8 | 9 | 10 | ======================================== 11 | Example: skhubness in Pipelines 12 | ======================================== 13 | 14 | Estimators from scikit-hubness can - of course - be used in a scikit-learn ``Pipeline``. 15 | In this example, we select the best hubness reduction method and several other 16 | hyperparameters in grid search w.r.t. classification performance. 17 | 18 | 19 | .. code-block:: default 20 | 21 | from sklearn.datasets import make_classification 22 | from sklearn.decomposition import PCA 23 | from sklearn.model_selection import StratifiedKFold, train_test_split, GridSearchCV 24 | from sklearn.pipeline import Pipeline 25 | from sklearn.preprocessing import StandardScaler 26 | 27 | from skhubness.neighbors import KNeighborsClassifier 28 | 29 | # Not so high-dimensional data 30 | X, y = make_classification(n_samples=1_000, 31 | n_features=50, 32 | n_informative=20, 33 | n_classes=2, 34 | random_state=3453) 35 | 36 | X, X_test, y, y_test = train_test_split(X, y, 37 | test_size=100, 38 | stratify=y, 39 | shuffle=True, 40 | random_state=124) 41 | 42 | # Pipeline of standardization, dimensionality reduction, and kNN classification 43 | pipe = Pipeline([('scale', StandardScaler(with_mean=True, with_std=True)), 44 | ('pca', PCA(n_components=20, random_state=1213)), 45 | ('knn', KNeighborsClassifier(n_neighbors=10, algorithm='lsh', hubness='mp'))]) 46 | 47 | # Exhaustive search for best algorithms and hyperparameters 48 | param_grid = {'pca__n_components': [10, 20, 30], 49 | 'knn__n_neighbors': [5, 10, 20], 50 | 'knn__algorithm': ['auto', 'hnsw', 'lsh', 'falconn_lsh', 'nng', 'rptree'], 51 | 'knn__hubness': [None, 'mp', 'ls', 'dsl']} 52 | cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1354) 53 | search = GridSearchCV(pipe, param_grid, n_jobs=5, cv=cv, verbose=1) 54 | search.fit(X, y) 55 | 56 | # Performance on hold-out data 57 | acc = search.score(X_test, y_test) 58 | print(acc) 59 | # 0.79 60 | 61 | print(search.best_params_) 62 | # {'knn__algorithm': 'auto', 63 | # 'knn__hubness': 'dsl', 64 | # 'knn__n_neighbors': 20, 65 | # 'pca__n_components': 30} 66 | 67 | 68 | .. rst-class:: sphx-glr-timing 69 | 70 | **Total running time of the script:** ( 0 minutes 0.000 seconds) 71 | 72 | 73 | .. _sphx_glr_download_documentation_auto_examples_hr_pipelines.py: 74 | 75 | 76 | .. only :: html 77 | 78 | .. container:: sphx-glr-footer 79 | :class: sphx-glr-footer-example 80 | 81 | 82 | 83 | .. container:: sphx-glr-download 84 | 85 | :download:`Download Python source code: pipelines.py ` 86 | 87 | 88 | 89 | .. container:: sphx-glr-download 90 | 91 | :download:`Download Jupyter notebook: pipelines.ipynb ` 92 | 93 | 94 | .. only:: html 95 | 96 | ..
rst-class:: sphx-glr-signature 97 | 98 | `Gallery generated by Sphinx-Gallery `_ 99 | -------------------------------------------------------------------------------- /skhubness/neighbors/tests/test_neighbors.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import pytest 4 | 5 | import numpy as np 6 | from scipy.sparse import csr_matrix 7 | from sklearn.datasets import make_classification 8 | from sklearn.metrics import accuracy_score 9 | from sklearn.model_selection import train_test_split 10 | from sklearn.neighbors import KNeighborsTransformer, KNeighborsClassifier 11 | 12 | from skhubness.neighbors import AnnoyTransformer, NGTTransformer, NMSlibTransformer, PuffinnTransformer 13 | 14 | 15 | @pytest.mark.parametrize("n_neighbors", [1, 5, 10]) 16 | @pytest.mark.parametrize("metric", [None, "euclidean", "cosine"]) 17 | @pytest.mark.parametrize("ApproximateNNTransformer", 18 | [AnnoyTransformer, NGTTransformer, NMSlibTransformer, PuffinnTransformer]) 19 | def test_ann_transformers_similar_to_exact_transformer(ApproximateNNTransformer, n_neighbors, metric): 20 | if sys.platform == "win32" and issubclass(ApproximateNNTransformer, (NGTTransformer, PuffinnTransformer)): 21 | pytest.skip(f"{ApproximateNNTransformer.__name__} is not available on Windows.") 22 | knn_metric = metric 23 | ann_metric = metric 24 | if issubclass(ApproximateNNTransformer, PuffinnTransformer) and metric in ["euclidean", "cosine"]: 25 | pytest.skip(f"{ApproximateNNTransformer.__name__} does not support metric={metric}") 26 | if issubclass(ApproximateNNTransformer, AnnoyTransformer) and metric == "cosine": 27 | ann_metric = "angular" 28 | n_samples = 100 29 | X, y = make_classification( 30 | n_samples=n_samples, 31 | random_state=123, 32 | ) 33 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=456, shuffle=True, stratify=y) 34 | 35 | # Exact kNN graph for comparison 36 | kwargs = {} 37 | if knn_metric is not None: 38 | kwargs["metric"] = knn_metric 39 | knn = KNeighborsTransformer(n_neighbors=n_neighbors, **kwargs) 40 | graph_train = knn.fit_transform(X_train, y_train) 41 | knn_graph: csr_matrix = knn.transform(X_test) 42 | knn_clf = KNeighborsClassifier(n_neighbors=n_neighbors, metric="precomputed") 43 | y_pred_knn = knn_clf.fit(graph_train, y_train).predict(knn_graph) 44 | knn_acc = accuracy_score(y_true=y_test, y_pred=y_pred_knn) 45 | 46 | # ANN graph 47 | kwargs = {} 48 | if ann_metric is not None: 49 | kwargs["metric"] = ann_metric 50 | ann = ApproximateNNTransformer(n_neighbors=n_neighbors, **kwargs) 51 | graph_train = ann.fit_transform(X_train, y_train) 52 | ann_graph = ann.transform(X_test) 53 | ann_clf = KNeighborsClassifier(n_neighbors=n_neighbors, metric="precomputed") 54 | y_pred_ann = ann_clf.fit(graph_train, y_train).predict(ann_graph) 55 | ann_acc = accuracy_score(y_true=y_test, y_pred=y_pred_ann) 56 | 57 | # Neighbor graphs should be same class, same shape, same dtype 58 | assert ann_graph.__class__ == knn_graph.__class__ 59 | assert ann_graph.shape == knn_graph.shape 60 | assert ann_graph.dtype == knn_graph.dtype 61 | assert ann_graph.nnz == knn_graph.nnz 62 | if issubclass(ApproximateNNTransformer, AnnoyTransformer): 63 | pass  # Known inaccuracy 64 | elif issubclass(ApproximateNNTransformer, PuffinnTransformer) and metric is None: 65 | pass  # Known inaccuracy 66 | else: 67 | np.testing.assert_array_equal(ann_graph.indices.ravel(), knn_graph.indices.ravel()) 68 |
np.testing.assert_array_almost_equal(ann_graph.data.ravel(), knn_graph.data.ravel()) 69 | if issubclass(ApproximateNNTransformer, AnnoyTransformer) and metric == "cosine" and n_neighbors == 1: 70 | return  # Known inaccurate result 71 | assert ann_acc > knn_acc or np.isclose(ann_acc, knn_acc), "ApproximateNN accuracy << exact kNN accuracy." 72 | -------------------------------------------------------------------------------- /examples/sklearn/plot_multioutput_face_completion.py: -------------------------------------------------------------------------------- 1 | """ 2 | =================================================== 3 | Face completion with multi-output estimators 4 | =================================================== 5 | 6 | This example shows the use of multi-output estimators to complete images. 7 | The goal is to predict the lower half of a face given its upper half. 8 | 9 | The first column of images shows true faces. The next columns illustrate 10 | how extremely randomized trees, linear regression, ridge regression, 11 | and k nearest neighbors with or without hubness reduction 12 | complete the lower half of those faces. 13 | 14 | 15 | Adapted from ``_ 16 | """ 17 | print(__doc__) 18 | 19 | import numpy as np 20 | import matplotlib.pyplot as plt 21 | 22 | from sklearn.datasets import fetch_olivetti_faces 23 | from sklearn.utils.validation import check_random_state 24 | 25 | from sklearn.ensemble import ExtraTreesRegressor 26 | from sklearn.linear_model import LinearRegression 27 | from sklearn.linear_model import RidgeCV 28 | 29 | from skhubness.neighbors import KNeighborsRegressor 30 | 31 | # Load the faces datasets 32 | data = fetch_olivetti_faces() 33 | targets = data.target 34 | 35 | data = data.images.reshape((len(data.images), -1)) 36 | train = data[targets < 30] 37 | test = data[targets >= 30]  # Test on independent people 38 | 39 | # Test on a subset of people 40 | n_faces = 5 41 | rng = check_random_state(4) 42 | face_ids = rng.randint(test.shape[0], size=(n_faces, )) 43 | test = test[face_ids, :] 44 | 45 | n_pixels = data.shape[1] 46 | # Upper half of the faces 47 | X_train = train[:, :(n_pixels + 1) // 2] 48 | # Lower half of the faces 49 | y_train = train[:, n_pixels // 2:] 50 | X_test = test[:, :(n_pixels + 1) // 2] 51 | y_test = test[:, n_pixels // 2:] 52 | 53 | # Fit estimators 54 | ESTIMATORS = { 55 | "Extra trees": ExtraTreesRegressor(n_estimators=10, max_features=32, 56 | random_state=0), 57 | "k-NN": KNeighborsRegressor(weights='distance'), 58 | "k-NN MP": KNeighborsRegressor(hubness='mp', 59 | hubness_params={'method': 'normal'}, 60 | weights='distance'), 61 | "Linear regression": LinearRegression(), 62 | "Ridge": RidgeCV(), 63 | } 64 | 65 | y_test_predict = dict() 66 | for name, estimator in ESTIMATORS.items(): 67 | estimator.fit(X_train, y_train) 68 | y_test_predict[name] = estimator.predict(X_test) 69 | 70 | # Plot the completed faces 71 | image_shape = (64, 64) 72 | 73 | n_cols = 1 + len(ESTIMATORS) 74 | plt.figure(figsize=(2.
* n_cols, 2.26 * n_faces)) 75 | plt.suptitle("Face completion with multi-output estimators", size=16) 76 | 77 | for i in range(n_faces): 78 | true_face = np.hstack((X_test[i], y_test[i])) 79 | 80 | if i: 81 | sub = plt.subplot(n_faces, n_cols, i * n_cols + 1) 82 | else: 83 | sub = plt.subplot(n_faces, n_cols, i * n_cols + 1, 84 | title="true faces") 85 | 86 | sub.axis("off") 87 | sub.imshow(true_face.reshape(image_shape), 88 | cmap=plt.cm.gray, 89 | interpolation="nearest") 90 | 91 | for j, est in enumerate(sorted(ESTIMATORS)): 92 | completed_face = np.hstack((X_test[i], y_test_predict[est][i])) 93 | 94 | if i: 95 | sub = plt.subplot(n_faces, n_cols, i * n_cols + 2 + j) 96 | 97 | else: 98 | sub = plt.subplot(n_faces, n_cols, i * n_cols + 2 + j, 99 | title=est) 100 | 101 | sub.axis("off") 102 | sub.imshow(completed_face.reshape(image_shape), 103 | cmap=plt.cm.gray, 104 | interpolation="nearest") 105 | 106 | plt.show() 107 | -------------------------------------------------------------------------------- /docs/documentation/auto_examples/plot_multioutput_face_completion.py: -------------------------------------------------------------------------------- 1 | """ 2 | =================================================== 3 | Face completion with multi-output estimators 4 | =================================================== 5 | 6 | This example shows the use of multi-output estimators to complete images. 7 | The goal is to predict the lower half of a face given its upper half. 8 | 9 | The first column of images shows true faces. The next columns illustrate 10 | how extremely randomized trees, linear regression, ridge regression, 11 | and k nearest neighbors with or without hubness reduction 12 | complete the lower half of those faces. 13 | 14 | 15 | Adapted from ``_ 16 | """ 17 | print(__doc__) 18 | 19 | import numpy as np 20 | import matplotlib.pyplot as plt 21 | 22 | from sklearn.datasets import fetch_olivetti_faces 23 | from sklearn.utils.validation import check_random_state 24 | 25 | from sklearn.ensemble import ExtraTreesRegressor 26 | from sklearn.linear_model import LinearRegression 27 | from sklearn.linear_model import RidgeCV 28 | 29 | from skhubness.neighbors import KNeighborsRegressor 30 | 31 | # Load the faces datasets 32 | data = fetch_olivetti_faces() 33 | targets = data.target 34 | 35 | data = data.images.reshape((len(data.images), -1)) 36 | train = data[targets < 30] 37 | test = data[targets >= 30]  # Test on independent people 38 | 39 | # Test on a subset of people 40 | n_faces = 5 41 | rng = check_random_state(4) 42 | face_ids = rng.randint(test.shape[0], size=(n_faces, )) 43 | test = test[face_ids, :] 44 | 45 | n_pixels = data.shape[1] 46 | # Upper half of the faces 47 | X_train = train[:, :(n_pixels + 1) // 2] 48 | # Lower half of the faces 49 | y_train = train[:, n_pixels // 2:] 50 | X_test = test[:, :(n_pixels + 1) // 2] 51 | y_test = test[:, n_pixels // 2:] 52 | 53 | # Fit estimators 54 | ESTIMATORS = { 55 | "Extra trees": ExtraTreesRegressor(n_estimators=10, max_features=32, 56 | random_state=0), 57 | "k-NN": KNeighborsRegressor(weights='distance'), 58 | "k-NN MP": KNeighborsRegressor(hubness='mp', 59 | hubness_params={'method': 'normal'}, 60 | weights='distance'), 61 | "Linear regression": LinearRegression(), 62 | "Ridge": RidgeCV(), 63 | } 64 | 65 | y_test_predict = dict() 66 | for name, estimator in ESTIMATORS.items(): 67 | estimator.fit(X_train, y_train) 68 | y_test_predict[name] = estimator.predict(X_test) 69 | 70 | # Plot the completed faces 71 | image_shape = (64,
64) 72 | 73 | n_cols = 1 + len(ESTIMATORS) 74 | plt.figure(figsize=(2. * n_cols, 2.26 * n_faces)) 75 | plt.suptitle("Face completion with multi-output estimators", size=16) 76 | 77 | for i in range(n_faces): 78 | true_face = np.hstack((X_test[i], y_test[i])) 79 | 80 | if i: 81 | sub = plt.subplot(n_faces, n_cols, i * n_cols + 1) 82 | else: 83 | sub = plt.subplot(n_faces, n_cols, i * n_cols + 1, 84 | title="true faces") 85 | 86 | sub.axis("off") 87 | sub.imshow(true_face.reshape(image_shape), 88 | cmap=plt.cm.gray, 89 | interpolation="nearest") 90 | 91 | for j, est in enumerate(sorted(ESTIMATORS)): 92 | completed_face = np.hstack((X_test[i], y_test_predict[est][i])) 93 | 94 | if i: 95 | sub = plt.subplot(n_faces, n_cols, i * n_cols + 2 + j) 96 | 97 | else: 98 | sub = plt.subplot(n_faces, n_cols, i * n_cols + 2 + j, 99 | title=est) 100 | 101 | sub.axis("off") 102 | sub.imshow(completed_face.reshape(image_shape), 103 | cmap=plt.cm.gray, 104 | interpolation="nearest") 105 | 106 | plt.show() 107 | -------------------------------------------------------------------------------- /docs/documentation/auto_examples_hr/olivetti_faces.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%matplotlib inline" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "\n=================================\nFace recognition (Olivetti faces)\n=================================\n\nThis dataset contains a set of face images taken between April 1992\nand April 1994 at AT&T Laboratories Cambridge.\nImage data is typically embedded in very high-dimensional spaces,\nwhich might be prone to hubness.\n" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": { 25 | "collapsed": false 26 | }, 27 | "outputs": [], 28 | "source": [ 29 | "import numpy as np\nfrom sklearn.datasets import olivetti_faces\nfrom sklearn.model_selection import cross_val_score, StratifiedKFold, RandomizedSearchCV\n\nfrom skhubness import LegacyHubness\nfrom skhubness.neighbors import KNeighborsClassifier\n\n# Fetch data and have a look\nd = olivetti_faces.fetch_olivetti_faces()\nX, y = d['data'], d['target']\nprint(f'Data shape: {X.shape}')\nprint(f'Label shape: {y.shape}')\n# (400, 4096)\n# (400,)\n\n# The data is embedded in a high-dimensional space.\n# Is there hubness, and can we reduce it?\nfor hubness in [None, 'dsl', 'ls', 'mp']:\n hub = LegacyHubness(k=10, hubness=hubness, return_value='k_skewness')\n hub.fit(X)\n score = hub.score()\n print(f'LegacyHubness (10-skew): {score:.3f} with hubness reduction: {hubness}')\n# LegacyHubness (10-skew): 1.972 with hubness reduction: None\n# LegacyHubness (10-skew): 1.526 with hubness reduction: dsl\n# LegacyHubness (10-skew): 0.943 with hubness reduction: ls\n# LegacyHubness (10-skew): 0.184 with hubness reduction: mp\n\n# There is some hubness, and all hubness reduction methods can reduce it (to varying degree)\n# Let's assess the best kNN strategy and its estimated performance.\ncv_perf = StratifiedKFold(n_splits=5, shuffle=True, random_state=7263)\ncv_select = StratifiedKFold(n_splits=5, shuffle=True, random_state=32634)\n\nknn = KNeighborsClassifier(algorithm_params={'n_candidates': 100})\n\n# specify parameters and distributions to sample from\nparam_dist = {\"n_neighbors\": np.arange(1, 26),\n \"weights\": 
['uniform', 'distance'],\n \"hubness\": [None, 'dsl', 'ls', 'mp']}\n\n# Inner cross-validation to select best hyperparameters (incl hubness reduction method)\nsearch = RandomizedSearchCV(estimator=knn,\n param_distributions=param_dist,\n n_iter=100,\n cv=cv_select,\n random_state=2345,\n verbose=1)\n\n# Outer cross-validation to estimate performance\nscore = cross_val_score(search, X, y, cv=cv_perf, verbose=1)\nprint(f'Scores: {score}')\nprint(f'Mean acc = {score.mean():.3f} +/- {score.std():.3f}')\n\n# Select model that maximizes accuracy\nsearch.fit(X, y)\n\n# The best model's parameters\nprint(search.best_params_)\n\n# Does it correspond to the results of hubness reduction above?\n# Scores: [0.95 0.9625 1. 0.95 0.925 ]\n# Mean acc = 0.957 +/- 0.024\n# {'weights': 'distance', 'n_neighbors': 23, 'hubness': 'mp'}" 30 | ] 31 | } 32 | ], 33 | "metadata": { 34 | "kernelspec": { 35 | "display_name": "Python 3", 36 | "language": "python", 37 | "name": "python3" 38 | }, 39 | "language_info": { 40 | "codemirror_mode": { 41 | "name": "ipython", 42 | "version": 3 43 | }, 44 | "file_extension": ".py", 45 | "mimetype": "text/x-python", 46 | "name": "python", 47 | "nbconvert_exporter": "python", 48 | "pygments_lexer": "ipython3", 49 | "version": "3.7.4" 50 | } 51 | }, 52 | "nbformat": 4, 53 | "nbformat_minor": 0 54 | } -------------------------------------------------------------------------------- /docs/documentation/auto_examples/plot_regression.rst: -------------------------------------------------------------------------------- 1 | .. note:: 2 | :class: sphx-glr-download-link-note 3 | 4 | Click :ref:`here ` to download the full example code 5 | .. rst-class:: sphx-glr-example-title 6 | 7 | .. _sphx_glr_documentation_auto_examples_plot_regression.py: 8 | 9 | 10 | ============================ 11 | Nearest Neighbors regression 12 | ============================ 13 | 14 | Demonstrate the resolution of a regression problem 15 | using a k-Nearest Neighbor and the interpolation of the 16 | target using both barycenter and constant weights. 17 | 18 | Hubness reduction of this low-dimensional dataset 19 | shows only small effects. 20 | 21 | Adapted from ``_ 22 | 23 | 24 | 25 | .. image:: /documentation/auto_examples/images/sphx_glr_plot_regression_001.png 26 | :class: sphx-glr-single-img 27 | 28 | 29 | .. rst-class:: sphx-glr-script-out 30 | 31 | Out: 32 | 33 | .. code-block:: none 34 | 35 | 36 | /home/user/feldbauer/PycharmProjects/hubness/examples/sklearn/plot_regression.py:60: UserWarning: Matplotlib is currently using agg, which is a non-GUI backend, so cannot show the figure. 37 | plt.show() 38 | 39 | 40 | 41 | 42 | 43 | | 44 | 45 | 46 | .. 
code-block:: default 47 | 48 | print(__doc__) 49 | 50 | # Author: Alexandre Gramfort 51 | # Fabian Pedregosa 52 | # 53 | # License: BSD 3 clause (C) INRIA 54 | 55 | 56 | # ############################################################################# 57 | # Generate sample data 58 | import numpy as np 59 | import matplotlib.pyplot as plt 60 | from skhubness.neighbors import KNeighborsRegressor 61 | 62 | np.random.seed(0) 63 | X = np.sort(5 * np.random.rand(40, 1), axis=0) 64 | T = np.linspace(0, 5, 500)[:, np.newaxis] 65 | y = np.sin(X).ravel() 66 | 67 | # Add noise to targets 68 | y[::5] += 1 * (0.5 - np.random.rand(8)) 69 | 70 | # ############################################################################# 71 | # Fit regression model 72 | n_neighbors = 5 73 | 74 | f = plt.figure() 75 | for i, weights in enumerate(['uniform', 'distance']): 76 | for j, hubness in enumerate([None, 'local_scaling']): 77 | knn = KNeighborsRegressor(n_neighbors, 78 | algorithm_params={'n_candidates': 39}, 79 | weights=weights, 80 | hubness=hubness) 81 | y_ = knn.fit(X, y).predict(T) 82 | 83 | plt.subplot(2, 2, i * 2 + j + 1) 84 | f.set_figheight(15) 85 | f.set_figwidth(15) 86 | plt.scatter(X, y, c='k', label='data') 87 | plt.plot(T, y_, c='g', label='prediction') 88 | plt.axis('tight') 89 | plt.legend() 90 | plt.title(f"KNeighborsRegressor (k = {n_neighbors}, weights = '{weights}', hubness = '{hubness}')") 91 | 92 | plt.tight_layout() 93 | plt.show() 94 | 95 | .. rst-class:: sphx-glr-timing 96 | 97 | **Total running time of the script:** ( 0 minutes 0.737 seconds) 98 | 99 | 100 | .. _sphx_glr_download_documentation_auto_examples_plot_regression.py: 101 | 102 | 103 | .. only :: html 104 | 105 | .. container:: sphx-glr-footer 106 | :class: sphx-glr-footer-example 107 | 108 | 109 | 110 | .. container:: sphx-glr-download 111 | 112 | :download:`Download Python source code: plot_regression.py ` 113 | 114 | 115 | 116 | .. container:: sphx-glr-download 117 | 118 | :download:`Download Jupyter notebook: plot_regression.ipynb ` 119 | 120 | 121 | .. only:: html 122 | 123 | .. rst-class:: sphx-glr-signature 124 | 125 | `Gallery generated by Sphinx-Gallery `_ 126 | -------------------------------------------------------------------------------- /docs/changelog.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## [Next release] 4 | ... 5 | 6 | ### Added or enhanced 7 | - Lower memory footprint for sparse targets in multilabel classification 8 | (previously converted to dense arrays) #61 9 | 10 | ### Fixes 11 | - Hubness estimation could fail when ANN does not return enough neighbors #59 12 | - Heuristic to choose memory for Puffinn LSH. 13 | 14 | ### Maintenance 15 | - Switch to modern Python packaging with `pyproject.toml` and `setup.cfg` 16 | - Switch to Github Actions, dropping Travis CI and AppVeyor 17 | 18 | 19 | ## [0.21.2] - 2020-01-14 20 | 21 | This is a maintenance release due to the publication in the 22 | Journal of Open Source Software. 23 | 24 | 25 | ## [0.21.1] - 2019-12-10 26 | 27 | This is a bugfix release due to the recent update of scikit-learn to v0.22. 28 | 29 | ### Fixes 30 | - Require scikit-learn v0.21.3. 31 | 32 | Until the necessary adaptions for v0.22 are completed, 33 | scikit-hubness will require scikit-learn v0.21.3. 34 | 35 | 36 | ## [0.21.0] - 2019-11-25 37 | 38 | This is the first major release of scikit-hubness. 39 | 40 | ### Added 41 | - Enable ONNG provided by NGT (optimized ANNG). Pass ``optimize=True`` to ``LegacyNNG``. 
42 | - User Guide: Description of all subpackages and common usage scenarios. 43 | - Examples: Various usage examples 44 | - Several tests 45 | - Classes inheriting from ``SupervisedIntegerMixin`` can be fit with an 46 | ``ApproximateNearestNeighbor`` or ``NearestNeighbors`` instance, 47 | thus reusing precomputed indices. 48 | 49 | ### Changes 50 | - Use argument ``algorithm='nng'`` for ANNG/ONNG provided by NGT instead of ``'onng'``. 51 | Also set ``optimize=True`` in order to use ONNG. 52 | 53 | ### Fixes 54 | - DisSimLocal would previously fail when invoked as ``hubness='dis_sim_local'``. 55 | - Hubness reduction would previously ignore ``verbose`` arguments under certain circumstances. 56 | - ``HNSW`` would previously ignore ``n_jobs`` on index creation. 57 | - Fix installation instructions for puffinn. 58 | 59 | ## [0.21.0a9] - 2019-10-30 60 | ### Added 61 | - General structure for docs 62 | - Enable NGT OpenMP support on macOS (in addition to Linux) 63 | - Enable Puffinn LSH also on macOS 64 | 65 | ### Fixes 66 | - Correct mutual proximity (empiric) calculation 67 | - Better handling of optional packages (ANN libraries) 68 | 69 | ### Maintenance 70 | - Streamlined CI builds 71 | - Several minor code improvements 72 | 73 | ### New contributors 74 | - Silvan David Peter 75 | 76 | 77 | ## [0.21.0a8] - 2019-09-12 78 | ### Added 79 | - Approximate nearest neighbor search 80 | * LSH by an additional provider, [`puffinn`](https://github.com/puffinn/puffinn) (Linux only, atm) 81 | * ANNG provided by [`ngtpy`](https://github.com/yahoojapan/NGT/) (Linux, macOS) 82 | * Random projection forests provided by [`annoy`](https://github.com/spotify/annoy) (Linux, macOS, Windows) 83 | 84 | ### Fixes 85 | - Several minor issues 86 | - Several documentation issues 87 | 88 | 89 | ## [0.21.0a7] - 2019-07-17 90 | 91 | The first alpha release of `scikit-hubness` to appear in this changelog. 92 | It already contains the following features: 93 | 94 | - Hubness estimation (exact or approximate) 95 | - Hubness reduction (exact or approximate) 96 | * Mutual proximity 97 | * Local scaling 98 | * DisSim Local 99 | - Approximate nearest neighbor search 100 | * HNSW provided by [nmslib](https://github.com/nmslib/nmslib) 101 | * LSH provided by [falconn](https://github.com/FALCONN-LIB/FALCONN) 102 | 103 | [Next release]: https://github.com/VarIr/scikit-hubness/compare/v0.21.2...HEAD 104 | [0.21.2]: https://github.com/VarIr/scikit-hubness/releases/tag/v0.21.2 105 | [0.21.1]: https://github.com/VarIr/scikit-hubness/releases/tag/v0.21.1 106 | [0.21.0]: https://github.com/VarIr/scikit-hubness/releases/tag/v0.21.0 107 | [0.21.0a9]: https://github.com/VarIr/scikit-hubness/releases/tag/v0.21.0-alpha.9 108 | [0.21.0a8]: https://github.com/VarIr/scikit-hubness/releases/tag/v0.21.0-alpha.8 109 | [0.21.0a7]: https://github.com/VarIr/scikit-hubness/releases/tag/v0.21.0-alpha.7 110 | 111 | [//]: # "Sections: Added, Fixed, Changed, Removed" 112 | -------------------------------------------------------------------------------- /docs/documentation/reduction.rst: -------------------------------------------------------------------------------- 1 | ================= 2 | Hubness reduction 3 | ================= 4 | 5 | The :mod:`skhubness.reduction` subpackage provides several hubness reduction methods.
6 | Currently, the supported methods are 7 | 8 | - Mutual proximity (independent Gaussian distance distribution), 9 | provided by :class:`MutualProximity ` with ``method='normal'`` (default), 10 | - Mutual proximity (empiric distance distribution), 11 | provided by :class:`MutualProximity ` with ``method='empiric'``, 12 | - Local scaling, 13 | provided by :class:`LocalScaling ` with ``method='standard'`` (default), 14 | - Non-iterative contextual dissimilarity measure, 15 | provided by :class:`LocalScaling ` with ``method='nicdm'``, 16 | - DisSim Local, 17 | provided by :class:`DisSimLocal `, 18 | 19 | which represent the most successful hubness reduction methods as identified in 20 | our paper "A comprehensive empirical comparison of hubness reduction in high-dimensional spaces", 21 | KAIS (2019), `DOI `__. 22 | This survey paper also comes with an overview of how the individual methods work. 23 | 24 | There are two ways to perform hubness reduction in scikit-hubness: 25 | 26 | - Implicitly, using the classes in :mod:`skhubness.neighbors` 27 | (see :ref:`User Guide: Nearest neighbors `), 28 | - Explicitly, using the classes in :mod:`skhubness.reduction`. 29 | 30 | The former is the common approach if you simply want to improve your learning task 31 | through hubness reduction; most examples here do so. 32 | The latter may, however, be more useful for researchers who would like to 33 | investigate the hubness phenomenon itself. 34 | 35 | All hubness reducers inherit from a common base class 36 | :class:`HubnessReduction `. 37 | This abstract class defines two important methods: 38 | :meth:`fit ` and 39 | :meth:`transform `, 40 | thus allowing the transformation of previously unseen data after the initial fit. 41 | Most hubness reduction methods do not operate on vector data, 42 | but instead manipulate precomputed distances in order to obtain `secondary distances`. 43 | Therefore, ``fit`` and ``transform`` take neighbor graphs as input, instead of vectors. 44 | Have a look at their signatures: 45 | 46 | .. code-block:: Python3 47 | 48 | @abstractmethod 49 | def fit(self, neigh_dist, neigh_ind, X, assume_sorted, *args, **kwargs): 50 | pass # pragma: no cover 51 | 52 | @abstractmethod 53 | def transform(self, neigh_dist, neigh_ind, X, assume_sorted, return_distance=True): 54 | pass # pragma: no cover 55 | 56 | The arguments ``neigh_dist`` and ``neigh_ind`` are two arrays representing the nearest neighbor graph 57 | with shape ``(n_indexed, n_neighbors)`` during fit, and 58 | shape ``(n_query, n_neighbors)`` during transform. 59 | The i-th row in each array corresponds to the i-th object in the data set. 60 | The j-th column in ``neigh_ind`` contains the index of one of the k-nearest neighbors among the indexed objects, 61 | while the j-th column in ``neigh_dist`` contains the corresponding distance. 62 | Note that this is the same format as returned by scikit-learn's ``kneighbors(return_distance=True)`` 63 | method. 64 | 65 | This way, the user has full flexibility in how to calculate primary distances (Euclidean, cosine, KL divergence, etc.). 66 | :class:`DisSimLocal ` (DSL) is the exception to this rule, 67 | because it is formulated specifically for Euclidean distances. 68 | DSL, therefore, also requires the training vectors in ``fit(..., X=X_train)``, 69 | and the test set vectors in ``transform(..., X=X_test)``. 70 | Argument ``X`` is ignored by the other hubness reduction methods.
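For illustration, the following is a minimal sketch of the explicit route with ``LocalScaling``: the primary neighbor graph is computed with vanilla scikit-learn, then transformed into secondary distances. It is a sketch under assumptions, not a definitive recipe: the constructor arguments ``k`` and ``method`` and the tuple returned by ``transform`` follow the conventions described above, but may differ between versions.

.. code-block:: Python3

    import numpy as np
    from sklearn.neighbors import NearestNeighbors
    from skhubness.reduction import LocalScaling

    # Toy data: indexed (training) objects and query objects
    rng = np.random.RandomState(0)
    X_indexed, X_query = rng.rand(100, 50), rng.rand(10, 50)

    # Primary neighbor graphs; kneighbors returns neighbors sorted by distance
    nn = NearestNeighbors(n_neighbors=20).fit(X_indexed)
    fit_dist, fit_ind = nn.kneighbors(X_indexed)    # shape (n_indexed, n_neighbors)
    query_dist, query_ind = nn.kneighbors(X_query)  # shape (n_query, n_neighbors)

    # Fit on the indexed objects, then transform the query graph.
    # X is ignored by LocalScaling (it is only required for DisSimLocal).
    ls = LocalScaling(k=5, method='standard')
    ls.fit(fit_dist, fit_ind, X=None, assume_sorted=True)
    sec_dist, sec_ind = ls.transform(query_dist, query_ind, X=None,
                                     assume_sorted=True, return_distance=True)

Here, ``assume_sorted=True`` is appropriate because ``kneighbors`` returns sorted distances; the following paragraphs detail this argument and the sortedness of the returned secondary distances.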
71 | 72 | When the neighbor graph is already sorted (lowest to highest distance), 73 | ``assume_sorted=True`` should be set, so that hubness reduction methods 74 | will not sort the arrays again, thus saving computational time. 75 | 76 | Hubness reduction methods transform the primary distance graph, 77 | and return secondary distances. 78 | Note that for efficiency reasons, the returned arrays are not sorted. 79 | Please make sure to sort the arrays, if downstream tasks assume sorted arrays. 80 | -------------------------------------------------------------------------------- /docs/documentation/auto_examples/plot_classification.rst: -------------------------------------------------------------------------------- 1 | .. note:: 2 | :class: sphx-glr-download-link-note 3 | 4 | Click :ref:`here ` to download the full example code 5 | .. rst-class:: sphx-glr-example-title 6 | 7 | .. _sphx_glr_documentation_auto_examples_plot_classification.py: 8 | 9 | 10 | ================================ 11 | Nearest Neighbors Classification 12 | ================================ 13 | Sample usage of Nearest Neighbors classification. 14 | It will plot the decision boundaries for each class. 15 | 16 | Adapted from ``_ 17 | 18 | 19 | 20 | .. rst-class:: sphx-glr-horizontal 21 | 22 | 23 | * 24 | 25 | .. image:: /documentation/auto_examples/images/sphx_glr_plot_classification_001.png 26 | :class: sphx-glr-multi-img 27 | 28 | * 29 | 30 | .. image:: /documentation/auto_examples/images/sphx_glr_plot_classification_002.png 31 | :class: sphx-glr-multi-img 32 | 33 | 34 | .. rst-class:: sphx-glr-script-out 35 | 36 | Out: 37 | 38 | .. code-block:: none 39 | 40 | /home/user/feldbauer/PycharmProjects/hubness/examples/sklearn/plot_classification.py:61: UserWarning: Matplotlib is currently using agg, which is a non-GUI backend, so cannot show the figure. 41 | plt.show() 42 | 43 | 44 | 45 | 46 | 47 | | 48 | 49 | 50 | .. code-block:: default 51 | 52 | 53 | import numpy as np 54 | import matplotlib.pyplot as plt 55 | from matplotlib.colors import ListedColormap 56 | from sklearn import datasets 57 | from skhubness.neighbors import KNeighborsClassifier 58 | 59 | n_neighbors = 15 60 | 61 | # import some data to play with 62 | iris = datasets.load_iris() 63 | 64 | # we only take the first two features. We could avoid this ugly 65 | # slicing by using a two-dim dataset 66 | X = iris.data[:, :2] 67 | y = iris.target 68 | 69 | h = .02 # step size in the mesh 70 | 71 | # Create color maps 72 | cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF']) 73 | cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF']) 74 | 75 | for hubness in [None, 'mutual_proximity']: 76 | # we create an instance of Neighbours Classifier and fit the data. 77 | clf = KNeighborsClassifier(n_neighbors, 78 | hubness=hubness, 79 | weights='distance') 80 | clf.fit(X, y) 81 | 82 | # Plot the decision boundary. For that, we will assign a color to each 83 | # point in the mesh [x_min, x_max]x[y_min, y_max]. 
84 | x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1 85 | y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1 86 | xx, yy = np.meshgrid(np.arange(x_min, x_max, h), 87 | np.arange(y_min, y_max, h)) 88 | Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]) 89 | 90 | # Put the result into a color plot 91 | Z = Z.reshape(xx.shape) 92 | plt.figure() 93 | plt.pcolormesh(xx, yy, Z, cmap=cmap_light) 94 | 95 | # Plot also the training points 96 | plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold, 97 | edgecolor='k', s=20) 98 | plt.xlim(xx.min(), xx.max()) 99 | plt.ylim(yy.min(), yy.max()) 100 | plt.title("3-Class classification (k = %i, hubness = '%s')" 101 | % (n_neighbors, hubness)) 102 | 103 | plt.show() 104 | 105 | 106 | .. rst-class:: sphx-glr-timing 107 | 108 | **Total running time of the script:** ( 0 minutes 25.940 seconds) 109 | 110 | 111 | .. _sphx_glr_download_documentation_auto_examples_plot_classification.py: 112 | 113 | 114 | .. only :: html 115 | 116 | .. container:: sphx-glr-footer 117 | :class: sphx-glr-footer-example 118 | 119 | 120 | 121 | .. container:: sphx-glr-download 122 | 123 | :download:`Download Python source code: plot_classification.py ` 124 | 125 | 126 | 127 | .. container:: sphx-glr-download 128 | 129 | :download:`Download Jupyter notebook: plot_classification.ipynb ` 130 | 131 | 132 | .. only:: html 133 | 134 | .. rst-class:: sphx-glr-signature 135 | 136 | `Gallery generated by Sphinx-Gallery `_ 137 | -------------------------------------------------------------------------------- /examples/sklearn/plot_nca_dim_reduction.py: -------------------------------------------------------------------------------- 1 | """ 2 | ============================================================== 3 | Dimensionality Reduction with Neighborhood Components Analysis 4 | ============================================================== 5 | 6 | Sample usage of Neighborhood Components Analysis for dimensionality reduction. 7 | 8 | This example compares different (linear) dimensionality reduction methods 9 | applied on the Digits data set. The data set contains images of digits from 10 | 0 to 9 with approximately 180 samples of each class. Each image is of 11 | dimension 8x8 = 64, and is reduced to a two-dimensional data point. 12 | 13 | Principal Component Analysis (PCA) applied to this data identifies the 14 | combination of attributes (principal components, or directions in the 15 | feature space) that account for the most variance in the data. Here we 16 | plot the different samples on the 2 first principal components. 17 | 18 | Linear Discriminant Analysis (LDA) tries to identify attributes that 19 | account for the most variance *between classes*. In particular, 20 | LDA, in contrast to PCA, is a supervised method, using known class labels. 21 | 22 | Neighborhood Components Analysis (NCA) tries to find a feature space such 23 | that a stochastic nearest neighbor algorithm will give the best accuracy. 24 | Like LDA, it is a supervised method. 25 | 26 | One can see that NCA enforces a clustering of the data that is visually 27 | meaningful despite the large reduction in dimension. 
28 | 29 | Adapted from ``_ 30 | """ 31 | # License: BSD 3 clause 32 | 33 | import numpy as np 34 | import matplotlib.pyplot as plt 35 | from sklearn import datasets 36 | from sklearn.model_selection import train_test_split 37 | from sklearn.decomposition import PCA 38 | from sklearn.discriminant_analysis import LinearDiscriminantAnalysis 39 | from sklearn.pipeline import make_pipeline 40 | from sklearn.preprocessing import StandardScaler 41 | 42 | from skhubness.neighbors import (KNeighborsClassifier, 43 | NeighborhoodComponentsAnalysis) 44 | 45 | print(__doc__) 46 | 47 | n_neighbors = 3 48 | random_state = 0 49 | 50 | # Load Digits dataset 51 | digits = datasets.load_digits() 52 | X, y = digits.data, digits.target 53 | 54 | # Split into train/test 55 | X_train, X_test, y_train, y_test = \ 56 | train_test_split(X, y, test_size=0.5, stratify=y, 57 | random_state=random_state) 58 | 59 | dim = len(X[0]) 60 | n_classes = len(np.unique(y)) 61 | 62 | # Reduce dimension to 2 with PCA 63 | pca = make_pipeline(StandardScaler(), 64 | PCA(n_components=2, random_state=random_state)) 65 | 66 | # Reduce dimension to 2 with LinearDiscriminantAnalysis 67 | lda = make_pipeline(StandardScaler(), 68 | LinearDiscriminantAnalysis(n_components=2)) 69 | 70 | # Reduce dimension to 2 with NeighborhoodComponentAnalysis 71 | nca = make_pipeline(StandardScaler(), 72 | NeighborhoodComponentsAnalysis(n_components=2, 73 | random_state=random_state)) 74 | 75 | # Use a nearest neighbor classifier to evaluate the methods 76 | knn = KNeighborsClassifier(n_neighbors=n_neighbors) 77 | 78 | # Make a list of the methods to be compared 79 | dim_reduction_methods = [('PCA', pca), ('LDA', lda), ('NCA', nca)] 80 | 81 | # plt.figure() 82 | for i, (name, model) in enumerate(dim_reduction_methods): 83 | plt.figure() 84 | # plt.subplot(1, 3, i + 1, aspect=1) 85 | 86 | # Fit the method's model 87 | model.fit(X_train, y_train) 88 | 89 | # Fit a nearest neighbor classifier on the embedded training set 90 | knn.fit(model.transform(X_train), y_train) 91 | 92 | # Compute the nearest neighbor accuracy on the embedded test set 93 | acc_knn = knn.score(model.transform(X_test), y_test) 94 | 95 | # Embed the data set in 2 dimensions using the fitted model 96 | X_embedded = model.transform(X) 97 | 98 | # Plot the projected points and show the evaluation score 99 | plt.scatter(X_embedded[:, 0], X_embedded[:, 1], c=y, s=30, cmap='Set1') 100 | plt.title("{}, KNN (k={})\nTest accuracy = {:.2f}".format(name, 101 | n_neighbors, 102 | acc_knn)) 103 | plt.show() 104 | -------------------------------------------------------------------------------- /docs/documentation/auto_examples/plot_nca_dim_reduction.py: -------------------------------------------------------------------------------- 1 | """ 2 | ============================================================== 3 | Dimensionality Reduction with Neighborhood Components Analysis 4 | ============================================================== 5 | 6 | Sample usage of Neighborhood Components Analysis for dimensionality reduction. 7 | 8 | This example compares different (linear) dimensionality reduction methods 9 | applied on the Digits data set. The data set contains images of digits from 10 | 0 to 9 with approximately 180 samples of each class. Each image is of 11 | dimension 8x8 = 64, and is reduced to a two-dimensional data point. 
12 | 13 | Principal Component Analysis (PCA) applied to this data identifies the 14 | combination of attributes (principal components, or directions in the 15 | feature space) that account for the most variance in the data. Here we 16 | plot the different samples on the 2 first principal components. 17 | 18 | Linear Discriminant Analysis (LDA) tries to identify attributes that 19 | account for the most variance *between classes*. In particular, 20 | LDA, in contrast to PCA, is a supervised method, using known class labels. 21 | 22 | Neighborhood Components Analysis (NCA) tries to find a feature space such 23 | that a stochastic nearest neighbor algorithm will give the best accuracy. 24 | Like LDA, it is a supervised method. 25 | 26 | One can see that NCA enforces a clustering of the data that is visually 27 | meaningful despite the large reduction in dimension. 28 | 29 | Adapted from ``_ 30 | """ 31 | # License: BSD 3 clause 32 | 33 | import numpy as np 34 | import matplotlib.pyplot as plt 35 | from sklearn import datasets 36 | from sklearn.model_selection import train_test_split 37 | from sklearn.decomposition import PCA 38 | from sklearn.discriminant_analysis import LinearDiscriminantAnalysis 39 | from sklearn.pipeline import make_pipeline 40 | from sklearn.preprocessing import StandardScaler 41 | 42 | from skhubness.neighbors import (KNeighborsClassifier, 43 | NeighborhoodComponentsAnalysis) 44 | 45 | print(__doc__) 46 | 47 | n_neighbors = 3 48 | random_state = 0 49 | 50 | # Load Digits dataset 51 | digits = datasets.load_digits() 52 | X, y = digits.data, digits.target 53 | 54 | # Split into train/test 55 | X_train, X_test, y_train, y_test = \ 56 | train_test_split(X, y, test_size=0.5, stratify=y, 57 | random_state=random_state) 58 | 59 | dim = len(X[0]) 60 | n_classes = len(np.unique(y)) 61 | 62 | # Reduce dimension to 2 with PCA 63 | pca = make_pipeline(StandardScaler(), 64 | PCA(n_components=2, random_state=random_state)) 65 | 66 | # Reduce dimension to 2 with LinearDiscriminantAnalysis 67 | lda = make_pipeline(StandardScaler(), 68 | LinearDiscriminantAnalysis(n_components=2)) 69 | 70 | # Reduce dimension to 2 with NeighborhoodComponentAnalysis 71 | nca = make_pipeline(StandardScaler(), 72 | NeighborhoodComponentsAnalysis(n_components=2, 73 | random_state=random_state)) 74 | 75 | # Use a nearest neighbor classifier to evaluate the methods 76 | knn = KNeighborsClassifier(n_neighbors=n_neighbors) 77 | 78 | # Make a list of the methods to be compared 79 | dim_reduction_methods = [('PCA', pca), ('LDA', lda), ('NCA', nca)] 80 | 81 | # plt.figure() 82 | for i, (name, model) in enumerate(dim_reduction_methods): 83 | plt.figure() 84 | # plt.subplot(1, 3, i + 1, aspect=1) 85 | 86 | # Fit the method's model 87 | model.fit(X_train, y_train) 88 | 89 | # Fit a nearest neighbor classifier on the embedded training set 90 | knn.fit(model.transform(X_train), y_train) 91 | 92 | # Compute the nearest neighbor accuracy on the embedded test set 93 | acc_knn = knn.score(model.transform(X_test), y_test) 94 | 95 | # Embed the data set in 2 dimensions using the fitted model 96 | X_embedded = model.transform(X) 97 | 98 | # Plot the projected points and show the evaluation score 99 | plt.scatter(X_embedded[:, 0], X_embedded[:, 1], c=y, s=30, cmap='Set1') 100 | plt.title("{}, KNN (k={})\nTest accuracy = {:.2f}".format(name, 101 | n_neighbors, 102 | acc_knn)) 103 | plt.show() 104 | -------------------------------------------------------------------------------- 
/docs/documentation/auto_examples_hr/olivetti_faces.rst: -------------------------------------------------------------------------------- 1 | .. note:: 2 | :class: sphx-glr-download-link-note 3 | 4 | Click :ref:`here ` to download the full example code 5 | .. rst-class:: sphx-glr-example-title 6 | 7 | .. _sphx_glr_documentation_auto_examples_hr_olivetti_faces.py: 8 | 9 | 10 | ================================= 11 | Face recognition (Olivetti faces) 12 | ================================= 13 | 14 | This dataset contains a set of face images taken between April 1992 15 | and April 1994 at AT&T Laboratories Cambridge. 16 | Image data is typically embedded in very high-dimensional spaces, 17 | which might be prone to hubness. 18 | 19 | 20 | .. code-block:: default 21 | 22 | import numpy as np 23 | from sklearn.datasets import olivetti_faces 24 | from sklearn.model_selection import cross_val_score, StratifiedKFold, RandomizedSearchCV 25 | 26 | from skhubness import LegacyHubness 27 | from skhubness.neighbors import KNeighborsClassifier 28 | 29 | # Fetch data and have a look 30 | d = olivetti_faces.fetch_olivetti_faces() 31 | X, y = d['data'], d['target'] 32 | print(f'Data shape: {X.shape}') 33 | print(f'Label shape: {y.shape}') 34 | # (400, 4096) 35 | # (400,) 36 | 37 | # The data is embedded in a high-dimensional space. 38 | # Is there hubness, and can we reduce it? 39 | for hubness in [None, 'dsl', 'ls', 'mp']: 40 | hub = LegacyHubness(k=10, hubness=hubness, return_value='k_skewness') 41 | hub.fit(X) 42 | score = hub.score() 43 | print(f'Hubness (10-skew): {score:.3f} with hubness reduction: {hubness}') 44 | # Hubness (10-skew): 1.972 with hubness reduction: None 45 | # Hubness (10-skew): 1.526 with hubness reduction: dsl 46 | # Hubness (10-skew): 0.943 with hubness reduction: ls 47 | # Hubness (10-skew): 0.184 with hubness reduction: mp 48 | 49 | # There is some hubness, and all hubness reduction methods can reduce it (to varying degree) 50 | # Let's assess the best kNN strategy and its estimated performance. 51 | cv_perf = StratifiedKFold(n_splits=5, shuffle=True, random_state=7263) 52 | cv_select = StratifiedKFold(n_splits=5, shuffle=True, random_state=32634) 53 | 54 | knn = KNeighborsClassifier(algorithm_params={'n_candidates': 100}) 55 | 56 | # specify parameters and distributions to sample from 57 | param_dist = {"n_neighbors": np.arange(1, 26), 58 | "weights": ['uniform', 'distance'], 59 | "hubness": [None, 'dsl', 'ls', 'mp']} 60 | 61 | # Inner cross-validation to select best hyperparameters (incl hubness reduction method) 62 | search = RandomizedSearchCV(estimator=knn, 63 | param_distributions=param_dist, 64 | n_iter=100, 65 | cv=cv_select, 66 | random_state=2345, 67 | verbose=1) 68 | 69 | # Outer cross-validation to estimate performance 70 | score = cross_val_score(search, X, y, cv=cv_perf, verbose=1) 71 | print(f'Scores: {score}') 72 | print(f'Mean acc = {score.mean():.3f} +/- {score.std():.3f}') 73 | 74 | # Select model that maximizes accuracy 75 | search.fit(X, y) 76 | 77 | # The best model's parameters 78 | print(search.best_params_) 79 | 80 | # Does it correspond to the results of hubness reduction above? 81 | # Scores: [0.95 0.9625 1. 0.95 0.925 ] 82 | # Mean acc = 0.957 +/- 0.024 83 | # {'weights': 'distance', 'n_neighbors': 23, 'hubness': 'mp'} 84 | 85 | 86 | .. rst-class:: sphx-glr-timing 87 | 88 | **Total running time of the script:** ( 0 minutes 0.000 seconds) 89 | 90 | 91 | .. _sphx_glr_download_documentation_auto_examples_hr_olivetti_faces.py: 92 | 93 | 94 | .. 
only :: html 95 | 96 | .. container:: sphx-glr-footer 97 | :class: sphx-glr-footer-example 98 | 99 | 100 | 101 | .. container:: sphx-glr-download 102 | 103 | :download:`Download Python source code: olivetti_faces.py ` 104 | 105 | 106 | 107 | .. container:: sphx-glr-download 108 | 109 | :download:`Download Jupyter notebook: olivetti_faces.ipynb ` 110 | 111 | 112 | .. only:: html 113 | 114 | .. rst-class:: sphx-glr-signature 115 | 116 | `Gallery generated by Sphinx-Gallery `_ 117 | -------------------------------------------------------------------------------- /docs/documentation/auto_examples/plot_multioutput_face_completion.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%matplotlib inline" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "\n# Face completion with multi-output estimators\n\n\nThis example shows the use of multi-output estimators to complete images.\nThe goal is to predict the lower half of a face given its upper half.\n\nThe first column of images shows true faces. The next columns illustrate\nhow extremely randomized trees, linear regression, ridge regression,\nand k-nearest neighbors with or without hubness reduction\ncomplete the lower half of those faces.\n\n\nAdapted from ``_\n" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": { 25 | "collapsed": false 26 | }, 27 | "outputs": [], 28 | "source": [ 29 | "print(__doc__)\n\nimport numpy as np\nimport matplotlib.pyplot as plt\n\nfrom sklearn.datasets import fetch_olivetti_faces\nfrom sklearn.utils.validation import check_random_state\n\nfrom sklearn.ensemble import ExtraTreesRegressor\nfrom sklearn.linear_model import LinearRegression\nfrom sklearn.linear_model import RidgeCV\n\nfrom skhubness.neighbors import KNeighborsRegressor\n\n# Load the faces dataset\ndata = fetch_olivetti_faces()\ntargets = data.target\n\ndata = data.images.reshape((len(data.images), -1))\ntrain = data[targets < 30]\ntest = data[targets >= 30]  # Test on independent people\n\n# Test on a subset of people\nn_faces = 5\nrng = check_random_state(4)\nface_ids = rng.randint(test.shape[0], size=(n_faces, ))\ntest = test[face_ids, :]\n\nn_pixels = data.shape[1]\n# Upper half of the faces\nX_train = train[:, :(n_pixels + 1) // 2]\n# Lower half of the faces\ny_train = train[:, n_pixels // 2:]\nX_test = test[:, :(n_pixels + 1) // 2]\ny_test = test[:, n_pixels // 2:]\n\n# Fit estimators\nESTIMATORS = {\n    \"Extra trees\": ExtraTreesRegressor(n_estimators=10, max_features=32,\n                                       random_state=0),\n    \"k-NN\": KNeighborsRegressor(weights='distance'),\n    \"k-NN MP\": KNeighborsRegressor(hubness='mp',\n                                   hubness_params={'method': 'normal'},\n                                   weights='distance'),\n    \"Linear regression\": LinearRegression(),\n    \"Ridge\": RidgeCV(),\n}\n\ny_test_predict = dict()\nfor name, estimator in ESTIMATORS.items():\n    estimator.fit(X_train, y_train)\n    y_test_predict[name] = estimator.predict(X_test)\n\n# Plot the completed faces\nimage_shape = (64, 64)\n\nn_cols = 1 + len(ESTIMATORS)\nplt.figure(figsize=(2. 
* n_cols, 2.26 * n_faces))\nplt.suptitle(\"Face completion with multi-output estimators\", size=16)\n\nfor i in range(n_faces):\n    true_face = np.hstack((X_test[i], y_test[i]))\n\n    if i:\n        sub = plt.subplot(n_faces, n_cols, i * n_cols + 1)\n    else:\n        sub = plt.subplot(n_faces, n_cols, i * n_cols + 1,\n                          title=\"true faces\")\n\n    sub.axis(\"off\")\n    sub.imshow(true_face.reshape(image_shape),\n               cmap=plt.cm.gray,\n               interpolation=\"nearest\")\n\n    for j, est in enumerate(sorted(ESTIMATORS)):\n        completed_face = np.hstack((X_test[i], y_test_predict[est][i]))\n\n        if i:\n            sub = plt.subplot(n_faces, n_cols, i * n_cols + 2 + j)\n\n        else:\n            sub = plt.subplot(n_faces, n_cols, i * n_cols + 2 + j,\n                              title=est)\n\n        sub.axis(\"off\")\n        sub.imshow(completed_face.reshape(image_shape),\n                   cmap=plt.cm.gray,\n                   interpolation=\"nearest\")\n\nplt.show()" 30 | ] 31 | } 32 | ], 33 | "metadata": { 34 | "kernelspec": { 35 | "display_name": "Python 3", 36 | "language": "python", 37 | "name": "python3" 38 | }, 39 | "language_info": { 40 | "codemirror_mode": { 41 | "name": "ipython", 42 | "version": 3 43 | }, 44 | "file_extension": ".py", 45 | "mimetype": "text/x-python", 46 | "name": "python", 47 | "nbconvert_exporter": "python", 48 | "pygments_lexer": "ipython3", 49 | "version": "3.7.4" 50 | } 51 | }, 52 | "nbformat": 4, 53 | "nbformat_minor": 0 54 | } -------------------------------------------------------------------------------- /examples/approximate_neighbors/word_embeddings.py: -------------------------------------------------------------------------------- 1 | """ 2 | ============================= 3 | Retrieving GLOVE word vectors 4 | ============================= 5 | 6 | In this example we will retrieve similar words from 7 | GLOVE embeddings with an ANNG graph. 8 | 9 | Precomputed ground-truth nearest neighbors are available 10 | from `ANN benchmarks `__. 11 | """ 12 | 13 | # For this example, the `h5py` package is required in addition to the requirements of scikit-hubness. 14 | # You may install it from PyPI by the following command (if you're in an IPython/Jupyter environment): 15 | # !pip install h5py 16 | 17 | import numpy as np 18 | import h5py 19 | from skhubness.neighbors import NearestNeighbors 20 | 21 | # Download the dataset with the following command. 22 | # If the dataset is already available in the current working dir, you can skip this: 23 | # !wget http://ann-benchmarks.com/glove-100-angular.hdf5 24 | f = h5py.File('glove-100-angular.hdf5', 'r') 25 | 26 | # Extract the split and ground-truth 27 | X_train = f['train'] 28 | X_test = f['test'] 29 | neigh_true = f['neighbors'] 30 | dist = f['distances'] 31 | 32 | # How many objects have we got? 33 | for k in f.keys(): 34 | print(f'{k}: shape = {f[k].shape}') 35 | 36 | # APPROXIMATE NEAREST NEIGHBOR SEARCH 37 | # In order to retrieve the most similar words from the GLOVE embeddings, 38 | # we use the unsupervised `skhubness.neighbors.NearestNeighbors` class. 39 | # The (approximate) nearest neighbor algorithm is set to LegacyNNG by passing `algorithm='nng'`. 40 | # We can pass additional parameters to `LegacyNNG` via the `algorithm_params` dict. 41 | # Here we set `n_jobs=8` to enable parallelism. 42 | # Create the nearest neighbor index 43 | nn_plain = NearestNeighbors(n_neighbors=100, 44 | algorithm='nng', 45 | algorithm_params={'n_candidates': 1_000, 46 | 'index_dir': 'auto', 47 | 'n_jobs': 8}, 48 | verbose=2, 49 | ) 50 | nn_plain.fit(X_train) 51 | 52 | # Note that LegacyNNG must save its index. 
By setting `index_dir='auto'`, 53 | # LegacyNNG will try to save it to shared memory, if available, otherwise to $TMP. 54 | # This index is NOT removed automatically, as one will typically want to build an index once and use it often. 55 | # Retrieve nearest neighbors for each test object 56 | neigh_pred_plain = nn_plain.kneighbors(X_test, 57 | n_neighbors=100, 58 | return_distance=False) 59 | 60 | # Calculate the recall per test object 61 | recalled_plain = [np.intersect1d(neigh_true[i], neigh_pred_plain[i]) 62 | for i in range(len(X_test))] 63 | recall_plain = np.array([recalled_plain[i].size / neigh_true.shape[1] 64 | for i in range(len(X_test))]) 65 | 66 | # Statistics 67 | print(f'Mean = {recall_plain.mean():.4f}, ' 68 | f'stdev = {recall_plain.std():.4f}') 69 | 70 | 71 | # ANN with HUBNESS REDUCTION 72 | # Here we set `n_candidates=1000`, so that for each query, 73 | # 1000 neighbors will be retrieved first by `LegacyNNG`, 74 | # which are subsequently refined by hubness reduction. 75 | # Hubness reduction is performed by local scaling as specified with `hubness='ls'`. 76 | # Creating the NN index with hubness reduction enabled 77 | nn = NearestNeighbors(n_neighbors=100, 78 | algorithm='nng', 79 | algorithm_params={'n_candidates': 1_000, 80 | 'n_jobs': 8}, 81 | hubness='ls', 82 | verbose=2, 83 | ) 84 | nn.fit(X_train) 85 | 86 | # Retrieve nearest neighbors for each test object 87 | neigh_pred = nn.kneighbors(X_test, 88 | n_neighbors=100, 89 | return_distance=False) 90 | 91 | # Measure recall per object and on average 92 | recalled = [np.intersect1d(neigh_true[i], neigh_pred[i]) 93 | for i in range(len(X_test))] 94 | recall = np.array([recalled[i].size / neigh_true.shape[1] 95 | for i in range(len(X_test))]) 96 | print(f'Mean = {recall.mean():.4f}, ' 97 | f'stdev = {recall.std():.4f}') 98 | 99 | # If the second set of results is significantly better than the first, 100 | # this could indicate that the chosen ANN method is more prone 101 | # to hubness than exact NN, which might be an interesting research question. 102 | -------------------------------------------------------------------------------- /docs/documentation/auto_examples_ann/word_embeddings.py: -------------------------------------------------------------------------------- 1 | """ 2 | ============================= 3 | Retrieving GLOVE word vectors 4 | ============================= 5 | 6 | In this example we will retrieve similar words from 7 | GLOVE embeddings with an ANNG graph. 8 | 9 | Precomputed ground-truth nearest neighbors are available 10 | from `ANN benchmarks `__. 11 | """ 12 | 13 | # For this example, the `h5py` package is required in addition to the requirements of scikit-hubness. 14 | # You may install it from PyPI by the following command (if you're in an IPython/Jupyter environment): 15 | # !pip install h5py 16 | 17 | import numpy as np 18 | import h5py 19 | from skhubness.neighbors import NearestNeighbors 20 | 21 | # Download the dataset with the following command. 22 | # If the dataset is already available in the current working dir, you can skip this: 23 | # !wget http://ann-benchmarks.com/glove-100-angular.hdf5 24 | f = h5py.File('glove-100-angular.hdf5', 'r') 25 | 26 | # Extract the split and ground-truth 27 | X_train = f['train'] 28 | X_test = f['test'] 29 | neigh_true = f['neighbors'] 30 | dist = f['distances'] 31 | 32 | # How many objects have we got? 
33 | for k in f.keys(): 34 | print(f'{k}: shape = {f[k].shape}') 35 | 36 | # APPROXIMATE NEAREST NEIGHBOR SEARCH 37 | # In order to retrieve the most similar words from the GLOVE embeddings, 38 | # we use the unsupervised `skhubness.neighbors.NearestNeighbors` class. 39 | # The (approximate) nearest neighbor algorithm is set to LegacyNNG by passing `algorithm='nng'`. 40 | # We can pass additional parameters to `LegacyNNG` via the `algorithm_params` dict. 41 | # Here we set `n_jobs=8` to enable parallelism. 42 | # Create the nearest neighbor index 43 | nn_plain = NearestNeighbors(n_neighbors=100, 44 | algorithm='nng', 45 | algorithm_params={'n_candidates': 1_000, 46 | 'index_dir': 'auto', 47 | 'n_jobs': 8}, 48 | verbose=2, 49 | ) 50 | nn_plain.fit(X_train) 51 | 52 | # Note that LegacyNNG must save its index. By setting `index_dir='auto'`, 53 | # LegacyNNG will try to save it to shared memory, if available, otherwise to $TMP. 54 | # This index is NOT removed automatically, as one will typically want to build an index once and use it often. 55 | # Retrieve nearest neighbors for each test object 56 | neigh_pred_plain = nn_plain.kneighbors(X_test, 57 | n_neighbors=100, 58 | return_distance=False) 59 | 60 | # Calculate the recall per test object 61 | recalled_plain = [np.intersect1d(neigh_true[i], neigh_pred_plain[i]) 62 | for i in range(len(X_test))] 63 | recall_plain = np.array([recalled_plain[i].size / neigh_true.shape[1] 64 | for i in range(len(X_test))]) 65 | 66 | # Statistics 67 | print(f'Mean = {recall_plain.mean():.4f}, ' 68 | f'stdev = {recall_plain.std():.4f}') 69 | 70 | 71 | # ANN with HUBNESS REDUCTION 72 | # Here we set `n_candidates=1000`, so that for each query, 73 | # 1000 neighbors will be retrieved first by `LegacyNNG`, 74 | # which are subsequently refined by hubness reduction. 75 | # Hubness reduction is performed by local scaling as specified with `hubness='ls'`. 76 | # Creating the NN index with hubness reduction enabled 77 | nn = NearestNeighbors(n_neighbors=100, 78 | algorithm='nng', 79 | algorithm_params={'n_candidates': 1_000, 80 | 'n_jobs': 8}, 81 | hubness='ls', 82 | verbose=2, 83 | ) 84 | nn.fit(X_train) 85 | 86 | # Retrieve nearest neighbors for each test object 87 | neigh_pred = nn.kneighbors(X_test, 88 | n_neighbors=100, 89 | return_distance=False) 90 | 91 | # Measure recall per object and on average 92 | recalled = [np.intersect1d(neigh_true[i], neigh_pred[i]) 93 | for i in range(len(X_test))] 94 | recall = np.array([recalled[i].size / neigh_true.shape[1] 95 | for i in range(len(X_test))]) 96 | print(f'Mean = {recall.mean():.4f}, ' 97 | f'stdev = {recall.std():.4f}') 98 | 99 | # If the second set of results is significantly better than the first, 100 | # this could indicate that the chosen ANN method is more prone 101 | # to hubness than exact NN, which might be an interesting research question. 102 | -------------------------------------------------------------------------------- /docs/documentation/auto_examples/plot_nearest_centroid.rst: -------------------------------------------------------------------------------- 1 | .. note:: 2 | :class: sphx-glr-download-link-note 3 | 4 | Click :ref:`here ` to download the full example code 5 | .. rst-class:: sphx-glr-example-title 6 | 7 | .. _sphx_glr_documentation_auto_examples_plot_nearest_centroid.py: 8 | 9 | 10 | =============================== 11 | Nearest Centroid Classification 12 | =============================== 13 | 14 | Sample usage of Nearest Centroid classification. 15 | It will plot the decision boundaries for each class.
16 | 17 | Note that no hubness reduction is currently implemented for centroids. 18 | However, `skhubness.neighbors` retains all the features of `sklearn.neighbors`, 19 | in order to act as a full drop-in replacement. 20 | 21 | Adapted from ``_ 22 | 23 | 24 | 25 | .. rst-class:: sphx-glr-horizontal 26 | 27 | 28 | * 29 | 30 | .. image:: /documentation/auto_examples/images/sphx_glr_plot_nearest_centroid_001.png 31 | :class: sphx-glr-multi-img 32 | 33 | * 34 | 35 | .. image:: /documentation/auto_examples/images/sphx_glr_plot_nearest_centroid_002.png 36 | :class: sphx-glr-multi-img 37 | 38 | 39 | .. rst-class:: sphx-glr-script-out 40 | 41 | Out: 42 | 43 | .. code-block:: none 44 | 45 | 46 | None 0.8133333333333334 47 | 0.2 0.82 48 | /home/user/feldbauer/PycharmProjects/hubness/examples/sklearn/plot_nearest_centroid.py:64: UserWarning: Matplotlib is currently using agg, which is a non-GUI backend, so cannot show the figure. 49 | plt.show() 50 | 51 | 52 | 53 | 54 | 55 | | 56 | 57 | 58 | .. code-block:: default 59 | 60 | print(__doc__) 61 | 62 | import numpy as np 63 | import matplotlib.pyplot as plt 64 | from matplotlib.colors import ListedColormap 65 | from sklearn import datasets 66 | from skhubness.neighbors import NearestCentroid 67 | 68 | n_neighbors = 15 69 | 70 | # import some data to play with 71 | iris = datasets.load_iris() 72 | # we only take the first two features. We could avoid this ugly 73 | # slicing by using a two-dim dataset 74 | X = iris.data[:, :2] 75 | y = iris.target 76 | 77 | h = .02 # step size in the mesh 78 | 79 | # Create color maps 80 | cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF']) 81 | cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF']) 82 | 83 | for shrinkage in [None, .2]: 84 | # we create an instance of Neighbours Classifier and fit the data. 85 | clf = NearestCentroid(shrink_threshold=shrinkage) 86 | clf.fit(X, y) 87 | y_pred = clf.predict(X) 88 | print(shrinkage, np.mean(y == y_pred)) 89 | # Plot the decision boundary. For that, we will assign a color to each 90 | # point in the mesh [x_min, x_max]x[y_min, y_max]. 91 | x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1 92 | y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1 93 | xx, yy = np.meshgrid(np.arange(x_min, x_max, h), 94 | np.arange(y_min, y_max, h)) 95 | Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]) 96 | 97 | # Put the result into a color plot 98 | Z = Z.reshape(xx.shape) 99 | plt.figure() 100 | plt.pcolormesh(xx, yy, Z, cmap=cmap_light) 101 | 102 | # Plot also the training points 103 | plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold, 104 | edgecolor='k', s=20) 105 | plt.title("3-Class classification (shrink_threshold=%r)" 106 | % shrinkage) 107 | plt.axis('tight') 108 | 109 | plt.show() 110 | 111 | 112 | .. rst-class:: sphx-glr-timing 113 | 114 | **Total running time of the script:** ( 0 minutes 0.737 seconds) 115 | 116 | 117 | .. _sphx_glr_download_documentation_auto_examples_plot_nearest_centroid.py: 118 | 119 | 120 | .. only :: html 121 | 122 | .. container:: sphx-glr-footer 123 | :class: sphx-glr-footer-example 124 | 125 | 126 | 127 | .. container:: sphx-glr-download 128 | 129 | :download:`Download Python source code: plot_nearest_centroid.py ` 130 | 131 | 132 | 133 | .. container:: sphx-glr-download 134 | 135 | :download:`Download Jupyter notebook: plot_nearest_centroid.ipynb ` 136 | 137 | 138 | .. only:: html 139 | 140 | .. 
rst-class:: sphx-glr-signature 141 | 142 | `Gallery generated by Sphinx-Gallery `_ 143 | -------------------------------------------------------------------------------- /docs/documentation/auto_examples/plot_nca_dim_reduction.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%matplotlib inline" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "\n# Dimensionality Reduction with Neighborhood Components Analysis\n\n\nSample usage of Neighborhood Components Analysis for dimensionality reduction.\n\nThis example compares different (linear) dimensionality reduction methods\napplied on the Digits data set. The data set contains images of digits from\n0 to 9 with approximately 180 samples of each class. Each image is of\ndimension 8x8 = 64, and is reduced to a two-dimensional data point.\n\nPrincipal Component Analysis (PCA) applied to this data identifies the\ncombination of attributes (principal components, or directions in the\nfeature space) that account for the most variance in the data. Here we\nplot the different samples on the 2 first principal components.\n\nLinear Discriminant Analysis (LDA) tries to identify attributes that\naccount for the most variance *between classes*. In particular,\nLDA, in contrast to PCA, is a supervised method, using known class labels.\n\nNeighborhood Components Analysis (NCA) tries to find a feature space such\nthat a stochastic nearest neighbor algorithm will give the best accuracy.\nLike LDA, it is a supervised method.\n\nOne can see that NCA enforces a clustering of the data that is visually\nmeaningful despite the large reduction in dimension.\n\nAdapted from ``_\n" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": { 25 | "collapsed": false 26 | }, 27 | "outputs": [], 28 | "source": [ 29 | "# License: BSD 3 clause\n\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom sklearn import datasets\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.decomposition import PCA\nfrom sklearn.discriminant_analysis import LinearDiscriminantAnalysis\nfrom sklearn.pipeline import make_pipeline\nfrom sklearn.preprocessing import StandardScaler\n\nfrom skhubness.neighbors import (KNeighborsClassifier,\n NeighborhoodComponentsAnalysis)\n\nprint(__doc__)\n\nn_neighbors = 3\nrandom_state = 0\n\n# Load Digits dataset\ndigits = datasets.load_digits()\nX, y = digits.data, digits.target\n\n# Split into train/test\nX_train, X_test, y_train, y_test = \\\n train_test_split(X, y, test_size=0.5, stratify=y,\n random_state=random_state)\n\ndim = len(X[0])\nn_classes = len(np.unique(y))\n\n# Reduce dimension to 2 with PCA\npca = make_pipeline(StandardScaler(),\n PCA(n_components=2, random_state=random_state))\n\n# Reduce dimension to 2 with LinearDiscriminantAnalysis\nlda = make_pipeline(StandardScaler(),\n LinearDiscriminantAnalysis(n_components=2))\n\n# Reduce dimension to 2 with NeighborhoodComponentAnalysis\nnca = make_pipeline(StandardScaler(),\n NeighborhoodComponentsAnalysis(n_components=2,\n random_state=random_state))\n\n# Use a nearest neighbor classifier to evaluate the methods\nknn = KNeighborsClassifier(n_neighbors=n_neighbors)\n\n# Make a list of the methods to be compared\ndim_reduction_methods = [('PCA', pca), ('LDA', lda), ('NCA', nca)]\n\n# 
plt.figure()\nfor i, (name, model) in enumerate(dim_reduction_methods):\n plt.figure()\n # plt.subplot(1, 3, i + 1, aspect=1)\n\n # Fit the method's model\n model.fit(X_train, y_train)\n\n # Fit a nearest neighbor classifier on the embedded training set\n knn.fit(model.transform(X_train), y_train)\n\n # Compute the nearest neighbor accuracy on the embedded test set\n acc_knn = knn.score(model.transform(X_test), y_test)\n\n # Embed the data set in 2 dimensions using the fitted model\n X_embedded = model.transform(X)\n\n # Plot the projected points and show the evaluation score\n plt.scatter(X_embedded[:, 0], X_embedded[:, 1], c=y, s=30, cmap='Set1')\n plt.title(\"{}, KNN (k={})\\nTest accuracy = {:.2f}\".format(name,\n n_neighbors,\n acc_knn))\nplt.show()" 30 | ] 31 | } 32 | ], 33 | "metadata": { 34 | "kernelspec": { 35 | "display_name": "Python 3", 36 | "language": "python", 37 | "name": "python3" 38 | }, 39 | "language_info": { 40 | "codemirror_mode": { 41 | "name": "ipython", 42 | "version": 3 43 | }, 44 | "file_extension": ".py", 45 | "mimetype": "text/x-python", 46 | "name": "python", 47 | "nbconvert_exporter": "python", 48 | "pygments_lexer": "ipython3", 49 | "version": "3.7.4" 50 | } 51 | }, 52 | "nbformat": 4, 53 | "nbformat_minor": 0 54 | } --------------------------------------------------------------------------------