├── .coveragerc ├── .gitignore ├── .travis.yml ├── LICENSE.txt ├── MANIFEST.in ├── README.rst ├── docs ├── Makefile ├── conf.py ├── index.rst ├── source │ ├── hub_toolbox.rst │ └── modules.rst └── user │ ├── installation.rst │ ├── matlab_vs_python.rst │ └── tutorial.rst ├── hub_toolbox ├── __init__.py ├── approximate.py ├── centering.py ├── distances.py ├── example_datasets │ ├── ABOUT │ ├── dexter_train.data │ └── dexter_train.labels ├── global_scaling.py ├── goodman_kruskal.py ├── htlogging.py ├── hubness.py ├── hubness_analysis.py ├── intrinsic_dimension.py ├── io.py ├── knn_classification.py ├── local_scaling.py ├── shared_neighbors.py └── utils.py ├── readthedocs.yml ├── readthedocs_requirements.txt ├── requirements.txt ├── setup.py └── tests ├── __init__.py ├── approximate_test.py ├── centering_test.py ├── distances_test.py ├── goodmankruskal_test.py ├── hubness_test.py ├── hubnessanalysis_test.py ├── intrinsicdim_test.py ├── io_test.py ├── knn_test.py ├── localscaling_test.py ├── logging_test.py ├── mutualproximity_test.py └── sharednn_test.py /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | omit = 3 | tests/* 4 | setup.py 5 | branch = True 6 | parallel = True 7 | concurrency = multiprocessing 8 | 9 | [report] 10 | exclude_lines = 11 | pragma: no cover 12 | def __repr__ 13 | raise AssertionError 14 | raise NotImplementedError 15 | if __name__ == .__main__.: -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | build/ 3 | dist/ 4 | hub_toolbox.egg-info/ 5 | 6 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | sudo: true 3 | dist: xenial 4 | python: 5 | - '3.6' 6 | - '3.7' 7 | install: 8 | - sudo apt-get update 9 | - wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh; 10 | - bash miniconda.sh -b -p $HOME/miniconda; 11 | - export PATH="$HOME/miniconda/bin:$PATH" 12 | - hash -r 13 | - conda config --set always_yes yes --set changeps1 no 14 | - conda update -q conda 15 | - conda info -a 16 | - conda create -q -n test-environment python=$TRAVIS_PYTHON_VERSION numpy scipy pandas scikit-learn 17 | coverage 18 | - source activate test-environment 19 | - pip install pybind11 20 | - pip install coveralls joblib falconn nmslib 21 | - python setup.py build 22 | - python setup.py install 23 | script: 24 | - python setup.py test 25 | - coverage run setup.py test 26 | - coverage combine # required for multiprocessing 27 | after_success: 28 | - coveralls 29 | cache: 30 | - apt 31 | - directories: 32 | - "$HOME/.cache/pip" 33 | deploy: 34 | - provider: releases 35 | api_key: 36 | secure: 
"kyQ/EbH2A4fedw49y2hATgSnoonY7r1zf/jiFRaXy+ZqcnOlileDAzJihd7RHeeRN9wL2Gw1L+U3s62AwjddiGZRz9Qv87Ub/bvJVU+aNB0uHlxSRjw3Q8zhtO0hDEyp4wWQnhPqNhGEJCfrRVuAYG956XdgdpfdL6ZdSPWaHt+nj7yfDEwZ/5iiU8UpjgxZzAgO3k7EIvW188dl75SgL9xf5eYxTXjf2NNNbvpvUXvgrpAMUkTCKix5EHMJcnoKDqlNqnURBQI+f/TpqBoO3g+F+KfF/wLIwmiqJIKALhTsTfyHb+Auv+evJ/kxtWe+GSkeF9+SBT5RdCZx3uh6U9RVm/soy12nf88f344HgS/xnj5WLfqPcG53gdwHdoKbA41OzCNGJ66mTtQfNtVFnsYfphU2fZ7yTq3JHxRwknAWDeKWh9cZixf6U8Y9Pi4vpkDNyl56sHDSlroZltRSz37M3grQOJ3kKdPfB1XOTH6nhN2yiuv1047mSj0WVGDsIwFGECc/iUDvUtdY6cliAiC0rRZX1A/axLQKW8LD3GpBSgXmPS1hJy+l8iPiiHjJvwrldz5t0OMeHfvW2ln7jMqN/yirQiHqJJN7oWKYM3qrTCK0pEJg6KS+eje4GOfMSPl2+/RkJF8ViZPfCpE37HmjsZYAHdHKD8dX89C9Y5I=" 37 | #file: '' 38 | on: 39 | repo: OFAI/hub-toolbox-python3 40 | tags: true 41 | - provider: pypi 42 | user: "feldbauer" 43 | password: 44 | secure: "fSerf/lsApqvZjbNYmOSAuG33+TyW7aKsM/aS2pItzr9u3GQSSQls0Lo+yCduV3/12joBKh4G9k+SXqZnxmdMFPJ/L2RT9ZCx28HFrD+mGakoFX2nVqVhxnqw2bfSg4Wndw1fyNcimYQGNhHHp1WECTFjZInV162719cwID6fLaVzn1AHM8LcR1WPoO5RPZJ/0KhAuxbpkMsoMp5EDJtAxDgn7QGnyTZfwo8jV4ZlUGVTiKYbiPBvLPZ3eTp7b88x4X846X2QzdBHfQ6Qr6nzA6IOJAzkZ+NNpEhDQlQRX44ty0JR0jd5Bz3IypFodZVtDguz29L5oCcuYxJGaul1ANpoqfPZ4vR6b9FkWf3CQW1BNXd5SLVbscf9l4yorDUX4KeagvPJ2z65Y/IaTIoMZjgeZX0/Pm0rcuRFkn/6KobK+lG1IaLMs6F7H7LM+TJn5v9tUYNDbPthPbr7kGmm0E5OtwX8+QZD9h9ufAPgEnsvJkLurus5HbxUiSyARE1SwayBKatJAOY3AyjC3t3tjDSWY+FVTSPbvEbMIg3BMQy0NP0oRzNJLBJ5ZbFO1bcGpMEiqbYim9ZgYonagXsmhfWzRaWughkHZABZZMyFW4uhmyvDV3SiMZpM3wE7DiWm/Oq2PhkAvLJkW085qbjiw6wFxqjgSJ3amCeH3/ZdAY=" 45 | on: 46 | tags: true 47 | # safelist 48 | branches: 49 | only: 50 | - master 51 | - develop 52 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.txt 2 | include README.rst 3 | recursive-include hub_toolbox * 4 | recursive-include tests *.py -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | .. image:: https://badge.fury.io/py/hub-toolbox.svg 2 | :target: https://badge.fury.io/py/hub-toolbox 3 | 4 | .. image:: https://readthedocs.org/projects/hub-toolbox-python3/badge/?version=latest 5 | :target: http://hub-toolbox-python3.readthedocs.io/en/latest/?badge=latest 6 | :alt: Documentation Status 7 | 8 | .. image:: https://travis-ci.org/OFAI/hub-toolbox-python3.svg?branch=master 9 | :target: https://travis-ci.org/OFAI/hub-toolbox-python3 10 | 11 | .. image:: https://coveralls.io/repos/github/OFAI/hub-toolbox-python3/badge.svg?branch=master 12 | :target: https://coveralls.io/github/OFAI/hub-toolbox-python3?branch=master 13 | 14 | .. image:: https://img.shields.io/aur/license/yaourt.svg?maxAge=2592000 15 | :target: https://github.com/OFAI/hub-toolbox-python3/blob/master/LICENSE.txt 16 | 17 | 18 | HUB-TOOLBOX 19 | =========== 20 | 21 | #----------------------------------------------------------------------------------- 22 | 23 | Checkout our new project `scikit-hubness `_ 24 | which provides the functionality of the Hub-Toolbox while integrating nicely into 25 | `scikit-learn` workflows. 26 | 27 | Use `skhubness.neighbors` as a drop-in replacement for `sklearn.neighbors`. 28 | It offers the same functionality and adds transparent support for hubness reduction, 29 | approximate nearest neighbor search (HNSW, LSH), and approximate hubness reduction. 
30 | 31 | We strive to improve the usability of hubness reduction with the development of 32 | `scikit-hubness`, and we are very interested in 33 | `user feedback `_! 34 | 35 | #----------------------------------------------------------------------------------- 36 | 37 | The Hub Toolbox is a software suite for hubness analysis and 38 | hubness reduction in high-dimensional data. 39 | 40 | It allows you to 41 | 42 | - analyze whether your datasets show hubness 43 | - reduce hubness via a variety of different techniques 44 | (including scaling and centering approaches) 45 | and obtain secondary distances for downstream analysis inside or 46 | outside the Hub Toolbox 47 | - perform evaluation tasks with both internal and external measures 48 | (e.g. the Goodman-Kruskal index and k-NN classification) 49 | - NEW IN 2.5: 50 | The ``approximate`` module provides approximate hubness reduction methods 51 | with linear complexity, which make it feasible to analyze large datasets. 52 | - NEW IN 2.5: 53 | Measure hubness with the recently proposed Robin-Hood index 54 | for fast and reliable hubness estimation. 55 | 56 | Installation 57 | ------------ 58 | 59 | Make sure you have a working Python 3 environment (at least 3.6) with the 60 | numpy, scipy, and scikit-learn packages. Use pip3 to install the latest 61 | stable version: 62 | 63 | .. code-block:: bash 64 | 65 | pip3 install hub-toolbox 66 | 67 | For more details and alternatives, please see the `Installation instructions 68 | `_. 69 | 70 | Documentation 71 | ------------- 72 | 73 | Documentation is available online: 74 | http://hub-toolbox-python3.readthedocs.io/en/latest/index.html 75 | 76 | Example 77 | ------- 78 | 79 | To run a full hubness analysis on the example dataset (DEXTER) 80 | using some of the provided hubness reduction methods, 81 | simply run the following in a Python shell: 82 | 83 | .. code-block:: python 84 | 85 | >>> from hub_toolbox.HubnessAnalysis import HubnessAnalysis 86 | >>> ana = HubnessAnalysis() 87 | >>> ana.analyze_hubness() 88 | 89 | You can also conduct the individual analysis steps yourself: 90 | 91 | ..
code-block:: python 92 | 93 | import hub_toolbox 94 | 95 | # load the DEXTER example dataset 96 | D, labels, vectors = hub_toolbox.io.load_dexter() 97 | 98 | # calculate intrinsic dimension estimate 99 | d_mle = hub_toolbox.intrinsic_dimension.intrinsic_dimension(vectors) 100 | 101 | # calculate hubness (here, skewness of the 5-occurrence) 102 | S_k, _, _ = hub_toolbox.hubness.hubness(D=D, k=5, metric='distance') 103 | 104 | # perform k-NN classification LOO-CV for two different values of k 105 | acc, _, _ = hub_toolbox.knn_classification.score( 106 | D=D, target=labels, k=[1,5], metric='distance') 107 | 108 | # calculate Goodman-Kruskal index 109 | gamma = hub_toolbox.goodman_kruskal.goodman_kruskal_index( 110 | D=D, classes=labels, metric='distance') 111 | 112 | # Reduce hubness with Mutual Proximity (Empiric distance distribution) 113 | D_mp = hub_toolbox.global_scaling.mutual_proximity_empiric( 114 | D=D, metric='distance') 115 | 116 | # Reduce hubness with Local Scaling variant NICDM 117 | D_nicdm = hub_toolbox.local_scaling.nicdm(D=D, k=10, metric='distance') 118 | 119 | # Check whether indices improve after hubness reduction 120 | S_k_mp, _, _ = hub_toolbox.hubness.hubness(D=D_mp, k=5, metric='distance') 121 | acc_mp, _, _ = hub_toolbox.knn_classification.score( 122 | D=D_mp, target=labels, k=[1,5], metric='distance') 123 | gamma_mp = hub_toolbox.goodman_kruskal.goodman_kruskal_index( 124 | D=D_mp, classes=labels, metric='distance') 125 | 126 | # Repeat the last steps for all secondary distances you calculated 127 | ... 128 | 129 | Check the `Tutorial 130 | `_ 131 | for in-depth explanations of these steps. 132 | 133 | 134 | Development 135 | ----------- 136 | 137 | Development of the Hub Toolbox has finished. Check out its successor 138 | `scikit-hubness `_ for fully 139 | scikit-learn compatible hubness analysis and approximate neighbor search. 140 | 141 | .. code-block:: text 142 | 143 | (c) 2011-2018, Dominik Schnitzer and Roman Feldbauer 144 | Austrian Research Institute for Artificial Intelligence (OFAI) 145 | Contact: 146 | 147 | Citation 148 | -------- 149 | 150 | If you use the Hub Toolbox in your scientific publication, please cite: 151 | 152 | .. code-block:: text 153 | 154 | @InProceedings{Feldbauer2018b, 155 | author = {Roman Feldbauer and Maximilian Leodolter and Claudia Plant and Arthur Flexer}, 156 | title = {Fast Approximate Hubness Reduction for Large High-Dimensional Data}, 157 | booktitle = {2018 {IEEE} International Conference on Big Knowledge, {ICBK} 2018, Singapore, November 17-18, 2018}, 158 | year = {2018}, 159 | editor = {Xindong Wu and Yew{-}Soon Ong and Charu C. Aggarwal and Huanhuan Chen}, 160 | pages = {358--367}, 161 | publisher = {{IEEE} Computer Society}, 162 | bibsource = {dblp computer science bibliography, https://dblp.org}, 163 | biburl = {https://dblp.org/rec/conf/icbk/FeldbauerLPF18.bib}, 164 | doi = {10.1109/ICBK.2018.00055}, 165 | } 166 | 167 | Relevant literature: 168 | 169 | 2018: ``Fast approximate hubness reduction for large high-dimensional data``, available as 170 | technical report at ``_. 171 | 172 | 2018: ``A comprehensive empirical comparison of hubness reduction in high-dimensional spaces``, 173 | full paper available at https://doi.org/10.1007/s10115-018-1205-y 174 | 175 | 2016: ``Centering Versus Scaling for Hubness Reduction``, available as technical report 176 | at ``_ . 177 | 178 | 2012: ``Local and Global Scaling Reduce Hubs in Space``, full paper available at 179 | ``_ .
180 | 181 | License 182 | ------- 183 | The HUB TOOLBOX is licensed under the terms of the GNU GPLv3. 184 | 185 | Acknowledgements 186 | ---------------- 187 | PyVmMonitor is being used to support the development of this free open source 188 | software package. For more information go to http://www.pyvmmonitor.com 189 | 190 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # User-friendly check for sphinx-build 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) 13 | endif 14 | 15 | # Internal variables. 16 | PAPEROPT_a4 = -D latex_paper_size=a4 17 | PAPEROPT_letter = -D latex_paper_size=letter 18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 19 | # the i18n builder cannot share the environment and doctrees with the others 20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 21 | 22 | .PHONY: help 23 | help: 24 | @echo "Please use \`make ' where is one of" 25 | @echo " html to make standalone HTML files" 26 | @echo " dirhtml to make HTML files named index.html in directories" 27 | @echo " singlehtml to make a single large HTML file" 28 | @echo " pickle to make pickle files" 29 | @echo " json to make JSON files" 30 | @echo " htmlhelp to make HTML files and a HTML help project" 31 | @echo " qthelp to make HTML files and a qthelp project" 32 | @echo " applehelp to make an Apple Help Book" 33 | @echo " devhelp to make HTML files and a Devhelp project" 34 | @echo " epub to make an epub" 35 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 36 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 37 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 38 | @echo " text to make text files" 39 | @echo " man to make manual pages" 40 | @echo " texinfo to make Texinfo files" 41 | @echo " info to make Texinfo files and run them through makeinfo" 42 | @echo " gettext to make PO message catalogs" 43 | @echo " changes to make an overview of all changed/added/deprecated items" 44 | @echo " xml to make Docutils-native XML files" 45 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 46 | @echo " linkcheck to check all external links for integrity" 47 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 48 | @echo " coverage to run coverage check of the documentation (if enabled)" 49 | 50 | .PHONY: clean 51 | clean: 52 | rm -rf $(BUILDDIR)/* 53 | 54 | .PHONY: html 55 | html: 56 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 57 | @echo 58 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 59 | 60 | .PHONY: dirhtml 61 | dirhtml: 62 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 63 | @echo 64 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 
65 | 66 | .PHONY: singlehtml 67 | singlehtml: 68 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 69 | @echo 70 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 71 | 72 | .PHONY: pickle 73 | pickle: 74 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 75 | @echo 76 | @echo "Build finished; now you can process the pickle files." 77 | 78 | .PHONY: json 79 | json: 80 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 81 | @echo 82 | @echo "Build finished; now you can process the JSON files." 83 | 84 | .PHONY: htmlhelp 85 | htmlhelp: 86 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 87 | @echo 88 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 89 | ".hhp project file in $(BUILDDIR)/htmlhelp." 90 | 91 | .PHONY: qthelp 92 | qthelp: 93 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 94 | @echo 95 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 96 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 97 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/hub-toolbox.qhcp" 98 | @echo "To view the help file:" 99 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/hub-toolbox.qhc" 100 | 101 | .PHONY: applehelp 102 | applehelp: 103 | $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp 104 | @echo 105 | @echo "Build finished. The help book is in $(BUILDDIR)/applehelp." 106 | @echo "N.B. You won't be able to view it unless you put it in" \ 107 | "~/Library/Documentation/Help or install it in your application" \ 108 | "bundle." 109 | 110 | .PHONY: devhelp 111 | devhelp: 112 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 113 | @echo 114 | @echo "Build finished." 115 | @echo "To view the help file:" 116 | @echo "# mkdir -p $$HOME/.local/share/devhelp/hub-toolbox" 117 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/hub-toolbox" 118 | @echo "# devhelp" 119 | 120 | .PHONY: epub 121 | epub: 122 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 123 | @echo 124 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 125 | 126 | .PHONY: latex 127 | latex: 128 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 129 | @echo 130 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 131 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 132 | "(use \`make latexpdf' here to do that automatically)." 133 | 134 | .PHONY: latexpdf 135 | latexpdf: 136 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 137 | @echo "Running LaTeX files through pdflatex..." 138 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 139 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 140 | 141 | .PHONY: latexpdfja 142 | latexpdfja: 143 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 144 | @echo "Running LaTeX files through platex and dvipdfmx..." 145 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 146 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 147 | 148 | .PHONY: text 149 | text: 150 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 151 | @echo 152 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 153 | 154 | .PHONY: man 155 | man: 156 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 157 | @echo 158 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 
159 | 160 | .PHONY: texinfo 161 | texinfo: 162 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 163 | @echo 164 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 165 | @echo "Run \`make' in that directory to run these through makeinfo" \ 166 | "(use \`make info' here to do that automatically)." 167 | 168 | .PHONY: info 169 | info: 170 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 171 | @echo "Running Texinfo files through makeinfo..." 172 | make -C $(BUILDDIR)/texinfo info 173 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 174 | 175 | .PHONY: gettext 176 | gettext: 177 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 178 | @echo 179 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 180 | 181 | .PHONY: changes 182 | changes: 183 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 184 | @echo 185 | @echo "The overview file is in $(BUILDDIR)/changes." 186 | 187 | .PHONY: linkcheck 188 | linkcheck: 189 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 190 | @echo 191 | @echo "Link check complete; look for any errors in the above output " \ 192 | "or in $(BUILDDIR)/linkcheck/output.txt." 193 | 194 | .PHONY: doctest 195 | doctest: 196 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 197 | @echo "Testing of doctests in the sources finished, look at the " \ 198 | "results in $(BUILDDIR)/doctest/output.txt." 199 | 200 | .PHONY: coverage 201 | coverage: 202 | $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage 203 | @echo "Testing of coverage in the sources finished, look at the " \ 204 | "results in $(BUILDDIR)/coverage/python.txt." 205 | 206 | .PHONY: xml 207 | xml: 208 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 209 | @echo 210 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 211 | 212 | .PHONY: pseudoxml 213 | pseudoxml: 214 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 215 | @echo 216 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 217 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # 4 | # hub-toolbox documentation build configuration file, created by 5 | # sphinx-quickstart on Wed Aug 24 14:36:53 2016. 6 | # 7 | # This file is execfile()d with the current directory set to its 8 | # containing dir. 9 | # 10 | # Note that not all possible configuration values are present in this 11 | # autogenerated file. 12 | # 13 | # All configuration values have a default; values that are commented out 14 | # serve to show the default. 15 | 16 | import sys 17 | import os 18 | 19 | # If extensions (or modules to document with autodoc) are in another directory, 20 | # add these directories to sys.path here. If the directory is relative to the 21 | # documentation root, use os.path.abspath to make it absolute, like shown here. 22 | sys.path.insert(0, os.path.abspath('../')) 23 | 24 | # -- General configuration ------------------------------------------------ 25 | 26 | # If your documentation needs a minimal Sphinx version, state it here. 27 | #needs_sphinx = '1.0' 28 | 29 | # Add any Sphinx extension module names here, as strings. They can be 30 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 31 | # ones. 
32 | extensions = [ 33 | 'sphinx.ext.autodoc', 34 | 'sphinx.ext.doctest', 35 | 'sphinx.ext.coverage', 36 | 'sphinx.ext.napoleon', 37 | 'sphinx.ext.mathjax', 38 | 'sphinx.ext.viewcode', 39 | ] 40 | 41 | # Add any paths that contain templates here, relative to this directory. 42 | templates_path = ['_templates'] 43 | 44 | # The suffix(es) of source filenames. 45 | # You can specify multiple suffix as a list of string: 46 | # source_suffix = ['.rst', '.md'] 47 | source_suffix = '.rst' 48 | 49 | # The encoding of source files. 50 | #source_encoding = 'utf-8-sig' 51 | 52 | # The master toctree document. 53 | master_doc = 'index' 54 | 55 | # General information about the project. 56 | project = 'hub-toolbox' 57 | copyright = '2016, Roman Feldbauer' 58 | author = 'Roman Feldbauer' 59 | 60 | # The version info for the project you're documenting, acts as replacement for 61 | # |version| and |release|, also used in various other places throughout the 62 | # built documents. 63 | # 64 | # The short X.Y version. 65 | version = '2.3' 66 | # The full version, including alpha/beta/rc tags. 67 | release = '2.3' 68 | 69 | # The language for content autogenerated by Sphinx. Refer to documentation 70 | # for a list of supported languages. 71 | # 72 | # This is also used if you do content translation via gettext catalogs. 73 | # Usually you set "language" from the command line for these cases. 74 | language = None 75 | 76 | # There are two options for replacing |today|: either, you set today to some 77 | # non-false value, then it is used: 78 | #today = '' 79 | # Else, today_fmt is used as the format for a strftime call. 80 | #today_fmt = '%B %d, %Y' 81 | 82 | # List of patterns, relative to source directory, that match files and 83 | # directories to ignore when looking for source files. 84 | exclude_patterns = ['_build'] 85 | 86 | # The reST default role (used for this markup: `text`) to use for all 87 | # documents. 88 | #default_role = None 89 | 90 | # If true, '()' will be appended to :func: etc. cross-reference text. 91 | #add_function_parentheses = True 92 | 93 | # If true, the current module name will be prepended to all description 94 | # unit titles (such as .. function::). 95 | #add_module_names = True 96 | 97 | # If true, sectionauthor and moduleauthor directives will be shown in the 98 | # output. They are ignored by default. 99 | #show_authors = False 100 | 101 | # The name of the Pygments (syntax highlighting) style to use. 102 | pygments_style = 'sphinx' 103 | 104 | # A list of ignored prefixes for module index sorting. 105 | #modindex_common_prefix = [] 106 | 107 | # If true, keep warnings as "system message" paragraphs in the built documents. 108 | #keep_warnings = False 109 | 110 | # If true, `todo` and `todoList` produce output, else they produce nothing. 111 | todo_include_todos = False 112 | 113 | 114 | # -- Options for HTML output ---------------------------------------------- 115 | 116 | # The theme to use for HTML and HTML Help pages. See the documentation for 117 | # a list of builtin themes. 118 | #html_theme = 'alabaster' 119 | html_theme = 'sphinx_rtd_theme' 120 | 121 | # Theme options are theme-specific and customize the look and feel of a theme 122 | # further. For a list of options available for each theme, see the 123 | # documentation. 124 | #html_theme_options = {} 125 | 126 | # Add any paths that contain custom themes here, relative to this directory. 127 | #html_theme_path = [] 128 | 129 | # The name for this set of Sphinx documents. 
If None, it defaults to 130 | # " v documentation". 131 | #html_title = None 132 | 133 | # A shorter title for the navigation bar. Default is the same as html_title. 134 | #html_short_title = None 135 | 136 | # The name of an image file (relative to this directory) to place at the top 137 | # of the sidebar. 138 | #html_logo = None 139 | 140 | # The name of an image file (within the static path) to use as favicon of the 141 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 142 | # pixels large. 143 | #html_favicon = None 144 | 145 | # Add any paths that contain custom static files (such as style sheets) here, 146 | # relative to this directory. They are copied after the builtin static files, 147 | # so a file named "default.css" will overwrite the builtin "default.css". 148 | html_static_path = ['_static'] 149 | 150 | # Add any extra paths that contain custom files (such as robots.txt or 151 | # .htaccess) here, relative to this directory. These files are copied 152 | # directly to the root of the documentation. 153 | #html_extra_path = [] 154 | 155 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 156 | # using the given strftime format. 157 | #html_last_updated_fmt = '%b %d, %Y' 158 | 159 | # If true, SmartyPants will be used to convert quotes and dashes to 160 | # typographically correct entities. 161 | #html_use_smartypants = True 162 | 163 | # Custom sidebar templates, maps document names to template names. 164 | #html_sidebars = {} 165 | 166 | # Additional templates that should be rendered to pages, maps page names to 167 | # template names. 168 | #html_additional_pages = {} 169 | 170 | # If false, no module index is generated. 171 | #html_domain_indices = True 172 | 173 | # If false, no index is generated. 174 | #html_use_index = True 175 | 176 | # If true, the index is split into individual pages for each letter. 177 | #html_split_index = False 178 | 179 | # If true, links to the reST sources are added to the pages. 180 | #html_show_sourcelink = True 181 | 182 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 183 | #html_show_sphinx = True 184 | 185 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 186 | #html_show_copyright = True 187 | 188 | # If true, an OpenSearch description file will be output, and all pages will 189 | # contain a tag referring to it. The value of this option must be the 190 | # base URL from which the finished HTML is served. 191 | #html_use_opensearch = '' 192 | 193 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 194 | #html_file_suffix = None 195 | 196 | # Language to be used for generating the HTML full-text search index. 197 | # Sphinx supports the following languages: 198 | # 'da', 'de', 'en', 'es', 'fi', 'fr', 'h', 'it', 'ja' 199 | # 'nl', 'no', 'pt', 'ro', 'r', 'sv', 'tr' 200 | #html_search_language = 'en' 201 | 202 | # A dictionary with options for the search language support, empty by default. 203 | # Now only 'ja' uses this config value 204 | #html_search_options = {'type': 'default'} 205 | 206 | # The name of a javascript file (relative to the configuration directory) that 207 | # implements a search results scorer. If empty, the default will be used. 208 | #html_search_scorer = 'scorer.js' 209 | 210 | # Output file base name for HTML help builder. 
211 | htmlhelp_basename = 'hub-toolboxdoc' 212 | 213 | # -- Options for LaTeX output --------------------------------------------- 214 | 215 | latex_elements = { 216 | # The paper size ('letterpaper' or 'a4paper'). 217 | #'papersize': 'letterpaper', 218 | 219 | # The font size ('10pt', '11pt' or '12pt'). 220 | #'pointsize': '10pt', 221 | 222 | # Additional stuff for the LaTeX preamble. 223 | #'preamble': '', 224 | 225 | # Latex figure (float) alignment 226 | #'figure_align': 'htbp', 227 | } 228 | 229 | # Grouping the document tree into LaTeX files. List of tuples 230 | # (source start file, target name, title, 231 | # author, documentclass [howto, manual, or own class]). 232 | latex_documents = [ 233 | (master_doc, 'hub-toolbox.tex', 'hub-toolbox Documentation', 234 | 'Roman Feldbauer', 'manual'), 235 | ] 236 | 237 | # The name of an image file (relative to this directory) to place at the top of 238 | # the title page. 239 | #latex_logo = None 240 | 241 | # For "manual" documents, if this is true, then toplevel headings are parts, 242 | # not chapters. 243 | #latex_use_parts = False 244 | 245 | # If true, show page references after internal links. 246 | #latex_show_pagerefs = False 247 | 248 | # If true, show URL addresses after external links. 249 | #latex_show_urls = False 250 | 251 | # Documents to append as an appendix to all manuals. 252 | #latex_appendices = [] 253 | 254 | # If false, no module index is generated. 255 | #latex_domain_indices = True 256 | 257 | 258 | # -- Options for manual page output --------------------------------------- 259 | 260 | # One entry per manual page. List of tuples 261 | # (source start file, name, description, authors, manual section). 262 | man_pages = [ 263 | (master_doc, 'hub-toolbox', 'hub-toolbox Documentation', 264 | [author], 1) 265 | ] 266 | 267 | # If true, show URL addresses after external links. 268 | #man_show_urls = False 269 | 270 | 271 | # -- Options for Texinfo output ------------------------------------------- 272 | 273 | # Grouping the document tree into Texinfo files. List of tuples 274 | # (source start file, target name, title, author, 275 | # dir menu entry, description, category) 276 | texinfo_documents = [ 277 | (master_doc, 'hub-toolbox', 'hub-toolbox Documentation', 278 | author, 'hub-toolbox', 'One line description of project.', 279 | 'Miscellaneous'), 280 | ] 281 | 282 | # Documents to append as an appendix to all manuals. 283 | #texinfo_appendices = [] 284 | 285 | # If false, no module index is generated. 286 | #texinfo_domain_indices = True 287 | 288 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 289 | #texinfo_show_urls = 'footnote' 290 | 291 | # If true, do not generate a @detailmenu in the "Top" node's menu. 292 | #texinfo_no_detailmenu = False 293 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | Welcome to the HUB TOOLBOX! 2 | =========================== 3 | 4 | The Hub Toolbox is a software suite for hubness analysis and hubness reduction 5 | in high-dimensional data. 6 | 7 | User Guide 8 | ------------ 9 | 10 | The user guide explains how to install the Hub Toolbox, how to analyze your 11 | data sets for hubness, and how to use the Hub Toolbox to lift this 12 | *curse of dimensionality*. 13 | 14 | .. 
toctree:: 15 | :maxdepth: 2 16 | 17 | user/matlab_vs_python 18 | user/installation 19 | user/tutorial 20 | 21 | 22 | API Reference 23 | ------------- 24 | 25 | Find all the information about specific modules and functions of the Hub 26 | Toolbox in this section. 27 | 28 | * :ref:`genindex` 29 | * :ref:`modindex` 30 | -------------------------------------------------------------------------------- /docs/source/hub_toolbox.rst: -------------------------------------------------------------------------------- 1 | hub_toolbox package 2 | =================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | hub_toolbox.Centering module 8 | ---------------------------- 9 | 10 | .. automodule:: hub_toolbox.Centering 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | hub_toolbox.Distances module 16 | ---------------------------- 17 | 18 | .. automodule:: hub_toolbox.Distances 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | hub_toolbox.GoodmanKruskal module 24 | --------------------------------- 25 | 26 | .. automodule:: hub_toolbox.GoodmanKruskal 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | hub_toolbox.Hubness module 32 | -------------------------- 33 | 34 | .. automodule:: hub_toolbox.Hubness 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | hub_toolbox.HubnessAnalysis module 40 | ---------------------------------- 41 | 42 | .. automodule:: hub_toolbox.HubnessAnalysis 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | 47 | hub_toolbox.Hubness_parallel module 48 | ----------------------------------- 49 | 50 | .. automodule:: hub_toolbox.Hubness_parallel 51 | :members: 52 | :undoc-members: 53 | :show-inheritance: 54 | 55 | hub_toolbox.IO module 56 | --------------------- 57 | 58 | .. automodule:: hub_toolbox.IO 59 | :members: 60 | :undoc-members: 61 | :show-inheritance: 62 | 63 | hub_toolbox.IntrinsicDim module 64 | ------------------------------- 65 | 66 | .. automodule:: hub_toolbox.IntrinsicDim 67 | :members: 68 | :undoc-members: 69 | :show-inheritance: 70 | 71 | hub_toolbox.KnnClassification module 72 | ------------------------------------ 73 | 74 | .. automodule:: hub_toolbox.KnnClassification 75 | :members: 76 | :undoc-members: 77 | :show-inheritance: 78 | 79 | hub_toolbox.LocalScaling module 80 | ------------------------------- 81 | 82 | .. automodule:: hub_toolbox.LocalScaling 83 | :members: 84 | :undoc-members: 85 | :show-inheritance: 86 | 87 | hub_toolbox.Logging module 88 | -------------------------- 89 | 90 | .. automodule:: hub_toolbox.Logging 91 | :members: 92 | :undoc-members: 93 | :show-inheritance: 94 | 95 | hub_toolbox.MutualProximity module 96 | ---------------------------------- 97 | 98 | .. automodule:: hub_toolbox.MutualProximity 99 | :members: 100 | :undoc-members: 101 | :show-inheritance: 102 | 103 | hub_toolbox.MutualProximity_parallel module 104 | ------------------------------------------- 105 | 106 | .. automodule:: hub_toolbox.MutualProximity_parallel 107 | :members: 108 | :undoc-members: 109 | :show-inheritance: 110 | 111 | hub_toolbox.SharedNN module 112 | --------------------------- 113 | 114 | .. automodule:: hub_toolbox.SharedNN 115 | :members: 116 | :undoc-members: 117 | :show-inheritance: 118 | 119 | 120 | Module contents 121 | --------------- 122 | 123 | .. 
automodule:: hub_toolbox 124 | :members: 125 | :undoc-members: 126 | :show-inheritance: 127 | -------------------------------------------------------------------------------- /docs/source/modules.rst: -------------------------------------------------------------------------------- 1 | hub_toolbox 2 | =========== 3 | 4 | .. toctree:: 5 | :maxdepth: 4 6 | 7 | hub_toolbox 8 | -------------------------------------------------------------------------------- /docs/user/installation.rst: -------------------------------------------------------------------------------- 1 | .. _installation: 2 | 3 | ============ 4 | Installation 5 | ============ 6 | 7 | 8 | Most of the instructions below assume you are running a Linux system. 9 | It might be possible to install the Hub Toolbox on Mac or Windows systems. 10 | We cannot, however, give any guidance for these cases at this point. 11 | 12 | 13 | Prerequisites 14 | ============= 15 | 16 | Python 17 | ------ 18 | 19 | The Hub Toolbox currently requires Python 3.6 or higher. You can check this 20 | on your system with: 21 | 22 | .. code-block:: bash 23 | 24 | python3 --version 25 | 26 | If Python3 is missing, or its version is lower than 3.6, please install it 27 | via the package manager of your operating system (e.g. ``apt`` in 28 | Debian/Ubuntu or ``dnf`` in Fedora). 29 | 30 | You might also consider using the `Anaconda environment 31 | `_ for easy Python environment 32 | and package handling. 33 | 34 | numpy/scipy/scikit-learn 35 | ------------------------ 36 | 37 | The Hub Toolbox heavily relies on numpy and requires scipy and scikit-learn 38 | for some functions. 39 | Please install these packages via your operating system's package manager 40 | (e.g. ``sudo apt install python3-numpy python3-scipy python3-sklearn``) or 41 | use Anaconda: ``conda install numpy scipy scikit-learn``. 42 | We do not recommend installation via ``pip`` since this may lead to suboptimal 43 | performance unless configured properly. 44 | 45 | 46 | Stable Hub Toolbox release 47 | ========================== 48 | 49 | Stable releases of the Hub Toolbox are added to 50 | `PyPI `_ . 51 | To install the latest stable release, simply use `pip` 52 | (you may need to install it first via your operating system's package manager, 53 | e.g. ``sudo apt install python3-pip``). 54 | 55 | .. code-block:: bash 56 | 57 | pip3 install hub-toolbox 58 | 59 | Alternatively, you may download the `latest release from GitHub 60 | `_ and follow 61 | the instructions of a development installation (from source) below, 62 | omitting the ``git clone`` step. 63 | 64 | 65 | .. _hubtoolbox-development-install: 66 | 67 | Installation from source 68 | ======================== 69 | 70 | For a bleeding edge version of the Hub Toolbox, you can install it from 71 | the latest sources: 72 | On the console, change to the directory, under which the Hub Toolbox should 73 | be installed. Then obtain a copy of the latest sources from GitHub: 74 | 75 | .. code-block:: bash 76 | 77 | git clone https://github.com/OFAI/hub-toolbox-python3.git 78 | 79 | They will be cloned to a subdirectory called ``hub-toolbox-python3``. 80 | The Hub Toolbox must then be built and installed with 81 | 82 | .. code-block:: bash 83 | 84 | cd hub-toolbox-python3 85 | python3 setup.py build 86 | sudo python3 setup.py install 87 | 88 | The Hub Toolbox is now available system wide. Optionally, you can now run 89 | a test suite by 90 | 91 | .. 
code-block:: bash 92 | 93 | sudo python3 setup.py test 94 | 95 | If this prints an ``OK`` message, you are ready to go. Note that it is 96 | fine if some tests are skipped. 97 | -------------------------------------------------------------------------------- /docs/user/matlab_vs_python.rst: -------------------------------------------------------------------------------- 1 | Which Hub Toolbox to choose 2 | =========================== 3 | 4 | The Hub Toolbox is available as Python and Matlab scripts. 5 | If in doubt, use the Hub Toolbox for Python. See below 6 | for a more detailed description. 7 | 8 | hub-toolbox-matlab 9 | -------------------- 10 | 11 | The Hub Toolbox was originally developed for Matlab/Octave. 12 | We still provide these scripts; however, development is limited to bug fixing. 13 | No new functionality will be added. 14 | The `Hub Toolbox for Matlab `_ 15 | supports: 16 | 17 | - hubness analysis 18 | 19 | - hubness reduction 20 | 21 | - Mutual Proximity 22 | - Local Scaling 23 | - Shared Nearest Neighbors 24 | - evaluation 25 | 26 | - k-NN classification 27 | - Goodman-Kruskal index 28 | 29 | for distance matrices. 30 | 31 | hub-toolbox-python3 32 | ------------------- 33 | 34 | The `Hub Toolbox for Python3 `_ 35 | was initially ported from the Matlab code. 36 | Development now focuses on these scripts. It is thus continuously being extended 37 | with new functionality, and is tested and documented thoroughly. 38 | The Hub Toolbox for Python3 offers all the functionality the Matlab 39 | scripts offer, plus: 40 | 41 | - additional hubness reduction methods 42 | 43 | - centering 44 | - DisSim 45 | - using similarity matrices instead of distance matrices 46 | - support for sparse matrices (some modules) 47 | - support for parallel processing (some modules) 48 | - performance improvements (some modules) 49 | - unit tests 50 | - this documentation 51 | 52 | We recommend hub-toolbox-python3 to all users. This documentation will 53 | assume you are using these scripts. 54 | -------------------------------------------------------------------------------- /docs/user/tutorial.rst: -------------------------------------------------------------------------------- 1 | .. _tutorial: 2 | 3 | ======== 4 | Tutorial 5 | ======== 6 | 7 | In this tutorial you will analyze the DEXTER dataset for hubness, reduce 8 | hubness, and observe how this improves internal and external evaluation 9 | measures. 10 | 11 | From there on you will be able to apply the techniques offered by the 12 | Hub Toolbox to the dataset of your choice. 13 | 14 | 15 | Prerequisites 16 | ============= 17 | 18 | For this tutorial, you will require a working installation of the Hub 19 | Toolbox. If you don't have one yet, please follow the instructions in 20 | :ref:`installation`. 21 | 22 | 23 | Analyze the DEXTER dataset 24 | ========================== 25 | 26 | The Hub Toolbox ships with DEXTER as an example dataset. DEXTER is a text 27 | classification problem in a bag-of-words representation. This is a 28 | binary classification problem with sparse continuous input variables. 29 | This dataset was one of five datasets of the NIPS 2003 feature selection 30 | challenge. For more info, see: http://archive.ics.uci.edu/ml/datasets/Dexter 31 | 32 | On the terminal, start a Python shell: 33 | 34 | .. code-block:: bash 35 | 36 | python3 37 | 38 | Consider using an `IPython/jupyter notebook `_ as a 39 | more flexible and powerful alternative.
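If you want to go the notebook route, a minimal way to get started (assuming you install jupyter via pip) is:

.. code-block:: bash

    pip3 install jupyter
    jupyter notebook

Then create a new Python 3 notebook and run the following snippets in its cells instead of the plain shell.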
40 | 41 | The :class:`HubnessAnalysis ` class automatically 42 | analyzes the DEXTER example dataset, if invoked without further parameters: 43 | 44 | .. code-block:: python 45 | 46 | >>> from hub_toolbox.HubnessAnalysis import HubnessAnalysis 47 | >>> ana = HubnessAnalysis() 48 | >>> ana.analyze_hubness() 49 | 50 | This will print a rather lengthy result log: 51 | 52 | .. code-block:: text 53 | 54 | NO PARAMETERS GIVEN! Loading & evaluating DEXTER data set. 55 | DEXTER is a text classification problem in a bag-of-word 56 | representation. This is a two-class classification problem 57 | with sparse continuous input variables. 58 | This dataset is one of five datasets of the NIPS 2003 59 | feature selection challenge. 60 | http://archive.ics.uci.edu/ml/datasets/Dexter 61 | 62 | 63 | ================ 64 | Hubness Analysis 65 | ================ 66 | 67 | ORIGINAL DATA: 68 | data set hubness (S^k= 5) : 4.22 69 | % of anti-hubs at k= 5 : 26.67% 70 | % of k= 5-NN lists the largest hub occurs: 23.67% 71 | data set hubness (S^k=10) : 3.98 72 | % of anti-hubs at k=10 : 17.67% 73 | % of k=10-NN lists the largest hub occurs: 50.0% 74 | k= 1-NN classification accuracy : 80.33% 75 | k= 5-NN classification accuracy : 80.33% 76 | k=20-NN classification accuracy : 84.33% 77 | Goodman-Kruskal index (higher=better) : 0.104 78 | embedding dimensionality : 20000 79 | intrinsic dimensionality estimate : 161 80 | 81 | MUTUAL PROXIMITY (Empiric): 82 | data set hubness (S^k= 5) : 0.712 83 | % of anti-hubs at k= 5 : 3.0% 84 | % of k= 5-NN lists the largest hub occurs: 6.0% 85 | data set hubness (S^k=10) : 0.71 86 | % of anti-hubs at k=10 : 0.0% 87 | % of k=10-NN lists the largest hub occurs: 10.67% 88 | k= 1-NN classification accuracy : 82.67% 89 | k= 5-NN classification accuracy : 89.67% 90 | k=20-NN classification accuracy : 88.67% 91 | Goodman-Kruskal index (higher=better) : 0.132 92 | embedding dimensionality : 20000 93 | intrinsic dimensionality estimate : 161 94 | 95 | MUTUAL PROXIMITY (Independent Gaussians): 96 | data set hubness (S^k= 5) : 0.805 97 | % of anti-hubs at k= 5 : 4.667% 98 | % of k= 5-NN lists the largest hub occurs: 5.667% 99 | data set hubness (S^k=10) : 1.21 100 | % of anti-hubs at k=10 : 0.0% 101 | % of k=10-NN lists the largest hub occurs: 12.67% 102 | k= 1-NN classification accuracy : 83.67% 103 | k= 5-NN classification accuracy : 89.0% 104 | k=20-NN classification accuracy : 90.0% 105 | Goodman-Kruskal index (higher=better) : 0.135 106 | embedding dimensionality : 20000 107 | intrinsic dimensionality estimate : 161 108 | 109 | LOCAL SCALING (NICDM): 110 | parameter k = 7 (for optimization use the individual modules of the HUB-TOOLBOX) 111 | data set hubness (S^k= 5) : 2.1 112 | % of anti-hubs at k= 5 : 0.6667% 113 | % of k= 5-NN lists the largest hub occurs: 8.667% 114 | data set hubness (S^k=10) : 1.74 115 | % of anti-hubs at k=10 : 0.0% 116 | % of k=10-NN lists the largest hub occurs: 16.0% 117 | k= 1-NN classification accuracy : 84.67% 118 | k= 5-NN classification accuracy : 85.0% 119 | k=20-NN classification accuracy : 85.0% 120 | Goodman-Kruskal index (higher=better) : 0.118 121 | embedding dimensionality : 20000 122 | intrinsic dimensionality estimate : 161 123 | 124 | CENTERING: 125 | data set hubness (S^k= 5) : 1.62 126 | % of anti-hubs at k= 5 : 6.667% 127 | % of k= 5-NN lists the largest hub occurs: 8.333% 128 | data set hubness (S^k=10) : 1.38 129 | % of anti-hubs at k=10 : 1.333% 130 | % of k=10-NN lists the largest hub occurs: 13.0% 131 | k= 1-NN classification accuracy : 
85.0% 132 | k= 5-NN classification accuracy : 87.67% 133 | k=20-NN classification accuracy : 89.33% 134 | Goodman-Kruskal index (higher=better) : 0.19 135 | embedding dimensionality : 20000 136 | intrinsic dimensionality estimate : 161 137 | 138 | DISSIM GLOBAL: 139 | data set hubness (S^k= 5) : 1.87 140 | % of anti-hubs at k= 5 : 6.333% 141 | % of k= 5-NN lists the largest hub occurs: 8.667% 142 | data set hubness (S^k=10) : 1.62 143 | % of anti-hubs at k=10 : 1.667% 144 | % of k=10-NN lists the largest hub occurs: 14.67% 145 | k= 1-NN classification accuracy : 84.0% 146 | k= 5-NN classification accuracy : 88.67% 147 | k=20-NN classification accuracy : 88.67% 148 | Goodman-Kruskal index (higher=better) : 0.189 149 | embedding dimensionality : 20000 150 | intrinsic dimensionality estimate : 161 151 | 152 | 153 | Interpreting the results 154 | ======================== 155 | 156 | Let us dissect these results: the first block appears because we did not 157 | provide any parameters when instantiating 158 | :class:`HubnessAnalysis `. It thus goes 159 | into example mode and tells you a little bit about the dataset being used. 160 | 161 | The actual results of the analysis are grouped into blocks by experiments. 162 | Here, an experiment comprises the following: 163 | 164 | #. a hubness reduction method is applied to the dataset's distance matrix 165 | to obtain a matrix of secondary distances (except for centering, which 166 | changes vector data) 167 | #. hubness and additional measures of hubs and anti-hubs are calculated 168 | (in this case twice, for two different neighborhood sizes) 169 | #. k-nearest neighbor classification leave-one-out cross-validation is 170 | performed (in this case three times, for three different values of `k`) 171 | #. the Goodman-Kruskal index is calculated for the secondary distance matrix 172 | 173 | Additionally, the intrinsic dimension is estimated once for the dataset 174 | and reported with all experiments. 175 | 176 | The second block (under the `Hubness Analysis` headline) is the experiment 177 | using primary distances. For text-based datasets like DEXTER, cosine distances 178 | are frequently used. We observe considerable hubness of ``S^(k=5) = 4.22``. 179 | (As a rule of thumb, consider values above ``1.2`` as 'high hubness'). 180 | Knowing that hubness is a phenomenon of intrinsically high-dimensional data, 181 | it is not surprising that the intrinsic dimension estimate of ``161`` is also 182 | considerably high (although much lower than the embedding dimension 183 | of ``20000``). We also observe a lot of anti-hubs (i.e. points that are 184 | not among the k-nearest neighbors of any other point; or in other words: 185 | their ``k-occurrence=0``), while the largest hub is among the k-nearest 186 | neighbors of very many points. We find a k-NN classification accuracy of 187 | roughly ``80%``. 188 | 189 | The third block contains the results of a Mutual Proximity experiment, 190 | using the empirical distance distribution to rescale these distances. 191 | We observe tremendously reduced hubness, hardly any anti-hubs, and a reduced 192 | k-occurrence of the largest hub. Also, internal evaluation with the 193 | Goodman-Kruskal index improves compared to using the primary distances. 194 | Mutual Proximity is thus able to reduce hubness, but we do not yet know 195 | whether these secondary distances still reflect the semantics of the dataset.
196 | Looking at the k-NN classification, they apparently do: accuracy even 197 | improved, increasing to nearly ``90%``. 198 | Note that embedding and intrinsic dimension do not change, because they are 199 | computed on the original dataset. 200 | 201 | The following blocks represent other hubness reduction methods, some 202 | performing as well as Mutual Proximity, some performing worse. However, 203 | all of them improve internal as well as external evaluation measures. 204 | 205 | 206 | Analyzing other datasets 207 | ======================== 208 | 209 | :class:`HubnessAnalysis ` can also be used to 210 | investigate other datasets. You will require at least a numpy array of your 211 | feature vectors (called `vectors`), or a distance matrix ``D`` (where 212 | ``D[i, j]`` is the distance between your ``i-th`` and ``j-th`` feature vector). 213 | If you want to perform classification, you also need to provide a vector 214 | with integer labels for each data point (``target`` or 'ground-truth'). 215 | If you don't have a distance matrix yet, you can use the methods from 216 | :class:`Distances ` to create one based on euclidean 217 | or cosine distances. For other types of distances, you can also use 218 | `scipy.spatial.distance.pdist `_. 220 | 221 | Now simply call 222 | 223 | .. code-block:: python 224 | 225 | >>> from hub_toolbox.HubnessAnalysis import HubnessAnalysis 226 | >>> ana = HubnessAnalysis(D, vectors, target) 227 | >>> ana.analyze_hubness(experiments="orig,mp,nicdm,dsg", 228 | hubness_k=(5, 10), knn_k=(10, 20)) 229 | 230 | Note how we provided parameters to ``analyze_hubness``: the Hub Toolbox 231 | will now perform four experiments (original data, Mutual Proximity (Empiric), 232 | Local Scaling (NICDM), and DisSim Global). The neighborhood size is the same 233 | as in the last example, but we changed the classification to 10-NN and 20-NN 234 | (instead of 1-NN, 5-NN, and 20-NN). 235 | 236 | Looking at your output, you may notice a line that was not discussed before: 237 | `NICDM` has a parameter `k` that can be tuned. Other methods have tunable 238 | parameters as well. 239 | The convenience class :class:`HubnessAnalysis ` 240 | does not allow changing the default values of the methods' parameters. 241 | To do so, you can use the individual methods of the Hub Toolbox directly, 242 | which will be covered in the next section. 243 | 244 | 245 | Using individual methods 246 | ======================== 247 | 248 | In this section we will revisit the analysis we performed previously 249 | on the DEXTER dataset. This time, instead of using the convenience class 250 | :class:`HubnessAnalysis `, we will employ 251 | the individual modules of the Hub Toolbox to see how to use 252 | them in a more flexible way. 253 | 254 | Loading the example dataset 255 | --------------------------- 256 | 257 | .. code-block:: python 258 | 259 | >>> from hub_toolbox.IO import load_dexter 260 | >>> D, labels, vectors = load_dexter() 261 | >>> vectors.shape 262 | (300, 20000) 263 | 264 | We see that DEXTER comprises ``300`` points in an embedding 265 | dimension of ``20000``. The `IntrinsicDim` module can provide some insight 266 | into how well this reflects the 'true' dimensionality of the dataset, by 267 | 268 | Calculating an intrinsic dimension estimate 269 | ------------------------------------------- 270 | 271 | ..
code-block:: python 272 | 273 | >>> from hub_toolbox.IntrinsicDim import intrinsic_dimension 274 | >>> intrinsic_dimension(vectors, k1=6, k2=12, estimator='levina', trafo=None) 275 | 74 276 | 277 | The MLE by Levina and Bickel with neighborhood ``[6, 12]`` tells us 278 | that the intrinsic dimension is much lower than the embedding dimension, 279 | but is still considerably high. We can assume that this dataset is prone 280 | to 281 | 282 | Hubness 283 | ------- 284 | 285 | .. code-block:: python 286 | 287 | >>> from hub_toolbox.Hubness import hubness 288 | >>> S_k, D_k, N_k = hubness(D=D, k=5, metric='distance') 289 | >>> print("Hubness:", S_k) 290 | Hubness: 4.222131665788378 291 | 292 | Besides the hubness in ``S_k``, you also get the objects ``D_k`` 293 | and ``N_k``, which contain the ``k`` nearest neighbors of all elements 294 | and the n-occurrence, respectively. From them you can extract more 295 | detailed information about hubs and anti-hubs. 296 | 297 | External and internal evaluation can be performed with the following 298 | methods: 299 | 300 | k-NN classification 301 | ------------------- 302 | 303 | .. code-block:: python 304 | 305 | >>> from hub_toolbox.KnnClassification import score 306 | >>> acc, corr, cmat = score(D=D, target=labels, k=[1,5], metric='distance') 307 | >>> print("k=5-NN accuracy:", acc[1, 0]) 308 | k=5-NN accuracy: 0.803333333333 309 | 310 | Also in this case, you obtain three objects: ``acc`` contains the 311 | accuracy values, 312 | ``corr`` indicates for each point whether it was classified 313 | correctly or not, and ``cmat`` contains the corresponding confusion 314 | matrices. All three objects contain this information for each 315 | k-NN experiment defined via the parameter ``k=[1,5]``. 316 | 317 | Goodman-Kruskal index 318 | --------------------- 319 | 320 | .. code-block:: python 321 | 322 | >>> from hub_toolbox.GoodmanKruskal import goodman_kruskal_index 323 | >>> gamma = goodman_kruskal_index(D=D, classes=labels, metric='distance') 324 | >>> print("Goodman-Kruskal index:", gamma) 325 | Goodman-Kruskal index: 0.103701886155 326 | 327 | Calculating the :meth:`Goodman-Kruskal index ` 328 | is straightforward. 329 | 330 | Hubness reduction 331 | ----------------- 332 | 333 | .. code-block:: python 334 | 335 | >>> from hub_toolbox.MutualProximity import mutual_proximity_empiric 336 | >>> D_mp = mutual_proximity_empiric(D=D, metric='distance') 337 | 338 | .. code-block:: python 339 | 340 | >>> from hub_toolbox.LocalScaling import nicdm 341 | >>> D_nicdm = nicdm(D=D, k=10, metric='distance') 342 | 343 | You now have two objects ``D_mp`` and ``D_nicdm``, which contain 344 | secondary distances of the DEXTER dataset, rescaled with Mutual 345 | Proximity (Empiric) and Local Scaling (NICDM), respectively. 346 | They can now be used just as illustrated above for k-NN classification, 347 | hubness calculation, etc. 348 | 349 | The Hub Toolbox provides more methods for hubness reduction than these 350 | two, and additional ones will be integrated as they are developed by 351 | the hubness community. To see which methods are currently included, try 352 | 353 | .. code-block:: python 354 | 355 | >>> from hub_toolbox.HubnessAnalysis import SEC_DIST 356 | >>> for k, v in SEC_DIST.items(): 357 | ... print(k) 358 | ...
359 | dsl 360 | snn 361 | wcent 362 | lcent 363 | mp_gaussi 364 | mp 365 | orig 366 | mp_gauss 367 | nicdm 368 | dsg 369 | cent 370 | ls 371 | mp_gammai 372 | 373 | The values ``v`` in this dictionary are actually the hubness reduction 374 | functions, so you may invoke them, for example, like this: 375 | 376 | .. code-block:: python 377 | 378 | >>> D_snn = SEC_DIST['snn'](D) 379 | 380 | to obtain shared nearest neighbor distances. 381 | 382 | Approximate hubness reduction 383 | ----------------------------- 384 | TODO 385 | 386 | For now, please consider the docstrings. If in doubt, please don't hesitate to 387 | contact the author. 388 | -------------------------------------------------------------------------------- /hub_toolbox/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | This file is part of the HUB TOOLBOX available at 6 | https://github.com/OFAI/hub-toolbox-python3/ 7 | The HUB TOOLBOX is licensed under the terms of the GNU GPLv3. 8 | 9 | (c) 2011-2018, Dominik Schnitzer, Roman Feldbauer 10 | Austrian Research Institute for Artificial Intelligence (OFAI) 11 | Contact: 12 | """ 13 | 14 | __version__ = '2.5.2' 15 | 16 | try: 17 | import numpy 18 | import scipy 19 | import sklearn 20 | del numpy 21 | del scipy 22 | del sklearn 23 | except ImportError: # pragma: no cover 24 | raise ImportError("Could not import numpy, scipy, and/or scikit-learn.\n" 25 | "Please make sure you install the following Python3 " 26 | "packages: numpy, scipy and scikit-learn.\n" 27 | "See the installation docs for more details: " 28 | "http://hub-toolbox-python3.readthedocs.io/en/latest/" 29 | "user/installation.html#numpy-scipy-scikit-learn") 30 | 31 | from hub_toolbox import centering 32 | from hub_toolbox import distances 33 | from hub_toolbox import goodman_kruskal 34 | from hub_toolbox import hubness 35 | from hub_toolbox import hubness_analysis 36 | from hub_toolbox import intrinsic_dimension 37 | from hub_toolbox import io 38 | from hub_toolbox import knn_classification 39 | from hub_toolbox import local_scaling 40 | from hub_toolbox import htlogging 41 | from hub_toolbox import global_scaling 42 | from hub_toolbox import shared_neighbors 43 | -------------------------------------------------------------------------------- /hub_toolbox/centering.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | This file is part of the HUB TOOLBOX available at 6 | https://github.com/OFAI/hub-toolbox-python3/ 7 | The HUB TOOLBOX is licensed under the terms of the GNU GPLv3. 8 | 9 | (c) 2015-2018, Roman Feldbauer 10 | Austrian Research Institute for Artificial Intelligence (OFAI) 11 | Contact: 12 | """ 13 | import ctypes 14 | from multiprocessing import cpu_count, Pool, RawArray 15 | import numpy as np 16 | from sklearn.metrics.pairwise import euclidean_distances 17 | from hub_toolbox.distances import cosine_distance as cos 18 | from hub_toolbox import io 19 | from functools import partial 20 | 21 | __all__ = ['centering', 'weighted_centering', 'localized_centering', 22 | 'dis_sim_global', 'dis_sim_local'] 23 | 24 | def centering(X:np.ndarray, metric:str='vector', test_set_mask:np.ndarray=None): 25 | """ 26 | Perform centering, i.e. shift the origin to the data centroid. 27 | 28 | Centering of vector data `X` with ``n`` objects in an ``m``-dimensional 29 | feature space.
71 |     """
72 |     # Kernel based centering requires inner product similarities, NOT distances.
73 |     # Since the parameter was previously erroneously called 'distance',
74 |     # this is kept for compatibility reasons.
75 |     if metric in ('similarity', 'distance', 'inner', 'inner_product'):
76 |         if test_set_mask is not None:
77 |             raise NotImplementedError("Kernel based centering does not "
78 |                                       "support train/test splits so far.")
79 |         io.check_distance_matrix_shape(X)
80 |         n = X.shape[0]
81 |         H = np.identity(n) - np.ones((n, n)) / n
82 |         # K = X.T.X must be provided upstream
83 |         return H.dot(X).dot(H)
84 |     elif metric == 'vector':
85 |         n = X.shape[0]
86 |         if test_set_mask is None:
87 |             # center among all data
88 |             return X - np.mean(X, axis=0)
89 |         else:
90 |             # center among training data
91 |             train_ind = np.setdiff1d(np.arange(n), test_set_mask)
92 |             return X - np.mean(X[train_ind], axis=0)
93 |     else:
94 |         raise ValueError("Parameter 'metric' must be 'inner' or 'vector'.")
95 | 
96 | def weighted_centering(X:np.ndarray, metric:str='cosine', gamma:float=1.,
97 |                        test_set_mask:np.ndarray=None):
98 |     """
99 |     Perform weighted centering: shift origin to the weighted data mean
100 | 
101 |     Move the origin more actively towards hub objects in the dataset,
102 |     rather than towards the data centroid [1]_.
103 | 
104 |     Parameters
105 |     ----------
106 |     X : ndarray
107 |         An ``n x m`` vector data matrix with ``n`` objects in an
108 |         ``m``-dimensional feature space
109 | 
110 |     metric : {'cosine', 'euclidean'}, optional (default: 'cosine')
111 |         Distance measure used to place more weight on objects that are more
112 |         likely to become hubs. (Defined for 'cosine' in [1]_, 'euclidean' does
113 |         not make much sense and might be removed in the future).
114 | 
115 |     gamma : float, optional (default: 1.0)
116 |         Controls how much we emphasize the weighting effect
117 | 
118 |         - ``gamma=0`` : equivalent to normal centering
119 |         - ``gamma>0`` : move origin closer to objects with larger similarity
120 |           to other objects
121 | 
122 |     test_set_mask : ndarray, optional (default: None)
123 |         Hold back data as a test set and perform centering on the remaining
124 |         data (training set).
125 | 
126 |     Returns
127 |     -------
128 |     X_wcent : ndarray
129 |         Weighted centered vectors.
130 | 
131 |     References
132 |     ----------
133 |     .. [1] Suzuki, I., Hara, K., Shimbo, M., Saerens, M., & Fukumizu, K. (2013).
134 |            Centering similarity measures to reduce hubs. In Proceedings of the
135 |            2013 Conference on Empirical Methods in Natural Language Processing
136 |            (pp 613–623).
137 |            Retrieved from https://www.aclweb.org/anthology/D/D13/D13-1058.pdf
138 |     """
139 |     n = X.shape[0]
140 | 
141 |     # Indices of training examples
142 |     if test_set_mask is not None:
143 |         train_set_mask = np.setdiff1d(np.arange(n), test_set_mask)
144 |     else:
145 |         train_set_mask = slice(0, n)
146 | 
147 |     n_train = X[train_set_mask].shape[0]
148 |     d = np.zeros(n)
149 | 
150 |     if metric == 'cosine':
151 |         vectors_sum = X[train_set_mask].sum(axis=0)
152 |         for i in np.arange(n):
153 |             d[i] = n_train * cos(np.array([X[i], vectors_sum/n_train]))[0, 1]
154 |     # Using euclidean distances does not really make sense
155 |     elif metric == 'euclidean':
156 |         for i in range(n):
157 |             displ_v = X[train_set_mask] - X[i]
158 |             d[i] = np.sum(np.sqrt(np.sum(displ_v ** 2, axis=1)))
159 |     else:
160 |         raise ValueError("Parameter 'metric' must be 'cosine' or 'euclidean'.")
161 |     d_sum = np.sum(d ** gamma)
162 |     w = (d ** gamma) / d_sum
163 |     vectors_mean_weighted = np.sum(w.reshape(n, 1) * X, axis=0)
164 |     X_wcent = X - vectors_mean_weighted
165 |     return X_wcent
166 | 
167 | #===============================================================================
168 | # #=============================================================================
169 | # # LOCALIZED CENTERING
170 | # #=============================================================================
171 | #===============================================================================
172 | 
173 | def _lcent_load_shared_data(w_, sim_train_, local_affinity_):
174 |     global w, sim_train, local_affinity
175 |     w = w_
176 |     sim_train = sim_train_
177 |     local_affinity = local_affinity_
178 |     return
179 | 
180 | def _lcent_calculate_loc_af(i, kappa):
181 |     # Get the kappa nearest neighbors (highest similarity)
182 |     nn = np.argpartition(sim_train[i, :], kth=-kappa)[-1:-kappa-1:-1]
183 |     # Local centroid
184 |     c_kappa_x = w[nn, :].mean(axis=0)
185 |     local_affinity[i] = np.inner(w[i, :], c_kappa_x)
186 |     return
187 | 
188 | def localized_centering(X:np.ndarray, Y:np.ndarray=None,
189 |                         kappa:int=40, gamma:float=1., n_jobs:int=1):
190 |     """
191 |     Perform localized centering.
192 | 
193 |     Reduce hubness in datasets according to the method proposed in [2]_.
194 | 
195 |     Parameters
196 |     ----------
197 |     X : ndarray
198 |         An ``n x m`` vector data matrix with ``n`` objects in an
199 |         ``m``-dimensional feature space
200 | 
201 |     Y : ndarray, optional
202 |         If Y is provided, calculate similarities between all test data in `X`
203 |         versus all training data in `Y`.
204 | 
205 |     kappa : int, optional (default: 40)
206 |         Local segment size, determines the size of the local neighborhood for
207 |         calculating the local affinity. When ``kappa=n``, localized centering
208 |         reduces to standard centering.
209 | "select κ depending on the dataset, so that the correlation between 210 | Nk(x) and the local affinity is maximized" [2]_ 211 | 212 | gamma : float, optional (default: 1.0) 213 | Control the degree of penalty, so that used the similarity score 214 | is smaller depending on how likely a point is to become a hub. 215 | "Parameter γ can be tuned so as to maximally reduce the skewness 216 | of the Nk distribution" [2]_. 217 | 218 | n_jobs : int, optional 219 | Parallel execution 220 | 221 | Returns 222 | ------- 223 | S_lcent : ndarray 224 | Secondary similarity (localized centering) matrix. 225 | 226 | References 227 | ---------- 228 | .. [1] Suzuki, I., Hara, K., Shimbo, M., Saerens, M., & Fukumizu, K. (2013). 229 | Centering similarity measures to reduce hubs. In Proceedings of the 230 | 2013 Conference on Empirical Methods in Natural Language Processing 231 | (pp 613–623). 232 | Retrieved from https://www.aclweb.org/anthology/D/D13/D13-1058.pdf 233 | 234 | .. [2] Hara, K., Suzuki, I., Shimbo, M., Kobayashi, K., Fukumizu, K., & 235 | Radovanović, M. (2015). Localized centering: Reducing hubness in 236 | large-sample data hubness in high-dimensional data. In AAAI ’15: 237 | Proceedings of the 29th AAAI Conference on Artificial Intelligence 238 | (pp. 2645–2651). 239 | """ 240 | if n_jobs == -1: 241 | n_jobs = cpu_count() 242 | # Rescale vectors to unit length 243 | div_ = np.sqrt((X ** 2).sum(axis=-1))[..., np.newaxis] 244 | div_[div_ == 0] = 1e-7 245 | v = X / div_ 246 | if Y is None: # calc all-against-all in X 247 | w = v 248 | n, _ = X.shape 249 | sim = v.dot(w.T) 250 | sim_train = sim 251 | else: # calc sim from test data in X against train data in Y 252 | div_ = np.sqrt((Y ** 2).sum(axis=-1))[..., np.newaxis] 253 | div_[div_ == 0] = 1e-7 254 | w = Y / div_ 255 | n, _ = Y.shape 256 | sim = v.dot(w.T) 257 | sim_train = w.dot(w.T) 258 | 259 | if n_jobs > 1: 260 | local_affinity_ctype = RawArray(ctypes.c_double, n) 261 | local_affinity = np.frombuffer(local_affinity_ctype, dtype=np.float64) 262 | with Pool(processes=n_jobs, 263 | initializer=_lcent_load_shared_data, 264 | initargs=(w, sim_train, local_affinity)) as pool: 265 | for _ in pool.imap( 266 | func=partial(_lcent_calculate_loc_af, kappa=kappa), 267 | iterable=range(n)): 268 | pass # local_affinity is handled within func 269 | else: 270 | local_affinity = np.zeros(n) 271 | for i in range(n): 272 | # Get the kappa nearest neighbors (highest similarity) 273 | nn = np.argpartition(sim_train[i, :], kth=-kappa)[-1:-kappa-1:-1] 274 | # Local centroid 275 | c_kappa_x = w[nn, :].mean(axis=0) 276 | local_affinity[i] = np.inner(w[i, :], c_kappa_x) 277 | # Only change penalty, if all values are positive 278 | if gamma != 1 and (local_affinity < 0).sum() == 0: 279 | local_affinity **= gamma 280 | sim -= local_affinity 281 | return sim 282 | 283 | 284 | def dis_sim_global(X:np.ndarray, Y:np.ndarray=None): 285 | """ 286 | Calculate dissimilarity based on global 'sample-wise centrality' [1]_. 287 | 288 | Parameters 289 | ---------- 290 | X : ndarray 291 | An ``n x m`` vector data matrix with ``n`` objects in an 292 | ``m`` dimensional feature space 293 | 294 | Y : ndarray, optional 295 | If Y is provided, calculate dissimilarities between all test data 296 | in `X` and all training data in `Y`. 297 | 298 | Returns 299 | ------- 300 | D_dsg : ndarray 301 | Secondary dissimilarity (DisSimGlobal) matrix. 302 | 303 | References 304 | ---------- 305 | .. [1] Hara, K., Suzuki, I., Kobayashi, K., Fukumizu, K., & 306 | Radovanović, M. (2016). 
284 | def dis_sim_global(X:np.ndarray, Y:np.ndarray=None):
285 |     """
286 |     Calculate dissimilarity based on global 'sample-wise centrality' [1]_.
287 | 
288 |     Parameters
289 |     ----------
290 |     X : ndarray
291 |         An ``n x m`` vector data matrix with ``n`` objects in an
292 |         ``m``-dimensional feature space
293 | 
294 |     Y : ndarray, optional
295 |         If Y is provided, calculate dissimilarities between all test data
296 |         in `X` and all training data in `Y`.
297 | 
298 |     Returns
299 |     -------
300 |     D_dsg : ndarray
301 |         Secondary dissimilarity (DisSimGlobal) matrix.
302 | 
303 |     References
304 |     ----------
305 |     .. [1] Hara, K., Suzuki, I., Kobayashi, K., Fukumizu, K., &
306 |            Radovanović, M. (2016). Flattening the density gradient for
307 |            eliminating spatial centrality to reduce hubness. Proceedings of
308 |            the Thirtieth AAAI Conference on Artificial Intelligence (AAAI ’16),
309 |            1659–1665. Retrieved from http://www.aaai.org/ocs/index.php/AAAI/
310 |            AAAI16/paper/download/12055/11787
311 |     """
312 |     if Y is None:
313 |         Y = X
314 |     if X.shape[1] != Y.shape[1]:
315 |         raise ValueError("X and Y must have same number of features.")
316 |     c = Y.mean(0)
317 |     x_c = euclidean_distances(Y, c[np.newaxis, :], squared=True)
318 |     if id(X) == id(Y):  # i.e. no Y was provided
319 |         q_c = x_c
320 |     else:  # avoid duplicate calculations
321 |         q_c = euclidean_distances(X, c[np.newaxis, :], squared=True)
322 |     D_xq = euclidean_distances(X, Y, squared=True)
323 |     D_xq -= x_c.T
324 |     D_xq -= q_c
325 |     return D_xq
326 | 
327 | #===============================================================================
328 | # #=============================================================================
329 | # # DisSim LOCAL
330 | # #=============================================================================
331 | #===============================================================================
332 | 
333 | def _dsl_init(c_k_X_or_Y_, D_test_or_train_, Y_):
334 |     global c_k_X_or_Y, D_test_or_train, Y
335 |     c_k_X_or_Y = c_k_X_or_Y_
336 |     D_test_or_train = D_test_or_train_
337 |     Y = Y_
338 |     return
339 | 
340 | def _dsl_local_centroids(i, k):
341 |     knn_idx = np.argpartition(D_test_or_train[i, :], kth=k)[:k]
342 |     c_k_X_or_Y[i] = Y[knn_idx].mean(axis=0)
343 |     return
344 | 
345 | def dis_sim_local(X:np.ndarray, Y:np.ndarray=None, k:int=10, n_jobs:int=1):
346 |     """Calculate dissimilarity based on local 'sample-wise centrality' [1]_.
347 | 
348 |     Parameters
349 |     ----------
350 |     X : ndarray
351 |         An ``n x m`` vector data matrix with ``n`` objects in an
352 |         ``m``-dimensional feature space.
353 | 
354 |     Y : ndarray, optional
355 |         If Y is provided, calculate dissimilarities between all test data
356 |         in `X` and all training data in `Y`.
357 | 
358 |     k : int, optional (default: 10)
359 |         Neighborhood size used for determining the local centroids.
360 |         Can be optimized so as to maximally reduce hubness [1]_.
361 | 
362 |     n_jobs : int, optional
363 |         Parallel execution with multiple processes.
364 | 
365 |     Returns
366 |     -------
367 |     D_dsl : ndarray
368 |         Secondary dissimilarity (DisSimLocal) matrix.
369 | 
370 |     References
371 |     ----------
372 |     .. [1] Hara, K., Suzuki, I., Kobayashi, K., Fukumizu, K., &
373 |            Radovanović, M. (2016). Flattening the density gradient for
374 |            eliminating spatial centrality to reduce hubness. Proceedings of
375 |            the Thirtieth AAAI Conference on Artificial Intelligence (AAAI ’16),
376 |            1659–1665. Retrieved from http://www.aaai.org/ocs/index.php/AAAI/
377 |            AAAI16/paper/download/12055/11787
378 |     """
379 |     X = X.copy()
380 |     # all-against-all dissimilarities?
381 |     if Y is None:
382 |         Y = X
383 |     else:
384 |         Y = Y.copy()
385 | 
386 |     # dataset size and dimensionality
387 |     n_test, m_test = X.shape
388 |     n_train, m_train = Y.shape
389 |     if m_test != m_train:
390 |         raise ValueError("X and Y must have same number of features.")
391 | 
392 |     # Calc euclidean distances to find nearest neighbors among training data
393 |     D_train = euclidean_distances(Y, squared=True)
394 |     if id(Y) == id(X):
395 |         # Exclude self distances from kNN lists:
396 |         np.fill_diagonal(D_train, np.inf)
397 |         D_test = D_train
398 |     else:
399 |         # ... 
and between test and training data 400 | D_test = euclidean_distances(X, Y, squared=True) 401 | 402 | # Local centroid for each point among its k-nearest training neighbors 403 | if n_jobs > 1: 404 | c_k_X_ctype = RawArray(ctypes.c_double, X.size) 405 | c_k_X = np.frombuffer(c_k_X_ctype, dtype=np.float64).reshape(X.shape) 406 | with Pool(processes=n_jobs, 407 | initializer=_dsl_init, 408 | initargs=(c_k_X, D_test, Y)) as pool: 409 | for _ in pool.imap( 410 | func=partial(_dsl_local_centroids, k=k), 411 | iterable=range(n_test)): 412 | pass # handling inside function 413 | else: 414 | c_k_X = np.zeros_like(X) 415 | for i in range(n_test): 416 | knn_idx = np.argpartition(D_test[i, :], kth=k)[:k] 417 | c_k_X[i] = Y[knn_idx].mean(axis=0) 418 | X -= c_k_X 419 | X **= 2 420 | x_c_k = X.sum(axis=1) 421 | if id(Y) == id(X): 422 | c_k_Y = c_k_X 423 | y_c_k = x_c_k 424 | else: 425 | if n_jobs > 1: 426 | c_k_Y_ctype = RawArray(ctypes.c_double, Y.size) 427 | c_k_Y = np.frombuffer(c_k_Y_ctype, dtype=np.float64).reshape(Y.shape) 428 | with Pool(processes=n_jobs, 429 | initializer=_dsl_init, 430 | initargs=(c_k_Y, D_train, Y)) as pool: 431 | for _ in pool.imap( 432 | func=partial(_dsl_local_centroids, k=k), 433 | iterable=range(n_train)): 434 | pass # handling within function 435 | else: 436 | c_k_Y = np.zeros_like(Y) 437 | for i in range(n_train): 438 | knn_idx = np.argpartition(D_train[i, :], kth=k)[:k] 439 | c_k_Y[i] = Y[knn_idx].mean(axis=0) 440 | Y -= c_k_Y 441 | Y **= 2 442 | y_c_k = Y.sum(axis=1) 443 | # DisSimLocal 444 | x_y = D_test 445 | x_y -= x_c_k[:, np.newaxis] 446 | x_y -= y_c_k 447 | if id(Y) == id(X): 448 | np.fill_diagonal(x_y, -np.inf) 449 | return x_y 450 | 451 | if __name__ == '__main__': 452 | #vectors = np.arange(12).reshape(3,4) 453 | np.random.seed(47) 454 | VECT_DATA = np.random.rand(3, 4) 455 | print("Vectors: ............... \n{}". 456 | format(VECT_DATA)) 457 | print("Centering: ............. \n{}". 458 | format(centering(VECT_DATA, 'vector'))) 459 | print("Weighted centering: .... \n{}". 460 | format(weighted_centering(VECT_DATA, 'cosine', 0.4))) 461 | print("Localized centering: ... \n{}". 462 | format(localized_centering(VECT_DATA, kappa=2, gamma=1))) 463 | print("DisSim (global): ....... \n{}". 464 | format(dis_sim_global(VECT_DATA))) 465 | print("DisSim (local): ........ \n{}". 466 | format(dis_sim_local(VECT_DATA, k=2))) 467 | -------------------------------------------------------------------------------- /hub_toolbox/distances.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | This file is part of the HUB TOOLBOX available at 6 | https://github.com/OFAI/hub-toolbox-python3/ 7 | The HUB TOOLBOX is licensed under the terms of the GNU GPLv3. 
8 | 
9 | (c) 2011-2018, Dominik Schnitzer, Roman Feldbauer
10 | Austrian Research Institute for Artificial Intelligence (OFAI)
11 | Contact:
12 | """
13 | import ctypes
14 | from multiprocessing import Pool, cpu_count, RawArray
15 | import numpy as np
16 | from scipy.spatial.distance import cdist, pdist, squareform
17 | try:  # for scikit-learn >= 0.18
18 |     from sklearn.model_selection import StratifiedShuffleSplit
19 | except ImportError:  # lower scikit-learn versions
20 |     from sklearn.cross_validation import StratifiedShuffleSplit
21 | from sklearn.metrics.pairwise import pairwise_distances
22 | from hub_toolbox.io import check_vector_matrix_shape_fits_labels
23 | from hub_toolbox.htlogging import ConsoleLogging
24 | 
25 | __all__ = ['cosine_distance', 'euclidean_distance',
26 |            'lp_norm', 'sample_distance']
27 | 
28 | def cosine_distance(X):
29 |     """Calculate the cosine distance between all pairs of vectors in `X`."""
30 |     xn = np.sqrt(np.sum(X**2, 1))
31 |     Y = X / xn[:, np.newaxis]
32 |     del xn
33 |     D = 1. - Y.dot(Y.T)
34 |     del Y
35 |     D[D < 0] = 0
36 |     D = np.triu(D, 1) + np.triu(D, 1).T
37 |     return D
38 | 
39 | def euclidean_distance(X):
40 |     """Calculate the euclidean distances between all pairs of vectors in `X`.
41 | 
42 |     Consider using sklearn.metrics.pairwise.euclidean_distances for faster,
43 |     but less accurate distances (which are also not necessarily symmetric)."""
44 |     return squareform(pdist(X, 'euclidean'))
45 | 
46 | def lp_norm(X:np.ndarray, Y:np.ndarray=None, p:float=None, n_jobs:int=1):
47 |     """Calculate Minkowski distances with L^p norm.
48 | 
49 |     Calculate distances between all pairs of vectors within `X`, if `Y` is None.
50 |     Otherwise calculate distances between all vectors in `X` and
51 |     all vectors in `Y`. For example, this is useful if only distances from
52 |     test data to training data are required.
53 | 
54 |     Parameters
55 |     ----------
56 |     X : ndarray
57 |         Vector data (e.g. test set)
58 | 
59 |     Y : ndarray, optional, default: None
60 |         Vector data (e.g. training set)
61 | 
62 |     p : float, default: None
63 |         Minkowski norm
64 | 
65 |     n_jobs : int, default: 1
66 |         Parallel computation with multiple processes. See the scikit-learn
67 |         docs for more details.
68 | 69 | Returns 70 | ------- 71 | D : ndarray 72 | Distance matrix based on Lp-norm 73 | 74 | See also 75 | -------- 76 | http://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.pairwise_distances.html 77 | """ 78 | if p is None: 79 | raise ValueError("Please define the `p` parameter for lp_norm().") 80 | elif p == 1.: # Use efficient version for cityblock distances 81 | return pairwise_distances(X=X, Y=Y, metric='l1', 82 | n_jobs=n_jobs) 83 | elif p == 2.: # Use efficient version for Euclidean distances 84 | return pairwise_distances(X=X, Y=Y, metric='l2', 85 | n_jobs=n_jobs) 86 | else: # Use general, less efficient version for general Minkowski distances 87 | return pairwise_distances(X=X, Y=Y, metric='minkowski', 88 | n_jobs=n_jobs, **{'p' : p}) 89 | 90 | #=============================================================================== 91 | # #============================================================================= 92 | # # 93 | # # m_p dissimilarity 94 | # # 95 | # #============================================================================= 96 | #=============================================================================== 97 | def _mp_load_shared_Y(Y_, n_bins_): 98 | global Y, n_bins 99 | Y = Y_ 100 | n_bins = n_bins_ 101 | 102 | def _mp_load_shared_data(X_, Y_, p_, n_bins_, R_bins_, R_bins_np_, 103 | X_bins_, X_bins_np_, Y_bins_, Y_bins_np_, mp_, mp_np_): 104 | global X, Y, n_bins, n_x, n_y, d, p 105 | global X_bins, X_bins_np, Y_bins, Y_bins_np, R_bins, R_bins_np, mp, mp_np 106 | X = X_ 107 | Y = Y_ 108 | n_bins = n_bins_ 109 | n_x, d = X.shape 110 | n_y = Y.shape[0] 111 | p = p_ 112 | R_bins = R_bins_ 113 | R_bins_np = R_bins_np_ 114 | X_bins = X_bins_ 115 | X_bins_np = X_bins_np_ 116 | Y_bins = Y_bins_ 117 | Y_bins_np = Y_bins_np_ 118 | mp = mp_ 119 | mp_np = mp_np_ 120 | 121 | def _mp_find_bin_edges(i): 122 | return np.partition(Y[:, i], kth=kth)[kth] 123 | 124 | def _mp_calc_histograms(i): 125 | bins = _mp_find_bin_edges(i) 126 | return np.histogram(Y[:, i], bins=bins) 127 | 128 | def _mp_calc_histograms_n_bins(i): 129 | return np.histogram(Y[:, i], bins=n_bins) 130 | 131 | def _mp_create_r_bins(i): 132 | hist, _ = histograms[i] 133 | for b in range(n_bins): 134 | R_bins_np[i, b, b:] = np.cumsum(hist[b:]) 135 | R_bins_np[i] += np.triu(R_bins_np[i], k=1).T 136 | return 137 | 138 | def _mp_estimate_r(i): 139 | # Binning. Values outside the range are binned into the first/last bin 140 | _, bin_edges = histograms[i] 141 | bin_x = np.digitize(X[:, i], bins=bin_edges) 142 | bin_x -= 1 143 | np.clip(bin_x, 0, n_bins-1, out=bin_x) 144 | bin_y = np.digitize(Y[:, i], bins=bin_edges) 145 | bin_y -= 1 146 | np.clip(bin_y, 0, n_bins-1, out=bin_y) 147 | X_bins_np[i, :] = bin_x 148 | Y_bins_np[i, :] = bin_y 149 | return 150 | 151 | def _mp_calc_mp_dissim(x): 152 | mp_xy = np.zeros(n_y, dtype=float) 153 | for i in range(d): 154 | tmp = R_bins_np[i, X_bins_np[i, x], Y_bins_np[i, :]] / (n_x + n_y) 155 | tmp **= p 156 | mp_xy += tmp 157 | mp_xy /= d 158 | mp_xy **= (1. / p) 159 | mp_np[x, :] = mp_xy 160 | return 161 | 162 | def mp_dissim(X:np.ndarray, Y:np.ndarray=None, p:float=2, 163 | n_bins:int=0, bin_size:str='range', n_jobs:int=1, verbose:int=0): 164 | """ Calculate m_p dissimilarity. 165 | 166 | The data-dependent m_p dissimilarity measure considers the relative 167 | positions of objects x and y with respect to the rest of the data 168 | distribution in each dimension [1]_. 169 | 170 | Parameters 171 | ---------- 172 | X : ndarray 173 | Vector data (e.g. 
test set), shape (n_x, d)
174 | 
175 |     Y : ndarray, optional, default: None
176 |         Vector data (e.g. training set), shape (n_y, d).
177 |         Number of features ``d`` must be equal in `X` and `Y`.
178 | 
179 |     p : float, optional, default: 2
180 |         Parameter, similar to `p` in Minkowski norm
181 | 
182 |     n_bins : int, optional, default: 0
183 |         Number of bins for probability mass estimation
184 | 
185 |     bin_size : str, optional, default: 'range'
186 |         Strategy for binning. May be one of:
187 |         'range' ... create bins with uniform range length
188 |         'mass' ... create bins with approx. uniform mass
189 | 
190 |     n_jobs : int, optional, default: 1
191 |         Parallel computation with multiple processes.
192 | 
193 |     verbose : int, optional, default: 0
194 |         Increasing level of output
195 | 
196 |     Returns
197 |     -------
198 |     D : ndarray, shape (X.shape[0], Y.shape[0])
199 |         m_p dissimilarity matrix
200 | 
201 |     References
202 |     ----------
203 |     .. [1] Aryal et al. (2017). Data-dependent dissimilarity measure: an
204 |            effective alternative to geometric distance measures.
205 |            Knowledge and Information Systems, Springer-Verlag London.
206 |     """
207 |     # Some preparation
208 |     n_x, d = X.shape
209 |     # All-against-all in X, or X against Y?
210 |     if Y is None:
211 |         Y = X
212 |     n_y, d_y = Y.shape
213 |     # X and Y must have same dimensionality
214 |     assert d == d_y
215 |     if n_jobs == -1:
216 |         n_jobs = cpu_count()
217 |     n_bins = int(n_bins)
218 |     if p == 0:
219 |         log = ConsoleLogging()
220 |         log.warning('Got mpDisSim parameter p=0. Changed to default '
221 |                     'value p=2 instead, in order to avoid zero division.')
222 |         p = 2.
223 | 
224 |     # RawArrays have no locks. Must take EXTREME CARE!!
225 |     R_bins = RawArray(ctypes.c_int32, d * n_bins * n_bins)
226 |     R_bins_np = np.frombuffer(R_bins, dtype=np.int32).reshape((d, n_bins, n_bins))
227 |     X_bins = RawArray(ctypes.c_int32, d * n_x)
228 |     X_bins_np = np.frombuffer(X_bins, dtype=np.int32).reshape((d, n_x))
229 |     Y_bins = RawArray(ctypes.c_int32, d * n_y)
230 |     Y_bins_np = np.frombuffer(Y_bins, dtype=np.int32).reshape((d, n_y))
231 |     mp = RawArray(ctypes.c_double, n_x * n_y)
232 |     mp_np = np.frombuffer(mp).reshape((n_x, n_y))
233 | 
234 |     global histograms, kth
235 |     kth = np.arange(0, n_y)[0:n_y:int(n_y/n_bins)]
236 |     if kth[-1] != n_y - 1:
237 |         kth = np.append(kth, n_y-1)
238 |     if verbose:
239 |         print("Creating bins for estimating probability data mass.")
240 |     with Pool(processes=n_jobs,
241 |               initializer=_mp_load_shared_Y,
242 |               initargs=(Y, n_bins)) as pool:
243 |         if 'mass'.startswith(bin_size):
244 |             histograms = pool.map(func=_mp_calc_histograms,
245 |                                   iterable=range(d))
246 |         elif 'range'.startswith(bin_size):
247 |             histograms = pool.map(func=_mp_calc_histograms_n_bins,
248 |                                   iterable=range(d))
249 |         else:
250 |             raise ValueError("'{}' is not a valid value for `bin_size`. "
251 |                              "Please use 'range' or 'mass'.".format(bin_size))
252 |     # The second pool needs `histograms`
253 |     with Pool(processes=n_jobs,
254 |               initializer=_mp_load_shared_data,
255 |               initargs=(X, Y, p, n_bins, R_bins, R_bins_np, X_bins, X_bins_np,
256 |                         Y_bins, Y_bins_np, mp, mp_np)) as pool:
257 |         pool.map(func=_mp_create_r_bins, iterable=range(d))
258 |         if verbose:
259 |             print("Estimating probability data mass in all regions R_i(x,y).")
260 |         pool.map(func=_mp_estimate_r, iterable=range(d))
261 |         if verbose:
262 |             print("Calculating m_p dissimilarity for all pairs x, y.")
263 |         pool.map(func=_mp_calc_mp_dissim, iterable=range(n_x))
264 |     if verbose:
265 |         print("Done.")
266 |     return mp_np
267 | 
268 | 
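# A usage sketch (hypothetical arrays X_test, X_train; n_bins must be set
# to a positive value, since the default of 0 cannot be used for binning):
#     D_mp = mp_dissim(X=X_test, Y=X_train, p=2., n_bins=10, n_jobs=4)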
" 251 | "Please use 'range' or 'mass'.".format(bin_size)) 252 | # The second pool needs `histograms` 253 | with Pool(processes=n_jobs, 254 | initializer=_mp_load_shared_data, 255 | initargs=(X, Y, p, n_bins, R_bins, R_bins_np, X_bins, X_bins_np, 256 | Y_bins, Y_bins_np, mp, mp_np)) as pool: 257 | pool.map(func=_mp_create_r_bins, iterable=range(d)) 258 | if verbose: 259 | print("Estimating probability data mass in all regions R_i(x,y).") 260 | pool.map(func=_mp_estimate_r, iterable=range(d)) 261 | if verbose: 262 | print("Calculating m_p dissimilarity for all pairs x, y.") 263 | pool.map(func=_mp_calc_mp_dissim, iterable=range(n_x)) 264 | if verbose: 265 | print("Done.") 266 | return mp_np 267 | 268 | 269 | def sample_distance(X, y, sample_size, metric='euclidean', strategy='a', 270 | random_state=None): 271 | """Calculate incomplete distance matrix. 272 | 273 | Parameters 274 | ---------- 275 | X : ndarray 276 | Input vector data. 277 | 278 | y : ndarray 279 | Input labels (used for stratified sampling). 280 | 281 | sample_size : int or float 282 | If float, must be between 0.0 and 1.0 and represent the proportion of 283 | the dataset for which distances should be calculated to. 284 | If int, represents the absolute number of sample distances. 285 | NOTE: See also the notes to the return value `y_sample`! 286 | 287 | metric : any scipy.spatial.distance.cdist metric (default: 'euclidean') 288 | Metric used to calculate distances. 289 | 290 | strategy : 'a', 'b' (default: 'a') 291 | 292 | - 'a': Stratified sampling, for all points the distances to the 293 | same points are chosen. 294 | - 'b': Stratified sampling, for each point it is chosen independently, 295 | to which other points distances are calculated. 296 | NOTE: currently not implemented. 297 | 298 | random_state : int or RandomState 299 | Pseudo-random number generator state used for random sampling. 300 | 301 | Returns 302 | ------- 303 | D : ndarray 304 | The ``n x s`` distance matrix, where ``n`` is the dataset size and 305 | ``s`` is the sample size. 306 | 307 | y_sample : ndarray 308 | The index array that determines, which column in `D` corresponds 309 | to which data point. 310 | 311 | NOTE: The size of `y_sample` may be slightly higher than defined by 312 | `sample_size` in order to meet stratification requirements! 313 | Thus, please always check the size in the downstream workflow. 314 | 315 | Notes 316 | ----- 317 | Only calculate distances to a fixed number/fraction of all ``n`` points. 318 | These ``s`` points are sampled according to the chosen strategy (see above). 319 | In other words, calculate the distance from all points to each point 320 | in the sample to obtain a ``n x s`` distance matrix. 
321 | 322 | """ 323 | check_vector_matrix_shape_fits_labels(X, y) 324 | n = X.shape[0] 325 | if not isinstance(sample_size, int): 326 | sample_size = int(sample_size * n) 327 | if strategy == 'a': 328 | try: # scikit-learn == 0.18 329 | sss = StratifiedShuffleSplit(n_splits=1, test_size=sample_size, 330 | random_state=random_state) 331 | _, y_sample = sss.split(X=X, y=y) 332 | except ValueError: # scikit-learn >= 0.18.1 333 | _, y_sample = next(sss.split(X=X, y=y)) 334 | except TypeError: # scikit-learn < 0.18 335 | sss = StratifiedShuffleSplit(y=y, n_iter=1, test_size=sample_size, 336 | random_state=random_state) 337 | _, y_sample = next(iter(sss)) 338 | elif strategy == 'b': 339 | raise NotImplementedError("Strategy 'b' is not yet implemented.") 340 | #======================================================================= 341 | # y_sample = np.zeros((n, sample_size)) 342 | # try: # scikit-learn >= 0.18 343 | # for i in range(n): 344 | # sss = StratifiedShuffleSplit(n_splits=1, test_size=sample_size) 345 | # _, y_sample[i, :] = sss.split(X=y, y=y) 346 | # except TypeError: # scikit-learn < 0.18 347 | # for i in range(n): 348 | # sss = StratifiedShuffleSplit(y=y, n_iter=1, test_size=sample_size) 349 | # _, y_sample[i, :] = next(iter(sss)) 350 | # # TODO will need to adapt cdist call below... 351 | #======================================================================= 352 | else: 353 | raise NotImplementedError("Strategy", strategy, "unknown.") 354 | 355 | D = cdist(X, X[y_sample, :], metric=metric) 356 | return D, y_sample 357 | -------------------------------------------------------------------------------- /hub_toolbox/example_datasets/ABOUT: -------------------------------------------------------------------------------- 1 | DEXTER is a text classification problem in a bag-of-word representation. This 2 | is a two-class classification problem with sparse continuous input variables. 3 | This dataset is one of five datasets of the NIPS 2003 feature selection 4 | challenge. 
5 | 6 | http://archive.ics.uci.edu/ml/datasets/Dexter 7 | -------------------------------------------------------------------------------- /hub_toolbox/example_datasets/dexter_train.labels: -------------------------------------------------------------------------------- 1 | 1 2 | -1 3 | 1 4 | -1 5 | 1 6 | -1 7 | 1 8 | -1 9 | 1 10 | 1 11 | 1 12 | 1 13 | -1 14 | 1 15 | 1 16 | 1 17 | -1 18 | 1 19 | -1 20 | -1 21 | 1 22 | -1 23 | 1 24 | 1 25 | 1 26 | 1 27 | 1 28 | -1 29 | -1 30 | -1 31 | 1 32 | -1 33 | -1 34 | 1 35 | 1 36 | 1 37 | 1 38 | -1 39 | 1 40 | -1 41 | -1 42 | -1 43 | -1 44 | 1 45 | -1 46 | -1 47 | -1 48 | -1 49 | -1 50 | 1 51 | -1 52 | -1 53 | 1 54 | -1 55 | -1 56 | -1 57 | 1 58 | 1 59 | 1 60 | 1 61 | 1 62 | -1 63 | -1 64 | -1 65 | -1 66 | -1 67 | 1 68 | -1 69 | 1 70 | -1 71 | 1 72 | -1 73 | -1 74 | -1 75 | 1 76 | 1 77 | 1 78 | 1 79 | 1 80 | -1 81 | -1 82 | -1 83 | -1 84 | -1 85 | 1 86 | 1 87 | 1 88 | 1 89 | -1 90 | -1 91 | -1 92 | -1 93 | 1 94 | -1 95 | 1 96 | -1 97 | -1 98 | 1 99 | 1 100 | -1 101 | 1 102 | 1 103 | -1 104 | -1 105 | 1 106 | 1 107 | 1 108 | 1 109 | -1 110 | -1 111 | -1 112 | 1 113 | 1 114 | -1 115 | 1 116 | 1 117 | -1 118 | -1 119 | 1 120 | 1 121 | -1 122 | 1 123 | -1 124 | -1 125 | 1 126 | 1 127 | 1 128 | -1 129 | -1 130 | 1 131 | 1 132 | 1 133 | -1 134 | -1 135 | 1 136 | 1 137 | -1 138 | -1 139 | 1 140 | -1 141 | 1 142 | 1 143 | 1 144 | -1 145 | -1 146 | -1 147 | 1 148 | 1 149 | -1 150 | -1 151 | 1 152 | -1 153 | 1 154 | -1 155 | 1 156 | -1 157 | -1 158 | 1 159 | 1 160 | -1 161 | 1 162 | -1 163 | 1 164 | -1 165 | -1 166 | 1 167 | -1 168 | 1 169 | 1 170 | -1 171 | 1 172 | -1 173 | 1 174 | -1 175 | -1 176 | -1 177 | 1 178 | -1 179 | 1 180 | 1 181 | 1 182 | 1 183 | -1 184 | -1 185 | 1 186 | -1 187 | 1 188 | 1 189 | 1 190 | -1 191 | -1 192 | 1 193 | -1 194 | -1 195 | 1 196 | -1 197 | -1 198 | -1 199 | 1 200 | -1 201 | -1 202 | 1 203 | 1 204 | -1 205 | 1 206 | -1 207 | 1 208 | 1 209 | -1 210 | 1 211 | 1 212 | -1 213 | -1 214 | -1 215 | 1 216 | -1 217 | -1 218 | 1 219 | 1 220 | -1 221 | 1 222 | -1 223 | -1 224 | -1 225 | -1 226 | 1 227 | 1 228 | 1 229 | 1 230 | 1 231 | 1 232 | 1 233 | -1 234 | -1 235 | 1 236 | -1 237 | -1 238 | 1 239 | 1 240 | -1 241 | 1 242 | 1 243 | -1 244 | -1 245 | -1 246 | 1 247 | 1 248 | 1 249 | -1 250 | 1 251 | 1 252 | -1 253 | 1 254 | -1 255 | -1 256 | -1 257 | -1 258 | 1 259 | -1 260 | 1 261 | 1 262 | -1 263 | -1 264 | 1 265 | 1 266 | -1 267 | -1 268 | 1 269 | 1 270 | 1 271 | -1 272 | -1 273 | -1 274 | -1 275 | 1 276 | 1 277 | 1 278 | 1 279 | 1 280 | -1 281 | -1 282 | 1 283 | 1 284 | -1 285 | -1 286 | 1 287 | 1 288 | -1 289 | 1 290 | -1 291 | -1 292 | 1 293 | 1 294 | 1 295 | -1 296 | -1 297 | -1 298 | -1 299 | 1 300 | -1 301 | -------------------------------------------------------------------------------- /hub_toolbox/goodman_kruskal.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | This file is part of the HUB TOOLBOX available at 5 | https://github.com/OFAI/hub-toolbox-python3/ 6 | The HUB TOOLBOX is licensed under the terms of the GNU GPLv3. 
7 | 8 | (c) 2011-2018, Dominik Schnitzer, Roman Feldbauer 9 | Austrian Research Institute for Artificial Intelligence (OFAI) 10 | Contact: 11 | """ 12 | import sys 13 | import numpy as np 14 | from scipy.sparse import csr_matrix, lil_matrix 15 | from hub_toolbox import io 16 | 17 | __all__ = ['goodman_kruskal_index', 'sparse_goodman_kruskal_index'] 18 | 19 | def goodman_kruskal_index(D:np.ndarray, classes:np.ndarray, 20 | metric:str='distance') -> float: 21 | """Calculate the Goodman-Kruskal clustering index. 22 | 23 | Parameters 24 | ---------- 25 | D : ndarray 26 | The ``n x n`` symmetric distance (similarity) matrix. 27 | 28 | classes : ndarray 29 | The ``1 x n`` vector of class labels for each point. 30 | 31 | metric : {'distance', 'similarity'}, optional (default: 'distance') 32 | Define, whether the matrix `D` is a distance or similarity matrix 33 | 34 | Returns 35 | ------- 36 | gamma : float 37 | Goodman-Kruskal index in ``[-1, 1]`` (higher=better) 38 | 39 | Notes 40 | ----- 41 | This clustering quality measure relates the number of concordant (`Q_c`) 42 | and discordant (`Q_d`) quadruples (`d_ij`, `d_kl`) of a distance matrix. 43 | We only consider tuples, so that `i`, `j` are from the same class 44 | and `k`, `l` are from different classes. Then a quadruple is... 45 | concordant, if 46 | 47 | .. math:: 48 | d_{i,j} < d_{k,l} 49 | 50 | discordant, if 51 | 52 | .. math:: 53 | d_{i,j} > d_{k,l} 54 | 55 | and not counted, otherwise. 56 | 57 | The Goodman-Kruskal index gamma is then defined as: 58 | 59 | .. math:: 60 | gamma = \\frac{Q_c - Q_d}{Q_c + Q_d} 61 | 62 | `gamma` is bounded to ``[-1, 1]``, where larger values indicate better 63 | clustering. 64 | """ 65 | 66 | # Checking input 67 | io.check_distance_matrix_shape(D) 68 | io.check_distance_matrix_shape_fits_labels(D, classes) 69 | io.check_valid_metric_parameter(metric) 70 | 71 | # Calculations 72 | Q_c = 0.0 73 | Q_d = 0.0 74 | cls = np.unique(classes) 75 | 76 | # D_kl pairs in different classes 77 | other = classes[:, np.newaxis] != classes[np.newaxis, :] 78 | D_other = D[np.triu(other, 1)] 79 | 80 | for c in cls: 81 | sel = classes == c 82 | if np.sum(sel) > 1: 83 | sel = sel[:, np.newaxis].astype(bool) 84 | selD = np.logical_and(sel, sel.T) 85 | # D_ij pairs within same class 86 | D_self = D[np.triu(selD, 1).astype(bool).T].T 87 | else: 88 | # skip if there is only one item per class 89 | continue 90 | # D_kl pairs in different classes (D_other) are computed once for all c 91 | D_full = np.append(D_self, D_other) 92 | 93 | self_size = np.max(np.shape(D_self)) 94 | other_size = np.max(np.shape(D_other)) 95 | # Sort algorithm must be stable! 
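        # (mergesort is stable: entries with equal distance keep their input
        #  order, which keeps the equidistance correction below deterministic)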
96 |         full_idx = np.argsort(D_full, kind='mergesort')[::-1]
97 | 
98 |         # Calc number of quadruples with equal distance
99 |         n_equidistant = 0
100 |         sdf = np.sort(D_full, axis=None)
101 |         equi_mask = np.zeros(sdf.size, dtype=bool)
102 |         # Positions with repeated values
103 |         equi_mask[1:] = sdf[1:] == sdf[:-1]
104 |         equi_dist = sdf[equi_mask]
105 |         # How often does each value occur in self/other:
106 |         for dist in np.unique(equi_dist):
107 |             equi_arg = np.where(D_full == dist)[0]
108 |             self_equi = (equi_arg < self_size).sum()
109 |             other_equi = len(equi_arg) - self_equi
110 |             # Number of dc that are actually equal
111 |             n_equidistant += self_equi * other_equi
112 | 
113 |         # Calc number of concordant quadruples
114 |         cc = 0
115 |         ccsize = other_size
116 |         for idx in full_idx:
117 |             if idx < self_size:
118 |                 cc += ccsize
119 |             else:
120 |                 ccsize -= 1
121 | 
122 |         # Calc number of discordant quadruples
123 |         dc = self_size * other_size - cc - n_equidistant
124 | 
125 |         Q_c += cc
126 |         Q_d += dc
127 | 
128 |     # Calc Goodman-Kruskal's gamma
129 |     if Q_c + Q_d == 0:
130 |         gamma = 0.0
131 |     else:
132 |         if metric == 'similarity':
133 |             gamma = (Q_c - Q_d) / (Q_c + Q_d)
134 |         else:
135 |             gamma = (Q_d - Q_c) / (Q_c + Q_d)
136 | 
137 |     return gamma
138 | 
139 | def sparse_goodman_kruskal_index(S:csr_matrix, classes:np.ndarray,
140 |                                  metric='similarity', zero_mv:bool=False,
141 |                                  heuristic:str=None, verbose:int=0) -> float:
142 |     """Calculate the Goodman-Kruskal clustering index.
143 | 
144 |     Parameters
145 |     ----------
146 |     S : csr_matrix
147 |         The ``n x n`` symmetric similarity matrix.
148 | 
149 |     classes : ndarray
150 |         The ``1 x n`` vector of class labels for each point.
151 | 
152 |     metric : {'similarity', 'distance'}, optional (default: 'similarity')
153 |         Define whether the matrix `S` is a distance or similarity matrix.
154 | 
155 |         NOTE: 'distance' is used for debugging purposes only. Use the standard
156 |         goodman_kruskal_index function for distance matrices.
157 | 
158 |     zero_mv : boolean, optional (default: False)
159 |         Treat zeros as missing values, i.e. tuples with any zero
160 |         similarities are not counted.
161 | 
162 |     heuristic : {None, 'equal_sim'}, optional (default: None)
163 |         * None - Exact GK
164 |         * 'equal_sim' - omit the expensive search for equal similarities.
165 |           Useful when no/few equal similarities are expected.
166 |           Do NOT use in case of SharedNN matrices!
167 | 
168 |         NOTE: Equal zero similarities are still considered
169 |         when using the heuristic.
170 | 
171 |     verbose : int, optional (default: 0)
172 |         Increasing level of output (progress report).
173 | 
174 |     Returns
175 |     -------
176 |     gamma : float
177 |         Goodman-Kruskal index in ``[-1, 1]`` (higher=better)
178 | 
179 |     Notes
180 |     -----
181 |     This clustering quality measure relates the number of concordant (`Q_c`)
182 |     and discordant (`Q_d`) quadruples (`d_ij`, `d_kl`) of a distance matrix.
183 |     We only consider tuples such that `i`, `j` are from the same class
184 |     and `k`, `l` are from different classes. Then a quadruple is...
185 |     concordant, if
186 | 
187 |     .. math::
188 |         d_{i,j} < d_{k,l}
189 | 
190 |     discordant, if
191 | 
192 |     .. math::
193 |         d_{i,j} > d_{k,l}
194 | 
195 |     and not counted, otherwise.
196 | 
197 |     The Goodman-Kruskal index gamma is then defined as:
198 | 
199 |     .. math::
200 |         gamma = \\frac{Q_c - Q_d}{Q_c + Q_d}
201 | 
202 |     `gamma` is bounded to ``[-1, 1]``, where larger values indicate better
203 |     clustering.
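
    Examples
    --------
    A sketch with a tiny toy similarity matrix (hypothetical values; two
    objects of class 0, one of class 1; all intra-class similarities exceed
    the inter-class ones, so gamma is maximal):

    >>> import numpy as np
    >>> from scipy.sparse import csr_matrix
    >>> S = csr_matrix(np.array([[1. , 0.9, 0.2],
    ...                          [0.9, 1. , 0.1],
    ...                          [0.2, 0.1, 1. ]]))
    >>> classes = np.array([0, 0, 1])
    >>> sparse_goodman_kruskal_index(S, classes)  # doctest: +SKIP
    1.0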
204 | """ 205 | 206 | # Checking input 207 | io.check_distance_matrix_shape(S) 208 | io.check_distance_matrix_shape_fits_labels(S, classes) 209 | io.check_valid_metric_parameter(metric) 210 | 211 | if verbose: 212 | print("Sparse Goodman-Kruskal") 213 | sys.stdout.write("----------------------") 214 | print(flush=True) 215 | # Calculations 216 | Qc = 0.0 217 | Qd = 0.0 218 | n = classes.size 219 | 220 | # S_kl pairs in different classes 221 | S_other_list = lil_matrix((n, n)) 222 | other_nnz = 0 223 | # building the complete mask at once would result in dense N x N matrix 224 | if verbose >= 2: 225 | print("Finding S_kl pairs with different class labels...", 226 | end=' ', flush=True) 227 | for i, c in enumerate(classes): 228 | cur_other = csr_matrix((c != classes)[i+1:]) 229 | other_nnz += cur_other.nnz 230 | S_other_list[i, :cur_other.shape[1]] = \ 231 | S[i, i+1:].multiply(cur_other) 232 | n_other_zeros = other_nnz - S_other_list.nnz 233 | # The following might be achieved faster w/o csr intermediate 234 | S_other = S_other_list.tocsr().data 235 | del S_other_list, cur_other 236 | if verbose >= 2: 237 | print("done.", flush=True) 238 | 239 | cls = np.unique(classes) 240 | for c in cls: 241 | if verbose == 1:# and c % 10 == 0: 242 | # end='\r' does not work with jupyter notebook 243 | print("Class: {}/{}".format(c, len(cls)), end='') 244 | sel = classes == c 245 | if np.sum(sel) > 1: 246 | if verbose >= 2: 247 | print("Finding S_ij pairs for class {}..." 248 | .format(c), end=' ') 249 | n = sel.size 250 | # intra-class distances 251 | S_self_list = lil_matrix((n, n)) 252 | self_nnz = 0 253 | 254 | # Only visit points of self class 255 | sel_arg = np.where(sel > 0)[0] 256 | for i in sel_arg: 257 | cur_self = csr_matrix(sel[i+1:]) 258 | self_nnz += cur_self.nnz 259 | S_self_list[i, :cur_self.shape[1]] = \ 260 | S[i, i+1:].multiply(cur_self) 261 | 262 | n_self_zeros = self_nnz - S_self_list.nnz 263 | # Same as with S_other 264 | S_self = S_self_list.tocsr().data 265 | del S_self_list, cur_self 266 | if verbose >= 2: 267 | print("done.") 268 | else: 269 | # skip if there is only one item per class 270 | if verbose == 1: # and c % 10 == 0: 271 | sys.stdout.write('\r') 272 | continue 273 | 274 | # S_kl pairs in different classes are computed once for all c 275 | if verbose >= 2: 276 | print("Sorting data...", end=' ') 277 | S_full_data = np.append(S_self, S_other) 278 | 279 | self_data_size = S_self.size 280 | self_size = S_self.size + n_self_zeros 281 | other_data_size = S_other.size 282 | other_size = S_other.size + n_other_zeros 283 | full_data_idx = np.argsort(S_full_data, kind='mergesort')[::-1] 284 | del S_self 285 | if verbose >= 2: 286 | print("done.", flush=True) 287 | 288 | # Calc number of quadruples with equal distance 289 | if verbose >= 2: 290 | print("Calculating number of quadruples with equal distance...", 291 | end=' ') 292 | n_equidistant = 0 293 | # Number of equal zero similarities 294 | if zero_mv: 295 | n_zero = 0 296 | else: 297 | n_zero = n_self_zeros * n_other_zeros 298 | if heuristic == 'equal_sim': 299 | if verbose >= 2: 300 | print("OMITTED (heuristic).") 301 | else: 302 | pass 303 | else: 304 | sdf = np.sort(S_full_data, axis=None) 305 | equi_mask = np.zeros(sdf.size, dtype=bool) 306 | # Positions with repeated values 307 | equi_mask[1:] = sdf[1:] == sdf[:-1] 308 | equi_dist = sdf[equi_mask] 309 | equi_arg = 0 310 | # How often does each value occur in self/other: 311 | for dist in np.unique(equi_dist): 312 | equi_arg = np.where(S_full_data == dist)[0] 313 | 
self_equi = (equi_arg < self_data_size).sum()
314 |                 other_equi = len(equi_arg) - self_equi
315 |                 # Number of dc that are actually equal
316 |                 n_equidistant += self_equi * other_equi
317 |             del S_full_data, equi_mask, equi_dist, equi_arg
318 |         if verbose >= 2:
319 |             print("done.", flush=True)
320 | 
321 |         # Calc number of concordant quadruples
322 |         if verbose >= 2:
323 |             print("Calculating number of concordant quadruples...", end=' ')
324 |         cc = 0
325 |         if zero_mv:
326 |             ccsize = other_data_size
327 |         else:
328 |             ccsize = other_size
329 |         for idx in full_data_idx:
330 |             if idx < self_data_size:
331 |                 cc += ccsize
332 |             else:
333 |                 ccsize -= 1
334 |         if verbose >= 2:
335 |             print("done.", flush=True)
336 | 
337 |         # Calc number of discordant quadruples
338 |         if verbose >= 2:
339 |             print("Calculating number of discordant quadruples...", end=' ')
340 |         if zero_mv:
341 |             dc = self_data_size * other_data_size - cc - n_equidistant
342 |         else:
343 |             dc = self_size * other_size - cc - n_equidistant - n_zero
344 |         Qc += cc
345 |         Qd += dc
346 |         if verbose >= 2:
347 |             print("done.", flush=True)
348 |         if verbose == 1:  # and c % 10 == 0:
349 |             sys.stdout.write('\r')
350 | 
351 |     # Calc Goodman-Kruskal's gamma
352 |     if verbose >= 2:
353 |         print("Calculating Goodman-Kruskal gamma...", end=' ')
354 |     if Qc + Qd == 0:
355 |         gamma = 0.0
356 |     else:
357 |         if metric == 'similarity':
358 |             gamma = (Qc - Qd) / (Qc + Qd)
359 |         elif metric == 'distance':
360 |             gamma = (Qd - Qc) / (Qc + Qd)
361 |         else:
362 |             print("WARNING: Unknown metric type {}. Assuming 'similarity' "
363 |                   "instead. Sign of result might be reversed, if this is "
364 |                   "wrong!".format(str(metric)[:32]), file=sys.stderr)
365 |             gamma = (Qc - Qd) / (Qc + Qd)
366 |     if verbose >= 2:
367 |         print("done.", flush=True)
368 |     return gamma
369 | 
370 | def _naive_goodman_kruskal(D:np.ndarray, labels:np.ndarray, metric='distance'):
371 |     """Calculate Goodman-Kruskal's gamma (slow naive implementation)
372 | 
373 |     Please use one of the other methods for calculating the GK index. This
374 |     function is intended for testing purposes only.
375 |     """
376 | 
377 |     # Checking input
378 |     io.check_distance_matrix_shape(D)
379 |     io.check_distance_matrix_shape_fits_labels(D, labels)
380 |     io.check_valid_metric_parameter(metric)
381 |     n = D.shape[0]
382 |     Q_c = 0
383 |     Q_d = 0
384 | 
385 |     # loop through all quadruples...
386 |     for i in range(n):
387 |         # ...but ignore self distances and only count undirected edges
388 |         for j in range(i + 1, n):
389 |             if labels[i] == labels[j]:
390 |                 for k in range(n):
391 |                     for l in range(k + 1, n):
392 |                         if labels[l] != labels[k]:  # or l == i or l == j:
393 |                             if D[i, j] < D[k, l]:
394 |                                 Q_c += 1
395 |                             elif D[i, j] > D[k, l]:
396 |                                 Q_d += 1
397 |                             else:  # don't count equal distances
398 |                                 pass
399 |     if Q_c + Q_d == 0:
400 |         return 0
401 |     if metric == 'similarity':
402 |         return (Q_d - Q_c) / (Q_c + Q_d)
403 |     else:  # metric == 'distance':
404 |         return (Q_c - Q_d) / (Q_c + Q_d)
405 | 
--------------------------------------------------------------------------------
/hub_toolbox/htlogging.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | 
4 | """
5 | This file is part of the HUB TOOLBOX available at
6 | https://github.com/OFAI/hub-toolbox-python3/
7 | The HUB TOOLBOX is licensed under the terms of the GNU GPLv3.
8 | 9 | (c) 2015-2018, Roman Feldbauer 10 | Austrian Research Institute for Artificial Intelligence (OFAI) 11 | Contact: 12 | """ 13 | import sys, time 14 | from abc import ABCMeta, abstractmethod 15 | 16 | __all__ = ['ConsoleLogging'] 17 | 18 | class Logging(metaclass=ABCMeta): # pragma: no cover 19 | """Base class for time-stamped logging. 20 | 21 | Do not instantiate this class, but ConsoleLogging or FileLogging! 22 | """ 23 | @property 24 | def _current_time(self): 25 | """Formatted time stamp""" 26 | return time.strftime('%Y-%m-%d %H:%M:%S') 27 | 28 | @abstractmethod 29 | def message(self): 30 | ... 31 | @abstractmethod 32 | def warning(self): 33 | ... 34 | @abstractmethod 35 | def error(self): 36 | ... 37 | 38 | class ConsoleLogging(Logging): 39 | """Convenience functions for time-stamped logging to the console""" 40 | 41 | def message(self, *objs, flush=True): 42 | """Log normal program function""" 43 | print(self._current_time, 'INFO:', *objs) 44 | if flush: 45 | sys.stdout.flush() 46 | 47 | def warning(self, *objs, flush=True): 48 | """Log warning (program can still continue)""" 49 | print(self._current_time, 'WARNING:', *objs, file=sys.stderr) 50 | if flush: 51 | sys.stderr.flush() 52 | 53 | def error(self, *objs, flush=True): 54 | """Log error (program fails)""" 55 | print(self._current_time, 'ERROR:', *objs, file=sys.stderr) 56 | if flush: 57 | sys.stderr.flush() 58 | 59 | class FileLogging(ConsoleLogging): 60 | """Convenience functions for time-stamped logging to a file""" 61 | 62 | def __init__(self): 63 | """Not implemented""" 64 | self.warning("FileLogging not yet implemented, will print to " 65 | "console anyway.") 66 | 67 | if __name__ == '__main__': 68 | """Simple test of this module""" 69 | log = ConsoleLogging() 70 | log.message('This module supplies functions for printing and logging.') 71 | log.message('Examples:') 72 | sys.stdout.flush() 73 | time.sleep(0.01) 74 | log.warning('This is a warning.') 75 | log.error('This is an error!') 76 | sys.stderr.flush() 77 | time.sleep(0.01) 78 | log.message('You should have got three messages on stdout and ' 79 | 'two on stderr.') 80 | log = FileLogging() 81 | log.message('Still written to console, until implemented.') 82 | try: 83 | log = Logging() 84 | except TypeError as e: 85 | log.warning('Must not instantiate Logging(), got exception:\n', e) 86 | -------------------------------------------------------------------------------- /hub_toolbox/hubness_analysis.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | This file is part of the HUB TOOLBOX available at 6 | https://github.com/OFAI/hub-toolbox-python3/ 7 | The HUB TOOLBOX is licensed under the terms of the GNU GPLv3. 
8 | 
9 | (c) 2011-2018, Dominik Schnitzer, Roman Feldbauer
10 | Austrian Research Institute for Artificial Intelligence (OFAI)
11 | Contact:
12 | """
13 | from inspect import signature
14 | import numpy as np
15 | from hub_toolbox import io
16 | from hub_toolbox.centering import centering, weighted_centering, \
17 |     localized_centering, dis_sim_global, dis_sim_local
18 | from hub_toolbox.distances import cosine_distance
19 | from hub_toolbox.global_scaling import mutual_proximity_empiric, \
20 |     mutual_proximity_gammai, mutual_proximity_gaussi
21 | from hub_toolbox.goodman_kruskal import goodman_kruskal_index
22 | from hub_toolbox.hubness import hubness
23 | from hub_toolbox.intrinsic_dimension import intrinsic_dimension
24 | from hub_toolbox.knn_classification import score
25 | from hub_toolbox.local_scaling import nicdm, local_scaling
26 | from hub_toolbox.shared_neighbors import shared_nearest_neighbors
27 | 
28 | __all__ = ['HubnessAnalysis']
29 | 
30 | CITATION = \
31 | """
32 | R. Feldbauer, M. Leodolter, C. Plant and A. Flexer,
33 | "Fast Approximate Hubness Reduction for Large High-Dimensional Data",
34 | 2018 IEEE International Conference on Big Knowledge (ICBK), Singapore, 2018,
35 | pp. 358-367. doi: 10.1109/ICBK.2018.00055
36 | (tech report available at http://www.ofai.at/cgi-bin/tr-online?number+2018-02)
37 | 
38 | or
39 | 
40 | R. Feldbauer, A. Flexer, "A comprehensive empirical comparison of hubness reduction in high-dimensional spaces",
41 | Knowledge and Information Systems, 2018, https://doi.org/10.1007/s10115-018-1205-y
42 | """
43 | 
44 | 
45 | def _primary_distance(D: np.ndarray, metric):
46 |     """Return `D`, identical. (Dummy function.)"""
47 |     return D
48 | 
49 | 
50 | # New types of hubness reduction methods must be added here
51 | SEC_DIST = {'mp': mutual_proximity_empiric,
52 |             'mp_gaussi': mutual_proximity_gaussi,
53 |             'mp_gammai': mutual_proximity_gammai,
54 |             'ls': local_scaling,
55 |             'nicdm': nicdm,
56 |             'snn': shared_nearest_neighbors,
57 |             'cent': centering,
58 |             'wcent': weighted_centering,
59 |             'lcent': localized_centering,
60 |             'dsg': dis_sim_global,
61 |             'dsl': dis_sim_local,
62 |             'orig': _primary_distance  # a dummy function
63 |             }
64 | 
65 | 
66 | class HubnessAnalysis:
67 |     """The main hubness analysis class.
68 | 
69 |     For more detailed analyses (optimizing parameters, using similarity data,
70 |     etc.) please use the individual modules.
71 | 
72 |     Examples
73 |     --------
74 |     >>> from hub_toolbox.hubness_analysis import HubnessAnalysis
75 |     >>> hub = HubnessAnalysis()
76 |     >>> hub.analyze_hubness()
77 | 
78 |     >>> from hub_toolbox.io import load_dexter
79 |     >>> D, y, X = load_dexter()
80 |     >>> hub = HubnessAnalysis(D, classes=y, vectors=X)
81 |     >>> hub.analyze_hubness()
82 | 
83 |     Notes
84 |     -----
85 |     The first example loads the example data set and performs a quick
86 |     hubness analysis with some of the functions provided in this toolbox.
87 | 
88 |     For the second example you must provide a distance matrix `D` (NxN)
89 |     together with an optional class labels vector (`classes`) and the
90 |     original (optional) data vectors (`vectors`) to perform a full hubness
91 |     analysis.
92 | 
93 |     See also
94 |     --------
95 |     analyze_hubness : additional parameters (e.g. k-occurrence, k-NN)
96 |     """
97 | 
98 |     def __init__(self, D: np.ndarray = None, classes: np.ndarray = None,
99 |                  vectors: np.ndarray = None, metric: str = 'distance'):
100 |         """Initialize a quick hubness analysis.
101 | 
102 |         Parameters
103 |         ----------
104 |         D : ndarray, optional (default: None)
105 |             The n x n symmetric distance (similarity) matrix.
106 |             Default: load example dataset (dexter).
107 | 
108 |         classes : ndarray, optional (default: None)
109 |             The 1 x n class labels. Required for k-NN, GK.
110 | 
111 |         vectors : ndarray, optional (default: None)
112 |             The n x m vector data. Required for IntrDim estimation.
113 | 
114 |         metric : {'distance', 'similarity'}
115 |             Define whether `D` is a distance or similarity matrix.
116 |         """
117 | 
118 |         self.has_class_data, self.has_vector_data = False, False
119 |         if D is None:
120 |             print('\n'
121 |                   'NO PARAMETERS GIVEN! Loading & evaluating DEXTER data set.'
122 |                   '\n'
123 |                   'DEXTER is a text classification problem in a bag-of-word \n'
124 |                   'representation. This is a two-class classification problem\n'
125 |                   'with sparse continuous input variables. \n'
126 |                   'This dataset is one of five datasets of the NIPS 2003\n'
127 |                   'feature selection challenge.\n'
128 |                   'http://archive.ics.uci.edu/ml/datasets/Dexter\n')
129 |             self.D, self.classes, self.vectors = io.load_dexter()
130 |             self.has_class_data, self.has_vector_data = True, True
131 |             self.metric = 'distance'
132 |         else:
133 |             # copy data and ensure correct type (not int16 etc.)
134 |             self.D = np.copy(D).astype(np.float64)
135 |             if classes is None:
136 |                 self.classes = None
137 |             else:
138 |                 self.classes = np.copy(classes).astype(np.float64)
139 |                 self.has_class_data = True
140 |             if vectors is None:
141 |                 self.vectors = None
142 |             else:
143 |                 self.vectors = np.copy(vectors).astype(np.float64)
144 |                 self.has_vector_data = True
145 |             self.metric = metric
146 |         self.n = len(self.D)
147 |         self.experiments = []
148 | 
149 |     @property
150 |     def _header(self):
151 |         return {'mp': "MUTUAL PROXIMITY (Empiric)",
152 |                 'mp_gaussi': "MUTUAL PROXIMITY (Independent Gaussians)",
153 |                 'mp_gammai': "MUTUAL PROXIMITY (Independent Gamma)",
154 |                 'ls': "LOCAL SCALING (original)",
155 |                 'nicdm': "LOCAL SCALING (NICDM)",
156 |                 'snn': "SHARED NEAREST NEIGHBORS",
157 |                 'cent': "CENTERING",
158 |                 'wcent': "WEIGHTED CENTERING",
159 |                 'lcent': "LOCALIZED CENTERING",
160 |                 'dsg': "DISSIM GLOBAL",
161 |                 'dsl': "DISSIM LOCAL",
162 |                 'orig': "ORIGINAL DATA"}
163 | 
164 |     def _calc_intrinsic_dim(self):
165 |         """Calculate intrinsic dimension estimate."""
166 |         self.intrinsic_dim = intrinsic_dimension(X=self.vectors)
167 |         return self
168 | 
169 |     def analyze_hubness(self, experiments="orig,mp,mp_gaussi,nicdm,cent,dsg",
170 |                         hubness_k=(5, 10), knn_k=(1, 5, 20),
171 |                         print_results=True, verbose: int = 0):
172 |         """Analyze hubness in original data and rescaled distances.
173 | 
174 |         Parameters
175 |         ----------
176 |         experiments : str, optional
177 |             Define which experiments to perform. Please provide a string of
178 |             comma-separated values chosen from the options below (a usage sketch follows the list):
179 | 
180 |             - "orig" : Original, primary distances
181 |             - "mp" : Mutual Proximity (empiric)
182 |             - "mp_gaussi" : Mutual Proximity (independent Gaussians)
183 |             - "mp_gammai" : Mutual Proximity (independent Gamma)
184 |             - "ls" : Local Scaling (using k-th neighbor)
185 |             - "nicdm" : Local Scaling variant NICDM (average of k neighbors)
186 |             - "snn" : Shared Nearest Neighbors
187 |             - "cent" : Centering
188 |             - "wcent" : Weighted Centering
189 |             - "lcent" : Localized Centering
190 |             - "dsg" : DisSim Global
191 |             - "dsl" : DisSim Local
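            For example (a sketch; any comma-separated subset of the
            keys above works):

            >>> hub = HubnessAnalysis()                     # doctest: +SKIP
            >>> hub.analyze_hubness(experiments="orig,mp")  # doctest: +SKIP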
192 | 
193 |         hubness_k : tuple, optional (default: (5, 10))
194 |             Hubness parameter (skewness of `k`-occurrence)
195 | 
196 |         knn_k : tuple, optional (default: (1, 5, 20))
197 |             `k`-NN classification parameter
198 | 
199 |         print_results : bool, optional (default: True)
200 |             Define whether to print the hubness analysis report to stdout
201 | 
202 |         verbose : int, optional (default: 0)
203 |             Increasing output verbosity
204 | 
205 |         Returns
206 |         -------
207 |         self : optionally prints results to stdout
208 |         """
209 |         experiments = experiments.split(',')
210 |         if self.vectors is None:
211 |             self.intrinsic_dim = None
212 |         else:
213 |             self._calc_intrinsic_dim()
214 |         for i, exp_type in enumerate(experiments):
215 |             if verbose:
216 |                 print("Experiment {}/{} ({})".
217 |                       format(i+1, len(experiments), exp_type), end="\r")
218 |             experiment = HubnessExperiment(D=self.D,
219 |                                            secondary_distance_type=exp_type,
220 |                                            metric=self.metric,
221 |                                            classes=self.classes,
222 |                                            vectors=self.vectors)
223 |             if self.D is not None:
224 |                 experiment._calc_secondary_distance()
225 |                 for k in hubness_k:
226 |                     experiment._calc_hubness(k=k)
227 |                 if self.classes is not None:
228 |                     for k in knn_k:
229 |                         experiment._calc_knn_accuracy(k=k)
230 |                     experiment._calc_gk_index()
231 |             self.experiments.append(experiment)
232 |             if print_results:
233 |                 self.print_analysis_report(experiment, report_nr=i)
234 |         if print_results:
235 |             print("------------------------------------------------------------")
236 |             print("Thanks for using the HUB-TOOLBOX!")
237 |             print("If you use this software in a research project, please cite:")
238 |             print("\n", CITATION)
239 |             print("Please also consider citing the references to the \n"
240 |                   "individual modules/hubness functions that you use.")
241 |         return self
242 | 
243 |     def print_analysis_report(self, experiment=None, report_nr:int=0):
244 |         """Print a report of the performed hubness analysis.
245 | 
246 |         Parameters
247 |         ----------
248 |         experiment : HubnessExperiment, optional (default: None)
249 |             If given, report only this `experiment`. Otherwise, report all
250 |             experiments of this analysis.
251 | 
252 |         report_nr : int, optional (default: 0)
253 |             Method only prints headline for first report
254 | 
255 |         Returns
256 |         -------
257 |         None : Output is printed to stdout
258 |         """
259 |         if experiment is not None:
260 |             experiments = [experiment]
261 |         else:
262 |             experiments = self.experiments
263 |         if report_nr == 0:
264 |             print("\n"
265 |                   "================\n"
266 |                   "Hubness Analysis\n"
267 |                   "================\n")
268 |         for experiment in experiments:
269 |             print(self._header[experiment.secondary_distance_type] + ':')
270 |             # Print used parameters (which are the default parameters)
271 |             sig = signature(SEC_DIST[experiment.secondary_distance_type])
272 |             for p in ['k', 'kappa', 'gamma']:
273 |                 try:
274 |                     print("parameter {} = {} (for optimization use the "
275 |                           "individual modules of the HUB-TOOLBOX)".
276 |                           format(p, sig.parameters[p].default))
277 |                 except KeyError:
278 |                     pass  # function does not use this parameter
279 |             if experiment.hubness:  # print hubness results, if available
280 |                 for k in sorted(experiment.hubness.keys()):
281 |                     print('data set hubness (S^k={:2})                : {:.3}'.
282 |                           format(k, experiment.hubness[k]))
283 |                     print('% of anti-hubs at k={:2}                   : {:.4}%'.
284 |                           format(k, experiment.anti_hubs[k]))
285 |                     print('% of k={:2}-NN lists the largest hub occurs in: {:.4}%'.
286 |                           format(k, experiment.max_hub_k_occurence[k]))
287 |             else:
288 |                 print('data set hubness                           : '
289 |                       'No k given')
290 |             if experiment.knn_accuracy:  # print k-NN results, if available
291 |                 for k in sorted(experiment.knn_accuracy.keys()):
292 |                     print('k={:2}-NN classification accuracy          : {:.4}%'.
293 |                           format(k, 100.*float(experiment.knn_accuracy[k])))
294 |             else:
295 |                 print('k-NN classification accuracy               : '
296 |                       'No classes given')
297 |             # print Goodman-Kruskal result, if available
298 |             if experiment.gk_index is None:
299 |                 print('Goodman-Kruskal index (higher=better)      : '
300 |                       'No classes given/Not calculated')
301 |             else:
302 |                 print('Goodman-Kruskal index (higher=better)      : {:.3}'.
303 |                       format(experiment.gk_index))
304 |             # Embedding dimension
305 |             if self.vectors is None:
306 |                 print('embedding dimensionality                   : '
307 |                       'No vectors given')
308 |             else:
309 |                 print('embedding dimensionality                   : {}'.
310 |                       format(experiment.embedding_dim))
311 |             # Intrinsic dimension estimate, if available
312 |             if self.intrinsic_dim is None:
313 |                 print('intrinsic dimensionality estimate          : '
314 |                       'No vectors given')
315 |             else:
316 |                 print('intrinsic dimensionality estimate          : {}'.
317 |                       format(round(self.intrinsic_dim)))
318 |             print()
319 |         return
320 | 
321 | 
322 | class HubnessExperiment:
323 |     """Perform a single hubness experiment"""
324 | 
325 |     def __init__(self, D: np.ndarray, secondary_distance_type: str,
326 |                  metric: str = 'distance', classes: np.ndarray = None,
327 |                  vectors: np.ndarray = None):
328 |         """Initialize a hubness experiment"""
329 | 
330 |         io.check_distance_matrix_shape(D)
331 |         io.check_valid_metric_parameter(metric)
332 |         if secondary_distance_type not in SEC_DIST:
333 |             raise ValueError("Requested secondary distance type unknown.")
334 |         if classes is not None:
335 |             io.check_distance_matrix_shape_fits_labels(D, classes)
336 |         if vectors is None:
337 |             self.embedding_dim = None
338 |         else:  # got vectors
339 |             io.check_distance_matrix_shape_fits_vectors(D, vectors)
340 |             self.embedding_dim = vectors.shape[1]
341 |         self.original_distance = D
342 |         self.secondary_distance_type = secondary_distance_type
343 |         self.classes = classes
344 |         self.vectors = vectors
345 |         self.metric = metric
346 |         self.n = D.shape[0]
347 |         # Obtained later through functions:
348 |         self.secondary_distance = None
349 |         self.hubness = dict()
350 |         self.anti_hubs = dict()
351 |         self.max_hub_k_occurence = dict()
352 |         self.knn_accuracy = dict()
353 |         self.gk_index = None
354 | 
355 |     def _calc_secondary_distance(self):
356 |         """Calculate secondary distances (e.g.
Mutual Proximity)"""
357 |         sec_dist_fun = SEC_DIST[self.secondary_distance_type]
358 |         try:
359 |             self.secondary_distance = sec_dist_fun(
360 |                 D=self.original_distance, metric=self.metric)
361 |         except TypeError:  # centering has no keyword 'D='
362 |             if self.secondary_distance_type in ['cent', 'wcent']:
363 |                 self.secondary_distance = \
364 |                     cosine_distance(sec_dist_fun(X=self.vectors))
365 |             elif self.secondary_distance_type in ['lcent']:
366 |                 self.secondary_distance = 1. - sec_dist_fun(X=self.vectors)
367 |             elif self.secondary_distance_type in ['dsg', 'dsl']:
368 |                 self.secondary_distance = sec_dist_fun(X=self.vectors)
369 |             else:
370 |                 raise ValueError("Erroneous secondary distance type: {}".
371 |                                  format(self.secondary_distance_type))
372 |         return self
373 | 
374 |     def _calc_hubness(self, k: int = 5):
375 |         """Calculate hubness (skewness of `k`-occurrence).
376 | 
377 |         Also calculate the percentage of anti-hubs (`k`-occurrence == 0) and
378 |         the percentage of k-NN lists the largest hub occurs in.
379 |         """
380 |         S_k, _, N_k = hubness(D=self.secondary_distance,
381 |                               metric=self.metric, k=k)
382 |         self.hubness[k] = S_k
383 |         self.anti_hubs[k] = 100 * (N_k == 0).sum() / self.n
384 |         self.max_hub_k_occurence[k] = 100 * N_k.max() / self.n
385 |         return self
386 | 
387 |     def _calc_knn_accuracy(self, k: int = 5):
388 |         """Calculate `k`-NN accuracy."""
389 |         acc, _, _ = score(D=self.secondary_distance, target=self.classes,
390 |                           k=k, metric=self.metric)
391 |         self.knn_accuracy[k] = acc
392 |         return self
393 | 
394 |     def _calc_gk_index(self):
395 |         """Calculate Goodman-Kruskal's gamma."""
396 |         self.gk_index = goodman_kruskal_index(D=self.secondary_distance,
397 |                                               classes=self.classes,
398 |                                               metric=self.metric)
399 |         return self
400 | 
401 | 
402 | if __name__ == "__main__":
403 |     hub = HubnessAnalysis()
404 |     hub.analyze_hubness()
405 | 
-------------------------------------------------------------------------------- /hub_toolbox/intrinsic_dimension.py: --------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | """
 5 | This file is part of the HUB TOOLBOX available at
 6 | https://github.com/OFAI/hub-toolbox-python3/
 7 | The HUB TOOLBOX is licensed under the terms of the GNU GPLv3.
 8 | 
 9 | (c) 2011-2018, Dominik Schnitzer, Roman Feldbauer
10 | Austrian Research Institute for Artificial Intelligence (OFAI)
11 | Contact: 
12 | 
13 | This file is based on a Matlab script by Elizaveta Levina, University of
14 | Michigan, available at http://dept.stat.lsa.umich.edu/~elevina/mledim.m
15 | 
16 | Reference: E. Levina and P.J. Bickel (2005).
17 |     "Maximum Likelihood Estimation of Intrinsic Dimension."
18 |     In Advances in NIPS 17, Eds. L. K. Saul, Y. Weiss, L. Bottou.
19 | """
20 | import numpy as np
21 | 
22 | __all__ = ['intrinsic_dimension']
23 | 
24 | def intrinsic_dimension(X: np.ndarray, k1: int = 6, k2: int = 12,
25 |                         estimator: str = 'mackay', metric: str = 'vector',
26 |                         trafo: str = None, mem_threshold: int = 5000):
27 |     """Calculate intrinsic dimension based on the MLE by Levina and Bickel [1]_.
28 | 
29 |     Parameters
30 |     ----------
31 |     X : ndarray
32 |         - An ``n x m`` vector data matrix with ``n`` objects in an
33 |           ``m``-dimensional feature space
34 |         - An ``n x n`` distance matrix.
35 | 
36 |         NOTE: The type must be defined via parameter `metric`!
37 | 
38 |     k1 : int, optional (default: 6)
39 |         Start of neighborhood range to search in.
40 | 
41 |     k2 : int, optional (default: 12)
42 |         End of neighborhood range to search in.
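        (Both bounds are inclusive: per-point estimates are computed for every
        neighborhood size k = k1, ..., k2 and then averaged; see `estimator`.)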
43 | 
44 |     estimator : {'levina', 'mackay'}, optional (default: 'mackay')
45 |         Determine the summation strategy: see [2]_.
46 | 
47 |     metric : {'vector', 'distance'}, optional (default: 'vector')
48 |         Determine data type of `X`.
49 | 
50 |         NOTE: the MLE was derived for Euclidean distances. Using
51 |         other dissimilarity measures may lead to undefined results.
52 | 
53 |     trafo : {None, 'std', 'var'}, optional (default: None)
54 |         Transform vector data.
55 | 
56 |         - None: no transformation
57 |         - 'std': standardization
58 |         - 'var': subtract mean, divide by variance (default behavior of
59 |           Laurens van der Maaten's DR toolbox; most likely for other
60 |           ID/DR techniques).
61 | 
62 |     mem_threshold : int, optional, default: 5000
63 |         Controls the speed-memory usage trade-off: if the number of points is
64 |         higher than the given value, don't calculate the complete distance
65 |         matrix at once (fast, high memory), but per row (slower, less memory).
66 | 
67 |     Returns
68 |     -------
69 |     d_mle : float
70 |         Intrinsic dimension estimate (not rounded)
71 | 
72 |     References
73 |     ----------
74 |     .. [1] Levina, E., & Bickel, P. (2004). Maximum likelihood estimation of
75 |            intrinsic dimension. Advances in Neural Information …, 17, 777–784.
76 |            http://doi.org/10.2307/2335172
77 |     .. [2] http://www.inference.phy.cam.ac.uk/mackay/dimension/
78 |     """
79 |     n = X.shape[0]
80 |     if estimator not in ['levina', 'mackay']:
81 |         raise ValueError("Parameter 'estimator' must be 'levina' or 'mackay'.")
82 |     if k1 < 1 or k2 < k1 or k2 >= n:
83 |         raise ValueError("Invalid neighborhood: Please make sure that "
84 |                          "0 < k1 <= k2 < n. (Got k1={} and k2={}).".
85 |                          format(k1, k2))
86 |     X = X.copy().astype(float)
87 | 
88 |     if metric == 'vector':
89 |         # Sort the rows lexicographically (does not remove duplicate rows)
90 |         X = X[np.lexsort(np.fliplr(X).T)]
91 | 
92 |         if trafo is None:
93 |             pass
94 |         elif trafo == 'var':
95 |             X -= X.mean(axis=0)  # broadcast
96 |             X /= X.var(axis=0) + 1e-7  # broadcast
97 |         elif trafo == 'std':
98 |             # Standardization
99 |             X -= X.mean(axis=0)  # broadcast
100 |             X /= X.std(axis=0) + 1e-7  # broadcast
101 |         else:
102 |             raise ValueError("Transformation must be None, 'std', or 'var'.")
103 | 
104 |         # Compute matrix of log nearest neighbor distances
105 |         X2 = (X**2).sum(1)
106 | 
107 |         if n <= mem_threshold:  # speed-memory trade-off
108 |             distance = X2.reshape(-1, 1) + X2 - 2*np.dot(X, X.T)  # 2x broadcast
109 |             distance.sort(1)
110 |             # Replace invalid values with a small number
111 |             distance[distance <= 0] = 1e-7
112 |             knnmatrix = .5 * np.log(distance[:, 1:k2+1])
113 |         else:
114 |             knnmatrix = np.zeros((n, k2))
115 |             for i in range(n):
116 |                 distance = np.sort(X2[i] + X2 - 2 * np.dot(X, X[i, :]))
117 |                 # Replace invalid values with a small number
118 |                 distance[distance <= 0] = 1e-7
119 |                 knnmatrix[i, :] = .5 * np.log(distance[1:k2+1])
120 |     elif metric == 'distance':
121 |         raise NotImplementedError("ID currently only supports vector data.")
122 |         # XXX perhaps map to sufficiently high dim with MDS, then calc ID??
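        # Editor's note (added): the estimate computed in the 'vector' branch
        # above, written out. With T_j(x) the distance from x to its j-th
        # nearest neighbor, the per-point, per-k estimate is
        #
        #     d_hat_k(x) = (k - 2) / sum_{j=1..k-1} log(T_k(x) / T_j(x))
        #
        # i.e. the Levina-Bickel MLE with the unbiased (k - 2) correction from
        # MacKay's note [2]_. estimator='levina' averages d_hat_k(x) over all
        # points and all k in [k1, k2]; estimator='mackay' instead averages the
        # inverses 1/d_hat_k(x) over points, inverts again, and averages over k.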
123 | #======================================================================= 124 | # # TODO calculation WRONG 125 | # X.sort(1) 126 | # X[X < 0] = 1e-7 127 | # knnmatrix = np.log(X[:, 1:k2+1]) 128 | #======================================================================= 129 | elif metric == 'similarity': 130 | raise NotImplementedError("ID currently only supports vector data.") 131 | #======================================================================= 132 | # # TODO calculation WRONG 133 | # print("WARNING: using similarity data may return " 134 | # "undefined results.", file=sys.stderr) 135 | # X[X < 0] = 0 136 | # distance = 1 - (X / X.max()) 137 | # knnmatrix = np.log(distance[:, 1:k2+1]) 138 | #======================================================================= 139 | else: 140 | raise ValueError("Parameter `metric` must be 'vector'.") 141 | 142 | # Compute the ML estimate 143 | S = np.cumsum(knnmatrix, 1) 144 | indexk = np.arange(k1, k2+1) # broadcasted afterwards 145 | dhat = -(indexk - 2) / (S[:, k1-1:k2] - knnmatrix[:, k1-1:k2] * indexk) 146 | if estimator == 'levina': 147 | # Average over estimates and over values of k 148 | no_dims = dhat.mean() 149 | if estimator == 'mackay': 150 | # Average over inverses 151 | dhat **= -1 152 | dhat_k = dhat.mean(0) 153 | no_dims = (dhat_k ** -1).mean() 154 | return no_dims 155 | 156 | if __name__ == '__main__': 157 | m_dim = 100 158 | n_dim = 2000 159 | VECT_DATA = np.random.rand(n_dim, m_dim) 160 | id_ = intrinsic_dimension(VECT_DATA) 161 | print("Random {}x{} matrix: ID_MLE = {}".format(n_dim, m_dim, id_)) 162 | -------------------------------------------------------------------------------- /hub_toolbox/io.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | This file is part of the HUB TOOLBOX available at 6 | https://github.com/OFAI/hub-toolbox-python3/ 7 | The HUB TOOLBOX is licensed under the terms of the GNU GPLv3. 8 | 9 | (c) 2015-2018, Roman Feldbauer 10 | Austrian Research Institute for Artificial Intelligence (OFAI) 11 | Contact: 12 | """ 13 | 14 | import os 15 | import numpy as np 16 | from scipy import sparse 17 | from scipy.sparse.base import issparse 18 | 19 | __all__ = ['load_dexter', 'random_sparse_matrix', 20 | 'load_csr_matrix', 'save_csr_matrix'] 21 | 22 | def load_dexter(): 23 | """Load the example data set (dexter). 
24 | 25 | Returns 26 | ------- 27 | D : ndarray 28 | Distance matrix 29 | 30 | classes : ndarray 31 | Class label vector 32 | 33 | vectors : ndarray 34 | Vector data matrix 35 | """ 36 | from hub_toolbox.distances import cosine_distance 37 | 38 | n = 300 39 | dim = 20000 40 | 41 | # Read class labels 42 | classes_file = os.path.dirname(os.path.realpath(__file__)) +\ 43 | '/example_datasets/dexter_train.labels' 44 | classes = np.loadtxt(classes_file) 45 | 46 | # Read data 47 | vectors = np.zeros((n, dim)) 48 | data_file = os.path.dirname(os.path.realpath(__file__)) + \ 49 | '/example_datasets/dexter_train.data' 50 | with open(data_file, mode='r') as fid: 51 | data = fid.readlines() 52 | row = 0 53 | for line in data: 54 | line = line.strip().split() # line now contains pairs of dim:val 55 | for word in line: 56 | col, val = word.split(':') 57 | vectors[row][int(col)-1] = int(val) 58 | row += 1 59 | 60 | # Calc distance 61 | D = cosine_distance(vectors) 62 | return D, classes, vectors 63 | 64 | def check_is_nD_array(arr:np.ndarray, n:int, arr_type=''): 65 | """ Check that array is exactly n dimensional. """ 66 | try: 67 | if arr.ndim != n: 68 | raise TypeError(arr_type + " array must be a " + str(n) + 69 | "D array, but was found to be a " + 70 | str(arr.ndim) + "D array with shape: " + 71 | str(arr.shape)) 72 | except AttributeError: 73 | raise TypeError("Object 'arr' does not seem to be an array.") 74 | 75 | def check_distance_matrix_shape(D:np.ndarray): 76 | """ Check that matrix is quadratic. """ 77 | check_is_nD_array(D, n=2, arr_type="Distance/similarity") 78 | if D.shape[0] != D.shape[1]: 79 | raise TypeError("Distance/similarity matrix is not quadratic. " 80 | "Shape: {}".format(D.shape)) 81 | 82 | def check_distance_matrix_shape_fits_vectors(D:np.ndarray, vectors:np.ndarray): 83 | """ Check number of points in distance matrix equal number of vectors. """ 84 | check_is_nD_array(D, 2, "Distance/similarity") 85 | check_is_nD_array(vectors, 2, "Data vectors") 86 | if D.shape[0] != vectors.shape[0]: 87 | raise TypeError("Number of points in `vectors` does not match " 88 | "number of points in `D`. Shape of `vectors`: {}, " 89 | "shape of `D`: {}".format(vectors.shape[0], D.shape[0])) 90 | 91 | def check_distance_matrix_shape_fits_labels(D:np.ndarray, classes:np.ndarray): 92 | """ Check the number of points in distance matrix equal number of labels.""" 93 | check_is_nD_array(D, 2, "Distance/similarity") 94 | check_is_nD_array(classes, 1, "Class label") 95 | if classes.size != D.shape[0]: 96 | raise TypeError("Number of class labels does not match number of " 97 | "points. Labels: {}, points: {}." 98 | .format(classes.size, D.shape[0])) 99 | 100 | def check_vector_matrix_shape_fits_labels(X:np.ndarray, classes:np.ndarray): 101 | """ Check the number of points in vector matrix equal number of labels.""" 102 | check_is_nD_array(X, 2, "Data vectors") 103 | check_is_nD_array(classes, 1, "Class label") 104 | if classes.size != X.shape[0]: 105 | raise TypeError("Number of class labels does not match number of " 106 | "points. Labels: {}, points: {}." 107 | .format(classes.size, X.shape[0])) 108 | 109 | def check_sample_shape_fits(D:np.ndarray, idx:np.ndarray): 110 | """ Check that number of columns in ``D`` equals the size of ``idx``. 
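    Editor's note (inferred from the checks below): `D` is expected to be an
    n x s matrix holding distances from all n points to s sampled points,
    and `idx` the s indices of those sample points; hence s <= n must hold.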
""" 111 | if issparse(D) or issparse(idx): 112 | raise TypeError("Sparse matrices are not supported for SampleMP.") 113 | check_is_nD_array(D, 2, "Distance/similarity") 114 | check_is_nD_array(idx, 1, "Index") 115 | if D.shape[1] > D.shape[0]: 116 | raise ValueError("Number of samples is higher than number of points. " 117 | "Must be less than or equal. In the latter case, " 118 | "consider not using samples at all for efficiency. " 119 | "Shape of `D`: {}.".format(D.shape)) 120 | if D.shape[1] != idx.size: 121 | raise TypeError("Number of samples in index array does not match " 122 | "the number of samples in the data matrix. " 123 | "Size of `idx`: {}, Columns in `D`: {}." 124 | .format(idx.size, D.shape[1])) 125 | 126 | def check_valid_metric_parameter(metric:str): 127 | """ Check parameter is either 'distance' or 'similarity'. """ 128 | if metric != 'distance' and metric != 'similarity': 129 | raise ValueError("Parameter 'metric' must be " 130 | "'distance' or 'similarity'." 131 | "Got: " + metric.__str__()) 132 | 133 | def matrix_split(rows, cols, elem_size=8, nr_matrices=4): # pragma: no cover 134 | """Determine how to split a matrix that does not fit into memory. 135 | 136 | Parameters 137 | ---------- 138 | rows, cols : int 139 | Shape of matrix that should be split. 140 | 141 | elem_size : int 142 | memory requirement per matrix element in bytes. E.g. 8 bytes for float64 143 | 144 | nr_matrices : int 145 | How many times must the split matrix fit into memory? 146 | This depends on the subsequent operations. 147 | 148 | Returns 149 | ------- 150 | nr_batches : int 151 | number of submatrices 152 | 153 | nr_rows : int 154 | number of rows per submatrix. 155 | 156 | Notes 157 | ----- 158 | - Submatrices always contain all columns per row. 159 | - The last batch will usually have less rows than `nr_rows` 160 | """ 161 | free_mem = FreeMemLinux(unit='k').user_free 162 | max_rows = int(free_mem / cols / elem_size) 163 | nr_rows = int(max_rows / nr_matrices) 164 | nr_batches = int(np.ceil(rows / nr_rows)) 165 | return nr_batches, nr_rows 166 | 167 | def random_sparse_matrix(size, density=0.05): 168 | """Generate a random sparse similarity matrix. 169 | 170 | Values are bounded by [0, 1]. Diagonal is all ones. The final density is 171 | approximately 2*`density`. 172 | 173 | Parameters 174 | ---------- 175 | size : int 176 | Shape of the matrix (`size` x `size`) 177 | 178 | density : float, optional, default=0.05 179 | The matrix' density will be approximately 2 * `density` 180 | 181 | Returns 182 | ------- 183 | S : csr_matrix 184 | Random matrix 185 | """ 186 | S = sparse.rand(size, size, density, 'csr') 187 | S += S.T 188 | S /= S.max() 189 | S -= sparse.diags(S.diagonal(), 0) 190 | S += sparse.diags(np.ones(size), 0) 191 | return S 192 | 193 | def save_csr_matrix(file, matrix): 194 | np.savez(file, data=matrix.data, indices=matrix.indices, 195 | indptr=matrix.indptr, shape=matrix.shape) 196 | return file 197 | 198 | def load_csr_matrix(file): 199 | container = np.load(file) 200 | return sparse.csr_matrix((container['data'], container['indices'], 201 | container['indptr']), shape=container['shape']) 202 | 203 | class FreeMemLinux(object): # pragma: no cover 204 | """Non-cross platform way to get free memory on Linux. 
205 | 
206 |     Original code by Oz123,
207 |     http://stackoverflow.com/questions/17718449/determine-free-ram-in-python
208 |     """
209 | 
210 |     def __init__(self, unit='kB'):
211 | 
212 |         with open('/proc/meminfo', 'r') as mem:
213 |             lines = mem.readlines()
214 | 
215 |         self._tot = int(lines[0].split()[1])
216 |         self._free = int(lines[1].split()[1])
217 |         self._buff = int(lines[2].split()[1])
218 |         self._cached = int(lines[3].split()[1])
219 |         self._shared = int(lines[20].split()[1])
220 |         self._swapt = int(lines[14].split()[1])
221 |         self._swapf = int(lines[15].split()[1])
222 |         self._swapu = self._swapt - self._swapf
223 | 
224 |         self.unit = unit
225 |         self._convert = self._factor()
226 | 
227 |     def _factor(self):
228 |         """determine the conversion factor"""
229 |         if self.unit == 'kB':
230 |             return 1
231 |         if self.unit == 'k':
232 |             return 1024.0
233 |         if self.unit == 'MB':
234 |             return 1/1024.0
235 |         if self.unit == 'GB':
236 |             return 1/1024.0/1024.0
237 |         if self.unit == '%':
238 |             return 1.0/self._tot * 100
239 |         else:
240 |             raise ValueError("Unit not understood")
241 | 
242 |     @property
243 |     def total(self):
244 |         return self._convert * self._tot
245 | 
246 |     @property
247 |     def used(self):
248 |         return self._convert * (self._tot - self._free)
249 | 
250 |     @property
251 |     def used_real(self):
252 |         """memory used which is not cache or buffers"""
253 |         return self._convert * (self._tot - self._free - self._buff - self._cached)
254 | 
255 |     @property
256 |     def shared(self):
257 |         return self._convert * self._shared
258 | 
259 |     @property
260 |     def buffers(self):
261 |         return self._convert * self._buff
262 | 
263 |     @property
264 |     def cached(self):
265 |         return self._convert * self._cached
266 | 
267 |     @property
268 |     def user_free(self):
269 |         """This is the free memory available for the user"""
270 |         return self._convert * (self._free + self._buff + self._cached)
271 | 
272 |     @property
273 |     def swap(self):
274 |         return self._convert * self._swapt
275 | 
276 |     @property
277 |     def swap_free(self):
278 |         return self._convert * self._swapf
279 | 
280 |     @property
281 |     def swap_used(self):
282 |         return self._convert * self._swapu
283 | 
284 | if __name__ == '__main__':
285 |     fml = FreeMemLinux(unit='MB')
286 |     fml2 = FreeMemLinux(unit='%')
287 |     print("Used memory: {:.1f}M ({:.1f}%).".format(fml.used_real, fml2.used_real))
288 |     print("Free memory: {:.1f}M ({:.1f}%).".format(fml.user_free, fml2.user_free))
289 | 
-------------------------------------------------------------------------------- /hub_toolbox/utils.py: --------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | This file is part of the HUB TOOLBOX available at
 4 | https://github.com/OFAI/hub-toolbox-python3/
 5 | The HUB TOOLBOX is licensed under the terms of the GNU GPLv3.
 6 | 
 7 | (c) 2018, Roman Feldbauer
 8 | Austrian Research Institute for Artificial Intelligence (OFAI)
 9 | Contact: 
10 | """
11 | from multiprocessing import Value
12 | 
13 | __all__ = ['SynchronizedCounter']
14 | 
15 | class SynchronizedCounter(object):
16 |     """ A multiprocessing-safe counter for progress information. """
17 |     def __init__(self, init: int = -1):
18 |         self.val = Value('i', init)
19 | 
20 |     def increment_and_get_value(self, n=1) -> int:
21 |         """ Obtain a lock before incrementing, since += isn't atomic.
""" 22 | with self.val.get_lock(): 23 | self.val.value += n 24 | return self.val.value 25 | 26 | @property 27 | def value(self) -> int: 28 | return self.val.value 29 | -------------------------------------------------------------------------------- /readthedocs.yml: -------------------------------------------------------------------------------- 1 | build: 2 | image: latest 3 | 4 | python: 5 | version: 3.6 -------------------------------------------------------------------------------- /readthedocs_requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OFAI/hub-toolbox-python3/b76fa405dc6ffc80484a9bfed7e68fa828b7dc8e/readthedocs_requirements.txt -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | scipy 3 | pandas 4 | scikit-learn 5 | joblib 6 | coveralls 7 | falconn 8 | nmslib 9 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | This file is part of the HUB TOOLBOX available at 6 | https://github.com/OFAI/hub-toolbox-python3/ 7 | The HUB TOOLBOX is licensed under the terms of the GNU GPLv3. 8 | 9 | (c) 2011-2018, Dominik Schnitzer and Roman Feldbauer 10 | Austrian Research Institute for Artificial Intelligence (OFAI) 11 | Contact: 12 | 13 | 14 | Installation: 15 | ------------- 16 | In the console (terminal application) change to the folder containing this file. 17 | 18 | To build the package hub_toolbox: 19 | python3 setup.py build 20 | 21 | To install the package (with administrator rights): 22 | sudo python3 setup.py install 23 | 24 | To test the installation: 25 | sudo python3 setup.py test 26 | 27 | If this succeeds with an 'OK' message, you are ready to go. 28 | Otherwise you may consider filing a bug report on github. 29 | (Some skipped tests are perfectly fine, though.) 30 | """ 31 | import re, os, sys 32 | REQ_MAJOR = 3 33 | REQ_MINOR = 6 34 | if sys.version_info < (REQ_MAJOR, REQ_MINOR): 35 | sys.stdout.write( 36 | (f"The HUB TOOLBOX requires Python {REQ_MAJOR}.{REQ_MINOR} or higher." 37 | f"\nPlease try to run as python3 setup.py or update your Python " 38 | f"environment.\n Consider using Anaconda for easy package handling.")) 39 | sys.exit(1) 40 | 41 | try: 42 | import numpy, scipy, sklearn # @UnusedImport 43 | except ImportError: 44 | sys.stdout.write("The HUB TOOLBOX requires numpy, scipy and scikit-learn. " 45 | "Please make sure these packages are available locally. " 46 | "Consider using Anaconda for easy package handling.\n") 47 | try: 48 | import pandas, joblib # @UnusedImport 49 | except ImportError: 50 | sys.stdout.write("Some modules of the HUB TOOLBOX require pandas and joblib. " 51 | "Please make sure these packages are available locally. " 52 | "Consider using Anaconda for easy package handling.\n") 53 | try: 54 | import nmslib, falconn # @UnusedImport 55 | except ImportError: 56 | sys.stdout.write("The 'approximate' module uses 'nmslib' and 'falconn' " 57 | "libraries for approximate nearest neighbor search. " 58 | "Please make sure these packages are available locally. 
" 59 | "Consider using Anaconda for easy package handling.\n") 60 | setup_options = {} 61 | 62 | try: 63 | from setuptools import setup 64 | setup_options['test_suite'] = 'tests' 65 | except ImportError: 66 | from distutils.core import setup 67 | import warnings 68 | warnings.warn("setuptools not found, resorting to distutils. " 69 | "Unit tests won't be discovered automatically.") 70 | 71 | # Parsing current version number 72 | # Adapted from the Lasagne project at 73 | # https://github.com/Lasagne/Lasagne/blob/master/setup.py 74 | here = os.path.abspath(os.path.dirname(__file__)) 75 | try: 76 | # obtain version string from __init__.py 77 | # Read ASCII file with builtin open() so __version__ is str in Python 2 and 3 78 | with open(os.path.join(here, 'hub_toolbox', '__init__.py'), 'r') as f: 79 | init_py = f.read() 80 | version = re.search("__version__ = '(.*)'", init_py).groups()[0] 81 | except Exception: 82 | version = '' 83 | 84 | setup( 85 | name = "hub_toolbox", 86 | version = version, 87 | author = "Roman Feldbauer", 88 | author_email = "roman.feldbauer@ofai.at", 89 | maintainer = "Roman Feldbauer", 90 | maintainer_email = "roman.feldbauer@ofai.at", 91 | description = "Hubness reduction and analysis tools", 92 | license = "GNU GPLv3", 93 | keywords = ["machine learning", "data science"], 94 | url = "https://github.com/OFAI/hub-toolbox-python3", 95 | packages=['hub_toolbox', 'tests'], 96 | package_data={'hub_toolbox': ['example_datasets/*']}, 97 | classifiers=[ 98 | "Development Status :: 4 - Beta", 99 | "Environment :: Console", 100 | "Intended Audience :: Science/Research", 101 | "License :: OSI Approved :: GNU General Public License v3 " 102 | "or later (GPLv3+)", 103 | "Programming Language :: Python :: 3", 104 | "Programming Language :: Python :: 3.6", 105 | "Programming Language :: Python :: 3.7", 106 | "Topic :: Scientific/Engineering" 107 | ], 108 | **setup_options 109 | ) 110 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | This file is part of the HUB TOOLBOX available at 6 | https://github.com/OFAI/hub-toolbox-python3/ 7 | The HUB TOOLBOX is licensed under the terms of the GNU GPLv3. 8 | 9 | (c) 2016-2018, Roman Feldbauer 10 | Austrian Research Institute for Artificial Intelligence (OFAI) 11 | Contact: 12 | 13 | --- 14 | 15 | unittest module 16 | """ -------------------------------------------------------------------------------- /tests/approximate_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | This file is part of the HUB TOOLBOX available at 5 | https://github.com/OFAI/hub-toolbox-python3/ 6 | The HUB TOOLBOX is licensed under the terms of the GNU GPLv3. 
7 | 8 | (c) 2018, Roman Feldbauer 9 | Austrian Research Institute for Artificial Intelligence (OFAI) 10 | Contact: 11 | """ 12 | import unittest 13 | import numpy as np 14 | from sklearn.datasets import make_classification 15 | from sklearn.model_selection import train_test_split 16 | from hub_toolbox import approximate 17 | from sklearn.metrics.classification import accuracy_score 18 | 19 | 20 | class ApproximateHRTest(unittest.TestCase): 21 | 22 | def setUp(self): 23 | n_samples = 500 24 | n_informative = 256 25 | n_features = n_informative 26 | test_size = int(n_samples * .2) 27 | X, y = make_classification( 28 | n_samples=n_samples, 29 | n_features=n_features, 30 | n_informative=n_informative, 31 | n_redundant=0, n_repeated=0, n_classes=2, 32 | n_clusters_per_class=10, random_state=2847356) 33 | X_train, X_test, y_train, y_test = train_test_split( 34 | X.astype(np.float32), y.astype(np.int32), test_size=test_size) 35 | self.X_train = X_train 36 | self.X_test = X_test 37 | self.y_train = y_train 38 | self.y_test = y_test 39 | 40 | self.hr_algorithms = ['LS', 'NICDM', 'MP', 'MPG', 'DSL', None, 'NoNe'] 41 | self.n_neighbors = 5 42 | self.n_samples = 100 43 | self.sampling_algorithms = ['random', 'kmeans++', 'LSH', 'HNSW', 44 | None, 'nOnE'] 45 | self.metrics = ['sqeuclidean', 'cosine'] 46 | self.n_jobs = [-1, 1] 47 | self.verbose = 0 48 | self.accu_time = 0. 49 | 50 | def tearDown(self): 51 | print(f'Accumulated time: {self.accu_time} seconds.') 52 | 53 | def _approximate_hr(self, hr_algorithm, sampling_algorithm, 54 | metric, n_jobs): 55 | hr = approximate.SuQHR(hr_algorithm=hr_algorithm, 56 | n_neighbors=self.n_neighbors, 57 | n_samples=self.n_samples, 58 | metric=metric, 59 | sampling_algorithm=sampling_algorithm, 60 | random_state=123, 61 | n_jobs=n_jobs, 62 | verbose=self.verbose) 63 | hr.fit(self.X_train, self.y_train) 64 | y_pred = hr.predict(self.X_test) 65 | acc = accuracy_score(y_pred, self.y_test) 66 | print(f'SuQHR ({hr_algorithm}, {sampling_algorithm}, {metric}) ' 67 | f'{self.n_neighbors}-NN accuracy: {acc:.2f}') 68 | total_time = hr.time_fit_ + hr.time_transform_ + hr.time_predict_ 69 | self.accu_time += total_time.total.values 70 | 71 | def test_approximate_hubness_reduction(self): 72 | for hr_algorithm in self.hr_algorithms: 73 | for sampling_algorithm in self.sampling_algorithms: 74 | for metric in self.metrics: 75 | for n_jobs in self.n_jobs: 76 | self._approximate_hr(hr_algorithm, 77 | sampling_algorithm, 78 | metric, 79 | n_jobs) 80 | 81 | def test_surrogate_class(self): 82 | hr = approximate.ApproximateHubnessReduction() 83 | return self.assertIn(hr.hr_algorithm, self.hr_algorithms) 84 | 85 | 86 | if __name__ == "__main__": 87 | # import sys;sys.argv = ['', 'Test.testName'] 88 | unittest.main() 89 | -------------------------------------------------------------------------------- /tests/centering_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | This file is part of the HUB TOOLBOX available at 5 | https://github.com/OFAI/hub-toolbox-python3/ 6 | The HUB TOOLBOX is licensed under the terms of the GNU GPLv3. 
7 | 8 | (c) 2016-2018, Roman Feldbauer 9 | Austrian Research Institute for Artificial Intelligence (OFAI) 10 | Contact: 11 | """ 12 | import unittest 13 | import numpy as np 14 | from sklearn.preprocessing import StandardScaler 15 | from hub_toolbox.centering import centering, weighted_centering, \ 16 | localized_centering, dis_sim_global, dis_sim_local 17 | from hub_toolbox.io import load_dexter 18 | from hub_toolbox.hubness import hubness 19 | from hub_toolbox.knn_classification import score 20 | 21 | class TestCentering(unittest.TestCase): 22 | 23 | def setUp(self): 24 | self.distance, self.target, self.vectors = load_dexter() 25 | 26 | def test_centering_equal_to_sklearn_centering(self): 27 | vectors_cent = centering(self.vectors, 'vector') 28 | scaler = StandardScaler(with_mean=True, with_std=False) 29 | vectors_sklearn_cent = scaler.fit_transform(self.vectors) 30 | return np.testing.assert_array_almost_equal( 31 | vectors_cent, vectors_sklearn_cent, decimal=7) 32 | 33 | def test_weighted_centering_with_gamma_zero_equal_centering(self): 34 | vectors_wcent = weighted_centering(self.vectors, 'cosine', gamma=0.) 35 | vectors_cent = centering(self.vectors, 'vector') 36 | return np.testing.assert_array_almost_equal( 37 | vectors_cent, vectors_wcent, decimal=7) 38 | 39 | def test_weighted_centering_with_gamma_notzero_changes_result(self): 40 | gamma = np.random.rand(1) 41 | vectors_wcent = weighted_centering(self.vectors, 'cosine', gamma) 42 | vectors_cent = centering(self.vectors, 'vector') 43 | return self.assertNotEqual((vectors_cent - vectors_wcent).sum(), 0) 44 | 45 | def test_localized_centering(self): 46 | """Test whether hubness and k-NN accuracy improve for dexter""" 47 | h_orig = hubness(self.distance)[0] 48 | acc_orig = score(self.distance, self.target)[0][0, 0] 49 | sim_lcent = localized_centering(self.vectors, kappa=20, gamma=1.) 
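        # Editor's note: localized_centering returns a *similarity* matrix,
        # which is why the hubness and score calls below pass
        # metric='similarity' instead of the default 'distance'.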
50 | h_lcent = hubness(sim_lcent, metric='similarity')[0] 51 | acc_lcent = score(sim_lcent, self.target, metric='similarity')[0][0, 0] 52 | result = (h_orig / h_lcent > 1.5) & (acc_lcent - acc_orig > 0.03) 53 | return self.assertTrue(result) 54 | 55 | def test_localized_centering_parallel(self): 56 | lcent_seq = localized_centering( 57 | self.vectors, kappa=20, gamma=1., n_jobs=4) 58 | lcent_par = localized_centering( 59 | self.vectors, kappa=20, gamma=1., n_jobs=1) 60 | return np.testing.assert_array_almost_equal(lcent_par, lcent_seq, 14) 61 | 62 | def test_dis_sim_global(self): 63 | """Test whether hubness and k-NN accuracy improve for dexter""" 64 | h_orig = hubness(self.distance)[0] 65 | acc_orig = score(self.distance, self.target)[0][0, 0] 66 | dist_dsg = dis_sim_global(self.vectors) 67 | h_dsg = hubness(dist_dsg)[0] 68 | acc_dsg = score(dist_dsg, self.target)[0][0, 0] 69 | result = (h_orig / h_dsg > 2) & (acc_dsg - acc_orig > 0.07) 70 | return self.assertTrue(result) 71 | 72 | def test_dis_sim_local(self): 73 | """Test whether hubness and k-NN accuracy improve for dexter""" 74 | #self.vectors = np.tile(self.vectors, 1) 75 | h_orig = hubness(self.distance)[0] 76 | acc_orig = score(self.distance, self.target)[0][0, 0] 77 | dist_dsl = dis_sim_local(self.vectors, k=50) 78 | h_dsl = hubness(dist_dsl)[0] 79 | acc_dsl = score(dist_dsl, self.target)[0][0, 0] 80 | result = (h_orig / h_dsl > 10) & (acc_dsl - acc_orig > 0.03) 81 | return self.assertTrue(result) 82 | 83 | def test_dis_sim_local_parallel(self): 84 | dsl_seq = dis_sim_local(self.vectors, k=50, n_jobs=1) 85 | dsl_par = dis_sim_local(self.vectors, k=50, n_jobs=4) 86 | return np.testing.assert_array_almost_equal(dsl_seq, dsl_par, 14) 87 | 88 | def test_dis_sim_local_split_parallel_(self): 89 | X = self.vectors[:150, :] 90 | Y = self.vectors[150:, :] 91 | dsl_seq = dis_sim_local(X, Y, n_jobs=1) 92 | dsl_par = dis_sim_local(X, Y, n_jobs=4) 93 | return np.testing.assert_array_almost_equal(dsl_seq, dsl_par, 14) 94 | 95 | if __name__ == "__main__": 96 | unittest.main() 97 | -------------------------------------------------------------------------------- /tests/distances_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | This file is part of the HUB TOOLBOX available at 5 | https://github.com/OFAI/hub-toolbox-python3/ 6 | The HUB TOOLBOX is licensed under the terms of the GNU GPLv3. 7 | 8 | (c) 2016-2018, Roman Feldbauer 9 | Austrian Research Institute for Artificial Intelligence (OFAI) 10 | Contact: 11 | """ 12 | import unittest 13 | import numpy as np 14 | from scipy.spatial.distance import pdist, cdist, squareform 15 | from sklearn.neighbors import KNeighborsClassifier 16 | from sklearn.metrics import accuracy_score 17 | from hub_toolbox.distances import (cosine_distance, euclidean_distance, 18 | mp_dissim) 19 | from hub_toolbox.io import load_dexter 20 | from hub_toolbox.hubness import hubness 21 | 22 | class TestDistances(unittest.TestCase): 23 | 24 | def setUp(self): 25 | np.random.seed(626) 26 | self.vectors = 99. 
* (np.random.rand(400, 200) - 0.5) 27 | 28 | def tearDown(self): 29 | del self.vectors 30 | 31 | def test_cosine_dist_equal_to_scipy_pdist_cos(self): 32 | cos_dist = cosine_distance(self.vectors) 33 | cos_dist_scipy = squareform(pdist(self.vectors, 'cosine')) 34 | return np.testing.assert_array_almost_equal( 35 | cos_dist, cos_dist_scipy, decimal=7) 36 | 37 | def test_euclidean_dist_equal_to_scipy_cdist_eucl(self): 38 | eucl_dist = euclidean_distance(self.vectors) 39 | eucl_dist_cdist = cdist(self.vectors, self.vectors, 'euclidean') 40 | return np.testing.assert_array_almost_equal( 41 | eucl_dist, eucl_dist_cdist, decimal=7) 42 | 43 | class TestMpDisSim(unittest.TestCase): 44 | 45 | def setUp(self): 46 | _, y, X = load_dexter() 47 | r = np.random.permutation(y.size) 48 | self.X = X[r, :] 49 | self.y = y[r] 50 | split = int(len(y)/10*9) 51 | train_ind = slice(0, split) 52 | test_ind = slice(split, len(y)) 53 | self.X_train = self.X[train_ind] 54 | self.X_test = self.X[test_ind] 55 | self.y_train = self.y[train_ind] 56 | self.y_test = self.y[test_ind] 57 | 58 | def test_mp_dissim(self): 59 | ''' Test that mp_dissim improves kNN-accuracy for dexter. ''' 60 | D_part = cdist(self.X_test, self.X_train, 'euclidean') 61 | knn = KNeighborsClassifier( 62 | n_neighbors=5, metric='precomputed', n_jobs=4) 63 | knn.fit(self.X_train, self.y_train) 64 | y_pred = knn.predict(D_part) 65 | acc_eucl = accuracy_score(self.y_test, y_pred) 66 | h_eucl = hubness(D_part, k=5, metric='distance', n_jobs=4)[0] 67 | D_part_mp = mp_dissim( 68 | X=self.X_test, Y=self.X_train, p=0, n_bins=10, bin_size='r', verbose=1, n_jobs=-1) 69 | y_pred_mp = knn.predict(D_part_mp) 70 | acc_mp = accuracy_score(self.y_test, y_pred_mp) 71 | h_mp = hubness(D_part_mp, k=5, metric='distance', n_jobs=4)[0] 72 | #======================================================================= 73 | # print("Hub:", h_eucl, h_mp) 74 | # print("Acc:", acc_eucl, acc_mp) 75 | # D_mp = mp_dissim(self.X, p=2, n_bins=10, bin_size='r', n_jobs=-1, verbose=1) 76 | #======================================================================= 77 | self.assertLess(h_mp, h_eucl) 78 | self.assertGreater(acc_mp, acc_eucl) 79 | 80 | if __name__ == "__main__": 81 | unittest.main() 82 | -------------------------------------------------------------------------------- /tests/goodmankruskal_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | This file is part of the HUB TOOLBOX available at 5 | https://github.com/OFAI/hub-toolbox-python3/ 6 | The HUB TOOLBOX is licensed under the terms of the GNU GPLv3. 7 | 8 | (c) 2016-2018, Roman Feldbauer 9 | Austrian Research Institute for Artificial Intelligence (OFAI) 10 | Contact: 11 | """ 12 | import unittest 13 | import numpy as np 14 | from scipy.spatial.distance import squareform, pdist 15 | from scipy.sparse.csr import csr_matrix 16 | from hub_toolbox.goodman_kruskal import goodman_kruskal_index,\ 17 | _naive_goodman_kruskal, sparse_goodman_kruskal_index 18 | from hub_toolbox.io import random_sparse_matrix 19 | from hub_toolbox.shared_neighbors import shared_nearest_neighbors 20 | 21 | class TestGoodmanKruskal(unittest.TestCase): 22 | 23 | def setUp(self): 24 | n = 50 25 | m = 5 26 | c = 3 27 | np.random.seed(823475) 28 | data = np.random.rand(n, m) 29 | self.distance = squareform(pdist(data, 'euclidean')) 30 | self.similarity = 1. 
- self.distance / self.distance.max() 31 | self.labels = np.random.randint(0, c, n) 32 | 33 | def tearDown(self): 34 | del self.distance, self.similarity, self.labels 35 | 36 | def test_naive_goodmankruskal_algorithm(self): 37 | """Using a small clustering with correct value calc by hand""" 38 | distance = np.array( 39 | squareform([0.7, 1.55, 0.5, 1.7, 0.9, 0.85, 1.2, 1.5, 0.6, 1.4])) 40 | label = np.array([0, 0, 1, 2, 1]) 41 | CORRECT_RESULT = 0.75 42 | result = _naive_goodman_kruskal(distance, label, 'distance') 43 | return self.assertEqual(result, CORRECT_RESULT) 44 | 45 | def test_efficient_goodmankruskal_equal_to_naive_goodmankruskal(self): 46 | """Test whether goodman_kruskal_index yields correct result""" 47 | gamma_efficient = goodman_kruskal_index(self.distance, self.labels) 48 | gamma_naive = _naive_goodman_kruskal(self.distance, self.labels) 49 | return self.assertEqual(gamma_efficient, gamma_naive) 50 | 51 | def test_goodmankruskal_distance_based_equal_to_similarity_based(self): 52 | """Test whether results are correct using similarities""" 53 | gamma_dist = goodman_kruskal_index(self.distance, self.labels, 'distance') 54 | gamma_sim = goodman_kruskal_index(self.similarity, self.labels, 'similarity') 55 | return self.assertEqual(gamma_dist, gamma_sim) 56 | 57 | def test_goodmankruskal_close_to_zero_for_random_data(self): 58 | gamma_dist = goodman_kruskal_index(self.distance, self.labels) 59 | return self.assertAlmostEqual(gamma_dist, 0., places=1) 60 | 61 | def test_sparse_goodmankruskal_equal_to_dense_goodmankruskal(self): 62 | similarity = random_sparse_matrix(size=1000) 63 | labels = np.random.randint(0, 5, 1000) 64 | gamma_sparse = sparse_goodman_kruskal_index(similarity, labels, verbose=2) 65 | gamma_dense = goodman_kruskal_index(similarity.toarray(), labels, 'similarity') 66 | return self.assertEqual(gamma_dense, gamma_sparse) 67 | 68 | def test_correct_handling_equal_distances_goodmankruskal(self): 69 | """SharedNN matrices contain lots of equal distances""" 70 | dist_snn = shared_nearest_neighbors(self.distance) 71 | gamma_efficient = goodman_kruskal_index(dist_snn, self.labels) 72 | gamma_naive = _naive_goodman_kruskal(dist_snn, self.labels) 73 | return self.assertEqual(gamma_efficient, gamma_naive) 74 | 75 | def test_correct_handling_equal_similarities_sparse_gk(self): 76 | sim_snn = 1. - shared_nearest_neighbors(self.distance) 77 | gamma_sparse = sparse_goodman_kruskal_index(csr_matrix(sim_snn), self.labels) 78 | gamma_efficient = goodman_kruskal_index(sim_snn, self.labels, 'similarity') 79 | return self.assertEqual(gamma_efficient, gamma_sparse) 80 | 81 | if __name__ == "__main__": 82 | unittest.main() 83 | -------------------------------------------------------------------------------- /tests/hubness_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | This file is part of the HUB TOOLBOX available at 5 | https://github.com/OFAI/hub-toolbox-python3/ 6 | The HUB TOOLBOX is licensed under the terms of the GNU GPLv3. 
7 | 8 | (c) 2016-2018, Roman Feldbauer 9 | Austrian Research Institute for Artificial Intelligence (OFAI) 10 | Contact: 11 | """ 12 | import unittest 13 | import numpy as np 14 | from scipy.spatial.distance import squareform 15 | from sklearn.datasets.samples_generator import make_classification 16 | from sklearn.model_selection import train_test_split 17 | from hub_toolbox.approximate import ApproximateHubnessReduction,\ 18 | VALID_HR, VALID_SAMPLE 19 | from hub_toolbox.distances import euclidean_distance 20 | from hub_toolbox.hubness import hubness, Hubness, hubness_from_vectors 21 | from hub_toolbox.io import random_sparse_matrix 22 | 23 | class TestHubness(unittest.TestCase): 24 | """Test hubness calculations""" 25 | 26 | def setUp(self): 27 | """Hubness truth: S_k=5, skewness calculated with bias""" 28 | self.dist = squareform([.2, .1, .8, .4, .3, .5, .7, 1., .6, .9]) 29 | self.hubness_truth = -0.2561204163 30 | 31 | def tearDown(self): 32 | del self.dist 33 | 34 | def test_hubness(self): 35 | """Test hubness against ground truth calc on spreadsheet""" 36 | Sk5, _, _ = hubness(self.dist, k=2, verbose=1) 37 | return self.assertAlmostEqual(Sk5, self.hubness_truth, places=10) 38 | 39 | def test_hubness_return_values_are_self_consistent(self): 40 | """Test that the three returned values fit together""" 41 | np.random.seed(626) 42 | points = 200 43 | dim = 500 44 | vector = 99. * (np.random.rand(points, dim) - 0.5) 45 | dist = euclidean_distance(vector) 46 | k = 10 47 | Sk10, Dk10, Nk10 = hubness(dist, k=k) 48 | # Dk is just checked for correct shape 49 | correct_dim_Dk10 = Dk10.shape == (points, k) 50 | # Count k-occurence (different method than in module) 51 | Dk10 = Dk10.ravel() 52 | Nk10_true = np.zeros(points, dtype=int) 53 | for i in range(points): 54 | Nk10_true[i] = (Dk10 == i).sum() 55 | correct_Nk10 = np.all(Nk10 == Nk10_true) 56 | # Calculate skewness (different method than in module) 57 | x0 = Nk10 - Nk10.mean() 58 | s2 = (x0**2).mean() 59 | m3 = (x0**3).mean() 60 | s = m3 / (s2**1.5) 61 | Sk10_true = s 62 | correct_Sk10 = Sk10 == Sk10_true 63 | return self.assertTrue(correct_dim_Dk10 64 | and correct_Nk10 65 | and correct_Sk10) 66 | 67 | def test_parallel_hubness_equal_serial_hubness_distance_based(self): 68 | S_k_p, D_k_p, N_k_p = hubness( 69 | self.dist, k=5, metric='distance', verbose=True, n_jobs=-1) 70 | S_k_s, D_k_s, N_k_s = hubness( 71 | self.dist, k=5, metric='distance', verbose=False, n_jobs=1) 72 | np.testing.assert_array_almost_equal(S_k_p, S_k_s, decimal=7) 73 | np.testing.assert_array_almost_equal(D_k_p, D_k_s, decimal=7) 74 | np.testing.assert_array_almost_equal(N_k_p, N_k_s, decimal=7) 75 | 76 | def test_parallel_hubness_equal_serial_hubness_similarity_based(self): 77 | similarity = random_sparse_matrix(size=1000) 78 | S_k_p, D_k_p, N_k_p = hubness( 79 | similarity, k=5, metric='similarity', verbose=False, n_jobs=-1) 80 | S_k_s, D_k_s, N_k_s = hubness( 81 | similarity, k=5, metric='similarity', verbose=False, n_jobs=1) 82 | np.testing.assert_array_almost_equal(S_k_p, S_k_s, decimal=7) 83 | np.testing.assert_array_almost_equal(D_k_p, D_k_s, decimal=7) 84 | np.testing.assert_array_almost_equal(N_k_p, N_k_s, decimal=7) 85 | 86 | class TestHubnessClass(unittest.TestCase): 87 | """Test hubness calculations""" 88 | 89 | def setUp(self): 90 | """Hubness truth: S_k=5, skewness calculated with bias""" 91 | np.random.seed(123) 92 | self.X = np.random.rand(100, 50) 93 | self.D = euclidean_distance(self.X) 94 | self.verbose = 1 95 | 96 | def tearDown(self): 97 | del self.X 
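    # Editor's note -- a minimal sketch of the Hubness estimator API that the
    # tests below exercise (all attribute names taken from those tests):
    #
    #     hub = Hubness(k=10, return_k_neighbors=True, return_k_occurrence=True)
    #     hub.fit_transform(X)      # X: (n_samples, n_features)
    #     hub.k_skewness_           # skewness of the k-occurrence distribution
    #     hub.k_neighbors_          # each point's k nearest neighbors
    #     hub.k_occurrence_         # how often each point occurs in k-NN lists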
98 | 99 | def test_hubness_against_distance(self): 100 | """Test hubness class against distance-based methods.""" 101 | Sk_dist, Dk_dist, Nk_dist = hubness(self.D, k=10) 102 | hub = Hubness(k=10, 103 | return_k_neighbors=True, 104 | return_k_occurrence=True, 105 | verbose=self.verbose) 106 | hub.fit_transform(self.X) 107 | Sk_class = hub.k_skewness_ 108 | Dk_class = hub.k_neighbors_ 109 | Nk_class = hub.k_occurrence_ 110 | np.testing.assert_almost_equal(Sk_class, Sk_dist, decimal=10) 111 | np.testing.assert_array_equal(Dk_class, Dk_dist) 112 | np.testing.assert_array_equal(Nk_class, Nk_dist) 113 | hub = Hubness(k=10, 114 | return_k_neighbors=True, 115 | return_k_occurrence=True, 116 | metric='precomputed', 117 | verbose=self.verbose) 118 | hub.fit_transform(self.D, has_self_distances=True) 119 | Sk_class = hub.k_skewness_ 120 | Dk_class = hub.k_neighbors_ 121 | Nk_class = hub.k_occurrence_ 122 | np.testing.assert_almost_equal(Sk_class, Sk_dist, decimal=10) 123 | np.testing.assert_array_equal(Dk_class, Dk_dist) 124 | np.testing.assert_array_equal(Nk_class, Nk_dist) 125 | 126 | def test_hubness_against_vectors(self): 127 | """ Test hubness class against vector-based method. """ 128 | Sk_vect, Dk_vect, Nk_vect = hubness_from_vectors(self.X, k=10) 129 | hub = Hubness(k=10, 130 | return_k_neighbors=True, 131 | return_k_occurrence=True, 132 | verbose=self.verbose) 133 | hub.fit_transform(self.X) 134 | Sk_class = hub.k_skewness_ 135 | Dk_class = hub.k_neighbors_ 136 | Nk_class = hub.k_occurrence_ 137 | np.testing.assert_almost_equal(Sk_class, Sk_vect, decimal=10) 138 | np.testing.assert_array_equal(Dk_class, Dk_vect) 139 | np.testing.assert_array_equal(Nk_class, Nk_vect) 140 | np.testing.assert_array_less( 141 | hub.k_skewness_truncnorm_, hub.k_skewness_) 142 | 143 | def test_hubness_multiprocessing(self): 144 | """ Test multiprocessing capabilities of Hubness. 
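        Serial (n_jobs=1) and parallel (n_jobs=-1) runs must yield identical
        skewness, k-neighbor, and k-occurrence results.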
""" 145 | hub = Hubness(k=10, 146 | return_k_neighbors=True, 147 | return_k_occurrence=True, 148 | n_jobs=1, 149 | verbose=self.verbose) 150 | hub.fit_transform(self.X) 151 | Sk_vect = hub.k_skewness_ 152 | Dk_vect = hub.k_neighbors_ 153 | Nk_vect = hub.k_occurrence_ 154 | hub = Hubness(k=10, 155 | return_k_neighbors=True, 156 | return_k_occurrence=True, 157 | n_jobs=-1, 158 | verbose=self.verbose) 159 | hub.fit_transform(self.X) 160 | Sk_mp = hub.k_skewness_ 161 | Dk_mp = hub.k_neighbors_ 162 | Nk_mp = hub.k_occurrence_ 163 | np.testing.assert_almost_equal(Sk_mp, Sk_vect, decimal=10) 164 | np.testing.assert_array_equal(Dk_mp, Dk_vect) 165 | np.testing.assert_array_equal(Nk_mp, Nk_vect) 166 | np.testing.assert_array_less( 167 | hub.k_skewness_truncnorm_, hub.k_skewness_) 168 | 169 | def test_hubness_independent_on_data_set_size(self): 170 | thousands = 3 171 | n_objects = thousands * 1_000 172 | X = np.random.rand(n_objects, 128) 173 | N_SAMPLES = np.arange(1, thousands + 1) * 1_000 174 | Sk_trunc = np.empty(N_SAMPLES.size) 175 | for i, n_samples in enumerate(N_SAMPLES): 176 | ind = np.random.permutation(n_objects)[:n_samples] 177 | X_sample = X[ind, :] 178 | hub = Hubness() 179 | hub.fit_transform(X_sample) 180 | Sk_trunc[i] = hub.k_skewness_truncnorm_ 181 | if i > 0: 182 | np.testing.assert_allclose( 183 | Sk_trunc[i], Sk_trunc[i-1], rtol=1e-1, 184 | err_msg=(f'Hubness measure is too dependent on data set ' 185 | f'size with S({N_SAMPLES[i]}) = x ' 186 | f'and S({N_SAMPLES[i-1]}) = y.')) 187 | np.testing.assert_allclose(Sk_trunc[-1], Sk_trunc[0], rtol=1e-1) 188 | 189 | def test_hubness_from_sparse_precomputed_matrix(self): 190 | # Generate high-dimensional data 191 | X, y = make_classification(n_samples=1000, 192 | n_features=100, 193 | n_informative=100, 194 | n_redundant=0, 195 | n_repeated=0, 196 | random_state=123) 197 | X = X.astype(np.float32) 198 | y = y.astype(np.int32) 199 | for hr_algorithm in VALID_HR: #['dsl']:# 200 | for sampling_algorithm in VALID_SAMPLE: #['hnsw', 'lsh']:# 201 | for n_samples in [50, 100]: 202 | print(f'Test {hr_algorithm}, {sampling_algorithm}, ' 203 | f'with {n_samples} samples.') 204 | self.hubness_from_sparse_precomputed_matrix( 205 | X, y, hr_algorithm, sampling_algorithm, n_samples) 206 | 207 | def hubness_from_sparse_precomputed_matrix(self, X, y, hr, 208 | sample, n_samples): 209 | # Make train-test split 210 | X_train, X_test, y_train, _ = train_test_split(X, y) 211 | # Obtain a sparse distance matrix 212 | ahr = ApproximateHubnessReduction( 213 | hr_algorithm=hr, sampling_algorithm=sample, n_samples=n_samples) 214 | ahr.fit(X_train, y_train) 215 | _ = ahr.transform(X_test) 216 | D_test_csr = ahr.sec_dist_sparse_ 217 | # Hubness in sparse matrix 218 | hub = Hubness(k=10, 219 | metric='precomputed', 220 | return_k_neighbors=True, 221 | shuffle_equal=False, 222 | verbose=self.verbose) 223 | hub.fit_transform(D_test_csr) 224 | Sk_trunc_sparse = hub.k_skewness_truncnorm_ 225 | Sk_sparse = hub.k_skewness_ 226 | k_neigh_sparse = hub.k_neighbors_ 227 | # Hubness in dense matrix 228 | try: 229 | D_test_dense = D_test_csr.toarray() 230 | except AttributeError: 231 | return # Without sampling, the distance matrix is not sparse 232 | D_test_dense[D_test_dense == 0] = np.finfo(np.float32).max 233 | hub_dense = Hubness(k=10, 234 | metric='precomputed', 235 | return_k_neighbors=True, 236 | shuffle_equal=False) 237 | hub_dense.fit_transform(D_test_dense) 238 | Sk_trunc_dense = hub_dense.k_skewness_truncnorm_ 239 | Sk_dense = hub_dense.k_skewness_ 240 | 
k_neigh_dense = hub_dense.k_neighbors_ 241 | if hr in ['MP', 'MPG']: 242 | decimal = 1 243 | else: 244 | decimal = 5 245 | try: 246 | np.testing.assert_array_equal( 247 | k_neigh_dense.ravel(), k_neigh_sparse) 248 | except AssertionError: 249 | s1 = k_neigh_dense.sum() 250 | s2 = k_neigh_sparse.sum() 251 | sm = max(s1, s2) 252 | print(f'k_neighbors not identical, but close: ' 253 | f'{s1}, {s2}, {s1/s2}.') 254 | np.testing.assert_allclose(s2/sm, s1/sm, rtol=1e-2) 255 | np.testing.assert_array_almost_equal( 256 | Sk_sparse, Sk_dense, decimal=decimal) 257 | np.testing.assert_array_almost_equal( 258 | Sk_trunc_sparse, Sk_trunc_dense, decimal=decimal) 259 | 260 | if __name__ == "__main__": 261 | unittest.main() 262 | -------------------------------------------------------------------------------- /tests/hubnessanalysis_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | This file is part of the HUB TOOLBOX available at 5 | https://github.com/OFAI/hub-toolbox-python3/ 6 | The HUB TOOLBOX is licensed under the terms of the GNU GPLv3. 7 | 8 | (c) 2016-2018, Roman Feldbauer 9 | Austrian Research Institute for Artificial Intelligence (OFAI) 10 | Contact: 11 | """ 12 | import unittest 13 | import numpy as np 14 | from hub_toolbox import hubness_analysis 15 | from hub_toolbox.distances import euclidean_distance 16 | 17 | class TestHubnessAnalysis(unittest.TestCase): 18 | """Test the hubness_analysis class (check for results, 19 | but not for *correct* results.) 20 | """ 21 | 22 | def setUp(self): 23 | points = 100 24 | dim = 10 25 | self.vector = 99. * (np.random.rand(points, dim) - 0.5) 26 | self.label = np.random.randint(0, 5, points) 27 | self.dist = euclidean_distance(self.vector) 28 | self.SEC_DIST = set(['mp', 'mp_gaussi', 'mp_gammai', 29 | 'ls', 'nicdm', 'snn', 'cent', 'wcent', 'lcent', 30 | 'dsg', 'dsl', 'orig']) 31 | 32 | def tearDown(self): 33 | del self.dist, self.label, self.vector, self.SEC_DIST 34 | 35 | def test_all_sec_dist_are_covered_in_unittests(self): 36 | n_self_sec_dist = len(self.SEC_DIST) 37 | hub_ana_sec_dist = set(hubness_analysis.SEC_DIST.keys()) 38 | n_intersection = len(hub_ana_sec_dist & self.SEC_DIST) 39 | return self.assertEqual(n_self_sec_dist, n_intersection) 40 | 41 | def test_all_sec_dist_have_header(self): 42 | ha_sec_dist = set(hubness_analysis.SEC_DIST.keys()) 43 | header_sec_dist = set(hubness_analysis.HubnessAnalysis()._header.keys()) 44 | n_sec_dist = len(ha_sec_dist) 45 | n_intersection = len(ha_sec_dist & header_sec_dist) 46 | return self.assertEqual(n_sec_dist, n_intersection) 47 | 48 | def test_all_sec_dist_types(self): 49 | got_all_results = True 50 | for dist_type in self.SEC_DIST: 51 | got_all_results &= self._perform(dist_type) 52 | return self.assertTrue(got_all_results) 53 | 54 | def _perform(self, dist_type): 55 | """Test whether the given secondary distance type is supported.""" 56 | ana = hubness_analysis.HubnessAnalysis( 57 | self.dist, self.label, self.vector, 'distance') 58 | ana = ana.analyze_hubness( 59 | experiments=dist_type, print_results=True, verbose=1) 60 | exp = ana.experiments[0] 61 | got_all_results = \ 62 | (exp.secondary_distance is not None and 63 | len(exp.hubness) > 0 and 64 | len(exp.anti_hubs) > 0 and 65 | len(exp.max_hub_k_occurence) > 0 and 66 | len(exp.knn_accuracy) > 0 and 67 | exp.gk_index is not None and 68 | ana.intrinsic_dim is not None) 69 | return got_all_results 70 | 71 | def 
test_hubness_analysis_only_with_distances(self): 72 | """ Check correct handling when no labels, vectors are given.""" 73 | ana = hubness_analysis.HubnessAnalysis(self.dist) 74 | ana = ana.analyze_hubness("orig") 75 | exp = ana.experiments[0] 76 | got_all_results = \ 77 | (exp.secondary_distance is not None and 78 | len(exp.hubness) > 0 and 79 | len(exp.anti_hubs) > 0 and 80 | len(exp.max_hub_k_occurence) > 0 and 81 | exp.gk_index is not None and 82 | ana.intrinsic_dim is not None) 83 | return got_all_results 84 | 85 | if __name__ == "__main__": 86 | unittest.main() 87 | -------------------------------------------------------------------------------- /tests/intrinsicdim_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | This file is part of the HUB TOOLBOX available at 5 | https://github.com/OFAI/hub-toolbox-python3/ 6 | The HUB TOOLBOX is licensed under the terms of the GNU GPLv3. 7 | 8 | (c) 2016-2018, Roman Feldbauer 9 | Austrian Research Institute for Artificial Intelligence (OFAI) 10 | Contact: 11 | """ 12 | import unittest 13 | import numpy as np 14 | from hub_toolbox.io import load_dexter 15 | from hub_toolbox.intrinsic_dimension import intrinsic_dimension 16 | 17 | class TestIntrinsicDim(unittest.TestCase): 18 | 19 | def setUp(self): 20 | self.vector = np.random.rand(50, 2) 21 | 22 | def tearDown(self): 23 | del self.vector 24 | 25 | def test_intrinsic_dim_mle_levina(self): 26 | """Test against value calc. by matlab reference implementation.""" 27 | _, _, vector = load_dexter() 28 | ID_MLE_REF = 74.472 29 | id_mle = intrinsic_dimension(vector, k1=6, k2=12, 30 | estimator='levina', metric='vector', trafo=None) 31 | return np.testing.assert_almost_equal(id_mle, ID_MLE_REF, decimal=3) 32 | 33 | def test_intrinsic_dim_mle_levina_low_memory(self): 34 | """ Same as above, but invoking the speed-memory trade-off. """ 35 | _, _, vector = load_dexter() 36 | ID_MLE_REF = 74.472 37 | id_mle = intrinsic_dimension(vector, 6, 12, 'levina', 38 | 'vector', None, mem_threshold=0) 39 | return np.testing.assert_almost_equal(id_mle, ID_MLE_REF, decimal=3) 40 | 41 | def test_incorrect_est_params(self): 42 | """ Test handling of incorrect estimator. 
""" 43 | with self.assertRaises(ValueError): 44 | intrinsic_dimension(self.vector, 45 | estimator='the_single_truly_best_id_estimator') 46 | 47 | def test_incorrect_k1_params(self): 48 | """ Test handling of incorrect neighborhood parameters.""" 49 | with self.assertRaises(ValueError): 50 | intrinsic_dimension(self.vector, k1=0) 51 | 52 | def test_incorrect_k12_params(self): 53 | """ Test handling of incorrect neighborhood parameters.""" 54 | with self.assertRaises(ValueError): 55 | intrinsic_dimension(self.vector, k1=6, k2=4) 56 | 57 | def test_incorrect_k2_params(self): 58 | """ Test handling of incorrect neighborhood parameters.""" 59 | n = self.vector.shape[0] 60 | with self.assertRaises(ValueError): 61 | intrinsic_dimension(self.vector, k2=n) 62 | 63 | def test_incorrect_trafo_params(self): 64 | """ Test handling of incorrect transformation parameters.""" 65 | with self.assertRaises(ValueError): 66 | intrinsic_dimension(self.vector, trafo=0) 67 | 68 | def test_incorrect_metric_dist(self): 69 | """ Test handling of unsupported metric parameters.""" 70 | with self.assertRaises(NotImplementedError): 71 | intrinsic_dimension(self.vector, metric='distance') 72 | 73 | def test_incorrect_metric_sim(self): 74 | """ Test handling of unsupported metric parameters.""" 75 | with self.assertRaises(NotImplementedError): 76 | intrinsic_dimension(self.vector, metric='similarity') 77 | 78 | def test_incorrect_metric_other(self): 79 | """ Test handling of unsupported metric parameters.""" 80 | with self.assertRaises(ValueError): 81 | intrinsic_dimension(self.vector, metric=None) 82 | 83 | if __name__ == "__main__": 84 | unittest.main() 85 | -------------------------------------------------------------------------------- /tests/io_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | This file is part of the HUB TOOLBOX available at 5 | https://github.com/OFAI/hub-toolbox-python3/ 6 | The HUB TOOLBOX is licensed under the terms of the GNU GPLv3. 7 | 8 | (c) 2016-2018, Roman Feldbauer 9 | Austrian Research Institute for Artificial Intelligence (OFAI) 10 | Contact: 11 | """ 12 | import unittest 13 | import tempfile 14 | import numpy as np 15 | from scipy.sparse.csr import csr_matrix 16 | from hub_toolbox.io import random_sparse_matrix, load_dexter 17 | from hub_toolbox import io 18 | 19 | class TestIO(unittest.TestCase): 20 | 21 | def setUp(self): 22 | np.random.seed(626) 23 | self.matrix_n = 500 24 | self.density = 0.02 25 | self.similarity = random_sparse_matrix( 26 | size=self.matrix_n, density=self.density) 27 | 28 | def tearDown(self): 29 | del self.matrix_n, self.density, self.similarity 30 | 31 | def test_save_and_load_csr_matrix(self): 32 | tmp = tempfile.mkstemp(suffix='.npz')[1] 33 | io_sim = io.load_csr_matrix(io.save_csr_matrix(tmp, self.similarity)) 34 | # If both are identical, the difference must be all-zeros 35 | return self.assertEqual((self.similarity - io_sim).nnz, 0.) 
36 | 37 | def test_random_sparse_similarity_matrix_quadratic_form(self): 38 | return self.assertEqual( 39 | self.similarity.shape[0], self.similarity.shape[1]) 40 | 41 | def test_random_sparse_similarity_matrix_correct_size(self): 42 | return self.assertEqual(self.similarity.shape[0], self.matrix_n) 43 | 44 | def test_random_sparse_similarity_matrix_correct_type(self): 45 | return self.assertIsInstance(self.similarity, csr_matrix) 46 | 47 | def test_random_sparse_similarity_matrix_symmetric(self): 48 | non_symmetric_entry = \ 49 | (self.similarity - self.similarity.T != 0.).nnz > 0 50 | return self.assertFalse(non_symmetric_entry) 51 | 52 | def test_random_sparse_similarity_matrix_min_zero(self): 53 | return self.assertGreaterEqual(self.similarity.min(), 0.) 54 | 55 | def test_random_sparse_similarity_matrix_max_one(self): 56 | return self.assertLessEqual(self.similarity.max(), 1.) 57 | 58 | def test_random_sparse_similarity_matrix_self_similarity_one(self): 59 | all_diag_ones = np.all(self.similarity.diagonal() == 1) 60 | return self.assertTrue(all_diag_ones) 61 | 62 | def test_random_sparse_similarity_matrix_density(self): 63 | return self.assertAlmostEqual( 64 | self.similarity.nnz / self.matrix_n**2, self.density*2, places=2) # symmetrization approx. doubles nnz, hence density*2 65 | 66 | def test_load_dexter(self): 67 | """Loading dexter, checking shape of distances, labels, vectors""" 68 | self.dist, self.lab, self.vect = load_dexter() 69 | symm_dist_shape = self.dist.shape[0] == self.dist.shape[1] 70 | corr_dist_shape = self.dist.shape[0] == self.vect.shape[0] 71 | corr_label_shape = self.lab.shape[0] == self.vect.shape[0] 72 | return self.assertTrue( 73 | symm_dist_shape and corr_dist_shape and corr_label_shape) 74 | 75 | def test_check_shape(self): 76 | with self.assertRaises(TypeError): 77 | d = np.empty((2, 3)) 78 | io.check_distance_matrix_shape(d) 79 | 80 | def test_check_dist_vs_classes(self): 81 | with self.assertRaises(TypeError): 82 | D = np.empty((5, 5)) 83 | classes = np.empty(4) 84 | io.check_distance_matrix_shape_fits_labels(D, classes) 85 | 86 | def test_check_dist_vs_vectors(self): 87 | with self.assertRaises(TypeError): 88 | D = np.zeros((5, 5)) 89 | vectors = np.zeros((4, 5)) 90 | io.check_distance_matrix_shape_fits_vectors(D, vectors) 91 | 92 | def test_check_valid_metric(self): 93 | with self.assertRaises(ValueError): 94 | metric = 'dissimilarity' 95 | io.check_valid_metric_parameter(metric) 96 | 97 | if __name__ == "__main__": 98 | unittest.main() 99 | -------------------------------------------------------------------------------- /tests/knn_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | This file is part of the HUB TOOLBOX available at 5 | https://github.com/OFAI/hub-toolbox-python3/ 6 | The HUB TOOLBOX is licensed under the terms of the GNU GPLv3. 
7 | 8 | (c) 2016-2018, Roman Feldbauer 9 | Austrian Research Institute for Artificial Intelligence (OFAI) 10 | Contact: 11 | """ 12 | import unittest 13 | import numpy as np 14 | from scipy.sparse.csr import csr_matrix 15 | try: # for scikit-learn >= 0.18 16 | from sklearn.model_selection import LeaveOneOut, cross_val_predict 17 | except ImportError: # lower scikit-learn versions 18 | from sklearn.cross_validation import LeaveOneOut, cross_val_predict 19 | from sklearn.neighbors import KNeighborsClassifier 20 | from sklearn.metrics import accuracy_score, f1_score as f1_score_sklearn 21 | from sklearn.preprocessing import LabelEncoder, LabelBinarizer, OneHotEncoder 22 | from hub_toolbox.distances import sample_distance 23 | from hub_toolbox.io import load_dexter, random_sparse_matrix 24 | from hub_toolbox.knn_classification import \ 25 | score, predict, f1_score, r_precision, f1_macro, f1_micro, f1_weighted 26 | 27 | 28 | class TestKnnClassification(unittest.TestCase): 29 | 30 | def setUp(self): 31 | self.distance, self.label, self.vector = load_dexter() 32 | self.n = self.distance.shape[0] 33 | 34 | def tearDown(self): 35 | del self.distance, self.label, self.vector 36 | 37 | def test_r_precision_does_not_error(self): 38 | """ Does not test correctness of result! """ 39 | sim = csr_matrix(1 - self.distance) 40 | y = self.label 41 | r = r_precision(sim, y, metric='similarity', return_y_pred=1, 42 | verbose=1, n_jobs=2) 43 | r_precision_weighted = r['weighted'] 44 | r_precision_macro = r['macro'] 45 | y_pred = np.array(r['y_pred']) 46 | acc = (y == y_pred.ravel()).sum() / self.label.size 47 | return self.assertTrue( 48 | r_precision_weighted >= 0. and r_precision_macro >= 0. 49 | and acc > 0.80) 50 | 51 | def test_r_precision(self): 52 | y = [ 0, 1, 1, 0, 1 , 2] 53 | sim = [[1.0, 0.6, 0.0, 0.0, 0.0, 0], # 0 / 1 .. 1 nnz 54 | [0.6, 1.0, 0.0, 0.0, 0.7, 0], # 1 / 2 .. 2 nnz 55 | [0.0, 0.0, 1.0, 0.0, 0.0, 0], # 0 / 2 .. 0 nnz 56 | [0.0, 0.0, 0.0, 1.0, 0.0, 0], # 0 / 1 .. 0 nnz 57 | [0.0, 0.7, 0.0, 0.0, 1.0, 0], # 1 / 2 .. 1 nnz 58 | [0.0, 0.0, 0.0, 0.0, 0.0, 1]] # 0 / 0 .. 1 nnz 59 | sim = csr_matrix(np.array(sim)) 60 | y = np.array(y) 61 | r = r_precision(sim, y, metric='similarity', return_y_pred=2, 62 | verbose=1, n_jobs=2) 63 | rpw = r['weighted'] 64 | rpm = r['macro'] 65 | r_peritem = r['per_item'] 66 | relevant_items = r['relevant_items'] 67 | y_return = r['y_true'] 68 | rppiw = np.average(r_peritem, weights=relevant_items[y_return]) 69 | return self.assertListEqual([rpw, rpm, rppiw], [0.25, 1/6, rpw]) 70 | 71 | def test_knn_sparse_does_not_error(self): 72 | """ Does not test correctness of result! 
""" 73 | sim = random_sparse_matrix(100, 0.1) 74 | y = np.random.randint(0, 2, 100) 75 | acc, _, _ = score(sim, y, k=[1,5,10], metric='similarity') 76 | return self.assertTrue(np.alltrue(acc >= 0.)) 77 | 78 | def test_knn_sparse_equal_dense(self): 79 | sim_dense = 1 - self.distance 80 | sim_sparse = csr_matrix(sim_dense) 81 | acc_dense, _, _ = score(sim_dense, self.label, metric='similarity') 82 | acc_sparse, _, _ = score(sim_sparse, self.label, metric='similarity') 83 | return self.assertEqual(acc_dense, acc_sparse) 84 | 85 | def test_knn_predict_equal_sklearn_loocv_predict(self): 86 | y = LabelEncoder().fit_transform(self.label) 87 | y_pred = predict(self.distance, y, k=5, 88 | metric='distance', return_cmat=False)[0].ravel() 89 | knn = KNeighborsClassifier( 90 | n_neighbors=5, algorithm='brute', metric='precomputed') 91 | n = self.distance.shape[0] # for LOO-CV 92 | try: # sklearn < 0.18 93 | loo_cv = LeaveOneOut(n) 94 | except TypeError: 95 | loo_cv = LeaveOneOut() 96 | y_pred_sklearn = cross_val_predict( 97 | knn, self.distance, y, cv=loo_cv) 98 | return self.assertTrue(np.alltrue(y_pred == y_pred_sklearn)) 99 | 100 | def test_f1_score(self): 101 | y = LabelBinarizer().fit_transform(self.label).ravel() 102 | y_pred, cmat = predict(self.distance, y, k=5, metric='distance') 103 | y_pred = y_pred.ravel() 104 | knn = KNeighborsClassifier( 105 | n_neighbors=5, algorithm='brute', metric='precomputed') 106 | n = self.distance.shape[0] # for LOO-CV 107 | try: # sklearn < 0.18 108 | loo_cv = LeaveOneOut(n) 109 | except TypeError: 110 | loo_cv = LeaveOneOut() 111 | y_pred_sklearn = cross_val_predict( 112 | knn, self.distance, y, cv=loo_cv) 113 | f1_binary_hub = f1_score(cmat[0, 0, :, :]) 114 | f1_binary_sklearn = f1_score_sklearn( 115 | y, y_pred_sklearn, average='binary') 116 | return self.assertEqual(f1_binary_hub, f1_binary_sklearn) 117 | 118 | def test_f1_micro_macro_weighted(self): 119 | y = np.random.randint(0, 5, self.label.size).reshape(-1, 1) 120 | y = OneHotEncoder().fit_transform(y).toarray() 121 | y_pred, cmat = predict(self.distance, y, k=5, metric='distance') 122 | y_pred = y_pred[0] 123 | knn = KNeighborsClassifier( 124 | n_neighbors=5, algorithm='brute', metric='precomputed') 125 | n = self.distance.shape[0] # for LOO-CV 126 | try: # sklearn < 0.18 127 | loo_cv = LeaveOneOut(n) 128 | except TypeError: 129 | loo_cv = LeaveOneOut() 130 | y_pred_sklearn = cross_val_predict( 131 | knn, self.distance, y, cv=loo_cv) 132 | f1_hub = [f1_macro(cmat[0]), f1_micro(cmat[0]), f1_weighted(cmat[0])] 133 | f1_sklearn = [f1_score_sklearn(y, y_pred_sklearn, average='macro'), 134 | f1_score_sklearn(y, y_pred_sklearn, average='micro'), 135 | f1_score_sklearn(y, y_pred_sklearn, average='weighted')] 136 | return self.assertListEqual(f1_hub, f1_sklearn) 137 | 138 | def test_knn_score_matches_correct_prediction_fraction(self): 139 | k = np.array([1, 5, 20]) 140 | acc, correct, _ = score(self.distance, self.label, k=k) 141 | acc_match = np.zeros_like(k, dtype=bool) 142 | for i, _ in enumerate(k): 143 | cur_acc = acc[i] 144 | cur_correct = correct[i] 145 | acc_match[i] = np.allclose(cur_acc, cur_correct.sum() / self.n) 146 | return self.assertTrue(np.all(acc_match)) 147 | 148 | def test_knn_score_matches_confusion_matrix(self): 149 | k = np.array([1, 5, 20]) 150 | acc, _, cmat = score(self.distance, self.label, k=k) 151 | acc_match = np.zeros_like(k, dtype=bool) 152 | for i, _ in enumerate(k): 153 | cur_acc = acc[i] 154 | cur_cmat = cmat[i] 155 | TP = cur_cmat[0, 0] 156 | FN = cur_cmat[0, 1] 157 | FP = 
cur_cmat[1, 0] 158 | TN = cur_cmat[1, 1] 159 | acc_from_cmat = (TP + TN) / (TP + FN + FP + TN) 160 | acc_match[i] = np.allclose(cur_acc, acc_from_cmat) 161 | return self.assertTrue(np.all(acc_match)) 162 | 163 | def test_knn_score_equal_sklearn_loocv_score(self): 164 | acc, correct, cmat = \ 165 | score(self.distance, self.label, k=5, metric='distance') 166 | # scoring only one k value, so take just the first elements: 167 | acc = acc[0, 0] 168 | correct = correct[0] 169 | cmat = cmat[0] 170 | knclassifier = KNeighborsClassifier(n_neighbors=5, algorithm='brute', 171 | metric='precomputed') 172 | n = self.distance.shape[0] # for LOO-CV 173 | try: # sklearn < 0.18 174 | loo_cv = LeaveOneOut(n) 175 | except TypeError: 176 | loo_cv = LeaveOneOut() 177 | predicted_sklearn = cross_val_predict( 178 | knclassifier, self.distance, self.label, cv=loo_cv) 179 | acc_sklearn = accuracy_score(self.label, predicted_sklearn) 180 | if not np.allclose(acc, acc_sklearn): 181 | return self.assertAlmostEqual(acc, acc_sklearn, places=7) 182 | else: 183 | correct_sklearn = predicted_sklearn == self.label 184 | equal_prediction = np.all(correct == correct_sklearn) 185 | msg = """Accuracies of hub toolbox k-NN and sklearn-kNN are almost 186 | equal, but the predictions per data point are not.""" 187 | return self.assertTrue(equal_prediction, msg) 188 | 189 | def test_sample_knn(self): 190 | """ Make sure that sample-kNN works correctly. """ 191 | # TODO create a stricter test 192 | X = np.array([[1., 2.], 193 | [2., 2.], 194 | [2., 3.], 195 | [3., .5], 196 | [4., 1.5]]) 197 | y = np.array([0, 1, 0, 1, 1]) 198 | s = 2 199 | rnd = 1234 200 | D, sample_idx = sample_distance(X, y, s, random_state=rnd) 201 | expected_sample_idx = np.array([4, 2]) 202 | expected_acc = 0.4 203 | if not np.setdiff1d(sample_idx, expected_sample_idx).size == \ 204 | np.setdiff1d(expected_sample_idx, sample_idx).size == 0: 205 | return self.fail("Test implementation broken: wrong sample.") 206 | acc, _, _ = score(D=D, target=y, k=2, metric='distance', 207 | sample_idx=sample_idx) 208 | return self.assertEqual(expected_acc, acc[0, 0]) 209 | 210 | 211 | if __name__ == "__main__": 212 | unittest.main() 213 | -------------------------------------------------------------------------------- /tests/localscaling_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | This file is part of the HUB TOOLBOX available at 5 | https://github.com/OFAI/hub-toolbox-python3/ 6 | The HUB TOOLBOX is licensed under the terms of the GNU GPLv3. 7 | 8 | (c) 2016-2018, Roman Feldbauer 9 | Austrian Research Institute for Artificial Intelligence (OFAI) 10 | Contact: 11 | """ 12 | import unittest 13 | import numpy as np 14 | from scipy.spatial.distance import squareform 15 | from hub_toolbox.distances import euclidean_distance 16 | from hub_toolbox.local_scaling import local_scaling, nicdm 17 | from hub_toolbox.hubness import hubness 18 | from hub_toolbox.knn_classification import score 19 | 20 | class TestLocalScaling(unittest.TestCase): 21 | """Unit tests for the LocalScaling class""" 22 | 23 | def setUpMod(self, mode='rnd'): 24 | np.random.seed(626) 25 | if mode == 'rnd': 26 | points = 200 # 200 27 | dim = 500 # 500 28 | self.vector = 99. 
* (np.random.rand(points, dim) - 0.5) 29 | self.label = np.random.randint(0, 5, points) 30 | self.dist = euclidean_distance(self.vector) 31 | elif mode == 'toy': 32 | # LS/NICDM ground truth calculated in spreadsheet for toy example 33 | self.dist = squareform([.2, .1, .8, .4, .3, .5, .7, 1., .6, .9]) 34 | self.ls_dist_truth = squareform( 35 | [0.486582881, 0.1535182751, 0.9816843611, 0.7364028619, 36 | 0.6321205588, 0.6471339185, 0.9342714714, 0.9844961464, 37 | 0.8646647168, 0.8150186001]) 38 | self.nicdm_dist_truth = squareform( 39 | [0.310029690448236, 0.173311865721368, 0.769089007390428, 40 | 0.438448192970227, 0.402740381783397, 0.37233361467179, 41 | 0.594335892341949, 0.832563272714335, 0.569560910033398, 42 | 0.473903322836619]) 43 | self.vector = None 44 | self.label = None 45 | 46 | def tearDown(self): 47 | del self.dist, self.label, self.vector 48 | 49 | def test_local_scaling(self): 50 | self.setUpMod('toy') 51 | dist_calc = local_scaling(self.dist, k=2) 52 | return np.testing.assert_array_almost_equal( 53 | dist_calc, self.ls_dist_truth, decimal=7) 54 | 55 | def test_ls_basic_requirements(self): 56 | """Test that matrix is symmetric, diag==0, and in range [0, 1]""" 57 | self.setUpMod('rnd') 58 | ls_dist = local_scaling(self.dist) 59 | symmetric = np.all(ls_dist == ls_dist.T) 60 | diag_zero = np.all(ls_dist.diagonal() == 0.) 61 | correct_range = ls_dist.min() >= 0. and ls_dist.max() <= 1. 62 | return self.assertTrue(symmetric and diag_zero and correct_range) 63 | 64 | def test_ls_dist_equals_sim(self): 65 | """Test for equal RANKS using dist. vs. sim. (LS_dist != 1-LS_sim). 66 | Using hubness and k-NN accuracy as proxy.""" 67 | self.setUpMod('rnd') 68 | ls_dist = local_scaling(self.dist, metric='distance') 69 | ls_sim = local_scaling(1 - self.dist, metric='similarity') 70 | h_dist, _, _ = hubness(ls_dist, metric='distance') 71 | h_sim, _, _ = hubness(ls_sim, metric='similarity') 72 | acc_dist, _, _ = score(ls_dist, self.label, metric='distance') 73 | acc_sim, _, _ = score(ls_sim, self.label, metric='similarity') 74 | dist_sim_equal_in_hubness_knn = np.allclose(h_dist, h_sim) and \ 75 | np.allclose(acc_dist, acc_sim) 76 | return self.assertTrue(dist_sim_equal_in_hubness_knn) 77 | 78 | def test_ls_parallel_equals_sequential(self): 79 | self.setUpMod('rnd') 80 | ls_dist_par = local_scaling(self.dist, n_jobs=4) 81 | ls_dist_seq = local_scaling(self.dist, n_jobs=1) 82 | return np.testing.assert_array_equal(ls_dist_seq, ls_dist_par) 83 | 84 | def test_nicdm(self): 85 | self.setUpMod('toy') 86 | dist_calc = nicdm(self.dist, k=2) 87 | return np.testing.assert_array_almost_equal( 88 | dist_calc, self.nicdm_dist_truth, decimal=7) 89 | 90 | def test_nicdm_basic_requirements(self): 91 | """Test that matrix is symmetric, diag==0, and in range [0, inf)""" 92 | self.setUpMod('rnd') 93 | nicdm_dist = nicdm(self.dist) 94 | symmetric = np.all(nicdm_dist == nicdm_dist.T) 95 | diag_zero = np.all(nicdm_dist.diagonal() == 0.) 96 | correct_range = nicdm_dist.min() >= 0. 
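        # Unlike local scaling, NICDM is not bounded above by 1 (range
        # [0, inf), per the docstring), so only non-negativity is checked.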
97 | return self.assertTrue(symmetric and diag_zero and correct_range) 98 | 99 | def test_nicdm_similarity_based(self): 100 | """There is no similarity-based NICDM""" 101 | self.setUpMod('toy') 102 | return self.assertRaises(NotImplementedError, nicdm, 1. - self.dist, k=2, metric='similarity') 103 | 104 | def test_nicdm_parallel_equals_sequential(self): 105 | self.setUpMod('rnd') 106 | ls_dist_par = nicdm(self.dist, n_jobs=4) 107 | ls_dist_seq = nicdm(self.dist, n_jobs=1) 108 | return np.testing.assert_array_equal(ls_dist_seq, ls_dist_par) 109 | 110 | if __name__ == "__main__": 111 | unittest.main() 112 | -------------------------------------------------------------------------------- /tests/logging_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | This file is part of the HUB TOOLBOX available at 5 | https://github.com/OFAI/hub-toolbox-python3/ 6 | The HUB TOOLBOX is licensed under the terms of the GNU GPLv3. 7 | 8 | (c) 2016-2018, Roman Feldbauer 9 | Austrian Research Institute for Artificial Intelligence (OFAI) 10 | Contact: 11 | """ 12 | import unittest 13 | from hub_toolbox.htlogging import Logging, ConsoleLogging, FileLogging 14 | 15 | class TestLogging(unittest.TestCase): 16 | """Minimally test Logging (should switch to std module logging anyway)""" 17 | 18 | def test_unable_to_instantiate_abstract_class_logging(self): 19 | with self.assertRaises(TypeError): 20 | Logging() 21 | 22 | def test_console_logging_has_all_methods(self): 23 | log = ConsoleLogging() 24 | has_all_attributes = hasattr(log, 'message') and \ 25 | hasattr(log, 'warning') and hasattr(log, 'error') 26 | return self.assertTrue(has_all_attributes) 27 | 28 | def test_file_logging_has_all_methods(self): 29 | log = FileLogging() 30 | has_all_attributes = hasattr(log, 'message') and \ 31 | hasattr(log, 'warning') and hasattr(log, 'error') 32 | return self.assertTrue(has_all_attributes) 33 | 34 | def test_message(self): 35 | log = ConsoleLogging() 36 | log.message("Message") 37 | return self 38 | 39 | def test_warning(self): 40 | log = ConsoleLogging() 41 | log.warning("Warning") 42 | return self 43 | 44 | def test_error(self): 45 | log = ConsoleLogging() 46 | log.error("Error") 47 | return self 48 | 49 | if __name__ == "__main__": 50 | unittest.main() 51 | -------------------------------------------------------------------------------- /tests/mutualproximity_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | This file is part of the HUB TOOLBOX available at 5 | https://github.com/OFAI/hub-toolbox-python3/ 6 | The HUB TOOLBOX is licensed under the terms of the GNU GPLv3. 7 | 8 | (c) 2016-2018, Roman Feldbauer 9 | Austrian Research Institute for Artificial Intelligence (OFAI) 10 | Contact: 11 | """ 12 | import unittest 13 | import numpy as np 14 | from hub_toolbox.distances import euclidean_distance 15 | from hub_toolbox.global_scaling import mutual_proximity_empiric,\ 16 | mutual_proximity_gaussi, mutual_proximity_gammai 17 | from scipy.sparse.csr import csr_matrix 18 | from scipy.spatial.distance import squareform 19 | 20 | class TestMutualProximity(unittest.TestCase): 21 | """Unit tests for the MutualProximity class (serial computing)""" 22 | 23 | def setUpMod(self, mode='rnd'): 24 | np.random.seed(626) 25 | if mode == 'rnd': 26 | points = 50 27 | dim = 500 28 | self.vector = 99. 
* (np.random.rand(points, dim) - 0.5) 29 | self.label = np.random.randint(0, 5, points) 30 | self.dist = euclidean_distance(self.vector) 31 | # scale to [0, 1), avoiding 1: otherwise sparseMP != denseMP (by design) 32 | self.dist /= (self.dist.max() + 1e-12) 33 | elif mode == 'toy': 34 | # MP empiric ground truth calculated by hand for this toy example 35 | self.dist = squareform([.2, .1, .8, .4, .3, .5, .7, 1., .6, .9]) 36 | 37 | # MP with div/(n-0) 38 | self.mp_dist_truth = squareform([.6, .4, 1., .8, .6, 39 | .8, 1., 1., .8, 1.]) 40 | """ 41 | # MP with div/(n-1) 42 | self.mp_dist_truth = squareform([.5, .25, 1., .75, .5, 43 | .75, 1., 1., .75, 1.]) 44 | 45 | # MP with div/(n-2) 46 | self.mp_dist_truth = squareform([1/3, 0., 1., 2/3, 1/3, 47 | 2/3, 1., 1., 2/3, 1.]) 48 | """ 49 | self.vector = None 50 | self.label = None 51 | 52 | def tearDown(self): 53 | del self.dist, self.label, self.vector 54 | 55 | def test_mp_empiric_sample(self): 56 | """Test MP Emp Sample equals MP Emp when sample == population""" 57 | self.setUpMod('toy') 58 | mp_dist = mutual_proximity_empiric(self.dist, 'distance') 59 | y = np.array([0, 1, 2, 3, 4]) 60 | mp_sample_dist = mutual_proximity_empiric(D=self.dist, 61 | sample_ind=y, 62 | metric='distance') 63 | return np.testing.assert_array_almost_equal( 64 | mp_dist, mp_sample_dist, decimal=7) 65 | 66 | """ 67 | def test_mp_gaussi_sample(self): 68 | '''Test MP Gaussi Sample.''' 69 | self.setUpMod('toy') 70 | mp_dist = mutual_proximity_gaussi(self.dist) 71 | y = np.array([0, 1, 2, 3, 4]) 72 | mp_sample_dist = mutual_proximity_gaussi(self.dist[:, y], idx=y) 73 | mp_sample_equal_pop = np.alltrue(mp_dist == mp_sample_dist) 74 | #======================================================================= 75 | # print(self.dist) 76 | # print(mp_dist) 77 | # print(mp_sample_dist) 78 | #======================================================================= 79 | print("SampleMP-Gaussi with all pts equals MP-Gaussi:", mp_sample_equal_pop) 80 | y2 = np.array([1, 2, 4]) 81 | mp_sample_dist2 = mutual_proximity_gaussi(self.dist[:, y2], idx=y2) 82 | print(self.dist[:, y2]) 83 | print(mp_dist[:, y2]) 84 | print(mp_sample_dist2) 85 | print(mp_sample_dist) 86 | #return self.assertTrue(mp_sample_equal_pop) 87 | return self.fail() 88 | """ 89 | 90 | def test_mp_empiric(self): 91 | """Test MP Empiric for toy example (ground truth calc by hand)""" 92 | self.setUpMod('toy') 93 | mp_dist_calc = mutual_proximity_empiric(self.dist, 'distance', verbose=1) 94 | return np.testing.assert_array_almost_equal( 95 | mp_dist_calc, self.mp_dist_truth, decimal=7) 96 | 97 | def test_mp_empiric_all_zero_self_distances(self): 98 | self.setUpMod('rnd') 99 | mp_dist_calc = mutual_proximity_empiric(self.dist) 100 | mp_self_distances_all_zero = np.all(mp_dist_calc.diagonal() == 0.) 101 | return self.assertTrue(mp_self_distances_all_zero) 102 | 103 | def test_mp_empiric_symmetric(self): 104 | self.setUpMod('rnd') 105 | mp_dist = mutual_proximity_empiric(self.dist) 106 | return np.testing.assert_array_almost_equal( 107 | mp_dist, mp_dist.T, decimal=14) 108 | 109 | def test_mp_empiric_dist_equal_sim(self): 110 | self.setUpMod('rnd') 111 | sim = 1. - self.dist 112 | mp_dist = mutual_proximity_empiric(self.dist, 'distance') 113 | mp_sim = mutual_proximity_empiric(sim, 'similarity') 114 | return np.testing.assert_array_almost_equal( 115 | mp_dist, 1. - mp_sim, decimal=7) 116 | 117 | def test_mp_empiric_sparse_equal_dense(self): 118 | self.setUpMod('rnd') 119 | sim_dense = 1. 
- self.dist 120 | sim_sparse = csr_matrix(sim_dense) 121 | mp_dense = mutual_proximity_empiric(sim_dense, 'similarity') 122 | mp_sparse = mutual_proximity_empiric( 123 | sim_sparse, 'similarity', verbose=1, n_jobs=4) 124 | return np.testing.assert_array_almost_equal( 125 | mp_dense, mp_sparse.toarray(), decimal=7) 126 | 127 | def test_mp_gaussi(self): 128 | """Test MP GaussI for toy example (ground truth calc by 'hand')""" 129 | self.setUpMod('toy') 130 | mp_gaussi = mutual_proximity_gaussi(self.dist, verbose=1) 131 | # Calculated with formula (3) in JMLR paper, aided by LibreOffice Calc 132 | mp_gaussi_hand = np.array( 133 | [[0.155334048, 0.3466121867, 0.2534339319, 0.971773078, 0.575452874], 134 | [0.3466121867, 0.0267023937, 0.4637020361, 0.6708772779, 0.9702788336], 135 | [0.2534339319, 0.4637020361, 0.1354428205, 0.9899969991, 0.7660250185], 136 | [0.971773078, 0.6708772779, 0.9899969991, 1.90126724466388e-05, 0.975462801], 137 | [0.575452874, 0.9702788336, 0.7660250185, 0.975462801, 0.0003114667]]) 138 | # Gaussians can go below distance 0; self dist anyway defined as 0. 139 | np.fill_diagonal(mp_gaussi_hand, 0.) 140 | return np.testing.assert_array_almost_equal( 141 | mp_gaussi, mp_gaussi_hand, decimal=7) 142 | 143 | def test_mp_gaussi_all_zero_self_distances(self): 144 | self.setUpMod('rnd') 145 | mp_dist = mutual_proximity_gaussi(self.dist) 146 | mp_self_dist_all_zero = np.all(mp_dist.diagonal() == 0.) 147 | return self.assertTrue(mp_self_dist_all_zero) 148 | 149 | def test_mp_gaussi_symmetric(self): 150 | self.setUpMod('rnd') 151 | mp_dist = mutual_proximity_gaussi(self.dist) 152 | return np.testing.assert_array_almost_equal( 153 | mp_dist, mp_dist.T, decimal=7) 154 | 155 | def test_mp_gaussi_dist_equal_sim(self): 156 | self.setUpMod('rnd') 157 | sim = 1. - self.dist 158 | mp_dist = mutual_proximity_gaussi(self.dist, 'distance') 159 | mp_sim = mutual_proximity_gaussi(sim, 'similarity') 160 | return np.testing.assert_array_almost_equal( 161 | mp_dist, 1. - mp_sim, decimal=7) 162 | 163 | def test_mp_gaussi_sparse_equal_dense(self): 164 | self.setUpMod('rnd') 165 | sim_dense = 1. - self.dist 166 | sim_sparse = csr_matrix(sim_dense) 167 | mp_dense = mutual_proximity_gaussi(sim_dense, 'similarity') 168 | mp_sparse = mutual_proximity_gaussi(sim_sparse, 'similarity') 169 | return np.testing.assert_array_almost_equal( 170 | mp_dense, mp_sparse.toarray(), decimal=7) 171 | 172 | def test_mp_gammai(self): 173 | """Test MP GammaI for toy example (ground truth calc by 'hand')""" 174 | self.setUpMod('toy') 175 | mp_gammai = mutual_proximity_gammai(self.dist, verbose=1) 176 | # Calculated with formula (3) in JMLR paper, aided by LibreOffice Calc 177 | mp_gammai_hand = np.array( 178 | [[0., 0.4334769987, 0.230927083, 0.9558409888, 0.6744697939], 179 | [0.4334769987, 0., 0.5761291218, 0.7088478962, 0.9585297208], 180 | [0.230927083, 0.5761291218, 0., 0.9817785746, 0.8286910286], 181 | [0.9558409888, 0.7088478962, 0.9817785746, 0., 0.9646050169], 182 | [0.6744697939, 0.9585297208, 0.8286910286, 0.9646050169, 0.]]) 183 | return np.testing.assert_array_almost_equal( 184 | mp_gammai, mp_gammai_hand, decimal=7) 185 | 186 | def test_mp_gammai_all_zero_self_distances(self): 187 | self.setUpMod('rnd') 188 | mp_dist = mutual_proximity_gammai(self.dist) 189 | mp_self_dist_all_zero = np.all(mp_dist.diagonal() == 0.) 
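        # As with the MP empiric and GaussI variants above, self-distances
        # are defined to be 0, hence the all-zero diagonal.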
190 | return self.assertTrue(mp_self_dist_all_zero) 191 | 192 | def test_mp_gammai_symmetric(self): 193 | self.setUpMod('rnd') 194 | mp_dist = mutual_proximity_gammai(self.dist) 195 | return np.testing.assert_array_almost_equal( 196 | mp_dist, mp_dist.T, decimal=7) 197 | 198 | def test_mp_gammai_dist_equal_sim(self): 199 | self.setUpMod('rnd') 200 | #===================================================================== 201 | # sim = 1. - self.dist 202 | # mp_dist = mutual_proximity_gammai(self.dist, 'distance') 203 | # mp_sim = mutual_proximity_gammai(sim, 'similarity') 204 | # dist_allclose_one_minus_sim = np.allclose(mp_dist, 1. - mp_sim) 205 | #===================================================================== 206 | msg = "MP GammaI similarity differs from GammaI distance. "\ 207 | + "Whether the currently implemented similarity function makes "\ 208 | + "any sense is yet to be investigated." 209 | return self.skipTest(msg) 210 | #return self.assertTrue(dist_allclose_one_minus_sim) 211 | 212 | def test_mp_gammai_sparse_equal_dense(self): 213 | self.setUpMod('rnd') 214 | sim_dense = 1. - self.dist 215 | sim_sparse = csr_matrix(sim_dense) 216 | mp_dense = mutual_proximity_gammai(sim_dense, 'similarity') 217 | mp_sparse = mutual_proximity_gammai(sim_sparse, 'similarity') 218 | dense_allclose_sparse = np.allclose(mp_dense, mp_sparse.toarray()) 219 | return self.assertTrue(dense_allclose_sparse) 220 | 221 | if __name__ == "__main__": 222 | unittest.main() 223 | -------------------------------------------------------------------------------- /tests/sharednn_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | This file is part of the HUB TOOLBOX available at 5 | https://github.com/OFAI/hub-toolbox-python3/ 6 | The HUB TOOLBOX is licensed under the terms of the GNU GPLv3. 7 | 8 | (c) 2016-2018, Roman Feldbauer 9 | Austrian Research Institute for Artificial Intelligence (OFAI) 10 | Contact: 11 | """ 12 | import unittest 13 | import numpy as np 14 | from scipy.spatial.distance import squareform 15 | from hub_toolbox.distances import euclidean_distance 16 | from hub_toolbox.shared_neighbors import \ 17 | shared_nearest_neighbors, snn_sample, simhubIN 18 | 19 | class TestSharedNN(unittest.TestCase): 20 | 21 | def setUpMod(self, mode='rnd'): 22 | np.random.seed(626) 23 | if mode == 'rnd': 24 | points = 200 25 | dim = 500 26 | self.vector = 99. * (np.random.rand(points, dim) - 0.5) 27 | self.label = np.random.randint(0, 5, points) 28 | self.dist = euclidean_distance(self.vector) 29 | #self.dist /= (self.dist.max() + 1e-12) 30 | elif mode == 'toy': 31 | # SNN (k=2) ground truth calculated by hand for this toy example 32 | self.dist = squareform([.2, .1, .8, .4, .3, .5, .7, 1., .6, .9]) 33 | self.snn_dist_truth = squareform([.5, .5, .5, .5, .5, 34 | .5, 0., 0., .5, .5]) 35 | self.vector = None 36 | self.label = None 37 | 38 | def test_snn_matrix_basic_requirements(self): 39 | """Test that matrix is symmetric, diag==0, and in range [0, 1]""" 40 | self.setUpMod('rnd') 41 | snn_dist = shared_nearest_neighbors(self.dist) 42 | np.testing.assert_equal(snn_dist.diagonal(), 0.) 
# self dist 43 | np.testing.assert_array_less(snn_dist, 1+1e-14) # max==1 44 | np.testing.assert_array_less(-snn_dist, 0+1e-14) # min==0 45 | np.testing.assert_array_equal(snn_dist, snn_dist.T) # symmetry 46 | return 47 | 48 | def test_snn(self): 49 | """Test correctness of SNN in toy example (hand-calculated)""" 50 | self.setUpMod('toy') 51 | snn_dist = shared_nearest_neighbors(self.dist, k=2) 52 | return np.testing.assert_array_equal(self.snn_dist_truth, snn_dist) 53 | 54 | def test_snn_dist_equals_sim(self): 55 | """Test that SNN results are equivalent using distances or simil.""" 56 | self.setUpMod('rnd') 57 | snn_dist = shared_nearest_neighbors(self.dist, metric='distance') 58 | snn_sim = shared_nearest_neighbors(1. - self.dist, metric='similarity') 59 | return np.testing.assert_array_almost_equal(snn_sim, 1.-snn_dist, 12) 60 | 61 | def test_snn_parallel(self): 62 | self.setUpMod('rnd') 63 | snn_seq = shared_nearest_neighbors(self.dist, n_jobs=1) 64 | snn_par = shared_nearest_neighbors(self.dist, n_jobs=4) 65 | return np.testing.assert_array_almost_equal(snn_seq, snn_par, 14) 66 | 67 | def test_snn_sample_parallel(self): 68 | self.setUpMod('rnd') 69 | train_ind = np.arange(self.label.size//2) 70 | test_ind = np.arange(self.label.size//2, self.label.size) 71 | D_sample = self.dist[:, train_ind] 72 | snn_seq = snn_sample( 73 | D_sample, train_ind=train_ind, test_ind=test_ind, n_jobs=1) 74 | snn_par = snn_sample( 75 | D_sample, train_ind=train_ind, test_ind=test_ind, n_jobs=4) 76 | return np.testing.assert_array_almost_equal(snn_seq, snn_par, 14) 77 | 78 | def test_simhubIN(self): 79 | return self.skipTest("simhubIN requires a test for correctness!") 80 | 81 | def test_simhubIN_parallel(self): 82 | self.setUpMod('rnd') 83 | train_ind = np.arange(self.label.size//2) 84 | test_ind = np.arange(self.label.size//2, self.label.size) 85 | D_sample = self.dist[:, train_ind] 86 | shi_seq = simhubIN( 87 | D_sample, train_ind=train_ind, test_ind=test_ind, n_jobs=1) 88 | shi_par = simhubIN( 89 | D_sample, train_ind=train_ind, test_ind=test_ind, n_jobs=4) 90 | return np.testing.assert_array_almost_equal(shi_seq, shi_par, 14) 91 | 92 | if __name__ == "__main__": 93 | unittest.main() 94 | --------------------------------------------------------------------------------