├── .coveragerc ├── .gitignore ├── .travis.yml ├── LICENSE.txt ├── MANIFEST.in ├── README.rst ├── docs ├── Makefile ├── conf.py ├── index.rst ├── source │ ├── hub_toolbox.rst │ └── modules.rst └── user │ ├── installation.rst │ ├── matlab_vs_python.rst │ └── tutorial.rst ├── hub_toolbox ├── __init__.py ├── approximate.py ├── centering.py ├── distances.py ├── example_datasets │ ├── ABOUT │ ├── dexter_train.data │ └── dexter_train.labels ├── global_scaling.py ├── goodman_kruskal.py ├── htlogging.py ├── hubness.py ├── hubness_analysis.py ├── intrinsic_dimension.py ├── io.py ├── knn_classification.py ├── local_scaling.py ├── shared_neighbors.py └── utils.py ├── readthedocs.yml ├── readthedocs_requirements.txt ├── requirements.txt ├── setup.py └── tests ├── __init__.py ├── approximate_test.py ├── centering_test.py ├── distances_test.py ├── goodmankruskal_test.py ├── hubness_test.py ├── hubnessanalysis_test.py ├── intrinsicdim_test.py ├── io_test.py ├── knn_test.py ├── localscaling_test.py ├── logging_test.py ├── mutualproximity_test.py └── sharednn_test.py /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | omit = 3 | tests/* 4 | setup.py 5 | branch = True 6 | parallel = True 7 | concurrency = multiprocessing 8 | 9 | [report] 10 | exclude_lines = 11 | pragma: no cover 12 | def __repr__ 13 | raise AssertionError 14 | raise NotImplementedError 15 | if __name__ == .__main__.: -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | build/ 3 | dist/ 4 | hub_toolbox.egg-info/ 5 | 6 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | sudo: true 3 | dist: xenial 4 | python: 5 | - '3.6' 6 | - '3.7' 7 | install: 8 | - sudo apt-get update 9 | - wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh; 10 | - bash miniconda.sh -b -p $HOME/miniconda; 11 | - export PATH="$HOME/miniconda/bin:$PATH" 12 | - hash -r 13 | - conda config --set always_yes yes --set changeps1 no 14 | - conda update -q conda 15 | - conda info -a 16 | - conda create -q -n test-environment python=$TRAVIS_PYTHON_VERSION numpy scipy pandas scikit-learn 17 | coverage 18 | - source activate test-environment 19 | - pip install pybind11 20 | - pip install coveralls joblib falconn nmslib 21 | - python setup.py build 22 | - python setup.py install 23 | script: 24 | - python setup.py test 25 | - coverage run setup.py test 26 | - coverage combine # required for multiprocessing 27 | after_success: 28 | - coveralls 29 | cache: 30 | - apt 31 | - directories: 32 | - "$HOME/.cache/pip" 33 | deploy: 34 | - provider: releases 35 | api_key: 36 | secure: 
"kyQ/EbH2A4fedw49y2hATgSnoonY7r1zf/jiFRaXy+ZqcnOlileDAzJihd7RHeeRN9wL2Gw1L+U3s62AwjddiGZRz9Qv87Ub/bvJVU+aNB0uHlxSRjw3Q8zhtO0hDEyp4wWQnhPqNhGEJCfrRVuAYG956XdgdpfdL6ZdSPWaHt+nj7yfDEwZ/5iiU8UpjgxZzAgO3k7EIvW188dl75SgL9xf5eYxTXjf2NNNbvpvUXvgrpAMUkTCKix5EHMJcnoKDqlNqnURBQI+f/TpqBoO3g+F+KfF/wLIwmiqJIKALhTsTfyHb+Auv+evJ/kxtWe+GSkeF9+SBT5RdCZx3uh6U9RVm/soy12nf88f344HgS/xnj5WLfqPcG53gdwHdoKbA41OzCNGJ66mTtQfNtVFnsYfphU2fZ7yTq3JHxRwknAWDeKWh9cZixf6U8Y9Pi4vpkDNyl56sHDSlroZltRSz37M3grQOJ3kKdPfB1XOTH6nhN2yiuv1047mSj0WVGDsIwFGECc/iUDvUtdY6cliAiC0rRZX1A/axLQKW8LD3GpBSgXmPS1hJy+l8iPiiHjJvwrldz5t0OMeHfvW2ln7jMqN/yirQiHqJJN7oWKYM3qrTCK0pEJg6KS+eje4GOfMSPl2+/RkJF8ViZPfCpE37HmjsZYAHdHKD8dX89C9Y5I=" 37 | #file: '' 38 | on: 39 | repo: OFAI/hub-toolbox-python3 40 | tags: true 41 | - provider: pypi 42 | user: "feldbauer" 43 | password: 44 | secure: "fSerf/lsApqvZjbNYmOSAuG33+TyW7aKsM/aS2pItzr9u3GQSSQls0Lo+yCduV3/12joBKh4G9k+SXqZnxmdMFPJ/L2RT9ZCx28HFrD+mGakoFX2nVqVhxnqw2bfSg4Wndw1fyNcimYQGNhHHp1WECTFjZInV162719cwID6fLaVzn1AHM8LcR1WPoO5RPZJ/0KhAuxbpkMsoMp5EDJtAxDgn7QGnyTZfwo8jV4ZlUGVTiKYbiPBvLPZ3eTp7b88x4X846X2QzdBHfQ6Qr6nzA6IOJAzkZ+NNpEhDQlQRX44ty0JR0jd5Bz3IypFodZVtDguz29L5oCcuYxJGaul1ANpoqfPZ4vR6b9FkWf3CQW1BNXd5SLVbscf9l4yorDUX4KeagvPJ2z65Y/IaTIoMZjgeZX0/Pm0rcuRFkn/6KobK+lG1IaLMs6F7H7LM+TJn5v9tUYNDbPthPbr7kGmm0E5OtwX8+QZD9h9ufAPgEnsvJkLurus5HbxUiSyARE1SwayBKatJAOY3AyjC3t3tjDSWY+FVTSPbvEbMIg3BMQy0NP0oRzNJLBJ5ZbFO1bcGpMEiqbYim9ZgYonagXsmhfWzRaWughkHZABZZMyFW4uhmyvDV3SiMZpM3wE7DiWm/Oq2PhkAvLJkW085qbjiw6wFxqjgSJ3amCeH3/ZdAY=" 45 | on: 46 | tags: true 47 | # safelist 48 | branches: 49 | only: 50 | - master 51 | - develop 52 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.txt 2 | include README.rst 3 | recursive-include hub_toolbox * 4 | recursive-include tests *.py -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | .. image:: https://badge.fury.io/py/hub-toolbox.svg 2 | :target: https://badge.fury.io/py/hub-toolbox 3 | 4 | .. image:: https://readthedocs.org/projects/hub-toolbox-python3/badge/?version=latest 5 | :target: http://hub-toolbox-python3.readthedocs.io/en/latest/?badge=latest 6 | :alt: Documentation Status 7 | 8 | .. image:: https://travis-ci.org/OFAI/hub-toolbox-python3.svg?branch=master 9 | :target: https://travis-ci.org/OFAI/hub-toolbox-python3 10 | 11 | .. image:: https://coveralls.io/repos/github/OFAI/hub-toolbox-python3/badge.svg?branch=master 12 | :target: https://coveralls.io/github/OFAI/hub-toolbox-python3?branch=master 13 | 14 | .. image:: https://img.shields.io/aur/license/yaourt.svg?maxAge=2592000 15 | :target: https://github.com/OFAI/hub-toolbox-python3/blob/master/LICENSE.txt 16 | 17 | 18 | HUB-TOOLBOX 19 | =========== 20 | 21 | #----------------------------------------------------------------------------------- 22 | 23 | Checkout our new project `scikit-hubness `_ 24 | which provides the functionality of the Hub-Toolbox while integrating nicely into 25 | `scikit-learn` workflows. 26 | 27 | Use `skhubness.neighbors` as a drop-in replacement for `sklearn.neighbors`. 28 | It offers the same functionality and adds transparent support for hubness reduction, 29 | approximate nearest neighbor search (HNSW, LSH), and approximate hubness reduction. 
30 | 31 | We strive to improve the usability of hubness reduction with the development of 32 | `scikit-hubness`, and we are very interested in 33 | `user feedback `_! 34 | 35 | #----------------------------------------------------------------------------------- 36 | 37 | The Hub Toolbox is a software suite for hubness analysis and 38 | hubness reduction in high-dimensional data. 39 | 40 | It allows you to 41 | 42 | - analyze whether your datasets show hubness 43 | - reduce hubness via a variety of different techniques 44 | (including scaling and centering approaches) 45 | and obtain secondary distances for downstream analysis inside or 46 | outside the Hub Toolbox 47 | - perform evaluation tasks with both internal and external measures 48 | (e.g. the Goodman-Kruskal index and k-NN classification) 49 | - NEW IN 2.5: 50 | The ``approximate`` module provides approximate hubness reduction methods 51 | with linear complexity, which make it feasible to analyze large datasets. 52 | - NEW IN 2.5: 53 | Measure hubness with the recently proposed Robin-Hood index 54 | for fast and reliable hubness estimation. 55 | 56 | Installation 57 | ------------ 58 | 59 | Make sure you have a working Python 3 environment (at least 3.6) with the 60 | numpy, scipy, and scikit-learn packages. Use pip3 to install the latest 61 | stable version: 62 | 63 | .. code-block:: bash 64 | 65 | pip3 install hub-toolbox 66 | 67 | For more details and alternatives, please see the `Installation instructions 68 | `_. 69 | 70 | Documentation 71 | ------------- 72 | 73 | Documentation is available online: 74 | http://hub-toolbox-python3.readthedocs.io/en/latest/index.html 75 | 76 | Example 77 | ------- 78 | 79 | To run a full hubness analysis on the example dataset (DEXTER) 80 | using some of the provided hubness reduction methods, 81 | simply run the following in a Python shell: 82 | 83 | .. code-block:: python 84 | 85 | >>> from hub_toolbox.HubnessAnalysis import HubnessAnalysis 86 | >>> ana = HubnessAnalysis() 87 | >>> ana.analyze_hubness() 88 | 89 | You can also conduct the individual analysis steps yourself: 90 | 91 | ..
code-block:: python 92 | 93 | import hub_toolbox 94 | 95 | # load the DEXTER example dataset 96 | D, labels, vectors = hub_toolbox.io.load_dexter() 97 | 98 | # calculate intrinsic dimension estimate 99 | d_mle = hub_toolbox.intrinsic_dimension.intrinsic_dimension(vectors) 100 | 101 | # calculate hubness (here, skewness of the 5-occurrence) 102 | S_k, _, _ = hub_toolbox.hubness.hubness(D=D, k=5, metric='distance') 103 | 104 | # perform k-NN classification LOO-CV for two different values of k 105 | acc, _, _ = hub_toolbox.knn_classification.score( 106 | D=D, target=labels, k=[1,5], metric='distance') 107 | 108 | # calculate Goodman-Kruskal index 109 | gamma = hub_toolbox.goodman_kruskal.goodman_kruskal_index( 110 | D=D, classes=labels, metric='distance') 111 | 112 | # Reduce hubness with Mutual Proximity (Empiric distance distribution) 113 | D_mp = hub_toolbox.global_scaling.mutual_proximity_empiric( 114 | D=D, metric='distance') 115 | 116 | # Reduce hubness with Local Scaling variant NICDM 117 | D_nicdm = hub_toolbox.local_scaling.nicdm(D=D, k=10, metric='distance') 118 | 119 | # Check whether indices improve after hubness reduction 120 | S_k_mp, _, _ = hub_toolbox.hubness.hubness(D=D_mp, k=5, metric='distance') 121 | acc_mp, _, _ = hub_toolbox.knn_classification.score( 122 | D=D_mp, target=labels, k=[1,5], metric='distance') 123 | gamma_mp = hub_toolbox.goodman_kruskal.goodman_kruskal_index( 124 | D=D_mp, classes=labels, metric='distance') 125 | 126 | # Repeat the last steps for all secondary distances you calculated 127 | ... 128 | 129 | Check the `Tutorial 130 | `_ 131 | for in-depth explanations of these steps. 132 | 133 | 134 | Development 135 | ----------- 136 | 137 | Development of the Hub Toolbox has finished. Check out its successor 138 | `scikit-hubness `_ for fully 139 | scikit-learn compatible hubness analysis and approximate neighbor search. 140 | 141 | .. code-block:: text 142 | 143 | (c) 2011-2018, Dominik Schnitzer and Roman Feldbauer 144 | Austrian Research Institute for Artificial Intelligence (OFAI) 145 | Contact: 146 | 147 | Citation 148 | -------- 149 | 150 | If you use the Hub Toolbox in your scientific publication, please cite: 151 | 152 | .. code-block:: text 153 | 154 | @InProceedings{Feldbauer2018b, 155 | author = {Roman Feldbauer and Maximilian Leodolter and Claudia Plant and Arthur Flexer}, 156 | title = {Fast Approximate Hubness Reduction for Large High-Dimensional Data}, 157 | booktitle = {2018 {IEEE} International Conference on Big Knowledge, {ICBK} 2018, Singapore, November 17-18, 2018}, 158 | year = {2018}, 159 | editor = {Xindong Wu and Yew{-}Soon Ong and Charu C. Aggarwal and Huanhuan Chen}, 160 | pages = {358--367}, 161 | publisher = {{IEEE} Computer Society}, 162 | bibsource = {dblp computer science bibliography, https://dblp.org}, 163 | biburl = {https://dblp.org/rec/conf/icbk/FeldbauerLPF18.bib}, 164 | doi = {10.1109/ICBK.2018.00055}, 165 | } 166 | 167 | Relevant literature: 168 | 169 | 2018: ``Fast approximate hubness reduction for large high-dimensional data``, available as 170 | technical report at ``_. 171 | 172 | 2018: ``A comprehensive empirical comparison of hubness reduction in high-dimensional spaces``, 173 | full paper available at https://doi.org/10.1007/s10115-018-1205-y 174 | 175 | 2016: ``Centering Versus Scaling for Hubness Reduction``, available as technical report 176 | at ``_ . 177 | 178 | 2012: ``Local and Global Scaling Reduce Hubs in Space``, full paper available at 179 | ``_ .
180 | 181 | License 182 | ------- 183 | The HUB TOOLBOX is licensed under the terms of the GNU GPLv3. 184 | 185 | Acknowledgements 186 | ---------------- 187 | PyVmMonitor is being used to support the development of this free open source 188 | software package. For more information go to http://www.pyvmmonitor.com 189 | 190 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # User-friendly check for sphinx-build 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) 13 | endif 14 | 15 | # Internal variables. 16 | PAPEROPT_a4 = -D latex_paper_size=a4 17 | PAPEROPT_letter = -D latex_paper_size=letter 18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 19 | # the i18n builder cannot share the environment and doctrees with the others 20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 21 | 22 | .PHONY: help 23 | help: 24 | @echo "Please use \`make ' where is one of" 25 | @echo " html to make standalone HTML files" 26 | @echo " dirhtml to make HTML files named index.html in directories" 27 | @echo " singlehtml to make a single large HTML file" 28 | @echo " pickle to make pickle files" 29 | @echo " json to make JSON files" 30 | @echo " htmlhelp to make HTML files and a HTML help project" 31 | @echo " qthelp to make HTML files and a qthelp project" 32 | @echo " applehelp to make an Apple Help Book" 33 | @echo " devhelp to make HTML files and a Devhelp project" 34 | @echo " epub to make an epub" 35 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 36 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 37 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 38 | @echo " text to make text files" 39 | @echo " man to make manual pages" 40 | @echo " texinfo to make Texinfo files" 41 | @echo " info to make Texinfo files and run them through makeinfo" 42 | @echo " gettext to make PO message catalogs" 43 | @echo " changes to make an overview of all changed/added/deprecated items" 44 | @echo " xml to make Docutils-native XML files" 45 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 46 | @echo " linkcheck to check all external links for integrity" 47 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 48 | @echo " coverage to run coverage check of the documentation (if enabled)" 49 | 50 | .PHONY: clean 51 | clean: 52 | rm -rf $(BUILDDIR)/* 53 | 54 | .PHONY: html 55 | html: 56 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 57 | @echo 58 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 59 | 60 | .PHONY: dirhtml 61 | dirhtml: 62 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 63 | @echo 64 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 
65 | 66 | .PHONY: singlehtml 67 | singlehtml: 68 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 69 | @echo 70 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 71 | 72 | .PHONY: pickle 73 | pickle: 74 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 75 | @echo 76 | @echo "Build finished; now you can process the pickle files." 77 | 78 | .PHONY: json 79 | json: 80 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 81 | @echo 82 | @echo "Build finished; now you can process the JSON files." 83 | 84 | .PHONY: htmlhelp 85 | htmlhelp: 86 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 87 | @echo 88 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 89 | ".hhp project file in $(BUILDDIR)/htmlhelp." 90 | 91 | .PHONY: qthelp 92 | qthelp: 93 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 94 | @echo 95 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 96 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 97 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/hub-toolbox.qhcp" 98 | @echo "To view the help file:" 99 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/hub-toolbox.qhc" 100 | 101 | .PHONY: applehelp 102 | applehelp: 103 | $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp 104 | @echo 105 | @echo "Build finished. The help book is in $(BUILDDIR)/applehelp." 106 | @echo "N.B. You won't be able to view it unless you put it in" \ 107 | "~/Library/Documentation/Help or install it in your application" \ 108 | "bundle." 109 | 110 | .PHONY: devhelp 111 | devhelp: 112 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 113 | @echo 114 | @echo "Build finished." 115 | @echo "To view the help file:" 116 | @echo "# mkdir -p $$HOME/.local/share/devhelp/hub-toolbox" 117 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/hub-toolbox" 118 | @echo "# devhelp" 119 | 120 | .PHONY: epub 121 | epub: 122 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 123 | @echo 124 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 125 | 126 | .PHONY: latex 127 | latex: 128 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 129 | @echo 130 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 131 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 132 | "(use \`make latexpdf' here to do that automatically)." 133 | 134 | .PHONY: latexpdf 135 | latexpdf: 136 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 137 | @echo "Running LaTeX files through pdflatex..." 138 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 139 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 140 | 141 | .PHONY: latexpdfja 142 | latexpdfja: 143 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 144 | @echo "Running LaTeX files through platex and dvipdfmx..." 145 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 146 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 147 | 148 | .PHONY: text 149 | text: 150 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 151 | @echo 152 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 153 | 154 | .PHONY: man 155 | man: 156 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 157 | @echo 158 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 
159 | 160 | .PHONY: texinfo 161 | texinfo: 162 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 163 | @echo 164 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 165 | @echo "Run \`make' in that directory to run these through makeinfo" \ 166 | "(use \`make info' here to do that automatically)." 167 | 168 | .PHONY: info 169 | info: 170 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 171 | @echo "Running Texinfo files through makeinfo..." 172 | make -C $(BUILDDIR)/texinfo info 173 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 174 | 175 | .PHONY: gettext 176 | gettext: 177 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 178 | @echo 179 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 180 | 181 | .PHONY: changes 182 | changes: 183 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 184 | @echo 185 | @echo "The overview file is in $(BUILDDIR)/changes." 186 | 187 | .PHONY: linkcheck 188 | linkcheck: 189 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 190 | @echo 191 | @echo "Link check complete; look for any errors in the above output " \ 192 | "or in $(BUILDDIR)/linkcheck/output.txt." 193 | 194 | .PHONY: doctest 195 | doctest: 196 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 197 | @echo "Testing of doctests in the sources finished, look at the " \ 198 | "results in $(BUILDDIR)/doctest/output.txt." 199 | 200 | .PHONY: coverage 201 | coverage: 202 | $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage 203 | @echo "Testing of coverage in the sources finished, look at the " \ 204 | "results in $(BUILDDIR)/coverage/python.txt." 205 | 206 | .PHONY: xml 207 | xml: 208 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 209 | @echo 210 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 211 | 212 | .PHONY: pseudoxml 213 | pseudoxml: 214 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 215 | @echo 216 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 217 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # 4 | # hub-toolbox documentation build configuration file, created by 5 | # sphinx-quickstart on Wed Aug 24 14:36:53 2016. 6 | # 7 | # This file is execfile()d with the current directory set to its 8 | # containing dir. 9 | # 10 | # Note that not all possible configuration values are present in this 11 | # autogenerated file. 12 | # 13 | # All configuration values have a default; values that are commented out 14 | # serve to show the default. 15 | 16 | import sys 17 | import os 18 | 19 | # If extensions (or modules to document with autodoc) are in another directory, 20 | # add these directories to sys.path here. If the directory is relative to the 21 | # documentation root, use os.path.abspath to make it absolute, like shown here. 22 | sys.path.insert(0, os.path.abspath('../')) 23 | 24 | # -- General configuration ------------------------------------------------ 25 | 26 | # If your documentation needs a minimal Sphinx version, state it here. 27 | #needs_sphinx = '1.0' 28 | 29 | # Add any Sphinx extension module names here, as strings. They can be 30 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 31 | # ones. 
32 | extensions = [ 33 | 'sphinx.ext.autodoc', 34 | 'sphinx.ext.doctest', 35 | 'sphinx.ext.coverage', 36 | 'sphinx.ext.napoleon', 37 | 'sphinx.ext.mathjax', 38 | 'sphinx.ext.viewcode', 39 | ] 40 | 41 | # Add any paths that contain templates here, relative to this directory. 42 | templates_path = ['_templates'] 43 | 44 | # The suffix(es) of source filenames. 45 | # You can specify multiple suffix as a list of string: 46 | # source_suffix = ['.rst', '.md'] 47 | source_suffix = '.rst' 48 | 49 | # The encoding of source files. 50 | #source_encoding = 'utf-8-sig' 51 | 52 | # The master toctree document. 53 | master_doc = 'index' 54 | 55 | # General information about the project. 56 | project = 'hub-toolbox' 57 | copyright = '2016, Roman Feldbauer' 58 | author = 'Roman Feldbauer' 59 | 60 | # The version info for the project you're documenting, acts as replacement for 61 | # |version| and |release|, also used in various other places throughout the 62 | # built documents. 63 | # 64 | # The short X.Y version. 65 | version = '2.3' 66 | # The full version, including alpha/beta/rc tags. 67 | release = '2.3' 68 | 69 | # The language for content autogenerated by Sphinx. Refer to documentation 70 | # for a list of supported languages. 71 | # 72 | # This is also used if you do content translation via gettext catalogs. 73 | # Usually you set "language" from the command line for these cases. 74 | language = None 75 | 76 | # There are two options for replacing |today|: either, you set today to some 77 | # non-false value, then it is used: 78 | #today = '' 79 | # Else, today_fmt is used as the format for a strftime call. 80 | #today_fmt = '%B %d, %Y' 81 | 82 | # List of patterns, relative to source directory, that match files and 83 | # directories to ignore when looking for source files. 84 | exclude_patterns = ['_build'] 85 | 86 | # The reST default role (used for this markup: `text`) to use for all 87 | # documents. 88 | #default_role = None 89 | 90 | # If true, '()' will be appended to :func: etc. cross-reference text. 91 | #add_function_parentheses = True 92 | 93 | # If true, the current module name will be prepended to all description 94 | # unit titles (such as .. function::). 95 | #add_module_names = True 96 | 97 | # If true, sectionauthor and moduleauthor directives will be shown in the 98 | # output. They are ignored by default. 99 | #show_authors = False 100 | 101 | # The name of the Pygments (syntax highlighting) style to use. 102 | pygments_style = 'sphinx' 103 | 104 | # A list of ignored prefixes for module index sorting. 105 | #modindex_common_prefix = [] 106 | 107 | # If true, keep warnings as "system message" paragraphs in the built documents. 108 | #keep_warnings = False 109 | 110 | # If true, `todo` and `todoList` produce output, else they produce nothing. 111 | todo_include_todos = False 112 | 113 | 114 | # -- Options for HTML output ---------------------------------------------- 115 | 116 | # The theme to use for HTML and HTML Help pages. See the documentation for 117 | # a list of builtin themes. 118 | #html_theme = 'alabaster' 119 | html_theme = 'sphinx_rtd_theme' 120 | 121 | # Theme options are theme-specific and customize the look and feel of a theme 122 | # further. For a list of options available for each theme, see the 123 | # documentation. 124 | #html_theme_options = {} 125 | 126 | # Add any paths that contain custom themes here, relative to this directory. 127 | #html_theme_path = [] 128 | 129 | # The name for this set of Sphinx documents. 
If None, it defaults to 130 | # " v documentation". 131 | #html_title = None 132 | 133 | # A shorter title for the navigation bar. Default is the same as html_title. 134 | #html_short_title = None 135 | 136 | # The name of an image file (relative to this directory) to place at the top 137 | # of the sidebar. 138 | #html_logo = None 139 | 140 | # The name of an image file (within the static path) to use as favicon of the 141 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 142 | # pixels large. 143 | #html_favicon = None 144 | 145 | # Add any paths that contain custom static files (such as style sheets) here, 146 | # relative to this directory. They are copied after the builtin static files, 147 | # so a file named "default.css" will overwrite the builtin "default.css". 148 | html_static_path = ['_static'] 149 | 150 | # Add any extra paths that contain custom files (such as robots.txt or 151 | # .htaccess) here, relative to this directory. These files are copied 152 | # directly to the root of the documentation. 153 | #html_extra_path = [] 154 | 155 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 156 | # using the given strftime format. 157 | #html_last_updated_fmt = '%b %d, %Y' 158 | 159 | # If true, SmartyPants will be used to convert quotes and dashes to 160 | # typographically correct entities. 161 | #html_use_smartypants = True 162 | 163 | # Custom sidebar templates, maps document names to template names. 164 | #html_sidebars = {} 165 | 166 | # Additional templates that should be rendered to pages, maps page names to 167 | # template names. 168 | #html_additional_pages = {} 169 | 170 | # If false, no module index is generated. 171 | #html_domain_indices = True 172 | 173 | # If false, no index is generated. 174 | #html_use_index = True 175 | 176 | # If true, the index is split into individual pages for each letter. 177 | #html_split_index = False 178 | 179 | # If true, links to the reST sources are added to the pages. 180 | #html_show_sourcelink = True 181 | 182 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 183 | #html_show_sphinx = True 184 | 185 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 186 | #html_show_copyright = True 187 | 188 | # If true, an OpenSearch description file will be output, and all pages will 189 | # contain a tag referring to it. The value of this option must be the 190 | # base URL from which the finished HTML is served. 191 | #html_use_opensearch = '' 192 | 193 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 194 | #html_file_suffix = None 195 | 196 | # Language to be used for generating the HTML full-text search index. 197 | # Sphinx supports the following languages: 198 | # 'da', 'de', 'en', 'es', 'fi', 'fr', 'h', 'it', 'ja' 199 | # 'nl', 'no', 'pt', 'ro', 'r', 'sv', 'tr' 200 | #html_search_language = 'en' 201 | 202 | # A dictionary with options for the search language support, empty by default. 203 | # Now only 'ja' uses this config value 204 | #html_search_options = {'type': 'default'} 205 | 206 | # The name of a javascript file (relative to the configuration directory) that 207 | # implements a search results scorer. If empty, the default will be used. 208 | #html_search_scorer = 'scorer.js' 209 | 210 | # Output file base name for HTML help builder. 
211 | htmlhelp_basename = 'hub-toolboxdoc' 212 | 213 | # -- Options for LaTeX output --------------------------------------------- 214 | 215 | latex_elements = { 216 | # The paper size ('letterpaper' or 'a4paper'). 217 | #'papersize': 'letterpaper', 218 | 219 | # The font size ('10pt', '11pt' or '12pt'). 220 | #'pointsize': '10pt', 221 | 222 | # Additional stuff for the LaTeX preamble. 223 | #'preamble': '', 224 | 225 | # Latex figure (float) alignment 226 | #'figure_align': 'htbp', 227 | } 228 | 229 | # Grouping the document tree into LaTeX files. List of tuples 230 | # (source start file, target name, title, 231 | # author, documentclass [howto, manual, or own class]). 232 | latex_documents = [ 233 | (master_doc, 'hub-toolbox.tex', 'hub-toolbox Documentation', 234 | 'Roman Feldbauer', 'manual'), 235 | ] 236 | 237 | # The name of an image file (relative to this directory) to place at the top of 238 | # the title page. 239 | #latex_logo = None 240 | 241 | # For "manual" documents, if this is true, then toplevel headings are parts, 242 | # not chapters. 243 | #latex_use_parts = False 244 | 245 | # If true, show page references after internal links. 246 | #latex_show_pagerefs = False 247 | 248 | # If true, show URL addresses after external links. 249 | #latex_show_urls = False 250 | 251 | # Documents to append as an appendix to all manuals. 252 | #latex_appendices = [] 253 | 254 | # If false, no module index is generated. 255 | #latex_domain_indices = True 256 | 257 | 258 | # -- Options for manual page output --------------------------------------- 259 | 260 | # One entry per manual page. List of tuples 261 | # (source start file, name, description, authors, manual section). 262 | man_pages = [ 263 | (master_doc, 'hub-toolbox', 'hub-toolbox Documentation', 264 | [author], 1) 265 | ] 266 | 267 | # If true, show URL addresses after external links. 268 | #man_show_urls = False 269 | 270 | 271 | # -- Options for Texinfo output ------------------------------------------- 272 | 273 | # Grouping the document tree into Texinfo files. List of tuples 274 | # (source start file, target name, title, author, 275 | # dir menu entry, description, category) 276 | texinfo_documents = [ 277 | (master_doc, 'hub-toolbox', 'hub-toolbox Documentation', 278 | author, 'hub-toolbox', 'One line description of project.', 279 | 'Miscellaneous'), 280 | ] 281 | 282 | # Documents to append as an appendix to all manuals. 283 | #texinfo_appendices = [] 284 | 285 | # If false, no module index is generated. 286 | #texinfo_domain_indices = True 287 | 288 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 289 | #texinfo_show_urls = 'footnote' 290 | 291 | # If true, do not generate a @detailmenu in the "Top" node's menu. 292 | #texinfo_no_detailmenu = False 293 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | Welcome to the HUB TOOLBOX! 2 | =========================== 3 | 4 | The Hub Toolbox is a software suite for hubness analysis and hubness reduction 5 | in high-dimensional data. 6 | 7 | User Guide 8 | ------------ 9 | 10 | The user guide explains how to install the Hub Toolbox, how to analyze your 11 | data sets for hubness, and how to use the Hub Toolbox to lift this 12 | *curse of dimensionality*. 13 | 14 | .. 
toctree:: 15 | :maxdepth: 2 16 | 17 | user/matlab_vs_python 18 | user/installation 19 | user/tutorial 20 | 21 | 22 | API Reference 23 | ------------- 24 | 25 | Find all the information about specific modules and functions of the Hub 26 | Toolbox in this section. 27 | 28 | * :ref:`genindex` 29 | * :ref:`modindex` 30 | -------------------------------------------------------------------------------- /docs/source/hub_toolbox.rst: -------------------------------------------------------------------------------- 1 | hub_toolbox package 2 | =================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | hub_toolbox.Centering module 8 | ---------------------------- 9 | 10 | .. automodule:: hub_toolbox.Centering 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | hub_toolbox.Distances module 16 | ---------------------------- 17 | 18 | .. automodule:: hub_toolbox.Distances 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | hub_toolbox.GoodmanKruskal module 24 | --------------------------------- 25 | 26 | .. automodule:: hub_toolbox.GoodmanKruskal 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | hub_toolbox.Hubness module 32 | -------------------------- 33 | 34 | .. automodule:: hub_toolbox.Hubness 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | hub_toolbox.HubnessAnalysis module 40 | ---------------------------------- 41 | 42 | .. automodule:: hub_toolbox.HubnessAnalysis 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | 47 | hub_toolbox.Hubness_parallel module 48 | ----------------------------------- 49 | 50 | .. automodule:: hub_toolbox.Hubness_parallel 51 | :members: 52 | :undoc-members: 53 | :show-inheritance: 54 | 55 | hub_toolbox.IO module 56 | --------------------- 57 | 58 | .. automodule:: hub_toolbox.IO 59 | :members: 60 | :undoc-members: 61 | :show-inheritance: 62 | 63 | hub_toolbox.IntrinsicDim module 64 | ------------------------------- 65 | 66 | .. automodule:: hub_toolbox.IntrinsicDim 67 | :members: 68 | :undoc-members: 69 | :show-inheritance: 70 | 71 | hub_toolbox.KnnClassification module 72 | ------------------------------------ 73 | 74 | .. automodule:: hub_toolbox.KnnClassification 75 | :members: 76 | :undoc-members: 77 | :show-inheritance: 78 | 79 | hub_toolbox.LocalScaling module 80 | ------------------------------- 81 | 82 | .. automodule:: hub_toolbox.LocalScaling 83 | :members: 84 | :undoc-members: 85 | :show-inheritance: 86 | 87 | hub_toolbox.Logging module 88 | -------------------------- 89 | 90 | .. automodule:: hub_toolbox.Logging 91 | :members: 92 | :undoc-members: 93 | :show-inheritance: 94 | 95 | hub_toolbox.MutualProximity module 96 | ---------------------------------- 97 | 98 | .. automodule:: hub_toolbox.MutualProximity 99 | :members: 100 | :undoc-members: 101 | :show-inheritance: 102 | 103 | hub_toolbox.MutualProximity_parallel module 104 | ------------------------------------------- 105 | 106 | .. automodule:: hub_toolbox.MutualProximity_parallel 107 | :members: 108 | :undoc-members: 109 | :show-inheritance: 110 | 111 | hub_toolbox.SharedNN module 112 | --------------------------- 113 | 114 | .. automodule:: hub_toolbox.SharedNN 115 | :members: 116 | :undoc-members: 117 | :show-inheritance: 118 | 119 | 120 | Module contents 121 | --------------- 122 | 123 | .. 
automodule:: hub_toolbox 124 | :members: 125 | :undoc-members: 126 | :show-inheritance: 127 | -------------------------------------------------------------------------------- /docs/source/modules.rst: -------------------------------------------------------------------------------- 1 | hub_toolbox 2 | =========== 3 | 4 | .. toctree:: 5 | :maxdepth: 4 6 | 7 | hub_toolbox 8 | -------------------------------------------------------------------------------- /docs/user/installation.rst: -------------------------------------------------------------------------------- 1 | .. _installation: 2 | 3 | ============ 4 | Installation 5 | ============ 6 | 7 | 8 | Most of the instructions below assume you are running a Linux system. 9 | It might be possible to install the Hub Toolbox on Mac or Windows systems. 10 | We cannot, however, give any guidance for these cases at this point. 11 | 12 | 13 | Prerequisites 14 | ============= 15 | 16 | Python 17 | ------ 18 | 19 | The Hub Toolbox currently requires Python 3.6 or higher. You can check this 20 | on your system with: 21 | 22 | .. code-block:: bash 23 | 24 | python3 --version 25 | 26 | If Python3 is missing, or its version is lower than 3.6, please install it 27 | via the package manager of your operating system (e.g. ``apt`` in 28 | Debian/Ubuntu or ``dnf`` in Fedora). 29 | 30 | You might also consider using the `Anaconda environment 31 | `_ for easy Python environment 32 | and package handling. 33 | 34 | numpy/scipy/scikit-learn 35 | ------------------------ 36 | 37 | The Hub Toolbox heavily relies on numpy and requires scipy and scikit-learn 38 | for some functions. 39 | Please install these packages via your operating system's package manager 40 | (e.g. ``sudo apt install python3-numpy python3-scipy python3-sklearn``) or 41 | use Anaconda: ``conda install numpy scipy scikit-learn``. 42 | We do not recommend installation via ``pip`` since this may lead to suboptimal 43 | performance unless configured properly. 44 | 45 | 46 | Stable Hub Toolbox release 47 | ========================== 48 | 49 | Stable releases of the Hub Toolbox are added to 50 | `PyPI `_ . 51 | To install the latest stable release, simply use `pip` 52 | (you may need to install it first via your operating system's package manager, 53 | e.g. ``sudo apt install python3-pip``). 54 | 55 | .. code-block:: bash 56 | 57 | pip3 install hub-toolbox 58 | 59 | Alternatively, you may download the `latest release from GitHub 60 | `_ and follow 61 | the instructions of a development installation (from source) below, 62 | omitting the ``git clone`` step. 63 | 64 | 65 | .. _hubtoolbox-development-install: 66 | 67 | Installation from source 68 | ======================== 69 | 70 | For a bleeding edge version of the Hub Toolbox, you can install it from 71 | the latest sources: 72 | On the console, change to the directory, under which the Hub Toolbox should 73 | be installed. Then obtain a copy of the latest sources from GitHub: 74 | 75 | .. code-block:: bash 76 | 77 | git clone https://github.com/OFAI/hub-toolbox-python3.git 78 | 79 | They will be cloned to a subdirectory called ``hub-toolbox-python3``. 80 | The Hub Toolbox must then be built and installed with 81 | 82 | .. code-block:: bash 83 | 84 | cd hub-toolbox-python3 85 | python3 setup.py build 86 | sudo python3 setup.py install 87 | 88 | The Hub Toolbox is now available system wide. Optionally, you can now run 89 | a test suite by 90 | 91 | .. 
code-block:: bash 92 | 93 | sudo python3 setup.py test 94 | 95 | If this prints an ``OK`` message, you are ready to go. Note that it is 96 | fine if some tests are skipped. 97 | -------------------------------------------------------------------------------- /docs/user/matlab_vs_python.rst: -------------------------------------------------------------------------------- 1 | Which Hub Toolbox to choose 2 | =========================== 3 | 4 | The Hub Toolbox is available as Python and Matlab scripts. 5 | If in doubt, use the Hub Toolbox for Python. See below 6 | for a more detailed description. 7 | 8 | hub-toolbox-matlab 9 | -------------------- 10 | 11 | The Hub Toolbox was originally developed for Matlab/Octave. 12 | We still provide these scripts; however, development is limited to bug fixing. 13 | No new functionality will be added. 14 | The `Hub Toolbox for Matlab `_ 15 | supports: 16 | 17 | - hubness analysis 18 | 19 | - hubness reduction 20 | 21 | - Mutual Proximity 22 | - Local Scaling 23 | - Shared Nearest Neighbors 24 | - evaluation 25 | 26 | - k-NN classification 27 | - Goodman-Kruskal index 28 | 29 | for distance matrices. 30 | 31 | hub-toolbox-python3 32 | ------------------- 33 | 34 | The `Hub Toolbox for Python3 `_ 35 | was initially ported from the Matlab code. 36 | Development now focuses on these scripts. It is thus continuously being extended 37 | with new functionality, and is tested and documented thoroughly. 38 | The Hub Toolbox for Python3 offers all the functionality the Matlab 39 | scripts offer, plus: 40 | 41 | - additional hubness reduction methods 42 | 43 | - centering 44 | - DisSim 45 | - using similarity matrices instead of distance matrices 46 | - support for sparse matrices (some modules) 47 | - support for parallel processing (some modules) 48 | - performance improvements (some modules) 49 | - unit tests 50 | - this documentation 51 | 52 | We recommend hub-toolbox-python3 to all users. This documentation will 53 | assume you are using these scripts. 54 | -------------------------------------------------------------------------------- /docs/user/tutorial.rst: -------------------------------------------------------------------------------- 1 | .. _tutorial: 2 | 3 | ======== 4 | Tutorial 5 | ======== 6 | 7 | In this tutorial you will analyze the DEXTER dataset for hubness, reduce 8 | hubness, and observe how this improves internal and external evaluation 9 | measures. 10 | 11 | From there on you will be able to apply the techniques offered by the 12 | Hub Toolbox to the dataset of your choice. 13 | 14 | 15 | Prerequisites 16 | ============= 17 | 18 | For this tutorial, you will require a working installation of the Hub 19 | Toolbox. If you don't have one yet, please follow the instructions in 20 | :ref:`installation`. 21 | 22 | 23 | Analyze the DEXTER dataset 24 | ========================== 25 | 26 | The Hub Toolbox ships with DEXTER as an example dataset. DEXTER is a text 27 | classification problem in a bag-of-words representation. This is a 28 | binary classification problem with sparse continuous input variables. 29 | This dataset was one of five datasets of the NIPS 2003 feature selection 30 | challenge. For more info, see: http://archive.ics.uci.edu/ml/datasets/Dexter 31 | 32 | On the terminal, start a Python shell: 33 | 34 | .. code-block:: bash 35 | 36 | python3 37 | 38 | Consider using an `IPython/jupyter notebook `_ as a 39 | more flexible and powerful alternative.
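If you want to go the notebook route, a minimal way to get started (assuming you install jupyter via pip) is:

.. code-block:: bash

    pip3 install jupyter
    jupyter notebook

Then create a new Python 3 notebook and run the following snippets in its cells instead of the plain shell.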
40 | 41 | The :class:`HubnessAnalysis ` class automatically 42 | analyzes the DEXTER example dataset, if invoked without further parameters: 43 | 44 | .. code-block:: python 45 | 46 | >>> from hub_toolbox.HubnessAnalysis import HubnessAnalysis 47 | >>> ana = HubnessAnalysis() 48 | >>> ana.analyze_hubness() 49 | 50 | This will print a rather lengthy result log: 51 | 52 | .. code-block:: text 53 | 54 | NO PARAMETERS GIVEN! Loading & evaluating DEXTER data set. 55 | DEXTER is a text classification problem in a bag-of-word 56 | representation. This is a two-class classification problem 57 | with sparse continuous input variables. 58 | This dataset is one of five datasets of the NIPS 2003 59 | feature selection challenge. 60 | http://archive.ics.uci.edu/ml/datasets/Dexter 61 | 62 | 63 | ================ 64 | Hubness Analysis 65 | ================ 66 | 67 | ORIGINAL DATA: 68 | data set hubness (S^k= 5) : 4.22 69 | % of anti-hubs at k= 5 : 26.67% 70 | % of k= 5-NN lists the largest hub occurs: 23.67% 71 | data set hubness (S^k=10) : 3.98 72 | % of anti-hubs at k=10 : 17.67% 73 | % of k=10-NN lists the largest hub occurs: 50.0% 74 | k= 1-NN classification accuracy : 80.33% 75 | k= 5-NN classification accuracy : 80.33% 76 | k=20-NN classification accuracy : 84.33% 77 | Goodman-Kruskal index (higher=better) : 0.104 78 | embedding dimensionality : 20000 79 | intrinsic dimensionality estimate : 161 80 | 81 | MUTUAL PROXIMITY (Empiric): 82 | data set hubness (S^k= 5) : 0.712 83 | % of anti-hubs at k= 5 : 3.0% 84 | % of k= 5-NN lists the largest hub occurs: 6.0% 85 | data set hubness (S^k=10) : 0.71 86 | % of anti-hubs at k=10 : 0.0% 87 | % of k=10-NN lists the largest hub occurs: 10.67% 88 | k= 1-NN classification accuracy : 82.67% 89 | k= 5-NN classification accuracy : 89.67% 90 | k=20-NN classification accuracy : 88.67% 91 | Goodman-Kruskal index (higher=better) : 0.132 92 | embedding dimensionality : 20000 93 | intrinsic dimensionality estimate : 161 94 | 95 | MUTUAL PROXIMITY (Independent Gaussians): 96 | data set hubness (S^k= 5) : 0.805 97 | % of anti-hubs at k= 5 : 4.667% 98 | % of k= 5-NN lists the largest hub occurs: 5.667% 99 | data set hubness (S^k=10) : 1.21 100 | % of anti-hubs at k=10 : 0.0% 101 | % of k=10-NN lists the largest hub occurs: 12.67% 102 | k= 1-NN classification accuracy : 83.67% 103 | k= 5-NN classification accuracy : 89.0% 104 | k=20-NN classification accuracy : 90.0% 105 | Goodman-Kruskal index (higher=better) : 0.135 106 | embedding dimensionality : 20000 107 | intrinsic dimensionality estimate : 161 108 | 109 | LOCAL SCALING (NICDM): 110 | parameter k = 7 (for optimization use the individual modules of the HUB-TOOLBOX) 111 | data set hubness (S^k= 5) : 2.1 112 | % of anti-hubs at k= 5 : 0.6667% 113 | % of k= 5-NN lists the largest hub occurs: 8.667% 114 | data set hubness (S^k=10) : 1.74 115 | % of anti-hubs at k=10 : 0.0% 116 | % of k=10-NN lists the largest hub occurs: 16.0% 117 | k= 1-NN classification accuracy : 84.67% 118 | k= 5-NN classification accuracy : 85.0% 119 | k=20-NN classification accuracy : 85.0% 120 | Goodman-Kruskal index (higher=better) : 0.118 121 | embedding dimensionality : 20000 122 | intrinsic dimensionality estimate : 161 123 | 124 | CENTERING: 125 | data set hubness (S^k= 5) : 1.62 126 | % of anti-hubs at k= 5 : 6.667% 127 | % of k= 5-NN lists the largest hub occurs: 8.333% 128 | data set hubness (S^k=10) : 1.38 129 | % of anti-hubs at k=10 : 1.333% 130 | % of k=10-NN lists the largest hub occurs: 13.0% 131 | k= 1-NN classification accuracy : 
85.0% 132 | k= 5-NN classification accuracy : 87.67% 133 | k=20-NN classification accuracy : 89.33% 134 | Goodman-Kruskal index (higher=better) : 0.19 135 | embedding dimensionality : 20000 136 | intrinsic dimensionality estimate : 161 137 | 138 | DISSIM GLOBAL: 139 | data set hubness (S^k= 5) : 1.87 140 | % of anti-hubs at k= 5 : 6.333% 141 | % of k= 5-NN lists the largest hub occurs: 8.667% 142 | data set hubness (S^k=10) : 1.62 143 | % of anti-hubs at k=10 : 1.667% 144 | % of k=10-NN lists the largest hub occurs: 14.67% 145 | k= 1-NN classification accuracy : 84.0% 146 | k= 5-NN classification accuracy : 88.67% 147 | k=20-NN classification accuracy : 88.67% 148 | Goodman-Kruskal index (higher=better) : 0.189 149 | embedding dimensionality : 20000 150 | intrinsic dimensionality estimate : 161 151 | 152 | 153 | Interpreting the results 154 | ======================== 155 | 156 | Let us dissect these results: the first block appears because we did not 157 | provide any parameters when instantiating 158 | :class:`HubnessAnalysis `. It thus goes 159 | into example mode and tells you a little bit about the dataset being used. 160 | 161 | The actual results of the analysis are grouped into blocks by experiments. 162 | Here, an experiment comprises the following: 163 | 164 | #. a hubness reduction method is applied to the dataset's distance matrix 165 | to obtain a matrix of secondary distances (except for centering, which 166 | changes vector data) 167 | #. hubness and additional measures of hubs and anti-hubs are calculated 168 | (in this case twice, for two different neighborhood sizes) 169 | #. k-nearest neighbor classification leave-one-out cross-validation is 170 | performed (in this case three times, for three different values of `k`) 171 | #. the Goodman-Kruskal index is calculated for the secondary distance matrix 172 | 173 | Additionally, the intrinsic dimension is estimated once for the dataset 174 | and reported with all experiments. 175 | 176 | The second block (under the `Hubness Analysis` headline) is the experiment 177 | using primary distances. For text-based datasets like DEXTER, cosine distances 178 | are frequently used. We observe considerable hubness of ``S^(k=5) = 4.22``. 179 | (As a rule of thumb, consider values above ``1.2`` as 'high hubness'). 180 | Knowing that hubness is a phenomenon of intrinsically high-dimensional data, 181 | it is not surprising that the intrinsic dimension estimate of ``161`` is also 182 | considerably high (although much lower than the embedding dimension 183 | of ``20000``). We also observe a lot of anti-hubs (i.e. points that are 184 | not among the k-nearest neighbors of any other point; or in other words: 185 | their ``k-occurrence=0``), while the largest hub is among the k-nearest 186 | neighbors of very many points. We find a k-NN classification accuracy of 187 | roughly ``80%``. 188 | 189 | The third block contains the results of a Mutual Proximity experiment, 190 | using the empirical distance distribution to rescale these distances. 191 | We observe tremendously reduced hubness, hardly any anti-hubs, and a reduced 192 | k-occurrence of the largest hub. Also, internal evaluation with the 193 | Goodman-Kruskal index improves compared to using the primary distances. 194 | Mutual Proximity is thus able to reduce hubness, but we do not yet know 195 | whether these secondary distances still reflect the semantics of the dataset.
196 | Looking at the k-NN classification, they apparently do: accuracy even 197 | improved, increasing to nearly ``90%``. 198 | Note that embedding and intrinsic dimension do not change, because they are 199 | computed on the original dataset. 200 | 201 | The following blocks represent other hubness reduction methods, some 202 | performing as well as Mutual Proximity, some performing worse. However, 203 | all of them improve internal as well as external evaluation measures. 204 | 205 | 206 | Analyzing other datasets 207 | ======================== 208 | 209 | :class:`HubnessAnalysis ` can also be used to 210 | investigate other datasets. You will require at least a numpy array of your 211 | feature vectors (called `vectors`), or a distance matrix ``D`` (where 212 | ``D[i, j]`` is the distance between your ``i-th`` and ``j-th`` feature vector). 213 | If you want to perform classification, you also need to provide a vector 214 | with integer labels for each data point (``target`` or 'ground-truth'). 215 | If you don't have a distance matrix yet, you can use the methods from 216 | :class:`Distances ` to create one based on euclidean 217 | or cosine distances. For other types of distances, you can also use 218 | `scipy.spatial.distance.pdist `_. 220 | 221 | Now simply call 222 | 223 | .. code-block:: python 224 | 225 | >>> from hub_toolbox.HubnessAnalysis import HubnessAnalysis 226 | >>> ana = HubnessAnalysis(D, vectors, target) 227 | >>> ana.analyze_hubness(experiments="orig,mp,nicdm,dsg", 228 | hubness_k=(5, 10), knn_k=(10, 20)) 229 | 230 | Note how we provided parameters to ``analyze_hubness``: the Hub Toolbox 231 | will now perform four experiments (original data, Mutual Proximity (Empiric), 232 | Local Scaling (NICDM), and DisSim Global). The neighborhood size is the same 233 | as in the last example, but we changed the classification to 10-NN and 20-NN 234 | (instead of 1-NN, 5-NN, and 20-NN). 235 | 236 | Looking at your output, you may notice a line that was not discussed before: 237 | `NICDM` has a parameter `k` that can be tuned. Other methods have tunable 238 | parameters as well. 239 | The convenience class :class:`HubnessAnalysis ` 240 | does not allow changing the default values of the methods' parameters. 241 | To do so, you can use the individual methods of the Hub Toolbox directly, 242 | which will be covered in the next section. 243 | 244 | 245 | Using individual methods 246 | ======================== 247 | 248 | In this section we will revisit the analysis we performed previously 249 | on the DEXTER dataset. This time, instead of using the convenience class 250 | :class:`HubnessAnalysis `, we will employ 251 | the individual modules of the Hub Toolbox to see how to use 252 | them in a more flexible way. 253 | 254 | Loading the example dataset 255 | --------------------------- 256 | 257 | .. code-block:: python 258 | 259 | >>> from hub_toolbox.IO import load_dexter 260 | >>> D, labels, vectors = load_dexter() 261 | >>> vectors.shape 262 | (300, 20000) 263 | 264 | We see that DEXTER comprises ``300`` points in an embedding 265 | dimension of ``20000``. The `IntrinsicDim` module can provide some insight 266 | into how well this reflects the 'true' dimensionality of the dataset, by 267 | 268 | Calculating an intrinsic dimension estimate 269 | ------------------------------------------- 270 | 271 | ..
code-block:: python 272 | 273 | >>> from hub_toolbox.IntrinsicDim import intrinsic_dimension 274 | >>> intrinsic_dimension(vectors, k1=6, k2=12, estimator='levina', trafo=None) 275 | 74 276 | 277 | The MLE by Levina and Bickel with neighborhood ``[6, 12]`` tells us 278 | that the intrinsic dimension is much lower than the embedding dimension, 279 | but is still considerably high. We can assume that this dataset is prone 280 | to 281 | 282 | Hubness 283 | ------- 284 | 285 | .. code-block:: python 286 | 287 | >>> from hub_toolbox.Hubness import hubness 288 | >>> S_k, D_k, N_k = hubness(D=D, k=5, metric='distance') 289 | >>> print("Hubness:", S_k) 290 | Hubness: 4.222131665788378 291 | 292 | Besides the hubness in ``S_k``, you also get the objects ``D_k`` 293 | and ``N_k``, which contain the ``k`` nearest neighbors of all elements 294 | and the n-occurrence, respectively. From them you can extract more 295 | detailed information about hubs and anti-hubs. 296 | 297 | External and internal evaluation can be performed with the following 298 | methods: 299 | 300 | k-NN classification 301 | ------------------- 302 | 303 | .. code-block:: python 304 | 305 | >>> from hub_toolbox.KnnClassification import score 306 | >>> acc, corr, cmat = score(D=D, target=labels, k=[1,5], metric='distance') 307 | >>> print("k=5-NN accuracy:", acc[1, 0]) 308 | k=5-NN accuracy: 0.803333333333 309 | 310 | Also in this case, you obtain three objects: ``acc`` contains the 311 | accuracy values, 312 | ``corr`` indicates for each point whether it was classified 313 | correctly or not, and ``cmat`` contains the corresponding confusion 314 | matrices. All three objects contain this information for each 315 | k-NN experiment defined via the parameter ``k=[1,5]``. 316 | 317 | Goodman-Kruskal index 318 | --------------------- 319 | 320 | .. code-block:: python 321 | 322 | >>> from hub_toolbox.GoodmanKruskal import goodman_kruskal_index 323 | >>> gamma = goodman_kruskal_index(D=D, classes=labels, metric='distance') 324 | >>> print("Goodman-Kruskal index:", gamma) 325 | Goodman-Kruskal index: 0.103701886155 326 | 327 | Calculating the :meth:`Goodman-Kruskal index ` 328 | is straightforward. 329 | 330 | Hubness reduction 331 | ----------------- 332 | 333 | .. code-block:: python 334 | 335 | >>> from hub_toolbox.MutualProximity import mutual_proximity_empiric 336 | >>> D_mp = mutual_proximity_empiric(D=D, metric='distance') 337 | 338 | .. code-block:: python 339 | 340 | >>> from hub_toolbox.LocalScaling import nicdm 341 | >>> D_nicdm = nicdm(D=D, k=10, metric='distance') 342 | 343 | You now have two objects ``D_mp`` and ``D_nicdm``, which contain 344 | secondary distances of the DEXTER dataset, rescaled with Mutual 345 | Proximity (Empiric) and Local Scaling (NICDM), respectively. 346 | They can now be used just as illustrated above for k-NN classification, 347 | hubness calculation, etc. 348 | 349 | The Hub Toolbox provides more methods for hubness reduction than these 350 | two, and additional ones will be integrated as they are developed by 351 | the hubness community. To see which methods are currently included, try 352 | 353 | .. code-block:: python 354 | 355 | >>> from hub_toolbox.HubnessAnalysis import SEC_DIST 356 | >>> for k, v in SEC_DIST.items(): 357 | ... print(k) 358 | ...
359 | dsl 360 | snn 361 | wcent 362 | lcent 363 | mp_gaussi 364 | mp 365 | orig 366 | mp_gauss 367 | nicdm 368 | dsg 369 | cent 370 | ls 371 | mp_gammai 372 | 373 | The values ``v`` in this dictionary are actually the hubness reduction 374 | functions, so you may invoke them, for example, like this: 375 | 376 | .. code-block:: python 377 | 378 | >>> D_snn = SEC_DIST['snn'](D) 379 | 380 | to obtain shared nearest neighbor distances. 381 | 382 | Approximate hubness reduction 383 | ----------------------------- 384 | TODO 385 | 386 | For now, please consider the docstrings. If in doubt, please don't hesitate to 387 | contact the author. 388 | -------------------------------------------------------------------------------- /hub_toolbox/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | This file is part of the HUB TOOLBOX available at 6 | https://github.com/OFAI/hub-toolbox-python3/ 7 | The HUB TOOLBOX is licensed under the terms of the GNU GPLv3. 8 | 9 | (c) 2011-2018, Dominik Schnitzer, Roman Feldbauer 10 | Austrian Research Institute for Artificial Intelligence (OFAI) 11 | Contact: 12 | """ 13 | 14 | __version__ = '2.5.2' 15 | 16 | try: 17 | import numpy 18 | import scipy 19 | import sklearn 20 | del numpy 21 | del scipy 22 | del sklearn 23 | except ImportError: # pragma: no cover 24 | raise ImportError("Could not import numpy, scipy, and/or scikit-learn.\n" 25 | "Please make sure you install the following Python3 " 26 | "packages: numpy, scipy and scikit-learn.\n" 27 | "See the installation docs for more details: " 28 | "http://hub-toolbox-python3.readthedocs.io/en/latest/" 29 | "user/installation.html#numpy-scipy-scikit-learn") 30 | 31 | from hub_toolbox import centering 32 | from hub_toolbox import distances 33 | from hub_toolbox import goodman_kruskal 34 | from hub_toolbox import hubness 35 | from hub_toolbox import hubness_analysis 36 | from hub_toolbox import intrinsic_dimension 37 | from hub_toolbox import io 38 | from hub_toolbox import knn_classification 39 | from hub_toolbox import local_scaling 40 | from hub_toolbox import htlogging 41 | from hub_toolbox import global_scaling 42 | from hub_toolbox import shared_neighbors 43 | -------------------------------------------------------------------------------- /hub_toolbox/centering.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | This file is part of the HUB TOOLBOX available at 6 | https://github.com/OFAI/hub-toolbox-python3/ 7 | The HUB TOOLBOX is licensed under the terms of the GNU GPLv3. 8 | 9 | (c) 2015-2018, Roman Feldbauer 10 | Austrian Research Institute for Artificial Intelligence (OFAI) 11 | Contact: 12 | """ 13 | import ctypes 14 | from multiprocessing import cpu_count, Pool, RawArray 15 | import numpy as np 16 | from sklearn.metrics.pairwise import euclidean_distances 17 | from hub_toolbox.distances import cosine_distance as cos 18 | from hub_toolbox import io 19 | from functools import partial 20 | 21 | __all__ = ['centering', 'weighted_centering', 'localized_centering', 22 | 'dis_sim_global', 'dis_sim_local'] 23 | 24 | def centering(X:np.ndarray, metric:str='vector', test_set_mask:np.ndarray=None): 25 | """ 26 | Perform centering, i.e. shift the origin to the data centroid. 27 | 28 | Centering of vector data `X` with ``n`` objects in an ``m``-dimensional 29 | feature space.
71 |     """
72 |     # Kernel based centering requires inner product similarities, NOT distances.
73 |     # Since the parameter was previously erroneously called 'distance',
74 |     # this is kept for compatibility reasons.
75 |     if metric in ('similarity', 'distance', 'inner', 'inner_product'):
76 |         if test_set_mask is not None:
77 |             raise NotImplementedError("Kernel based centering does not "
78 |                                       "support train/test splits so far.")
79 |         io.check_distance_matrix_shape(X)
80 |         n = X.shape[0]
81 |         H = np.identity(n) - np.ones((n, n)) / n
82 |         # K = X.T.X must be provided upstream
83 |         return H.dot(X).dot(H)
84 |     elif metric == 'vector':
85 |         n = X.shape[0]
86 |         if test_set_mask is None:
87 |             # center among all data
88 |             return X - np.mean(X, axis=0)
89 |         else:
90 |             # center among training data
91 |             train_ind = np.setdiff1d(np.arange(n), test_set_mask)
92 |             return X - np.mean(X[train_ind], axis=0)
93 |     else:
94 |         raise ValueError("Parameter 'metric' must be 'inner' or 'vector'.")
95 | 
96 | def weighted_centering(X:np.ndarray, metric:str='cosine', gamma:float=1.,
97 |                        test_set_mask:np.ndarray=None):
98 |     """
99 |     Perform weighted centering: shift origin to the weighted data mean
100 | 
101 |     Move the origin more actively towards hub objects in the dataset,
102 |     rather than towards the data centroid [1]_.
103 | 
104 |     Parameters
105 |     ----------
106 |     X : ndarray
107 |         An ``n x m`` vector data matrix with ``n`` objects in an
108 |         ``m``-dimensional feature space
109 | 
110 |     metric : {'cosine', 'euclidean'}, optional (default: 'cosine')
111 |         Distance measure used to place more weight on objects that are more
112 |         likely to become hubs. (Defined for 'cosine' in [1]_, 'euclidean' does
113 |         not make much sense and might be removed in the future).
114 | 
115 |     gamma : float, optional (default: 1.0)
116 |         Controls how much we emphasize the weighting effect
117 | 
118 |         - ``gamma=0`` : equivalent to normal centering
119 |         - ``gamma>0`` : move origin closer to objects with larger similarity
120 |           to other objects
121 | 
122 |     test_set_mask : ndarray, optional (default: None)
123 |         Hold back data as a test set and perform centering on the remaining
124 |         data (training set).
125 | 
126 |     Returns
127 |     -------
128 |     X_wcent : ndarray
129 |         Weighted centered vectors.
130 | 
131 |     References
132 |     ----------
133 |     .. [1] Suzuki, I., Hara, K., Shimbo, M., Saerens, M., & Fukumizu, K. (2013).
134 |            Centering similarity measures to reduce hubs. In Proceedings of the
135 |            2013 Conference on Empirical Methods in Natural Language Processing
136 |            (pp 613–623).
137 |            Retrieved from https://www.aclweb.org/anthology/D/D13/D13-1058.pdf
138 |     """
139 |     n = X.shape[0]
140 | 
141 |     # Indices of training examples
142 |     if test_set_mask is not None:
143 |         train_set_mask = np.setdiff1d(np.arange(n), test_set_mask)
144 |     else:
145 |         train_set_mask = slice(0, n)
146 | 
147 |     n_train = X[train_set_mask].shape[0]
148 |     d = np.zeros(n)
149 | 
150 |     if metric == 'cosine':
151 |         vectors_sum = X[train_set_mask].sum(axis=0)
152 |         for i in np.arange(n):
153 |             d[i] = n_train * cos(np.array([X[i], vectors_sum/n_train]))[0, 1]
154 |     # Using euclidean distances does not really make sense
155 |     elif metric == 'euclidean':
156 |         for i in range(n):
157 |             displ_v = X[train_set_mask] - X[i]
158 |             d[i] = np.sum(np.sqrt(np.sum(displ_v ** 2, axis=1)))
159 |     else:
160 |         raise ValueError("Parameter 'metric' must be 'cosine' or 'euclidean'.")
161 |     d_sum = np.sum(d ** gamma)
162 |     w = (d ** gamma) / d_sum
163 |     vectors_mean_weighted = np.sum(w.reshape(n, 1) * X, axis=0)
164 |     X_wcent = X - vectors_mean_weighted
165 |     return X_wcent
166 | 
167 | #===============================================================================
168 | # #=============================================================================
169 | # # LOCALIZED CENTERING
170 | # #=============================================================================
171 | #===============================================================================
172 | 
173 | def _lcent_load_shared_data(w_, sim_train_, local_affinity_):
174 |     global w, sim_train, local_affinity
175 |     w = w_
176 |     sim_train = sim_train_
177 |     local_affinity = local_affinity_
178 |     return
179 | 
180 | def _lcent_calculate_loc_af(i, kappa):
181 |     # Get the kappa nearest neighbors (highest similarity)
182 |     nn = np.argpartition(sim_train[i, :], kth=-kappa)[-1:-kappa-1:-1]
183 |     # Local centroid
184 |     c_kappa_x = w[nn, :].mean(axis=0)
185 |     local_affinity[i] = np.inner(w[i, :], c_kappa_x)
186 |     return
187 | 
188 | def localized_centering(X:np.ndarray, Y:np.ndarray=None,
189 |                         kappa:int=40, gamma:float=1., n_jobs:int=1):
190 |     """
191 |     Perform localized centering.
192 | 
193 |     Reduce hubness in datasets according to the method proposed in [2]_.
194 | 
195 |     Parameters
196 |     ----------
197 |     X : ndarray
198 |         An ``n x m`` vector data matrix with ``n`` objects in an
199 |         ``m``-dimensional feature space
200 | 
201 |     Y : ndarray, optional
202 |         If Y is provided, calculate similarities between all test data in `X`
203 |         versus all training data in `Y`.
204 | 
205 |     kappa : int, optional (default: 40)
206 |         Local segment size, determines the size of the local neighborhood for
207 |         calculating the local affinity. When ``kappa=n``, localized centering
208 |         reduces to standard centering.
209 | "select κ depending on the dataset, so that the correlation between 210 | Nk(x) and the local affinity is maximized" [2]_ 211 | 212 | gamma : float, optional (default: 1.0) 213 | Control the degree of penalty, so that used the similarity score 214 | is smaller depending on how likely a point is to become a hub. 215 | "Parameter γ can be tuned so as to maximally reduce the skewness 216 | of the Nk distribution" [2]_. 217 | 218 | n_jobs : int, optional 219 | Parallel execution 220 | 221 | Returns 222 | ------- 223 | S_lcent : ndarray 224 | Secondary similarity (localized centering) matrix. 225 | 226 | References 227 | ---------- 228 | .. [1] Suzuki, I., Hara, K., Shimbo, M., Saerens, M., & Fukumizu, K. (2013). 229 | Centering similarity measures to reduce hubs. In Proceedings of the 230 | 2013 Conference on Empirical Methods in Natural Language Processing 231 | (pp 613–623). 232 | Retrieved from https://www.aclweb.org/anthology/D/D13/D13-1058.pdf 233 | 234 | .. [2] Hara, K., Suzuki, I., Shimbo, M., Kobayashi, K., Fukumizu, K., & 235 | Radovanović, M. (2015). Localized centering: Reducing hubness in 236 | large-sample data hubness in high-dimensional data. In AAAI ’15: 237 | Proceedings of the 29th AAAI Conference on Artificial Intelligence 238 | (pp. 2645–2651). 239 | """ 240 | if n_jobs == -1: 241 | n_jobs = cpu_count() 242 | # Rescale vectors to unit length 243 | div_ = np.sqrt((X ** 2).sum(axis=-1))[..., np.newaxis] 244 | div_[div_ == 0] = 1e-7 245 | v = X / div_ 246 | if Y is None: # calc all-against-all in X 247 | w = v 248 | n, _ = X.shape 249 | sim = v.dot(w.T) 250 | sim_train = sim 251 | else: # calc sim from test data in X against train data in Y 252 | div_ = np.sqrt((Y ** 2).sum(axis=-1))[..., np.newaxis] 253 | div_[div_ == 0] = 1e-7 254 | w = Y / div_ 255 | n, _ = Y.shape 256 | sim = v.dot(w.T) 257 | sim_train = w.dot(w.T) 258 | 259 | if n_jobs > 1: 260 | local_affinity_ctype = RawArray(ctypes.c_double, n) 261 | local_affinity = np.frombuffer(local_affinity_ctype, dtype=np.float64) 262 | with Pool(processes=n_jobs, 263 | initializer=_lcent_load_shared_data, 264 | initargs=(w, sim_train, local_affinity)) as pool: 265 | for _ in pool.imap( 266 | func=partial(_lcent_calculate_loc_af, kappa=kappa), 267 | iterable=range(n)): 268 | pass # local_affinity is handled within func 269 | else: 270 | local_affinity = np.zeros(n) 271 | for i in range(n): 272 | # Get the kappa nearest neighbors (highest similarity) 273 | nn = np.argpartition(sim_train[i, :], kth=-kappa)[-1:-kappa-1:-1] 274 | # Local centroid 275 | c_kappa_x = w[nn, :].mean(axis=0) 276 | local_affinity[i] = np.inner(w[i, :], c_kappa_x) 277 | # Only change penalty, if all values are positive 278 | if gamma != 1 and (local_affinity < 0).sum() == 0: 279 | local_affinity **= gamma 280 | sim -= local_affinity 281 | return sim 282 | 283 | 284 | def dis_sim_global(X:np.ndarray, Y:np.ndarray=None): 285 | """ 286 | Calculate dissimilarity based on global 'sample-wise centrality' [1]_. 287 | 288 | Parameters 289 | ---------- 290 | X : ndarray 291 | An ``n x m`` vector data matrix with ``n`` objects in an 292 | ``m`` dimensional feature space 293 | 294 | Y : ndarray, optional 295 | If Y is provided, calculate dissimilarities between all test data 296 | in `X` and all training data in `Y`. 297 | 298 | Returns 299 | ------- 300 | D_dsg : ndarray 301 | Secondary dissimilarity (DisSimGlobal) matrix. 302 | 303 | References 304 | ---------- 305 | .. [1] Hara, K., Suzuki, I., Kobayashi, K., Fukumizu, K., & 306 | Radovanović, M. (2016). 
284 | def dis_sim_global(X:np.ndarray, Y:np.ndarray=None):
285 |     """
286 |     Calculate dissimilarity based on global 'sample-wise centrality' [1]_.
287 | 
288 |     Parameters
289 |     ----------
290 |     X : ndarray
291 |         An ``n x m`` vector data matrix with ``n`` objects in an
292 |         ``m``-dimensional feature space
293 | 
294 |     Y : ndarray, optional
295 |         If Y is provided, calculate dissimilarities between all test data
296 |         in `X` and all training data in `Y`.
297 | 
298 |     Returns
299 |     -------
300 |     D_dsg : ndarray
301 |         Secondary dissimilarity (DisSimGlobal) matrix.
302 | 
303 |     References
304 |     ----------
305 |     .. [1] Hara, K., Suzuki, I., Kobayashi, K., Fukumizu, K., &
306 |            Radovanović, M. (2016). Flattening the density gradient for
307 |            eliminating spatial centrality to reduce hubness. Proceedings of
308 |            the Thirtieth AAAI Conference on Artificial Intelligence (AAAI ’16),
309 |            1659–1665. Retrieved from http://www.aaai.org/ocs/index.php/AAAI/
310 |            AAAI16/paper/download/12055/11787
311 |     """
312 |     if Y is None:
313 |         Y = X
314 |     if X.shape[1] != Y.shape[1]:
315 |         raise ValueError("X and Y must have same number of features.")
316 |     c = Y.mean(0)
317 |     x_c = euclidean_distances(Y, c[np.newaxis, :], squared=True)
318 |     if id(X) == id(Y):  # i.e. no Y was provided
319 |         q_c = x_c
320 |     else:  # avoid duplicate calculations
321 |         q_c = euclidean_distances(X, c[np.newaxis, :], squared=True)
322 |     D_xq = euclidean_distances(X, Y, squared=True)
323 |     D_xq -= x_c.T
324 |     D_xq -= q_c
325 |     return D_xq
326 | 
327 | #===============================================================================
328 | # #=============================================================================
329 | # # DisSim LOCAL
330 | # #=============================================================================
331 | #===============================================================================
332 | 
333 | def _dsl_init(c_k_X_or_Y_, D_test_or_train_, Y_):
334 |     global c_k_X_or_Y, D_test_or_train, Y
335 |     c_k_X_or_Y = c_k_X_or_Y_
336 |     D_test_or_train = D_test_or_train_
337 |     Y = Y_
338 |     return
339 | 
340 | def _dsl_local_centroids(i, k):
341 |     knn_idx = np.argpartition(D_test_or_train[i, :], kth=k)[:k]
342 |     c_k_X_or_Y[i] = Y[knn_idx].mean(axis=0)
343 |     return
344 | 
345 | def dis_sim_local(X:np.ndarray, Y:np.ndarray=None, k:int=10, n_jobs:int=1):
346 |     """Calculate dissimilarity based on local 'sample-wise centrality' [1]_.
347 | 
348 |     Parameters
349 |     ----------
350 |     X : ndarray
351 |         An ``n x m`` vector data matrix with ``n`` objects in an
352 |         ``m``-dimensional feature space.
353 | 
354 |     Y : ndarray, optional
355 |         If Y is provided, calculate dissimilarities between all test data
356 |         in `X` and all training data in `Y`.
357 | 
358 |     k : int, optional (default: 10)
359 |         Neighborhood size used for determining the local centroids.
360 |         Can be optimized so as to maximally reduce hubness [1]_.
361 | 
362 |     n_jobs : int, optional
363 |         Parallel execution with multiple processes.
364 | 
365 |     Returns
366 |     -------
367 |     D_dsl : ndarray
368 |         Secondary dissimilarity (DisSimLocal) matrix.
369 | 
370 |     References
371 |     ----------
372 |     .. [1] Hara, K., Suzuki, I., Kobayashi, K., Fukumizu, K., &
373 |            Radovanović, M. (2016). Flattening the density gradient for
374 |            eliminating spatial centrality to reduce hubness. Proceedings of
375 |            the Thirtieth AAAI Conference on Artificial Intelligence (AAAI ’16),
376 |            1659–1665. Retrieved from http://www.aaai.org/ocs/index.php/AAAI/
377 |            AAAI16/paper/download/12055/11787
378 |     """
379 |     X = X.copy()
380 |     # all-against-all dissimilarities?
381 |     if Y is None:
382 |         Y = X
383 |     else:
384 |         Y = Y.copy()
385 | 
386 |     # dataset size and dimensionality
387 |     n_test, m_test = X.shape
388 |     n_train, m_train = Y.shape
389 |     if m_test != m_train:
390 |         raise ValueError("X and Y must have same number of features.")
391 | 
392 |     # Calc euclidean distances to find nearest neighbors among training data
393 |     D_train = euclidean_distances(Y, squared=True)
394 |     if id(Y) == id(X):
395 |         # Exclude self distances from kNN lists:
396 |         np.fill_diagonal(D_train, np.inf)
397 |         D_test = D_train
398 |     else:
399 |         # ... 
and between test and training data 400 | D_test = euclidean_distances(X, Y, squared=True) 401 | 402 | # Local centroid for each point among its k-nearest training neighbors 403 | if n_jobs > 1: 404 | c_k_X_ctype = RawArray(ctypes.c_double, X.size) 405 | c_k_X = np.frombuffer(c_k_X_ctype, dtype=np.float64).reshape(X.shape) 406 | with Pool(processes=n_jobs, 407 | initializer=_dsl_init, 408 | initargs=(c_k_X, D_test, Y)) as pool: 409 | for _ in pool.imap( 410 | func=partial(_dsl_local_centroids, k=k), 411 | iterable=range(n_test)): 412 | pass # handling inside function 413 | else: 414 | c_k_X = np.zeros_like(X) 415 | for i in range(n_test): 416 | knn_idx = np.argpartition(D_test[i, :], kth=k)[:k] 417 | c_k_X[i] = Y[knn_idx].mean(axis=0) 418 | X -= c_k_X 419 | X **= 2 420 | x_c_k = X.sum(axis=1) 421 | if id(Y) == id(X): 422 | c_k_Y = c_k_X 423 | y_c_k = x_c_k 424 | else: 425 | if n_jobs > 1: 426 | c_k_Y_ctype = RawArray(ctypes.c_double, Y.size) 427 | c_k_Y = np.frombuffer(c_k_Y_ctype, dtype=np.float64).reshape(Y.shape) 428 | with Pool(processes=n_jobs, 429 | initializer=_dsl_init, 430 | initargs=(c_k_Y, D_train, Y)) as pool: 431 | for _ in pool.imap( 432 | func=partial(_dsl_local_centroids, k=k), 433 | iterable=range(n_train)): 434 | pass # handling within function 435 | else: 436 | c_k_Y = np.zeros_like(Y) 437 | for i in range(n_train): 438 | knn_idx = np.argpartition(D_train[i, :], kth=k)[:k] 439 | c_k_Y[i] = Y[knn_idx].mean(axis=0) 440 | Y -= c_k_Y 441 | Y **= 2 442 | y_c_k = Y.sum(axis=1) 443 | # DisSimLocal 444 | x_y = D_test 445 | x_y -= x_c_k[:, np.newaxis] 446 | x_y -= y_c_k 447 | if id(Y) == id(X): 448 | np.fill_diagonal(x_y, -np.inf) 449 | return x_y 450 | 451 | if __name__ == '__main__': 452 | #vectors = np.arange(12).reshape(3,4) 453 | np.random.seed(47) 454 | VECT_DATA = np.random.rand(3, 4) 455 | print("Vectors: ............... \n{}". 456 | format(VECT_DATA)) 457 | print("Centering: ............. \n{}". 458 | format(centering(VECT_DATA, 'vector'))) 459 | print("Weighted centering: .... \n{}". 460 | format(weighted_centering(VECT_DATA, 'cosine', 0.4))) 461 | print("Localized centering: ... \n{}". 462 | format(localized_centering(VECT_DATA, kappa=2, gamma=1))) 463 | print("DisSim (global): ....... \n{}". 464 | format(dis_sim_global(VECT_DATA))) 465 | print("DisSim (local): ........ \n{}". 466 | format(dis_sim_local(VECT_DATA, k=2))) 467 | -------------------------------------------------------------------------------- /hub_toolbox/distances.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | This file is part of the HUB TOOLBOX available at 6 | https://github.com/OFAI/hub-toolbox-python3/ 7 | The HUB TOOLBOX is licensed under the terms of the GNU GPLv3. 
8 | 
9 | (c) 2011-2018, Dominik Schnitzer, Roman Feldbauer
10 | Austrian Research Institute for Artificial Intelligence (OFAI)
11 | Contact:
12 | """
13 | import ctypes
14 | from multiprocessing import Pool, cpu_count, RawArray
15 | import numpy as np
16 | from scipy.spatial.distance import cdist, pdist, squareform
17 | try:  # for scikit-learn >= 0.18
18 |     from sklearn.model_selection import StratifiedShuffleSplit
19 | except ImportError:  # lower scikit-learn versions
20 |     from sklearn.cross_validation import StratifiedShuffleSplit
21 | from sklearn.metrics.pairwise import pairwise_distances
22 | from hub_toolbox.io import check_vector_matrix_shape_fits_labels
23 | from hub_toolbox.htlogging import ConsoleLogging
24 | 
25 | __all__ = ['cosine_distance', 'euclidean_distance',
26 |            'lp_norm', 'sample_distance']
27 | 
28 | def cosine_distance(X):
29 |     """Calculate the cosine distance between all pairs of vectors in `X`."""
30 |     xn = np.sqrt(np.sum(X**2, 1))
31 |     Y = X / xn[:, np.newaxis]
32 |     del xn
33 |     D = 1. - Y.dot(Y.T)
34 |     del Y
35 |     D[D < 0] = 0
36 |     D = np.triu(D, 1) + np.triu(D, 1).T
37 |     return D
38 | 
39 | def euclidean_distance(X):
40 |     """Calculate the euclidean distances between all pairs of vectors in `X`.
41 | 
42 |     Consider using sklearn.metrics.pairwise.euclidean_distances for faster,
43 |     but less accurate distances (which are also not necessarily symmetric)."""
44 |     return squareform(pdist(X, 'euclidean'))
45 | 
46 | def lp_norm(X:np.ndarray, Y:np.ndarray=None, p:float=None, n_jobs:int=1):
47 |     """Calculate Minkowski distances with L^p norm.
48 | 
49 |     Calculate distances between all pairs of vectors within `X`, if `Y` is None.
50 |     Otherwise calculate distances between all vectors in `X` and
51 |     all vectors in `Y`. For example, this is useful if only distances from
52 |     test data to training data are required.
53 | 
54 |     Parameters
55 |     ----------
56 |     X : ndarray
57 |         Vector data (e.g. test set)
58 | 
59 |     Y : ndarray, optional, default: None
60 |         Vector data (e.g. training set)
61 | 
62 |     p : float, default: None
63 |         Minkowski norm
64 | 
65 |     n_jobs : int, default: 1
66 |         Parallel computation with multiple processes. See the scikit-learn
67 |         docs for more details.
68 | 69 | Returns 70 | ------- 71 | D : ndarray 72 | Distance matrix based on Lp-norm 73 | 74 | See also 75 | -------- 76 | http://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.pairwise_distances.html 77 | """ 78 | if p is None: 79 | raise ValueError("Please define the `p` parameter for lp_norm().") 80 | elif p == 1.: # Use efficient version for cityblock distances 81 | return pairwise_distances(X=X, Y=Y, metric='l1', 82 | n_jobs=n_jobs) 83 | elif p == 2.: # Use efficient version for Euclidean distances 84 | return pairwise_distances(X=X, Y=Y, metric='l2', 85 | n_jobs=n_jobs) 86 | else: # Use general, less efficient version for general Minkowski distances 87 | return pairwise_distances(X=X, Y=Y, metric='minkowski', 88 | n_jobs=n_jobs, **{'p' : p}) 89 | 90 | #=============================================================================== 91 | # #============================================================================= 92 | # # 93 | # # m_p dissimilarity 94 | # # 95 | # #============================================================================= 96 | #=============================================================================== 97 | def _mp_load_shared_Y(Y_, n_bins_): 98 | global Y, n_bins 99 | Y = Y_ 100 | n_bins = n_bins_ 101 | 102 | def _mp_load_shared_data(X_, Y_, p_, n_bins_, R_bins_, R_bins_np_, 103 | X_bins_, X_bins_np_, Y_bins_, Y_bins_np_, mp_, mp_np_): 104 | global X, Y, n_bins, n_x, n_y, d, p 105 | global X_bins, X_bins_np, Y_bins, Y_bins_np, R_bins, R_bins_np, mp, mp_np 106 | X = X_ 107 | Y = Y_ 108 | n_bins = n_bins_ 109 | n_x, d = X.shape 110 | n_y = Y.shape[0] 111 | p = p_ 112 | R_bins = R_bins_ 113 | R_bins_np = R_bins_np_ 114 | X_bins = X_bins_ 115 | X_bins_np = X_bins_np_ 116 | Y_bins = Y_bins_ 117 | Y_bins_np = Y_bins_np_ 118 | mp = mp_ 119 | mp_np = mp_np_ 120 | 121 | def _mp_find_bin_edges(i): 122 | return np.partition(Y[:, i], kth=kth)[kth] 123 | 124 | def _mp_calc_histograms(i): 125 | bins = _mp_find_bin_edges(i) 126 | return np.histogram(Y[:, i], bins=bins) 127 | 128 | def _mp_calc_histograms_n_bins(i): 129 | return np.histogram(Y[:, i], bins=n_bins) 130 | 131 | def _mp_create_r_bins(i): 132 | hist, _ = histograms[i] 133 | for b in range(n_bins): 134 | R_bins_np[i, b, b:] = np.cumsum(hist[b:]) 135 | R_bins_np[i] += np.triu(R_bins_np[i], k=1).T 136 | return 137 | 138 | def _mp_estimate_r(i): 139 | # Binning. Values outside the range are binned into the first/last bin 140 | _, bin_edges = histograms[i] 141 | bin_x = np.digitize(X[:, i], bins=bin_edges) 142 | bin_x -= 1 143 | np.clip(bin_x, 0, n_bins-1, out=bin_x) 144 | bin_y = np.digitize(Y[:, i], bins=bin_edges) 145 | bin_y -= 1 146 | np.clip(bin_y, 0, n_bins-1, out=bin_y) 147 | X_bins_np[i, :] = bin_x 148 | Y_bins_np[i, :] = bin_y 149 | return 150 | 151 | def _mp_calc_mp_dissim(x): 152 | mp_xy = np.zeros(n_y, dtype=float) 153 | for i in range(d): 154 | tmp = R_bins_np[i, X_bins_np[i, x], Y_bins_np[i, :]] / (n_x + n_y) 155 | tmp **= p 156 | mp_xy += tmp 157 | mp_xy /= d 158 | mp_xy **= (1. / p) 159 | mp_np[x, :] = mp_xy 160 | return 161 | 162 | def mp_dissim(X:np.ndarray, Y:np.ndarray=None, p:float=2, 163 | n_bins:int=0, bin_size:str='range', n_jobs:int=1, verbose:int=0): 164 | """ Calculate m_p dissimilarity. 165 | 166 | The data-dependent m_p dissimilarity measure considers the relative 167 | positions of objects x and y with respect to the rest of the data 168 | distribution in each dimension [1]_. 169 | 170 | Parameters 171 | ---------- 172 | X : ndarray 173 | Vector data (e.g. 
test set), shape (n_x, d)
174 | 
175 |     Y : ndarray, optional, default: None
176 |         Vector data (e.g. training set), shape (n_y, d).
177 |         Number of features ``d`` must be equal in `X` and `Y`.
178 | 
179 |     p : float, optional, default: 2
180 |         Parameter, similar to `p` in Minkowski norm
181 | 
182 |     n_bins : int, optional, default: 0
183 |         Number of bins for probability mass estimation
184 | 
185 |     bin_size : str, optional, default: 'range'
186 |         Strategy for binning. May be one of:
187 |         'range' ... create bins with uniform range length
188 |         'mass' ... create bins with approx. uniform mass
189 | 
190 |     n_jobs : int, optional, default: 1
191 |         Parallel computation with multiple processes.
192 | 
193 |     verbose : int, optional, default: 0
194 |         Increasing level of output
195 | 
196 |     Returns
197 |     -------
198 |     D : ndarray, shape (X.shape[0], Y.shape[0])
199 |         m_p dissimilarity matrix
200 | 
201 |     References
202 |     ----------
203 |     .. [1] Aryal et al. (2017). Data-dependent dissimilarity measure: an
204 |            effective alternative to geometric distance measures.
205 |            Knowledge and Information Systems, Springer-Verlag London.
206 |     """
207 |     # Some preparation
208 |     n_x, d = X.shape
209 |     # All-against-all in X, or X against Y?
210 |     if Y is None:
211 |         Y = X
212 |     n_y, d_y = Y.shape
213 |     # X and Y must have same dimensionality
214 |     assert d == d_y
215 |     if n_jobs == -1:
216 |         n_jobs = cpu_count()
217 |     n_bins = int(n_bins)
218 |     if p == 0:
219 |         log = ConsoleLogging()
220 |         log.warning('Got mpDisSim parameter p=0. Changed to default '
221 |                     'value p=2 instead, in order to avoid zero division.')
222 |         p = 2.
223 | 
224 |     # RawArrays have no locks. Must take EXTREME CARE!!
225 |     R_bins = RawArray(ctypes.c_int32, d * n_bins * n_bins)
226 |     R_bins_np = np.frombuffer(R_bins, dtype=np.int32).reshape((d, n_bins, n_bins))
227 |     X_bins = RawArray(ctypes.c_int32, d * n_x)
228 |     X_bins_np = np.frombuffer(X_bins, dtype=np.int32).reshape((d, n_x))
229 |     Y_bins = RawArray(ctypes.c_int32, d * n_y)
230 |     Y_bins_np = np.frombuffer(Y_bins, dtype=np.int32).reshape((d, n_y))
231 |     mp = RawArray(ctypes.c_double, n_x * n_y)
232 |     mp_np = np.frombuffer(mp).reshape((n_x, n_y))
233 | 
234 |     global histograms, kth
235 |     kth = np.arange(0, n_y)[0:n_y:int(n_y/n_bins)]
236 |     if kth[-1] != n_y - 1:
237 |         kth = np.append(kth, n_y-1)
238 |     if verbose:
239 |         print("Creating bins for estimating probability data mass.")
240 |     with Pool(processes=n_jobs,
241 |               initializer=_mp_load_shared_Y,
242 |               initargs=(Y, n_bins)) as pool:
243 |         if 'mass'.startswith(bin_size):
244 |             histograms = pool.map(func=_mp_calc_histograms,
245 |                                   iterable=range(d))
246 |         elif 'range'.startswith(bin_size):
247 |             histograms = pool.map(func=_mp_calc_histograms_n_bins,
248 |                                   iterable=range(d))
249 |         else:
250 |             raise ValueError("'{}' is not a valid value for `bin_size`. "
251 |                              "Please use 'range' or 'mass'.".format(bin_size))
252 |     # The second pool needs `histograms`
253 |     with Pool(processes=n_jobs,
254 |               initializer=_mp_load_shared_data,
255 |               initargs=(X, Y, p, n_bins, R_bins, R_bins_np, X_bins, X_bins_np,
256 |                         Y_bins, Y_bins_np, mp, mp_np)) as pool:
257 |         pool.map(func=_mp_create_r_bins, iterable=range(d))
258 |         if verbose:
259 |             print("Estimating probability data mass in all regions R_i(x,y).")
260 |         pool.map(func=_mp_estimate_r, iterable=range(d))
261 |         if verbose:
262 |             print("Calculating m_p dissimilarity for all pairs x, y.")
263 |         pool.map(func=_mp_calc_mp_dissim, iterable=range(n_x))
264 |     if verbose:
265 |         print("Done.")
266 |     return mp_np
267 | 
268 | 
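# A usage sketch (hypothetical arrays X_test, X_train; n_bins must be set
# to a positive value, since the default of 0 cannot be used for binning):
#     D_mp = mp_dissim(X=X_test, Y=X_train, p=2., n_bins=10, n_jobs=4)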
" 251 | "Please use 'range' or 'mass'.".format(bin_size)) 252 | # The second pool needs `histograms` 253 | with Pool(processes=n_jobs, 254 | initializer=_mp_load_shared_data, 255 | initargs=(X, Y, p, n_bins, R_bins, R_bins_np, X_bins, X_bins_np, 256 | Y_bins, Y_bins_np, mp, mp_np)) as pool: 257 | pool.map(func=_mp_create_r_bins, iterable=range(d)) 258 | if verbose: 259 | print("Estimating probability data mass in all regions R_i(x,y).") 260 | pool.map(func=_mp_estimate_r, iterable=range(d)) 261 | if verbose: 262 | print("Calculating m_p dissimilarity for all pairs x, y.") 263 | pool.map(func=_mp_calc_mp_dissim, iterable=range(n_x)) 264 | if verbose: 265 | print("Done.") 266 | return mp_np 267 | 268 | 269 | def sample_distance(X, y, sample_size, metric='euclidean', strategy='a', 270 | random_state=None): 271 | """Calculate incomplete distance matrix. 272 | 273 | Parameters 274 | ---------- 275 | X : ndarray 276 | Input vector data. 277 | 278 | y : ndarray 279 | Input labels (used for stratified sampling). 280 | 281 | sample_size : int or float 282 | If float, must be between 0.0 and 1.0 and represent the proportion of 283 | the dataset for which distances should be calculated to. 284 | If int, represents the absolute number of sample distances. 285 | NOTE: See also the notes to the return value `y_sample`! 286 | 287 | metric : any scipy.spatial.distance.cdist metric (default: 'euclidean') 288 | Metric used to calculate distances. 289 | 290 | strategy : 'a', 'b' (default: 'a') 291 | 292 | - 'a': Stratified sampling, for all points the distances to the 293 | same points are chosen. 294 | - 'b': Stratified sampling, for each point it is chosen independently, 295 | to which other points distances are calculated. 296 | NOTE: currently not implemented. 297 | 298 | random_state : int or RandomState 299 | Pseudo-random number generator state used for random sampling. 300 | 301 | Returns 302 | ------- 303 | D : ndarray 304 | The ``n x s`` distance matrix, where ``n`` is the dataset size and 305 | ``s`` is the sample size. 306 | 307 | y_sample : ndarray 308 | The index array that determines, which column in `D` corresponds 309 | to which data point. 310 | 311 | NOTE: The size of `y_sample` may be slightly higher than defined by 312 | `sample_size` in order to meet stratification requirements! 313 | Thus, please always check the size in the downstream workflow. 314 | 315 | Notes 316 | ----- 317 | Only calculate distances to a fixed number/fraction of all ``n`` points. 318 | These ``s`` points are sampled according to the chosen strategy (see above). 319 | In other words, calculate the distance from all points to each point 320 | in the sample to obtain a ``n x s`` distance matrix. 
321 | 322 | """ 323 | check_vector_matrix_shape_fits_labels(X, y) 324 | n = X.shape[0] 325 | if not isinstance(sample_size, int): 326 | sample_size = int(sample_size * n) 327 | if strategy == 'a': 328 | try: # scikit-learn == 0.18 329 | sss = StratifiedShuffleSplit(n_splits=1, test_size=sample_size, 330 | random_state=random_state) 331 | _, y_sample = sss.split(X=X, y=y) 332 | except ValueError: # scikit-learn >= 0.18.1 333 | _, y_sample = next(sss.split(X=X, y=y)) 334 | except TypeError: # scikit-learn < 0.18 335 | sss = StratifiedShuffleSplit(y=y, n_iter=1, test_size=sample_size, 336 | random_state=random_state) 337 | _, y_sample = next(iter(sss)) 338 | elif strategy == 'b': 339 | raise NotImplementedError("Strategy 'b' is not yet implemented.") 340 | #======================================================================= 341 | # y_sample = np.zeros((n, sample_size)) 342 | # try: # scikit-learn >= 0.18 343 | # for i in range(n): 344 | # sss = StratifiedShuffleSplit(n_splits=1, test_size=sample_size) 345 | # _, y_sample[i, :] = sss.split(X=y, y=y) 346 | # except TypeError: # scikit-learn < 0.18 347 | # for i in range(n): 348 | # sss = StratifiedShuffleSplit(y=y, n_iter=1, test_size=sample_size) 349 | # _, y_sample[i, :] = next(iter(sss)) 350 | # # TODO will need to adapt cdist call below... 351 | #======================================================================= 352 | else: 353 | raise NotImplementedError("Strategy", strategy, "unknown.") 354 | 355 | D = cdist(X, X[y_sample, :], metric=metric) 356 | return D, y_sample 357 | -------------------------------------------------------------------------------- /hub_toolbox/example_datasets/ABOUT: -------------------------------------------------------------------------------- 1 | DEXTER is a text classification problem in a bag-of-word representation. This 2 | is a two-class classification problem with sparse continuous input variables. 3 | This dataset is one of five datasets of the NIPS 2003 feature selection 4 | challenge. 
5 | 6 | http://archive.ics.uci.edu/ml/datasets/Dexter 7 | -------------------------------------------------------------------------------- /hub_toolbox/example_datasets/dexter_train.labels: -------------------------------------------------------------------------------- 1 | 1 2 | -1 3 | 1 4 | -1 5 | 1 6 | -1 7 | 1 8 | -1 9 | 1 10 | 1 11 | 1 12 | 1 13 | -1 14 | 1 15 | 1 16 | 1 17 | -1 18 | 1 19 | -1 20 | -1 21 | 1 22 | -1 23 | 1 24 | 1 25 | 1 26 | 1 27 | 1 28 | -1 29 | -1 30 | -1 31 | 1 32 | -1 33 | -1 34 | 1 35 | 1 36 | 1 37 | 1 38 | -1 39 | 1 40 | -1 41 | -1 42 | -1 43 | -1 44 | 1 45 | -1 46 | -1 47 | -1 48 | -1 49 | -1 50 | 1 51 | -1 52 | -1 53 | 1 54 | -1 55 | -1 56 | -1 57 | 1 58 | 1 59 | 1 60 | 1 61 | 1 62 | -1 63 | -1 64 | -1 65 | -1 66 | -1 67 | 1 68 | -1 69 | 1 70 | -1 71 | 1 72 | -1 73 | -1 74 | -1 75 | 1 76 | 1 77 | 1 78 | 1 79 | 1 80 | -1 81 | -1 82 | -1 83 | -1 84 | -1 85 | 1 86 | 1 87 | 1 88 | 1 89 | -1 90 | -1 91 | -1 92 | -1 93 | 1 94 | -1 95 | 1 96 | -1 97 | -1 98 | 1 99 | 1 100 | -1 101 | 1 102 | 1 103 | -1 104 | -1 105 | 1 106 | 1 107 | 1 108 | 1 109 | -1 110 | -1 111 | -1 112 | 1 113 | 1 114 | -1 115 | 1 116 | 1 117 | -1 118 | -1 119 | 1 120 | 1 121 | -1 122 | 1 123 | -1 124 | -1 125 | 1 126 | 1 127 | 1 128 | -1 129 | -1 130 | 1 131 | 1 132 | 1 133 | -1 134 | -1 135 | 1 136 | 1 137 | -1 138 | -1 139 | 1 140 | -1 141 | 1 142 | 1 143 | 1 144 | -1 145 | -1 146 | -1 147 | 1 148 | 1 149 | -1 150 | -1 151 | 1 152 | -1 153 | 1 154 | -1 155 | 1 156 | -1 157 | -1 158 | 1 159 | 1 160 | -1 161 | 1 162 | -1 163 | 1 164 | -1 165 | -1 166 | 1 167 | -1 168 | 1 169 | 1 170 | -1 171 | 1 172 | -1 173 | 1 174 | -1 175 | -1 176 | -1 177 | 1 178 | -1 179 | 1 180 | 1 181 | 1 182 | 1 183 | -1 184 | -1 185 | 1 186 | -1 187 | 1 188 | 1 189 | 1 190 | -1 191 | -1 192 | 1 193 | -1 194 | -1 195 | 1 196 | -1 197 | -1 198 | -1 199 | 1 200 | -1 201 | -1 202 | 1 203 | 1 204 | -1 205 | 1 206 | -1 207 | 1 208 | 1 209 | -1 210 | 1 211 | 1 212 | -1 213 | -1 214 | -1 215 | 1 216 | -1 217 | -1 218 | 1 219 | 1 220 | -1 221 | 1 222 | -1 223 | -1 224 | -1 225 | -1 226 | 1 227 | 1 228 | 1 229 | 1 230 | 1 231 | 1 232 | 1 233 | -1 234 | -1 235 | 1 236 | -1 237 | -1 238 | 1 239 | 1 240 | -1 241 | 1 242 | 1 243 | -1 244 | -1 245 | -1 246 | 1 247 | 1 248 | 1 249 | -1 250 | 1 251 | 1 252 | -1 253 | 1 254 | -1 255 | -1 256 | -1 257 | -1 258 | 1 259 | -1 260 | 1 261 | 1 262 | -1 263 | -1 264 | 1 265 | 1 266 | -1 267 | -1 268 | 1 269 | 1 270 | 1 271 | -1 272 | -1 273 | -1 274 | -1 275 | 1 276 | 1 277 | 1 278 | 1 279 | 1 280 | -1 281 | -1 282 | 1 283 | 1 284 | -1 285 | -1 286 | 1 287 | 1 288 | -1 289 | 1 290 | -1 291 | -1 292 | 1 293 | 1 294 | 1 295 | -1 296 | -1 297 | -1 298 | -1 299 | 1 300 | -1 301 | -------------------------------------------------------------------------------- /hub_toolbox/goodman_kruskal.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | This file is part of the HUB TOOLBOX available at 5 | https://github.com/OFAI/hub-toolbox-python3/ 6 | The HUB TOOLBOX is licensed under the terms of the GNU GPLv3. 
7 | 8 | (c) 2011-2018, Dominik Schnitzer, Roman Feldbauer 9 | Austrian Research Institute for Artificial Intelligence (OFAI) 10 | Contact: 11 | """ 12 | import sys 13 | import numpy as np 14 | from scipy.sparse import csr_matrix, lil_matrix 15 | from hub_toolbox import io 16 | 17 | __all__ = ['goodman_kruskal_index', 'sparse_goodman_kruskal_index'] 18 | 19 | def goodman_kruskal_index(D:np.ndarray, classes:np.ndarray, 20 | metric:str='distance') -> float: 21 | """Calculate the Goodman-Kruskal clustering index. 22 | 23 | Parameters 24 | ---------- 25 | D : ndarray 26 | The ``n x n`` symmetric distance (similarity) matrix. 27 | 28 | classes : ndarray 29 | The ``1 x n`` vector of class labels for each point. 30 | 31 | metric : {'distance', 'similarity'}, optional (default: 'distance') 32 | Define, whether the matrix `D` is a distance or similarity matrix 33 | 34 | Returns 35 | ------- 36 | gamma : float 37 | Goodman-Kruskal index in ``[-1, 1]`` (higher=better) 38 | 39 | Notes 40 | ----- 41 | This clustering quality measure relates the number of concordant (`Q_c`) 42 | and discordant (`Q_d`) quadruples (`d_ij`, `d_kl`) of a distance matrix. 43 | We only consider tuples, so that `i`, `j` are from the same class 44 | and `k`, `l` are from different classes. Then a quadruple is... 45 | concordant, if 46 | 47 | .. math:: 48 | d_{i,j} < d_{k,l} 49 | 50 | discordant, if 51 | 52 | .. math:: 53 | d_{i,j} > d_{k,l} 54 | 55 | and not counted, otherwise. 56 | 57 | The Goodman-Kruskal index gamma is then defined as: 58 | 59 | .. math:: 60 | gamma = \\frac{Q_c - Q_d}{Q_c + Q_d} 61 | 62 | `gamma` is bounded to ``[-1, 1]``, where larger values indicate better 63 | clustering. 64 | """ 65 | 66 | # Checking input 67 | io.check_distance_matrix_shape(D) 68 | io.check_distance_matrix_shape_fits_labels(D, classes) 69 | io.check_valid_metric_parameter(metric) 70 | 71 | # Calculations 72 | Q_c = 0.0 73 | Q_d = 0.0 74 | cls = np.unique(classes) 75 | 76 | # D_kl pairs in different classes 77 | other = classes[:, np.newaxis] != classes[np.newaxis, :] 78 | D_other = D[np.triu(other, 1)] 79 | 80 | for c in cls: 81 | sel = classes == c 82 | if np.sum(sel) > 1: 83 | sel = sel[:, np.newaxis].astype(bool) 84 | selD = np.logical_and(sel, sel.T) 85 | # D_ij pairs within same class 86 | D_self = D[np.triu(selD, 1).astype(bool).T].T 87 | else: 88 | # skip if there is only one item per class 89 | continue 90 | # D_kl pairs in different classes (D_other) are computed once for all c 91 | D_full = np.append(D_self, D_other) 92 | 93 | self_size = np.max(np.shape(D_self)) 94 | other_size = np.max(np.shape(D_other)) 95 | # Sort algorithm must be stable! 
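        # (mergesort is stable: entries with equal distance keep their input
        #  order, which keeps the equidistance correction below deterministic)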
96 |         full_idx = np.argsort(D_full, kind='mergesort')[::-1]
97 | 
98 |         # Calc number of quadruples with equal distance
99 |         n_equidistant = 0
100 |         sdf = np.sort(D_full, axis=None)
101 |         equi_mask = np.zeros(sdf.size, dtype=bool)
102 |         # Positions with repeated values
103 |         equi_mask[1:] = sdf[1:] == sdf[:-1]
104 |         equi_dist = sdf[equi_mask]
105 |         # How often does each value occur in self/other:
106 |         for dist in np.unique(equi_dist):
107 |             equi_arg = np.where(D_full == dist)[0]
108 |             self_equi = (equi_arg < self_size).sum()
109 |             other_equi = len(equi_arg) - self_equi
110 |             # Number of dc that are actually equal
111 |             n_equidistant += self_equi * other_equi
112 | 
113 |         # Calc number of concordant quadruples
114 |         cc = 0
115 |         ccsize = other_size
116 |         for idx in full_idx:
117 |             if idx < self_size:
118 |                 cc += ccsize
119 |             else:
120 |                 ccsize -= 1
121 | 
122 |         # Calc number of discordant quadruples
123 |         dc = self_size * other_size - cc - n_equidistant
124 | 
125 |         Q_c += cc
126 |         Q_d += dc
127 | 
128 |     # Calc Goodman-Kruskal's gamma
129 |     if Q_c + Q_d == 0:
130 |         gamma = 0.0
131 |     else:
132 |         if metric == 'similarity':
133 |             gamma = (Q_c - Q_d) / (Q_c + Q_d)
134 |         else:
135 |             gamma = (Q_d - Q_c) / (Q_c + Q_d)
136 | 
137 |     return gamma
138 | 
139 | def sparse_goodman_kruskal_index(S:csr_matrix, classes:np.ndarray,
140 |                                  metric='similarity', zero_mv:bool=False,
141 |                                  heuristic:str=None, verbose:int=0) -> float:
142 |     """Calculate the Goodman-Kruskal clustering index.
143 | 
144 |     Parameters
145 |     ----------
146 |     S : csr_matrix
147 |         The ``n x n`` symmetric similarity matrix.
148 | 
149 |     classes : ndarray
150 |         The ``1 x n`` vector of class labels for each point.
151 | 
152 |     metric : {'similarity', 'distance'}, optional (default: 'similarity')
153 |         Define whether the matrix `S` is a distance or similarity matrix.
154 | 
155 |         NOTE: 'distance' is used for debugging purposes only. Use the standard
156 |         goodman_kruskal_index function for distance matrices.
157 | 
158 |     zero_mv : boolean, optional (default: False)
159 |         Treat zeros as missing values, i.e. tuples with any zero
160 |         similarities are not counted.
161 | 
162 |     heuristic : {None, 'equal_sim'}, optional (default: None)
163 |         * None - Exact GK
164 |         * 'equal_sim' - omit the expensive search for equal similarities.
165 |           Useful when no/few equal similarities are expected.
166 |           Do NOT use in case of SharedNN matrices!
167 | 
168 |         NOTE: Equal zero similarities are still considered
169 |         when using the heuristic.
170 | 
171 |     verbose : int, optional (default: 0)
172 |         Increasing level of output (progress report).
173 | 
174 |     Returns
175 |     -------
176 |     gamma : float
177 |         Goodman-Kruskal index in ``[-1, 1]`` (higher=better)
178 | 
179 |     Notes
180 |     -----
181 |     This clustering quality measure relates the number of concordant (`Q_c`)
182 |     and discordant (`Q_d`) quadruples (`d_ij`, `d_kl`) of a distance matrix.
183 |     We only consider tuples such that `i`, `j` are from the same class
184 |     and `k`, `l` are from different classes. Then a quadruple is...
185 |     concordant, if
186 | 
187 |     .. math::
188 |         d_{i,j} < d_{k,l}
189 | 
190 |     discordant, if
191 | 
192 |     .. math::
193 |         d_{i,j} > d_{k,l}
194 | 
195 |     and not counted, otherwise.
196 | 
197 |     The Goodman-Kruskal index gamma is then defined as:
198 | 
199 |     .. math::
200 |         gamma = \\frac{Q_c - Q_d}{Q_c + Q_d}
201 | 
202 |     `gamma` is bounded to ``[-1, 1]``, where larger values indicate better
203 |     clustering.
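
    Examples
    --------
    A sketch with a tiny toy similarity matrix (hypothetical values; two
    objects of class 0, one of class 1; all intra-class similarities exceed
    the inter-class ones, so gamma is maximal):

    >>> import numpy as np
    >>> from scipy.sparse import csr_matrix
    >>> S = csr_matrix(np.array([[1. , 0.9, 0.2],
    ...                          [0.9, 1. , 0.1],
    ...                          [0.2, 0.1, 1. ]]))
    >>> classes = np.array([0, 0, 1])
    >>> sparse_goodman_kruskal_index(S, classes)  # doctest: +SKIP
    1.0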
204 | """ 205 | 206 | # Checking input 207 | io.check_distance_matrix_shape(S) 208 | io.check_distance_matrix_shape_fits_labels(S, classes) 209 | io.check_valid_metric_parameter(metric) 210 | 211 | if verbose: 212 | print("Sparse Goodman-Kruskal") 213 | sys.stdout.write("----------------------") 214 | print(flush=True) 215 | # Calculations 216 | Qc = 0.0 217 | Qd = 0.0 218 | n = classes.size 219 | 220 | # S_kl pairs in different classes 221 | S_other_list = lil_matrix((n, n)) 222 | other_nnz = 0 223 | # building the complete mask at once would result in dense N x N matrix 224 | if verbose >= 2: 225 | print("Finding S_kl pairs with different class labels...", 226 | end=' ', flush=True) 227 | for i, c in enumerate(classes): 228 | cur_other = csr_matrix((c != classes)[i+1:]) 229 | other_nnz += cur_other.nnz 230 | S_other_list[i, :cur_other.shape[1]] = \ 231 | S[i, i+1:].multiply(cur_other) 232 | n_other_zeros = other_nnz - S_other_list.nnz 233 | # The following might be achieved faster w/o csr intermediate 234 | S_other = S_other_list.tocsr().data 235 | del S_other_list, cur_other 236 | if verbose >= 2: 237 | print("done.", flush=True) 238 | 239 | cls = np.unique(classes) 240 | for c in cls: 241 | if verbose == 1:# and c % 10 == 0: 242 | # end='\r' does not work with jupyter notebook 243 | print("Class: {}/{}".format(c, len(cls)), end='') 244 | sel = classes == c 245 | if np.sum(sel) > 1: 246 | if verbose >= 2: 247 | print("Finding S_ij pairs for class {}..." 248 | .format(c), end=' ') 249 | n = sel.size 250 | # intra-class distances 251 | S_self_list = lil_matrix((n, n)) 252 | self_nnz = 0 253 | 254 | # Only visit points of self class 255 | sel_arg = np.where(sel > 0)[0] 256 | for i in sel_arg: 257 | cur_self = csr_matrix(sel[i+1:]) 258 | self_nnz += cur_self.nnz 259 | S_self_list[i, :cur_self.shape[1]] = \ 260 | S[i, i+1:].multiply(cur_self) 261 | 262 | n_self_zeros = self_nnz - S_self_list.nnz 263 | # Same as with S_other 264 | S_self = S_self_list.tocsr().data 265 | del S_self_list, cur_self 266 | if verbose >= 2: 267 | print("done.") 268 | else: 269 | # skip if there is only one item per class 270 | if verbose == 1: # and c % 10 == 0: 271 | sys.stdout.write('\r') 272 | continue 273 | 274 | # S_kl pairs in different classes are computed once for all c 275 | if verbose >= 2: 276 | print("Sorting data...", end=' ') 277 | S_full_data = np.append(S_self, S_other) 278 | 279 | self_data_size = S_self.size 280 | self_size = S_self.size + n_self_zeros 281 | other_data_size = S_other.size 282 | other_size = S_other.size + n_other_zeros 283 | full_data_idx = np.argsort(S_full_data, kind='mergesort')[::-1] 284 | del S_self 285 | if verbose >= 2: 286 | print("done.", flush=True) 287 | 288 | # Calc number of quadruples with equal distance 289 | if verbose >= 2: 290 | print("Calculating number of quadruples with equal distance...", 291 | end=' ') 292 | n_equidistant = 0 293 | # Number of equal zero similarities 294 | if zero_mv: 295 | n_zero = 0 296 | else: 297 | n_zero = n_self_zeros * n_other_zeros 298 | if heuristic == 'equal_sim': 299 | if verbose >= 2: 300 | print("OMITTED (heuristic).") 301 | else: 302 | pass 303 | else: 304 | sdf = np.sort(S_full_data, axis=None) 305 | equi_mask = np.zeros(sdf.size, dtype=bool) 306 | # Positions with repeated values 307 | equi_mask[1:] = sdf[1:] == sdf[:-1] 308 | equi_dist = sdf[equi_mask] 309 | equi_arg = 0 310 | # How often does each value occur in self/other: 311 | for dist in np.unique(equi_dist): 312 | equi_arg = np.where(S_full_data == dist)[0] 313 | 
self_equi = (equi_arg < self_data_size).sum()
314 |                 other_equi = len(equi_arg) - self_equi
315 |                 # Number of dc that are actually equal
316 |                 n_equidistant += self_equi * other_equi
317 |             del S_full_data, equi_mask, equi_dist, equi_arg
318 |         if verbose >= 2:
319 |             print("done.", flush=True)
320 | 
321 |         # Calc number of concordant quadruples
322 |         if verbose >= 2:
323 |             print("Calculating number of concordant quadruples...", end=' ')
324 |         cc = 0
325 |         if zero_mv:
326 |             ccsize = other_data_size
327 |         else:
328 |             ccsize = other_size
329 |         for idx in full_data_idx:
330 |             if idx < self_data_size:
331 |                 cc += ccsize
332 |             else:
333 |                 ccsize -= 1
334 |         if verbose >= 2:
335 |             print("done.", flush=True)
336 | 
337 |         # Calc number of discordant quadruples
338 |         if verbose >= 2:
339 |             print("Calculating number of discordant quadruples...", end=' ')
340 |         if zero_mv:
341 |             dc = self_data_size * other_data_size - cc - n_equidistant
342 |         else:
343 |             dc = self_size * other_size - cc - n_equidistant - n_zero
344 |         Qc += cc
345 |         Qd += dc
346 |         if verbose >= 2:
347 |             print("done.", flush=True)
348 |         if verbose == 1:  # and c % 10 == 0:
349 |             sys.stdout.write('\r')
350 | 
351 |     # Calc Goodman-Kruskal's gamma
352 |     if verbose >= 2:
353 |         print("Calculating Goodman-Kruskal gamma...", end=' ')
354 |     if Qc + Qd == 0:
355 |         gamma = 0.0
356 |     else:
357 |         if metric == 'similarity':
358 |             gamma = (Qc - Qd) / (Qc + Qd)
359 |         elif metric == 'distance':
360 |             gamma = (Qd - Qc) / (Qc + Qd)
361 |         else:
362 |             print("WARNING: Unknown metric type {}. Assuming 'similarity' "
363 |                   "instead. Sign of result might be reversed, if this is "
364 |                   "wrong!".format(str(metric)[:32]), file=sys.stderr)
365 |             gamma = (Qc - Qd) / (Qc + Qd)
366 |     if verbose >= 2:
367 |         print("done.", flush=True)
368 |     return gamma
369 | 
370 | def _naive_goodman_kruskal(D:np.ndarray, labels:np.ndarray, metric='distance'):
371 |     """Calculate Goodman-Kruskal's gamma (slow naive implementation)
372 | 
373 |     Please use one of the other methods for calculating the GK index. This
374 |     function is intended for testing purposes only.
375 |     """
376 | 
377 |     # Checking input
378 |     io.check_distance_matrix_shape(D)
379 |     io.check_distance_matrix_shape_fits_labels(D, labels)
380 |     io.check_valid_metric_parameter(metric)
381 |     n = D.shape[0]
382 |     Q_c = 0
383 |     Q_d = 0
384 | 
385 |     # loop through all quadruples...
386 |     for i in range(n):
387 |         # ...but ignore self distances and only count undirected edges
388 |         for j in range(i + 1, n):
389 |             if labels[i] == labels[j]:
390 |                 for k in range(n):
391 |                     for l in range(k + 1, n):
392 |                         if labels[l] != labels[k]:  # or l == i or l == j:
393 |                             if D[i, j] < D[k, l]:
394 |                                 Q_c += 1
395 |                             elif D[i, j] > D[k, l]:
396 |                                 Q_d += 1
397 |                             else:  # don't count equal distances
398 |                                 pass
399 |     if Q_c + Q_d == 0:
400 |         return 0
401 |     if metric == 'similarity':
402 |         return (Q_d - Q_c) / (Q_c + Q_d)
403 |     else:  # metric == 'distance':
404 |         return (Q_c - Q_d) / (Q_c + Q_d)
405 | 
--------------------------------------------------------------------------------
/hub_toolbox/htlogging.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | 
4 | """
5 | This file is part of the HUB TOOLBOX available at
6 | https://github.com/OFAI/hub-toolbox-python3/
7 | The HUB TOOLBOX is licensed under the terms of the GNU GPLv3.
8 | 9 | (c) 2015-2018, Roman Feldbauer 10 | Austrian Research Institute for Artificial Intelligence (OFAI) 11 | Contact: 12 | """ 13 | import sys, time 14 | from abc import ABCMeta, abstractmethod 15 | 16 | __all__ = ['ConsoleLogging'] 17 | 18 | class Logging(metaclass=ABCMeta): # pragma: no cover 19 | """Base class for time-stamped logging. 20 | 21 | Do not instantiate this class, but ConsoleLogging or FileLogging! 22 | """ 23 | @property 24 | def _current_time(self): 25 | """Formatted time stamp""" 26 | return time.strftime('%Y-%m-%d %H:%M:%S') 27 | 28 | @abstractmethod 29 | def message(self): 30 | ... 31 | @abstractmethod 32 | def warning(self): 33 | ... 34 | @abstractmethod 35 | def error(self): 36 | ... 37 | 38 | class ConsoleLogging(Logging): 39 | """Convenience functions for time-stamped logging to the console""" 40 | 41 | def message(self, *objs, flush=True): 42 | """Log normal program function""" 43 | print(self._current_time, 'INFO:', *objs) 44 | if flush: 45 | sys.stdout.flush() 46 | 47 | def warning(self, *objs, flush=True): 48 | """Log warning (program can still continue)""" 49 | print(self._current_time, 'WARNING:', *objs, file=sys.stderr) 50 | if flush: 51 | sys.stderr.flush() 52 | 53 | def error(self, *objs, flush=True): 54 | """Log error (program fails)""" 55 | print(self._current_time, 'ERROR:', *objs, file=sys.stderr) 56 | if flush: 57 | sys.stderr.flush() 58 | 59 | class FileLogging(ConsoleLogging): 60 | """Convenience functions for time-stamped logging to a file""" 61 | 62 | def __init__(self): 63 | """Not implemented""" 64 | self.warning("FileLogging not yet implemented, will print to " 65 | "console anyway.") 66 | 67 | if __name__ == '__main__': 68 | """Simple test of this module""" 69 | log = ConsoleLogging() 70 | log.message('This module supplies functions for printing and logging.') 71 | log.message('Examples:') 72 | sys.stdout.flush() 73 | time.sleep(0.01) 74 | log.warning('This is a warning.') 75 | log.error('This is an error!') 76 | sys.stderr.flush() 77 | time.sleep(0.01) 78 | log.message('You should have got three messages on stdout and ' 79 | 'two on stderr.') 80 | log = FileLogging() 81 | log.message('Still written to console, until implemented.') 82 | try: 83 | log = Logging() 84 | except TypeError as e: 85 | log.warning('Must not instantiate Logging(), got exception:\n', e) 86 | -------------------------------------------------------------------------------- /hub_toolbox/hubness_analysis.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | This file is part of the HUB TOOLBOX available at 6 | https://github.com/OFAI/hub-toolbox-python3/ 7 | The HUB TOOLBOX is licensed under the terms of the GNU GPLv3. 
8 | 
9 | (c) 2011-2018, Dominik Schnitzer, Roman Feldbauer
10 | Austrian Research Institute for Artificial Intelligence (OFAI)
11 | Contact:
12 | """
13 | from inspect import signature
14 | import numpy as np
15 | from hub_toolbox import io
16 | from hub_toolbox.centering import centering, weighted_centering, \
17 |     localized_centering, dis_sim_global, dis_sim_local
18 | from hub_toolbox.distances import cosine_distance
19 | from hub_toolbox.global_scaling import mutual_proximity_empiric, \
20 |     mutual_proximity_gammai, mutual_proximity_gaussi
21 | from hub_toolbox.goodman_kruskal import goodman_kruskal_index
22 | from hub_toolbox.hubness import hubness
23 | from hub_toolbox.intrinsic_dimension import intrinsic_dimension
24 | from hub_toolbox.knn_classification import score
25 | from hub_toolbox.local_scaling import nicdm, local_scaling
26 | from hub_toolbox.shared_neighbors import shared_nearest_neighbors
27 | 
28 | __all__ = ['HubnessAnalysis']
29 | 
30 | CITATION = \
31 | """
32 | R. Feldbauer, M. Leodolter, C. Plant and A. Flexer,
33 | "Fast Approximate Hubness Reduction for Large High-Dimensional Data",
34 | 2018 IEEE International Conference on Big Knowledge (ICBK), Singapore, 2018,
35 | pp. 358-367. doi: 10.1109/ICBK.2018.00055
36 | (tech report available at http://www.ofai.at/cgi-bin/tr-online?number+2018-02)
37 | 
38 | or
39 | 
40 | R. Feldbauer, A. Flexer, "A comprehensive empirical comparison of hubness reduction in high-dimensional spaces",
41 | Knowledge and Information Systems, 2018, https://doi.org/10.1007/s10115-018-1205-y
42 | """
43 | 
44 | 
45 | def _primary_distance(D: np.ndarray, metric):
46 |     """Return `D`, identical. (Dummy function.)"""
47 |     return D
48 | 
49 | 
50 | # New types of hubness reduction methods must be added here
51 | SEC_DIST = {'mp': mutual_proximity_empiric,
52 |             'mp_gaussi': mutual_proximity_gaussi,
53 |             'mp_gammai': mutual_proximity_gammai,
54 |             'ls': local_scaling,
55 |             'nicdm': nicdm,
56 |             'snn': shared_nearest_neighbors,
57 |             'cent': centering,
58 |             'wcent': weighted_centering,
59 |             'lcent': localized_centering,
60 |             'dsg': dis_sim_global,
61 |             'dsl': dis_sim_local,
62 |             'orig': _primary_distance  # a dummy function
63 |             }
64 | 
65 | 
66 | class HubnessAnalysis:
67 |     """The main hubness analysis class.
68 | 
69 |     For more detailed analyses (optimizing parameters, using similarity data,
70 |     etc.) please use the individual modules.
71 | 
72 |     Examples
73 |     --------
74 |     >>> from hub_toolbox.hubness_analysis import HubnessAnalysis
75 |     >>> hub = HubnessAnalysis()
76 |     >>> hub.analyze_hubness()
77 | 
78 |     >>> from hub_toolbox.io import load_dexter
79 |     >>> D, y, X = load_dexter()
80 |     >>> hub = HubnessAnalysis(D, classes=y, vectors=X)
81 |     >>> hub.analyze_hubness()
82 | 
83 |     Notes
84 |     -----
85 |     The first example loads the example data set and performs a quick
86 |     hubness analysis with some of the functions provided in this toolbox.
87 | 
88 |     For the second example you must provide a distance matrix `D` (NxN)
89 |     together with an optional class labels vector (`classes`) and the
90 |     original (optional) data vectors (`vectors`) to perform a full hubness
91 |     analysis.
92 | 
93 |     See also
94 |     --------
95 |     analyze_hubness : additional parameters (e.g. k-occurrence, k-NN)
96 |     """
97 | 
98 |     def __init__(self, D: np.ndarray = None, classes: np.ndarray = None,
99 |                  vectors: np.ndarray = None, metric: str = 'distance'):
100 |         """Initialize a quick hubness analysis.
101 | 
102 |         Parameters
103 |         ----------
104 |         D : ndarray, optional (default: None)
105 |             The n x n symmetric distance (similarity) matrix.
106 |             Default: load example dataset (dexter).
107 | 
108 |         classes : ndarray, optional (default: None)
109 |             The 1 x n class labels. Required for k-NN, GK.
110 | 
111 |         vectors : ndarray, optional (default: None)
112 |             The n x m vector data. Required for IntrDim estimation.
113 | 
114 |         metric : {'distance', 'similarity'}
115 |             Define whether `D` is a distance or similarity matrix.
116 |         """
117 | 
118 |         self.has_class_data, self.has_vector_data = False, False
119 |         if D is None:
120 |             print('\n'
121 |                   'NO PARAMETERS GIVEN! Loading & evaluating DEXTER data set.'
122 |                   '\n'
123 |                   'DEXTER is a text classification problem in a bag-of-word \n'
124 |                   'representation. This is a two-class classification problem\n'
125 |                   'with sparse continuous input variables. \n'
126 |                   'This dataset is one of five datasets of the NIPS 2003\n'
127 |                   'feature selection challenge.\n'
128 |                   'http://archive.ics.uci.edu/ml/datasets/Dexter\n')
129 |             self.D, self.classes, self.vectors = io.load_dexter()
130 |             self.has_class_data, self.has_vector_data = True, True
131 |             self.metric = 'distance'
132 |         else:
133 |             # copy data and ensure correct type (not int16 etc.)
134 |             self.D = np.copy(D).astype(np.float64)
135 |             if classes is None:
136 |                 self.classes = None
137 |             else:
138 |                 self.classes = np.copy(classes).astype(np.float64)
139 |                 self.has_class_data = True
140 |             if vectors is None:
141 |                 self.vectors = None
142 |             else:
143 |                 self.vectors = np.copy(vectors).astype(np.float64)
144 |                 self.has_vector_data = True
145 |             self.metric = metric
146 |         self.n = len(self.D)
147 |         self.experiments = []
148 | 
149 |     @property
150 |     def _header(self):
151 |         return {'mp': "MUTUAL PROXIMITY (Empiric)",
152 |                 'mp_gaussi': "MUTUAL PROXIMITY (Independent Gaussians)",
153 |                 'mp_gammai': "MUTUAL PROXIMITY (Independent Gamma)",
154 |                 'ls': "LOCAL SCALING (original)",
155 |                 'nicdm': "LOCAL SCALING (NICDM)",
156 |                 'snn': "SHARED NEAREST NEIGHBORS",
157 |                 'cent': "CENTERING",
158 |                 'wcent': "WEIGHTED CENTERING",
159 |                 'lcent': "LOCALIZED CENTERING",
160 |                 'dsg': "DISSIM GLOBAL",
161 |                 'dsl': "DISSIM LOCAL",
162 |                 'orig': "ORIGINAL DATA"}
163 | 
164 |     def _calc_intrinsic_dim(self):
165 |         """Calculate intrinsic dimension estimate."""
166 |         self.intrinsic_dim = intrinsic_dimension(X=self.vectors)
167 |         return self
168 | 
169 |     def analyze_hubness(self, experiments="orig,mp,mp_gaussi,nicdm,cent,dsg",
170 |                         hubness_k=(5, 10), knn_k=(1, 5, 20),
171 |                         print_results=True, verbose: int = 0):
172 |         """Analyze hubness in original data and rescaled distances.
173 | 
174 |         Parameters
175 |         ----------
176 |         experiments : str, optional
177 |             Define which experiments to perform. Please provide a string of
178 |             comma-separated values chosen from the options below (a usage sketch follows the list):
179 | 
180 |             - "orig" : Original, primary distances
181 |             - "mp" : Mutual Proximity (empiric)
182 |             - "mp_gaussi" : Mutual Proximity (independent Gaussians)
183 |             - "mp_gammai" : Mutual Proximity (independent Gamma)
184 |             - "ls" : Local Scaling (using k-th neighbor)
185 |             - "nicdm" : Local Scaling variant NICDM (average of k neighbors)
186 |             - "snn" : Shared Nearest Neighbors
187 |             - "cent" : Centering
188 |             - "wcent" : Weighted Centering
189 |             - "lcent" : Localized Centering
190 |             - "dsg" : DisSim Global
191 |             - "dsl" : DisSim Local
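            For example (a sketch; any comma-separated subset of the
            keys above works):

            >>> hub = HubnessAnalysis()                     # doctest: +SKIP
            >>> hub.analyze_hubness(experiments="orig,mp")  # doctest: +SKIP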
192 | 
193 |         hubness_k : tuple, optional (default: (5, 10))
194 |             Hubness parameter (skewness of `k`-occurrence)
195 | 
196 |         knn_k : tuple, optional (default: (1, 5, 20))
197 |             `k`-NN classification parameter
198 | 
199 |         print_results : bool, optional (default: True)
200 |             Define whether to print the hubness analysis report to stdout
201 | 
202 |         verbose : int, optional (default: 0)
203 |             Increasing output verbosity
204 | 
205 |         Returns
206 |         -------
207 |         self : optionally prints results to stdout
208 |         """
209 |         experiments = experiments.split(',')
210 |         if self.vectors is None:
211 |             self.intrinsic_dim = None
212 |         else:
213 |             self._calc_intrinsic_dim()
214 |         for i, exp_type in enumerate(experiments):
215 |             if verbose:
216 |                 print("Experiment {}/{} ({})".
217 |                       format(i+1, len(experiments), exp_type), end="\r")
218 |             experiment = HubnessExperiment(D=self.D,
219 |                                            secondary_distance_type=exp_type,
220 |                                            metric=self.metric,
221 |                                            classes=self.classes,
222 |                                            vectors=self.vectors)
223 |             if self.D is not None:
224 |                 experiment._calc_secondary_distance()
225 |                 for k in hubness_k:
226 |                     experiment._calc_hubness(k=k)
227 |                 if self.classes is not None:
228 |                     for k in knn_k:
229 |                         experiment._calc_knn_accuracy(k=k)
230 |                     experiment._calc_gk_index()
231 |             self.experiments.append(experiment)
232 |             if print_results:
233 |                 self.print_analysis_report(experiment, report_nr=i)
234 |         if print_results:
235 |             print("------------------------------------------------------------")
236 |             print("Thanks for using the HUB-TOOLBOX!")
237 |             print("If you use this software in a research project, please cite:")
238 |             print("\n", CITATION)
239 |             print("Please also consider citing the references to the \n"
240 |                   "individual modules/hubness functions that you use.")
241 |         return self
242 | 
243 |     def print_analysis_report(self, experiment=None, report_nr:int=0):
244 |         """Print a report of the performed hubness analysis.
245 | 
246 |         Parameters
247 |         ----------
248 |         experiment : HubnessExperiment, optional (default: None)
249 |             If given, report only this `experiment`. Otherwise, report all
250 |             experiments of this analysis.
251 | 
252 |         report_nr : int, optional (default: 0)
253 |             Method only prints headline for first report
254 | 
255 |         Returns
256 |         -------
257 |         None : Output is printed to stdout
258 |         """
259 |         if experiment is not None:
260 |             experiments = [experiment]
261 |         else:
262 |             experiments = self.experiments
263 |         if report_nr == 0:
264 |             print("\n"
265 |                   "================\n"
266 |                   "Hubness Analysis\n"
267 |                   "================\n")
268 |         for experiment in experiments:
269 |             print(self._header[experiment.secondary_distance_type] + ':')
270 |             # Print used parameters (which are the default parameters)
271 |             sig = signature(SEC_DIST[experiment.secondary_distance_type])
272 |             for p in ['k', 'kappa', 'gamma']:
273 |                 try:
274 |                     print("parameter {} = {} (for optimization use the "
275 |                           "individual modules of the HUB-TOOLBOX)".
276 |                           format(p, sig.parameters[p].default))
277 |                 except KeyError:
278 |                     pass  # function does not use this parameter
279 |             if experiment.hubness:  # print hubness results, if available
280 |                 for k in sorted(experiment.hubness.keys()):
281 |                     print('data set hubness (S^k={:2})                : {:.3}'.
282 |                           format(k, experiment.hubness[k]))
283 |                     print('% of anti-hubs at k={:2}                   : {:.4}%'.
284 |                           format(k, experiment.anti_hubs[k]))
285 |                     print('% of k={:2}-NN lists the largest hub occurs in: {:.4}%'.
286 |                           format(k, experiment.max_hub_k_occurence[k]))
287 |             else:
288 |                 print('data set hubness                           : '
289 |                       'No k given')
290 |             if experiment.knn_accuracy:  # print k-NN results, if available
291 |                 for k in sorted(experiment.knn_accuracy.keys()):
292 |                     print('k={:2}-NN classification accuracy          : {:.4}%'.
293 |                           format(k, 100.*float(experiment.knn_accuracy[k])))
294 |             else:
295 |                 print('k-NN classification accuracy               : '
296 |                       'No classes given')
297 |             # print Goodman-Kruskal result, if available
298 |             if experiment.gk_index is None:
299 |                 print('Goodman-Kruskal index (higher=better)      : '
300 |                       'No classes given/Not calculated')
301 |             else:
302 |                 print('Goodman-Kruskal index (higher=better)      : {:.3}'.
303 |                       format(experiment.gk_index))
304 |             # Embedding dimension
305 |             if self.vectors is None:
306 |                 print('embedding dimensionality                   : '
307 |                       'No vectors given')
308 |             else:
309 |                 print('embedding dimensionality                   : {}'.
310 |                       format(experiment.embedding_dim))
311 |             # Intrinsic dimension estimate, if available
312 |             if self.intrinsic_dim is None:
313 |                 print('intrinsic dimensionality estimate          : '
314 |                       'No vectors given')
315 |             else:
316 |                 print('intrinsic dimensionality estimate          : {}'.
317 |                       format(round(self.intrinsic_dim)))
318 |             print()
319 |         return
320 | 
321 | 
322 | class HubnessExperiment:
323 |     """Perform a single hubness experiment"""
324 | 
325 |     def __init__(self, D: np.ndarray, secondary_distance_type: str,
326 |                  metric: str = 'distance', classes: np.ndarray = None,
327 |                  vectors: np.ndarray = None):
328 |         """Initialize a hubness experiment"""
329 | 
330 |         io.check_distance_matrix_shape(D)
331 |         io.check_valid_metric_parameter(metric)
332 |         if secondary_distance_type not in SEC_DIST:
333 |             raise ValueError("Requested secondary distance type unknown.")
334 |         if classes is not None:
335 |             io.check_distance_matrix_shape_fits_labels(D, classes)
336 |         if vectors is None:
337 |             self.embedding_dim = None
338 |         else:  # got vectors
339 |             io.check_distance_matrix_shape_fits_vectors(D, vectors)
340 |             self.embedding_dim = vectors.shape[1]
341 |         self.original_distance = D
342 |         self.secondary_distance_type = secondary_distance_type
343 |         self.classes = classes
344 |         self.vectors = vectors
345 |         self.metric = metric
346 |         self.n = D.shape[0]
347 |         # Obtained later through functions:
348 |         self.secondary_distance = None
349 |         self.hubness = dict()
350 |         self.anti_hubs = dict()
351 |         self.max_hub_k_occurence = dict()
352 |         self.knn_accuracy = dict()
353 |         self.gk_index = None
354 | 
355 |     def _calc_secondary_distance(self):
356 |         """Calculate secondary distances (e.g.
Mutual Proximity)"""
357 |         sec_dist_fun = SEC_DIST[self.secondary_distance_type]
358 |         try:
359 |             self.secondary_distance = sec_dist_fun(
360 |                 D=self.original_distance, metric=self.metric)
361 |         except TypeError:  # centering has no keyword 'D='
362 |             if self.secondary_distance_type in ['cent', 'wcent']:
363 |                 self.secondary_distance = \
364 |                     cosine_distance(sec_dist_fun(X=self.vectors))
365 |             elif self.secondary_distance_type in ['lcent']:
366 |                 self.secondary_distance = 1. - sec_dist_fun(X=self.vectors)
367 |             elif self.secondary_distance_type in ['dsg', 'dsl']:
368 |                 self.secondary_distance = sec_dist_fun(X=self.vectors)
369 |             else:
370 |                 raise ValueError("Erroneous secondary distance type: {}".
371 |                                  format(self.secondary_distance_type))
372 |         return self
373 | 
374 |     def _calc_hubness(self, k: int = 5):
375 |         """Calculate hubness (skewness of `k`-occurrence).
376 | 
377 |         Also calculate the percentage of anti-hubs (`k`-occurrence == 0) and
378 |         the percentage of k-NN lists the largest hub occurs in.
379 |         """
380 |         S_k, _, N_k = hubness(D=self.secondary_distance,
381 |                               metric=self.metric, k=k)
382 |         self.hubness[k] = S_k
383 |         self.anti_hubs[k] = 100 * (N_k == 0).sum() / self.n
384 |         self.max_hub_k_occurence[k] = 100 * N_k.max() / self.n
385 |         return self
386 | 
387 |     def _calc_knn_accuracy(self, k: int = 5):
388 |         """Calculate `k`-NN accuracy."""
389 |         acc, _, _ = score(D=self.secondary_distance, target=self.classes,
390 |                           k=k, metric=self.metric)
391 |         self.knn_accuracy[k] = acc
392 |         return self
393 | 
394 |     def _calc_gk_index(self):
395 |         """Calculate Goodman-Kruskal's gamma."""
396 |         self.gk_index = goodman_kruskal_index(D=self.secondary_distance,
397 |                                               classes=self.classes,
398 |                                               metric=self.metric)
399 |         return self
400 | 
401 | 
402 | if __name__ == "__main__":
403 |     hub = HubnessAnalysis()
404 |     hub.analyze_hubness()
405 | 
-------------------------------------------------------------------------------- /hub_toolbox/intrinsic_dimension.py: --------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | """
 5 | This file is part of the HUB TOOLBOX available at
 6 | https://github.com/OFAI/hub-toolbox-python3/
 7 | The HUB TOOLBOX is licensed under the terms of the GNU GPLv3.
 8 | 
 9 | (c) 2011-2018, Dominik Schnitzer, Roman Feldbauer
10 | Austrian Research Institute for Artificial Intelligence (OFAI)
11 | Contact: 
12 | 
13 | This file is based on a Matlab script by Elizaveta Levina, University of
14 | Michigan, available at http://dept.stat.lsa.umich.edu/~elevina/mledim.m
15 | 
16 | Reference: E. Levina and P.J. Bickel (2005).
17 |     "Maximum Likelihood Estimation of Intrinsic Dimension."
18 |     In Advances in NIPS 17, Eds. L. K. Saul, Y. Weiss, L. Bottou.
19 | """
20 | import numpy as np
21 | 
22 | __all__ = ['intrinsic_dimension']
23 | 
24 | def intrinsic_dimension(X: np.ndarray, k1: int = 6, k2: int = 12,
25 |                         estimator: str = 'mackay', metric: str = 'vector',
26 |                         trafo: str = None, mem_threshold: int = 5000):
27 |     """Calculate intrinsic dimension based on the MLE by Levina and Bickel [1]_.
28 | 
29 |     Parameters
30 |     ----------
31 |     X : ndarray
32 |         - An ``n x m`` vector data matrix with ``n`` objects in an
33 |           ``m``-dimensional feature space
34 |         - An ``n x n`` distance matrix.
35 | 
36 |         NOTE: The type must be defined via parameter `metric`!
37 | 
38 |     k1 : int, optional (default: 6)
39 |         Start of neighborhood range to search in.
40 | 
41 |     k2 : int, optional (default: 12)
42 |         End of neighborhood range to search in.
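        (Both bounds are inclusive: per-point estimates are computed for every
        neighborhood size k = k1, ..., k2 and then averaged; see `estimator`.)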
43 | 
44 |     estimator : {'levina', 'mackay'}, optional (default: 'mackay')
45 |         Determine the summation strategy: see [2]_.
46 | 
47 |     metric : {'vector', 'distance'}, optional (default: 'vector')
48 |         Determine data type of `X`.
49 | 
50 |         NOTE: the MLE was derived for Euclidean distances. Using
51 |         other dissimilarity measures may lead to undefined results.
52 | 
53 |     trafo : {None, 'std', 'var'}, optional (default: None)
54 |         Transform vector data.
55 | 
56 |         - None: no transformation
57 |         - 'std': standardization
58 |         - 'var': subtract mean, divide by variance (default behavior of
59 |           Laurens van der Maaten's DR toolbox; most likely for other
60 |           ID/DR techniques).
61 | 
62 |     mem_threshold : int, optional, default: 5000
63 |         Controls the speed-memory usage trade-off: if the number of points is
64 |         higher than the given value, don't calculate the complete distance
65 |         matrix at once (fast, high memory), but per row (slower, less memory).
66 | 
67 |     Returns
68 |     -------
69 |     d_mle : float
70 |         Intrinsic dimension estimate (not rounded)
71 | 
72 |     References
73 |     ----------
74 |     .. [1] Levina, E., & Bickel, P. (2004). Maximum likelihood estimation of
75 |            intrinsic dimension. Advances in Neural Information …, 17, 777–784.
76 |            http://doi.org/10.2307/2335172
77 |     .. [2] http://www.inference.phy.cam.ac.uk/mackay/dimension/
78 |     """
79 |     n = X.shape[0]
80 |     if estimator not in ['levina', 'mackay']:
81 |         raise ValueError("Parameter 'estimator' must be 'levina' or 'mackay'.")
82 |     if k1 < 1 or k2 < k1 or k2 >= n:
83 |         raise ValueError("Invalid neighborhood: Please make sure that "
84 |                          "0 < k1 <= k2 < n. (Got k1={} and k2={}).".
85 |                          format(k1, k2))
86 |     X = X.copy().astype(float)
87 | 
88 |     if metric == 'vector':
89 |         # Sort the rows lexicographically (does not remove duplicate rows)
90 |         X = X[np.lexsort(np.fliplr(X).T)]
91 | 
92 |         if trafo is None:
93 |             pass
94 |         elif trafo == 'var':
95 |             X -= X.mean(axis=0)  # broadcast
96 |             X /= X.var(axis=0) + 1e-7  # broadcast
97 |         elif trafo == 'std':
98 |             # Standardization
99 |             X -= X.mean(axis=0)  # broadcast
100 |             X /= X.std(axis=0) + 1e-7  # broadcast
101 |         else:
102 |             raise ValueError("Transformation must be None, 'std', or 'var'.")
103 | 
104 |         # Compute matrix of log nearest neighbor distances
105 |         X2 = (X**2).sum(1)
106 | 
107 |         if n <= mem_threshold:  # speed-memory trade-off
108 |             distance = X2.reshape(-1, 1) + X2 - 2*np.dot(X, X.T)  # 2x broadcast
109 |             distance.sort(1)
110 |             # Replace invalid values with a small number
111 |             distance[distance <= 0] = 1e-7
112 |             knnmatrix = .5 * np.log(distance[:, 1:k2+1])
113 |         else:
114 |             knnmatrix = np.zeros((n, k2))
115 |             for i in range(n):
116 |                 distance = np.sort(X2[i] + X2 - 2 * np.dot(X, X[i, :]))
117 |                 # Replace invalid values with a small number
118 |                 distance[distance <= 0] = 1e-7
119 |                 knnmatrix[i, :] = .5 * np.log(distance[1:k2+1])
120 |     elif metric == 'distance':
121 |         raise NotImplementedError("ID currently only supports vector data.")
122 |         # XXX perhaps map to sufficiently high dim with MDS, then calc ID??
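        # Editor's note (added): the estimate computed in the 'vector' branch
        # above, written out. With T_j(x) the distance from x to its j-th
        # nearest neighbor, the per-point, per-k estimate is
        #
        #     d_hat_k(x) = (k - 2) / sum_{j=1..k-1} log(T_k(x) / T_j(x))
        #
        # i.e. the Levina-Bickel MLE with the unbiased (k - 2) correction from
        # MacKay's note [2]_. estimator='levina' averages d_hat_k(x) over all
        # points and all k in [k1, k2]; estimator='mackay' instead averages the
        # inverses 1/d_hat_k(x) over points, inverts again, and averages over k.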
123 | #======================================================================= 124 | # # TODO calculation WRONG 125 | # X.sort(1) 126 | # X[X < 0] = 1e-7 127 | # knnmatrix = np.log(X[:, 1:k2+1]) 128 | #======================================================================= 129 | elif metric == 'similarity': 130 | raise NotImplementedError("ID currently only supports vector data.") 131 | #======================================================================= 132 | # # TODO calculation WRONG 133 | # print("WARNING: using similarity data may return " 134 | # "undefined results.", file=sys.stderr) 135 | # X[X < 0] = 0 136 | # distance = 1 - (X / X.max()) 137 | # knnmatrix = np.log(distance[:, 1:k2+1]) 138 | #======================================================================= 139 | else: 140 | raise ValueError("Parameter `metric` must be 'vector'.") 141 | 142 | # Compute the ML estimate 143 | S = np.cumsum(knnmatrix, 1) 144 | indexk = np.arange(k1, k2+1) # broadcasted afterwards 145 | dhat = -(indexk - 2) / (S[:, k1-1:k2] - knnmatrix[:, k1-1:k2] * indexk) 146 | if estimator == 'levina': 147 | # Average over estimates and over values of k 148 | no_dims = dhat.mean() 149 | if estimator == 'mackay': 150 | # Average over inverses 151 | dhat **= -1 152 | dhat_k = dhat.mean(0) 153 | no_dims = (dhat_k ** -1).mean() 154 | return no_dims 155 | 156 | if __name__ == '__main__': 157 | m_dim = 100 158 | n_dim = 2000 159 | VECT_DATA = np.random.rand(n_dim, m_dim) 160 | id_ = intrinsic_dimension(VECT_DATA) 161 | print("Random {}x{} matrix: ID_MLE = {}".format(n_dim, m_dim, id_)) 162 | -------------------------------------------------------------------------------- /hub_toolbox/io.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | This file is part of the HUB TOOLBOX available at 6 | https://github.com/OFAI/hub-toolbox-python3/ 7 | The HUB TOOLBOX is licensed under the terms of the GNU GPLv3. 8 | 9 | (c) 2015-2018, Roman Feldbauer 10 | Austrian Research Institute for Artificial Intelligence (OFAI) 11 | Contact: 12 | """ 13 | 14 | import os 15 | import numpy as np 16 | from scipy import sparse 17 | from scipy.sparse.base import issparse 18 | 19 | __all__ = ['load_dexter', 'random_sparse_matrix', 20 | 'load_csr_matrix', 'save_csr_matrix'] 21 | 22 | def load_dexter(): 23 | """Load the example data set (dexter). 
24 | 25 | Returns 26 | ------- 27 | D : ndarray 28 | Distance matrix 29 | 30 | classes : ndarray 31 | Class label vector 32 | 33 | vectors : ndarray 34 | Vector data matrix 35 | """ 36 | from hub_toolbox.distances import cosine_distance 37 | 38 | n = 300 39 | dim = 20000 40 | 41 | # Read class labels 42 | classes_file = os.path.dirname(os.path.realpath(__file__)) +\ 43 | '/example_datasets/dexter_train.labels' 44 | classes = np.loadtxt(classes_file) 45 | 46 | # Read data 47 | vectors = np.zeros((n, dim)) 48 | data_file = os.path.dirname(os.path.realpath(__file__)) + \ 49 | '/example_datasets/dexter_train.data' 50 | with open(data_file, mode='r') as fid: 51 | data = fid.readlines() 52 | row = 0 53 | for line in data: 54 | line = line.strip().split() # line now contains pairs of dim:val 55 | for word in line: 56 | col, val = word.split(':') 57 | vectors[row][int(col)-1] = int(val) 58 | row += 1 59 | 60 | # Calc distance 61 | D = cosine_distance(vectors) 62 | return D, classes, vectors 63 | 64 | def check_is_nD_array(arr:np.ndarray, n:int, arr_type=''): 65 | """ Check that array is exactly n dimensional. """ 66 | try: 67 | if arr.ndim != n: 68 | raise TypeError(arr_type + " array must be a " + str(n) + 69 | "D array, but was found to be a " + 70 | str(arr.ndim) + "D array with shape: " + 71 | str(arr.shape)) 72 | except AttributeError: 73 | raise TypeError("Object 'arr' does not seem to be an array.") 74 | 75 | def check_distance_matrix_shape(D:np.ndarray): 76 | """ Check that matrix is quadratic. """ 77 | check_is_nD_array(D, n=2, arr_type="Distance/similarity") 78 | if D.shape[0] != D.shape[1]: 79 | raise TypeError("Distance/similarity matrix is not quadratic. " 80 | "Shape: {}".format(D.shape)) 81 | 82 | def check_distance_matrix_shape_fits_vectors(D:np.ndarray, vectors:np.ndarray): 83 | """ Check number of points in distance matrix equal number of vectors. """ 84 | check_is_nD_array(D, 2, "Distance/similarity") 85 | check_is_nD_array(vectors, 2, "Data vectors") 86 | if D.shape[0] != vectors.shape[0]: 87 | raise TypeError("Number of points in `vectors` does not match " 88 | "number of points in `D`. Shape of `vectors`: {}, " 89 | "shape of `D`: {}".format(vectors.shape[0], D.shape[0])) 90 | 91 | def check_distance_matrix_shape_fits_labels(D:np.ndarray, classes:np.ndarray): 92 | """ Check the number of points in distance matrix equal number of labels.""" 93 | check_is_nD_array(D, 2, "Distance/similarity") 94 | check_is_nD_array(classes, 1, "Class label") 95 | if classes.size != D.shape[0]: 96 | raise TypeError("Number of class labels does not match number of " 97 | "points. Labels: {}, points: {}." 98 | .format(classes.size, D.shape[0])) 99 | 100 | def check_vector_matrix_shape_fits_labels(X:np.ndarray, classes:np.ndarray): 101 | """ Check the number of points in vector matrix equal number of labels.""" 102 | check_is_nD_array(X, 2, "Data vectors") 103 | check_is_nD_array(classes, 1, "Class label") 104 | if classes.size != X.shape[0]: 105 | raise TypeError("Number of class labels does not match number of " 106 | "points. Labels: {}, points: {}." 107 | .format(classes.size, X.shape[0])) 108 | 109 | def check_sample_shape_fits(D:np.ndarray, idx:np.ndarray): 110 | """ Check that number of columns in ``D`` equals the size of ``idx``. 
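    Editor's note (inferred from the checks below): `D` is expected to be an
    n x s matrix holding distances from all n points to s sampled points,
    and `idx` the s indices of those sample points; hence s <= n must hold.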
""" 111 | if issparse(D) or issparse(idx): 112 | raise TypeError("Sparse matrices are not supported for SampleMP.") 113 | check_is_nD_array(D, 2, "Distance/similarity") 114 | check_is_nD_array(idx, 1, "Index") 115 | if D.shape[1] > D.shape[0]: 116 | raise ValueError("Number of samples is higher than number of points. " 117 | "Must be less than or equal. In the latter case, " 118 | "consider not using samples at all for efficiency. " 119 | "Shape of `D`: {}.".format(D.shape)) 120 | if D.shape[1] != idx.size: 121 | raise TypeError("Number of samples in index array does not match " 122 | "the number of samples in the data matrix. " 123 | "Size of `idx`: {}, Columns in `D`: {}." 124 | .format(idx.size, D.shape[1])) 125 | 126 | def check_valid_metric_parameter(metric:str): 127 | """ Check parameter is either 'distance' or 'similarity'. """ 128 | if metric != 'distance' and metric != 'similarity': 129 | raise ValueError("Parameter 'metric' must be " 130 | "'distance' or 'similarity'." 131 | "Got: " + metric.__str__()) 132 | 133 | def matrix_split(rows, cols, elem_size=8, nr_matrices=4): # pragma: no cover 134 | """Determine how to split a matrix that does not fit into memory. 135 | 136 | Parameters 137 | ---------- 138 | rows, cols : int 139 | Shape of matrix that should be split. 140 | 141 | elem_size : int 142 | memory requirement per matrix element in bytes. E.g. 8 bytes for float64 143 | 144 | nr_matrices : int 145 | How many times must the split matrix fit into memory? 146 | This depends on the subsequent operations. 147 | 148 | Returns 149 | ------- 150 | nr_batches : int 151 | number of submatrices 152 | 153 | nr_rows : int 154 | number of rows per submatrix. 155 | 156 | Notes 157 | ----- 158 | - Submatrices always contain all columns per row. 159 | - The last batch will usually have less rows than `nr_rows` 160 | """ 161 | free_mem = FreeMemLinux(unit='k').user_free 162 | max_rows = int(free_mem / cols / elem_size) 163 | nr_rows = int(max_rows / nr_matrices) 164 | nr_batches = int(np.ceil(rows / nr_rows)) 165 | return nr_batches, nr_rows 166 | 167 | def random_sparse_matrix(size, density=0.05): 168 | """Generate a random sparse similarity matrix. 169 | 170 | Values are bounded by [0, 1]. Diagonal is all ones. The final density is 171 | approximately 2*`density`. 172 | 173 | Parameters 174 | ---------- 175 | size : int 176 | Shape of the matrix (`size` x `size`) 177 | 178 | density : float, optional, default=0.05 179 | The matrix' density will be approximately 2 * `density` 180 | 181 | Returns 182 | ------- 183 | S : csr_matrix 184 | Random matrix 185 | """ 186 | S = sparse.rand(size, size, density, 'csr') 187 | S += S.T 188 | S /= S.max() 189 | S -= sparse.diags(S.diagonal(), 0) 190 | S += sparse.diags(np.ones(size), 0) 191 | return S 192 | 193 | def save_csr_matrix(file, matrix): 194 | np.savez(file, data=matrix.data, indices=matrix.indices, 195 | indptr=matrix.indptr, shape=matrix.shape) 196 | return file 197 | 198 | def load_csr_matrix(file): 199 | container = np.load(file) 200 | return sparse.csr_matrix((container['data'], container['indices'], 201 | container['indptr']), shape=container['shape']) 202 | 203 | class FreeMemLinux(object): # pragma: no cover 204 | """Non-cross platform way to get free memory on Linux. 
205 | 
206 |     Original code by Oz123,
207 |     http://stackoverflow.com/questions/17718449/determine-free-ram-in-python
208 |     """
209 | 
210 |     def __init__(self, unit='kB'):
211 | 
212 |         with open('/proc/meminfo', 'r') as mem:
213 |             lines = mem.readlines()
214 | 
215 |         self._tot = int(lines[0].split()[1])
216 |         self._free = int(lines[1].split()[1])
217 |         self._buff = int(lines[2].split()[1])
218 |         self._cached = int(lines[3].split()[1])
219 |         self._shared = int(lines[20].split()[1])
220 |         self._swapt = int(lines[14].split()[1])
221 |         self._swapf = int(lines[15].split()[1])
222 |         self._swapu = self._swapt - self._swapf
223 | 
224 |         self.unit = unit
225 |         self._convert = self._factor()
226 | 
227 |     def _factor(self):
228 |         """determine the conversion factor"""
229 |         if self.unit == 'kB':
230 |             return 1
231 |         if self.unit == 'k':
232 |             return 1024.0
233 |         if self.unit == 'MB':
234 |             return 1/1024.0
235 |         if self.unit == 'GB':
236 |             return 1/1024.0/1024.0
237 |         if self.unit == '%':
238 |             return 1.0/self._tot * 100
239 |         else:
240 |             raise ValueError("Unit not understood")
241 | 
242 |     @property
243 |     def total(self):
244 |         return self._convert * self._tot
245 | 
246 |     @property
247 |     def used(self):
248 |         return self._convert * (self._tot - self._free)
249 | 
250 |     @property
251 |     def used_real(self):
252 |         """memory used which is not cache or buffers"""
253 |         return self._convert * (self._tot - self._free - self._buff - self._cached)
254 | 
255 |     @property
256 |     def shared(self):
257 |         return self._convert * self._shared
258 | 
259 |     @property
260 |     def buffers(self):
261 |         return self._convert * self._buff
262 | 
263 |     @property
264 |     def cached(self):
265 |         return self._convert * self._cached
266 | 
267 |     @property
268 |     def user_free(self):
269 |         """This is the free memory available for the user"""
270 |         return self._convert * (self._free + self._buff + self._cached)
271 | 
272 |     @property
273 |     def swap(self):
274 |         return self._convert * self._swapt
275 | 
276 |     @property
277 |     def swap_free(self):
278 |         return self._convert * self._swapf
279 | 
280 |     @property
281 |     def swap_used(self):
282 |         return self._convert * self._swapu
283 | 
284 | if __name__ == '__main__':
285 |     fml = FreeMemLinux(unit='MB')
286 |     fml2 = FreeMemLinux(unit='%')
287 |     print("Used memory: {:.1f}M ({:.1f}%).".format(fml.used_real, fml2.used_real))
288 |     print("Free memory: {:.1f}M ({:.1f}%).".format(fml.user_free, fml2.user_free))
289 | 
-------------------------------------------------------------------------------- /hub_toolbox/utils.py: --------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | This file is part of the HUB TOOLBOX available at
 4 | https://github.com/OFAI/hub-toolbox-python3/
 5 | The HUB TOOLBOX is licensed under the terms of the GNU GPLv3.
 6 | 
 7 | (c) 2018, Roman Feldbauer
 8 | Austrian Research Institute for Artificial Intelligence (OFAI)
 9 | Contact: 
10 | """
11 | from multiprocessing import Value
12 | 
13 | __all__ = ['SynchronizedCounter']
14 | 
15 | class SynchronizedCounter(object):
16 |     """ A multiprocessing-safe counter for progress information. """
17 |     def __init__(self, init: int = -1):
18 |         self.val = Value('i', init)
19 | 
20 |     def increment_and_get_value(self, n=1) -> int:
21 |         """ Obtain a lock before incrementing, since += isn't atomic.
""" 22 | with self.val.get_lock(): 23 | self.val.value += n 24 | return self.val.value 25 | 26 | @property 27 | def value(self) -> int: 28 | return self.val.value 29 | -------------------------------------------------------------------------------- /readthedocs.yml: -------------------------------------------------------------------------------- 1 | build: 2 | image: latest 3 | 4 | python: 5 | version: 3.6 -------------------------------------------------------------------------------- /readthedocs_requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OFAI/hub-toolbox-python3/b76fa405dc6ffc80484a9bfed7e68fa828b7dc8e/readthedocs_requirements.txt -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | scipy 3 | pandas 4 | scikit-learn 5 | joblib 6 | coveralls 7 | falconn 8 | nmslib 9 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | This file is part of the HUB TOOLBOX available at 6 | https://github.com/OFAI/hub-toolbox-python3/ 7 | The HUB TOOLBOX is licensed under the terms of the GNU GPLv3. 8 | 9 | (c) 2011-2018, Dominik Schnitzer and Roman Feldbauer 10 | Austrian Research Institute for Artificial Intelligence (OFAI) 11 | Contact: 12 | 13 | 14 | Installation: 15 | ------------- 16 | In the console (terminal application) change to the folder containing this file. 17 | 18 | To build the package hub_toolbox: 19 | python3 setup.py build 20 | 21 | To install the package (with administrator rights): 22 | sudo python3 setup.py install 23 | 24 | To test the installation: 25 | sudo python3 setup.py test 26 | 27 | If this succeeds with an 'OK' message, you are ready to go. 28 | Otherwise you may consider filing a bug report on github. 29 | (Some skipped tests are perfectly fine, though.) 30 | """ 31 | import re, os, sys 32 | REQ_MAJOR = 3 33 | REQ_MINOR = 6 34 | if sys.version_info < (REQ_MAJOR, REQ_MINOR): 35 | sys.stdout.write( 36 | (f"The HUB TOOLBOX requires Python {REQ_MAJOR}.{REQ_MINOR} or higher." 37 | f"\nPlease try to run as python3 setup.py or update your Python " 38 | f"environment.\n Consider using Anaconda for easy package handling.")) 39 | sys.exit(1) 40 | 41 | try: 42 | import numpy, scipy, sklearn # @UnusedImport 43 | except ImportError: 44 | sys.stdout.write("The HUB TOOLBOX requires numpy, scipy and scikit-learn. " 45 | "Please make sure these packages are available locally. " 46 | "Consider using Anaconda for easy package handling.\n") 47 | try: 48 | import pandas, joblib # @UnusedImport 49 | except ImportError: 50 | sys.stdout.write("Some modules of the HUB TOOLBOX require pandas and joblib. " 51 | "Please make sure these packages are available locally. " 52 | "Consider using Anaconda for easy package handling.\n") 53 | try: 54 | import nmslib, falconn # @UnusedImport 55 | except ImportError: 56 | sys.stdout.write("The 'approximate' module uses 'nmslib' and 'falconn' " 57 | "libraries for approximate nearest neighbor search. " 58 | "Please make sure these packages are available locally. 
" 59 | "Consider using Anaconda for easy package handling.\n") 60 | setup_options = {} 61 | 62 | try: 63 | from setuptools import setup 64 | setup_options['test_suite'] = 'tests' 65 | except ImportError: 66 | from distutils.core import setup 67 | import warnings 68 | warnings.warn("setuptools not found, resorting to distutils. " 69 | "Unit tests won't be discovered automatically.") 70 | 71 | # Parsing current version number 72 | # Adapted from the Lasagne project at 73 | # https://github.com/Lasagne/Lasagne/blob/master/setup.py 74 | here = os.path.abspath(os.path.dirname(__file__)) 75 | try: 76 | # obtain version string from __init__.py 77 | # Read ASCII file with builtin open() so __version__ is str in Python 2 and 3 78 | with open(os.path.join(here, 'hub_toolbox', '__init__.py'), 'r') as f: 79 | init_py = f.read() 80 | version = re.search("__version__ = '(.*)'", init_py).groups()[0] 81 | except Exception: 82 | version = '' 83 | 84 | setup( 85 | name = "hub_toolbox", 86 | version = version, 87 | author = "Roman Feldbauer", 88 | author_email = "roman.feldbauer@ofai.at", 89 | maintainer = "Roman Feldbauer", 90 | maintainer_email = "roman.feldbauer@ofai.at", 91 | description = "Hubness reduction and analysis tools", 92 | license = "GNU GPLv3", 93 | keywords = ["machine learning", "data science"], 94 | url = "https://github.com/OFAI/hub-toolbox-python3", 95 | packages=['hub_toolbox', 'tests'], 96 | package_data={'hub_toolbox': ['example_datasets/*']}, 97 | classifiers=[ 98 | "Development Status :: 4 - Beta", 99 | "Environment :: Console", 100 | "Intended Audience :: Science/Research", 101 | "License :: OSI Approved :: GNU General Public License v3 " 102 | "or later (GPLv3+)", 103 | "Programming Language :: Python :: 3", 104 | "Programming Language :: Python :: 3.6", 105 | "Programming Language :: Python :: 3.7", 106 | "Topic :: Scientific/Engineering" 107 | ], 108 | **setup_options 109 | ) 110 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | This file is part of the HUB TOOLBOX available at 6 | https://github.com/OFAI/hub-toolbox-python3/ 7 | The HUB TOOLBOX is licensed under the terms of the GNU GPLv3. 8 | 9 | (c) 2016-2018, Roman Feldbauer 10 | Austrian Research Institute for Artificial Intelligence (OFAI) 11 | Contact: 12 | 13 | --- 14 | 15 | unittest module 16 | """ -------------------------------------------------------------------------------- /tests/approximate_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | This file is part of the HUB TOOLBOX available at 5 | https://github.com/OFAI/hub-toolbox-python3/ 6 | The HUB TOOLBOX is licensed under the terms of the GNU GPLv3. 
7 | 8 | (c) 2018, Roman Feldbauer 9 | Austrian Research Institute for Artificial Intelligence (OFAI) 10 | Contact: 11 | """ 12 | import unittest 13 | import numpy as np 14 | from sklearn.datasets import make_classification 15 | from sklearn.model_selection import train_test_split 16 | from hub_toolbox import approximate 17 | from sklearn.metrics.classification import accuracy_score 18 | 19 | 20 | class ApproximateHRTest(unittest.TestCase): 21 | 22 | def setUp(self): 23 | n_samples = 500 24 | n_informative = 256 25 | n_features = n_informative 26 | test_size = int(n_samples * .2) 27 | X, y = make_classification( 28 | n_samples=n_samples, 29 | n_features=n_features, 30 | n_informative=n_informative, 31 | n_redundant=0, n_repeated=0, n_classes=2, 32 | n_clusters_per_class=10, random_state=2847356) 33 | X_train, X_test, y_train, y_test = train_test_split( 34 | X.astype(np.float32), y.astype(np.int32), test_size=test_size) 35 | self.X_train = X_train 36 | self.X_test = X_test 37 | self.y_train = y_train 38 | self.y_test = y_test 39 | 40 | self.hr_algorithms = ['LS', 'NICDM', 'MP', 'MPG', 'DSL', None, 'NoNe'] 41 | self.n_neighbors = 5 42 | self.n_samples = 100 43 | self.sampling_algorithms = ['random', 'kmeans++', 'LSH', 'HNSW', 44 | None, 'nOnE'] 45 | self.metrics = ['sqeuclidean', 'cosine'] 46 | self.n_jobs = [-1, 1] 47 | self.verbose = 0 48 | self.accu_time = 0. 49 | 50 | def tearDown(self): 51 | print(f'Accumulated time: {self.accu_time} seconds.') 52 | 53 | def _approximate_hr(self, hr_algorithm, sampling_algorithm, 54 | metric, n_jobs): 55 | hr = approximate.SuQHR(hr_algorithm=hr_algorithm, 56 | n_neighbors=self.n_neighbors, 57 | n_samples=self.n_samples, 58 | metric=metric, 59 | sampling_algorithm=sampling_algorithm, 60 | random_state=123, 61 | n_jobs=n_jobs, 62 | verbose=self.verbose) 63 | hr.fit(self.X_train, self.y_train) 64 | y_pred = hr.predict(self.X_test) 65 | acc = accuracy_score(y_pred, self.y_test) 66 | print(f'SuQHR ({hr_algorithm}, {sampling_algorithm}, {metric}) ' 67 | f'{self.n_neighbors}-NN accuracy: {acc:.2f}') 68 | total_time = hr.time_fit_ + hr.time_transform_ + hr.time_predict_ 69 | self.accu_time += total_time.total.values 70 | 71 | def test_approximate_hubness_reduction(self): 72 | for hr_algorithm in self.hr_algorithms: 73 | for sampling_algorithm in self.sampling_algorithms: 74 | for metric in self.metrics: 75 | for n_jobs in self.n_jobs: 76 | self._approximate_hr(hr_algorithm, 77 | sampling_algorithm, 78 | metric, 79 | n_jobs) 80 | 81 | def test_surrogate_class(self): 82 | hr = approximate.ApproximateHubnessReduction() 83 | return self.assertIn(hr.hr_algorithm, self.hr_algorithms) 84 | 85 | 86 | if __name__ == "__main__": 87 | # import sys;sys.argv = ['', 'Test.testName'] 88 | unittest.main() 89 | -------------------------------------------------------------------------------- /tests/centering_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | This file is part of the HUB TOOLBOX available at 5 | https://github.com/OFAI/hub-toolbox-python3/ 6 | The HUB TOOLBOX is licensed under the terms of the GNU GPLv3. 
7 | 8 | (c) 2016-2018, Roman Feldbauer 9 | Austrian Research Institute for Artificial Intelligence (OFAI) 10 | Contact: 11 | """ 12 | import unittest 13 | import numpy as np 14 | from sklearn.preprocessing import StandardScaler 15 | from hub_toolbox.centering import centering, weighted_centering, \ 16 | localized_centering, dis_sim_global, dis_sim_local 17 | from hub_toolbox.io import load_dexter 18 | from hub_toolbox.hubness import hubness 19 | from hub_toolbox.knn_classification import score 20 | 21 | class TestCentering(unittest.TestCase): 22 | 23 | def setUp(self): 24 | self.distance, self.target, self.vectors = load_dexter() 25 | 26 | def test_centering_equal_to_sklearn_centering(self): 27 | vectors_cent = centering(self.vectors, 'vector') 28 | scaler = StandardScaler(with_mean=True, with_std=False) 29 | vectors_sklearn_cent = scaler.fit_transform(self.vectors) 30 | return np.testing.assert_array_almost_equal( 31 | vectors_cent, vectors_sklearn_cent, decimal=7) 32 | 33 | def test_weighted_centering_with_gamma_zero_equal_centering(self): 34 | vectors_wcent = weighted_centering(self.vectors, 'cosine', gamma=0.) 35 | vectors_cent = centering(self.vectors, 'vector') 36 | return np.testing.assert_array_almost_equal( 37 | vectors_cent, vectors_wcent, decimal=7) 38 | 39 | def test_weighted_centering_with_gamma_notzero_changes_result(self): 40 | gamma = np.random.rand(1) 41 | vectors_wcent = weighted_centering(self.vectors, 'cosine', gamma) 42 | vectors_cent = centering(self.vectors, 'vector') 43 | return self.assertNotEqual((vectors_cent - vectors_wcent).sum(), 0) 44 | 45 | def test_localized_centering(self): 46 | """Test whether hubness and k-NN accuracy improve for dexter""" 47 | h_orig = hubness(self.distance)[0] 48 | acc_orig = score(self.distance, self.target)[0][0, 0] 49 | sim_lcent = localized_centering(self.vectors, kappa=20, gamma=1.) 
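        # Editor's note: localized_centering returns a *similarity* matrix,
        # which is why the hubness and score calls below pass
        # metric='similarity' instead of the default 'distance'.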
50 | h_lcent = hubness(sim_lcent, metric='similarity')[0] 51 | acc_lcent = score(sim_lcent, self.target, metric='similarity')[0][0, 0] 52 | result = (h_orig / h_lcent > 1.5) & (acc_lcent - acc_orig > 0.03) 53 | return self.assertTrue(result) 54 | 55 | def test_localized_centering_parallel(self): 56 | lcent_seq = localized_centering( 57 | self.vectors, kappa=20, gamma=1., n_jobs=4) 58 | lcent_par = localized_centering( 59 | self.vectors, kappa=20, gamma=1., n_jobs=1) 60 | return np.testing.assert_array_almost_equal(lcent_par, lcent_seq, 14) 61 | 62 | def test_dis_sim_global(self): 63 | """Test whether hubness and k-NN accuracy improve for dexter""" 64 | h_orig = hubness(self.distance)[0] 65 | acc_orig = score(self.distance, self.target)[0][0, 0] 66 | dist_dsg = dis_sim_global(self.vectors) 67 | h_dsg = hubness(dist_dsg)[0] 68 | acc_dsg = score(dist_dsg, self.target)[0][0, 0] 69 | result = (h_orig / h_dsg > 2) & (acc_dsg - acc_orig > 0.07) 70 | return self.assertTrue(result) 71 | 72 | def test_dis_sim_local(self): 73 | """Test whether hubness and k-NN accuracy improve for dexter""" 74 | #self.vectors = np.tile(self.vectors, 1) 75 | h_orig = hubness(self.distance)[0] 76 | acc_orig = score(self.distance, self.target)[0][0, 0] 77 | dist_dsl = dis_sim_local(self.vectors, k=50) 78 | h_dsl = hubness(dist_dsl)[0] 79 | acc_dsl = score(dist_dsl, self.target)[0][0, 0] 80 | result = (h_orig / h_dsl > 10) & (acc_dsl - acc_orig > 0.03) 81 | return self.assertTrue(result) 82 | 83 | def test_dis_sim_local_parallel(self): 84 | dsl_seq = dis_sim_local(self.vectors, k=50, n_jobs=1) 85 | dsl_par = dis_sim_local(self.vectors, k=50, n_jobs=4) 86 | return np.testing.assert_array_almost_equal(dsl_seq, dsl_par, 14) 87 | 88 | def test_dis_sim_local_split_parallel_(self): 89 | X = self.vectors[:150, :] 90 | Y = self.vectors[150:, :] 91 | dsl_seq = dis_sim_local(X, Y, n_jobs=1) 92 | dsl_par = dis_sim_local(X, Y, n_jobs=4) 93 | return np.testing.assert_array_almost_equal(dsl_seq, dsl_par, 14) 94 | 95 | if __name__ == "__main__": 96 | unittest.main() 97 | -------------------------------------------------------------------------------- /tests/distances_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | This file is part of the HUB TOOLBOX available at 5 | https://github.com/OFAI/hub-toolbox-python3/ 6 | The HUB TOOLBOX is licensed under the terms of the GNU GPLv3. 7 | 8 | (c) 2016-2018, Roman Feldbauer 9 | Austrian Research Institute for Artificial Intelligence (OFAI) 10 | Contact: 11 | """ 12 | import unittest 13 | import numpy as np 14 | from scipy.spatial.distance import pdist, cdist, squareform 15 | from sklearn.neighbors import KNeighborsClassifier 16 | from sklearn.metrics import accuracy_score 17 | from hub_toolbox.distances import (cosine_distance, euclidean_distance, 18 | mp_dissim) 19 | from hub_toolbox.io import load_dexter 20 | from hub_toolbox.hubness import hubness 21 | 22 | class TestDistances(unittest.TestCase): 23 | 24 | def setUp(self): 25 | np.random.seed(626) 26 | self.vectors = 99. 
* (np.random.rand(400, 200) - 0.5) 27 | 28 | def tearDown(self): 29 | del self.vectors 30 | 31 | def test_cosine_dist_equal_to_scipy_pdist_cos(self): 32 | cos_dist = cosine_distance(self.vectors) 33 | cos_dist_scipy = squareform(pdist(self.vectors, 'cosine')) 34 | return np.testing.assert_array_almost_equal( 35 | cos_dist, cos_dist_scipy, decimal=7) 36 | 37 | def test_euclidean_dist_equal_to_scipy_cdist_eucl(self): 38 | eucl_dist = euclidean_distance(self.vectors) 39 | eucl_dist_cdist = cdist(self.vectors, self.vectors, 'euclidean') 40 | return np.testing.assert_array_almost_equal( 41 | eucl_dist, eucl_dist_cdist, decimal=7) 42 | 43 | class TestMpDisSim(unittest.TestCase): 44 | 45 | def setUp(self): 46 | _, y, X = load_dexter() 47 | r = np.random.permutation(y.size) 48 | self.X = X[r, :] 49 | self.y = y[r] 50 | split = int(len(y)/10*9) 51 | train_ind = slice(0, split) 52 | test_ind = slice(split, len(y)) 53 | self.X_train = self.X[train_ind] 54 | self.X_test = self.X[test_ind] 55 | self.y_train = self.y[train_ind] 56 | self.y_test = self.y[test_ind] 57 | 58 | def test_mp_dissim(self): 59 | ''' Test that mp_dissim improves kNN-accuracy for dexter. ''' 60 | D_part = cdist(self.X_test, self.X_train, 'euclidean') 61 | knn = KNeighborsClassifier( 62 | n_neighbors=5, metric='precomputed', n_jobs=4) 63 | knn.fit(self.X_train, self.y_train) 64 | y_pred = knn.predict(D_part) 65 | acc_eucl = accuracy_score(self.y_test, y_pred) 66 | h_eucl = hubness(D_part, k=5, metric='distance', n_jobs=4)[0] 67 | D_part_mp = mp_dissim( 68 | X=self.X_test, Y=self.X_train, p=0, n_bins=10, bin_size='r', verbose=1, n_jobs=-1) 69 | y_pred_mp = knn.predict(D_part_mp) 70 | acc_mp = accuracy_score(self.y_test, y_pred_mp) 71 | h_mp = hubness(D_part_mp, k=5, metric='distance', n_jobs=4)[0] 72 | #======================================================================= 73 | # print("Hub:", h_eucl, h_mp) 74 | # print("Acc:", acc_eucl, acc_mp) 75 | # D_mp = mp_dissim(self.X, p=2, n_bins=10, bin_size='r', n_jobs=-1, verbose=1) 76 | #======================================================================= 77 | self.assertLess(h_mp, h_eucl) 78 | self.assertGreater(acc_mp, acc_eucl) 79 | 80 | if __name__ == "__main__": 81 | unittest.main() 82 | -------------------------------------------------------------------------------- /tests/goodmankruskal_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | This file is part of the HUB TOOLBOX available at 5 | https://github.com/OFAI/hub-toolbox-python3/ 6 | The HUB TOOLBOX is licensed under the terms of the GNU GPLv3. 7 | 8 | (c) 2016-2018, Roman Feldbauer 9 | Austrian Research Institute for Artificial Intelligence (OFAI) 10 | Contact: 11 | """ 12 | import unittest 13 | import numpy as np 14 | from scipy.spatial.distance import squareform, pdist 15 | from scipy.sparse.csr import csr_matrix 16 | from hub_toolbox.goodman_kruskal import goodman_kruskal_index,\ 17 | _naive_goodman_kruskal, sparse_goodman_kruskal_index 18 | from hub_toolbox.io import random_sparse_matrix 19 | from hub_toolbox.shared_neighbors import shared_nearest_neighbors 20 | 21 | class TestGoodmanKruskal(unittest.TestCase): 22 | 23 | def setUp(self): 24 | n = 50 25 | m = 5 26 | c = 3 27 | np.random.seed(823475) 28 | data = np.random.rand(n, m) 29 | self.distance = squareform(pdist(data, 'euclidean')) 30 | self.similarity = 1. 
- self.distance / self.distance.max() 31 | self.labels = np.random.randint(0, c, n) 32 | 33 | def tearDown(self): 34 | del self.distance, self.similarity, self.labels 35 | 36 | def test_naive_goodmankruskal_algorithm(self): 37 | """Using a small clustering with correct value calc by hand""" 38 | distance = np.array( 39 | squareform([0.7, 1.55, 0.5, 1.7, 0.9, 0.85, 1.2, 1.5, 0.6, 1.4])) 40 | label = np.array([0, 0, 1, 2, 1]) 41 | CORRECT_RESULT = 0.75 42 | result = _naive_goodman_kruskal(distance, label, 'distance') 43 | return self.assertEqual(result, CORRECT_RESULT) 44 | 45 | def test_efficient_goodmankruskal_equal_to_naive_goodmankruskal(self): 46 | """Test whether goodman_kruskal_index yields correct result""" 47 | gamma_efficient = goodman_kruskal_index(self.distance, self.labels) 48 | gamma_naive = _naive_goodman_kruskal(self.distance, self.labels) 49 | return self.assertEqual(gamma_efficient, gamma_naive) 50 | 51 | def test_goodmankruskal_distance_based_equal_to_similarity_based(self): 52 | """Test whether results are correct using similarities""" 53 | gamma_dist = goodman_kruskal_index(self.distance, self.labels, 'distance') 54 | gamma_sim = goodman_kruskal_index(self.similarity, self.labels, 'similarity') 55 | return self.assertEqual(gamma_dist, gamma_sim) 56 | 57 | def test_goodmankruskal_close_to_zero_for_random_data(self): 58 | gamma_dist = goodman_kruskal_index(self.distance, self.labels) 59 | return self.assertAlmostEqual(gamma_dist, 0., places=1) 60 | 61 | def test_sparse_goodmankruskal_equal_to_dense_goodmankruskal(self): 62 | similarity = random_sparse_matrix(size=1000) 63 | labels = np.random.randint(0, 5, 1000) 64 | gamma_sparse = sparse_goodman_kruskal_index(similarity, labels, verbose=2) 65 | gamma_dense = goodman_kruskal_index(similarity.toarray(), labels, 'similarity') 66 | return self.assertEqual(gamma_dense, gamma_sparse) 67 | 68 | def test_correct_handling_equal_distances_goodmankruskal(self): 69 | """SharedNN matrices contain lots of equal distances""" 70 | dist_snn = shared_nearest_neighbors(self.distance) 71 | gamma_efficient = goodman_kruskal_index(dist_snn, self.labels) 72 | gamma_naive = _naive_goodman_kruskal(dist_snn, self.labels) 73 | return self.assertEqual(gamma_efficient, gamma_naive) 74 | 75 | def test_correct_handling_equal_similarities_sparse_gk(self): 76 | sim_snn = 1. - shared_nearest_neighbors(self.distance) 77 | gamma_sparse = sparse_goodman_kruskal_index(csr_matrix(sim_snn), self.labels) 78 | gamma_efficient = goodman_kruskal_index(sim_snn, self.labels, 'similarity') 79 | return self.assertEqual(gamma_efficient, gamma_sparse) 80 | 81 | if __name__ == "__main__": 82 | unittest.main() 83 | -------------------------------------------------------------------------------- /tests/hubness_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | This file is part of the HUB TOOLBOX available at 5 | https://github.com/OFAI/hub-toolbox-python3/ 6 | The HUB TOOLBOX is licensed under the terms of the GNU GPLv3. 
7 | 8 | (c) 2016-2018, Roman Feldbauer 9 | Austrian Research Institute for Artificial Intelligence (OFAI) 10 | Contact: 11 | """ 12 | import unittest 13 | import numpy as np 14 | from scipy.spatial.distance import squareform 15 | from sklearn.datasets.samples_generator import make_classification 16 | from sklearn.model_selection import train_test_split 17 | from hub_toolbox.approximate import ApproximateHubnessReduction,\ 18 | VALID_HR, VALID_SAMPLE 19 | from hub_toolbox.distances import euclidean_distance 20 | from hub_toolbox.hubness import hubness, Hubness, hubness_from_vectors 21 | from hub_toolbox.io import random_sparse_matrix 22 | 23 | class TestHubness(unittest.TestCase): 24 | """Test hubness calculations""" 25 | 26 | def setUp(self): 27 | """Hubness truth: S_k=5, skewness calculated with bias""" 28 | self.dist = squareform([.2, .1, .8, .4, .3, .5, .7, 1., .6, .9]) 29 | self.hubness_truth = -0.2561204163 30 | 31 | def tearDown(self): 32 | del self.dist 33 | 34 | def test_hubness(self): 35 | """Test hubness against ground truth calc on spreadsheet""" 36 | Sk5, _, _ = hubness(self.dist, k=2, verbose=1) 37 | return self.assertAlmostEqual(Sk5, self.hubness_truth, places=10) 38 | 39 | def test_hubness_return_values_are_self_consistent(self): 40 | """Test that the three returned values fit together""" 41 | np.random.seed(626) 42 | points = 200 43 | dim = 500 44 | vector = 99. * (np.random.rand(points, dim) - 0.5) 45 | dist = euclidean_distance(vector) 46 | k = 10 47 | Sk10, Dk10, Nk10 = hubness(dist, k=k) 48 | # Dk is just checked for correct shape 49 | correct_dim_Dk10 = Dk10.shape == (points, k) 50 | # Count k-occurence (different method than in module) 51 | Dk10 = Dk10.ravel() 52 | Nk10_true = np.zeros(points, dtype=int) 53 | for i in range(points): 54 | Nk10_true[i] = (Dk10 == i).sum() 55 | correct_Nk10 = np.all(Nk10 == Nk10_true) 56 | # Calculate skewness (different method than in module) 57 | x0 = Nk10 - Nk10.mean() 58 | s2 = (x0**2).mean() 59 | m3 = (x0**3).mean() 60 | s = m3 / (s2**1.5) 61 | Sk10_true = s 62 | correct_Sk10 = Sk10 == Sk10_true 63 | return self.assertTrue(correct_dim_Dk10 64 | and correct_Nk10 65 | and correct_Sk10) 66 | 67 | def test_parallel_hubness_equal_serial_hubness_distance_based(self): 68 | S_k_p, D_k_p, N_k_p = hubness( 69 | self.dist, k=5, metric='distance', verbose=True, n_jobs=-1) 70 | S_k_s, D_k_s, N_k_s = hubness( 71 | self.dist, k=5, metric='distance', verbose=False, n_jobs=1) 72 | np.testing.assert_array_almost_equal(S_k_p, S_k_s, decimal=7) 73 | np.testing.assert_array_almost_equal(D_k_p, D_k_s, decimal=7) 74 | np.testing.assert_array_almost_equal(N_k_p, N_k_s, decimal=7) 75 | 76 | def test_parallel_hubness_equal_serial_hubness_similarity_based(self): 77 | similarity = random_sparse_matrix(size=1000) 78 | S_k_p, D_k_p, N_k_p = hubness( 79 | similarity, k=5, metric='similarity', verbose=False, n_jobs=-1) 80 | S_k_s, D_k_s, N_k_s = hubness( 81 | similarity, k=5, metric='similarity', verbose=False, n_jobs=1) 82 | np.testing.assert_array_almost_equal(S_k_p, S_k_s, decimal=7) 83 | np.testing.assert_array_almost_equal(D_k_p, D_k_s, decimal=7) 84 | np.testing.assert_array_almost_equal(N_k_p, N_k_s, decimal=7) 85 | 86 | class TestHubnessClass(unittest.TestCase): 87 | """Test hubness calculations""" 88 | 89 | def setUp(self): 90 | """Hubness truth: S_k=5, skewness calculated with bias""" 91 | np.random.seed(123) 92 | self.X = np.random.rand(100, 50) 93 | self.D = euclidean_distance(self.X) 94 | self.verbose = 1 95 | 96 | def tearDown(self): 97 | del self.X 
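    # Editor's note -- a minimal sketch of the Hubness estimator API that the
    # tests below exercise (all attribute names taken from those tests):
    #
    #     hub = Hubness(k=10, return_k_neighbors=True, return_k_occurrence=True)
    #     hub.fit_transform(X)      # X: (n_samples, n_features)
    #     hub.k_skewness_           # skewness of the k-occurrence distribution
    #     hub.k_neighbors_          # each point's k nearest neighbors
    #     hub.k_occurrence_         # how often each point occurs in k-NN lists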
98 | 99 | def test_hubness_against_distance(self): 100 | """Test hubness class against distance-based methods.""" 101 | Sk_dist, Dk_dist, Nk_dist = hubness(self.D, k=10) 102 | hub = Hubness(k=10, 103 | return_k_neighbors=True, 104 | return_k_occurrence=True, 105 | verbose=self.verbose) 106 | hub.fit_transform(self.X) 107 | Sk_class = hub.k_skewness_ 108 | Dk_class = hub.k_neighbors_ 109 | Nk_class = hub.k_occurrence_ 110 | np.testing.assert_almost_equal(Sk_class, Sk_dist, decimal=10) 111 | np.testing.assert_array_equal(Dk_class, Dk_dist) 112 | np.testing.assert_array_equal(Nk_class, Nk_dist) 113 | hub = Hubness(k=10, 114 | return_k_neighbors=True, 115 | return_k_occurrence=True, 116 | metric='precomputed', 117 | verbose=self.verbose) 118 | hub.fit_transform(self.D, has_self_distances=True) 119 | Sk_class = hub.k_skewness_ 120 | Dk_class = hub.k_neighbors_ 121 | Nk_class = hub.k_occurrence_ 122 | np.testing.assert_almost_equal(Sk_class, Sk_dist, decimal=10) 123 | np.testing.assert_array_equal(Dk_class, Dk_dist) 124 | np.testing.assert_array_equal(Nk_class, Nk_dist) 125 | 126 | def test_hubness_against_vectors(self): 127 | """ Test hubness class against vector-based method. """ 128 | Sk_vect, Dk_vect, Nk_vect = hubness_from_vectors(self.X, k=10) 129 | hub = Hubness(k=10, 130 | return_k_neighbors=True, 131 | return_k_occurrence=True, 132 | verbose=self.verbose) 133 | hub.fit_transform(self.X) 134 | Sk_class = hub.k_skewness_ 135 | Dk_class = hub.k_neighbors_ 136 | Nk_class = hub.k_occurrence_ 137 | np.testing.assert_almost_equal(Sk_class, Sk_vect, decimal=10) 138 | np.testing.assert_array_equal(Dk_class, Dk_vect) 139 | np.testing.assert_array_equal(Nk_class, Nk_vect) 140 | np.testing.assert_array_less( 141 | hub.k_skewness_truncnorm_, hub.k_skewness_) 142 | 143 | def test_hubness_multiprocessing(self): 144 | """ Test multiprocessing capabilities of Hubness. 
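        Serial (n_jobs=1) and parallel (n_jobs=-1) runs must yield identical
        skewness, k-neighbor, and k-occurrence results.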
""" 145 | hub = Hubness(k=10, 146 | return_k_neighbors=True, 147 | return_k_occurrence=True, 148 | n_jobs=1, 149 | verbose=self.verbose) 150 | hub.fit_transform(self.X) 151 | Sk_vect = hub.k_skewness_ 152 | Dk_vect = hub.k_neighbors_ 153 | Nk_vect = hub.k_occurrence_ 154 | hub = Hubness(k=10, 155 | return_k_neighbors=True, 156 | return_k_occurrence=True, 157 | n_jobs=-1, 158 | verbose=self.verbose) 159 | hub.fit_transform(self.X) 160 | Sk_mp = hub.k_skewness_ 161 | Dk_mp = hub.k_neighbors_ 162 | Nk_mp = hub.k_occurrence_ 163 | np.testing.assert_almost_equal(Sk_mp, Sk_vect, decimal=10) 164 | np.testing.assert_array_equal(Dk_mp, Dk_vect) 165 | np.testing.assert_array_equal(Nk_mp, Nk_vect) 166 | np.testing.assert_array_less( 167 | hub.k_skewness_truncnorm_, hub.k_skewness_) 168 | 169 | def test_hubness_independent_on_data_set_size(self): 170 | thousands = 3 171 | n_objects = thousands * 1_000 172 | X = np.random.rand(n_objects, 128) 173 | N_SAMPLES = np.arange(1, thousands + 1) * 1_000 174 | Sk_trunc = np.empty(N_SAMPLES.size) 175 | for i, n_samples in enumerate(N_SAMPLES): 176 | ind = np.random.permutation(n_objects)[:n_samples] 177 | X_sample = X[ind, :] 178 | hub = Hubness() 179 | hub.fit_transform(X_sample) 180 | Sk_trunc[i] = hub.k_skewness_truncnorm_ 181 | if i > 0: 182 | np.testing.assert_allclose( 183 | Sk_trunc[i], Sk_trunc[i-1], rtol=1e-1, 184 | err_msg=(f'Hubness measure is too dependent on data set ' 185 | f'size with S({N_SAMPLES[i]}) = x ' 186 | f'and S({N_SAMPLES[i-1]}) = y.')) 187 | np.testing.assert_allclose(Sk_trunc[-1], Sk_trunc[0], rtol=1e-1) 188 | 189 | def test_hubness_from_sparse_precomputed_matrix(self): 190 | # Generate high-dimensional data 191 | X, y = make_classification(n_samples=1000, 192 | n_features=100, 193 | n_informative=100, 194 | n_redundant=0, 195 | n_repeated=0, 196 | random_state=123) 197 | X = X.astype(np.float32) 198 | y = y.astype(np.int32) 199 | for hr_algorithm in VALID_HR: #['dsl']:# 200 | for sampling_algorithm in VALID_SAMPLE: #['hnsw', 'lsh']:# 201 | for n_samples in [50, 100]: 202 | print(f'Test {hr_algorithm}, {sampling_algorithm}, ' 203 | f'with {n_samples} samples.') 204 | self.hubness_from_sparse_precomputed_matrix( 205 | X, y, hr_algorithm, sampling_algorithm, n_samples) 206 | 207 | def hubness_from_sparse_precomputed_matrix(self, X, y, hr, 208 | sample, n_samples): 209 | # Make train-test split 210 | X_train, X_test, y_train, _ = train_test_split(X, y) 211 | # Obtain a sparse distance matrix 212 | ahr = ApproximateHubnessReduction( 213 | hr_algorithm=hr, sampling_algorithm=sample, n_samples=n_samples) 214 | ahr.fit(X_train, y_train) 215 | _ = ahr.transform(X_test) 216 | D_test_csr = ahr.sec_dist_sparse_ 217 | # Hubness in sparse matrix 218 | hub = Hubness(k=10, 219 | metric='precomputed', 220 | return_k_neighbors=True, 221 | shuffle_equal=False, 222 | verbose=self.verbose) 223 | hub.fit_transform(D_test_csr) 224 | Sk_trunc_sparse = hub.k_skewness_truncnorm_ 225 | Sk_sparse = hub.k_skewness_ 226 | k_neigh_sparse = hub.k_neighbors_ 227 | # Hubness in dense matrix 228 | try: 229 | D_test_dense = D_test_csr.toarray() 230 | except AttributeError: 231 | return # Without sampling, the distance matrix is not sparse 232 | D_test_dense[D_test_dense == 0] = np.finfo(np.float32).max 233 | hub_dense = Hubness(k=10, 234 | metric='precomputed', 235 | return_k_neighbors=True, 236 | shuffle_equal=False) 237 | hub_dense.fit_transform(D_test_dense) 238 | Sk_trunc_dense = hub_dense.k_skewness_truncnorm_ 239 | Sk_dense = hub_dense.k_skewness_ 240 | 
k_neigh_dense = hub_dense.k_neighbors_ 241 | if hr in ['MP', 'MPG']: 242 | decimal = 1 243 | else: 244 | decimal = 5 245 | try: 246 | np.testing.assert_array_equal( 247 | k_neigh_dense.ravel(), k_neigh_sparse) 248 | except AssertionError: 249 | s1 = k_neigh_dense.sum() 250 | s2 = k_neigh_sparse.sum() 251 | sm = max(s1, s2) 252 | print(f'k_neighbors not identical, but close: ' 253 | f'{s1}, {s2}, {s1/s2}.') 254 | np.testing.assert_allclose(s2/sm, s1/sm, rtol=1e-2) 255 | np.testing.assert_array_almost_equal( 256 | Sk_sparse, Sk_dense, decimal=decimal) 257 | np.testing.assert_array_almost_equal( 258 | Sk_trunc_sparse, Sk_trunc_dense, decimal=decimal) 259 | 260 | if __name__ == "__main__": 261 | unittest.main() 262 | -------------------------------------------------------------------------------- /tests/hubnessanalysis_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | This file is part of the HUB TOOLBOX available at 5 | https://github.com/OFAI/hub-toolbox-python3/ 6 | The HUB TOOLBOX is licensed under the terms of the GNU GPLv3. 7 | 8 | (c) 2016-2018, Roman Feldbauer 9 | Austrian Research Institute for Artificial Intelligence (OFAI) 10 | Contact: 11 | """ 12 | import unittest 13 | import numpy as np 14 | from hub_toolbox import hubness_analysis 15 | from hub_toolbox.distances import euclidean_distance 16 | 17 | class TestHubnessAnalysis(unittest.TestCase): 18 | """Test the hubness_analysis class (check for results, 19 | but not for *correct* results.) 20 | """ 21 | 22 | def setUp(self): 23 | points = 100 24 | dim = 10 25 | self.vector = 99. * (np.random.rand(points, dim) - 0.5) 26 | self.label = np.random.randint(0, 5, points) 27 | self.dist = euclidean_distance(self.vector) 28 | self.SEC_DIST = set(['mp', 'mp_gaussi', 'mp_gammai', 29 | 'ls', 'nicdm', 'snn', 'cent', 'wcent', 'lcent', 30 | 'dsg', 'dsl', 'orig']) 31 | 32 | def tearDown(self): 33 | del self.dist, self.label, self.vector, self.SEC_DIST 34 | 35 | def test_all_sec_dist_are_covered_in_unittests(self): 36 | n_self_sec_dist = len(self.SEC_DIST) 37 | hub_ana_sec_dist = set(hubness_analysis.SEC_DIST.keys()) 38 | n_intersection = len(hub_ana_sec_dist & self.SEC_DIST) 39 | return self.assertEqual(n_self_sec_dist, n_intersection) 40 | 41 | def test_all_sec_dist_have_header(self): 42 | ha_sec_dist = set(hubness_analysis.SEC_DIST.keys()) 43 | header_sec_dist = set(hubness_analysis.HubnessAnalysis()._header.keys()) 44 | n_sec_dist = len(ha_sec_dist) 45 | n_intersection = len(ha_sec_dist & header_sec_dist) 46 | return self.assertEqual(n_sec_dist, n_intersection) 47 | 48 | def test_all_sec_dist_types(self): 49 | got_all_results = True 50 | for dist_type in self.SEC_DIST: 51 | got_all_results &= self._perform(dist_type) 52 | return self.assertTrue(got_all_results) 53 | 54 | def _perform(self, dist_type): 55 | """Test whether the given secondary distance type is supported.""" 56 | ana = hubness_analysis.HubnessAnalysis( 57 | self.dist, self.label, self.vector, 'distance') 58 | ana = ana.analyze_hubness( 59 | experiments=dist_type, print_results=True, verbose=1) 60 | exp = ana.experiments[0] 61 | got_all_results = \ 62 | (exp.secondary_distance is not None and 63 | len(exp.hubness) > 0 and 64 | len(exp.anti_hubs) > 0 and 65 | len(exp.max_hub_k_occurence) > 0 and 66 | len(exp.knn_accuracy) > 0 and 67 | exp.gk_index is not None and 68 | ana.intrinsic_dim is not None) 69 | return got_all_results 70 | 71 | def 
test_hubness_analysis_only_with_distances(self): 72 | """ Check correct handling when no labels, vectors are given.""" 73 | ana = hubness_analysis.HubnessAnalysis(self.dist) 74 | ana = ana.analyze_hubness("orig") 75 | exp = ana.experiments[0] 76 | got_all_results = \ 77 | (exp.secondary_distance is not None and 78 | len(exp.hubness) > 0 and 79 | len(exp.anti_hubs) > 0 and 80 | len(exp.max_hub_k_occurence) > 0 and 81 | exp.gk_index is not None and 82 | ana.intrinsic_dim is not None) 83 | return got_all_results 84 | 85 | if __name__ == "__main__": 86 | unittest.main() 87 | -------------------------------------------------------------------------------- /tests/intrinsicdim_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | This file is part of the HUB TOOLBOX available at 5 | https://github.com/OFAI/hub-toolbox-python3/ 6 | The HUB TOOLBOX is licensed under the terms of the GNU GPLv3. 7 | 8 | (c) 2016-2018, Roman Feldbauer 9 | Austrian Research Institute for Artificial Intelligence (OFAI) 10 | Contact: 11 | """ 12 | import unittest 13 | import numpy as np 14 | from hub_toolbox.io import load_dexter 15 | from hub_toolbox.intrinsic_dimension import intrinsic_dimension 16 | 17 | class TestIntrinsicDim(unittest.TestCase): 18 | 19 | def setUp(self): 20 | self.vector = np.random.rand(50, 2) 21 | 22 | def tearDown(self): 23 | del self.vector 24 | 25 | def test_intrinsic_dim_mle_levina(self): 26 | """Test against value calc. by matlab reference implementation.""" 27 | _, _, vector = load_dexter() 28 | ID_MLE_REF = 74.472 29 | id_mle = intrinsic_dimension(vector, k1=6, k2=12, 30 | estimator='levina', metric='vector', trafo=None) 31 | return np.testing.assert_almost_equal(id_mle, ID_MLE_REF, decimal=3) 32 | 33 | def test_intrinsic_dim_mle_levina_low_memory(self): 34 | """ Same as above, but invoking the speed-memory trade-off. """ 35 | _, _, vector = load_dexter() 36 | ID_MLE_REF = 74.472 37 | id_mle = intrinsic_dimension(vector, 6, 12, 'levina', 38 | 'vector', None, mem_threshold=0) 39 | return np.testing.assert_almost_equal(id_mle, ID_MLE_REF, decimal=3) 40 | 41 | def test_incorrect_est_params(self): 42 | """ Test handling of incorrect estimator. 
""" 43 | with self.assertRaises(ValueError): 44 | intrinsic_dimension(self.vector, 45 | estimator='the_single_truly_best_id_estimator') 46 | 47 | def test_incorrect_k1_params(self): 48 | """ Test handling of incorrect neighborhood parameters.""" 49 | with self.assertRaises(ValueError): 50 | intrinsic_dimension(self.vector, k1=0) 51 | 52 | def test_incorrect_k12_params(self): 53 | """ Test handling of incorrect neighborhood parameters.""" 54 | with self.assertRaises(ValueError): 55 | intrinsic_dimension(self.vector, k1=6, k2=4) 56 | 57 | def test_incorrect_k2_params(self): 58 | """ Test handling of incorrect neighborhood parameters.""" 59 | n = self.vector.shape[0] 60 | with self.assertRaises(ValueError): 61 | intrinsic_dimension(self.vector, k2=n) 62 | 63 | def test_incorrect_trafo_params(self): 64 | """ Test handling of incorrect transformation parameters.""" 65 | with self.assertRaises(ValueError): 66 | intrinsic_dimension(self.vector, trafo=0) 67 | 68 | def test_incorrect_metric_dist(self): 69 | """ Test handling of unsupported metric parameters.""" 70 | with self.assertRaises(NotImplementedError): 71 | intrinsic_dimension(self.vector, metric='distance') 72 | 73 | def test_incorrect_metric_sim(self): 74 | """ Test handling of unsupported metric parameters.""" 75 | with self.assertRaises(NotImplementedError): 76 | intrinsic_dimension(self.vector, metric='similarity') 77 | 78 | def test_incorrect_metric_other(self): 79 | """ Test handling of unsupported metric parameters.""" 80 | with self.assertRaises(ValueError): 81 | intrinsic_dimension(self.vector, metric=None) 82 | 83 | if __name__ == "__main__": 84 | unittest.main() 85 | -------------------------------------------------------------------------------- /tests/io_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | This file is part of the HUB TOOLBOX available at 5 | https://github.com/OFAI/hub-toolbox-python3/ 6 | The HUB TOOLBOX is licensed under the terms of the GNU GPLv3. 7 | 8 | (c) 2016-2018, Roman Feldbauer 9 | Austrian Research Institute for Artificial Intelligence (OFAI) 10 | Contact: 11 | """ 12 | import unittest 13 | import tempfile 14 | import numpy as np 15 | from scipy.sparse.csr import csr_matrix 16 | from hub_toolbox.io import random_sparse_matrix, load_dexter 17 | from hub_toolbox import io 18 | 19 | class TestIO(unittest.TestCase): 20 | 21 | def setUp(self): 22 | np.random.seed(626) 23 | self.matrix_n = 500 24 | self.density = 0.02 25 | self.similarity = random_sparse_matrix( 26 | size=self.matrix_n, density=self.density) 27 | 28 | def tearDown(self): 29 | del self.matrix_n, self.density, self.similarity 30 | 31 | def test_save_and_load_csr_matrix(self): 32 | tmp = tempfile.mkstemp(suffix='.npz')[1] 33 | io_sim = io.load_csr_matrix(io.save_csr_matrix(tmp, self.similarity)) 34 | # If both are identical, the difference must be all-zeros 35 | return self.assertEqual((self.similarity - io_sim).nnz, 0.) 
36 | 37 | def test_random_sparse_similarity_matrix_quadratic_form(self): 38 | return self.assertEqual( 39 | self.similarity.shape[0], self.similarity.shape[1]) 40 | 41 | def test_random_sparse_similarity_matrix_correct_size(self): 42 | return self.assertEqual(self.similarity.shape[0], self.matrix_n) 43 | 44 | def test_random_sparse_similarity_matrix_correct_type(self): 45 | return self.assertIsInstance(self.similarity, csr_matrix) 46 | 47 | def test_random_sparse_similarity_matrix_symmetric(self): 48 | non_symmetric_entry = \ 49 | (self.similarity - self.similarity.T != 0.).nnz > 0 50 | return self.assertFalse(non_symmetric_entry) 51 | 52 | def test_random_sparse_similarity_matrix_min_zero(self): 53 | return self.assertGreaterEqual(self.similarity.min(), 0.) 54 | 55 | def test_random_sparse_similarity_matrix_max_one(self): 56 | return self.assertLessEqual(self.similarity.max(), 1.) 57 | 58 | def test_random_sparse_similarity_matrix_self_similarity_one(self): 59 | all_diag_ones = np.all(self.similarity.diagonal() == 1) 60 | return self.assertTrue(all_diag_ones) 61 | 62 | def test_random_sparse_similarity_matrix_density(self): 63 | return self.assertAlmostEqual( 64 | self.similarity.nnz / self.matrix_n**2, self.density*2, places=2) # symmetrization approx. doubles nnz, hence density*2 65 | 66 | def test_load_dexter(self): 67 | """Loading dexter, checking shape of distances, labels, vectors""" 68 | self.dist, self.lab, self.vect = load_dexter() 69 | symm_dist_shape = self.dist.shape[0] == self.dist.shape[1] 70 | corr_dist_shape = self.dist.shape[0] == self.vect.shape[0] 71 | corr_label_shape = self.lab.shape[0] == self.vect.shape[0] 72 | return self.assertTrue( 73 | symm_dist_shape and corr_dist_shape and corr_label_shape) 74 | 75 | def test_check_shape(self): 76 | with self.assertRaises(TypeError): 77 | d = np.empty((2, 3)) 78 | io.check_distance_matrix_shape(d) 79 | 80 | def test_check_dist_vs_classes(self): 81 | with self.assertRaises(TypeError): 82 | D = np.empty((5, 5)) 83 | classes = np.empty(4) 84 | io.check_distance_matrix_shape_fits_labels(D, classes) 85 | 86 | def test_check_dist_vs_vectors(self): 87 | with self.assertRaises(TypeError): 88 | D = np.zeros((5, 5)) 89 | vectors = np.zeros((4, 5)) 90 | io.check_distance_matrix_shape_fits_vectors(D, vectors) 91 | 92 | def test_check_valid_metric(self): 93 | with self.assertRaises(ValueError): 94 | metric = 'dissimilarity' 95 | io.check_valid_metric_parameter(metric) 96 | 97 | if __name__ == "__main__": 98 | unittest.main() 99 | -------------------------------------------------------------------------------- /tests/knn_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | This file is part of the HUB TOOLBOX available at 5 | https://github.com/OFAI/hub-toolbox-python3/ 6 | The HUB TOOLBOX is licensed under the terms of the GNU GPLv3. 
7 | 8 | (c) 2016-2018, Roman Feldbauer 9 | Austrian Research Institute for Artificial Intelligence (OFAI) 10 | Contact: 11 | """ 12 | import unittest 13 | import numpy as np 14 | from scipy.sparse.csr import csr_matrix 15 | try: # for scikit-learn >= 0.18 16 | from sklearn.model_selection import LeaveOneOut, cross_val_predict 17 | except ImportError: # lower scikit-learn versions 18 | from sklearn.cross_validation import LeaveOneOut, cross_val_predict 19 | from sklearn.neighbors import KNeighborsClassifier 20 | from sklearn.metrics import accuracy_score, f1_score as f1_score_sklearn 21 | from sklearn.preprocessing import LabelEncoder, LabelBinarizer, OneHotEncoder 22 | from hub_toolbox.distances import sample_distance 23 | from hub_toolbox.io import load_dexter, random_sparse_matrix 24 | from hub_toolbox.knn_classification import \ 25 | score, predict, f1_score, r_precision, f1_macro, f1_micro, f1_weighted 26 | 27 | 28 | class TestKnnClassification(unittest.TestCase): 29 | 30 | def setUp(self): 31 | self.distance, self.label, self.vector = load_dexter() 32 | self.n = self.distance.shape[0] 33 | 34 | def tearDown(self): 35 | del self.distance, self.label, self.vector 36 | 37 | def test_r_precision_does_not_error(self): 38 | """ Does not test correctness of result! """ 39 | sim = csr_matrix(1 - self.distance) 40 | y = self.label 41 | r = r_precision(sim, y, metric='similarity', return_y_pred=1, 42 | verbose=1, n_jobs=2) 43 | r_precision_weighted = r['weighted'] 44 | r_precision_macro = r['macro'] 45 | y_pred = np.array(r['y_pred']) 46 | acc = (y == y_pred.ravel()).sum() / self.label.size 47 | return self.assertTrue( 48 | r_precision_weighted >= 0. and r_precision_macro >= 0. 49 | and acc > 0.80) 50 | 51 | def test_r_precision(self): 52 | y = [ 0, 1, 1, 0, 1 , 2] 53 | sim = [[1.0, 0.6, 0.0, 0.0, 0.0, 0], # 0 / 1 .. 1 nnz 54 | [0.6, 1.0, 0.0, 0.0, 0.7, 0], # 1 / 2 .. 2 nnz 55 | [0.0, 0.0, 1.0, 0.0, 0.0, 0], # 0 / 2 .. 0 nnz 56 | [0.0, 0.0, 0.0, 1.0, 0.0, 0], # 0 / 1 .. 0 nnz 57 | [0.0, 0.7, 0.0, 0.0, 1.0, 0], # 1 / 2 .. 1 nnz 58 | [0.0, 0.0, 0.0, 0.0, 0.0, 1]] # 0 / 0 .. 1 nnz 59 | sim = csr_matrix(np.array(sim)) 60 | y = np.array(y) 61 | r = r_precision(sim, y, metric='similarity', return_y_pred=2, 62 | verbose=1, n_jobs=2) 63 | rpw = r['weighted'] 64 | rpm = r['macro'] 65 | r_peritem = r['per_item'] 66 | relevant_items = r['relevant_items'] 67 | y_return = r['y_true'] 68 | rppiw = np.average(r_peritem, weights=relevant_items[y_return]) 69 | return self.assertListEqual([rpw, rpm, rppiw], [0.25, 1/6, rpw]) 70 | 71 | def test_knn_sparse_does_not_error(self): 72 | """ Does not test correctness of result! 
""" 73 | sim = random_sparse_matrix(100, 0.1) 74 | y = np.random.randint(0, 2, 100) 75 | acc, _, _ = score(sim, y, k=[1,5,10], metric='similarity') 76 | return self.assertTrue(np.alltrue(acc >= 0.)) 77 | 78 | def test_knn_sparse_equal_dense(self): 79 | sim_dense = 1 - self.distance 80 | sim_sparse = csr_matrix(sim_dense) 81 | acc_dense, _, _ = score(sim_dense, self.label, metric='similarity') 82 | acc_sparse, _, _ = score(sim_sparse, self.label, metric='similarity') 83 | return self.assertEqual(acc_dense, acc_sparse) 84 | 85 | def test_knn_predict_equal_sklearn_loocv_predict(self): 86 | y = LabelEncoder().fit_transform(self.label) 87 | y_pred = predict(self.distance, y, k=5, 88 | metric='distance', return_cmat=False)[0].ravel() 89 | knn = KNeighborsClassifier( 90 | n_neighbors=5, algorithm='brute', metric='precomputed') 91 | n = self.distance.shape[0] # for LOO-CV 92 | try: # sklearn < 0.18 93 | loo_cv = LeaveOneOut(n) 94 | except TypeError: 95 | loo_cv = LeaveOneOut() 96 | y_pred_sklearn = cross_val_predict( 97 | knn, self.distance, y, cv=loo_cv) 98 | return self.assertTrue(np.alltrue(y_pred == y_pred_sklearn)) 99 | 100 | def test_f1_score(self): 101 | y = LabelBinarizer().fit_transform(self.label).ravel() 102 | y_pred, cmat = predict(self.distance, y, k=5, metric='distance') 103 | y_pred = y_pred.ravel() 104 | knn = KNeighborsClassifier( 105 | n_neighbors=5, algorithm='brute', metric='precomputed') 106 | n = self.distance.shape[0] # for LOO-CV 107 | try: # sklearn < 0.18 108 | loo_cv = LeaveOneOut(n) 109 | except TypeError: 110 | loo_cv = LeaveOneOut() 111 | y_pred_sklearn = cross_val_predict( 112 | knn, self.distance, y, cv=loo_cv) 113 | f1_binary_hub = f1_score(cmat[0, 0, :, :]) 114 | f1_binary_sklearn = f1_score_sklearn( 115 | y, y_pred_sklearn, average='binary') 116 | return self.assertEqual(f1_binary_hub, f1_binary_sklearn) 117 | 118 | def test_f1_micro_macro_weighted(self): 119 | y = np.random.randint(0, 5, self.label.size).reshape(-1, 1) 120 | y = OneHotEncoder().fit_transform(y).toarray() 121 | y_pred, cmat = predict(self.distance, y, k=5, metric='distance') 122 | y_pred = y_pred[0] 123 | knn = KNeighborsClassifier( 124 | n_neighbors=5, algorithm='brute', metric='precomputed') 125 | n = self.distance.shape[0] # for LOO-CV 126 | try: # sklearn < 0.18 127 | loo_cv = LeaveOneOut(n) 128 | except TypeError: 129 | loo_cv = LeaveOneOut() 130 | y_pred_sklearn = cross_val_predict( 131 | knn, self.distance, y, cv=loo_cv) 132 | f1_hub = [f1_macro(cmat[0]), f1_micro(cmat[0]), f1_weighted(cmat[0])] 133 | f1_sklearn = [f1_score_sklearn(y, y_pred_sklearn, average='macro'), 134 | f1_score_sklearn(y, y_pred_sklearn, average='micro'), 135 | f1_score_sklearn(y, y_pred_sklearn, average='weighted')] 136 | return self.assertListEqual(f1_hub, f1_sklearn) 137 | 138 | def test_knn_score_matches_correct_prediction_fraction(self): 139 | k = np.array([1, 5, 20]) 140 | acc, correct, _ = score(self.distance, self.label, k=k) 141 | acc_match = np.zeros_like(k, dtype=bool) 142 | for i, _ in enumerate(k): 143 | cur_acc = acc[i] 144 | cur_correct = correct[i] 145 | acc_match[i] = np.allclose(cur_acc, cur_correct.sum() / self.n) 146 | return self.assertTrue(np.all(acc_match)) 147 | 148 | def test_knn_score_matches_confusion_matrix(self): 149 | k = np.array([1, 5, 20]) 150 | acc, _, cmat = score(self.distance, self.label, k=k) 151 | acc_match = np.zeros_like(k, dtype=bool) 152 | for i, _ in enumerate(k): 153 | cur_acc = acc[i] 154 | cur_cmat = cmat[i] 155 | TP = cur_cmat[0, 0] 156 | FN = cur_cmat[0, 1] 157 | FP = 
cur_cmat[1, 0] 158 | TN = cur_cmat[1, 1] 159 | acc_from_cmat = (TP + TN) / (TP + FN + FP + TN) 160 | acc_match[i] = np.allclose(cur_acc, acc_from_cmat) 161 | return self.assertTrue(np.all(acc_match)) 162 | 163 | def test_knn_score_equal_sklearn_loocv_score(self): 164 | acc, correct, cmat = \ 165 | score(self.distance, self.label, k=5, metric='distance') 166 | # scoring only one k value, so take just the first elements: 167 | acc = acc[0, 0] 168 | correct = correct[0] 169 | cmat = cmat[0] 170 | knclassifier = KNeighborsClassifier(n_neighbors=5, algorithm='brute', 171 | metric='precomputed') 172 | n = self.distance.shape[0] # for LOO-CV 173 | try: # sklearn < 0.18 174 | loo_cv = LeaveOneOut(n) 175 | except TypeError: 176 | loo_cv = LeaveOneOut() 177 | predicted_sklearn = cross_val_predict( 178 | knclassifier, self.distance, self.label, cv=loo_cv) 179 | acc_sklearn = accuracy_score(self.label, predicted_sklearn) 180 | if not np.allclose(acc, acc_sklearn): 181 | return self.assertAlmostEqual(acc, acc_sklearn, places=7) 182 | else: 183 | correct_sklearn = predicted_sklearn == self.label 184 | equal_prediction = np.all(correct == correct_sklearn) 185 | msg = """Accuracies of hub toolbox k-NN and sklearn-kNN are almost 186 | equal, but the predictions per data point are not.""" 187 | return self.assertTrue(equal_prediction, msg) 188 | 189 | def test_sample_knn(self): 190 | """ Make sure that sample-kNN works correctly. """ 191 | # TODO create a stricter test 192 | X = np.array([[1., 2.], 193 | [2., 2.], 194 | [2., 3.], 195 | [3., .5], 196 | [4., 1.5]]) 197 | y = np.array([0, 1, 0, 1, 1]) 198 | s = 2 199 | rnd = 1234 200 | D, sample_idx = sample_distance(X, y, s, random_state=rnd) 201 | expected_sample_idx = np.array([4, 2]) 202 | expected_acc = 0.4 203 | if not np.setdiff1d(sample_idx, expected_sample_idx).size == \ 204 | np.setdiff1d(expected_sample_idx, sample_idx).size == 0: 205 | return self.fail("Test implementation broken: wrong sample.") 206 | acc, _, _ = score(D=D, target=y, k=2, metric='distance', 207 | sample_idx=sample_idx) 208 | return self.assertEqual(expected_acc, acc[0, 0]) 209 | 210 | 211 | if __name__ == "__main__": 212 | unittest.main() 213 | -------------------------------------------------------------------------------- /tests/localscaling_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | This file is part of the HUB TOOLBOX available at 5 | https://github.com/OFAI/hub-toolbox-python3/ 6 | The HUB TOOLBOX is licensed under the terms of the GNU GPLv3. 7 | 8 | (c) 2016-2018, Roman Feldbauer 9 | Austrian Research Institute for Artificial Intelligence (OFAI) 10 | Contact: 11 | """ 12 | import unittest 13 | import numpy as np 14 | from scipy.spatial.distance import squareform 15 | from hub_toolbox.distances import euclidean_distance 16 | from hub_toolbox.local_scaling import local_scaling, nicdm 17 | from hub_toolbox.hubness import hubness 18 | from hub_toolbox.knn_classification import score 19 | 20 | class TestLocalScaling(unittest.TestCase): 21 | """Unit tests for the LocalScaling class""" 22 | 23 | def setUpMod(self, mode='rnd'): 24 | np.random.seed(626) 25 | if mode == 'rnd': 26 | points = 200 # 200 27 | dim = 500 # 500 28 | self.vector = 99. 
* (np.random.rand(points, dim) - 0.5) 29 | self.label = np.random.randint(0, 5, points) 30 | self.dist = euclidean_distance(self.vector) 31 | elif mode == 'toy': 32 | # LS/NICDM ground truth calculated in spreadsheet for toy example 33 | self.dist = squareform([.2, .1, .8, .4, .3, .5, .7, 1., .6, .9]) 34 | self.ls_dist_truth = squareform( 35 | [0.486582881, 0.1535182751, 0.9816843611, 0.7364028619, 36 | 0.6321205588, 0.6471339185, 0.9342714714, 0.9844961464, 37 | 0.8646647168, 0.8150186001]) 38 | self.nicdm_dist_truth = squareform( 39 | [0.310029690448236, 0.173311865721368, 0.769089007390428, 40 | 0.438448192970227, 0.402740381783397, 0.37233361467179, 41 | 0.594335892341949, 0.832563272714335, 0.569560910033398, 42 | 0.473903322836619]) 43 | self.vector = None 44 | self.label = None 45 | 46 | def tearDown(self): 47 | del self.dist, self.label, self.vector 48 | 49 | def test_local_scaling(self): 50 | self.setUpMod('toy') 51 | dist_calc = local_scaling(self.dist, k=2) 52 | return np.testing.assert_array_almost_equal( 53 | dist_calc, self.ls_dist_truth, decimal=7) 54 | 55 | def test_ls_basic_requirements(self): 56 | """Test that matrix is symmetric, diag==0, and in range [0, 1]""" 57 | self.setUpMod('rnd') 58 | ls_dist = local_scaling(self.dist) 59 | symmetric = np.all(ls_dist == ls_dist.T) 60 | diag_zero = np.all(ls_dist.diagonal() == 0.) 61 | correct_range = ls_dist.min() >= 0. and ls_dist.max() <= 1. 62 | return self.assertTrue(symmetric and diag_zero and correct_range) 63 | 64 | def test_ls_dist_equals_sim(self): 65 | """Test for equal RANKS using dist. vs. sim. (LS_dist != 1-LS_sim). 66 | Using hubness and k-NN accuracy as proxy.""" 67 | self.setUpMod('rnd') 68 | ls_dist = local_scaling(self.dist, metric='distance') 69 | ls_sim = local_scaling(1 - self.dist, metric='similarity') 70 | h_dist, _, _ = hubness(ls_dist, metric='distance') 71 | h_sim, _, _ = hubness(ls_sim, metric='similarity') 72 | acc_dist, _, _ = score(ls_dist, self.label, metric='distance') 73 | acc_sim, _, _ = score(ls_sim, self.label, metric='similarity') 74 | dist_sim_equal_in_hubness_knn = np.allclose(h_dist, h_sim) and \ 75 | np.allclose(acc_dist, acc_sim) 76 | return self.assertTrue(dist_sim_equal_in_hubness_knn) 77 | 78 | def test_ls_parallel_equals_sequential(self): 79 | self.setUpMod('rnd') 80 | ls_dist_par = local_scaling(self.dist, n_jobs=4) 81 | ls_dist_seq = local_scaling(self.dist, n_jobs=1) 82 | return np.testing.assert_array_equal(ls_dist_seq, ls_dist_par) 83 | 84 | def test_nicdm(self): 85 | self.setUpMod('toy') 86 | dist_calc = nicdm(self.dist, k=2) 87 | return np.testing.assert_array_almost_equal( 88 | dist_calc, self.nicdm_dist_truth, decimal=7) 89 | 90 | def test_nicdm_basic_requirements(self): 91 | """Test that matrix is symmetric, diag==0, and in range [0, inf)""" 92 | self.setUpMod('rnd') 93 | nicdm_dist = nicdm(self.dist) 94 | symmetric = np.all(nicdm_dist == nicdm_dist.T) 95 | diag_zero = np.all(nicdm_dist.diagonal() == 0.) 96 | correct_range = nicdm_dist.min() >= 0. 
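        # Unlike local scaling, NICDM is not bounded above by 1 (range
        # [0, inf), per the docstring), so only non-negativity is checked.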
97 | return self.assertTrue(symmetric and diag_zero and correct_range) 98 | 99 | def test_nicdm_similarity_based(self): 100 | """There is no similarity-based NICDM""" 101 | self.setUpMod('toy') 102 | return self.assertRaises(NotImplementedError, nicdm, 1. - self.dist, k=2, metric='similarity') 103 | 104 | def test_nicdm_parallel_equals_sequential(self): 105 | self.setUpMod('rnd') 106 | ls_dist_par = nicdm(self.dist, n_jobs=4) 107 | ls_dist_seq = nicdm(self.dist, n_jobs=1) 108 | return np.testing.assert_array_equal(ls_dist_seq, ls_dist_par) 109 | 110 | if __name__ == "__main__": 111 | unittest.main() 112 | -------------------------------------------------------------------------------- /tests/logging_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | This file is part of the HUB TOOLBOX available at 5 | https://github.com/OFAI/hub-toolbox-python3/ 6 | The HUB TOOLBOX is licensed under the terms of the GNU GPLv3. 7 | 8 | (c) 2016-2018, Roman Feldbauer 9 | Austrian Research Institute for Artificial Intelligence (OFAI) 10 | Contact: 11 | """ 12 | import unittest 13 | from hub_toolbox.htlogging import Logging, ConsoleLogging, FileLogging 14 | 15 | class TestLogging(unittest.TestCase): 16 | """Minimally test Logging (should switch to std module logging anyway)""" 17 | 18 | def test_unable_to_instantiate_abstract_class_logging(self): 19 | with self.assertRaises(TypeError): 20 | Logging() 21 | 22 | def test_console_logging_has_all_methods(self): 23 | log = ConsoleLogging() 24 | has_all_attributes = hasattr(log, 'message') and \ 25 | hasattr(log, 'warning') and hasattr(log, 'error') 26 | return self.assertTrue(has_all_attributes) 27 | 28 | def test_file_logging_has_all_methods(self): 29 | log = FileLogging() 30 | has_all_attributes = hasattr(log, 'message') and \ 31 | hasattr(log, 'warning') and hasattr(log, 'error') 32 | return self.assertTrue(has_all_attributes) 33 | 34 | def test_message(self): 35 | log = ConsoleLogging() 36 | log.message("Message") 37 | return self 38 | 39 | def test_warning(self): 40 | log = ConsoleLogging() 41 | log.warning("Warning") 42 | return self 43 | 44 | def test_error(self): 45 | log = ConsoleLogging() 46 | log.error("Error") 47 | return self 48 | 49 | if __name__ == "__main__": 50 | unittest.main() 51 | -------------------------------------------------------------------------------- /tests/mutualproximity_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | This file is part of the HUB TOOLBOX available at 5 | https://github.com/OFAI/hub-toolbox-python3/ 6 | The HUB TOOLBOX is licensed under the terms of the GNU GPLv3. 7 | 8 | (c) 2016-2018, Roman Feldbauer 9 | Austrian Research Institute for Artificial Intelligence (OFAI) 10 | Contact: 11 | """ 12 | import unittest 13 | import numpy as np 14 | from hub_toolbox.distances import euclidean_distance 15 | from hub_toolbox.global_scaling import mutual_proximity_empiric,\ 16 | mutual_proximity_gaussi, mutual_proximity_gammai 17 | from scipy.sparse.csr import csr_matrix 18 | from scipy.spatial.distance import squareform 19 | 20 | class TestMutualProximity(unittest.TestCase): 21 | """Unit tests for the MutualProximity class (serial computing)""" 22 | 23 | def setUpMod(self, mode='rnd'): 24 | np.random.seed(626) 25 | if mode == 'rnd': 26 | points = 50 27 | dim = 500 28 | self.vector = 99. 
* (np.random.rand(points, dim) - 0.5) 29 | self.label = np.random.randint(0, 5, points) 30 | self.dist = euclidean_distance(self.vector) 31 | # scale to [0, 1), avoiding 1: otherwise sparseMP != denseMP (by design) 32 | self.dist /= (self.dist.max() + 1e-12) 33 | elif mode == 'toy': 34 | # MP empiric ground truth calculated by hand for this toy example 35 | self.dist = squareform([.2, .1, .8, .4, .3, .5, .7, 1., .6, .9]) 36 | 37 | # MP with div/(n-0) 38 | self.mp_dist_truth = squareform([.6, .4, 1., .8, .6, 39 | .8, 1., 1., .8, 1.]) 40 | """ 41 | # MP with div/(n-1) 42 | self.mp_dist_truth = squareform([.5, .25, 1., .75, .5, 43 | .75, 1., 1., .75, 1.]) 44 | 45 | # MP with div/(n-2) 46 | self.mp_dist_truth = squareform([1/3, 0., 1., 2/3, 1/3, 47 | 2/3, 1., 1., 2/3, 1.]) 48 | """ 49 | self.vector = None 50 | self.label = None 51 | 52 | def tearDown(self): 53 | del self.dist, self.label, self.vector 54 | 55 | def test_mp_empiric_sample(self): 56 | """Test MP Emp Sample equals MP Emp when sample == population""" 57 | self.setUpMod('toy') 58 | mp_dist = mutual_proximity_empiric(self.dist, 'distance') 59 | y = np.array([0, 1, 2, 3, 4]) 60 | mp_sample_dist = mutual_proximity_empiric(D=self.dist, 61 | sample_ind=y, 62 | metric='distance') 63 | return np.testing.assert_array_almost_equal( 64 | mp_dist, mp_sample_dist, decimal=7) 65 | 66 | """ 67 | def test_mp_gaussi_sample(self): 68 | '''Test MP Gaussi Sample.''' 69 | self.setUpMod('toy') 70 | mp_dist = mutual_proximity_gaussi(self.dist) 71 | y = np.array([0, 1, 2, 3, 4]) 72 | mp_sample_dist = mutual_proximity_gaussi(self.dist[:, y], idx=y) 73 | mp_sample_equal_pop = np.alltrue(mp_dist == mp_sample_dist) 74 | #======================================================================= 75 | # print(self.dist) 76 | # print(mp_dist) 77 | # print(mp_sample_dist) 78 | #======================================================================= 79 | print("SampleMP-Gaussi with all pts equals MP-Gaussi:", mp_sample_equal_pop) 80 | y2 = np.array([1, 2, 4]) 81 | mp_sample_dist2 = mutual_proximity_gaussi(self.dist[:, y2], idx=y2) 82 | print(self.dist[:, y2]) 83 | print(mp_dist[:, y2]) 84 | print(mp_sample_dist2) 85 | print(mp_sample_dist) 86 | #return self.assertTrue(mp_sample_equal_pop) 87 | return self.fail() 88 | """ 89 | 90 | def test_mp_empiric(self): 91 | """Test MP Empiric for toy example (ground truth calc by hand)""" 92 | self.setUpMod('toy') 93 | mp_dist_calc = mutual_proximity_empiric(self.dist, 'distance', verbose=1) 94 | return np.testing.assert_array_almost_equal( 95 | mp_dist_calc, self.mp_dist_truth, decimal=7) 96 | 97 | def test_mp_empiric_all_zero_self_distances(self): 98 | self.setUpMod('rnd') 99 | mp_dist_calc = mutual_proximity_empiric(self.dist) 100 | mp_self_distances_all_zero = np.all(mp_dist_calc.diagonal() == 0.) 101 | return self.assertTrue(mp_self_distances_all_zero) 102 | 103 | def test_mp_empiric_symmetric(self): 104 | self.setUpMod('rnd') 105 | mp_dist = mutual_proximity_empiric(self.dist) 106 | return np.testing.assert_array_almost_equal( 107 | mp_dist, mp_dist.T, decimal=14) 108 | 109 | def test_mp_empiric_dist_equal_sim(self): 110 | self.setUpMod('rnd') 111 | sim = 1. - self.dist 112 | mp_dist = mutual_proximity_empiric(self.dist, 'distance') 113 | mp_sim = mutual_proximity_empiric(sim, 'similarity') 114 | return np.testing.assert_array_almost_equal( 115 | mp_dist, 1. - mp_sim, decimal=7) 116 | 117 | def test_mp_empiric_sparse_equal_dense(self): 118 | self.setUpMod('rnd') 119 | sim_dense = 1. 
- self.dist 120 | sim_sparse = csr_matrix(sim_dense) 121 | mp_dense = mutual_proximity_empiric(sim_dense, 'similarity') 122 | mp_sparse = mutual_proximity_empiric( 123 | sim_sparse, 'similarity', verbose=1, n_jobs=4) 124 | return np.testing.assert_array_almost_equal( 125 | mp_dense, mp_sparse.toarray(), decimal=7) 126 | 127 | def test_mp_gaussi(self): 128 | """Test MP GaussI for toy example (ground truth calc by 'hand')""" 129 | self.setUpMod('toy') 130 | mp_gaussi = mutual_proximity_gaussi(self.dist, verbose=1) 131 | # Calculated with formula (3) in JMLR paper, aided by LibreOffice Calc 132 | mp_gaussi_hand = np.array( 133 | [[0.155334048, 0.3466121867, 0.2534339319, 0.971773078, 0.575452874], 134 | [0.3466121867, 0.0267023937, 0.4637020361, 0.6708772779, 0.9702788336], 135 | [0.2534339319, 0.4637020361, 0.1354428205, 0.9899969991, 0.7660250185], 136 | [0.971773078, 0.6708772779, 0.9899969991, 1.90126724466388e-05, 0.975462801], 137 | [0.575452874, 0.9702788336, 0.7660250185, 0.975462801, 0.0003114667]]) 138 | # Gaussians can go below distance 0; self dist anyway defined as 0. 139 | np.fill_diagonal(mp_gaussi_hand, 0.) 140 | return np.testing.assert_array_almost_equal( 141 | mp_gaussi, mp_gaussi_hand, decimal=7) 142 | 143 | def test_mp_gaussi_all_zero_self_distances(self): 144 | self.setUpMod('rnd') 145 | mp_dist = mutual_proximity_gaussi(self.dist) 146 | mp_self_dist_all_zero = np.all(mp_dist.diagonal() == 0.) 147 | return self.assertTrue(mp_self_dist_all_zero) 148 | 149 | def test_mp_gaussi_symmetric(self): 150 | self.setUpMod('rnd') 151 | mp_dist = mutual_proximity_gaussi(self.dist) 152 | return np.testing.assert_array_almost_equal( 153 | mp_dist, mp_dist.T, decimal=7) 154 | 155 | def test_mp_gaussi_dist_equal_sim(self): 156 | self.setUpMod('rnd') 157 | sim = 1. - self.dist 158 | mp_dist = mutual_proximity_gaussi(self.dist, 'distance') 159 | mp_sim = mutual_proximity_gaussi(sim, 'similarity') 160 | return np.testing.assert_array_almost_equal( 161 | mp_dist, 1. - mp_sim, decimal=7) 162 | 163 | def test_mp_gaussi_sparse_equal_dense(self): 164 | self.setUpMod('rnd') 165 | sim_dense = 1. - self.dist 166 | sim_sparse = csr_matrix(sim_dense) 167 | mp_dense = mutual_proximity_gaussi(sim_dense, 'similarity') 168 | mp_sparse = mutual_proximity_gaussi(sim_sparse, 'similarity') 169 | return np.testing.assert_array_almost_equal( 170 | mp_dense, mp_sparse.toarray(), decimal=7) 171 | 172 | def test_mp_gammai(self): 173 | """Test MP GammaI for toy example (ground truth calc by 'hand')""" 174 | self.setUpMod('toy') 175 | mp_gammai = mutual_proximity_gammai(self.dist, verbose=1) 176 | # Calculated with formula (3) in JMLR paper, aided by LibreOffice Calc 177 | mp_gammai_hand = np.array( 178 | [[0., 0.4334769987, 0.230927083, 0.9558409888, 0.6744697939], 179 | [0.4334769987, 0., 0.5761291218, 0.7088478962, 0.9585297208], 180 | [0.230927083, 0.5761291218, 0., 0.9817785746, 0.8286910286], 181 | [0.9558409888, 0.7088478962, 0.9817785746, 0., 0.9646050169], 182 | [0.6744697939, 0.9585297208, 0.8286910286, 0.9646050169, 0.]]) 183 | return np.testing.assert_array_almost_equal( 184 | mp_gammai, mp_gammai_hand, decimal=7) 185 | 186 | def test_mp_gammai_all_zero_self_distances(self): 187 | self.setUpMod('rnd') 188 | mp_dist = mutual_proximity_gammai(self.dist) 189 | mp_self_dist_all_zero = np.all(mp_dist.diagonal() == 0.) 
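        # As with the MP empiric and GaussI variants above, self-distances
        # are defined to be 0, hence the all-zero diagonal.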
190 | return self.assertTrue(mp_self_dist_all_zero) 191 | 192 | def test_mp_gammai_symmetric(self): 193 | self.setUpMod('rnd') 194 | mp_dist = mutual_proximity_gammai(self.dist) 195 | return np.testing.assert_array_almost_equal( 196 | mp_dist, mp_dist.T, decimal=7) 197 | 198 | def test_mp_gammai_dist_equal_sim(self): 199 | self.setUpMod('rnd') 200 | #===================================================================== 201 | # sim = 1. - self.dist 202 | # mp_dist = mutual_proximity_gammai(self.dist, 'distance') 203 | # mp_sim = mutual_proximity_gammai(sim, 'similarity') 204 | # dist_allclose_one_minus_sim = np.allclose(mp_dist, 1. - mp_sim) 205 | #===================================================================== 206 | msg = "MP GammaI similarity differs from GammaI distance. "\ 207 | + "Whether the currently implemented similarity function makes "\ 208 | + "any sense is yet to be investigated." 209 | return self.skipTest(msg) 210 | #return self.assertTrue(dist_allclose_one_minus_sim) 211 | 212 | def test_mp_gammai_sparse_equal_dense(self): 213 | self.setUpMod('rnd') 214 | sim_dense = 1. - self.dist 215 | sim_sparse = csr_matrix(sim_dense) 216 | mp_dense = mutual_proximity_gammai(sim_dense, 'similarity') 217 | mp_sparse = mutual_proximity_gammai(sim_sparse, 'similarity') 218 | dense_allclose_sparse = np.allclose(mp_dense, mp_sparse.toarray()) 219 | return self.assertTrue(dense_allclose_sparse) 220 | 221 | if __name__ == "__main__": 222 | unittest.main() 223 | -------------------------------------------------------------------------------- /tests/sharednn_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | This file is part of the HUB TOOLBOX available at 5 | https://github.com/OFAI/hub-toolbox-python3/ 6 | The HUB TOOLBOX is licensed under the terms of the GNU GPLv3. 7 | 8 | (c) 2016-2018, Roman Feldbauer 9 | Austrian Research Institute for Artificial Intelligence (OFAI) 10 | Contact: 11 | """ 12 | import unittest 13 | import numpy as np 14 | from scipy.spatial.distance import squareform 15 | from hub_toolbox.distances import euclidean_distance 16 | from hub_toolbox.shared_neighbors import \ 17 | shared_nearest_neighbors, snn_sample, simhubIN 18 | 19 | class TestSharedNN(unittest.TestCase): 20 | 21 | def setUpMod(self, mode='rnd'): 22 | np.random.seed(626) 23 | if mode == 'rnd': 24 | points = 200 25 | dim = 500 26 | self.vector = 99. * (np.random.rand(points, dim) - 0.5) 27 | self.label = np.random.randint(0, 5, points) 28 | self.dist = euclidean_distance(self.vector) 29 | #self.dist /= (self.dist.max() + 1e-12) 30 | elif mode == 'toy': 31 | # SNN (k=2) ground truth calculated by hand for this toy example 32 | self.dist = squareform([.2, .1, .8, .4, .3, .5, .7, 1., .6, .9]) 33 | self.snn_dist_truth = squareform([.5, .5, .5, .5, .5, 34 | .5, 0., 0., .5, .5]) 35 | self.vector = None 36 | self.label = None 37 | 38 | def test_snn_matrix_basic_requirements(self): 39 | """Test that matrix is symmetric, diag==0, and in range [0, 1]""" 40 | self.setUpMod('rnd') 41 | snn_dist = shared_nearest_neighbors(self.dist) 42 | np.testing.assert_equal(snn_dist.diagonal(), 0.) 
# self dist 43 | np.testing.assert_array_less(snn_dist, 1+1e-14) # max==1 44 | np.testing.assert_array_less(-snn_dist, 0+1e-14) # min==0 45 | np.testing.assert_array_equal(snn_dist, snn_dist.T) # symmetry 46 | return 47 | 48 | def test_snn(self): 49 | """Test correctness of SNN in toy example (hand-calculated)""" 50 | self.setUpMod('toy') 51 | snn_dist = shared_nearest_neighbors(self.dist, k=2) 52 | return np.testing.assert_array_equal(self.snn_dist_truth, snn_dist) 53 | 54 | def test_snn_dist_equals_sim(self): 55 | """Test that SNN results are equivalent using distances or simil.""" 56 | self.setUpMod('rnd') 57 | snn_dist = shared_nearest_neighbors(self.dist, metric='distance') 58 | snn_sim = shared_nearest_neighbors(1. - self.dist, metric='similarity') 59 | return np.testing.assert_array_almost_equal(snn_sim, 1.-snn_dist, 12) 60 | 61 | def test_snn_parallel(self): 62 | self.setUpMod('rnd') 63 | snn_seq = shared_nearest_neighbors(self.dist, n_jobs=1) 64 | snn_par = shared_nearest_neighbors(self.dist, n_jobs=4) 65 | return np.testing.assert_array_almost_equal(snn_seq, snn_par, 14) 66 | 67 | def test_snn_sample_parallel(self): 68 | self.setUpMod('rnd') 69 | train_ind = np.arange(self.label.size//2) 70 | test_ind = np.arange(self.label.size//2, self.label.size) 71 | D_sample = self.dist[:, train_ind] 72 | snn_seq = snn_sample( 73 | D_sample, train_ind=train_ind, test_ind=test_ind, n_jobs=1) 74 | snn_par = snn_sample( 75 | D_sample, train_ind=train_ind, test_ind=test_ind, n_jobs=4) 76 | return np.testing.assert_array_almost_equal(snn_seq, snn_par, 14) 77 | 78 | def test_simhubIN(self): 79 | return self.skipTest("simhubIN requires a test for correctness!") 80 | 81 | def test_simhubIN_parallel(self): 82 | self.setUpMod('rnd') 83 | train_ind = np.arange(self.label.size//2) 84 | test_ind = np.arange(self.label.size//2, self.label.size) 85 | D_sample = self.dist[:, train_ind] 86 | shi_seq = simhubIN( 87 | D_sample, train_ind=train_ind, test_ind=test_ind, n_jobs=1) 88 | shi_par = simhubIN( 89 | D_sample, train_ind=train_ind, test_ind=test_ind, n_jobs=4) 90 | return np.testing.assert_array_almost_equal(shi_seq, shi_par, 14) 91 | 92 | if __name__ == "__main__": 93 | unittest.main() 94 | --------------------------------------------------------------------------------