15 |
16 |
17 |
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/.nojekyll:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/.pep8speaks.yml:
--------------------------------------------------------------------------------
1 | # File : .pep8speaks.yml
2 |
3 | message: # Customize the comment made by the bot
4 | opened: # Messages when a new PR is submitted
5 |         header: "Hello @{name}, Thank you for submitting the Pull Request!"
6 | # The keyword {name} is converted into the author's username
7 | footer: ""
8 | # The messages can be written as they would over GitHub
9 | updated: # Messages when new commits are added to the PR
10 |         header: "Hello @{name}, Thank you for updating!"
11 |         footer: ""  # Why comment the link to the style guide every time? :)
12 |     no_errors: "Cheers! There are no PEP8 issues in this Pull Request. :beers:"
13 |
14 | scanner:
15 | diff_only: False # If True, errors caused by only the patch are shown
16 |
17 | pycodestyle:
18 | max-line-length: 100 # Default is 79 in PEP8
19 | ignore: # Errors and warnings to ignore
20 | - W391
21 | - E203
22 |
23 | only_mention_files_with_errors: True # If False, a separate status comment for each file is made.
24 | descending_issues_order: False # If True, PEP8 issues in message will be displayed in descending order of line numbers in the file
25 |
--------------------------------------------------------------------------------
/.readthedocs.yaml:
--------------------------------------------------------------------------------
1 | version: 2
2 |
3 | build:
4 | os: ubuntu-22.04
5 |
6 | tools:
7 | python: "3.12"
8 |
9 |
10 | python:
11 | install:
12 | - requirements: docs/docs_requirements.txt
13 | - method: pip
14 | path: .
15 |
16 | sphinx:
17 |
18 | configuration: docs/conf.py
19 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: python
2 |
3 | python:
4 | - "3.6"
5 | - "3.7"
6 | - "3.8"
7 | - "3.9"
8 |
9 | cache:
10 | apt: true
11 |   # We use separate cache directories
12 | # to work around a Travis bug with multi-platform cache
13 | directories:
14 | - $HOME/.cache/pip
15 | - $HOME/download
16 | env:
17 | global:
18 | # Directory where tests are run from
19 | - TEST_DIR=/tmp/test_dir/
20 | - MODULE=hdbscan
21 | matrix:
22 | - DISTRIB="conda"
23 |
24 | install:
25 | - wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh;
26 | - bash miniconda.sh -b -p $HOME/miniconda
27 | - source "$HOME/miniconda/etc/profile.d/conda.sh"
28 | - hash -r
29 | - conda config --set always_yes yes --set changeps1 no
30 | - conda update -q conda
31 | - conda info -a
32 | - conda create -q -n testenv python=$TRAVIS_PYTHON_VERSION numpy scipy matplotlib pandas networkx scikit-learn pytest pytest-cov codecov coverage cython
33 | - conda activate testenv
34 | - python -c "import numpy; print('numpy %s' % numpy.__version__)"
35 | - python -c "import scipy; print('scipy %s' % scipy.__version__)"
36 | - python setup.py develop
37 |
38 | script:
39 | - conda activate testenv
40 | - pytest --cov=./
41 |
42 | after_success:
43 | - bash <(curl -s https://codecov.io/bash)
44 |
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Contributor Covenant Code of Conduct
2 |
3 | ## Our Pledge
4 |
5 | In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, gender identity and expression, level of experience, nationality, personal appearance, race, religion, or sexual identity and orientation.
6 |
7 | ## Our Standards
8 |
9 | Examples of behavior that contributes to creating a positive environment include:
10 |
11 | * Using welcoming and inclusive language
12 | * Being respectful of differing viewpoints and experiences
13 | * Gracefully accepting constructive criticism
14 | * Focusing on what is best for the community
15 | * Showing empathy towards other community members
16 |
17 | Examples of unacceptable behavior by participants include:
18 |
19 | * The use of sexualized language or imagery and unwelcome sexual attention or advances
20 | * Trolling, insulting/derogatory comments, and personal or political attacks
21 | * Public or private harassment
22 | * Publishing others' private information, such as a physical or electronic address, without explicit permission
23 | * Other conduct which could reasonably be considered inappropriate in a professional setting
24 |
25 | ## Our Responsibilities
26 |
27 | Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior.
28 |
29 | Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful.
30 |
31 | ## Scope
32 |
33 | This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers.
34 |
35 | ## Enforcement
36 |
37 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at leland.mcinnes@gmail.com. The project team will review and investigate all complaints, and will respond in a way that it deems appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately.
38 |
39 | Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership.
40 |
41 | ## Attribution
42 |
43 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at [http://contributor-covenant.org/version/1/4][version]
44 |
45 | [homepage]: http://contributor-covenant.org
46 | [version]: http://contributor-covenant.org/version/1/4/
47 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2015, Leland McInnes
2 | All rights reserved.
3 |
4 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
5 |
6 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
7 |
8 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
9 |
10 | 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
11 |
12 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include *.rst *.txt pyproject.toml LICENSE
2 | recursive-include hdbscan *.py *.pyx *.pxd *.c
3 | recursive-include notebooks *.ipynb *.npy *.svg
4 | recursive-include examples *.py
5 |
--------------------------------------------------------------------------------
/azure-pipelines.yml:
--------------------------------------------------------------------------------
1 | # Trigger a build when there is a push to the master branch or a tag starting with release-
2 | trigger:
3 | branches:
4 | include:
5 | - master
6 | tags:
7 | include:
8 | - release-*
9 |
10 | # Trigger a build when there is a pull request to the master branch
11 | # Ignore PRs that are just updating the docs
12 | pr:
13 | branches:
14 | include:
15 | - master
16 | exclude:
17 | - doc/*
18 | - README.rst
19 |
20 | variables:
21 | triggeredByPullRequest: $[eq(variables['Build.Reason'], 'PullRequest')]
22 |
23 | stages:
24 | - stage: RunAllTests
25 | displayName: Run test suite
26 | jobs:
27 | - job: run_platform_tests
28 | strategy:
29 | matrix:
30 | mac_py39:
31 | imageName: 'macOS-latest'
32 | python.version: '3.9'
33 | linux_py39:
34 | imageName: 'ubuntu-latest'
35 | python.version: '3.9'
36 | windows_py39:
37 | imageName: 'windows-latest'
38 | python.version: '3.9'
39 | mac_py310:
40 | imageName: 'macOS-latest'
41 | python.version: '3.10'
42 | linux_py310:
43 | imageName: 'ubuntu-latest'
44 | python.version: '3.10'
45 | windows_py310:
46 | imageName: 'windows-latest'
47 | python.version: '3.10'
48 | mac_py311:
49 | imageName: 'macOS-latest'
50 | python.version: '3.11'
51 | linux_py311:
52 | imageName: 'ubuntu-latest'
53 | python.version: '3.11'
54 | windows_py311:
55 | imageName: 'windows-latest'
56 | python.version: '3.11'
57 | mac_py312:
58 | imageName: 'macOS-latest'
59 | python.version: '3.12'
60 | linux_py312:
61 | imageName: 'ubuntu-latest'
62 | python.version: '3.12'
63 | windows_py312:
64 | imageName: 'windows-latest'
65 | python.version: '3.12'
66 | pool:
67 | vmImage: $(imageName)
68 |
69 | steps:
70 | - task: UsePythonVersion@0
71 | inputs:
72 | versionSpec: '$(python.version)'
73 | displayName: 'Use Python $(python.version)'
74 |
75 | - script: |
76 | python -m pip install --upgrade pip
77 | pip install -r requirements.txt
78 | displayName: 'Install dependencies'
79 |
80 | - script: |
81 | pip install -e .
82 | pip install pytest pytest-azurepipelines
83 | pip install pytest-cov
84 | pip install coveralls
85 | displayName: 'Install package'
86 |
87 | - script: |
88 | pytest hdbscan/tests --show-capture=no -v --disable-warnings --junitxml=junit/test-results.xml --cov=hdbscan/ --cov-report=xml --cov-report=html
89 | displayName: 'Run tests'
90 |
91 | - bash: |
92 | coveralls
93 | displayName: 'Publish to coveralls'
94 | condition: and(succeeded(), eq(variables.triggeredByPullRequest, false)) # Don't run this for PRs because they can't access pipeline secrets
95 | env:
96 | COVERALLS_REPO_TOKEN: $(COVERALLS_TOKEN)
97 |
98 | - task: PublishTestResults@2
99 | inputs:
100 | testResultsFiles: '$(System.DefaultWorkingDirectory)/**/coverage.xml'
101 | testRunTitle: '$(Agent.OS) - $(Build.BuildNumber)[$(Agent.JobName)] - Python $(python.version)'
102 | condition: succeededOrFailed()
103 |
104 | - stage: BuildPublishArtifact
105 | dependsOn: RunAllTests
106 | condition: and(succeeded(), startsWith(variables['Build.SourceBranch'], 'refs/tags/release-'), eq(variables.triggeredByPullRequest, false))
107 | jobs:
108 | # Need to use manylinux as ubuntu-latest is too new
109 | - job: Manylinux2014Build
110 | pool:
111 | vmImage: 'ubuntu-latest'
112 | container: quay.io/pypa/manylinux2014_x86_64:latest
113 | strategy:
114 | matrix:
115 | linux_py38:
116 | python.version: 'cp38-cp38'
117 | linux_py39:
118 | python.version: 'cp39-cp39'
119 | linux_py310:
120 | python.version: 'cp310-cp310'
121 | linux_py311:
122 | python.version: 'cp311-cp311'
123 | linux_py312:
124 | python.version: 'cp312-cp312'
125 | steps:
126 | - script: |
127 | "${PYBIN}/python" -m pip install --upgrade pip
128 | "${PYBIN}/python" -m pip install wheel
129 | "${PYBIN}/python" -m pip install -r requirements.txt
130 | "${PYBIN}/python" -m pip install cython
131 | displayName: 'Install dependencies and build tools'
132 | env:
133 | PYBIN: /opt/python/$(python.version)/bin
134 | - script: |
135 | "${PYBIN}/python" setup.py sdist bdist_wheel
136 | displayName: 'Build wheels'
137 | env:
138 | PYBIN: /opt/python/$(python.version)/bin
139 | - bash: |
140 | auditwheel repair dist/*linux_x86_64.whl --plat manylinux2014_x86_64 -w wheelhouse-manylinux/
141 | displayName: 'Audit wheels'
142 |
143 | - task: DownloadSecureFile@1
144 | name: PYPIRC_CONFIG
145 | displayName: 'Download pypirc'
146 | inputs:
147 | secureFile: 'pypirc'
148 |
149 | - bash: |
150 | "${PYBIN}/python" -m pip install twine
151 | "${PYBIN}/python" -m twine upload -r pypi --config-file $(PYPIRC_CONFIG.secureFilePath) --skip-existing --disable-progress-bar wheelhouse-manylinux/*
152 | "${PYBIN}/python" -m twine upload -r pypi --config-file $(PYPIRC_CONFIG.secureFilePath) --skip-existing --disable-progress-bar dist/*.tar.gz
153 |       displayName: 'Publish wheels to PyPI'
154 | env:
155 | PYBIN: /opt/python/$(python.version)/bin
156 |
157 | - job: BuildWindowsAndMacOSArtifacts
158 | displayName: Build source dists and wheels for windows and macOS
159 | strategy:
160 | matrix:
161 | mac_py38:
162 | imageName: 'macOS-latest'
163 | python.version: '3.8'
164 | windows_py38:
165 | imageName: 'windows-latest'
166 | python.version: '3.8'
167 | mac_py39:
168 | imageName: 'macOS-latest'
169 | python.version: '3.9'
170 | windows_py39:
171 | imageName: 'windows-latest'
172 | python.version: '3.9'
173 | mac_py310:
174 | imageName: 'macOS-latest'
175 | python.version: '3.10'
176 | windows_py310:
177 | imageName: 'windows-latest'
178 | python.version: '3.10'
179 | mac_py311:
180 | imageName: 'macOS-latest'
181 | python.version: '3.11'
182 | windows_py311:
183 | imageName: 'windows-latest'
184 | python.version: '3.11'
185 | mac_py312:
186 | imageName: 'macOS-latest'
187 | python.version: '3.12'
188 | windows_py312:
189 | imageName: 'windows-latest'
190 | python.version: '3.12'
191 | pool:
192 | vmImage: $(imageName)
193 |
194 | steps:
195 | - task: UsePythonVersion@0
196 | inputs:
197 | versionSpec: '$(python.version)'
198 | displayName: 'Use Python $(python.version)'
199 |
200 | - script: |
201 | python -m pip install --upgrade pip
202 | pip install wheel
203 | pip install -r requirements.txt
204 | pip install cython
205 | pip install setuptools
206 | displayName: 'Install dependencies'
207 |
208 | - script: |
209 | pip install -e .
210 | displayName: 'Install package locally'
211 |
212 | - bash: |
213 | python setup.py sdist bdist_wheel
214 | displayName: 'Build package'
215 |
216 | - bash: |
217 | export PACKAGE_VERSION="$(python setup.py --version)"
218 | echo "Package Version: ${PACKAGE_VERSION}"
219 | echo "##vso[task.setvariable variable=packageVersionFormatted;]release-${PACKAGE_VERSION}"
220 | displayName: 'Get package version'
221 |
222 | - script: |
223 | echo "Version in git tag $(Build.SourceBranchName) does not match version derived from setup.py $(packageVersionFormatted)"
224 | exit 1
225 |       displayName: Raise error if version doesn't match tag
226 | condition: and(succeeded(), ne(variables['Build.SourceBranchName'], variables['packageVersionFormatted']))
227 |
228 | - task: DownloadSecureFile@1
229 | name: PYPIRC_CONFIG
230 | displayName: 'Download pypirc'
231 | inputs:
232 | secureFile: 'pypirc'
233 |
234 | - script: |
235 | pip install twine
236 | twine upload -r pypi --config-file $(PYPIRC_CONFIG.secureFilePath) --skip-existing dist/*
237 | displayName: 'Upload to PyPI'
238 | condition: and(succeeded(), eq(variables['Build.SourceBranchName'], variables['packageVersionFormatted']))
239 |
--------------------------------------------------------------------------------
/ci_scripts/push_doc.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # This script is meant to be called in the "deploy" step defined in
3 | # circle.yml. See https://circleci.com/docs/ for more details.
4 | # The behavior of the script is controlled by environment variables defined
5 | # in the circle.yml in the top level folder of the project.
6 |
7 | MSG="Pushing the docs for branch: $CIRCLE_BRANCH, commit $CIRCLE_SHA1"
8 |
9 | cd $HOME
10 | # Copy the build docs to a temporary folder
11 | rm -rf tmp
12 | mkdir tmp
13 | cp -R $HOME/$DOC_REPO/doc/_build/html/* ./tmp/
14 |
15 | # Clone the docs repo if it isn't already there
16 | if [ ! -d $DOC_REPO ];
17 | then git clone "git@github.com:$USERNAME/"$DOC_REPO".git";
18 | fi
19 |
20 | cd $DOC_REPO
21 | git branch gh-pages
22 | git checkout -f gh-pages
23 | git reset --hard origin/gh-pages
24 | git clean -dfx
25 |
26 | for name in $(ls -A $HOME/$DOC_REPO); do
27 | case $name in
28 | .nojekyll) # So that github does not build this as a Jekyll website.
29 | ;;
30 |         circle.yml) # Config so that CircleCI does not try to build the gh-pages branch.
31 | ;;
32 | *)
33 | git rm -rf $name
34 | ;;
35 | esac
36 | done
37 |
38 | # Copy the new build docs
39 | mkdir $DOC_URL
40 | cp -R $HOME/tmp/* ./$DOC_URL/
41 |
42 | git config --global user.email $EMAIL
43 | git config --global user.name $USERNAME
44 | git add -f ./$DOC_URL/
45 | git commit -m "$MSG"
46 | git push -f origin gh-pages
47 | if [ $? -ne 0 ]; then
48 | echo "Pushing docs failed"
49 | echo
50 | exit 1
51 | fi
52 |
53 | echo $MSG
54 |
--------------------------------------------------------------------------------
/circle.yml:
--------------------------------------------------------------------------------
1 | machine:
2 | environment:
3 | # The github organization or username of the repository which hosts the
4 | # project and documentation.
5 | USERNAME: "scikit-learn-contrib"
6 |
7 | # The repository where the documentation will be hosted
8 | DOC_REPO: "hdbscan"
9 |
10 | # The base URL for the Github page where the documentation will be hosted
11 | DOC_URL: ""
12 |
13 |   # The email to be used for commits to the GitHub Pages repo
14 | EMAIL: "leland.mcinnes+ci@gmail.com"
15 |
16 | dependencies:
17 |
18 | # Various dependencies
19 | pre:
20 | - sudo -E apt-get -yq remove texlive-binaries --purge
21 | - sudo apt-get update
22 | - sudo apt-get install libatlas-dev libatlas3gf-base
23 | - sudo apt-get install build-essential python-dev python-setuptools
24 | # install numpy first as it is a compile time dependency for other packages
25 | - pip install --upgrade numpy
26 | - pip install --upgrade scipy matplotlib setuptools nose coverage sphinx pillow sphinx-gallery sphinx_rtd_theme
27 |     # Install the packages required for the `make -C doc check` command to work.
28 | - sudo -E apt-get -yq update
29 | - sudo -E apt-get -yq --no-install-suggests --no-install-recommends --force-yes install dvipng texlive-latex-base texlive-latex-extra
30 | - pip install --upgrade cython numpydoc
31 | - pip install --upgrade scikit-learn
32 |
33 | # The --user is needed to let sphinx see the source and the binaries
34 |   # pipefail is needed to propagate the exit code
35 | override:
36 | - python setup.py clean
37 | - python setup.py develop
38 | - set -o pipefail && cd doc && make html 2>&1 | tee ~/log.txt
39 | test:
40 |   # Grep for errors in the documentation build log
41 | override:
42 | - cat ~/log.txt && if grep -q "Traceback (most recent call last):" ~/log.txt; then false; else true; fi
43 | deployment:
44 | push:
45 | branch: master
46 | commands:
47 | - bash ci_scripts/push_doc.sh
48 | general:
49 |   # Expose the built docs as build artifacts
50 | artifacts:
51 | - "doc/_build/html"
52 | - "~/log.txt"
53 |   # Restrict the build to the master branch only
54 | branches:
55 | ignore:
56 | - gh-pages
57 |
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line.
5 | SPHINXOPTS =
6 | SPHINXBUILD = sphinx-build
7 | PAPER =
8 | BUILDDIR = _build
9 |
10 | # User-friendly check for sphinx-build
11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1)
12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/)
13 | endif
14 |
15 | # Internal variables.
16 | PAPEROPT_a4 = -D latex_paper_size=a4
17 | PAPEROPT_letter = -D latex_paper_size=letter
18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
19 | # the i18n builder cannot share the environment and doctrees with the others
20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
21 |
22 | .PHONY: help
23 | help:
24 | 	@echo "Please use \`make <target>' where <target> is one of"
25 | @echo " html to make standalone HTML files"
26 | @echo " dirhtml to make HTML files named index.html in directories"
27 | @echo " singlehtml to make a single large HTML file"
28 | @echo " pickle to make pickle files"
29 | @echo " json to make JSON files"
30 | @echo " htmlhelp to make HTML files and a HTML help project"
31 | @echo " qthelp to make HTML files and a qthelp project"
32 | @echo " applehelp to make an Apple Help Book"
33 | @echo " devhelp to make HTML files and a Devhelp project"
34 | @echo " epub to make an epub"
35 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
36 | @echo " latexpdf to make LaTeX files and run them through pdflatex"
37 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
38 | @echo " text to make text files"
39 | @echo " man to make manual pages"
40 | @echo " texinfo to make Texinfo files"
41 | @echo " info to make Texinfo files and run them through makeinfo"
42 | @echo " gettext to make PO message catalogs"
43 | @echo " changes to make an overview of all changed/added/deprecated items"
44 | @echo " xml to make Docutils-native XML files"
45 | @echo " pseudoxml to make pseudoxml-XML files for display purposes"
46 | @echo " linkcheck to check all external links for integrity"
47 | @echo " doctest to run all doctests embedded in the documentation (if enabled)"
48 | @echo " coverage to run coverage check of the documentation (if enabled)"
49 |
50 | .PHONY: clean
51 | clean:
52 | rm -rf $(BUILDDIR)/*
53 |
54 | .PHONY: html
55 | html:
56 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
57 | @echo
58 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
59 |
60 | .PHONY: dirhtml
61 | dirhtml:
62 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
63 | @echo
64 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
65 |
66 | .PHONY: singlehtml
67 | singlehtml:
68 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
69 | @echo
70 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
71 |
72 | .PHONY: pickle
73 | pickle:
74 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
75 | @echo
76 | @echo "Build finished; now you can process the pickle files."
77 |
78 | .PHONY: json
79 | json:
80 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
81 | @echo
82 | @echo "Build finished; now you can process the JSON files."
83 |
84 | .PHONY: htmlhelp
85 | htmlhelp:
86 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
87 | @echo
88 | @echo "Build finished; now you can run HTML Help Workshop with the" \
89 | ".hhp project file in $(BUILDDIR)/htmlhelp."
90 |
91 | .PHONY: qthelp
92 | qthelp:
93 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
94 | @echo
95 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \
96 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
97 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/hdbscan.qhcp"
98 | @echo "To view the help file:"
99 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/hdbscan.qhc"
100 |
101 | .PHONY: applehelp
102 | applehelp:
103 | $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp
104 | @echo
105 | @echo "Build finished. The help book is in $(BUILDDIR)/applehelp."
106 | @echo "N.B. You won't be able to view it unless you put it in" \
107 | "~/Library/Documentation/Help or install it in your application" \
108 | "bundle."
109 |
110 | .PHONY: devhelp
111 | devhelp:
112 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
113 | @echo
114 | @echo "Build finished."
115 | @echo "To view the help file:"
116 | @echo "# mkdir -p $$HOME/.local/share/devhelp/hdbscan"
117 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/hdbscan"
118 | @echo "# devhelp"
119 |
120 | .PHONY: epub
121 | epub:
122 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
123 | @echo
124 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub."
125 |
126 | .PHONY: latex
127 | latex:
128 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
129 | @echo
130 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
131 | @echo "Run \`make' in that directory to run these through (pdf)latex" \
132 | "(use \`make latexpdf' here to do that automatically)."
133 |
134 | .PHONY: latexpdf
135 | latexpdf:
136 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
137 | @echo "Running LaTeX files through pdflatex..."
138 | $(MAKE) -C $(BUILDDIR)/latex all-pdf
139 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
140 |
141 | .PHONY: latexpdfja
142 | latexpdfja:
143 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
144 | @echo "Running LaTeX files through platex and dvipdfmx..."
145 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja
146 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
147 |
148 | .PHONY: text
149 | text:
150 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
151 | @echo
152 | @echo "Build finished. The text files are in $(BUILDDIR)/text."
153 |
154 | .PHONY: man
155 | man:
156 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
157 | @echo
158 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man."
159 |
160 | .PHONY: texinfo
161 | texinfo:
162 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
163 | @echo
164 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
165 | @echo "Run \`make' in that directory to run these through makeinfo" \
166 | "(use \`make info' here to do that automatically)."
167 |
168 | .PHONY: info
169 | info:
170 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
171 | @echo "Running Texinfo files through makeinfo..."
172 | make -C $(BUILDDIR)/texinfo info
173 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
174 |
175 | .PHONY: gettext
176 | gettext:
177 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
178 | @echo
179 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
180 |
181 | .PHONY: changes
182 | changes:
183 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
184 | @echo
185 | @echo "The overview file is in $(BUILDDIR)/changes."
186 |
187 | .PHONY: linkcheck
188 | linkcheck:
189 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
190 | @echo
191 | @echo "Link check complete; look for any errors in the above output " \
192 | "or in $(BUILDDIR)/linkcheck/output.txt."
193 |
194 | .PHONY: doctest
195 | doctest:
196 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
197 | @echo "Testing of doctests in the sources finished, look at the " \
198 | "results in $(BUILDDIR)/doctest/output.txt."
199 |
200 | .PHONY: coverage
201 | coverage:
202 | $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage
203 | @echo "Testing of coverage in the sources finished, look at the " \
204 | "results in $(BUILDDIR)/coverage/python.txt."
205 |
206 | .PHONY: xml
207 | xml:
208 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml
209 | @echo
210 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml."
211 |
212 | .PHONY: pseudoxml
213 | pseudoxml:
214 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml
215 | @echo
216 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml."
217 |
--------------------------------------------------------------------------------
/docs/advanced_hdbscan.rst:
--------------------------------------------------------------------------------
1 |
2 | Getting More Information About a Clustering
3 | ===========================================
4 |
5 | Once you have the basics of clustering sorted you may want to dig a
6 | little deeper than just the cluster labels returned to you. Fortunately, the hdbscan library provides you with the facilities to do this. During
7 | processing HDBSCAN\* builds a hierarchy of potential clusters, from
8 | which it extracts the flat clustering returned. It can be informative to
9 | look at that hierarchy, and potentially make use of the extra
10 | information contained therein.
11 |
12 | Suppose we have a dataset for clustering. It is a binary file in NumPy format and it can be found at https://github.com/lmcinnes/hdbscan/blob/master/notebooks/clusterable_data.npy.
13 |
14 | .. code:: python
15 |
16 | import hdbscan
17 | import numpy as np
18 | import matplotlib.pyplot as plt
19 | import seaborn as sns
20 | %matplotlib inline
21 |
22 | .. code:: python
23 |
24 |     data = np.load('clusterable_data.npy')
28 |
29 | .. code:: python
30 |
31 | data.shape
32 |
33 | .. parsed-literal::
34 |
35 | (2309, 2)
36 |
37 | .. code:: python
38 |
39 | data
40 |
41 | .. parsed-literal::
42 |
43 | array([[-0.12153499, -0.22876337],
44 | [-0.22093687, -0.25251088],
45 | [ 0.1259037 , -0.27314321],
46 | ...,
47 | [ 0.50243143, -0.3002958 ],
48 | [ 0.53822256, 0.19412199],
49 | [-0.08688887, -0.2092721 ]])
50 |
51 |
52 | .. code:: python
53 |
54 | plt.scatter(*data.T, s=50, linewidth=0, c='b', alpha=0.25)
55 |
60 | .. image:: images/advanced_hdbscan_3_1.png
61 |
62 |
63 | We can cluster the data as normal, and visualize the labels with
64 | different colors (and even the cluster membership strengths as levels of
65 | saturation).
66 |
67 | .. code:: python
68 |
69 | clusterer = hdbscan.HDBSCAN(min_cluster_size=15).fit(data)
70 | color_palette = sns.color_palette('deep', 8)
71 | cluster_colors = [color_palette[x] if x >= 0
72 | else (0.5, 0.5, 0.5)
73 | for x in clusterer.labels_]
74 | cluster_member_colors = [sns.desaturate(x, p) for x, p in
75 | zip(cluster_colors, clusterer.probabilities_)]
76 | plt.scatter(*data.T, s=50, linewidth=0, c=cluster_member_colors, alpha=0.25)
77 |
78 |
79 | .. image:: images/advanced_hdbscan_5_1.png
80 |
81 | Condensed Trees
82 | ---------------
83 |
84 | The question now is what does the cluster hierarchy look like -- which
85 | clusters are near each other, or could perhaps be merged, and which are
86 | far apart. We can access the basic hierarchy via the :py:attr:`~hdbscan.HDBSCAN.condensed_tree_`
87 | attribute of the clusterer object.
88 |
89 | .. code:: python
90 |
91 | clusterer.condensed_tree_
92 |
93 |
94 |
95 |
101 |
102 | This merely gives us a :class:`~hdbscan.plots.CondensedTree` object. If we want to visualize the
103 | hierarchy we can call the :py:meth:`~hdbscan.plots.CondensedTree.plot` method:
104 |
105 | .. code:: python
106 |
107 | clusterer.condensed_tree_.plot()
108 |
109 |
110 | .. image:: images/advanced_hdbscan_9_1.png
111 |
112 |
113 | We can now see the hierarchy as a dendrogram, the width (and color) of
114 | each branch representing the number of points in the cluster at that
115 | level. If we wish to know which branches were selected by the HDBSCAN\*
116 | algorithm we can pass ``select_clusters=True``. You can even pass a
117 | selection palette to color the selections according to the cluster
118 | labeling.
119 |
120 | .. code:: python
121 |
122 | clusterer.condensed_tree_.plot(select_clusters=True,
123 | selection_palette=sns.color_palette('deep', 8))
124 |
125 |
126 | .. image:: images/advanced_hdbscan_11_1.png
127 |
128 |
129 | From this, we can see, for example, that the yellow cluster at the
130 | center of the plot forms early (breaking off from the pale blue and
131 | purple clusters) and persists for a long time. By comparison the green
132 | cluster, which also forms early, quickly breaks apart and then
133 | vanishes altogether (shattering into clusters all smaller than the
134 | ``min_cluster_size`` of 15).
135 |
136 | You can also see that the pale blue cluster breaks apart into several
137 | subclusters that in turn persist for quite some time -- so there is some
138 | interesting substructure to the pale blue cluster that is not present,
139 | for example, in the dark blue cluster.
140 |
141 | Even a simple visual analysis of the condensed tree can thus tell you
142 | a lot more about the structure of your data. This is not all we can do
143 | with condensed trees, however. For larger and more complex datasets the
144 | tree itself may be very complex, and it may be desirable to run more
145 | interesting analytics over the tree itself. This can be achieved via
146 | several converter methods: :py:meth:`~hdbscan.plots.CondensedTree.to_networkx`, :py:meth:`~hdbscan.plots.CondensedTree.to_pandas`, and
147 | :py:meth:`~hdbscan.plots.CondensedTree.to_numpy`.
148 |
149 | First we'll consider :py:meth:`~hdbscan.plots.CondensedTree.to_networkx`
150 |
151 | .. code:: python
152 |
153 | clusterer.condensed_tree_.to_networkx()
154 |
155 |
156 |
157 |
163 |
164 | As you can see we get a NetworkX directed graph, on which we can then use
165 | all the regular NetworkX tools and analytics. The graph is richer
166 | than the visual plot above may lead you to believe, however:
167 |
168 | .. code:: python
169 |
170 | g = clusterer.condensed_tree_.to_networkx()
171 | g.number_of_nodes()
172 |
173 |
174 |
175 |
176 | .. parsed-literal::
177 |
178 | 2338
179 |
180 |
181 |
182 | The graph actually contains nodes for all the points falling out of
183 | clusters as well as the clusters themselves. Each node has an associated
184 | ``size`` attribute and each edge has a ``weight`` of the lambda value
185 | at which that edge forms. This allows for much more interesting
186 | analyses.
187 |
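For example (a small sketch -- the node and edge ids here are taken from
the DataFrame output below, and the attribute access assumes a NetworkX
2.x style API), you can read those attributes straight off the graph:

.. code:: python

    g = clusterer.condensed_tree_.to_networkx()
    # lambda value at which point 2048 fell out of the root cluster (id 2309)
    g.edges[2309, 2048]['weight']
    # size of that child node -- individual data points have size 1
    g.nodes[2048]['size']
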
188 | Next, we have the :py:meth:`~hdbscan.plots.CondensedTree.to_pandas` method, which returns a pandas DataFrame
189 | where each row corresponds to an edge of the NetworkX graph:
190 |
191 | .. code:: python
192 |
193 | clusterer.condensed_tree_.to_pandas().head()
194 |
195 |
196 |
197 |
.. parsed-literal::

       parent  child  lambda_val  child_size
    0    2309   2048    5.016526           1
    1    2309   2006    5.076503           1
    2    2309   2024    5.279133           1
    3    2309   2050    5.347332           1
    4    2309   1992    5.381930           1

255 | Here the ``parent`` denotes the id of the parent cluster, the ``child``
256 | the id of the child cluster (or, if the child is a single data point
257 | rather than a cluster, the index in the dataset of that point), the
258 | ``lambda_val`` provides the lambda value at which the edge forms, and
259 | the ``child_size`` provides the number of points in the child cluster.
260 | As you can see the start of the DataFrame has singleton points falling
261 | out of the root cluster, with each ``child_size`` equal to 1.
262 |
263 | If you want just the clusters, rather than all the individual points
264 | as well, simply select the rows of the DataFrame with ``child_size``
265 | greater than 1.
266 |
267 | .. code:: python
268 |
269 | tree = clusterer.condensed_tree_.to_pandas()
270 | cluster_tree = tree[tree.child_size > 1]
271 |
272 |
273 |
274 | Finally we have the :py:meth:`~hdbscan.plots.CondensedTree.to_numpy` function, which returns a numpy record
275 | array:
276 |
277 | .. code:: python
278 |
279 | clusterer.condensed_tree_.to_numpy()
280 |
281 |
282 |
283 |
284 | .. parsed-literal::
285 |
286 | array([(2309, 2048, 5.016525967983049, 1),
287 | (2309, 2006, 5.076503128308643, 1),
288 | (2309, 2024, 5.279133057912248, 1), ...,
289 | (2318, 1105, 86.5507370650292, 1), (2318, 965, 86.5507370650292, 1),
290 | (2318, 954, 86.5507370650292, 1)],
291 |            dtype=[('parent', '<i8'), ('child', '<i8'), ('lambda_val', '<f8'), ('child_size', '<i8')])

Single Linkage Trees
--------------------

We have still more data at our disposal, however. The condensed tree is a
summary of the full single linkage tree computed by HDBSCAN\*, and that full
tree is available via the :py:attr:`~hdbscan.HDBSCAN.single_linkage_tree_`
attribute of the clusterer object, which returns a
:class:`~hdbscan.plots.SingleLinkageTree` object.

319 | Again we have an object which we can then query for relevant
320 | information. The most basic approach is the :py:meth:`~hdbscan.plots.SingleLinkageTree.plot` method, just like
321 | the condensed tree.
322 |
323 | .. code:: python
324 |
325 | clusterer.single_linkage_tree_.plot()
326 |
327 |
328 | .. image:: images/advanced_hdbscan_26_1.png
329 |
330 |
331 | As you can see we gain a lot from condensing the tree in terms of better
332 | presenting and summarising the data. There is a lot less to be gained
333 | from visual inspection of a plot like this (and it only gets worse for
334 | larger datasets). The plot function supports most of the same
335 | functionality as the dendrogram plotting from
336 | ``scipy.cluster.hierarchy``, so you can view various truncations of the
337 | tree if necessary. In practice, however, you are more likely to be
338 | interested in accessing the raw data for further analysis. Again we have
339 | :py:meth:`~hdbscan.plots.SingleLinkageTree.to_networkx`, :py:meth:`~hdbscan.plots.SingleLinkageTree.to_pandas` and :py:meth:`~hdbscan.plots.SingleLinkageTree.to_numpy`. This time the
340 | :py:meth:`~hdbscan.plots.SingleLinkageTree.to_networkx` provides a direct NetworkX version of what you see
341 | above. The NumPy and pandas results conform to the single linkage
342 | hierarchy format of ``scipy.cluster.hierarchy``, and can be passed to
343 | routines there if necessary.
344 |
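For instance (a minimal sketch, assuming ``scipy`` is installed; the cut
height 0.023 simply reuses the value from the example below), the NumPy
form can be handed directly to ``scipy.cluster.hierarchy``:

.. code:: python

    from scipy.cluster.hierarchy import fcluster

    linkage_matrix = clusterer.single_linkage_tree_.to_numpy()
    # cut the dendrogram at a fixed height, just as for scipy's own linkage output
    scipy_labels = fcluster(linkage_matrix, t=0.023, criterion='distance')

Note that ``fcluster`` labels start at 1 and have no notion of noise points,
unlike the :py:meth:`~hdbscan.plots.SingleLinkageTree.get_clusters` method
described next.
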
345 | If you wish to know what the clusters are at a given fixed level of the
346 | single linkage tree you can use the :py:meth:`~hdbscan.plots.SingleLinkageTree.get_clusters` method to extract
347 | a vector of cluster labels. The method takes a cut value of the level
348 | at which to cut the tree, and a ``minimum_cluster_size`` to determine
349 | noise points (any cluster smaller than the ``minimum_cluster_size``).
350 |
351 | .. code:: python
352 |
353 | clusterer.single_linkage_tree_.get_clusters(0.023, min_cluster_size=2)
354 |
355 |
356 |
357 | .. parsed-literal::
358 |
359 | array([ 0, -1, 0, ..., -1, -1, 0])
360 |
361 |
362 | In this way, it is possible to extract the DBSCAN clustering that would result
363 | for any given epsilon value, all from one run of hdbscan.
364 |
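As a quick sketch (the cut heights below are arbitrary choices for this
dataset), you can sweep a few epsilon values and compare the resulting flat
clusterings without re-running the clustering:

.. code:: python

    for epsilon in (0.01, 0.023, 0.05):
        labels = clusterer.single_linkage_tree_.get_clusters(epsilon, min_cluster_size=5)
        # number of clusters found, and number of points labelled as noise
        print(epsilon, labels.max() + 1, (labels == -1).sum())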
365 |
--------------------------------------------------------------------------------
/docs/api.rst:
--------------------------------------------------------------------------------
1 | API Reference
2 | =============
3 |
4 | Major classes are :class:`HDBSCAN` and :class:`RobustSingleLinkage`.
5 |
6 | HDBSCAN
7 | -------
8 |
9 | .. autoclass:: hdbscan.hdbscan_.HDBSCAN
10 | :members:
11 |
12 | RobustSingleLinkage
13 | -------------------
14 |
15 | .. autoclass:: hdbscan.robust_single_linkage_.RobustSingleLinkage
16 | :members:
17 |
18 |
19 | Utilities
20 | ---------
21 |
22 | Other useful classes are contained in the plots module, the validity module,
23 | and the prediction module.
24 |
25 | .. autoclass:: hdbscan.plots.CondensedTree
26 | :members:
27 |
28 | .. autoclass:: hdbscan.plots.SingleLinkageTree
29 | :members:
30 |
31 | .. autoclass:: hdbscan.plots.MinimumSpanningTree
32 | :members:
33 |
34 | .. automodule:: hdbscan.validity
35 | :members:
36 |
37 | .. automodule:: hdbscan.prediction
38 | :members:
39 |
40 |
41 | Branch detection
42 | ----------------
43 |
44 | The branches module contains classes for detecting branches within clusters.
45 |
46 | .. automodule:: hdbscan.branches
47 | :members: BranchDetector, detect_branches_in_clusters, approximate_predict_branch
48 |
49 | .. autoclass:: hdbscan.plots.ApproximationGraph
50 | :members:
51 |
--------------------------------------------------------------------------------
/docs/basic_hdbscan.rst:
--------------------------------------------------------------------------------
1 |
2 | Basic Usage of HDBSCAN\* for Clustering
3 | =======================================
4 |
5 | We have some data, and we want to cluster it. How exactly do we do that,
6 | and what do the results look like? If you are very familiar with sklearn
7 | and its API, particularly for clustering, then you can probably skip
8 | this tutorial -- ``hdbscan`` implements exactly this API, so you can use
9 | it just as you would any other sklearn clustering algorithm. If, on the
10 | other hand, you aren't that familiar with sklearn, fear not, and read
11 | on. Let's start with the simplest case first -- we have data in a nice
12 | tidy dataframe format.
13 |
14 | The Simple Case
15 | ---------------
16 |
17 | Let's generate some data with, say 2000 samples, and 10 features. We can
18 | put it in a dataframe for a nice clean table view of it.
19 |
20 | .. code:: python
21 |
22 | from sklearn.datasets import make_blobs
23 | import pandas as pd
24 |
25 | .. code:: python
26 |
27 | blobs, labels = make_blobs(n_samples=2000, n_features=10)
28 |
29 | .. code:: python
30 |
31 | pd.DataFrame(blobs).head()
32 |
33 |
.. parsed-literal::

              0         1         2          3          4         5         6         7          8         9
    0 -3.370804  8.487688  4.631243 -10.181475   9.146487 -8.070935 -1.612017 -2.418106  -8.975390 -1.769952
    1 -4.092931  8.409841  3.362516  -9.748945   9.556615 -9.240307 -2.038291 -3.129068  -7.109673 -0.993827
    2 -4.604753  9.616391  4.631508 -11.166361  10.888212 -8.427564 -3.929517 -4.563951  -8.886373 -1.995063
    3 -6.889866 -7.801482 -6.974958  -8.570025   5.438101 -5.097457 -4.941206 -5.926394 -10.145152  0.219269
    4  5.339728  2.791309  0.611464  -2.929875  -7.694973  7.776050 -1.218101  0.408141  -4.563975 -1.309128

125 | So now we need to import the hdbscan library.
126 |
127 | .. code:: python
128 |
129 | import hdbscan
130 |
131 | Now, to cluster we need to generate a clustering object.
132 |
133 | .. code:: python
134 |
135 | clusterer = hdbscan.HDBSCAN()
136 |
137 | We can then use this clustering object and fit it to the data we have.
138 | This will return the clusterer object back to you -- just in case you
139 | want to do some method chaining.
140 |
141 | .. code:: python
142 |
143 | clusterer.fit(blobs)
144 |
145 |
146 | .. parsed-literal::
147 |
148 | HDBSCAN(algorithm='best', alpha=1.0, approx_min_span_tree=True,
149 | gen_min_span_tree=False, leaf_size=40, memory=Memory(None),
150 | metric='euclidean', min_cluster_size=5, min_samples=None, p=None)
151 |
152 |
153 |
154 | At this point we are actually done! We've done the clustering! But where
155 | are the results? How do I get the clusters? The clusterer object knows,
156 | and stores the result in an attribute ``labels_``.
157 |
158 | .. code:: python
159 |
160 | clusterer.labels_
161 |
162 |
163 | .. parsed-literal::
164 |
165 | array([2, 2, 2, ..., 2, 2, 0])
166 |
167 |
168 |
169 | So it is an array of integers. What are we to make of that? It is an
170 | array with an integer for each data sample. Samples that are in the same
171 | cluster get assigned the same number. The cluster labels start at 0 and count
172 | up. We can thus determine the number of clusters found by finding the largest
173 | cluster label.
174 |
175 | .. code:: python
176 |
177 | clusterer.labels_.max()
178 |
179 |
180 | .. parsed-literal::
181 |
182 | 2
183 |
184 | So we have a total of three clusters, with labels 0, 1, and 2.
185 | Importantly HDBSCAN is noise aware -- it has a notion of data samples
186 | that are not assigned to any cluster. This is handled by assigning these
187 | samples the label -1. But wait, there's more. The ``hdbscan`` library
188 | implements soft clustering, where each data point is assigned a cluster
189 | membership score ranging from 0.0 to 1.0. A score of 0.0 represents a
190 | sample that is not in the cluster at all (all noise points will get this
191 | score) while a score of 1.0 represents a sample that is at the heart of
192 | the cluster (note that this is not the spatial centroid notion of core).
193 | You can access these scores via the ``probabilities_`` attribute.
194 |
195 | .. code:: python
196 |
197 | clusterer.probabilities_
198 |
199 |
200 | .. parsed-literal::
201 |
202 | array([ 0.83890858, 1. , 0.72629904, ..., 0.79456452,
203 | 0.65311137, 0.76382928])
204 |
205 |
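As a small sanity-check sketch (using only the attributes described above),
you can pull the noise points out of ``labels_`` and confirm that they all
receive a membership score of 0.0:

.. code:: python

    import numpy as np

    noise_mask = clusterer.labels_ == -1
    # how many samples were left as noise, and do they all score 0.0?
    print(noise_mask.sum(), np.all(clusterer.probabilities_[noise_mask] == 0.0))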
206 |
207 | What about different metrics?
208 | -----------------------------
209 |
210 | That is all well and good, but even for data embedded in a vector
211 | space you may not want to consider distances between data points to be pure
212 | Euclidean distance. What can we do in that case? We are still in good
213 | shape, since ``hdbscan`` supports a wide variety of metrics, which you
214 | can set when creating the clusterer object. For example we can do the
215 | following:
216 |
217 | .. code:: python
218 |
219 | clusterer = hdbscan.HDBSCAN(metric='manhattan')
220 | clusterer.fit(blobs)
221 | clusterer.labels_
222 |
223 |
224 |
225 |
226 | .. parsed-literal::
227 |
228 | array([1, 1, 1, ..., 1, 1, 0])
229 |
230 |
231 |
232 | What metrics are supported? Because we simply steal metric computations
233 | from sklearn we get a large number of metrics readily available.
234 |
235 | .. code:: python
236 |
237 | hdbscan.dist_metrics.METRIC_MAPPING
238 |
239 |
240 |
241 |
242 | .. parsed-literal::
243 |
244 | {'braycurtis': hdbscan.dist_metrics.BrayCurtisDistance,
245 | 'canberra': hdbscan.dist_metrics.CanberraDistance,
246 | 'chebyshev': hdbscan.dist_metrics.ChebyshevDistance,
247 | 'cityblock': hdbscan.dist_metrics.ManhattanDistance,
248 | 'dice': hdbscan.dist_metrics.DiceDistance,
249 | 'euclidean': hdbscan.dist_metrics.EuclideanDistance,
250 | 'hamming': hdbscan.dist_metrics.HammingDistance,
251 | 'haversine': hdbscan.dist_metrics.HaversineDistance,
252 | 'infinity': hdbscan.dist_metrics.ChebyshevDistance,
253 | 'jaccard': hdbscan.dist_metrics.JaccardDistance,
254 | 'kulsinski': hdbscan.dist_metrics.KulsinskiDistance,
255 | 'l1': hdbscan.dist_metrics.ManhattanDistance,
256 | 'l2': hdbscan.dist_metrics.EuclideanDistance,
257 | 'mahalanobis': hdbscan.dist_metrics.MahalanobisDistance,
258 | 'manhattan': hdbscan.dist_metrics.ManhattanDistance,
259 | 'matching': hdbscan.dist_metrics.MatchingDistance,
260 | 'minkowski': hdbscan.dist_metrics.MinkowskiDistance,
261 | 'p': hdbscan.dist_metrics.MinkowskiDistance,
262 | 'pyfunc': hdbscan.dist_metrics.PyFuncDistance,
263 | 'rogerstanimoto': hdbscan.dist_metrics.RogersTanimotoDistance,
264 | 'russellrao': hdbscan.dist_metrics.RussellRaoDistance,
265 | 'seuclidean': hdbscan.dist_metrics.SEuclideanDistance,
266 | 'sokalmichener': hdbscan.dist_metrics.SokalMichenerDistance,
267 | 'sokalsneath': hdbscan.dist_metrics.SokalSneathDistance,
268 | 'wminkowski': hdbscan.dist_metrics.WMinkowskiDistance}
269 |
270 |
271 |
272 | Distance matrices
273 | -----------------
274 |
275 | What if you don't have a nice set of points in a vector space, but only
276 | have a pairwise distance matrix providing the distance between each pair
277 | of points? This is a common situation. Perhaps you have a complex custom
278 | distance measure; perhaps you have strings and are using Levenshtein
279 | distance, etc. Again, this is all fine as ``hdbscan`` supports a special
280 | metric called ``precomputed``. If you create the clusterer with the
281 | metric set to ``precomputed`` then the clusterer will assume that,
282 | rather than being handed a vector of points in a vector space, it is
283 | receiving an all-pairs distance matrix. Missing distances can be
284 | indicated by ``numpy.inf``, which leads HDBSCAN to ignore these pairwise
285 | relationships as long as there exists a path between two points that
286 | contains defined distances (i.e. if there are too many distances
287 | missing, the clustering is going to fail).
288 |
289 | NOTE: The input vectors *must* contain numerical data. If you have a
290 | distance matrix for non-numerical vectors, you will need to map your
291 | input vectors to numerical vectors (e.g. use the map ['A', 'G', 'C', 'T'] ->
292 | [1, 2, 3, 4] to replace the input vector ['A', 'A', 'A', 'C', 'G'] with
293 | [1, 1, 1, 3, 2]).
294 |
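A minimal sketch of that mapping (the letters and values are just the
hypothetical example from the note above):

.. code:: python

    mapping = {'A': 1, 'G': 2, 'C': 3, 'T': 4}
    numeric_vector = [mapping[ch] for ch in ['A', 'A', 'A', 'C', 'G']]
    # numeric_vector is now [1, 1, 1, 3, 2]
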
295 | .. code:: python
296 |
297 | from sklearn.metrics.pairwise import pairwise_distances
298 |
299 | .. code:: python
300 |
301 | distance_matrix = pairwise_distances(blobs)
302 | clusterer = hdbscan.HDBSCAN(metric='precomputed')
303 | clusterer.fit(distance_matrix)
304 | clusterer.labels_
305 |
306 |
307 |
308 |
309 | .. parsed-literal::
310 |
311 | array([1, 1, 1, ..., 1, 1, 2])
312 |
313 |
314 |
315 | Note that this result only appears different due to a different
316 | labelling order for the clusters.
317 |
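As a sketch of the missing-distance handling described above (the pair of
points marked here is an arbitrary choice), unknown distances can be set to
``numpy.inf`` before fitting:

.. code:: python

    import numpy as np

    distance_matrix = pairwise_distances(blobs)
    # mark one pair of points as having an unknown distance; HDBSCAN relies on
    # the remaining defined distances to connect them
    distance_matrix[0, 1] = distance_matrix[1, 0] = np.inf
    clusterer = hdbscan.HDBSCAN(metric='precomputed')
    clusterer.fit(distance_matrix)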
318 |
--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #
3 | # hdbscan documentation build configuration file, created by
4 | # sphinx-quickstart on Sat May 28 10:34:44 2016.
5 | #
6 | # This file is execfile()d with the current directory set to its
7 | # containing dir.
8 | #
9 | # Note that not all possible configuration values are present in this
10 | # autogenerated file.
11 | #
12 | # All configuration values have a default; values that are commented out
13 | # serve to show the default.
14 |
15 | import sys
16 | import os
17 | import sphinx_rtd_theme
18 |
19 | # If extensions (or modules to document with autodoc) are in another directory,
20 | # add these directories to sys.path here. If the directory is relative to the
21 | # documentation root, use os.path.abspath to make it absolute, like shown here.
22 |
23 | ### We now install the package in a virtualenv to build docs, so this is not needed
24 | # sys.path.insert(0, os.path.abspath('../'))
25 |
26 | # -- General configuration ------------------------------------------------
27 |
28 | # If your documentation needs a minimal Sphinx version, state it here.
29 | #needs_sphinx = '1.0'
30 |
31 | # Add any Sphinx extension module names here, as strings. They can be
32 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
33 | # ones.
34 | extensions = [
35 | 'sphinx.ext.autodoc',
36 | 'sphinx.ext.doctest',
37 | 'sphinx.ext.todo',
38 | 'sphinx.ext.coverage',
39 | 'sphinx.ext.imgmath',
40 | 'sphinx.ext.viewcode',
41 | # 'sphinx.ext.napoleon',
42 | # 'numpy_ext.numpydoc'
43 | ]
44 | #napoleon_google_docstring = False
45 | #napoleon_numpy_docstring = True
46 |
47 | # Add any paths that contain templates here, relative to this directory.
48 | templates_path = ['_templates']
49 |
50 | # The suffix(es) of source filenames.
51 | # You can specify multiple suffix as a list of string:
52 | # source_suffix = ['.rst', '.md']
53 | source_suffix = '.rst'
54 |
55 | # The encoding of source files.
56 | #source_encoding = 'utf-8-sig'
57 |
58 | # The master toctree document.
59 | master_doc = 'index'
60 |
61 | # General information about the project.
62 | project = u'hdbscan'
63 | copyright = u'2016, Leland McInnes, John Healy, Steve Astels'
64 | author = u'Leland McInnes, John Healy, Steve Astels'
65 |
66 | # The version info for the project you're documenting, acts as replacement for
67 | # |version| and |release|, also used in various other places throughout the
68 | # built documents.
69 | #
70 | # The short X.Y version.
71 | version = u'0.8.1'
72 | # The full version, including alpha/beta/rc tags.
73 | release = u'0.8.1'
74 |
75 | # The language for content autogenerated by Sphinx. Refer to documentation
76 | # for a list of supported languages.
77 | #
78 | # This is also used if you do content translation via gettext catalogs.
79 | # Usually you set "language" from the command line for these cases.
80 | language = None
81 |
82 | # There are two options for replacing |today|: either, you set today to some
83 | # non-false value, then it is used:
84 | #today = ''
85 | # Else, today_fmt is used as the format for a strftime call.
86 | #today_fmt = '%B %d, %Y'
87 |
88 | # List of patterns, relative to source directory, that match files and
89 | # directories to ignore when looking for source files.
90 | exclude_patterns = ['_build']
91 |
92 | # The reST default role (used for this markup: `text`) to use for all
93 | # documents.
94 | #default_role = None
95 |
96 | # If true, '()' will be appended to :func: etc. cross-reference text.
97 | #add_function_parentheses = True
98 |
99 | # If true, the current module name will be prepended to all description
100 | # unit titles (such as .. function::).
101 | #add_module_names = True
102 |
103 | # If true, sectionauthor and moduleauthor directives will be shown in the
104 | # output. They are ignored by default.
105 | #show_authors = False
106 |
107 | # The name of the Pygments (syntax highlighting) style to use.
108 | pygments_style = 'sphinx'
109 |
110 | # A list of ignored prefixes for module index sorting.
111 | #modindex_common_prefix = []
112 |
113 | # If true, keep warnings as "system message" paragraphs in the built documents.
114 | #keep_warnings = False
115 |
116 | # If true, `todo` and `todoList` produce output, else they produce nothing.
117 | todo_include_todos = True
118 |
119 |
120 | # -- Options for HTML output ----------------------------------------------
121 |
122 | # The theme to use for HTML and HTML Help pages. See the documentation for
123 | # a list of builtin themes.
124 | #html_theme = 'alabaster'
125 | html_theme = 'sphinx_rtd_theme'
126 |
127 | # Theme options are theme-specific and customize the look and feel of a theme
128 | # further. For a list of options available for each theme, see the
129 | # documentation.
130 | #html_theme_options = {}
131 |
132 | # Add any paths that contain custom themes here, relative to this directory.
133 | #html_theme_path = []
134 | html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
135 |
136 | # The name for this set of Sphinx documents. If None, it defaults to
137 | # "<project> v<release> documentation".
138 | #html_title = None
139 |
140 | # A shorter title for the navigation bar. Default is the same as html_title.
141 | #html_short_title = None
142 |
143 | # The name of an image file (relative to this directory) to place at the top
144 | # of the sidebar.
145 | #html_logo = None
146 |
147 | # The name of an image file (within the static path) to use as favicon of the
148 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
149 | # pixels large.
150 | #html_favicon = None
151 |
152 | # Add any paths that contain custom static files (such as style sheets) here,
153 | # relative to this directory. They are copied after the builtin static files,
154 | # so a file named "default.css" will overwrite the builtin "default.css".
155 | html_static_path = ['_static']
156 |
157 | # Add any extra paths that contain custom files (such as robots.txt or
158 | # .htaccess) here, relative to this directory. These files are copied
159 | # directly to the root of the documentation.
160 | #html_extra_path = []
161 |
162 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
163 | # using the given strftime format.
164 | #html_last_updated_fmt = '%b %d, %Y'
165 |
166 | # If true, SmartyPants will be used to convert quotes and dashes to
167 | # typographically correct entities.
168 | #html_use_smartypants = True
169 |
170 | # Custom sidebar templates, maps document names to template names.
171 | #html_sidebars = {}
172 |
173 | # Additional templates that should be rendered to pages, maps page names to
174 | # template names.
175 | #html_additional_pages = {}
176 |
177 | # If false, no module index is generated.
178 | #html_domain_indices = True
179 |
180 | # If false, no index is generated.
181 | #html_use_index = True
182 |
183 | # If true, the index is split into individual pages for each letter.
184 | #html_split_index = False
185 |
186 | # If true, links to the reST sources are added to the pages.
187 | #html_show_sourcelink = True
188 |
189 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
190 | #html_show_sphinx = True
191 |
192 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
193 | #html_show_copyright = True
194 |
195 | # If true, an OpenSearch description file will be output, and all pages will
196 | # contain a tag referring to it. The value of this option must be the
197 | # base URL from which the finished HTML is served.
198 | #html_use_opensearch = ''
199 |
200 | # This is the file name suffix for HTML files (e.g. ".xhtml").
201 | #html_file_suffix = None
202 |
203 | # Language to be used for generating the HTML full-text search index.
204 | # Sphinx supports the following languages:
205 | # 'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja'
206 | # 'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr'
207 | #html_search_language = 'en'
208 |
209 | # A dictionary with options for the search language support, empty by default.
210 | # Now only 'ja' uses this config value
211 | #html_search_options = {'type': 'default'}
212 |
213 | # The name of a javascript file (relative to the configuration directory) that
214 | # implements a search results scorer. If empty, the default will be used.
215 | #html_search_scorer = 'scorer.js'
216 |
217 | # Output file base name for HTML help builder.
218 | htmlhelp_basename = 'hdbscandoc'
219 |
220 | # -- Options for LaTeX output ---------------------------------------------
221 |
222 | latex_elements = {
223 | # The paper size ('letterpaper' or 'a4paper').
224 | #'papersize': 'letterpaper',
225 |
226 | # The font size ('10pt', '11pt' or '12pt').
227 | #'pointsize': '10pt',
228 |
229 | # Additional stuff for the LaTeX preamble.
230 | #'preamble': '',
231 |
232 | # Latex figure (float) alignment
233 | #'figure_align': 'htbp',
234 | }
235 |
236 | # Grouping the document tree into LaTeX files. List of tuples
237 | # (source start file, target name, title,
238 | # author, documentclass [howto, manual, or own class]).
239 | latex_documents = [
240 | (master_doc, 'hdbscan.tex', u'hdbscan Documentation',
241 | u'Leland McInnes, John Healy, Steve Astels', 'manual'),
242 | ]
243 |
244 | # The name of an image file (relative to this directory) to place at the top of
245 | # the title page.
246 | #latex_logo = None
247 |
248 | # For "manual" documents, if this is true, then toplevel headings are parts,
249 | # not chapters.
250 | #latex_use_parts = False
251 |
252 | # If true, show page references after internal links.
253 | #latex_show_pagerefs = False
254 |
255 | # If true, show URL addresses after external links.
256 | #latex_show_urls = False
257 |
258 | # Documents to append as an appendix to all manuals.
259 | #latex_appendices = []
260 |
261 | # If false, no module index is generated.
262 | #latex_domain_indices = True
263 |
264 |
265 | # -- Options for manual page output ---------------------------------------
266 |
267 | # One entry per manual page. List of tuples
268 | # (source start file, name, description, authors, manual section).
269 | man_pages = [
270 | (master_doc, 'hdbscan', u'hdbscan Documentation',
271 | [author], 1)
272 | ]
273 |
274 | # If true, show URL addresses after external links.
275 | #man_show_urls = False
276 |
277 |
278 | # -- Options for Texinfo output -------------------------------------------
279 |
280 | # Grouping the document tree into Texinfo files. List of tuples
281 | # (source start file, target name, title, author,
282 | # dir menu entry, description, category)
283 | texinfo_documents = [
284 | (master_doc, 'hdbscan', u'hdbscan Documentation',
285 | author, 'hdbscan', 'One line description of project.',
286 | 'Miscellaneous'),
287 | ]
288 |
289 | # Documents to append as an appendix to all manuals.
290 | #texinfo_appendices = []
291 |
292 | # If false, no module index is generated.
293 | #texinfo_domain_indices = True
294 |
295 | # How to display URL addresses: 'footnote', 'no', or 'inline'.
296 | #texinfo_show_urls = 'footnote'
297 |
298 | # If true, do not generate a @detailmenu in the "Top" node's menu.
299 | #texinfo_no_detailmenu = False
300 |
301 |
302 | # -- Options for Epub output ----------------------------------------------
303 |
304 | # Bibliographic Dublin Core info.
305 | epub_title = project
306 | epub_author = author
307 | epub_publisher = author
308 | epub_copyright = copyright
309 |
310 | # The basename for the epub file. It defaults to the project name.
311 | #epub_basename = project
312 |
313 | # The HTML theme for the epub output. Since the default themes are not
314 | # optimized for small screen space, using the same theme for HTML and epub
315 | # output is usually not wise. This defaults to 'epub', a theme designed to save
316 | # visual space.
317 | #epub_theme = 'epub'
318 |
319 | # The language of the text. It defaults to the language option
320 | # or 'en' if the language is not set.
321 | #epub_language = ''
322 |
323 | # The scheme of the identifier. Typical schemes are ISBN or URL.
324 | #epub_scheme = ''
325 |
326 | # The unique identifier of the text. This can be a ISBN number
327 | # or the project homepage.
328 | #epub_identifier = ''
329 |
330 | # A unique identification for the text.
331 | #epub_uid = ''
332 |
333 | # A tuple containing the cover image and cover page html template filenames.
334 | #epub_cover = ()
335 |
336 | # A sequence of (type, uri, title) tuples for the guide element of content.opf.
337 | #epub_guide = ()
338 |
339 | # HTML files that should be inserted before the pages created by sphinx.
340 | # The format is a list of tuples containing the path and title.
341 | #epub_pre_files = []
342 |
343 | # HTML files that should be inserted after the pages created by sphinx.
344 | # The format is a list of tuples containing the path and title.
345 | #epub_post_files = []
346 |
347 | # A list of files that should not be packed into the epub file.
348 | epub_exclude_files = ['search.html']
349 |
350 | # The depth of the table of contents in toc.ncx.
351 | #epub_tocdepth = 3
352 |
353 | # Allow duplicate toc entries.
354 | #epub_tocdup = True
355 |
356 | # Choose between 'default' and 'includehidden'.
357 | #epub_tocscope = 'default'
358 |
359 | # Fix unsupported image types using the Pillow.
360 | #epub_fix_images = False
361 |
362 | # Scale large images.
363 | #epub_max_image_width = 0
364 |
365 | # How to display URL addresses: 'footnote', 'no', or 'inline'.
366 | #epub_show_urls = 'inline'
367 |
368 | # If false, no index is generated.
369 | #epub_use_index = True
370 |
--------------------------------------------------------------------------------
/docs/dbscan_from_hdbscan.rst:
--------------------------------------------------------------------------------
1 |
2 | Extracting DBSCAN* clustering from HDBSCAN*
3 | ===========================================
4 |
5 | There are a number of reasons that one might prefer `DBSCAN `__'s
6 | clustering over that of HDBSCAN*. The biggest difficulty many folks have with
7 | DBSCAN is that the epsilon distance parameter can be hard to determine and often
8 | requires a great deal of trial and error to tune. If your data live in a more
9 | interpretable space and you have a good notion of distance in that space, this problem
10 | is certainly mitigated, and you might want to set a very specific epsilon distance
11 | for your use case. Another viable use case is that a user is interested in a
12 | constant-density clustering.
13 | HDBSCAN* does variable density clustering by default, looking for the clusters that persist
14 | over a wide range of epsilon distance parameters to find a 'natural' clustering. This might
15 | not be the right result for your application. A DBSCAN clustering at a particular
16 | epsilon value might work better for your particular task.
17 |
18 | HDBSCAN returns a very natural clustering of your data which is often very useful in exploring
19 | a new data set. That doesn't necessarily make it the right clustering algorithm for every
20 | task.
21 |
22 | HDBSCAN* can best be thought of as a DBSCAN* implementation which varies across
23 | all epsilon values and extracts the clusters that persist over the widest range
24 | of these parameter choices. It is therefore able to ignore the epsilon parameter and
25 | only needs the minimum cluster size as its single input parameter.
26 | The 'eom' (Excess of Mass) cluster selection method then returns clusters with the
27 | best stability over epsilon.
28 |
29 | There are a number of alternative ways of extracting a flat clustering from
30 | the HDBSCAN* hierarchical tree. If one is interested in finer resolution
31 | clusters while still maintaining variable density one could set
32 | ``cluster_selection_method='leaf'`` to extract the leaves of the condensed
33 | tree instead of the most persistent clusters. For more details on these
34 | cluster selection methods see :ref:`leaf_clustering_label`.
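
As a minimal sketch (assuming a data array ``X`` and an arbitrary ``min_cluster_size``),
leaf extraction can be requested directly when constructing the clusterer:

.. code:: python

    import hdbscan

    # 'leaf' takes the leaves of the condensed tree rather than the most
    # persistent ('eom') clusters, yielding finer-grained clusters.
    leaf_clusterer = hdbscan.HDBSCAN(min_cluster_size=15,
                                     cluster_selection_method='leaf').fit(X)
    leaf_labels = leaf_clusterer.labels_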
35 |
36 | If one isn't interested in the variable density clustering that is the hallmark of
37 | HDBSCAN*, it is relatively easy to extract any DBSCAN* clustering from a
38 | single run of HDBSCAN*. This has the advantage of allowing you to perform
39 | a single computationally efficient HDBSCAN* run and then quickly search over
40 | the DBSCAN* parameter space by extracting clustering results from our
41 | pre-constructed tree. This can save significant computational time when
42 | searching across multiple cluster parameter settings on large amounts of data.
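
As a rough sketch of what such a search might look like (assuming an HDBSCAN model
``h_cluster`` fitted as in the example further below, and an illustrative list of
epsilon values), one can sweep several thresholds against the same pre-built tree:

.. code:: python

    # Each call re-cuts the already-built tree, so the sweep is cheap
    # compared to re-running DBSCAN from scratch at every epsilon value.
    for eps in [0.1, 0.2, 0.5, 1.0]:
        labels = h_cluster.dbscan_clustering(cut_distance=eps, min_cluster_size=5)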
43 |
44 | Alternatively, one could make use of the ``cluster_selection_epsilon`` as a
45 | post processing step with any ``cluster_selection_method`` in order to
46 | return a hybrid clustering of DBSCAN* and HDBSCAN*. For more details on
47 | this see :doc:`how_to_use_epsilon`.
48 |
49 | In order to extract a DBSCAN* clustering from an HDBSCAN* run we must first train
50 | an HDBSCAN model on our data.
51 |
52 | .. code:: python
53 |
54 | import hdbscan
55 |     h_cluster = hdbscan.HDBSCAN(min_samples=5, match_reference_implementation=True).fit(X)
56 |
57 | The ``min_cluster_size`` parameter is unimportant in this case in that it is
58 | only used in the creation of our condensed tree which we won't be using here.
59 | Now we choose a ``cut_distance`` which is just another name for the epsilon
60 | threshold in DBSCAN and will be passed to our
61 | :py:meth:`~hdbscan.hdbscan_.dbscan_clustering` method.
62 |
63 | .. code:: python
64 |
65 |     import seaborn as sns
66 |     labels = h_cluster.dbscan_clustering(cut_distance=0.2, min_cluster_size=5)
67 |     sns.scatterplot(x=X[:,0], y=X[:,1], hue=labels.astype(str));
68 |
69 | .. image:: images/dbscan_from_hdbscan_clustering.png
70 | :align: center
71 |
72 | It should be noted that a DBSCAN* clustering extracted from our HDBSCAN* tree will
73 | not precisely match the clustering results from sklearn's DBSCAN implementation.
74 | Our clustering results should better match DBSCAN* (which can be thought of as
75 | DBSCAN without the border points). As such, when comparing the two results one
76 | should expect them to differ mostly in the points that DBSCAN considers border
77 | points. We'll deal with
78 | this by comparing the clustering results only on the points identified
79 | by DBSCAN as core points. We can see below that the differences between these two
80 | clusterings mostly occur in the boundaries of the clusters. This matches our
81 | intuition of stability within the core points.
82 |
83 | .. image:: images/dbscan_from_hdbscan_comparision.png
84 | :align: center
85 |
86 | For a slightly more empirical comparison we make use of the `adjusted rand score `__
87 | to compare the clustering of the core points between a DBSCAN clustering from sklearn and
88 | a DBSCAN* clustering extracted from our HDBSCAN* object.
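
As a minimal sketch of how such a comparison might be set up (assuming the data ``X``
and the fitted ``h_cluster`` from above, plus an illustrative epsilon value):

.. code:: python

    from sklearn.cluster import DBSCAN
    from sklearn.metrics import adjusted_rand_score

    eps = 0.2
    db = DBSCAN(eps=eps, min_samples=5).fit(X)
    core = db.core_sample_indices_  # indices of the points DBSCAN considers core

    hdb_labels = h_cluster.dbscan_clustering(cut_distance=eps, min_cluster_size=5)

    # Restrict the comparison to core points, where the two algorithms should agree.
    score = adjusted_rand_score(db.labels_[core], hdb_labels[core])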
89 |
90 | .. image:: images/dbscan_from_hdbscan_percentage_core.png
91 | :align: center
92 |
93 | .. image:: images/dbscan_from_hdbscan_number_of_clusters.png
94 | :align: center
95 |
96 | We see that for very small epsilon values the numbers of clusters found by the two
97 | algorithms tend to be quite far apart, largely due to a large number of the points being considered boundary points
98 | instead of core points. As the epsilon value increases, more and more points are
99 | considered core and the numbers of clusters generated by each algorithm converge.
100 |
101 | Additionally, the adjusted rand score between the core points of both algorithms
102 | stays consistently high (mostly 1.0) for our entire range of epsilon. There may
103 | be some minor discrepancies between core point results, largely due to implementation
104 | details and optimizations within the code base.
105 |
106 | Why might one just extract the DBSCAN* clustering results from a single HDBSCAN* run
107 | instead of making use of sklearn's DBSCAN code? The short answer is efficiency.
108 | If you aren't sure what epsilon parameter to select for DBSCAN then you may have to
109 | run the algorithm many times on your data set. While those runs can be inexpensive for
110 | very small epsilon values they can get quite expensive for large parameter values.
111 |
112 | In this small benchmark case of 50,000 two-dimensional data points we have broken even
113 | after trying only two epsilon parameters with DBSCAN, or only a single
114 | run with a large parameter selected. This trend is only exacerbated for larger
115 | data sets in higher dimensional spaces. For more detailed scaling experiments see
116 | `Accelerated Hierarchical Density Clustering `__
117 | by McInnes and Healy.
118 |
119 | .. image:: images/dbscan_from_hdbscan_timing.png
120 | :align: center
121 |
122 |
123 |
124 |
125 |
126 |
127 |
--------------------------------------------------------------------------------
/docs/docs_requirements.txt:
--------------------------------------------------------------------------------
1 | sphinx_rtd_theme
2 |
--------------------------------------------------------------------------------
/docs/faq.rst:
--------------------------------------------------------------------------------
1 | Frequently Asked Questions
2 | ==========================
3 |
4 | Here we attempt to address some common questions, directing the user to some
5 | helpful answers.
6 |
7 | Q: Most of my data is classified as noise; why?
8 | ------------------------------------------------
9 |
10 | The amount of data classified as noise is controlled by the ``min_samples``
11 | parameter. By default, if not otherwise set, this value is set to the same
12 | value as ``min_cluster_size``. You can set it independently if you wish by
13 | specifying it separately. The lower the value, the less noise you'll get, but
14 | there are limits, and it is possible that you simply have noisy data. See
15 | :any:`min_samples ` for more details.
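
As a minimal sketch (assuming a data array ``X`` and illustrative parameter values),
the two parameters can be set independently like so:

.. code:: python

    import hdbscan

    # min_samples defaults to min_cluster_size; setting it lower typically
    # results in fewer points being labelled as noise (label -1).
    clusterer = hdbscan.HDBSCAN(min_cluster_size=15, min_samples=5).fit(X)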
16 |
17 | Q: I mostly just get one large cluster; I want smaller clusters.
18 | ----------------------------------------------------------------
19 |
20 | If you are getting a single large cluster and a few small outlying clusters
21 | that means your data is essentially a large glob with some small outlying
22 | clusters -- there may be structure to the glob, but compared to how well
23 | separated those other small clusters are, it doesn't really show up. You may,
24 | however, want to get at that more fine grained structure. You can do that,
25 | and what you are looking for is :any:`leaf clustering `.
26 |
27 | Q: HDBSCAN is failing to separate the clusters I think it should.
28 | -----------------------------------------------------------------
29 |
30 | Density based clustering relies on having enough data to separate dense areas.
31 | In higher dimensional spaces this becomes more difficult, and hence
32 | requires more data. Quite possibly there is not enough data to make your
33 | clusters clearly separable. Consider the following plots:
34 |
35 | .. image:: images/generative_model_scatter.png
36 | .. image:: images/generative_model_kde.png
37 |
38 | Four different generative models, when sampled, produce results that are hard to
39 | differentiate.
40 | standard Gaussians centered at (-2, 0), (0,0) and (2,0); the green dataset is
41 | sampled from a mixture of two standard Gaussians centered at (-1,0) and (1,0);
42 | the red data is sampled from a multivariate Gaussian with covariance
43 | [2, 0; 0, 1]; the purple data is a single standard Gaussian with uniform
44 | background noise.
45 |
46 | Despite the generative models having clearly different "clusters", without more
47 | data we simply cannot differentiate between these models, and hence no
48 | density based clustering will manage to cluster these according to the model.
49 |
50 | Q: I am not getting the claimed performance. Why not?
51 | -----------------------------------------------------
52 |
53 | The most likely explanation has to do with the dimensionality of your input data.
54 | While HDBSCAN can perform well on low to medium dimensional data the performance
55 | tends to decrease significantly as dimension increases. In general HDBSCAN can do
56 | well on up to around 50 or 100 dimensional data, but performance can see
57 | significant decreases beyond that. Of course a lot is also dataset dependent, so
58 | you can still get good performance even on high dimensional data, but it
59 | is no longer guaranteed.
60 |
61 | Q: I want to predict the cluster of a new unseen point. How do I do this?
62 | -------------------------------------------------------------------------
63 |
64 | This is possible via the function :func:`~hdbscan.prediction.approximate_predict`. Note that you
65 | either need to set ``prediction_data=True`` on initialization of your
66 | clusterer object, or run the ``generate_prediction_data`` method after
67 | fitting. With that done you can run :func:`~hdbscan.prediction.approximate_predict` with the model
68 | and any new data points you wish to predict. Note that this differs from
69 | re-running HDBSCAN with the new points added since no new clusters will be
70 | considered -- instead the new points will be labelled according to the
71 | clusters already labelled by the model.
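
As a minimal sketch (assuming training data ``X`` and a hypothetical array of new
points ``new_points``):

.. code:: python

    import hdbscan

    # prediction_data=True caches the extra structures needed for prediction
    clusterer = hdbscan.HDBSCAN(min_cluster_size=15, prediction_data=True).fit(X)
    new_labels, strengths = hdbscan.approximate_predict(clusterer, new_points)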
72 |
73 | Q: Haversine metric is not clustering my Lat-Lon data correctly.
74 | ----------------------------------------------------------------
75 |
76 | The Haversine metric as implemented supports coordinates in radians. That
77 | means you'll need to convert your latitude and longitude data into radians
78 | before passing it in to HDBSCAN.
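
As a minimal sketch (``lat_lon_degrees`` is a hypothetical ``(n, 2)`` array of
latitude/longitude pairs in degrees):

.. code:: python

    import numpy as np
    import hdbscan

    X = np.radians(lat_lon_degrees)  # haversine expects radians, not degrees
    clusterer = hdbscan.HDBSCAN(min_cluster_size=10, metric='haversine').fit(X)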
79 |
80 | Q: I want to cite this software in my journal publication. How do I do that?
81 | ----------------------------------------------------------------------------
82 |
83 | If you have used this codebase in a scientific publication and wish to cite it, please use the `Journal of Open Source Software article `_.
84 |
85 | L. McInnes, J. Healy, S. Astels, *hdbscan: Hierarchical density based clustering*
86 | In: Journal of Open Source Software, The Open Journal, volume 2, number 11.
87 | 2017
88 |
89 | BibTeX::
90 |
91 | @article{McInnes2017,
92 | doi = {10.21105/joss.00205},
93 | url = {https://doi.org/10.21105%2Fjoss.00205},
94 | year = {2017},
95 | month = {mar},
96 | publisher = {The Open Journal},
97 | volume = {2},
98 | number = {11},
99 | author = {Leland McInnes and John Healy and Steve Astels},
100 | title = {hdbscan: Hierarchical density based clustering},
101 | journal = {The Journal of Open Source Software}
102 | }
103 |
104 |
105 |
--------------------------------------------------------------------------------
/docs/how_to_use_epsilon.rst:
--------------------------------------------------------------------------------
1 |
2 | Combining HDBSCAN\* with DBSCAN
3 | =================================
4 |
5 | While DBSCAN needs a minimum cluster size *and* a distance threshold epsilon as user-defined input parameters,
6 | HDBSCAN\* is basically a DBSCAN implementation for varying epsilon values and therefore only needs the minimum cluster size as single input parameter.
7 | The ``'eom'`` (Excess of Mass) cluster selection method then returns clusters with the best stability over epsilon.
8 |
9 | Unlike DBSCAN, this allows it to find clusters of variable densities without having to choose a suitable distance threshold first.
10 | However, there are cases where we could still benefit from the use of an epsilon threshold.
11 |
12 | For illustration, see this map with GPS locations, representing recorded pick-up and drop-off locations for customers of a ride pooling provider.
13 | The largest (visual) data cluster can be found around the train station. Smaller clusters are placed along the streets, depending on the requested location
14 | in the form of a postal address or point of interest. Since we are considering a door-to-door system where customers are not bound to collective pick-up or
15 | drop-off locations, we are interested in both large clusters and small clusters with a minimum size of 4.
16 |
17 | .. image:: images/epsilon_parameter_dataset.png
18 | :align: center
19 |
20 | Clustering the given data set with `DBSCAN `__ and an epsilon threshold of 5 meters gives us good results,
21 | but neglects clusters with points that are more than 5 meters apart from each other.
22 | However, increasing epsilon would result in cluster chains along the streets, especially when working with a larger data set.
23 |
24 | .. image:: images/epsilon_parameter_dbscan.png
25 | :align: center
26 |
27 | Unfortunately, HDBSCAN\* does not produce any better results in this case: while it discovers the clusters that DBSCAN missed, it also returns a very high number of micro-clusters around the train station,
28 | even though we would prefer one or only a few clusters representing this location. We could achieve this by increasing ``min_cluster_size`` or
29 | the smoothing parameter ``min_samples``, but with the trade-off of losing small clusters in less dense areas or merging them into other clusters
30 | separated by a relatively large distance.
31 |
32 | .. image:: images/epsilon_parameter_hdbscan_eom.png
33 | :align: center
34 |
35 | This is where the parameter ``cluster_selection_epsilon`` comes into play. The cluster extraction method using this parameter, as described in detail
36 | by `Malzer and Baum `__, acts like a hybrid between DBSCAN
37 | (or, to be precise, DBSCAN\*, i.e. DBSCAN without the border points) by extracting DBSCAN results for data partitions
38 | affected by the given parameter value, and HDBSCAN\* results for all others.
39 |
40 | In our example, we choose to merge nested clusters below 5 meters (0.005 kilometers) and therefore set the parameter ``cluster_selection_epsilon`` accordingly:
41 |
42 | .. code:: python
43 |
44 |     X = np.radians(coordinates)  # convert the list of lat/lon coordinates to radians
45 |     earth_radius_km = 6371
46 |     epsilon = 0.005 / earth_radius_km  # calculate the 5 meter epsilon threshold
47 |
48 |     clusterer = hdbscan.HDBSCAN(min_cluster_size=4, metric='haversine',
49 |                                 cluster_selection_epsilon=epsilon, cluster_selection_method='eom')
50 | clusterer.fit(X)
51 |
52 | And indeed, the result looks like a mix between DBSCAN and HDBSCAN(eom). We no longer lose clusters of variable densities beyond the given epsilon, but at the
53 | same time avoid the abundance of micro-clusters in the original HDBSCAN\* clustering, which was an undesired side-effect of having to choose a low ``min_cluster_size`` value.
54 |
55 | .. image:: images/epsilon_parameter_hdbscan_eps.png
56 | :align: center
57 |
58 | Note that for the given parameter setting, running HDBSCAN\* based on ``cluster_selection_method = 'eom'`` or ``cluster_selection_method = 'leaf'`` does not make
59 | any difference: the ``cluster_selection_epsilon`` threshold neutralizes the effect of HDBSCAN(eom)'s stability calculations.
60 | When using a lower threshold, some minor differences can be noticed. For example, an epsilon value of 3 meters with ``'eom'`` produces the same results as
61 | the 5 meter value on the given data set, but 3 meters in combination with ``'leaf'`` achieves a slightly different result:
62 |
63 | .. image:: images/epsilon_parameter_hdbscan_e3_leaf.png
64 | :align: center
65 |
66 | A ``cluster_selection_epsilon`` value of 0 (the default value) always returns the original HDBSCAN\* results, either according to ``'eom'`` or ``'leaf'``.
67 |
68 |
--------------------------------------------------------------------------------
/docs/images/advanced_hdbscan_11_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/advanced_hdbscan_11_1.png
--------------------------------------------------------------------------------
/docs/images/advanced_hdbscan_26_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/advanced_hdbscan_26_1.png
--------------------------------------------------------------------------------
/docs/images/advanced_hdbscan_3_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/advanced_hdbscan_3_1.png
--------------------------------------------------------------------------------
/docs/images/advanced_hdbscan_5_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/advanced_hdbscan_5_1.png
--------------------------------------------------------------------------------
/docs/images/advanced_hdbscan_9_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/advanced_hdbscan_9_1.png
--------------------------------------------------------------------------------
/docs/images/allow_single_cluster.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/allow_single_cluster.png
--------------------------------------------------------------------------------
/docs/images/comparing_clustering_algorithms_12_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/comparing_clustering_algorithms_12_0.png
--------------------------------------------------------------------------------
/docs/images/comparing_clustering_algorithms_15_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/comparing_clustering_algorithms_15_0.png
--------------------------------------------------------------------------------
/docs/images/comparing_clustering_algorithms_18_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/comparing_clustering_algorithms_18_0.png
--------------------------------------------------------------------------------
/docs/images/comparing_clustering_algorithms_21_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/comparing_clustering_algorithms_21_0.png
--------------------------------------------------------------------------------
/docs/images/comparing_clustering_algorithms_24_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/comparing_clustering_algorithms_24_0.png
--------------------------------------------------------------------------------
/docs/images/comparing_clustering_algorithms_27_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/comparing_clustering_algorithms_27_0.png
--------------------------------------------------------------------------------
/docs/images/comparing_clustering_algorithms_31_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/comparing_clustering_algorithms_31_0.png
--------------------------------------------------------------------------------
/docs/images/comparing_clustering_algorithms_6_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/comparing_clustering_algorithms_6_0.png
--------------------------------------------------------------------------------
/docs/images/epsilon_parameter_dataset.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/epsilon_parameter_dataset.png
--------------------------------------------------------------------------------
/docs/images/epsilon_parameter_dbscan.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/epsilon_parameter_dbscan.png
--------------------------------------------------------------------------------
/docs/images/epsilon_parameter_hdbscan_e3_leaf.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/epsilon_parameter_hdbscan_e3_leaf.png
--------------------------------------------------------------------------------
/docs/images/epsilon_parameter_hdbscan_eom.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/epsilon_parameter_hdbscan_eom.png
--------------------------------------------------------------------------------
/docs/images/epsilon_parameter_hdbscan_eps.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/epsilon_parameter_hdbscan_eps.png
--------------------------------------------------------------------------------
/docs/images/generative_model_kde.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/generative_model_kde.png
--------------------------------------------------------------------------------
/docs/images/generative_model_scatter.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/generative_model_scatter.png
--------------------------------------------------------------------------------
/docs/images/how_hdbscan_works_10_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/how_hdbscan_works_10_1.png
--------------------------------------------------------------------------------
/docs/images/how_hdbscan_works_12_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/how_hdbscan_works_12_1.png
--------------------------------------------------------------------------------
/docs/images/how_hdbscan_works_15_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/how_hdbscan_works_15_1.png
--------------------------------------------------------------------------------
/docs/images/how_hdbscan_works_18_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/how_hdbscan_works_18_1.png
--------------------------------------------------------------------------------
/docs/images/how_hdbscan_works_20_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/how_hdbscan_works_20_1.png
--------------------------------------------------------------------------------
/docs/images/how_hdbscan_works_3_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/how_hdbscan_works_3_1.png
--------------------------------------------------------------------------------
/docs/images/how_to_detect_branches_13_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/how_to_detect_branches_13_0.png
--------------------------------------------------------------------------------
/docs/images/how_to_detect_branches_15_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/how_to_detect_branches_15_0.png
--------------------------------------------------------------------------------
/docs/images/how_to_detect_branches_17_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/how_to_detect_branches_17_0.png
--------------------------------------------------------------------------------
/docs/images/how_to_detect_branches_19_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/how_to_detect_branches_19_0.png
--------------------------------------------------------------------------------
/docs/images/how_to_detect_branches_21_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/how_to_detect_branches_21_0.png
--------------------------------------------------------------------------------
/docs/images/how_to_detect_branches_23_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/how_to_detect_branches_23_0.png
--------------------------------------------------------------------------------
/docs/images/how_to_detect_branches_25_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/how_to_detect_branches_25_0.png
--------------------------------------------------------------------------------
/docs/images/how_to_detect_branches_3_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/how_to_detect_branches_3_0.png
--------------------------------------------------------------------------------
/docs/images/how_to_detect_branches_5_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/how_to_detect_branches_5_0.png
--------------------------------------------------------------------------------
/docs/images/how_to_detect_branches_7_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/how_to_detect_branches_7_0.png
--------------------------------------------------------------------------------
/docs/images/how_to_detect_branches_9_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/how_to_detect_branches_9_0.png
--------------------------------------------------------------------------------
/docs/images/outlier_detection_3_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/outlier_detection_3_1.png
--------------------------------------------------------------------------------
/docs/images/outlier_detection_7_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/outlier_detection_7_1.png
--------------------------------------------------------------------------------
/docs/images/outlier_detection_9_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/outlier_detection_9_1.png
--------------------------------------------------------------------------------
/docs/images/parameter_selection_11_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/parameter_selection_11_1.png
--------------------------------------------------------------------------------
/docs/images/parameter_selection_12_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/parameter_selection_12_1.png
--------------------------------------------------------------------------------
/docs/images/parameter_selection_15_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/parameter_selection_15_1.png
--------------------------------------------------------------------------------
/docs/images/parameter_selection_18_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/parameter_selection_18_1.png
--------------------------------------------------------------------------------
/docs/images/parameter_selection_3_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/parameter_selection_3_1.png
--------------------------------------------------------------------------------
/docs/images/parameter_selection_7_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/parameter_selection_7_1.png
--------------------------------------------------------------------------------
/docs/images/parameter_selection_9_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/parameter_selection_9_1.png
--------------------------------------------------------------------------------
/docs/images/performance_and_scalability_14_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/performance_and_scalability_14_1.png
--------------------------------------------------------------------------------
/docs/images/performance_and_scalability_20_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/performance_and_scalability_20_2.png
--------------------------------------------------------------------------------
/docs/images/performance_and_scalability_24_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/performance_and_scalability_24_1.png
--------------------------------------------------------------------------------
/docs/images/performance_and_scalability_9_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/performance_and_scalability_9_1.png
--------------------------------------------------------------------------------
/docs/images/prediction_tutorial_3_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/prediction_tutorial_3_0.png
--------------------------------------------------------------------------------
/docs/images/prediction_tutorial_5_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/prediction_tutorial_5_1.png
--------------------------------------------------------------------------------
/docs/images/prediction_tutorial_9_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/prediction_tutorial_9_1.png
--------------------------------------------------------------------------------
/docs/images/soft_clustering_10_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/soft_clustering_10_1.png
--------------------------------------------------------------------------------
/docs/images/soft_clustering_13_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/soft_clustering_13_1.png
--------------------------------------------------------------------------------
/docs/images/soft_clustering_15_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/soft_clustering_15_0.png
--------------------------------------------------------------------------------
/docs/images/soft_clustering_3_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/soft_clustering_3_1.png
--------------------------------------------------------------------------------
/docs/images/soft_clustering_6_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/soft_clustering_6_1.png
--------------------------------------------------------------------------------
/docs/images/soft_clustering_8_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/soft_clustering_8_1.png
--------------------------------------------------------------------------------
/docs/images/soft_clustering_explanation_11_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/soft_clustering_explanation_11_0.png
--------------------------------------------------------------------------------
/docs/images/soft_clustering_explanation_15_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/soft_clustering_explanation_15_0.png
--------------------------------------------------------------------------------
/docs/images/soft_clustering_explanation_26_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/soft_clustering_explanation_26_0.png
--------------------------------------------------------------------------------
/docs/images/soft_clustering_explanation_2_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/soft_clustering_explanation_2_0.png
--------------------------------------------------------------------------------
/docs/images/soft_clustering_explanation_31_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/soft_clustering_explanation_31_0.png
--------------------------------------------------------------------------------
/docs/images/soft_clustering_explanation_36_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/soft_clustering_explanation_36_0.png
--------------------------------------------------------------------------------
/docs/images/soft_clustering_explanation_6_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/soft_clustering_explanation_6_0.png
--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
1 | .. hdbscan documentation master file, created by
2 | sphinx-quickstart on Sat May 28 10:34:44 2016.
3 | You can adapt this file completely to your liking, but it should at least
4 | contain the root `toctree` directive.
5 |
6 | The hdbscan Clustering Library
7 | ==============================
8 |
9 | The hdbscan library is a suite of tools to use unsupervised learning to find clusters, or
10 | dense regions, of a dataset. The primary algorithm is HDBSCAN* as proposed by Campello,
11 | Moulavi, and Sander. The library provides a high performance implementation of this algorithm,
12 | along with tools for analysing the resulting clustering.
13 |
14 |
15 | User Guide / Tutorial
16 | ---------------------
17 |
18 | .. toctree::
19 | :maxdepth: 2
20 |
21 | basic_hdbscan
22 | advanced_hdbscan
23 | parameter_selection
24 | outlier_detection
25 | prediction_tutorial
26 | soft_clustering
27 | how_to_use_epsilon
28 | dbscan_from_hdbscan
29 | how_to_detect_branches
30 | faq
31 |
32 | Background on Clustering with HDBSCAN
33 | -------------------------------------
34 |
35 | .. toctree::
36 | :maxdepth: 2
37 |
38 | how_hdbscan_works
39 | comparing_clustering_algorithms
40 | performance_and_scalability
41 | soft_clustering_explanation
42 |
43 | API Reference
44 | -------------
45 |
46 | .. toctree::
47 |
48 | api
49 |
50 | Indices and tables
51 | ==================
52 |
53 | * :ref:`genindex`
54 | * :ref:`modindex`
55 | * :ref:`search`
56 |
57 |
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | REM Command file for Sphinx documentation
4 |
5 | if "%SPHINXBUILD%" == "" (
6 | set SPHINXBUILD=sphinx-build
7 | )
8 | set BUILDDIR=_build
9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% .
10 | set I18NSPHINXOPTS=%SPHINXOPTS% .
11 | if NOT "%PAPER%" == "" (
12 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS%
13 | set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS%
14 | )
15 |
16 | if "%1" == "" goto help
17 |
18 | if "%1" == "help" (
19 | :help
20 | echo.Please use `make ^` where ^ is one of
21 | echo. html to make standalone HTML files
22 | echo. dirhtml to make HTML files named index.html in directories
23 | echo. singlehtml to make a single large HTML file
24 | echo. pickle to make pickle files
25 | echo. json to make JSON files
26 | echo. htmlhelp to make HTML files and a HTML help project
27 | echo. qthelp to make HTML files and a qthelp project
28 | echo. devhelp to make HTML files and a Devhelp project
29 | echo. epub to make an epub
30 | echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter
31 | echo. text to make text files
32 | echo. man to make manual pages
33 | echo. texinfo to make Texinfo files
34 | echo. gettext to make PO message catalogs
35 | echo. changes to make an overview over all changed/added/deprecated items
36 | echo. xml to make Docutils-native XML files
37 | echo. pseudoxml to make pseudoxml-XML files for display purposes
38 | echo. linkcheck to check all external links for integrity
39 | echo. doctest to run all doctests embedded in the documentation if enabled
40 | echo. coverage to run coverage check of the documentation if enabled
41 | goto end
42 | )
43 |
44 | if "%1" == "clean" (
45 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i
46 | del /q /s %BUILDDIR%\*
47 | goto end
48 | )
49 |
50 |
51 | REM Check if sphinx-build is available and fallback to Python version if any
52 | %SPHINXBUILD% 1>NUL 2>NUL
53 | if errorlevel 9009 goto sphinx_python
54 | goto sphinx_ok
55 |
56 | :sphinx_python
57 |
58 | set SPHINXBUILD=python -m sphinx.__init__
59 | %SPHINXBUILD% 2> nul
60 | if errorlevel 9009 (
61 | echo.
62 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
63 | echo.installed, then set the SPHINXBUILD environment variable to point
64 | echo.to the full path of the 'sphinx-build' executable. Alternatively you
65 | echo.may add the Sphinx directory to PATH.
66 | echo.
67 | echo.If you don't have Sphinx installed, grab it from
68 | echo.http://sphinx-doc.org/
69 | exit /b 1
70 | )
71 |
72 | :sphinx_ok
73 |
74 |
75 | if "%1" == "html" (
76 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html
77 | if errorlevel 1 exit /b 1
78 | echo.
79 | echo.Build finished. The HTML pages are in %BUILDDIR%/html.
80 | goto end
81 | )
82 |
83 | if "%1" == "dirhtml" (
84 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml
85 | if errorlevel 1 exit /b 1
86 | echo.
87 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml.
88 | goto end
89 | )
90 |
91 | if "%1" == "singlehtml" (
92 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml
93 | if errorlevel 1 exit /b 1
94 | echo.
95 | echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml.
96 | goto end
97 | )
98 |
99 | if "%1" == "pickle" (
100 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle
101 | if errorlevel 1 exit /b 1
102 | echo.
103 | echo.Build finished; now you can process the pickle files.
104 | goto end
105 | )
106 |
107 | if "%1" == "json" (
108 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json
109 | if errorlevel 1 exit /b 1
110 | echo.
111 | echo.Build finished; now you can process the JSON files.
112 | goto end
113 | )
114 |
115 | if "%1" == "htmlhelp" (
116 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp
117 | if errorlevel 1 exit /b 1
118 | echo.
119 | echo.Build finished; now you can run HTML Help Workshop with the ^
120 | .hhp project file in %BUILDDIR%/htmlhelp.
121 | goto end
122 | )
123 |
124 | if "%1" == "qthelp" (
125 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp
126 | if errorlevel 1 exit /b 1
127 | echo.
128 | echo.Build finished; now you can run "qcollectiongenerator" with the ^
129 | .qhcp project file in %BUILDDIR%/qthelp, like this:
130 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\hdbscan.qhcp
131 | echo.To view the help file:
132 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\hdbscan.ghc
133 | goto end
134 | )
135 |
136 | if "%1" == "devhelp" (
137 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp
138 | if errorlevel 1 exit /b 1
139 | echo.
140 | echo.Build finished.
141 | goto end
142 | )
143 |
144 | if "%1" == "epub" (
145 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub
146 | if errorlevel 1 exit /b 1
147 | echo.
148 | echo.Build finished. The epub file is in %BUILDDIR%/epub.
149 | goto end
150 | )
151 |
152 | if "%1" == "latex" (
153 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
154 | if errorlevel 1 exit /b 1
155 | echo.
156 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex.
157 | goto end
158 | )
159 |
160 | if "%1" == "latexpdf" (
161 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
162 | cd %BUILDDIR%/latex
163 | make all-pdf
164 | cd %~dp0
165 | echo.
166 | echo.Build finished; the PDF files are in %BUILDDIR%/latex.
167 | goto end
168 | )
169 |
170 | if "%1" == "latexpdfja" (
171 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
172 | cd %BUILDDIR%/latex
173 | make all-pdf-ja
174 | cd %~dp0
175 | echo.
176 | echo.Build finished; the PDF files are in %BUILDDIR%/latex.
177 | goto end
178 | )
179 |
180 | if "%1" == "text" (
181 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text
182 | if errorlevel 1 exit /b 1
183 | echo.
184 | echo.Build finished. The text files are in %BUILDDIR%/text.
185 | goto end
186 | )
187 |
188 | if "%1" == "man" (
189 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man
190 | if errorlevel 1 exit /b 1
191 | echo.
192 | echo.Build finished. The manual pages are in %BUILDDIR%/man.
193 | goto end
194 | )
195 |
196 | if "%1" == "texinfo" (
197 | %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo
198 | if errorlevel 1 exit /b 1
199 | echo.
200 | echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo.
201 | goto end
202 | )
203 |
204 | if "%1" == "gettext" (
205 | %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale
206 | if errorlevel 1 exit /b 1
207 | echo.
208 | echo.Build finished. The message catalogs are in %BUILDDIR%/locale.
209 | goto end
210 | )
211 |
212 | if "%1" == "changes" (
213 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes
214 | if errorlevel 1 exit /b 1
215 | echo.
216 | echo.The overview file is in %BUILDDIR%/changes.
217 | goto end
218 | )
219 |
220 | if "%1" == "linkcheck" (
221 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck
222 | if errorlevel 1 exit /b 1
223 | echo.
224 | echo.Link check complete; look for any errors in the above output ^
225 | or in %BUILDDIR%/linkcheck/output.txt.
226 | goto end
227 | )
228 |
229 | if "%1" == "doctest" (
230 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest
231 | if errorlevel 1 exit /b 1
232 | echo.
233 | echo.Testing of doctests in the sources finished, look at the ^
234 | results in %BUILDDIR%/doctest/output.txt.
235 | goto end
236 | )
237 |
238 | if "%1" == "coverage" (
239 | %SPHINXBUILD% -b coverage %ALLSPHINXOPTS% %BUILDDIR%/coverage
240 | if errorlevel 1 exit /b 1
241 | echo.
242 | echo.Testing of coverage in the sources finished, look at the ^
243 | results in %BUILDDIR%/coverage/python.txt.
244 | goto end
245 | )
246 |
247 | if "%1" == "xml" (
248 | %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml
249 | if errorlevel 1 exit /b 1
250 | echo.
251 | echo.Build finished. The XML files are in %BUILDDIR%/xml.
252 | goto end
253 | )
254 |
255 | if "%1" == "pseudoxml" (
256 | %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml
257 | if errorlevel 1 exit /b 1
258 | echo.
259 | echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml.
260 | goto end
261 | )
262 |
263 | :end
264 |
--------------------------------------------------------------------------------
/docs/outlier_detection.rst:
--------------------------------------------------------------------------------
1 |
2 | Outlier Detection
3 | =================
4 |
5 | The hdbscan library supports the GLOSH outlier detection algorithm, and
6 | does so within the HDBSCAN clustering class. The GLOSH outlier detection
7 | algorithm is related to older outlier detection methods such as
8 | `LOF <https://en.wikipedia.org/wiki/Local_outlier_factor>`__ and
9 | LOCI.
10 | It is a fast and flexible outlier detection system, and supports a
11 | notion of local outliers. This means that it can detect outliers that
12 | are noticeably different from points in their local region (for example,
13 | points not on a local submanifold) but that are not necessarily outliers
14 | globally. So how do we find outliers? We proceed identically to the
15 | basic use of HDBSCAN\*. We start with some data, and fit it with an
16 | HDBSCAN object.
17 |
18 | .. code:: python
19 |
20 | plt.scatter(*data.T, s=50, linewidth=0, c='b', alpha=0.25)
21 |
22 |
23 | .. image:: images/outlier_detection_3_1.png
24 |
25 |
26 | .. code:: python
27 |
28 | clusterer = hdbscan.HDBSCAN(min_cluster_size=15).fit(data)
29 |
30 | The ``clusterer`` object now has an attribute (computed when first accessed)
31 | called ``outlier_scores_``. This provides a numpy array with a value for
32 | each sample in the original dataset that was fit with the ``clusterer``. The
33 | higher the score, the more likely the point is to be an outlier. In
34 | practice it is often best to look at the distributions of outlier
35 | scores.
36 |
37 | .. code:: python
38 |
39 | clusterer.outlier_scores_
40 |
41 |
42 |
43 |
44 | .. parsed-literal::
45 |
46 | array([ 0.14791852, 0.14116731, 0.09171929, ..., 0.62050534,
47 | 0.56749298, 0.20681685])
48 |
49 |
50 |
51 | .. code:: python
52 |
53 | sns.distplot(clusterer.outlier_scores_[np.isfinite(clusterer.outlier_scores_)], rug=True)
54 |
55 | .. image:: images/outlier_detection_7_1.png
56 |
57 |
58 | We can pull off upper quantiles to detect outliers, which we can then
59 | plot.
60 |
61 | .. code:: python
62 |
63 | threshold = pd.Series(clusterer.outlier_scores_).quantile(0.9)
64 | outliers = np.where(clusterer.outlier_scores_ > threshold)[0]
65 | plt.scatter(*data.T, s=50, linewidth=0, c='gray', alpha=0.25)
66 | plt.scatter(*data[outliers].T, s=50, linewidth=0, c='red', alpha=0.5)
67 |
68 | .. image:: images/outlier_detection_9_1.png
69 |
70 |
71 | Note that not only are the outlying border points highlighted as
72 | outliers, but points at the edge of the central ball-like cluster, and
73 | just below the vertical band cluster, are also designated as outliers.
74 | This is because those two clusters are extremely dense, and the points
75 | at their edges are close enough to the clusters that they should be
76 | part of them, but far enough from being core parts of the clusters
77 | that they are extremely unlikely and hence anomalous.
78 |
79 |
--------------------------------------------------------------------------------
/docs/parameter_selection.rst:
--------------------------------------------------------------------------------
1 |
2 | Parameter Selection for HDBSCAN\*
3 | =================================
4 |
5 | While the HDBSCAN class has a large number of parameters that can be set
6 | on initialization, in practice there are a very small number of
7 | parameters that have significant practical effect on clustering. We will
8 | focus on those major parameters, and consider how one may go about
9 | choosing them effectively.
10 |
11 | .. _min_cluster_size_label:
12 |
13 | Selecting ``min_cluster_size``
14 | ------------------------------
15 |
16 | The primary parameter affecting the resulting clustering is
17 | ``min_cluster_size``. Ideally this is a relatively intuitive parameter
18 | to select -- set it to the smallest size grouping that you wish to
19 | consider a cluster. It can have slightly non-obvious effects however.
20 | Let's consider the digits dataset from sklearn. We can project the data
21 | into two dimensions to visualize it via t-SNE.
22 |
23 | .. code:: python
24 |
25 | digits = datasets.load_digits()
26 | data = digits.data
27 | projection = TSNE().fit_transform(data)
28 | plt.scatter(*projection.T, **plot_kwds)
29 |
30 |
31 | .. image:: images/parameter_selection_3_1.png
32 |
33 |
34 | If we cluster this data in the full 64 dimensional space with HDBSCAN\* we
35 | can see some effects from varying the ``min_cluster_size``.
36 |
37 | We start with a ``min_cluster_size`` of 15.
38 |
39 | .. code:: python
40 |
41 | clusterer = hdbscan.HDBSCAN(min_cluster_size=15).fit(data)
42 | color_palette = sns.color_palette('Paired', 12)
43 | cluster_colors = [color_palette[x] if x >= 0
44 | else (0.5, 0.5, 0.5)
45 | for x in clusterer.labels_]
46 | cluster_member_colors = [sns.desaturate(x, p) for x, p in
47 | zip(cluster_colors, clusterer.probabilities_)]
48 | plt.scatter(*projection.T, s=50, linewidth=0, c=cluster_member_colors, alpha=0.25)
49 |
50 |
51 | .. image:: images/parameter_selection_7_1.png
52 |
53 |
54 | Increasing the ``min_cluster_size`` to 30 reduces the number of
55 | clusters, merging some together. This is a result of HDBSCAN\*
56 | reoptimizing which flat clustering provides greater stability under a
57 | slightly different notion of what constitutes a cluster.
58 |
59 | .. code:: python
60 |
61 | clusterer = hdbscan.HDBSCAN(min_cluster_size=30).fit(data)
62 | color_palette = sns.color_palette('Paired', 12)
63 | cluster_colors = [color_palette[x] if x >= 0
64 | else (0.5, 0.5, 0.5)
65 | for x in clusterer.labels_]
66 | cluster_member_colors = [sns.desaturate(x, p) for x, p in
67 | zip(cluster_colors, clusterer.probabilities_)]
68 | plt.scatter(*projection.T, s=50, linewidth=0, c=cluster_member_colors, alpha=0.25)
69 |
70 | .. image:: images/parameter_selection_9_1.png
71 |
72 |
73 | Doubling the ``min_cluster_size`` again to 60 gives us just two clusters
74 | -- the really core clusters. This is somewhat as expected, but surely
75 | some of the clusters that we found previously had more than 60
76 | members? Why are they now being considered noise? The answer is that
77 | HDBSCAN\* has a second parameter ``min_samples``. The implementation
78 | defaults this value (if it is unspecified) to whatever
79 | ``min_cluster_size`` is set to. We can recover some of our original
80 | clusters by explicitly providing ``min_samples`` at the original value
81 | of 15.
82 |
83 | .. code:: python
84 |
85 | clusterer = hdbscan.HDBSCAN(min_cluster_size=60).fit(data)
86 | color_palette = sns.color_palette('Paired', 12)
87 | cluster_colors = [color_palette[x] if x >= 0
88 | else (0.5, 0.5, 0.5)
89 | for x in clusterer.labels_]
90 | cluster_member_colors = [sns.desaturate(x, p) for x, p in
91 | zip(cluster_colors, clusterer.probabilities_)]
92 | plt.scatter(*projection.T, s=50, linewidth=0, c=cluster_member_colors, alpha=0.25)
93 |
94 |
95 | .. image:: images/parameter_selection_11_1.png
96 |
97 |
98 | .. code:: python
99 |
100 | clusterer = hdbscan.HDBSCAN(min_cluster_size=60, min_samples=15).fit(data)
101 | color_palette = sns.color_palette('Paired', 12)
102 | cluster_colors = [color_palette[x] if x >= 0
103 | else (0.5, 0.5, 0.5)
104 | for x in clusterer.labels_]
105 | cluster_member_colors = [sns.desaturate(x, p) for x, p in
106 | zip(cluster_colors, clusterer.probabilities_)]
107 | plt.scatter(*projection.T, s=50, linewidth=0, c=cluster_member_colors, alpha=0.25)
108 |
109 | .. image:: images/parameter_selection_12_1.png
110 |
111 |
112 | As you can see this results in us recovering something much closer to
113 | our original clustering, only now with some of the smaller clusters
114 | pruned out. Thus ``min_cluster_size`` does behave more closely to our
115 | intuitions, but only if we fix ``min_samples``.
116 |
117 | If you wish to explore different ``min_cluster_size`` settings with
118 | a fixed ``min_samples`` value, especially for larger dataset sizes,
119 | you can cache the hard computation, and recompute only the relatively
120 | cheap flat cluster extraction using the ``memory`` parameter, which
121 | makes use of `joblib <https://joblib.readthedocs.io/>`_.
122 |
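A minimal sketch of this workflow (assuming ``data`` is already loaded; the
cache directory name is an illustrative assumption):

.. code:: python

    # Repeated fits reuse the cached hard computation, because only the cheap
    # flat cluster extraction depends on ``min_cluster_size``.
    for mcs in (15, 30, 60):
        clusterer = hdbscan.HDBSCAN(min_cluster_size=mcs, min_samples=15,
                                    memory='./hdbscan_cache').fit(data)
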
123 | .. _min_samples_label:
124 |
125 | Selecting ``min_samples``
126 | --------------------------
127 |
128 | Since we have seen that ``min_samples`` clearly has a dramatic effect on
129 | clustering, the question becomes: how do we select this parameter? The
130 | simplest intuition for what ``min_samples`` does is provide a measure of
131 | how conservative you want your clustering to be. The larger the value of
132 | ``min_samples`` you provide, the more conservative the clustering --
133 | more points will be declared as noise, and clusters will be restricted
134 | to progressively more dense areas. We can see this in practice by
135 | leaving the ``min_cluster_size`` at 60, but reducing ``min_samples`` to
136 | 1.
137 |
138 | Note: adjusting ``min_samples`` will result in re-running the **hard
139 | computation** of the single linkage tree.
140 |
141 | .. code:: python
142 |
143 | clusterer = hdbscan.HDBSCAN(min_cluster_size=60, min_samples=1).fit(data)
144 | color_palette = sns.color_palette('Paired', 12)
145 | cluster_colors = [color_palette[x] if x >= 0
146 | else (0.5, 0.5, 0.5)
147 | for x in clusterer.labels_]
148 | cluster_member_colors = [sns.desaturate(x, p) for x, p in
149 | zip(cluster_colors, clusterer.probabilities_)]
150 | plt.scatter(*projection.T, s=50, linewidth=0, c=cluster_member_colors, alpha=0.25)
151 |
152 |
153 |
154 |
161 |
162 | .. image:: images/parameter_selection_15_1.png
163 |
164 |
165 | Now most points are clustered, and there are far fewer noise points.
166 | Steadily increasing ``min_samples`` will, as we saw in the examples
167 | above, make the clustering progressively more conservative, culminating
168 | in the example above where ``min_samples`` was set to 60 and we had only
169 | two clusters with most points declared as noise.
170 |
171 | .. _epsilon_label:
172 |
173 | Selecting ``cluster_selection_epsilon``
174 | ---------------------------------------
175 |
176 | In some cases, we want to choose a small ``min_cluster_size`` because even groups of few points might be of interest to us.
177 | However, if our data set also contains partitions with high concentrations of objects, this parameter setting can result in
178 | a large number of micro-clusters. Selecting a value for ``cluster_selection_epsilon`` helps us to merge clusters in these regions.
179 | Or in other words, it ensures that clusters below the given threshold are not split up any further.
180 |
181 | The choice of ``cluster_selection_epsilon`` depends on the given distances between your data points. For example, set the value to 0.5 if you don't want to
182 | separate clusters that are less than 0.5 units apart. This will basically extract DBSCAN* clusters for epsilon = 0.5 from the condensed cluster tree, but leave
183 | HDBSCAN* clusters that emerged at distances greater than 0.5 untouched. See :doc:`how_to_use_epsilon` for a more detailed demonstration of the effect this parameter
184 | has on the resulting clustering.
185 |
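A minimal sketch (assuming ``data`` uses distances on roughly this scale; the
parameter values are illustrative):

.. code:: python

    # Clusters that would otherwise be split at distances below 0.5 are kept
    # together; behaviour at distances greater than 0.5 is unchanged.
    clusterer = hdbscan.HDBSCAN(min_cluster_size=5,
                                cluster_selection_epsilon=0.5).fit(data)
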
186 | .. _alpha_label:
187 |
188 | Selecting ``alpha``
189 | --------------------
190 |
191 | A further parameter that affects the resulting clustering is ``alpha``.
192 | In practice it is best not to mess with this parameter -- ultimately it
193 | is part of the ``RobustSingleLinkage`` code, but flows naturally into
194 | HDBSCAN\*. If, for some reason, ``min_samples`` or ``cluster_selection_epsilon`` is not providing you
195 | what you need, stop, rethink things, and try again with ``min_samples`` or ``cluster_selection_epsilon``.
196 | If you still need to play with another parameter (and you shouldn't),
197 | then you can try setting ``alpha``. The ``alpha`` parameter provides a
198 | slightly different approach to determining how conservative the
199 | clustering is. By default ``alpha`` is set to 1.0. Increasing ``alpha``
200 | will make the clustering more conservative, but on a much tighter scale,
201 | as we can see by setting ``alpha`` to 1.3.
202 |
203 | Note: adjusting ``alpha`` will result in re-running the **hard
204 | computation** of the single linkage tree.
205 |
206 | .. code:: python
207 |
208 | clusterer = hdbscan.HDBSCAN(min_cluster_size=60, min_samples=15, alpha=1.3).fit(data)
209 | color_palette = sns.color_palette('Paired', 12)
210 | cluster_colors = [color_palette[x] if x >= 0
211 | else (0.5, 0.5, 0.5)
212 | for x in clusterer.labels_]
213 | cluster_member_colors = [sns.desaturate(x, p) for x, p in
214 | zip(cluster_colors, clusterer.probabilities_)]
215 | plt.scatter(*projection.T, s=50, linewidth=0, c=cluster_member_colors, alpha=0.25)
216 |
217 | .. image:: images/parameter_selection_18_1.png
218 |
219 |
220 | .. _leaf_clustering_label:
221 |
222 | Leaf clustering
223 | ---------------
224 |
225 | HDBSCAN supports an extra parameter ``cluster_selection_method`` to determine
226 | how it selects flat clusters from the cluster tree hierarchy. The default
227 | method is ``'eom'`` for Excess of Mass, the algorithm described in
228 | :doc:`how_hdbscan_works`. This is not always the most desirable approach to
229 | cluster selection. If you are more interested in having small homogeneous
230 | clusters then you may find Excess of Mass has a tendency to pick one or two
231 | large clusters and then a number of small extra clusters. In this situation
232 | you may be tempted to recluster just the data in the single large cluster.
233 | Instead, a better option is to select ``'leaf'`` as a cluster selection
234 | method. This will select leaf nodes from the tree, producing many small
235 | homogeneous clusters. Note that you can still get variable density clusters
236 | via this method, and it is also still possible to get large clusters, but
237 | there will be a tendency to produce a finer-grained clustering than
238 | Excess of Mass can provide.
239 |
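A minimal sketch of selecting leaf clustering (the ``min_cluster_size`` value
is illustrative):

.. code:: python

    clusterer = hdbscan.HDBSCAN(min_cluster_size=15,
                                cluster_selection_method='leaf').fit(data)
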
240 | .. _single_cluster_label:
241 |
242 | Allowing a single cluster
243 | -------------------------
244 |
245 | In contrast, if you are getting lots of small clusters, but believe there
246 | should be some larger scale structure (or the possibility of no structure),
247 | consider the ``allow_single_cluster`` option. By default HDBSCAN\* does not
248 | allow a single cluster to be returned -- this is due to how the Excess of
249 | Mass algorithm works, and a bias towards the root cluster that may occur. You
250 | can override this behaviour and see what clustering would look like if you
251 | allow a single cluster to be returned. This can alleviate issues caused by
252 | there only being a single large cluster, or by data that is essentially just
253 | noise. For example, the image below shows the effects of setting
254 | ``allow_single_cluster=True`` in the bottom row, compared to the top row
255 | which used default settings.
256 |
257 | .. image:: images/allow_single_cluster.png
258 |
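A minimal sketch of enabling this option (the ``min_cluster_size`` value is
illustrative):

.. code:: python

    clusterer = hdbscan.HDBSCAN(min_cluster_size=15,
                                allow_single_cluster=True).fit(data)
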
--------------------------------------------------------------------------------
/docs/prediction_tutorial.rst:
--------------------------------------------------------------------------------
1 |
2 | Predicting clusters for new points
3 | ==================================
4 |
5 | Often it is useful to train a model once on a large amount of data, and
6 | then query the model repeatedly with small amounts of new data. This is
7 | hard for HDBSCAN\* as it is a transductive method -- new data points
8 | can (and should!) alter the underlying clustering. That is,
9 | given new information it might make sense to create a new cluster, split
10 | an existing cluster, or merge two previously separate clusters. If the
11 | actual clusters (and hence their labels) change with each new data point
12 | it becomes impossible to compare the cluster assignments between such
13 | queries.
14 |
15 | We can accommodate this by effectively holding a clustering fixed (after
16 | a potentially expensive training run) and then asking: *if we do not
17 | change the existing clusters*, which cluster would HDBSCAN\* assign a new
18 | data point to? In practice this amounts to determining where in the
19 | condensed tree the new data point would fall (see
20 | :any:`how_hdbscan_works`) assuming we do not change the condensed
21 | tree. This allows for a very inexpensive operation to compute a
22 | predicted cluster for the new data point.
23 |
24 | This has been implemented in ``hdbscan`` as the
25 | :py:func:`~hdbscan.predict.approximate_predict` function. We'll look
26 | at how this works below.
27 |
28 | As usual we begin with our test synthetic data set, and cluster it with
29 | HDBSCAN. The primary point to note here, however, is the use of the
30 | ``prediction_data=True`` keyword argument. This ensures that HDBSCAN
31 | does a little extra computation when fitting the model that can
32 | dramatically speed up the prediction queries later.
33 |
34 | You can also get an HDBSCAN object to create this data after the fact
35 | via the :py:meth:`~hdbscan.HDBSCAN.generate_prediction_data` method.
36 |
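For a clusterer that was already fitted without this option, a minimal sketch
of generating the prediction data after the fact:

.. code:: python

    clusterer.generate_prediction_data()
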
37 | .. code:: python
38 |
39 | data = np.load('clusterable_data.npy')
40 | clusterer = hdbscan.HDBSCAN(min_cluster_size=15, prediction_data=True).fit(data)
41 | pal = sns.color_palette('deep', 8)
42 | colors = [sns.desaturate(pal[col], sat) for col, sat in zip(clusterer.labels_,
43 | clusterer.probabilities_)]
44 | plt.scatter(data.T[0], data.T[1], c=colors, **plot_kwds);
45 |
46 |
47 |
48 | .. image:: images/prediction_tutorial_3_0.png
49 |
50 |
51 | Now to make things a little more interesting let's generate 50 new data
52 | points scattered across the data. We can plot them in black to see where
53 | they happen to fall.
54 |
55 | .. code:: python
56 |
57 | test_points = np.random.random(size=(50, 2)) - 0.5
58 |
59 | colors = [sns.desaturate(pal[col], sat) for col, sat in zip(clusterer.labels_,
60 | clusterer.probabilities_)]
61 | plt.scatter(data.T[0], data.T[1], c=colors, **plot_kwds);
62 | plt.scatter(*test_points.T, c='k', s=50)
63 |
64 |
65 |
66 | .. image:: images/prediction_tutorial_5_1.png
67 |
68 |
69 | We can use the predict API on this data, calling
70 | :py:func:`~hdbscan.predict.approximate_predict` with the HDBSCAN object,
71 | and the numpy array of new points. Note that
72 | :py:func:`~hdbscan.predict.approximate_predict` takes an *array* of new
73 | points. If you have a single point be sure to wrap it in a list.
74 |
75 | .. code:: python
76 |
77 | test_labels, strengths = hdbscan.approximate_predict(clusterer, test_points)
78 | test_labels
79 |
80 |
81 |
82 |
83 | .. parsed-literal::
84 |
85 | array([ 2, -1, -1, -1, -1, -1, 1, 5, -1, -1, 5, -1, -1, -1, -1, 4, -1,
86 | -1, -1, -1, -1, 4, -1, -1, -1, -1, 2, -1, -1, 1, -1, -1, -1, 0,
87 | -1, 2, -1, -1, 3, -1, -1, 1, -1, -1, -1, -1, -1, 5, 3, 2])
88 |
89 |
90 |
91 | The result is a set of labels as you can see. Many of the points are
92 | classified as noise, but several are also assigned to clusters. This is
93 | a very fast operation, even with large datasets, as long as the HDBSCAN
94 | object has the prediction data generated beforehand.
95 |
96 | We can also visualize how this worked, coloring the new data points by
97 | the cluster to which they were assigned. I have added a black border
98 | around the points so they don't get lost inside the clusters they fall
99 | into.
100 |
101 | .. code:: python
102 |
103 | colors = [sns.desaturate(pal[col], sat) for col, sat in zip(clusterer.labels_,
104 | clusterer.probabilities_)]
105 | test_colors = [pal[col] if col >= 0 else (0.1, 0.1, 0.1) for col in test_labels]
106 | plt.scatter(data.T[0], data.T[1], c=colors, **plot_kwds);
107 | plt.scatter(*test_points.T, c=test_colors, s=80, linewidths=1, edgecolors='k')
108 |
109 |
110 |
111 | .. image:: images/prediction_tutorial_9_1.png
112 |
113 |
114 | It is as simple as that. So now you can get started using HDBSCAN as a
115 | streaming clustering service -- just be sure to cache your data and
116 | retrain your model periodically to avoid drift!
117 |
118 |
--------------------------------------------------------------------------------
/docs/soft_clustering.rst:
--------------------------------------------------------------------------------
1 |
2 | Soft Clustering for HDBSCAN\*
3 | =============================
4 |
5 | Soft clustering is a new (and still somewhat experimental) feature of
6 | the hdbscan library. It takes advantage of the fact that the condensed
7 | tree is a kind of smoothed density function over data points, and the
8 | notion of exemplars for clusters. If you want to better understand how
9 | soft clustering works please refer to :any:`soft_clustering_explanation`.
10 |
11 | Let's consider the digits dataset from sklearn. We can project the data
12 | into two dimensions to visualize it via t-SNE.
13 |
14 | .. code:: python
15 |
16 | from sklearn import datasets
17 | from sklearn.manifold import TSNE
18 | import matplotlib.pyplot as plt
19 | import seaborn as sns
20 | import numpy as np
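
    # ``plot_kwds`` is used by the scatter plots below; these exact values are
    # an illustrative assumption, not taken from the original notebook.
    plot_kwds = {'alpha': 0.25, 's': 50, 'linewidths': 0}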
21 |
22 | .. code:: python
23 |
24 | digits = datasets.load_digits()
25 | data = digits.data
26 | projection = TSNE().fit_transform(data)
27 | plt.scatter(*projection.T, **plot_kwds)
28 |
29 |
30 | .. image:: images/soft_clustering_3_1.png
31 |
32 |
33 | Now we import hdbscan and then cluster in the full 64 dimensional space.
34 | It is important to note that, if we wish to use soft clustering, we
35 | should use the ``prediction_data=True`` option for HDBSCAN. This will
36 | ensure we generate the extra data required to allow soft
37 | clustering to work.
38 |
39 | .. code:: python
40 |
41 | import hdbscan
42 |
43 | .. code:: python
44 |
45 | clusterer = hdbscan.HDBSCAN(min_cluster_size=10, prediction_data=True).fit(data)
46 | color_palette = sns.color_palette('Paired', 12)
47 | cluster_colors = [color_palette[x] if x >= 0
48 | else (0.5, 0.5, 0.5)
49 | for x in clusterer.labels_]
50 | cluster_member_colors = [sns.desaturate(x, p) for x, p in
51 | zip(cluster_colors, clusterer.probabilities_)]
52 | plt.scatter(*projection.T, s=50, linewidth=0, c=cluster_member_colors, alpha=0.25)
53 |
54 |
55 |
56 | .. image:: images/soft_clustering_6_1.png
57 |
58 |
59 | Certainly a number of clusters were found, but the data is fairly noisy
60 | in 64 dimensions, so there are a number of points that have been
61 | classified as noise. We can generate a soft clustering to get more
62 | information about some of these noise points.
63 |
64 | To generate a soft clustering for all the points in the original dataset
65 | we use the
66 | :py:func:`~hdbscan.prediction.all_points_membership_vectors` function
67 | which takes a clusterer object. If we wanted to get soft cluster
68 | membership values for a set of new unseen points we could use
69 | :py:func:`~hdbscan.prediction.membership_vector` instead.
70 |
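For new, unseen points, a minimal sketch of the latter (``new_points`` is a
hypothetical array of query points):

.. code:: python

    new_membership = hdbscan.membership_vector(clusterer, new_points)
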
71 | The return value of ``all_points_membership_vectors`` is a two-dimensional numpy
72 | array; each point of the input data is assigned a vector of probabilities of being in a cluster.
73 | For a first pass we can visualize the data looking at what the *most
74 | likely* cluster was, by coloring according to the ``argmax`` of the
75 | probability vector (i.e. the cluster for which a given point has the
76 | highest probability of being in).
77 |
78 | .. code:: python
79 |
80 | soft_clusters = hdbscan.all_points_membership_vectors(clusterer)
81 | color_palette = sns.color_palette('Paired', 12)
82 | cluster_colors = [color_palette[np.argmax(x)]
83 | for x in soft_clusters]
84 | plt.scatter(*projection.T, s=50, linewidth=0, c=cluster_colors, alpha=0.25)
85 |
86 |
87 |
88 | .. image:: images/soft_clustering_8_1.png
89 |
90 |
91 | This fills out the clusters nicely -- we see that there were many noise
92 | points that are most likely to belong to the clusters we would expect;
93 | we can also see where things have gotten confused in the middle, and
94 | there is a mix of cluster assignments.
95 |
96 | We are still only using part of the information however; we can
97 | desaturate according to the actual probability value for the most likely
98 | cluster.
99 |
100 | .. code:: python
101 |
102 | color_palette = sns.color_palette('Paired', 12)
103 | cluster_colors = [sns.desaturate(color_palette[np.argmax(x)], np.max(x))
104 | for x in soft_clusters]
105 | plt.scatter(*projection.T, s=50, linewidth=0, c=cluster_colors, alpha=0.25)
106 |
107 |
108 |
109 | .. image:: images/soft_clustering_10_1.png
110 |
111 |
112 | We see that many points actually have a low probability of being in the
113 | cluster -- indeed the soft clustering applies *within* a cluster, so
114 | only the very cores of each cluster have high probabilities. In practice
115 | desaturating is a fairly strong treatment; visually a lot will look
116 | gray. We could apply a function and put a lower limit on the
117 | desaturation that agrees better with human visual perception, but that is
118 | left as an exercise for the reader.
119 |
120 | Instead we'll explore what else we can learn about the data from these
121 | cluster membership probabilities. An interesting question is which
122 | points have high likelihoods for *two* clusters (and low likelihoods for
123 | the other clusters).
124 |
125 | .. code:: python
126 |
127 | def top_two_probs_diff(probs):
128 | sorted_probs = np.sort(probs)
129 | return sorted_probs[-1] - sorted_probs[-2]
130 |
131 | # Compute the differences between the top two probabilities
132 | diffs = np.array([top_two_probs_diff(x) for x in soft_clusters])
133 | # Select out the indices that have a small difference, and a larger total probability
134 | mixed_points = np.where((diffs < 0.001) & (np.sum(soft_clusters, axis=1) > 0.5))[0]
135 |
136 | .. code:: python
137 |
138 | colors = [(0.75, 0.1, 0.1) if x in mixed_points
139 | else (0.5, 0.5, 0.5) for x in range(data.shape[0])]
140 | plt.scatter(*projection.T, s=50, linewidth=0, c=colors, alpha=0.5)
141 |
142 |
143 |
144 |
145 | .. image:: images/soft_clustering_13_1.png
146 |
147 |
148 | We can look at a few of these and see that many are, indeed, hard to
149 | classify (even for humans). It also seems that the digit 8 was not assigned
150 | a cluster of its own and is seen as a mixture of other clusters.
151 |
152 | .. code:: python
153 |
154 | fig = plt.figure()
155 | for i, image in enumerate(digits.images[mixed_points][:16]):
156 | ax = fig.add_subplot(4,4,i+1)
157 | ax.imshow(image)
158 | plt.tight_layout()
159 |
160 |
161 |
162 | .. image:: images/soft_clustering_15_0.png
163 |
164 |
165 | There is, of course, a lot more analysis that can be done from here, but
166 | hopefully this provides sufficient introduction to what can be achieved
167 | with soft clustering.
168 |
169 |
--------------------------------------------------------------------------------
/environment.yml:
--------------------------------------------------------------------------------
1 | name: hdbscan
2 | dependencies:
3 | - python>=3.5
4 | - scikit-learn>=0.19
5 | - matplotlib>=2.0
6 | - seaborn>=0.8
7 | - hdbscan>=0.8.11
8 |
--------------------------------------------------------------------------------
/examples/plot_cluster_comparison.py:
--------------------------------------------------------------------------------
1 | """
2 | =========================================================
3 | Comparing different clustering algorithms on toy datasets
4 | =========================================================
5 |
6 | This example aims at showing characteristics of different
7 | clustering algorithms on datasets that are "interesting"
8 | but still in 2D. The last dataset is an example of a 'null'
9 | situation for clustering: the data is homogeneous, and
10 | there is no good clustering.
11 |
12 | While these examples give some intuition about the algorithms,
13 | this intuition might not apply to very high dimensional data.
14 |
15 | The results could be improved by tweaking the parameters for
16 | each clustering strategy, for instance setting the number of
17 | clusters for the methods that need this parameter
18 | specified. Note that affinity propagation has a tendency to
19 | create many clusters. Thus in this example its two parameters
20 | (damping and per-point preference) were set to mitigate this
21 | behavior.
22 | """
23 | print(__doc__)
24 |
25 | import time
26 |
27 | import numpy as np
28 | import matplotlib.pyplot as plt
29 |
30 | from sklearn import cluster, datasets
31 | from sklearn.neighbors import kneighbors_graph
32 | from sklearn.preprocessing import StandardScaler
33 |
34 | import hdbscan
35 |
36 | np.random.seed(0)
37 | plt.style.use('fivethirtyeight')
38 |
39 | def make_var_density_blobs(n_samples=750, centers=[[0,0]], cluster_std=[0.5], random_state=0):
40 | samples_per_blob = n_samples // len(centers)
41 | blobs = [datasets.make_blobs(n_samples=samples_per_blob, centers=[c], cluster_std=cluster_std[i])[0]
42 | for i, c in enumerate(centers)]
43 | labels = [i * np.ones(samples_per_blob) for i in range(len(centers))]
44 | return np.vstack(blobs), np.hstack(labels)
45 |
46 | # Generate datasets. We choose the size big enough to see the scalability
47 | # of the algorithms, but not too big to avoid too long running times
48 | n_samples = 1500
49 | noisy_circles = datasets.make_circles(n_samples=n_samples, factor=.5,
50 | noise=.08)
51 | noisy_moons = datasets.make_moons(n_samples=n_samples, noise=.10)
52 | blobs = datasets.make_blobs(n_samples=n_samples-200, random_state=8)
53 | noisy_blobs = np.vstack((blobs[0], 25.0*np.random.rand(200, 2)-[10.0,10.0])), np.hstack((blobs[1], -1*np.ones(200)))
54 | varying_blobs = make_var_density_blobs(n_samples,
55 | centers=[[1, 1],
56 | [-1, -1],
57 | [1, -1]],
58 | cluster_std=[0.2, 0.35, 0.5])
59 | no_structure = np.random.rand(n_samples, 2), None
60 |
61 | colors = np.array([x for x in 'bgrcmykbgrcmykbgrcmykbgrcmyk'])
62 | colors = np.hstack([colors] * 20)
63 |
64 | clustering_names = [
65 | 'MiniBatchKMeans', 'AffinityPropagation',
66 | 'SpectralClustering', 'AgglomerativeClustering',
67 | 'DBSCAN', 'HDBSCAN']
68 |
69 | plt.figure(figsize=(len(clustering_names) * 2 + 3, 9.5))
70 | plt.subplots_adjust(left=.02, right=.98, bottom=.001, top=.96, wspace=.05,
71 | hspace=.01)
72 |
73 | plot_num = 1
74 |
75 | datasets = [noisy_circles, noisy_moons, noisy_blobs, varying_blobs, no_structure]
76 | for i_dataset, dataset in enumerate(datasets):
77 | X, y = dataset
78 | # normalize dataset for easier parameter selection
79 | X = StandardScaler().fit_transform(X)
80 |
81 | # estimate bandwidth for mean shift
82 | bandwidth = cluster.estimate_bandwidth(X, quantile=0.3)
83 |
84 | # connectivity matrix for structured Ward
85 | connectivity = kneighbors_graph(X, n_neighbors=10, include_self=False)
86 | # make connectivity symmetric
87 | connectivity = 0.5 * (connectivity + connectivity.T)
88 |
89 | # create clustering estimators
90 | two_means = cluster.MiniBatchKMeans(n_clusters=2)
91 | spectral = cluster.SpectralClustering(n_clusters=2,
92 | eigen_solver='arpack',
93 | affinity="nearest_neighbors")
94 | dbscan = cluster.DBSCAN(eps=.2)
95 | affinity_propagation = cluster.AffinityPropagation(damping=.9,
96 | preference=-200)
97 |
98 | average_linkage = cluster.AgglomerativeClustering(
99 | linkage="average", affinity="cityblock", n_clusters=2,
100 | connectivity=connectivity)
101 |
102 | hdbscanner = hdbscan.HDBSCAN()
103 | clustering_algorithms = [
104 | two_means, affinity_propagation, spectral, average_linkage,
105 | dbscan, hdbscanner]
106 |
107 | for name, algorithm in zip(clustering_names, clustering_algorithms):
108 | # predict cluster memberships
109 | t0 = time.time()
110 | algorithm.fit(X)
111 | t1 = time.time()
112 | if hasattr(algorithm, 'labels_'):
113 |         y_pred = algorithm.labels_.astype(int)
114 | else:
115 | y_pred = algorithm.predict(X)
116 |
117 | # plot
118 | plt.subplot(5, len(clustering_algorithms), plot_num)
119 | if i_dataset == 0:
120 | plt.title(name, size=18)
121 | plt.scatter(X[:, 0], X[:, 1], color=colors[y_pred].tolist(), s=10)
122 |
123 | if hasattr(algorithm, 'cluster_centers_'):
124 | centers = algorithm.cluster_centers_
125 | center_colors = colors[:len(centers)]
126 | plt.scatter(centers[:, 0], centers[:, 1], s=100, c=center_colors)
127 | plt.xlim(-2, 2)
128 | plt.ylim(-2, 2)
129 | plt.xticks(())
130 | plt.yticks(())
131 | plt.text(.99, .01, ('%.2fs' % (t1 - t0)).lstrip('0'),
132 | transform=plt.gca().transAxes, size=15,
133 | horizontalalignment='right')
134 | plot_num += 1
135 |
136 | plt.show()
137 |
--------------------------------------------------------------------------------
/examples/plot_hdbscan.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | ===================================
4 | Demo of HDBSCAN clustering algorithm
5 | ===================================
6 |
7 | Finds a clustering that has the greatest stability over a range
8 | of epsilon values for standard DBSCAN. This allows clusterings
9 | of different densities unlike DBSCAN.
10 |
11 | """
12 | print(__doc__)
13 |
14 | import numpy as np
15 |
16 | from hdbscan import HDBSCAN
17 | from sklearn.cluster import DBSCAN
18 | from sklearn import metrics
19 | from sklearn.datasets import make_blobs
20 | from sklearn.preprocessing import StandardScaler
21 |
22 | import time
23 |
24 | def make_var_density_blobs(n_samples=750, centers=[[0,0]], cluster_std=[0.5], random_state=0):
25 | samples_per_blob = n_samples // len(centers)
26 | blobs = [make_blobs(n_samples=samples_per_blob, centers=[c], cluster_std=cluster_std[i])[0]
27 | for i, c in enumerate(centers)]
28 | labels = [i * np.ones(samples_per_blob) for i in range(len(centers))]
29 | return np.vstack(blobs), np.hstack(labels)
30 |
31 |
32 | ##############################################################################
33 | # Generate sample data
34 | centers = [[1, 1], [-1, -1], [1, -1]]
35 | densities = [0.2, 0.35, 0.5]
36 | X, labels_true = make_var_density_blobs(n_samples=750, centers=centers, cluster_std=densities,
37 | random_state=0)
38 |
39 | X = StandardScaler().fit_transform(X)
40 |
41 | ##############################################################################
42 | # Compute HDBSCAN and DBSCAN
43 | hdb_t1 = time.time()
44 | hdb = HDBSCAN(min_cluster_size=10).fit(X)
45 | hdb_labels = hdb.labels_
46 | hdb_elapsed_time = time.time() - hdb_t1
47 |
48 | db_t1 = time.time()
49 | db = DBSCAN(eps=0.1).fit(X)
50 | db_labels = db.labels_
51 | db_elapsed_time = time.time() - db_t1
52 |
53 | # Number of clusters in labels, ignoring noise if present.
54 | n_clusters_hdb_ = len(set(hdb_labels)) - (1 if -1 in hdb_labels else 0)
55 |
56 | print('\n\n++ HDBSCAN Results')
57 | print('Estimated number of clusters: %d' % n_clusters_hdb_)
58 | print('Elapsed time to cluster: %.4f s' % hdb_elapsed_time)
59 | print('Homogeneity: %0.3f' % metrics.homogeneity_score(labels_true, hdb_labels))
60 | print('Completeness: %0.3f' % metrics.completeness_score(labels_true, hdb_labels))
61 | print('V-measure: %0.3f' % metrics.v_measure_score(labels_true, hdb_labels))
62 | print('Adjusted Rand Index: %0.3f'
63 | % metrics.adjusted_rand_score(labels_true, hdb_labels))
64 | print('Adjusted Mutual Information: %0.3f'
65 | % metrics.adjusted_mutual_info_score(labels_true, hdb_labels))
66 | print('Silhouette Coefficient: %0.3f'
67 | % metrics.silhouette_score(X, hdb_labels))
68 |
69 | n_clusters_db_ = len(set(db_labels)) - (1 if -1 in db_labels else 0)
70 |
71 | print('\n\n++ DBSCAN Results')
72 | print('Estimated number of clusters: %d' % n_clusters_db_)
73 | print('Elapsed time to cluster: %.4f s' % db_elapsed_time)
74 | print('Homogeneity: %0.3f' % metrics.homogeneity_score(labels_true, db_labels))
75 | print('Completeness: %0.3f' % metrics.completeness_score(labels_true, db_labels))
76 | print('V-measure: %0.3f' % metrics.v_measure_score(labels_true, db_labels))
77 | print('Adjusted Rand Index: %0.3f'
78 | % metrics.adjusted_rand_score(labels_true, db_labels))
79 | print('Adjusted Mutual Information: %0.3f'
80 | % metrics.adjusted_mutual_info_score(labels_true, db_labels))
81 | if n_clusters_db_ > 1:
82 | print('Silhouette Coefficient: %0.3f'
83 | % metrics.silhouette_score(X, db_labels))
84 | else:
85 | print('Silhouette Coefficient: NaN (too few clusters)')
86 |
87 | ##############################################################################
88 | # Plot result
89 | import matplotlib.pyplot as plt
90 |
91 | # Black removed and is used for noise instead.
92 | hdb_unique_labels = set(hdb_labels)
93 | db_unique_labels = set(db_labels)
94 | hdb_colors = plt.cm.Spectral(np.linspace(0, 1, len(hdb_unique_labels)))
95 | db_colors = plt.cm.Spectral(np.linspace(0, 1, len(db_unique_labels)))
96 | fig = plt.figure(figsize=plt.figaspect(0.5))
97 | hdb_axis = fig.add_subplot(121)
98 | db_axis = fig.add_subplot(122)
99 | for k, col in zip(hdb_unique_labels, hdb_colors):
100 | if k == -1:
101 | # Black used for noise.
102 | col = 'k'
103 |
104 | hdb_axis.plot(X[hdb_labels == k, 0], X[hdb_labels == k, 1], 'o', markerfacecolor=col,
105 | markeredgecolor='k', markersize=6)
106 | for k, col in zip(db_unique_labels, db_colors):
107 | if k == -1:
108 | # Black used for noise.
109 | col = 'k'
110 |
111 | db_axis.plot(X[db_labels == k, 0], X[db_labels == k, 1], 'o', markerfacecolor=col,
112 | markeredgecolor='k', markersize=6)
113 |
114 | hdb_axis.set_title('HDBSCAN\nEstimated number of clusters: %d' % n_clusters_hdb_)
115 | db_axis.set_title('DBSCAN\nEstimated number of clusters: %d' % n_clusters_db_)
116 | plt.show()
117 |
--------------------------------------------------------------------------------
/hdbscan/__init__.py:
--------------------------------------------------------------------------------
1 | from .hdbscan_ import HDBSCAN, hdbscan
2 | from .robust_single_linkage_ import RobustSingleLinkage, robust_single_linkage
3 | from .validity import validity_index
4 | from .prediction import (approximate_predict,
5 | membership_vector,
6 | all_points_membership_vectors,
7 | approximate_predict_scores)
8 | from .branches import (BranchDetector,
9 | detect_branches_in_clusters,
10 | approximate_predict_branch)
11 |
12 |
13 |
--------------------------------------------------------------------------------
/hdbscan/_hdbscan_linkage.pyx:
--------------------------------------------------------------------------------
1 | # cython: boundscheck=False
2 | # cython: nonecheck=False
3 | # Minimum spanning tree single linkage implementation for hdbscan
4 | # Authors: Leland McInnes, Steve Astels
5 | # License: 3-clause BSD
6 |
7 | import numpy as np
8 | cimport numpy as np
9 |
10 | from libc.float cimport DBL_MAX
11 |
12 | from hdbscan.dist_metrics cimport DistanceMetric
13 |
14 |
15 | cpdef np.ndarray[np.double_t, ndim=2] mst_linkage_core(
16 | np.ndarray[np.double_t,
17 | ndim=2] distance_matrix):
18 |
19 | cdef np.ndarray[np.intp_t, ndim=1] node_labels
20 | cdef np.ndarray[np.intp_t, ndim=1] current_labels
21 | cdef np.ndarray[np.double_t, ndim=1] current_distances
22 | cdef np.ndarray[np.double_t, ndim=1] left
23 | cdef np.ndarray[np.double_t, ndim=1] right
24 | cdef np.ndarray[np.double_t, ndim=2] result
25 |
26 | cdef np.ndarray label_filter
27 |
28 | cdef np.intp_t current_node
29 | cdef np.intp_t new_node_index
30 | cdef np.intp_t new_node
31 | cdef np.intp_t i
32 |
33 | result = np.zeros((distance_matrix.shape[0] - 1, 3))
34 | node_labels = np.arange(distance_matrix.shape[0], dtype=np.intp)
35 | current_node = 0
36 | current_distances = np.inf * np.ones(distance_matrix.shape[0])
37 | current_labels = node_labels
38 | for i in range(1, node_labels.shape[0]):
39 | label_filter = current_labels != current_node
40 | current_labels = current_labels[label_filter]
41 | left = current_distances[label_filter]
42 | right = distance_matrix[current_node][current_labels]
43 | current_distances = np.where(left < right, left, right)
44 |
45 | new_node_index = np.argmin(current_distances)
46 | new_node = current_labels[new_node_index]
47 | result[i - 1, 0] = current_node
48 | result[i - 1, 1] = new_node
49 | result[i - 1, 2] = current_distances[new_node_index]
50 | current_node = new_node
51 |
52 | return result
53 |
54 |
55 | cpdef np.ndarray[np.double_t, ndim=2] mst_linkage_core_vector(
56 | np.ndarray[np.double_t, ndim=2, mode='c'] raw_data,
57 | np.ndarray[np.double_t, ndim=1, mode='c'] core_distances,
58 | DistanceMetric dist_metric,
59 | np.double_t alpha=1.0):
60 |
61 |     # Prim's algorithm: grow a minimum spanning tree over the mutual reachability distances
62 | cdef np.ndarray[np.double_t, ndim=1] current_distances_arr
63 | cdef np.ndarray[np.double_t, ndim=1] current_sources_arr
64 | cdef np.ndarray[np.int8_t, ndim=1] in_tree_arr
65 | cdef np.ndarray[np.double_t, ndim=2] result_arr
66 |
67 | cdef np.double_t * current_distances
68 | cdef np.double_t * current_sources
69 | cdef np.double_t * current_core_distances
70 | cdef np.double_t * raw_data_ptr
71 | cdef np.int8_t * in_tree
72 | cdef np.double_t[:, ::1] raw_data_view
73 | cdef np.double_t[:, ::1] result
74 |
75 | cdef np.ndarray label_filter
76 |
77 | cdef np.intp_t current_node
78 | cdef np.intp_t source_node
79 | cdef np.intp_t right_node, right_source
80 | cdef np.intp_t left_node, left_source
81 | cdef np.intp_t new_node
82 | cdef np.intp_t i
83 | cdef np.intp_t j
84 | cdef np.intp_t dim
85 | cdef np.intp_t num_features
86 |
87 | cdef double current_node_core_distance
88 | cdef double right_value
89 | cdef double left_value
90 | cdef double core_value
91 | cdef double new_distance
92 |
93 | dim = raw_data.shape[0]
94 | num_features = raw_data.shape[1]
95 |
96 |     raw_data_view = (<np.double_t[:raw_data.shape[0], :raw_data.shape[1]:1]> (
97 |         <np.double_t *> raw_data.data))
98 |     raw_data_ptr = (<np.double_t *> &raw_data_view[0, 0])
99 |
100 | result_arr = np.zeros((dim - 1, 3))
101 | in_tree_arr = np.zeros(dim, dtype=np.int8)
102 | current_node = 0
103 | current_distances_arr = np.inf * np.ones(dim)
104 | current_sources_arr = np.ones(dim)
105 |
106 |     result = (<np.double_t[:dim - 1, :3:1]> (<np.double_t *> result_arr.data))
107 |     in_tree = (<np.int8_t *> in_tree_arr.data)
108 |     current_distances = (<np.double_t *> current_distances_arr.data)
109 |     current_sources = (<np.double_t *> current_sources_arr.data)
110 |     current_core_distances = (<np.double_t *> core_distances.data)
111 |
112 | for i in range(1, dim):
113 |
114 | in_tree[current_node] = 1
115 |
116 | current_node_core_distance = current_core_distances[current_node]
117 |
118 | new_distance = DBL_MAX
119 | source_node = 0
120 | new_node = 0
121 |
122 | for j in range(dim):
123 | if in_tree[j]:
124 | continue
125 |
126 | right_value = current_distances[j]
127 | right_source = current_sources[j]
128 |
129 | left_value = dist_metric.dist(&raw_data_ptr[num_features *
130 | current_node],
131 | &raw_data_ptr[num_features * j],
132 | num_features)
133 | left_source = current_node
134 |
135 | if alpha != 1.0:
136 | left_value /= alpha
137 |
138 | core_value = core_distances[j]
139 | if (current_node_core_distance > right_value or
140 | core_value > right_value or
141 | left_value > right_value):
142 | if right_value < new_distance:
143 | new_distance = right_value
144 | source_node = right_source
145 | new_node = j
146 | continue
147 |
148 | if core_value > current_node_core_distance:
149 | if core_value > left_value:
150 | left_value = core_value
151 | else:
152 | if current_node_core_distance > left_value:
153 | left_value = current_node_core_distance
154 |
155 | if left_value < right_value:
156 | current_distances[j] = left_value
157 | current_sources[j] = left_source
158 | if left_value < new_distance:
159 | new_distance = left_value
160 | source_node = left_source
161 | new_node = j
162 | else:
163 | if right_value < new_distance:
164 | new_distance = right_value
165 | source_node = right_source
166 | new_node = j
167 |
168 | result[i - 1, 0] = source_node
169 | result[i - 1, 1] = new_node
170 | result[i - 1, 2] = new_distance
171 | current_node = new_node
172 |
173 | return result_arr
174 |
175 |
176 | cdef class UnionFind (object):
177 |
178 | cdef np.ndarray parent_arr
179 | cdef np.ndarray size_arr
180 | cdef np.intp_t next_label
181 | cdef np.intp_t *parent
182 | cdef np.intp_t *size
183 |
184 | def __init__(self, N):
185 | self.parent_arr = -1 * np.ones(2 * N - 1, dtype=np.intp, order='C')
186 | self.next_label = N
187 | self.size_arr = np.hstack((np.ones(N, dtype=np.intp),
188 | np.zeros(N-1, dtype=np.intp)))
189 |         self.parent = (<np.intp_t *> self.parent_arr.data)
190 |         self.size = (<np.intp_t *> self.size_arr.data)
191 |
192 | cdef void union(self, np.intp_t m, np.intp_t n):
193 | self.size[self.next_label] = self.size[m] + self.size[n]
194 | self.parent[m] = self.next_label
195 | self.parent[n] = self.next_label
196 | self.size[self.next_label] = self.size[m] + self.size[n]
197 | self.next_label += 1
198 |
199 | return
200 |
201 | cdef np.intp_t fast_find(self, np.intp_t n):
202 | cdef np.intp_t p
203 | p = n
204 | while self.parent_arr[n] != -1:
205 | n = self.parent_arr[n]
206 | # label up to the root
207 | while self.parent_arr[p] != n:
208 | p, self.parent_arr[p] = self.parent_arr[p], n
209 | return n
210 |
211 |
212 | cpdef np.ndarray[np.double_t, ndim=2] label(np.ndarray[np.double_t, ndim=2] L):
213 |
214 | cdef np.ndarray[np.double_t, ndim=2] result_arr
215 | cdef np.double_t[:, ::1] result
216 |
217 | cdef np.intp_t N, a, aa, b, bb, index
218 | cdef np.double_t delta
219 |
220 | result_arr = np.zeros((L.shape[0], L.shape[1] + 1))
221 |     result = (<np.double_t[:L.shape[0], :4:1]> (
222 |         <np.double_t *> result_arr.data))
223 | N = L.shape[0] + 1
224 | U = UnionFind(N)
225 |
226 | for index in range(L.shape[0]):
227 |
228 | a = L[index, 0]
229 | b = L[index, 1]
230 | delta = L[index, 2]
231 |
232 | aa, bb = U.fast_find(a), U.fast_find(b)
233 |
234 | result[index][0] = aa
235 | result[index][1] = bb
236 | result[index][2] = delta
237 | result[index][3] = U.size[aa] + U.size[bb]
238 |
239 | U.union(aa, bb)
240 |
241 | return result_arr
242 |
243 |
244 | cpdef np.ndarray[np.double_t, ndim=2] single_linkage(distance_matrix):
245 |
246 | cdef np.ndarray[np.double_t, ndim=2] hierarchy
247 | cdef np.ndarray[np.double_t, ndim=2] for_labelling
248 |
249 | hierarchy = mst_linkage_core(distance_matrix)
250 | for_labelling = hierarchy[np.argsort(hierarchy.T[2]), :]
251 |
252 | return label(for_labelling)
253 |
--------------------------------------------------------------------------------
/hdbscan/_hdbscan_reachability.pyx:
--------------------------------------------------------------------------------
1 | # cython: boundscheck=False
2 | # cython: nonecheck=False
3 | # cython: initializedcheck=False
4 | # mutual reachability distance computations
5 | # Authors: Leland McInnes
6 | # License: 3-clause BSD
7 |
8 | import numpy as np
9 | cimport numpy as np
10 |
11 | from scipy.spatial.distance import pdist, squareform
12 | from scipy.sparse import lil_matrix as sparse_matrix
13 | from sklearn.neighbors import KDTree, BallTree
14 | import gc
15 |
16 |
17 | def mutual_reachability(distance_matrix, min_points=5, alpha=1.0):
18 | """Compute the weighted adjacency matrix of the mutual reachability
19 | graph of a distance matrix.
20 |
21 | Parameters
22 | ----------
23 | distance_matrix : ndarray, shape (n_samples, n_samples)
24 | Array of distances between samples.
25 |
26 | min_points : int, optional (default=5)
27 | The number of points in a neighbourhood for a point to be considered
28 | a core point.
29 |
30 | Returns
31 | -------
32 |     mutual_reachability: ndarray, shape (n_samples, n_samples)
33 | Weighted adjacency matrix of the mutual reachability graph.
34 |
35 | References
36 | ----------
37 | .. [1] Campello, R. J., Moulavi, D., & Sander, J. (2013, April).
38 | Density-based clustering based on hierarchical density estimates.
39 | In Pacific-Asia Conference on Knowledge Discovery and Data Mining
40 | (pp. 160-172). Springer Berlin Heidelberg.
41 | """
42 | size = distance_matrix.shape[0]
43 | min_points = min(size - 1, min_points)
44 | try:
45 | core_distances = np.partition(distance_matrix,
46 | min_points,
47 | axis=0)[min_points]
48 | except AttributeError:
49 | core_distances = np.sort(distance_matrix,
50 | axis=0)[min_points]
51 |
52 | if alpha != 1.0:
53 | distance_matrix = distance_matrix / alpha
54 |
55 | stage1 = np.where(core_distances > distance_matrix,
56 | core_distances, distance_matrix)
57 | result = np.where(core_distances > stage1.T,
58 | core_distances.T, stage1.T).T
59 | return result
60 |
61 |
62 | cpdef sparse_mutual_reachability(object lil_matrix, np.intp_t min_points=5,
63 | float alpha=1.0, float max_dist=0.):
64 |
65 | cdef np.intp_t i
66 | cdef np.intp_t j
67 | cdef np.intp_t n
68 | cdef np.double_t mr_dist
69 | cdef list sorted_row_data
70 | cdef np.ndarray[dtype=np.double_t, ndim=1] core_distance
71 | cdef np.ndarray[dtype=np.int32_t, ndim=1] nz_row_data
72 | cdef np.ndarray[dtype=np.int32_t, ndim=1] nz_col_data
73 |
74 | result = sparse_matrix(lil_matrix.shape)
75 | core_distance = np.empty(lil_matrix.shape[0], dtype=np.double)
76 |
77 | for i in range(lil_matrix.shape[0]):
78 | sorted_row_data = sorted(lil_matrix.data[i])
79 | if min_points - 1 < len(sorted_row_data):
80 | core_distance[i] = sorted_row_data[min_points - 1]
81 | else:
82 | core_distance[i] = np.inf
83 |
84 | if alpha != 1.0:
85 | lil_matrix = lil_matrix / alpha
86 |
87 | nz_row_data, nz_col_data = lil_matrix.nonzero()
88 |
89 | for n in range(nz_row_data.shape[0]):
90 | i = nz_row_data[n]
91 | j = nz_col_data[n]
92 |
93 | mr_dist = max(core_distance[i], core_distance[j], lil_matrix[i, j])
94 | if np.isfinite(mr_dist):
95 | result[i, j] = mr_dist
96 | elif max_dist > 0:
97 | result[i, j] = max_dist
98 |
99 | return result.tocsr()
100 |
101 |
102 | def kdtree_mutual_reachability(X, distance_matrix, metric, p=2, min_points=5,
103 | alpha=1.0, **kwargs):
104 | dim = distance_matrix.shape[0]
105 | min_points = min(dim - 1, min_points)
106 |
107 | if metric == 'minkowski':
108 | tree = KDTree(X, metric=metric, p=p)
109 | else:
110 | tree = KDTree(X, metric=metric, **kwargs)
111 |
112 | core_distances = tree.query(X, k=min_points)[0][:, -1]
113 |
114 | if alpha != 1.0:
115 | distance_matrix = distance_matrix / alpha
116 |
117 | stage1 = np.where(core_distances > distance_matrix,
118 | core_distances, distance_matrix)
119 | result = np.where(core_distances > stage1.T,
120 | core_distances.T, stage1.T).T
121 | return result
122 |
123 |
124 | def balltree_mutual_reachability(X, distance_matrix, metric, p=2, min_points=5,
125 | alpha=1.0, **kwargs):
126 | dim = distance_matrix.shape[0]
127 | min_points = min(dim - 1, min_points)
128 |
129 | tree = BallTree(X, metric=metric, **kwargs)
130 |
131 | core_distances = tree.query(X, k=min_points)[0][:, -1]
132 |
133 | if alpha != 1.0:
134 | distance_matrix = distance_matrix / alpha
135 |
136 | stage1 = np.where(core_distances > distance_matrix,
137 | core_distances, distance_matrix)
138 | result = np.where(core_distances > stage1.T,
139 | core_distances.T, stage1.T).T
140 | return result
141 |
142 |
143 | cdef np.ndarray[np.double_t, ndim=1] mutual_reachability_from_pdist(
144 | np.ndarray[np.double_t, ndim=1] core_distances,
145 | np.ndarray[np.double_t, ndim=1] dists, np.intp_t dim):
146 |
147 | cdef np.intp_t i
148 | cdef np.intp_t j
149 | cdef np.intp_t result_pos
150 |
151 | result_pos = 0
152 | for i in range(dim):
153 | for j in range(i + 1, dim):
154 | if core_distances[i] > core_distances[j]:
155 | if core_distances[i] > dists[result_pos]:
156 | dists[result_pos] = core_distances[i]
157 |
158 | else:
159 | if core_distances[j] > dists[result_pos]:
160 | dists[result_pos] = core_distances[j]
161 |
162 | result_pos += 1
163 |
164 | return dists
165 |
166 |
167 | def kdtree_pdist_mutual_reachability(X, metric, p=2, min_points=5, alpha=1.0,
168 | **kwargs):
169 |
170 | dim = X.shape[0]
171 | min_points = min(dim - 1, min_points)
172 |
173 | if metric == 'minkowski':
174 | tree = KDTree(X, metric=metric, p=p)
175 | else:
176 | tree = KDTree(X, metric=metric, **kwargs)
177 |
178 | core_distances = tree.query(X, k=min_points)[0][:, -1]
179 |
180 | del tree
181 | gc.collect()
182 |
183 | dists = pdist(X, metric=metric, p=p, **kwargs)
184 |
185 | if alpha != 1.0:
186 | dists /= alpha
187 |
188 | dists = mutual_reachability_from_pdist(core_distances, dists, dim)
189 |
190 | return dists
191 |
192 |
193 | def balltree_pdist_mutual_reachability(X, metric, p=2, min_points=5, alpha=1.0,
194 | **kwargs):
195 |
196 | dim = X.shape[0]
197 | min_points = min(dim - 1, min_points)
198 |
199 | tree = BallTree(X, metric=metric, **kwargs)
200 |
201 | core_distances = tree.query(X, k=min_points)[0][:, -1]
202 |
203 | del tree
204 | gc.collect()
205 |
206 | dists = pdist(X, metric=metric, p=p, **kwargs)
207 |
208 | if alpha != 1.0:
209 | dists /= alpha
210 |
211 | dists = mutual_reachability_from_pdist(core_distances, dists, dim)
212 |
213 | return dists
214 |
--------------------------------------------------------------------------------
/hdbscan/branch_data.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from sklearn.neighbors import KDTree, BallTree
3 | from .dist_metrics import DistanceMetric
4 |
5 |
6 | class BranchDetectionData(object):
7 | """Input data for branch detection functionality.
8 |
9 | Recreates and caches internal data structures from the clustering stage.
10 |
11 | Parameters
12 | ----------
13 |
14 | data : array (n_samples, n_features)
15 | The original data set that was clustered.
16 |
17 | labels : array (n_samples)
18 | The cluster labels for every point in the data set.
19 |
20 | condensed_tree : array (n_points + n_merges, 4)
21 | The condensed tree produced during clustering, used to extract outliers.
22 |
23 | min_samples : int
24 | The min_samples value used in clustering.
25 |
26 | tree_type : string, optional
27 | Which type of space tree to use for core distance computation.
28 | One of:
29 | * ``kdtree``
30 | * ``balltree``
31 |
32 | metric : string, optional
33 | The metric used to determine distance for the clustering.
34 | This is the metric that will be used for the space tree to determine
35 | core distances etc.
36 |
37 | **kwargs :
38 | Any further arguments to the metric.
39 |
40 | Attributes
41 | ----------
42 |
43 | all_finite : bool
44 | Whether the data set contains any infinite or NaN values.
45 |
46 | finite_index : array (n_samples)
47 | The indices of the finite data points in the original data set.
48 |
49 | internal_to_raw : dict
50 | A mapping from the finite data set indices to the original data set.
51 |
52 | tree : KDTree or BallTree
53 | A space partitioning tree that can be queried for nearest neighbors if
54 | the metric is supported by a KDTree or BallTree.
55 |
56 | neighbors : array (n_samples, min_samples)
57 | The nearest neighbors for every non-noise point in the original data set.
58 |
59 | core_distances : array (n_samples)
60 | The core distance for every non-noise point in the original data set.
61 |
62 | dist_metric : callable
63 | Accelerated distance metric function.
64 | """
65 |
66 | _tree_type_map = {"kdtree": KDTree, "balltree": BallTree}
67 |
68 | def __init__(
69 | self,
70 | data,
71 | labels,
72 | condensed_tree,
73 | min_samples,
74 | tree_type="kdtree",
75 | metric="euclidean",
76 | **kwargs,
77 | ):
78 | clean_data = data.astype(np.float64)
79 | last_outlier = np.searchsorted(condensed_tree["lambda_val"], 0.0, side="right")
80 | if last_outlier == 0:
81 | self.all_finite = True
82 | self.internal_to_raw = None
83 | self.finite_index = None
84 | else:
85 | self.all_finite = False
86 | self.finite_index = np.setdiff1d(
87 | np.arange(data.shape[0]),
88 | condensed_tree["child"][:last_outlier]
89 | )
90 | labels = labels[self.finite_index]
91 | clean_data = clean_data[self.finite_index]
92 | self.internal_to_raw = {
93 | x: y for x, y in enumerate(self.finite_index)
94 | }
95 |
96 | # Construct tree
97 | self.tree = self._tree_type_map[tree_type](clean_data, metric=metric, **kwargs)
98 | self.dist_metric = DistanceMetric.get_metric(metric, **kwargs)
99 |
100 | # Allocate to maintain data point indices
101 | self.core_distances = np.full(clean_data.shape[0], np.nan)
102 | self.neighbors = np.full((clean_data.shape[0], min_samples), -1, dtype=np.int64)
103 |
104 | # Find neighbors for non-noise points
105 | noise_mask = labels != -1
106 | if noise_mask.any():
107 | distances, self.neighbors[noise_mask, :] = self.tree.query(
108 | clean_data[noise_mask], k=min_samples
109 | )
110 | self.core_distances[noise_mask] = distances[:, -1]
111 |
112 |
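A short usage sketch of BranchDetectionData, assuming a fitted HDBSCAN clusterer whose condensed_tree_.to_numpy() provides the raw condensed tree array; the data set and parameter values here are arbitrary and chosen only for illustration:

import hdbscan
from hdbscan.branch_data import BranchDetectionData
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=200, centers=3, random_state=0)
clusterer = hdbscan.HDBSCAN(min_cluster_size=10, min_samples=5).fit(X)

branch_data = BranchDetectionData(
    X,
    clusterer.labels_,
    clusterer.condensed_tree_.to_numpy(),
    min_samples=5,
    tree_type="kdtree",
    metric="euclidean",
)
# Noise points keep NaN core distances and -1 neighbour indices.
print(branch_data.core_distances.shape, branch_data.neighbors.shape)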
--------------------------------------------------------------------------------
/hdbscan/dist_metrics.pxd:
--------------------------------------------------------------------------------
1 | #!python
2 | #cython: boundscheck=False
3 | #cython: wraparound=False
4 | #cython: cdivision=True
5 |
6 | import cython
7 | cimport cython
8 |
9 | import numpy as np
10 | cimport numpy as np
11 |
12 | from libc.math cimport fabs, sqrt, exp, cos, pow
13 |
14 | ctypedef np.double_t DTYPE_t
15 | ctypedef np.intp_t ITYPE_t
16 |
17 | cdef enum:
18 | DTYPECODE = np.NPY_FLOAT64
19 | ITYPECODE = np.NPY_INTP
20 |
21 | # Fused type for certain operations
22 | ctypedef fused DITYPE_t:
23 | ITYPE_t
24 | DTYPE_t
25 |
26 | ITYPE = np.intp
27 |
28 | DTYPE = np.double
29 |
30 | ######################################################################
31 | # Inline distance functions
32 | #
33 | # We use these for the default (euclidean) case so that they can be
34 | # inlined. This leads to faster computation for the most common case
35 | cdef inline DTYPE_t euclidean_dist(DTYPE_t* x1, DTYPE_t* x2,
36 | ITYPE_t size) nogil except -1:
37 | cdef DTYPE_t tmp, d=0
38 | cdef np.intp_t j
39 | for j in range(size):
40 | tmp = x1[j] - x2[j]
41 | d += tmp * tmp
42 | return sqrt(d)
43 |
44 |
45 | cdef inline DTYPE_t euclidean_rdist(DTYPE_t* x1, DTYPE_t* x2,
46 | ITYPE_t size) nogil except -1:
47 | cdef DTYPE_t tmp, d=0
48 | cdef np.intp_t j
49 | for j in range(size):
50 | tmp = x1[j] - x2[j]
51 | d += tmp * tmp
52 | return d
53 |
54 |
55 | cdef inline DTYPE_t euclidean_dist_to_rdist(DTYPE_t dist) nogil except -1:
56 | return dist * dist
57 |
58 |
59 | cdef inline DTYPE_t euclidean_rdist_to_dist(DTYPE_t dist) except -1:
60 | return sqrt(dist)
61 |
62 |
63 | ######################################################################
64 | # DistanceMetric base class
65 | cdef class DistanceMetric:
66 | # The following attributes are required for a few of the subclasses.
67 | # We must define them here so that Cython's limited polymorphism will work.
68 | # Because we don't expect to instantiate a lot of these objects, the
69 | # extra memory overhead of this setup should not be an issue.
70 | cdef DTYPE_t p
71 | #cdef DTYPE_t[::1] vec
72 | #cdef DTYPE_t[:, ::1] mat
73 | cdef np.ndarray vec
74 | cdef np.ndarray mat
75 | cdef DTYPE_t* vec_ptr
76 | cdef DTYPE_t* mat_ptr
77 | cdef ITYPE_t size
78 | cdef object func
79 | cdef object kwargs
80 |
81 | cdef DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2,
82 | ITYPE_t size) nogil except -1
83 |
84 | cdef DTYPE_t rdist(self, DTYPE_t* x1, DTYPE_t* x2,
85 | ITYPE_t size) nogil except -1
86 |
87 | cdef int pdist(self, DTYPE_t[:, ::1] X, DTYPE_t[:, ::1] D) except -1
88 |
89 | cdef int cdist(self, DTYPE_t[:, ::1] X, DTYPE_t[:, ::1] Y,
90 | DTYPE_t[:, ::1] D) except -1
91 |
92 | cdef DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) except -1
93 |
94 | cdef DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1
95 |
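The dist/rdist pair declared above exists so that neighbour comparisons can be done on a cheaper "reduced" distance (for euclidean, the squared distance) and converted back only when a true distance must be reported. A tiny Python sketch of the idea, with illustrative names that are not part of the library:

import numpy as np

def euclidean_rdist(x1, x2):
    # Reduced distance: squared euclidean, no square root taken.
    diff = np.asarray(x1, dtype=float) - np.asarray(x2, dtype=float)
    return float(np.dot(diff, diff))

def rdist_to_dist(rdist):
    # Convert the reduced distance back to a true euclidean distance.
    return float(np.sqrt(rdist))

a, b, c = [0.0, 0.0], [3.0, 4.0], [1.0, 1.0]
# Ordering by rdist matches ordering by dist, so the sqrt can be deferred.
assert (euclidean_rdist(a, b) > euclidean_rdist(a, c)) == \
       (rdist_to_dist(euclidean_rdist(a, b)) > rdist_to_dist(euclidean_rdist(a, c)))
print(rdist_to_dist(euclidean_rdist(a, b)))  # 5.0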
--------------------------------------------------------------------------------
/hdbscan/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/hdbscan/tests/__init__.py
--------------------------------------------------------------------------------
/hdbscan/tests/test_prediction_utils.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from hdbscan._prediction_utils import safe_always_positive_division
4 |
5 |
6 | @pytest.mark.parametrize('denominator', [-1, 0, 1])
7 | def test_safe_always_positive_division(denominator):
8 | numerator = 1
9 | # Given negative, zero and positive denominator and positive numerator
10 | value = safe_always_positive_division(numerator, denominator)
11 | # Make sure safe division is always positive and doesn't raise ZeroDivisionError
12 | assert value >= 0
13 |
--------------------------------------------------------------------------------
/hdbscan/tests/test_rsl.py:
--------------------------------------------------------------------------------
1 | """
2 | Tests for Robust Single Linkage clustering algorithm
3 | """
4 | # import pickle
5 | import numpy as np
6 | from scipy.spatial import distance
7 | from scipy import sparse
8 | from sklearn.utils.estimator_checks import check_estimator
9 | from hdbscan import RobustSingleLinkage, robust_single_linkage
10 |
11 | # from sklearn.cluster.tests.common import generate_clustered_data
12 |
13 | from sklearn import datasets
14 | import warnings
15 |
16 | from sklearn.datasets import make_blobs
17 | from sklearn.utils import shuffle
18 | from sklearn.preprocessing import StandardScaler
19 |
20 | import pytest
21 |
22 | n_clusters = 3
23 | X, y = make_blobs(n_samples=50, random_state=1)
24 | X, y = shuffle(X, y, random_state=7)
25 | X = StandardScaler().fit_transform(X)
26 | # X = generate_clustered_data(n_clusters=n_clusters, n_samples_per_cluster=50)
27 |
28 | def test_rsl_distance_matrix():
29 | D = distance.squareform(distance.pdist(X))
30 | D /= np.max(D)
31 |
32 | labels, tree = robust_single_linkage(D, 0.4, metric='precomputed')
33 | # number of clusters, ignoring noise if present
34 | n_clusters_1 = len(set(labels)) - int(-1 in labels) # ignore noise
35 | assert(n_clusters_1 == 2)
36 |
37 | labels = RobustSingleLinkage(metric="precomputed").fit(D).labels_
38 | n_clusters_2 = len(set(labels)) - int(-1 in labels)
39 | assert(n_clusters_2 == 2)
40 |
41 |
42 | def test_rsl_feature_vector():
43 | labels, tree = robust_single_linkage(X, 0.4)
44 | n_clusters_1 = len(set(labels)) - int(-1 in labels)
45 | assert(n_clusters_1 == n_clusters)
46 |
47 | labels = RobustSingleLinkage().fit(X).labels_
48 | n_clusters_2 = len(set(labels)) - int(-1 in labels)
49 | assert(n_clusters_2 == n_clusters)
50 |
51 |
52 | def test_rsl_callable_metric():
53 | # metric is the function reference, not the string key.
54 | metric = distance.euclidean
55 |
56 | labels, tree = robust_single_linkage(X, 0.4, metric=metric)
57 | n_clusters_1 = len(set(labels)) - int(-1 in labels)
58 | assert(n_clusters_1 == n_clusters)
59 |
60 | labels = RobustSingleLinkage(metric=metric).fit(X).labels_
61 | n_clusters_2 = len(set(labels)) - int(-1 in labels)
62 | assert(n_clusters_2 == n_clusters)
63 |
64 |
65 | def test_rsl_input_lists():
66 | X = [[1., 2.], [3., 4.]]
67 | RobustSingleLinkage().fit(X) # must not raise exception
68 |
69 |
70 | def test_rsl_boruvka_balltree():
71 | labels, tree = robust_single_linkage(X, 0.45, algorithm='boruvka_balltree')
72 | n_clusters_1 = len(set(labels)) - int(-1 in labels)
73 | assert(n_clusters_1 == n_clusters)
74 |
75 | labels = RobustSingleLinkage(cut=0.45,
76 | algorithm='boruvka_balltree').fit(X).labels_
77 | n_clusters_2 = len(set(labels)) - int(-1 in labels)
78 | assert(n_clusters_2 == n_clusters)
79 |
80 |
81 | def test_rsl_prims_balltree():
82 | labels, tree = robust_single_linkage(X, 0.4, algorithm='prims_balltree')
83 | n_clusters_1 = len(set(labels)) - int(-1 in labels)
84 | assert(n_clusters_1 == n_clusters)
85 |
86 | labels = RobustSingleLinkage(algorithm='prims_balltree').fit(X).labels_
87 | n_clusters_2 = len(set(labels)) - int(-1 in labels)
88 | assert(n_clusters_2 == n_clusters)
89 |
90 |
91 | def test_rsl_prims_kdtree():
92 | labels, tree = robust_single_linkage(X, 0.4, algorithm='prims_kdtree')
93 | n_clusters_1 = len(set(labels)) - int(-1 in labels)
94 | assert(n_clusters_1 == n_clusters)
95 |
96 | labels = RobustSingleLinkage(algorithm='prims_kdtree').fit(X).labels_
97 | n_clusters_2 = len(set(labels)) - int(-1 in labels)
98 | assert(n_clusters_2 == n_clusters)
99 |
100 |
101 | # def test_rsl_unavailable_hierarchy():
102 | # clusterer = RobustSingleLinkage()
103 | # with warnings.catch_warnings(record=True) as w:
104 | # tree = clusterer.cluster_hierarchy_
105 | # assert len(w) > 0
106 | # assert tree is None
107 |
108 |
109 | def test_rsl_hierarchy():
110 | clusterer = RobustSingleLinkage().fit(X)
111 | assert clusterer.cluster_hierarchy_ is not None
112 |
113 |
114 | def test_rsl_high_dimensional():
115 | H, y = make_blobs(n_samples=50, random_state=0, n_features=64)
116 | # H, y = shuffle(X, y, random_state=7)
117 | H = StandardScaler().fit_transform(H)
118 | labels, tree = robust_single_linkage(H, 5.5)
119 | n_clusters_1 = len(set(labels)) - int(-1 in labels)
120 | assert(n_clusters_1 == n_clusters)
121 |
122 | labels = RobustSingleLinkage(cut=5.5, algorithm='best',
123 | metric='seuclidean',
124 | metric_params={'V': np.ones(H.shape[1])}).fit(H).labels_
125 | n_clusters_2 = len(set(labels)) - int(-1 in labels)
126 | assert(n_clusters_2 == n_clusters)
127 |
128 |
129 | def test_rsl_badargs():
130 | with pytest.raises(ValueError):
131 | robust_single_linkage('fail', 0.4)
132 | with pytest.raises(ValueError):
133 | robust_single_linkage(None, 0.4)
134 | with pytest.raises(ValueError):
135 | robust_single_linkage(X, 0.4, k='fail')
136 | with pytest.raises(ValueError):
137 | robust_single_linkage(X, 0.4, k=-1)
138 | with pytest.raises(ValueError):
139 | robust_single_linkage(X, 0.4, metric='imperial')
140 | with pytest.raises(ValueError):
141 | robust_single_linkage(X, 0.4, metric=None)
142 | with pytest.raises(ValueError):
143 | robust_single_linkage(X, 0.4, metric='minkowski', p=-1)
144 | with pytest.raises(ValueError):
145 | robust_single_linkage(X, 0.4, metric='minkowski', p=-1, algorithm='prims_kdtree')
146 | with pytest.raises(ValueError):
147 | robust_single_linkage(X, 0.4, metric='minkowski', p=-1, algorithm='prims_balltree')
148 | with pytest.raises(ValueError):
149 | robust_single_linkage(X, 0.4, metric='minkowski', p=-1, algorithm='boruvka_balltree')
150 | with pytest.raises(ValueError):
151 | robust_single_linkage(X, 0.4, metric='precomputed', algorithm='boruvka_kdtree')
152 | with pytest.raises(ValueError):
153 | robust_single_linkage(X, 0.4, metric='precomputed', algorithm='prims_kdtree')
154 | with pytest.raises(ValueError):
155 | robust_single_linkage(X, 0.4, metric='precomputed', algorithm='prims_balltree')
156 | with pytest.raises(ValueError):
157 | robust_single_linkage(X, 0.4, metric='precomputed', algorithm='boruvka_balltree')
158 | with pytest.raises(ValueError):
159 | robust_single_linkage(X, 0.4, alpha=-1)
160 | with pytest.raises(ValueError):
161 | robust_single_linkage(X, 0.4, alpha='fail')
162 | with pytest.raises(Exception):
163 | robust_single_linkage(X, 0.4, algorithm='something_else')
164 | with pytest.raises(TypeError):
165 | robust_single_linkage(X, 0.4, metric='minkowski', p=None)
166 | with pytest.raises(ValueError):
167 | robust_single_linkage(X, 0.4, leaf_size=0)
168 | with pytest.raises(ValueError):
169 | robust_single_linkage(X, 0.4, gamma=0)
170 |
171 |
172 | # Disable for now -- need to refactor to meet newer standards
173 | @pytest.mark.skip(reason="need to refactor to meet newer standards")
174 | def test_rsl_is_sklearn_estimator():
175 | check_estimator(RobustSingleLinkage)
176 |
--------------------------------------------------------------------------------
/notebooks/clusterable_data.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/notebooks/clusterable_data.npy
--------------------------------------------------------------------------------
/notebooks/hdbscan01_timings.csv:
--------------------------------------------------------------------------------
1 | 2,2000,0.203334093094
2 | 2,4000,0.259212017059
3 | 2,6000,0.530183076859
4 | 2,8000,0.928155183792
5 | 2,10000,1.33956003189
6 | 2,12000,2.02227687836
7 | 2,14000,2.74701404572
8 | 2,16000,3.63934803009
9 | 2,18000,4.60029006004
10 | 2,20000,6.09813690186
11 | 2,22000,10.7129349709
12 | 2,24000,9.08472108841
13 | 2,26000,15.8526310921
14 | 2,28000,19.4355289936
15 | 2,30000,24.5378270149
16 | 2,32000,30.3819289207
17 | 5,2000,0.21369099617
18 | 5,4000,0.255190849304
19 | 5,6000,0.527250051498
20 | 5,8000,0.93247294426
21 | 5,10000,1.47298002243
22 | 5,12000,2.07997608185
23 | 5,14000,2.84801912308
24 | 5,16000,3.78576898575
25 | 5,18000,4.60007095337
26 | 5,20000,5.82796311378
27 | 5,22000,7.35501813889
28 | 5,24000,8.69181203842
29 | 5,26000,10.3049359322
30 | 5,28000,12.5369310379
31 | 5,30000,28.7729370594
32 | 5,32000,29.6381349564
33 | 10,2000,0.174388170242
34 | 10,4000,0.296141147614
35 | 10,6000,0.662806987762
36 | 10,8000,1.17675209045
37 | 10,10000,1.79025316238
38 | 10,12000,2.48112487793
39 | 10,14000,3.44052696228
40 | 10,16000,4.44019889832
41 | 10,18000,5.61963176727
42 | 10,20000,7.39718699455
43 | 10,22000,8.64890098572
44 | 10,24000,10.4458150864
45 | 10,26000,12.8114190102
46 | 10,28000,20.3707690239
47 | 10,30000,29.7545838356
48 | 10,32000,34.2230820656
49 | 25,2000,0.198121070862
50 | 25,4000,0.452563047409
51 | 25,6000,0.94957280159
52 | 25,8000,1.62946105003
53 | 25,10000,2.49307203293
54 | 25,12000,3.63441205025
55 | 25,14000,4.78342199326
56 | 25,16000,6.30564498901
57 | 25,18000,8.03539299965
58 | 25,20000,10.3152740002
59 | 25,22000,12.7070331573
60 | 25,24000,15.693295002
61 | 25,26000,18.6774010658
62 | 25,28000,28.0319800377
63 | 25,30000,35.5377750397
64 | 25,32000,43.5508480072
65 | 50,2000,0.241183042526
66 | 50,4000,0.691927909851
67 | 50,6000,1.46878409386
68 | 50,8000,2.71946191788
69 | 50,10000,3.89164805412
70 | 50,12000,5.76127791405
71 | 50,14000,8.03004384041
72 | 50,16000,10.2894189358
73 | 50,18000,13.2365300655
74 | 50,20000,16.5973930359
75 | 50,22000,19.8884520531
76 | 50,24000,23.8139870167
77 | 50,26000,28.6661889553
78 | 50,28000,38.4153680801
79 | 50,30000,49.254393816
80 | 50,32000,58.0542850494
81 |
--------------------------------------------------------------------------------
/notebooks/hdbscan02_timings.csv:
--------------------------------------------------------------------------------
1 | 2,2000,0.190771818161
2 | 2,4000,0.33536696434
3 | 2,6000,0.475166797638
4 | 2,8000,0.830126047134
5 | 2,10000,1.21801495552
6 | 2,12000,1.66791892052
7 | 2,14000,2.25732898712
8 | 2,16000,2.97524309158
9 | 2,18000,3.75251483917
10 | 2,20000,4.78878498077
11 | 2,22000,5.71841812134
12 | 2,24000,6.86345005035
13 | 2,26000,8.4248509407
14 | 2,28000,10.5936911106
15 | 2,30000,12.250483036
16 | 2,32000,14.2500619888
17 | 5,2000,0.165930986404
18 | 5,4000,0.25049495697
19 | 5,6000,0.505705833435
20 | 5,8000,0.85303401947
21 | 5,10000,1.30479001999
22 | 5,12000,1.78360509872
23 | 5,14000,2.37719798088
24 | 5,16000,3.19220519066
25 | 5,18000,4.0063521862
26 | 5,20000,5.10847592354
27 | 5,22000,6.15350604057
28 | 5,24000,7.8016500473
29 | 5,26000,9.36254882812
30 | 5,28000,10.940628767
31 | 5,30000,13.0416350365
32 | 5,32000,15.0905759335
33 | 10,2000,0.171450138092
34 | 10,4000,0.306551933289
35 | 10,6000,0.609230041504
36 | 10,8000,1.01101207733
37 | 10,10000,1.56092309952
38 | 10,12000,2.25636100769
39 | 10,14000,3.02007102966
40 | 10,16000,3.85052204132
41 | 10,18000,4.90771794319
42 | 10,20000,6.28313612938
43 | 10,22000,7.84088993073
44 | 10,24000,9.35490894318
45 | 10,26000,11.2061488628
46 | 10,28000,13.258589983
47 | 10,30000,15.8290801048
48 | 10,32000,18.140255928
49 | 25,2000,0.187772035599
50 | 25,4000,0.422642946243
51 | 25,6000,0.917279958725
52 | 25,8000,1.49317598343
53 | 25,10000,2.3160700798
54 | 25,12000,3.33820199966
55 | 25,14000,4.4094080925
56 | 25,16000,5.88487386703
57 | 25,18000,7.52313017845
58 | 25,20000,9.37871217728
59 | 25,22000,11.7811200619
60 | 25,24000,14.447204113
61 | 25,26000,17.3661310673
62 | 25,28000,20.1399390697
63 | 25,30000,24.2563328743
64 | 25,32000,28.605463028
65 | 50,2000,0.230389118195
66 | 50,4000,0.681818008423
67 | 50,6000,1.39964485168
68 | 50,8000,2.48313784599
69 | 50,10000,3.77135896683
70 | 50,12000,5.48401618004
71 | 50,14000,7.19847917557
72 | 50,16000,9.64172506332
73 | 50,18000,12.4206252098
74 | 50,20000,15.4045789242
75 | 50,22000,18.8578879833
76 | 50,24000,22.6411821842
77 | 50,26000,26.6900000572
78 | 50,28000,31.2701971531
79 | 50,30000,36.5198609829
80 | 50,32000,41.7656099796
81 |
--------------------------------------------------------------------------------
/notebooks/hdbscan03_timings.csv:
--------------------------------------------------------------------------------
1 | 2,4000,0.254909992218
2 | 2,8000,0.781009912491
3 | 2,12000,1.65578794479
4 | 2,16000,2.86548995972
5 | 2,20000,4.5723490715
6 | 2,24000,7.35976219177
7 | 2,28000,10.392701149
8 | 2,32000,9.43943691254
9 | 2,36000,11.3052511215
10 | 2,40000,13.9955811501
11 | 2,44000,18.7241039276
12 | 2,48000,20.6580238342
13 | 2,52000,24.4679880142
14 | 2,56000,29.1394848824
15 | 2,60000,34.244658947
16 | 2,64000,39.4027280807
17 | 5,4000,0.25834608078
18 | 5,8000,0.854709863663
19 | 5,12000,1.76500201225
20 | 5,16000,3.11302685738
21 | 5,20000,5.05285406113
22 | 5,24000,7.59221887589
23 | 5,28000,11.0022101402
24 | 5,32000,11.0250749588
25 | 5,36000,14.1674640179
26 | 5,40000,17.6738820076
27 | 5,44000,22.3881859779
28 | 5,48000,26.0163779259
29 | 5,52000,30.8282210827
30 | 5,56000,35.8936729431
31 | 5,60000,41.7060689926
32 | 5,64000,48.1323189735
33 | 10,4000,0.300674915314
34 | 10,8000,1.02144503593
35 | 10,12000,2.25444197655
36 | 10,16000,3.87991809845
37 | 10,20000,6.13427686691
38 | 10,24000,9.54126405716
39 | 10,28000,13.4590039253
40 | 10,32000,17.133865118
41 | 10,36000,21.9930670261
42 | 10,40000,27.4153258801
43 | 10,44000,33.9543378353
44 | 10,48000,40.5958509445
45 | 10,52000,47.9032700062
46 | 10,56000,57.3020319939
47 | 10,60000,65.7409169674
48 | 10,64000,74.7461779118
49 | 25,4000,0.429993152618
50 | 25,8000,1.53049278259
51 | 25,12000,3.27671718597
52 | 25,16000,5.81940603256
53 | 25,20000,9.31306195259
54 | 25,24000,14.3008999825
55 | 25,28000,20.7219820023
56 | 25,32000,35.4473462105
57 | 25,36000,44.8741598129
58 | 25,40000,55.1005539894
59 | 25,44000,66.9944300652
60 | 25,48000,78.9403419495
61 | 25,52000,92.4163110256
62 | 25,56000,107.29060483
63 | 25,60000,124.042211056
64 | 25,64000,139.81782198
65 | 50,4000,0.689707040787
66 | 50,8000,2.43957304955
67 | 50,12000,5.3949701786
68 | 50,16000,9.77388811111
69 | 50,20000,15.3528060913
70 | 50,24000,22.688354969
71 | 50,28000,31.6130321026
72 | 50,32000,60.4746580124
73 | 50,36000,76.1894528866
74 | 50,40000,93.2929999828
75 | 50,44000,111.741698027
76 | 50,48000,132.439800024
77 | 50,52000,153.971266031
78 | 50,56000,177.992291927
79 | 50,60000,204.601658106
80 | 50,64000,231.908761978
81 |
--------------------------------------------------------------------------------
/notebooks/hdbscan04_timings.csv:
--------------------------------------------------------------------------------
1 | 2,8000,0.227055072784
2 | 2,16000,0.532173156738
3 | 2,24000,0.879513025284
4 | 2,32000,1.24024891853
5 | 2,40000,1.81793093681
6 | 2,48000,2.22707700729
7 | 2,56000,2.89961886406
8 | 2,64000,3.2689011097
9 | 2,72000,3.87070393562
10 | 2,80000,6.16474890709
11 | 2,88000,6.37934803963
12 | 2,96000,8.87552189827
13 | 2,104000,8.83126091957
14 | 2,112000,10.2158279419
15 | 2,120000,12.5876441002
16 | 2,128000,13.6096761227
17 | 5,8000,0.405529975891
18 | 5,16000,1.33872485161
19 | 5,24000,2.52023291588
20 | 5,32000,3.81210708618
21 | 5,40000,4.77973794937
22 | 5,48000,7.4870300293
23 | 5,56000,7.76650905609
24 | 5,64000,8.53143310547
25 | 5,72000,11.8250510693
26 | 5,80000,14.0402071476
27 | 5,88000,16.0629730225
28 | 5,96000,19.1256659031
29 | 5,104000,19.8361799717
30 | 5,112000,20.415594101
31 | 5,120000,21.5572421551
32 | 5,128000,24.9693388939
33 | 10,8000,0.523543119431
34 | 10,16000,1.62090706825
35 | 10,24000,3.66929006577
36 | 10,32000,5.36760091782
37 | 10,40000,7.74307012558
38 | 10,48000,13.7823400497
39 | 10,56000,15.9222350121
40 | 10,64000,19.0056459904
41 | 10,72000,22.3747861385
42 | 10,80000,31.0509710312
43 | 10,88000,49.9119548798
44 | 10,96000,47.1509799957
45 | 10,104000,58.6490371227
46 | 10,112000,72.9800539017
47 | 10,120000,68.7178759575
48 | 10,128000,60.2585930824
49 | 25,8000,0.886401891708
50 | 25,16000,2.55635499954
51 | 25,24000,10.2341220379
52 | 25,32000,10.0402569771
53 | 25,40000,16.4257571697
54 | 25,48000,23.4617791176
55 | 25,56000,32.1058709621
56 | 25,64000,35.5998060703
57 | 25,72000,51.0438849926
58 | 25,80000,53.5488469601
59 | 25,88000,74.6229739189
60 | 25,96000,87.4415640831
61 | 25,104000,103.67979002
62 | 25,112000,100.422867775
63 | 25,120000,117.445795059
64 | 25,128000,127.074856043
65 | 50,8000,2.15198493004
66 | 50,16000,6.01606011391
67 | 50,24000,15.0741400719
68 | 50,32000,24.8565030098
69 | 50,40000,32.738462925
70 | 50,48000,54.6907629967
71 | 50,56000,65.1226139069
72 | 50,64000,80.4430060387
73 | 50,72000,103.5877738
74 | 50,80000,120.219110966
75 | 50,88000,171.107203007
76 | 50,96000,201.432529926
77 | 50,104000,238.729315996
78 | 50,112000,258.13277483
79 | 50,120000,285.661708117
80 | 50,128000,316.628612041
81 |
--------------------------------------------------------------------------------
/notebooks/hdbscan05_timings.csv:
--------------------------------------------------------------------------------
1 | 2,8000,0.201974868774
2 | 2,16000,0.382796049118
3 | 2,24000,0.677625179291
4 | 2,32000,0.857353925705
5 | 2,40000,1.19192004204
6 | 2,48000,1.65057206154
7 | 2,56000,1.76224017143
8 | 2,64000,2.09517502785
9 | 2,72000,2.37437987328
10 | 2,80000,2.61393880844
11 | 2,88000,3.86622595787
12 | 2,96000,4.16805887222
13 | 2,104000,4.60610985756
14 | 2,112000,4.65505003929
15 | 2,120000,4.94053196907
16 | 2,128000,5.48205113411
17 | 5,8000,0.390153884888
18 | 5,16000,1.1207010746
19 | 5,24000,2.12859201431
20 | 5,32000,3.20195794106
21 | 5,40000,4.50784707069
22 | 5,48000,5.86051797867
23 | 5,56000,6.96505713463
24 | 5,64000,8.35725998878
25 | 5,72000,10.0785040855
26 | 5,80000,11.8928399086
27 | 5,88000,14.2854990959
28 | 5,96000,16.3619041443
29 | 5,104000,18.1008689404
30 | 5,112000,18.765378952
31 | 5,120000,20.262346983
32 | 5,128000,22.245456934
33 | 10,8000,0.362307071686
34 | 10,16000,1.10565090179
35 | 10,24000,2.1113088131
36 | 10,32000,3.8094599247
37 | 10,40000,5.60643601418
38 | 10,48000,8.05391407013
39 | 10,56000,12.0181820393
40 | 10,64000,14.4568071365
41 | 10,72000,17.575797081
42 | 10,80000,20.9547560215
43 | 10,88000,28.589566946
44 | 10,96000,31.5660579205
45 | 10,104000,35.0399270058
46 | 10,112000,46.7496728897
47 | 10,120000,51.5727710724
48 | 10,128000,56.6605160236
49 | 25,8000,0.503958940506
50 | 25,16000,1.15347003937
51 | 25,24000,2.52892589569
52 | 25,32000,3.7748811245
53 | 25,40000,5.54964900017
54 | 25,48000,7.7039680481
55 | 25,56000,10.2646648884
56 | 25,64000,12.3325390816
57 | 25,72000,14.4936189651
58 | 25,80000,17.8296489716
59 | 25,88000,24.9521570206
60 | 25,96000,27.6805050373
61 | 25,104000,31.0702199936
62 | 25,112000,38.4048509598
63 | 25,120000,41.4252431393
64 | 25,128000,45.7964301109
65 | 50,8000,1.46589207649
66 | 50,16000,2.91623210907
67 | 50,24000,4.17734980583
68 | 50,32000,6.72125601768
69 | 50,40000,9.49217200279
70 | 50,48000,11.0911870003
71 | 50,56000,13.4033820629
72 | 50,64000,16.9308049679
73 | 50,72000,20.2958710194
74 | 50,80000,27.0205729008
75 | 50,88000,31.7669379711
76 | 50,96000,37.2198050022
77 | 50,104000,39.0934021473
78 | 50,112000,45.5359759331
79 | 50,120000,49.7200181484
80 | 50,128000,54.0523099899
81 |
--------------------------------------------------------------------------------
/notebooks/hdbscan06_timings.csv:
--------------------------------------------------------------------------------
1 | 2,8000,0.175021886826
2 | 2,16000,0.387292146683
3 | 2,24000,0.677018880844
4 | 2,32000,0.934924125671
5 | 2,40000,1.17343378067
6 | 2,48000,1.38080406189
7 | 2,56000,1.60144400597
8 | 2,64000,1.79244303703
9 | 2,72000,2.1175339222
10 | 2,80000,2.43222498894
11 | 2,88000,2.75695896149
12 | 2,96000,3.10400700569
13 | 2,104000,3.41808009148
14 | 2,112000,3.49205112457
15 | 2,120000,3.87581586838
16 | 2,128000,4.19616699219
17 | 5,8000,0.372463941574
18 | 5,16000,1.05067205429
19 | 5,24000,1.93789100647
20 | 5,32000,2.74101495743
21 | 5,40000,3.80962181091
22 | 5,48000,4.98932695389
23 | 5,56000,5.92916297913
24 | 5,64000,7.09130311012
25 | 5,72000,8.22766804695
26 | 5,80000,9.74051809311
27 | 5,88000,11.0401978493
28 | 5,96000,12.6047639847
29 | 5,104000,14.0353701115
30 | 5,112000,14.6283960342
31 | 5,120000,16.2875649929
32 | 5,128000,17.4939930439
33 | 10,8000,0.349482059479
34 | 10,16000,1.09388589859
35 | 10,24000,1.87578415871
36 | 10,32000,3.21113491058
37 | 10,40000,4.35681700706
38 | 10,48000,6.19830203056
39 | 10,56000,9.55884099007
40 | 10,64000,11.4342520237
41 | 10,72000,13.2101860046
42 | 10,80000,16.1834290028
43 | 10,88000,20.0170080662
44 | 10,96000,22.5502281189
45 | 10,104000,24.9669640064
46 | 10,112000,35.226790905
47 | 10,120000,39.5434041023
48 | 10,128000,42.897605896
49 | 25,8000,0.444399118423
50 | 25,16000,1.209430933
51 | 25,24000,1.97230005264
52 | 25,32000,3.10147595406
53 | 25,40000,4.67809796333
54 | 25,48000,5.50237488747
55 | 25,56000,7.86162614822
56 | 25,64000,9.46203804016
57 | 25,72000,11.5571279526
58 | 25,80000,13.881565094
59 | 25,88000,16.1510570049
60 | 25,96000,18.3807759285
61 | 25,104000,20.2770631313
62 | 25,112000,25.9744091034
63 | 25,120000,28.6864550114
64 | 25,128000,31.9634900093
65 | 50,8000,1.42019295692
66 | 50,16000,2.98401212692
67 | 50,24000,3.57059788704
68 | 50,32000,5.97410511971
69 | 50,40000,7.985861063
70 | 50,48000,9.6884970665
71 | 50,56000,11.9059169292
72 | 50,64000,13.7416830063
73 | 50,72000,17.8067760468
74 | 50,80000,20.3124599457
75 | 50,88000,20.6006500721
76 | 50,96000,22.6325879097
77 | 50,104000,27.3392460346
78 | 50,112000,31.2804059982
79 | 50,120000,34.6195569038
80 | 50,128000,39.2653598785
81 |
--------------------------------------------------------------------------------
/notebooks/reference_impl_external_timings.csv:
--------------------------------------------------------------------------------
1 | 2,8000,3.59666895866
2 | 2,16000,15.2572879791
3 | 2,24000,31.3827497959
4 | 2,32000,60.9953649044
5 | 2,40000,111.264041901
6 | 2,48000,80.2624919415
7 | 2,56000,111.845596075
8 | 2,64000,157.572174072
9 | 2,72000,213.970286131
10 | 2,80000,291.316827059
11 | 2,88000,364.542631865
12 | 2,96000,330.40318799
13 | 2,104000,376.085955858
14 | 2,112000,437.023652077
15 | 2,120000,512.283486128
16 | 2,128000,639.647830963
17 | 5,8000,2.96017384529
18 | 5,16000,12.4860448837
19 | 5,24000,24.3062229156
20 | 5,32000,27.3480169773
21 | 5,40000,57.2987709045
22 | 5,48000,100.169524908
23 | 5,56000,79.1349971294
24 | 5,64000,124.066302061
25 | 5,72000,185.705877066
26 | 5,80000,266.771252155
27 | 5,88000,344.634408951
28 | 5,96000,437.551882982
29 | 5,104000,446.130121946
30 | 5,112000,365.777822018
31 | 5,120000,447.037277937
32 | 5,128000,591.354615211
33 | 10,8000,3.74887800217
34 | 10,16000,9.18430614471
35 | 10,24000,30.3249309063
36 | 10,32000,33.4931271076
37 | 10,40000,78.0882520676
38 | 10,48000,91.3173689842
39 | 10,56000,200.770553112
40 | 10,64000,158.011397839
41 | 10,72000,241.757611036
42 | 10,80000,323.283601046
43 | 10,88000,342.906905174
44 | 10,96000,354.992150068
45 | 10,104000,435.243753195
46 | 10,112000,547.999858856
47 | 10,120000,687.23850894
48 | 10,128000,572.590743065
49 | 25,8000,3.80018186569
50 | 25,16000,18.4901921749
51 | 25,24000,33.0604710579
52 | 25,32000,90.8991298676
53 | 25,40000,110.421215057
54 | 25,48000,153.691064119
55 | 25,56000,236.893220901
56 | 25,64000,371.323115826
57 | 25,72000,413.138042927
58 | 25,80000,580.538727999
59 | 25,88000,492.039662123
60 | 25,96000,665.976908922
61 | 25,104000,879.488523006
62 | 25,112000,946.649399996
63 | 25,120000,1354.74109793
64 | 25,128000,1628.48575211
65 | 50,8000,7.23535704613
66 | 50,16000,35.2021028996
67 | 50,24000,69.9486300945
68 | 50,32000,146.289216995
69 | 50,40000,234.030052185
70 | 50,48000,305.608191013
71 | 50,56000,423.300146103
72 | 50,64000,642.593301058
73 | 50,72000,703.198181152
74 | 50,80000,885.244357109
75 | 50,88000,1099.00257683
76 | 50,96000,1249.79146123
77 | 50,104000,1456.11673903
78 | 50,112000,1785.89922595
79 | 50,120000,2121.75022507
80 | 50,128000,2446.19570708
81 |
--------------------------------------------------------------------------------
/notebooks/reference_impl_internal_timings.csv:
--------------------------------------------------------------------------------
1 | ,,calculate MST,compute core distances,compute hierarchy and cluster tree,compute outlier scores,find flat result,runtime
2 | 2,8000,624,622,1492,22,243,3060
3 | 2,16000,3422,4744,5711,40,542,14514
4 | 2,24000,8290,13080,9732,56,12,31246
5 | 2,32000,18030,24890,16561,90,1192,60865
6 | 2,40000,35571,50032,25340,71,20,111135
7 | 2,48000,20298,15773,38696,118,5086,80123
8 | 2,56000,30316,24342,50220,99,6582,111702
9 | 2,64000,42993,41946,63860,102,8401,157433
10 | 2,72000,61349,62803,78890,136,10489,213827
11 | 2,80000,86100,87121,104851,154,12777,291163
12 | 2,88000,107276,119212,121918,182,15636,364407
13 | 2,96000,80565,88392,142386,122,18594,330248
14 | 2,104000,99502,85378,168434,171,22258,375928
15 | 2,112000,118551,90896,199844,226,27171,436855
16 | 2,120000,147139,121902,210340,158,32281,512020
17 | 2,128000,182831,158954,260799,228,36374,639384
18 | 5,8000,803,907,640,24,17,2468
19 | 5,16000,4025,6237,1609,37,19,12025
20 | 5,24000,5345,15126,3249,60,26,23952
21 | 5,32000,10404,10845,5548,77,29,27043
22 | 5,40000,22235,27360,7303,84,33,57168
23 | 5,48000,39939,49988,9814,93,39,100041
24 | 5,56000,33073,30384,15212,107,38,78986
25 | 5,64000,52653,53860,17047,127,46,123917
26 | 5,72000,77242,84245,23702,118,36,185555
27 | 5,80000,108770,125721,31750,140,34,266622
28 | 5,88000,140356,170121,33576,149,53,344486
29 | 5,96000,186953,203925,46083,185,40,437405
30 | 5,104000,146302,248246,50781,353,73,445989
31 | 5,112000,159718,150778,54602,175,53,365611
32 | 5,120000,194952,180324,71056,226,69,446893
33 | 5,128000,250133,255315,85176,241,46,591201
34 | 10,8000,1074,1763,373,28,14,3347
35 | 10,16000,3615,3858,1245,50,24,8914
36 | 10,24000,10598,17317,1938,68,30,30144
37 | 10,32000,13761,15707,3413,82,39,33204
38 | 10,40000,29709,39094,8769,101,57,77928
39 | 10,48000,34325,37811,18517,133,49,91048
40 | 10,56000,81323,108916,9915,127,50,200561
41 | 10,64000,64556,74037,18723,139,61,157770
42 | 10,72000,111252,114018,15777,151,72,241532
43 | 10,80000,136247,165805,20230,197,80,322847
44 | 10,88000,122443,204552,15006,203,80,342573
45 | 10,96000,160015,173657,20586,179,73,354798
46 | 10,104000,199701,215870,18923,202,74,435081
47 | 10,112000,244868,279603,22277,265,104,547449
48 | 10,120000,306516,355204,24496,265,115,686922
49 | 10,128000,271135,269269,31109,206,136,572215
50 | 25,8000,1195,1829,350,31,43,3585
51 | 25,16000,6410,10653,924,46,36,18263
52 | 25,24000,13252,17456,1826,63,63,32881
53 | 25,32000,34042,50735,5504,78,76,90680
54 | 25,40000,46438,58907,4457,109,82,110275
55 | 25,48000,64879,74894,13223,124,107,153555
56 | 25,56000,98906,128676,8398,123,138,236582
57 | 25,64000,147256,207237,16032,137,117,371163
58 | 25,72000,175076,226689,10241,156,157,412771
59 | 25,80000,232469,327623,19225,174,227,580180
60 | 25,88000,218678,253468,18846,176,224,491880
61 | 25,96000,287941,362339,14325,197,189,665513
62 | 25,104000,368983,488749,20733,163,135,879320
63 | 25,112000,407948,512364,25227,175,200,946495
64 | 25,120000,550593,778372,24603,205,176,1354579
65 | 25,128000,647824,945765,33639,224,174,1628338
66 | 50,8000,2313,4115,441,28,43,7119
67 | 50,16000,12828,20685,1163,50,88,35048
68 | 50,24000,27541,39661,1988,65,113,69679
69 | 50,32000,56303,85807,3371,73,159,146071
70 | 50,40000,90720,137414,4801,102,197,233684
71 | 50,48000,122137,173262,9205,104,178,305392
72 | 50,56000,168479,242965,10462,117,166,422749
73 | 50,64000,246315,379799,14992,132,229,642115
74 | 50,72000,282092,405504,14290,165,244,703033
75 | 50,80000,352981,512679,17941,177,330,884889
76 | 50,88000,438449,643205,15735,179,353,1098758
77 | 50,96000,504656,724577,19070,207,219,1249631
78 | 50,104000,592942,836562,24831,189,317,1455788
79 | 50,112000,714665,1047422,21969,239,428,1785726
80 | 50,120000,847335,1254820,17559,209,548,2121531
81 | 50,128000,969114,1450654,24317,259,424,2445925
82 |
--------------------------------------------------------------------------------
/paper/hdbscan_clustering_result.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/paper/hdbscan_clustering_result.png
--------------------------------------------------------------------------------
/paper/hdbscan_condensed_tree.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/paper/hdbscan_condensed_tree.png
--------------------------------------------------------------------------------
/paper/paper.bib:
--------------------------------------------------------------------------------
1 | @inproceedings{campello2013density,
2 | title={Density-based clustering based on hierarchical density estimates},
3 | author={Campello, Ricardo JGB and Moulavi, Davoud and Sander, Joerg},
4 | booktitle={Pacific-Asia Conference on Knowledge Discovery and Data Mining},
5 | pages={160--172},
6 | year={2013},
7 | organization={Springer},
8 | doi={10.1007/978-3-642-37456-2_14},
9 | url={http://dx.doi.org/10.1007/978-3-642-37456-2_14}
10 | }
11 |
12 | @article{campello2015hierarchical,
13 | title={Hierarchical density estimates for data clustering, visualization, and outlier detection},
14 | author={Campello, Ricardo JGB and Moulavi, Davoud and Zimek, Arthur and Sander, J{\"o}rg},
15 | journal={ACM Transactions on Knowledge Discovery from Data (TKDD)},
16 | volume={10},
17 | number={1},
18 | pages={5},
19 | year={2015},
20 | publisher={ACM},
21 | url = {http://doi.acm.org/10.1145/2733381},
22 | doi = {10.1145/2733381}
23 | }
24 |
25 | @article{chaudhuri2014consistent,
26 | title={Consistent procedures for cluster tree estimation and pruning},
27 | author={Chaudhuri, Kamalika and Dasgupta, Sanjoy and Kpotufe, Samory and von Luxburg, Ulrike},
28 | journal={IEEE Transactions on Information Theory},
29 | volume={60},
30 | number={12},
31 | pages={7900--7912},
32 | year={2014},
33 | publisher={IEEE},
34 | doi={10.1109/TIT.2014.2361055}
35 | }
36 |
37 | @inproceedings{chaudhuri2010rates,
38 | author = {Chaudhuri, Kamalika and Dasgupta, Sanjoy},
39 | title = {Rates of Convergence for the Cluster Tree},
40 | booktitle = {Proceedings of the 23rd International Conference on Neural Information Processing Systems},
41 | series = {NIPS'10},
42 | year = {2010},
43 | location = {Vancouver, British Columbia, Canada},
44 | pages = {343--351},
45 | numpages = {9},
46 | url = {https://papers.nips.cc/paper/4068-rates-of-convergence-for-the-cluster-tree},
47 | acmid = {2997228},
48 | publisher = {Curran Associates Inc.},
49 | address = {USA},
50 | }
51 |
--------------------------------------------------------------------------------
/paper/paper.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: 'hdbscan: Hierarchical density based clustering'
3 | tags:
4 | - clustering
5 | - unsupervised learning
6 | - machine learning
7 | authors:
8 | - name: Leland McInnes
9 | orcid: 0000-0003-2143-6834
10 | affiliation: 1
11 | - name: John Healy
12 | affiliation: 1
13 | - name: Steve Astels
14 | affiliation: 2
15 | affiliations:
16 | - name: Tutte Institute for Mathematics and Computing
17 | index: 1
18 | - name: Shopify
19 | index: 2
20 | date: 26 February 2017
21 | bibliography: paper.bib
22 | ---
23 |
24 | # Summary
25 |
26 | HDBSCAN (Hierarchical Density-Based Spatial Clustering of Applications with Noise)
27 | [@campello2013density], [@campello2015hierarchical]
28 | performs DBSCAN over varying epsilon values and integrates the results to find a
29 | clustering that gives the best stability over epsilon. This allows HDBSCAN to
30 | find clusters of varying densities (unlike DBSCAN) and to be more robust to
31 | parameter selection. The library also includes support for Robust Single Linkage
32 | clustering [@chaudhuri2014consistent], [@chaudhuri2010rates],
33 | GLOSH outlier detection [@campello2015hierarchical], and tools for visualizing
34 | and exploring cluster structures.
35 | Finally, support for prediction and soft clustering is also available.
36 |
39 |
40 | # References
41 |
42 |
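A minimal usage sketch of the library described in the summary above (the data set and parameter values are arbitrary, chosen only for illustration):

import hdbscan
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=500, centers=4, random_state=42)

clusterer = hdbscan.HDBSCAN(min_cluster_size=15)
labels = clusterer.fit_predict(X)            # -1 marks noise points
probabilities = clusterer.probabilities_     # strength of cluster membership
outlier_scores = clusterer.outlier_scores_   # GLOSH outlier scores

# Robust Single Linkage clustering is exposed through a similar estimator.
rsl_labels = hdbscan.RobustSingleLinkage(cut=0.4).fit(X).labels_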
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = [
3 | "setuptools",
4 | "wheel",
5 | "cython<4",
6 | "numpy<3"
7 | ]
8 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy>=1.20,<3
2 | scipy>=1.0
3 | scikit-learn>=0.20
4 | joblib>=1.0
5 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import warnings
2 |
3 | try:
4 | # from Cython.Distutils import build_ext
5 | from Cython.Build import cythonize
6 | from setuptools import setup, Extension
7 | from setuptools.command.build_ext import build_ext
8 | HAVE_CYTHON = True
9 | except ImportError as e:
10 | warnings.warn(e.args[0])
11 | cythonize = lambda ext: ext
12 | from setuptools import setup, Extension
13 | from setuptools.command.build_ext import build_ext
14 | HAVE_CYTHON = False
15 |
16 |
17 | class CustomBuildExtCommand(build_ext):
18 | """build_ext command for use when numpy headers are needed."""
19 |
20 | def run(self):
21 |
22 | # Import numpy here, only when headers are needed
23 | import numpy
24 |
25 | # Add numpy headers to include_dirs
26 | self.include_dirs.append(numpy.get_include())
27 |
28 | # Call original build_ext command
29 | build_ext.run(self)
30 |
31 |
32 | _hdbscan_tree = Extension('hdbscan._hdbscan_tree',
33 | sources=['hdbscan/_hdbscan_tree.pyx'])
34 | _hdbscan_linkage = Extension('hdbscan._hdbscan_linkage',
35 | sources=['hdbscan/_hdbscan_linkage.pyx'])
36 | _hdbscan_boruvka = Extension('hdbscan._hdbscan_boruvka',
37 | sources=['hdbscan/_hdbscan_boruvka.pyx'])
38 | _hdbscan_reachability = Extension('hdbscan._hdbscan_reachability',
39 | sources=['hdbscan/_hdbscan_reachability.pyx'])
40 | _prediction_utils = Extension('hdbscan._prediction_utils',
41 | sources=['hdbscan/_prediction_utils.pyx'])
42 | dist_metrics = Extension('hdbscan.dist_metrics',
43 | sources=['hdbscan/dist_metrics.pyx'])
44 |
45 |
46 |
47 | def readme():
48 | with open('README.rst') as readme_file:
49 | return readme_file.read()
50 |
51 | def requirements():
52 | # The dependencies are the same as the contents of requirements.txt
53 | with open('requirements.txt') as f:
54 | return [line.strip() for line in f if line.strip()]
55 |
56 | configuration = {
57 | 'name': 'hdbscan',
58 | 'version': '0.8.40',
59 | 'description': 'Clustering based on density with variable density clusters',
60 | 'long_description': readme(),
61 | 'classifiers': [
62 | 'Development Status :: 4 - Beta',
63 | 'Intended Audience :: Science/Research',
64 | 'Intended Audience :: Developers',
65 | 'License :: OSI Approved',
66 | 'Programming Language :: C',
67 | 'Programming Language :: Python',
68 | 'Topic :: Software Development',
69 | 'Topic :: Scientific/Engineering',
70 | 'Operating System :: Microsoft :: Windows',
71 | 'Operating System :: POSIX',
72 | 'Operating System :: Unix',
73 | 'Operating System :: MacOS',
74 | 'Programming Language :: Python :: 3.9',
75 | 'Programming Language :: Python :: 3.10',
76 | 'Programming Language :: Python :: 3.11',
77 | 'Programming Language :: Python :: 3.12',
78 | ],
79 | 'keywords': 'cluster clustering density hierarchical',
80 | 'url': 'http://github.com/scikit-learn-contrib/hdbscan',
81 | 'maintainer': 'Leland McInnes',
82 | 'maintainer_email': 'leland.mcinnes@gmail.com',
83 | 'license': 'BSD',
84 | 'packages': ['hdbscan', 'hdbscan.tests'],
85 | 'install_requires': requirements(),
86 | 'ext_modules': cythonize([
87 | _hdbscan_tree,
88 | _hdbscan_linkage,
89 | _hdbscan_boruvka,
90 | _hdbscan_reachability,
91 | _prediction_utils,
92 | dist_metrics]),
93 | 'cmdclass': {'build_ext': CustomBuildExtCommand},
94 | 'test_suite': 'nose.collector',
95 | 'tests_require': ['nose'],
96 | 'data_files': ('hdbscan/dist_metrics.pxd',)
97 | }
98 |
99 | if not HAVE_CYTHON:
100 | warnings.warn('Due to incompatibilities with Python 3.7 hdbscan now '
101 | 'requires Cython to be installed in order to build it')
102 | raise ImportError('Cython not found! Please install cython and try again')
103 |
104 | setup(**configuration)
105 |
--------------------------------------------------------------------------------