├── .github
    └── workflows
    │   ├── pythonpublish.yml
    │   ├── pythonpublish_wheel.yml
    │   └── pythonpublish_windows.yml
├── .gitignore
├── .idea
    ├── .gitignore
    ├── codeStyleSettings.xml
    ├── codeStyles
    │   ├── Project.xml
    │   └── codeStyleConfig.xml
    ├── hdbscan.iml
    ├── inspectionProfiles
    │   └── Project_Default.xml
    ├── misc.xml
    ├── modules.xml
    └── vcs.xml
├── .nojekyll
├── .pep8speaks.yml
├── .readthedocs.yaml
├── .travis.yml
├── CODE_OF_CONDUCT.md
├── LICENSE
├── MANIFEST.in
├── README.rst
├── azure-pipelines.yml
├── ci_scripts
    └── push_doc.sh
├── circle.yml
├── docs
    ├── Makefile
    ├── advanced_hdbscan.rst
    ├── api.rst
    ├── basic_hdbscan.rst
    ├── comparing_clustering_algorithms.rst
    ├── conf.py
    ├── dbscan_from_hdbscan.rst
    ├── docs_requirements.txt
    ├── faq.rst
    ├── how_hdbscan_works.rst
    ├── how_to_detect_branches.rst
    ├── how_to_use_epsilon.rst
    ├── images
    │   ├── advanced_hdbscan_11_1.png
    │   ├── advanced_hdbscan_26_1.png
    │   ├── advanced_hdbscan_3_1.png
    │   ├── advanced_hdbscan_5_1.png
    │   ├── advanced_hdbscan_9_1.png
    │   ├── allow_single_cluster.png
    │   ├── comparing_clustering_algorithms_12_0.png
    │   ├── comparing_clustering_algorithms_15_0.png
    │   ├── comparing_clustering_algorithms_18_0.png
    │   ├── comparing_clustering_algorithms_21_0.png
    │   ├── comparing_clustering_algorithms_24_0.png
    │   ├── comparing_clustering_algorithms_27_0.png
    │   ├── comparing_clustering_algorithms_31_0.png
    │   ├── comparing_clustering_algorithms_6_0.png
    │   ├── distance1.svg
    │   ├── distance2.svg
    │   ├── distance3.svg
    │   ├── distance4.svg
    │   ├── distance4a.svg
    │   ├── distance5.svg
    │   ├── epsilon_parameter_dataset.png
    │   ├── epsilon_parameter_dbscan.png
    │   ├── epsilon_parameter_hdbscan_e3_leaf.png
    │   ├── epsilon_parameter_hdbscan_eom.png
    │   ├── epsilon_parameter_hdbscan_eps.png
    │   ├── generative_model_kde.png
    │   ├── generative_model_scatter.png
    │   ├── how_hdbscan_works_10_1.png
    │   ├── how_hdbscan_works_12_1.png
    │   ├── how_hdbscan_works_15_1.png
    │   ├── how_hdbscan_works_18_1.png
    │   ├── how_hdbscan_works_20_1.png
    │   ├── how_hdbscan_works_3_1.png
    │   ├── how_to_detect_branches_13_0.png
    │   ├── how_to_detect_branches_15_0.png
    │   ├── how_to_detect_branches_17_0.png
    │   ├── how_to_detect_branches_19_0.png
    │   ├── how_to_detect_branches_21_0.png
    │   ├── how_to_detect_branches_23_0.png
    │   ├── how_to_detect_branches_25_0.png
    │   ├── how_to_detect_branches_3_0.png
    │   ├── how_to_detect_branches_5_0.png
    │   ├── how_to_detect_branches_7_0.png
    │   ├── how_to_detect_branches_9_0.png
    │   ├── outlier_detection_3_1.png
    │   ├── outlier_detection_7_1.png
    │   ├── outlier_detection_9_1.png
    │   ├── parameter_selection_11_1.png
    │   ├── parameter_selection_12_1.png
    │   ├── parameter_selection_15_1.png
    │   ├── parameter_selection_18_1.png
    │   ├── parameter_selection_3_1.png
    │   ├── parameter_selection_7_1.png
    │   ├── parameter_selection_9_1.png
    │   ├── performance_and_scalability_14_1.png
    │   ├── performance_and_scalability_20_2.png
    │   ├── performance_and_scalability_24_1.png
    │   ├── performance_and_scalability_9_1.png
    │   ├── prediction_tutorial_3_0.png
    │   ├── prediction_tutorial_5_1.png
    │   ├── prediction_tutorial_9_1.png
    │   ├── soft_clustering_10_1.png
    │   ├── soft_clustering_13_1.png
    │   ├── soft_clustering_15_0.png
    │   ├── soft_clustering_3_1.png
    │   ├── soft_clustering_6_1.png
    │   ├── soft_clustering_8_1.png
    │   ├── soft_clustering_explanation_11_0.png
    │   ├── soft_clustering_explanation_15_0.png
    │   ├── soft_clustering_explanation_26_0.png
    │   ├── soft_clustering_explanation_2_0.png
    │   ├── soft_clustering_explanation_31_0.png
    │   ├── soft_clustering_explanation_36_0.png
    │   └── soft_clustering_explanation_6_0.png
    ├── index.rst
    ├── make.bat
    ├── outlier_detection.rst
    ├── parameter_selection.rst
    ├── performance_and_scalability.rst
    ├── prediction_tutorial.rst
    ├── soft_clustering.rst
    └── soft_clustering_explanation.rst
├── environment.yml
├── examples
    ├── plot_cluster_comparison.py
    └── plot_hdbscan.py
├── hdbscan
    ├── __init__.py
    ├── _hdbscan_boruvka.pyx
    ├── _hdbscan_linkage.pyx
    ├── _hdbscan_reachability.pyx
    ├── _hdbscan_tree.pyx
    ├── _prediction_utils.pyx
    ├── branch_data.py
    ├── branches.py
    ├── dist_metrics.pxd
    ├── dist_metrics.pyx
    ├── flat.py
    ├── hdbscan_.py
    ├── plots.py
    ├── prediction.py
    ├── robust_single_linkage_.py
    ├── tests
    │   ├── __init__.py
    │   ├── test_branches.py
    │   ├── test_flat.py
    │   ├── test_hdbscan.py
    │   ├── test_prediction_utils.py
    │   └── test_rsl.py
    └── validity.py
├── notebooks
    ├── Benchmarking scalability of clustering implementations 2D v0.7.ipynb
    ├── Benchmarking scalability of clustering implementations-v0.7.ipynb
    ├── Comparing Clustering Algorithms.ipynb
    ├── Flat clustering.ipynb
    ├── How HDBSCAN Works.ipynb
    ├── How Soft Clustering for HDBSCAN Works.ipynb
    ├── How to detect branches.ipynb
    ├── Looking at cluster consistency.ipynb
    ├── Performance data generation .ipynb
    ├── Python vs Java.ipynb
    ├── clusterable_data.npy
    ├── distance1.svg
    ├── distance2.svg
    ├── distance3.svg
    ├── distance4.svg
    ├── distance4a.svg
    ├── distance5.svg
    ├── hdbscan01_timings.csv
    ├── hdbscan02_timings.csv
    ├── hdbscan03_timings.csv
    ├── hdbscan04_timings.csv
    ├── hdbscan05_timings.csv
    ├── hdbscan06_timings.csv
    ├── reference_impl_external_timings.csv
    └── reference_impl_internal_timings.csv
├── paper
    ├── hdbscan_clustering_result.png
    ├── hdbscan_condensed_tree.png
    ├── paper.bib
    └── paper.md
├── pyproject.toml
├── requirements.txt
└── setup.py


/.github/workflows/pythonpublish.yml:
--------------------------------------------------------------------------------
 1 | name: Upload Python Package
 2 | 
 3 | on:
 4 |   release:
 5 |     types: [created]
 6 | 
 7 | jobs:
 8 |   deploy:
 9 |     runs-on: ubuntu-latest
10 |     steps:
11 |     - uses: actions/checkout@v1
12 |     - name: Set up Python
13 |       uses: actions/setup-python@v1
14 |       with:
15 |         python-version: '3.x'
16 |     - name: Install dependencies
17 |       run: |
18 |         python -m pip install --upgrade pip
19 |         pip install setuptools wheel twine
20 |         pip install scikit-learn cython
21 |         pip install auditwheel
22 |     - name: Build and publish
23 |       env:
24 |         TWINE_USERNAME: __token__
25 |         TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
26 |       run: |
27 |         python setup.py sdist bdist_wheel
28 |         twine upload dist/*.tar.gz
29 |         auditwheel repair dist/*linux_x86_64.whl
30 |         twine upload wheelhouse/*.whl
31 | 


--------------------------------------------------------------------------------
/.github/workflows/pythonpublish_wheel.yml:
--------------------------------------------------------------------------------
 1 | name: Upload Python Wheels
 2 | 
 3 | on:
 4 |   workflow_dispatch:
 5 |     inputs:
 6 |       checkout_ref:
 7 |         description: "The branch, tag or SHA to checkout."
 8 |         required: true
 9 |         default: "master"
10 | 
11 | jobs:
12 |   linux-deploy:
13 |     runs-on: ubuntu-latest
14 |     container: quay.io/pypa/manylinux2014_x86_64
15 |     strategy:
16 |       matrix:
17 |         python: ["cp38-cp38", "cp39-cp39", "cp310-cp310", "cp311-cp311"]
18 |     steps:
19 |       - uses: actions/checkout@v1
20 |         with:
21 |           ref: ${{ inputs.checkout_ref }}
22 |       - name: Build wheel
23 |         env:
24 |           PYTHON: /opt/python/${{ matrix.python }}/bin/python
25 |         run: |
26 |           $PYTHON -m pip install "cython<3" oldest-supported-numpy
27 |           $PYTHON -m build --no-isolation
28 |           auditwheel repair dist/*linux_x86_64.whl
29 |       - name: Publish to pypi
30 |         env:
31 |           TWINE_USERNAME: __token__
32 |           TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
33 |         run: |
34 |           pipx install twine
35 |           twine upload dist/*.tar.gz --skip-existing
36 |           twine upload wheelhouse/*.whl --skip-existing
37 |   other-deploy:
38 |     strategy:
39 |       matrix:
40 |         python: ["3.9", "3.10", "3.11", "3.12"]
41 |         os: [windows-2019, macos-11]
42 |     runs-on: ${{ matrix.os }}
43 |     steps:
44 |       - uses: actions/checkout@v1
45 |         with:
46 |           ref: ${{ inputs.checkout_ref }}
47 |       - name: Set up Python
48 |         uses: actions/setup-python@v1
49 |         with:
50 |           python-version: ${{ matrix.python }}
51 |       - name: Install dependencies
52 |         run: |
53 |           python -m pip install --upgrade pip
54 |           pip install setuptools build wheel twine
55 |           pip install cython "numpy>=2"
56 |       - name: Build wheel
57 |         run: |
58 |           python -m build --no-isolation
59 |       - name: Publish to pypi
60 |         env:
61 |           TWINE_USERNAME: __token__
62 |           TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
63 |         run: |
64 |           twine upload dist/*.whl --skip-existing
65 | 


--------------------------------------------------------------------------------
/.github/workflows/pythonpublish_windows.yml:
--------------------------------------------------------------------------------
 1 | name: Upload Python Package
 2 | 
 3 | on:
 4 |   release:
 5 |     types: [created]
 6 | 
 7 | jobs:
 8 |   deploy:
 9 |     runs-on: windows-2019
10 |     steps:
11 |     - uses: actions/checkout@v1
12 |     - name: Set up Python
13 |       uses: actions/setup-python@v1
14 |       with:
15 |         python-version: '3.x'
16 |     - name: Install dependencies
17 |       run: |
18 |         python -m pip install --upgrade pip
19 |         pip install setuptools wheel twine
20 |         pip install scikit-learn cython
21 |         pip install auditwheel
22 |     - name: Build and publish
23 |       env:
24 |         TWINE_USERNAME: __token__
25 |         TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
26 |       run: |
27 |         python setup.py sdist bdist_wheel
28 |         twine upload dist/*
29 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | build/*
 2 | hdbscan/*.pyc
 3 | hdbscan/tests/*.pyc
 4 | hdbscan/*.pyd
 5 | hdbscan/tests/*.pyd
 6 | hdbscan/__pycache__/*
 7 | dist/*
 8 | *egg-info
 9 | notebooks/.ipynb_checkpoints/*
10 | __pycache__/
11 | 
12 | 


--------------------------------------------------------------------------------
/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /workspace.xml


--------------------------------------------------------------------------------
/.idea/codeStyleSettings.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <project version="4">
 3 |   <component name="ProjectCodeStyleSettingsManager">
 4 |     <option name="PER_PROJECT_SETTINGS">
 5 |       <value>
 6 |         <XML>
 7 |           <option name="XML_LEGACY_SETTINGS_IMPORTED" value="true" />
 8 |         </XML>
 9 |       </value>
10 |     </option>
11 |     <option name="PREFERRED_PROJECT_CODE_STYLE" value="Default (1)" />
12 |   </component>
13 | </project>


--------------------------------------------------------------------------------
/.idea/codeStyles/Project.xml:
--------------------------------------------------------------------------------
1 | <component name="ProjectCodeStyleConfiguration">
2 |   <code_scheme name="Project" version="173">
3 |     <XML>
4 |       <option name="XML_LEGACY_SETTINGS_IMPORTED" value="true" />
5 |     </XML>
6 |   </code_scheme>
7 | </component>


--------------------------------------------------------------------------------
/.idea/codeStyles/codeStyleConfig.xml:
--------------------------------------------------------------------------------
1 | <component name="ProjectCodeStyleConfiguration">
2 |   <state>
3 |     <option name="PREFERRED_PROJECT_CODE_STYLE" value="Default (1)" />
4 |   </state>
5 | </component>


--------------------------------------------------------------------------------
/.idea/hdbscan.iml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <module type="PYTHON_MODULE" version="4">
3 |   <component name="NewModuleRootManager">
4 |     <content url="file://$MODULE_DIR$" />
5 |     <orderEntry type="jdk" jdkName="Python 3.10 (hdbscan)" jdkType="Python SDK" />
6 |     <orderEntry type="sourceFolder" forTests="false" />
7 |   </component>
8 | </module>


--------------------------------------------------------------------------------
/.idea/inspectionProfiles/Project_Default.xml:
--------------------------------------------------------------------------------
 1 | <component name="InspectionProjectProfileManager">
 2 |   <profile version="1.0">
 3 |     <option name="myName" value="Project Default" />
 4 |     <inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
 5 |       <option name="ignoredPackages">
 6 |         <value>
 7 |           <list size="4">
 8 |             <item index="0" class="java.lang.String" itemvalue="scipy" />
 9 |             <item index="1" class="java.lang.String" itemvalue="numba" />
10 |             <item index="2" class="java.lang.String" itemvalue="scikit-learn" />
11 |             <item index="3" class="java.lang.String" itemvalue="numpy" />
12 |           </list>
13 |         </value>
14 |       </option>
15 |     </inspection_tool>
16 |   </profile>
17 | </component>


--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (hdbscan)" project-jdk-type="Python SDK" />
4 | </project>


--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="ProjectModuleManager">
4 |     <modules>
5 |       <module fileurl="file://$PROJECT_DIR$/.idea/hdbscan.iml" filepath="$PROJECT_DIR$/.idea/hdbscan.iml" />
6 |     </modules>
7 |   </component>
8 | </project>


--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="VcsDirectoryMappings">
4 |     <mapping directory="$PROJECT_DIR$" vcs="Git" />
5 |   </component>
6 | </project>


--------------------------------------------------------------------------------
/.nojekyll:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/.pep8speaks.yml:
--------------------------------------------------------------------------------
 1 | # File : .pep8speaks.yml
 2 | 
 3 | message:  # Customize the comment made by the bot
 4 |     opened:  # Messages when a new PR is submitted
 5 |         header: "Hello @{name}, Thank you for submitting the Pull Request !"
 6 |                 # The keyword {name} is converted into the author's username
 7 |         footer: ""
 8 |                 # The messages can be written as they would over GitHub
 9 |     updated:  # Messages when new commits are added to the PR
10 |         header: "Hello @{name}, Thank you for updating !"
11 |         footer: ""  # Why to comment the link to the style guide everytime? :)
12 |     no_errors: "Cheers ! There are no PEP8 issues in this Pull Request. :beers: "
13 | 
14 | scanner:
15 |     diff_only: False  # If True, errors caused by only the patch are shown
16 | 
17 | pycodestyle:
18 |     max-line-length: 100  # Default is 79 in PEP8
19 |     ignore:  # Errors and warnings to ignore
20 |         - W391
21 |         - E203
22 | 
23 | only_mention_files_with_errors: True  # If False, a separate status comment for each file is made.
24 | descending_issues_order: False # If True, PEP8 issues in message will be displayed in descending order of line numbers in the file
25 | 


--------------------------------------------------------------------------------
/.readthedocs.yaml:
--------------------------------------------------------------------------------
 1 | version: 2
 2 | 
 3 | build:
 4 |   os: ubuntu-22.04
 5 | 
 6 |   tools:
 7 |     python: "3.12"
 8 |   
 9 | 
10 | python:
11 |   install:
12 |     - requirements: docs/docs_requirements.txt
13 |     - method: pip
14 |       path: .
15 | 
16 | sphinx:
17 | 
18 |   configuration: docs/conf.py
19 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: python
 2 | 
 3 | python:
 4 |   - "3.6"
 5 |   - "3.7"
 6 |   - "3.8"
 7 |   - "3.9"
 8 | 
 9 | cache:
10 |   apt: true
11 |   # We use three different cache directory
12 |   # to work around a Travis bug with multi-platform cache
13 |   directories:
14 |   - $HOME/.cache/pip
15 |   - $HOME/download
16 | env:
17 |   global:
18 |     # Directory where tests are run from
19 |     - TEST_DIR=/tmp/test_dir/
20 |     - MODULE=hdbscan
21 |   matrix:
22 |     - DISTRIB="conda"
23 | 
24 | install:
25 |   - wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh;
26 |   - bash miniconda.sh -b -p $HOME/miniconda
27 |   - source "$HOME/miniconda/etc/profile.d/conda.sh"
28 |   - hash -r
29 |   - conda config --set always_yes yes --set changeps1 no
30 |   - conda update -q conda
31 |   - conda info -a
32 |   - conda create -q -n testenv python=$TRAVIS_PYTHON_VERSION numpy scipy matplotlib pandas networkx scikit-learn pytest pytest-cov codecov coverage cython 
33 |   - conda activate testenv
34 |   - python -c "import numpy; print('numpy %s' % numpy.__version__)"
35 |   - python -c "import scipy; print('scipy %s' % scipy.__version__)"
36 |   - python setup.py develop
37 | 
38 | script: 
39 |   - conda activate testenv
40 |   - pytest --cov=./
41 | 
42 | after_success:
43 |   - bash <(curl -s https://codecov.io/bash)
44 | 


--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
 1 | # Contributor Covenant Code of Conduct
 2 | 
 3 | ## Our Pledge
 4 | 
 5 | In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, gender identity and expression, level of experience, nationality, personal appearance, race, religion, or sexual identity and orientation.
 6 | 
 7 | ## Our Standards
 8 | 
 9 | Examples of behavior that contributes to creating a positive environment include:
10 | 
11 | * Using welcoming and inclusive language
12 | * Being respectful of differing viewpoints and experiences
13 | * Gracefully accepting constructive criticism
14 | * Focusing on what is best for the community
15 | * Showing empathy towards other community members
16 | 
17 | Examples of unacceptable behavior by participants include:
18 | 
19 | * The use of sexualized language or imagery and unwelcome sexual attention or advances
20 | * Trolling, insulting/derogatory comments, and personal or political attacks
21 | * Public or private harassment
22 | * Publishing others' private information, such as a physical or electronic address, without explicit permission
23 | * Other conduct which could reasonably be considered inappropriate in a professional setting
24 | 
25 | ## Our Responsibilities
26 | 
27 | Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior.
28 | 
29 | Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful.
30 | 
31 | ## Scope
32 | 
33 | This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers.
34 | 
35 | ## Enforcement
36 | 
37 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at leland.mcinnes@gmail.com. The project team will review and investigate all complaints, and will respond in a way that it deems appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately.
38 | 
39 | Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership.
40 | 
41 | ## Attribution
42 | 
43 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at [http://contributor-covenant.org/version/1/4][version]
44 | 
45 | [homepage]: http://contributor-covenant.org
46 | [version]: http://contributor-covenant.org/version/1/4/
47 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2015, Leland McInnes
 2 | All rights reserved.
 3 | 
 4 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
 5 | 
 6 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
 7 | 
 8 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
 9 | 
10 | 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
11 | 
12 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include *.rst *.txt pyproject.toml LICENSE
2 | recursive-include hdbscan *.py *.pyx *.pxd *.c
3 | recursive-include notebooks *.ipynb *.npy *.svg
4 | recursive-include examples *.py
5 | 


--------------------------------------------------------------------------------
/azure-pipelines.yml:
--------------------------------------------------------------------------------
  1 | # Trigger a build when there is a push to the main branch or a tag starts with release-
  2 | trigger:
  3 |   branches:
  4 |     include:
  5 |     - master
  6 |   tags:
  7 |     include:
  8 |     - release-*
  9 | 
 10 | # Trigger a build when there is a pull request to the main branch
 11 | # Ignore PRs that are just updating the docs
 12 | pr:
 13 |   branches:
 14 |     include:
 15 |     - master
 16 |     exclude:
 17 |     - doc/*
 18 |     - README.rst
 19 | 
 20 | variables:
 21 |   triggeredByPullRequest: $[eq(variables['Build.Reason'], 'PullRequest')]
 22 | 
 23 | stages:
 24 |   - stage: RunAllTests
 25 |     displayName: Run test suite
 26 |     jobs:
 27 |       - job: run_platform_tests
 28 |         strategy:
 29 |           matrix:
 30 |             mac_py39:
 31 |               imageName: 'macOS-latest'
 32 |               python.version: '3.9'
 33 |             linux_py39:
 34 |               imageName: 'ubuntu-latest'
 35 |               python.version: '3.9'
 36 |             windows_py39:
 37 |               imageName: 'windows-latest'
 38 |               python.version: '3.9'
 39 |             mac_py310:
 40 |               imageName: 'macOS-latest'
 41 |               python.version: '3.10'
 42 |             linux_py310:
 43 |               imageName: 'ubuntu-latest'
 44 |               python.version: '3.10'
 45 |             windows_py310:
 46 |               imageName: 'windows-latest'
 47 |               python.version: '3.10'
 48 |             mac_py311:
 49 |               imageName: 'macOS-latest'
 50 |               python.version: '3.11'
 51 |             linux_py311:
 52 |               imageName: 'ubuntu-latest'
 53 |               python.version: '3.11'
 54 |             windows_py311:
 55 |               imageName: 'windows-latest'
 56 |               python.version: '3.11'
 57 |             mac_py312:
 58 |               imageName: 'macOS-latest'
 59 |               python.version: '3.12'
 60 |             linux_py312:
 61 |               imageName: 'ubuntu-latest'
 62 |               python.version: '3.12'
 63 |             windows_py312:
 64 |               imageName: 'windows-latest'
 65 |               python.version: '3.12'
 66 |         pool:
 67 |           vmImage: $(imageName)
 68 | 
 69 |         steps:
 70 |         - task: UsePythonVersion@0
 71 |           inputs:
 72 |             versionSpec: '$(python.version)'
 73 |           displayName: 'Use Python $(python.version)'
 74 | 
 75 |         - script: |
 76 |             python -m pip install --upgrade pip
 77 |             pip install -r requirements.txt
 78 |           displayName: 'Install dependencies'
 79 | 
 80 |         - script: |
 81 |             pip install -e .
 82 |             pip install pytest pytest-azurepipelines
 83 |             pip install pytest-cov
 84 |             pip install coveralls
 85 |           displayName: 'Install package'
 86 | 
 87 |         - script: |
 88 |             pytest hdbscan/tests --show-capture=no -v --disable-warnings --junitxml=junit/test-results.xml --cov=hdbscan/ --cov-report=xml --cov-report=html
 89 |           displayName: 'Run tests'
 90 | 
 91 |         - bash: |
 92 |             coveralls
 93 |           displayName: 'Publish to coveralls'
 94 |           condition: and(succeeded(), eq(variables.triggeredByPullRequest, false)) # Don't run this for PRs because they can't access pipeline secrets
 95 |           env:
 96 |             COVERALLS_REPO_TOKEN: $(COVERALLS_TOKEN)
 97 | 
 98 |         - task: PublishTestResults@2
 99 |           inputs:
100 |             testResultsFiles: '$(System.DefaultWorkingDirectory)/**/coverage.xml'
101 |             testRunTitle: '$(Agent.OS) - $(Build.BuildNumber)[$(Agent.JobName)] - Python $(python.version)'
102 |           condition: succeededOrFailed()
103 | 
104 |   - stage: BuildPublishArtifact
105 |     dependsOn: RunAllTests
106 |     condition: and(succeeded(), startsWith(variables['Build.SourceBranch'], 'refs/tags/release-'), eq(variables.triggeredByPullRequest, false))
107 |     jobs:
108 |       # Need to use manylinux as ubuntu-latest is too new
109 |       - job: Manylinux2014Build
110 |         pool:
111 |           vmImage: 'ubuntu-latest'
112 |         container: quay.io/pypa/manylinux2014_x86_64:latest
113 |         strategy:
114 |           matrix:
115 |             linux_py38:
116 |               python.version: 'cp38-cp38'
117 |             linux_py39:
118 |               python.version: 'cp39-cp39'
119 |             linux_py310:
120 |               python.version: 'cp310-cp310'
121 |             linux_py311:
122 |               python.version: 'cp311-cp311'
123 |             linux_py312:
124 |               python.version: 'cp312-cp312'
125 |         steps:
126 |         - script: |
127 |             "${PYBIN}/python" -m pip install --upgrade pip
128 |             "${PYBIN}/python" -m pip install wheel
129 |             "${PYBIN}/python" -m pip install -r requirements.txt
130 |             "${PYBIN}/python" -m pip install cython
131 |           displayName: 'Install dependencies and build tools'
132 |           env:
133 |             PYBIN: /opt/python/$(python.version)/bin
134 |         - script: |
135 |             "${PYBIN}/python" setup.py sdist bdist_wheel
136 |           displayName: 'Build wheels'
137 |           env:
138 |             PYBIN: /opt/python/$(python.version)/bin
139 |         - bash: |
140 |             auditwheel repair dist/*linux_x86_64.whl --plat manylinux2014_x86_64 -w wheelhouse-manylinux/
141 |           displayName: 'Audit wheels'
142 |         
143 |         - task: DownloadSecureFile@1
144 |           name: PYPIRC_CONFIG
145 |           displayName: 'Download pypirc'
146 |           inputs:
147 |             secureFile: 'pypirc'  
148 |         
149 |         - bash: |
150 |             "${PYBIN}/python" -m pip install twine
151 |             "${PYBIN}/python" -m twine upload -r pypi --config-file $(PYPIRC_CONFIG.secureFilePath) --skip-existing --disable-progress-bar wheelhouse-manylinux/*
152 |             "${PYBIN}/python" -m twine upload -r pypi --config-file $(PYPIRC_CONFIG.secureFilePath) --skip-existing --disable-progress-bar dist/*.tar.gz
153 |           displayName: 'Publish wheel to PyPi'
154 |           env:
155 |             PYBIN: /opt/python/$(python.version)/bin
156 | 
157 |       - job: BuildWindowsAndMacOSArtifacts
158 |         displayName: Build source dists and wheels for windows and macOS
159 |         strategy:
160 |           matrix:        
161 |             mac_py38:
162 |               imageName: 'macOS-latest'
163 |               python.version: '3.8'
164 |             windows_py38:
165 |               imageName: 'windows-latest'
166 |               python.version: '3.8'
167 |             mac_py39:
168 |               imageName: 'macOS-latest'
169 |               python.version: '3.9'
170 |             windows_py39:
171 |               imageName: 'windows-latest'
172 |               python.version: '3.9'
173 |             mac_py310:
174 |               imageName: 'macOS-latest'
175 |               python.version: '3.10'
176 |             windows_py310:
177 |               imageName: 'windows-latest'
178 |               python.version: '3.10'
179 |             mac_py311:
180 |               imageName: 'macOS-latest'
181 |               python.version: '3.11'
182 |             windows_py311:
183 |               imageName: 'windows-latest'
184 |               python.version: '3.11'
185 |             mac_py312:
186 |               imageName: 'macOS-latest'
187 |               python.version: '3.12'
188 |             windows_py312:
189 |               imageName: 'windows-latest'
190 |               python.version: '3.12'
191 |         pool:
192 |           vmImage: $(imageName) 
193 | 
194 |         steps:
195 |         - task: UsePythonVersion@0
196 |           inputs:
197 |             versionSpec: '$(python.version)'
198 |           displayName: 'Use Python $(python.version)'
199 | 
200 |         - script: |
201 |             python -m pip install --upgrade pip
202 |             pip install wheel
203 |             pip install -r requirements.txt
204 |             pip install cython
205 |             pip install setuptools
206 |           displayName: 'Install dependencies'
207 | 
208 |         - script: |
209 |             pip install -e .
210 |           displayName: 'Install package locally'
211 |         
212 |         - bash: |
213 |             python setup.py sdist bdist_wheel
214 |           displayName: 'Build package'
215 | 
216 |         - bash: |
217 |             export PACKAGE_VERSION="$(python setup.py --version)"
218 |             echo "Package Version: ${PACKAGE_VERSION}"
219 |             echo "##vso[task.setvariable variable=packageVersionFormatted;]release-${PACKAGE_VERSION}"
220 |           displayName: 'Get package version'
221 | 
222 |         - script: |
223 |             echo "Version in git tag $(Build.SourceBranchName) does not match version derived from setup.py $(packageVersionFormatted)"
224 |             exit 1
225 |           displayName: Raise error if version doesnt match tag
226 |           condition: and(succeeded(), ne(variables['Build.SourceBranchName'], variables['packageVersionFormatted']))
227 | 
228 |         - task: DownloadSecureFile@1
229 |           name: PYPIRC_CONFIG
230 |           displayName: 'Download pypirc'
231 |           inputs:
232 |             secureFile: 'pypirc'  
233 | 
234 |         - script: |
235 |             pip install twine
236 |             twine upload -r pypi --config-file $(PYPIRC_CONFIG.secureFilePath) --skip-existing  dist/* 
237 |           displayName: 'Upload to PyPI'
238 |           condition: and(succeeded(), eq(variables['Build.SourceBranchName'], variables['packageVersionFormatted']))
239 | 


--------------------------------------------------------------------------------
/ci_scripts/push_doc.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # This script is meant to be called in the "deploy" step defined in 
 3 | # circle.yml. See https://circleci.com/docs/ for more details.
 4 | # The behavior of the script is controlled by environment variable defined
 5 | # in the circle.yml in the top level folder of the project.
 6 | 
 7 | MSG="Pushing the docs for revision for branch: $CIRCLE_BRANCH, commit $CIRCLE_SHA1"
 8 | 
 9 | cd $HOME
10 | # Copy the build docs to a temporary folder
11 | rm -rf tmp
12 | mkdir tmp
13 | cp -R $HOME/$DOC_REPO/doc/_build/html/* ./tmp/ 
14 | 
15 | # Clone the docs repo if it isnt already there
16 | if [ ! -d $DOC_REPO ];
17 |     then git clone "git@github.com:$USERNAME/"$DOC_REPO".git";
18 | fi
19 | 
20 | cd $DOC_REPO
21 | git branch gh-pages
22 | git checkout -f gh-pages
23 | git reset --hard origin/gh-pages
24 | git clean -dfx
25 | 
26 | for name in $(ls -A $HOME/$DOC_REPO); do
27 |     case $name in
28 |         .nojekyll) # So that github does not build this as a Jekyll website.
29 |         ;;
30 |         circle.yml) # Config so that build gh-pages branch.
31 |         ;;
32 |         *)
33 |         git rm -rf $name
34 |         ;;
35 |     esac
36 | done
37 | 
38 | # Copy the new build docs
39 | mkdir $DOC_URL
40 | cp -R $HOME/tmp/* ./$DOC_URL/
41 | 
42 | git config --global user.email $EMAIL
43 | git config --global user.name $USERNAME
44 | git add -f ./$DOC_URL/
45 | git commit -m "$MSG"
46 | git push -f origin gh-pages
47 | if [ $? -ne 0 ]; then
48 |     echo "Pushing docs failed"
49 |     echo
50 |     exit 1
51 | fi
52 | 
53 | echo $MSG 
54 | 


--------------------------------------------------------------------------------
/circle.yml:
--------------------------------------------------------------------------------
 1 | machine:
 2 |   environment:
 3 |     # The github organization or username of the repository which hosts the
 4 |     # project and documentation.
 5 |     USERNAME: "scikit-learn-contrib"
 6 | 
 7 |     # The repository where the documentation will be hosted
 8 |     DOC_REPO: "hdbscan"
 9 | 
10 |     # The base URL for the Github page where the documentation will be hosted
11 |     DOC_URL: ""
12 | 
13 |     # The email is to be used for commits in the Github Page
14 |     EMAIL: "leland.mcinnes+ci@gmail.com"
15 | 
16 | dependencies:
17 | 
18 |   # Various dependencies
19 |   pre:
20 |     - sudo -E apt-get -yq remove texlive-binaries --purge
21 |     - sudo apt-get update
22 |     - sudo apt-get install libatlas-dev libatlas3gf-base
23 |     - sudo apt-get install build-essential python-dev python-setuptools
24 |     # install numpy first as it is a compile time dependency for other packages
25 |     - pip install --upgrade numpy
26 |     - pip install --upgrade scipy matplotlib setuptools nose coverage sphinx pillow sphinx-gallery sphinx_rtd_theme
27 |     # Installing required packages for `make -C doc check command` to work.
28 |     - sudo -E apt-get -yq update
29 |     - sudo -E apt-get -yq --no-install-suggests --no-install-recommends --force-yes install dvipng texlive-latex-base texlive-latex-extra
30 |     - pip install --upgrade cython numpydoc
31 |     - pip install --upgrade scikit-learn
32 | 
33 |   # The --user is needed to let sphinx see the source and the binaries
34 |   # The pipefail is requested to propagate exit code
35 |   override:
36 |     - python setup.py clean
37 |     - python setup.py develop
38 |     - set -o pipefail && cd doc && make html 2>&1 | tee ~/log.txt
39 | test:
40 |   # Grep error on the documentation
41 |   override:
42 |     - cat ~/log.txt && if grep -q "Traceback (most recent call last):" ~/log.txt; then false; else true; fi
43 | deployment:
44 |   push:
45 |     branch: master
46 |     commands:
47 |       - bash ci_scripts/push_doc.sh
48 | general:
49 |   # Open the doc to the API
50 |   artifacts:
51 |     - "doc/_build/html"
52 |     - "~/log.txt"
53 |   # Restric the build to the branch master only
54 |   branches:
55 |     ignore:
56 |        - gh-pages
57 | 


--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
  1 | # Makefile for Sphinx documentation
  2 | #
  3 | 
  4 | # You can set these variables from the command line.
  5 | SPHINXOPTS    =
  6 | SPHINXBUILD   = sphinx-build
  7 | PAPER         =
  8 | BUILDDIR      = _build
  9 | 
 10 | # User-friendly check for sphinx-build
 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1)
 12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/)
 13 | endif
 14 | 
 15 | # Internal variables.
 16 | PAPEROPT_a4     = -D latex_paper_size=a4
 17 | PAPEROPT_letter = -D latex_paper_size=letter
 18 | ALLSPHINXOPTS   = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
 19 | # the i18n builder cannot share the environment and doctrees with the others
 20 | I18NSPHINXOPTS  = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
 21 | 
 22 | .PHONY: help
 23 | help:
 24 | 	@echo "Please use \`make <target>' where <target> is one of"
 25 | 	@echo "  html       to make standalone HTML files"
 26 | 	@echo "  dirhtml    to make HTML files named index.html in directories"
 27 | 	@echo "  singlehtml to make a single large HTML file"
 28 | 	@echo "  pickle     to make pickle files"
 29 | 	@echo "  json       to make JSON files"
 30 | 	@echo "  htmlhelp   to make HTML files and a HTML help project"
 31 | 	@echo "  qthelp     to make HTML files and a qthelp project"
 32 | 	@echo "  applehelp  to make an Apple Help Book"
 33 | 	@echo "  devhelp    to make HTML files and a Devhelp project"
 34 | 	@echo "  epub       to make an epub"
 35 | 	@echo "  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
 36 | 	@echo "  latexpdf   to make LaTeX files and run them through pdflatex"
 37 | 	@echo "  latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
 38 | 	@echo "  text       to make text files"
 39 | 	@echo "  man        to make manual pages"
 40 | 	@echo "  texinfo    to make Texinfo files"
 41 | 	@echo "  info       to make Texinfo files and run them through makeinfo"
 42 | 	@echo "  gettext    to make PO message catalogs"
 43 | 	@echo "  changes    to make an overview of all changed/added/deprecated items"
 44 | 	@echo "  xml        to make Docutils-native XML files"
 45 | 	@echo "  pseudoxml  to make pseudoxml-XML files for display purposes"
 46 | 	@echo "  linkcheck  to check all external links for integrity"
 47 | 	@echo "  doctest    to run all doctests embedded in the documentation (if enabled)"
 48 | 	@echo "  coverage   to run coverage check of the documentation (if enabled)"
 49 | 
 50 | .PHONY: clean
 51 | clean:
 52 | 	rm -rf $(BUILDDIR)/*
 53 | 
 54 | .PHONY: html
 55 | html:
 56 | 	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
 57 | 	@echo
 58 | 	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
 59 | 
 60 | .PHONY: dirhtml
 61 | dirhtml:
 62 | 	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
 63 | 	@echo
 64 | 	@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
 65 | 
 66 | .PHONY: singlehtml
 67 | singlehtml:
 68 | 	$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
 69 | 	@echo
 70 | 	@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
 71 | 
 72 | .PHONY: pickle
 73 | pickle:
 74 | 	$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
 75 | 	@echo
 76 | 	@echo "Build finished; now you can process the pickle files."
 77 | 
 78 | .PHONY: json
 79 | json:
 80 | 	$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
 81 | 	@echo
 82 | 	@echo "Build finished; now you can process the JSON files."
 83 | 
 84 | .PHONY: htmlhelp
 85 | htmlhelp:
 86 | 	$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
 87 | 	@echo
 88 | 	@echo "Build finished; now you can run HTML Help Workshop with the" \
 89 | 	      ".hhp project file in $(BUILDDIR)/htmlhelp."
 90 | 
 91 | .PHONY: qthelp
 92 | qthelp:
 93 | 	$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
 94 | 	@echo
 95 | 	@echo "Build finished; now you can run "qcollectiongenerator" with the" \
 96 | 	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
 97 | 	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/hdbscan.qhcp"
 98 | 	@echo "To view the help file:"
 99 | 	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/hdbscan.qhc"
100 | 
101 | .PHONY: applehelp
102 | applehelp:
103 | 	$(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp
104 | 	@echo
105 | 	@echo "Build finished. The help book is in $(BUILDDIR)/applehelp."
106 | 	@echo "N.B. You won't be able to view it unless you put it in" \
107 | 	      "~/Library/Documentation/Help or install it in your application" \
108 | 	      "bundle."
109 | 
110 | .PHONY: devhelp
111 | devhelp:
112 | 	$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
113 | 	@echo
114 | 	@echo "Build finished."
115 | 	@echo "To view the help file:"
116 | 	@echo "# mkdir -p $$HOME/.local/share/devhelp/hdbscan"
117 | 	@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/hdbscan"
118 | 	@echo "# devhelp"
119 | 
120 | .PHONY: epub
121 | epub:
122 | 	$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
123 | 	@echo
124 | 	@echo "Build finished. The epub file is in $(BUILDDIR)/epub."
125 | 
126 | .PHONY: latex
127 | latex:
128 | 	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
129 | 	@echo
130 | 	@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
131 | 	@echo "Run \`make' in that directory to run these through (pdf)latex" \
132 | 	      "(use \`make latexpdf' here to do that automatically)."
133 | 
134 | .PHONY: latexpdf
135 | latexpdf:
136 | 	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
137 | 	@echo "Running LaTeX files through pdflatex..."
138 | 	$(MAKE) -C $(BUILDDIR)/latex all-pdf
139 | 	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
140 | 
141 | .PHONY: latexpdfja
142 | latexpdfja:
143 | 	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
144 | 	@echo "Running LaTeX files through platex and dvipdfmx..."
145 | 	$(MAKE) -C $(BUILDDIR)/latex all-pdf-ja
146 | 	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
147 | 
148 | .PHONY: text
149 | text:
150 | 	$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
151 | 	@echo
152 | 	@echo "Build finished. The text files are in $(BUILDDIR)/text."
153 | 
154 | .PHONY: man
155 | man:
156 | 	$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
157 | 	@echo
158 | 	@echo "Build finished. The manual pages are in $(BUILDDIR)/man."
159 | 
160 | .PHONY: texinfo
161 | texinfo:
162 | 	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
163 | 	@echo
164 | 	@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
165 | 	@echo "Run \`make' in that directory to run these through makeinfo" \
166 | 	      "(use \`make info' here to do that automatically)."
167 | 
168 | .PHONY: info
169 | info:
170 | 	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
171 | 	@echo "Running Texinfo files through makeinfo..."
172 | 	make -C $(BUILDDIR)/texinfo info
173 | 	@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
174 | 
175 | .PHONY: gettext
176 | gettext:
177 | 	$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
178 | 	@echo
179 | 	@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
180 | 
181 | .PHONY: changes
182 | changes:
183 | 	$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
184 | 	@echo
185 | 	@echo "The overview file is in $(BUILDDIR)/changes."
186 | 
187 | .PHONY: linkcheck
188 | linkcheck:
189 | 	$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
190 | 	@echo
191 | 	@echo "Link check complete; look for any errors in the above output " \
192 | 	      "or in $(BUILDDIR)/linkcheck/output.txt."
193 | 
194 | .PHONY: doctest
195 | doctest:
196 | 	$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
197 | 	@echo "Testing of doctests in the sources finished, look at the " \
198 | 	      "results in $(BUILDDIR)/doctest/output.txt."
199 | 
200 | .PHONY: coverage
201 | coverage:
202 | 	$(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage
203 | 	@echo "Testing of coverage in the sources finished, look at the " \
204 | 	      "results in $(BUILDDIR)/coverage/python.txt."
205 | 
206 | .PHONY: xml
207 | xml:
208 | 	$(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml
209 | 	@echo
210 | 	@echo "Build finished. The XML files are in $(BUILDDIR)/xml."
211 | 
212 | .PHONY: pseudoxml
213 | pseudoxml:
214 | 	$(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml
215 | 	@echo
216 | 	@echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml."
217 | 


--------------------------------------------------------------------------------
/docs/advanced_hdbscan.rst:
--------------------------------------------------------------------------------
  1 | 
  2 | Getting More Information About a Clustering
  3 | ===========================================
  4 | 
  5 | Once you have the basics of clustering sorted you may want to dig a
  6 | little deeper than just the cluster labels returned to you. Fortunately, the hdbscan library provides you with the facilities to do this. During
  7 | processing HDBSCAN\* builds a hierarchy of potential clusters, from
  8 | which it extracts the flat clustering returned. It can be informative to
  9 | look at that hierarchy, and potentially make use of the extra
 10 | information contained therein.
 11 | 
 12 | Suppose we have a dataset for clustering. It is a binary file in NumPy format and it can be found at https://github.com/lmcinnes/hdbscan/blob/master/notebooks/clusterable_data.npy.
 13 | 
 14 | .. code:: python
 15 | 
 16 |     import hdbscan
 17 |     import numpy as np
 18 |     import matplotlib.pyplot as plt
 19 |     import seaborn as sns
 20 |     %matplotlib inline
 21 |     
 22 | .. code:: python
 23 | 
 24 |     data = np.load('clusterable_data.bin')
 25 |     #or
 26 |     data = np.load('clusterable_data.npy')
 27 |     #depending on the format of the file
 28 |     
 29 | .. code:: python
 30 | 
 31 |     data.shape
 32 |     
 33 | .. parsed-literal::
 34 | 
 35 |     (2309, 2)
 36 |     
 37 | .. code:: python
 38 | 
 39 |     data
 40 |     
 41 | .. parsed-literal::
 42 | 
 43 |     array([[-0.12153499, -0.22876337],
 44 |        [-0.22093687, -0.25251088],
 45 |        [ 0.1259037 , -0.27314321],
 46 |        ..., 
 47 |        [ 0.50243143, -0.3002958 ],
 48 |        [ 0.53822256,  0.19412199],
 49 |        [-0.08688887, -0.2092721 ]])
 50 | 
 51 |     
 52 | .. code:: python
 53 | 
 54 |     plt.scatter(*data.T, s=50, linewidth=0, c='b', alpha=0.25)
 55 |     
 56 | .. parsed-literal::
 57 | 
 58 |     <matplotlib.collections.PathCollection at 0x7f6b61ad6e10>
 59 | 
 60 | .. image:: images/advanced_hdbscan_3_1.png
 61 | 
 62 | 
 63 | We can cluster the data as normal, and visualize the labels with
 64 | different colors (and even the cluster membership strengths as levels of
 65 | saturation)
 66 | 
 67 | .. code:: python
 68 | 
 69 |     clusterer = hdbscan.HDBSCAN(min_cluster_size=15).fit(data)
 70 |     color_palette = sns.color_palette('deep', 8)
 71 |     cluster_colors = [color_palette[x] if x >= 0 
 72 |                       else (0.5, 0.5, 0.5) 
 73 |                       for x in clusterer.labels_]
 74 |     cluster_member_colors = [sns.desaturate(x, p) for x, p in 
 75 |                              zip(cluster_colors, clusterer.probabilities_)]
 76 |     plt.scatter(*data.T, s=50, linewidth=0, c=cluster_member_colors, alpha=0.25)
 77 | 
 78 | 
 79 | .. image:: images/advanced_hdbscan_5_1.png
 80 | 
 81 | Condensed Trees
 82 | ---------------
 83 | 
 84 | The question now is what does the cluster hierarchy look like -- which
 85 | clusters are near each other, or could perhaps be merged, and which are
 86 | far apart. We can access the basic hierarchy via the :py:attr:`~hdbscan.HDBSCAN.condensed_tree_`
 87 | attribute of the clusterer object.
 88 | 
 89 | .. code:: python
 90 | 
 91 |     clusterer.condensed_tree_
 92 | 
 93 | 
 94 | 
 95 | 
 96 | .. parsed-literal::
 97 | 
 98 |     <hdbscan.plots.CondensedTree at 0x10ea23a20>
 99 | 
100 | 
101 | 
102 | This merely gives us a :class:`~hdbscan.plots.CondensedTree` object. If we want to visualize the
103 | hierarchy we can call the :py:meth:`~hdbscan.plots.CondensedTree.plot` method:
104 | 
105 | .. code:: python
106 | 
107 |     clusterer.condensed_tree_.plot()
108 | 
109 | 
110 | .. image:: images/advanced_hdbscan_9_1.png
111 | 
112 | 
113 | We can now see the hierarchy as a dendrogram, the width (and color) of
114 | each branch representing the number of points in the cluster at that
115 | level. If we wish to know which branches were selected by the HDBSCAN\*
116 | algorithm we can pass ``select_clusters=True``. You can even pass a
117 | selection palette to color the selections according to the cluster
118 | labeling.
119 | 
120 | .. code:: python
121 | 
122 |     clusterer.condensed_tree_.plot(select_clusters=True, 
123 |                                    selection_palette=sns.color_palette('deep', 8))
124 | 
125 | 
126 | .. image:: images/advanced_hdbscan_11_1.png
127 | 
128 | 
129 | From this, we can see, for example, that the yellow cluster at the
130 | center of the plot forms early (breaking off from the pale blue and
131 | purple clusters) and persists for a long time. By comparison the green
132 | cluster, which also forms early, quickly breaks apart and then
133 | vanishes altogether (shattering into clusters all smaller than the
134 | ``min_cluster_size`` of 15).
135 | 
136 | You can also see that the pale blue cluster breaks apart into several
137 | subclusters that in turn persist for quite some time -- so there is some
138 | interesting substructure to the pale blue cluster that is not present,
139 | for example, in the dark blue cluster.
140 | 
141 | If this was a simple visual analysis of the condensed tree can tell you
142 | a lot more about the structure of your data. This is not all we can do
143 | with condensed trees, however. For larger and more complex datasets the
144 | tree itself may be very complex, and it may be desirable to run more
145 | interesting analytics over the tree itself. This can be achieved via
146 | several converter methods: :py:meth:`~hdbscan.plots.CondensedTree.to_networkx`, :py:meth:`~hdbscan.plots.CondensedTree.to_pandas`, and
147 | :py:meth:`~hdbscan.plots.CondensedTree.to_numpy`.
148 | 
149 | First we'll consider :py:meth:`~hdbscan.plots.CondensedTree.to_networkx`
150 | 
151 | .. code:: python
152 | 
153 |     clusterer.condensed_tree_.to_networkx()
154 | 
155 | 
156 | 
157 | 
158 | .. parsed-literal::
159 | 
160 |     <networkx.classes.digraph.DiGraph at 0x11d8023c8>
161 | 
162 | 
163 | 
164 | As you can see we get a NetworkX directed graph, which we can then use
165 | all the regular NetworkX tools and analytics on. The graph is richer
166 | than the visual plot above may lead you to believe, however:
167 | 
168 | .. code:: python
169 | 
170 |     g = clusterer.condensed_tree_.to_networkx()
171 |     g.number_of_nodes()
172 | 
173 | 
174 | 
175 | 
176 | .. parsed-literal::
177 | 
178 |     2338
179 | 
180 | 
181 | 
182 | The graph actually contains nodes for all the points falling out of
183 | clusters as well as the clusters themselves. Each node has an associated
184 | ``size`` attribute and each edge has a ``weight`` of the lambda value
185 | at which that edge forms. This allows for much more interesting
186 | analyses.
187 | 
188 | Next, we have the :py:meth:`~hdbscan.plots.CondensedTree.to_pandas` method, which returns a panda DataFrame
189 | where each row corresponds to an edge of the NetworkX graph:
190 | 
191 | .. code:: python
192 | 
193 |     clusterer.condensed_tree_.to_pandas().head()
194 | 
195 | 
196 | 
197 | 
198 | .. raw:: html
199 | 
200 |     <div>
201 |     <table border="1" class="dataframe">
202 |       <thead>
203 |         <tr style="text-align: right;">
204 |           <th></th>
205 |           <th>parent</th>
206 |           <th>child</th>
207 |           <th>lambda_val</th>
208 |           <th>child_size</th>
209 |         </tr>
210 |       </thead>
211 |       <tbody>
212 |         <tr>
213 |           <th>0</th>
214 |           <td>2309</td>
215 |           <td>2048</td>
216 |           <td>5.016526</td>
217 |           <td>1</td>
218 |         </tr>
219 |         <tr>
220 |           <th>1</th>
221 |           <td>2309</td>
222 |           <td>2006</td>
223 |           <td>5.076503</td>
224 |           <td>1</td>
225 |         </tr>
226 |         <tr>
227 |           <th>2</th>
228 |           <td>2309</td>
229 |           <td>2024</td>
230 |           <td>5.279133</td>
231 |           <td>1</td>
232 |         </tr>
233 |         <tr>
234 |           <th>3</th>
235 |           <td>2309</td>
236 |           <td>2050</td>
237 |           <td>5.347332</td>
238 |           <td>1</td>
239 |         </tr>
240 |         <tr>
241 |           <th>4</th>
242 |           <td>2309</td>
243 |           <td>1992</td>
244 |           <td>5.381930</td>
245 |           <td>1</td>
246 |         </tr>
247 |       </tbody>
248 |     </table>
249 |     </div>
250 | 
251 | 
252 | 
253 | 
254 | 
255 | Here the ``parent`` denotes the id of the parent cluster, the ``child``
256 | the id of the child cluster (or, if the child is a single data point
257 | rather than a cluster, the index in the dataset of that point), the
258 | ``lambda_val`` provides the lambda value at which the edge forms, and
259 | the ``child_size`` provides the number of points in the child cluster.
260 | As you can see the start of the DataFrame has singleton points falling
261 | out of the root cluster, with each ``child_size`` equal to 1.
262 | 
263 | If you want just the clusters, rather than all the individual points
264 | as well, simply select the rows of the DataFrame with ``child_size``
265 | greater than 1.
266 | 
267 | .. code:: python
268 | 
269 |     tree = clusterer.condensed_tree_.to_pandas()
270 |     cluster_tree = tree[tree.child_size > 1]
271 | 
272 | 
273 | 
274 | Finally we have the :py:meth:`~hdbscan.plots.CondensedTree.to_numpy` function, which returns a numpy record
275 | array:
276 | 
277 | .. code:: python
278 | 
279 |     clusterer.condensed_tree_.to_numpy()
280 | 
281 | 
282 | 
283 | 
284 | .. parsed-literal::
285 | 
286 |     array([(2309, 2048, 5.016525967983049, 1),
287 |            (2309, 2006, 5.076503128308643, 1),
288 |            (2309, 2024, 5.279133057912248, 1), ...,
289 |            (2318, 1105, 86.5507370650292, 1), (2318, 965, 86.5507370650292, 1),
290 |            (2318, 954, 86.5507370650292, 1)], 
291 |           dtype=[('parent', '<i8'), ('child', '<i8'), ('lambda_val', '<f8'), ('child_size', '<i8')])
292 | 
293 | 
294 | 
295 | This is equivalent to the pandas DataFrame but is in pure NumPy and
296 | hence has no pandas dependencies if you do not wish to use pandas.
297 | 
298 | Single Linkage Trees
299 | --------------------
300 | 
301 | We have still more data at our disposal, however. As noted in the How
302 | HDBSCAN Works section, prior to providing a condensed tree the algorithm
303 | builds a complete dendrogram. We have access to this too via the
304 | :py:attr:`~hdbscan.HDBSCAN.single_linkage_tree_` attribute of the clusterer.
305 | 
306 | .. code:: python
307 | 
308 |     clusterer.single_linkage_tree_
309 | 
310 | 
311 | 
312 | 
313 | .. parsed-literal::
314 | 
315 |     <hdbscan.plots.SingleLinkageTree at 0x121d4b128>
316 | 
317 | 
318 | 
319 | Again we have an object which we can then query for relevant
320 | information. The most basic approach is the :py:meth:`~hdbscan.plots.SingleLinkageTree.plot` method, just like
321 | the condensed tree.
322 | 
323 | .. code:: python
324 | 
325 |     clusterer.single_linkage_tree_.plot()
326 | 
327 | 
328 | .. image:: images/advanced_hdbscan_26_1.png
329 | 
330 | 
331 | As you can see we gain a lot from condensing the tree in terms of better
332 | presenting and summarising the data. There is a lot less to be gained
333 | from visual inspection of a plot like this (and it only gets worse for
334 | larger datasets). The plot function support most of the same
335 | functionality as the dendrogram plotting from
336 | ``scipy.cluster.hierarchy``, so you can view various truncations of the
337 | tree if necessary. In practice, however, you are more likely to be
338 | interested in access the raw data for further analysis. Again we have
339 | :py:meth:`~hdbscan.plots.SingleLinkageTree.to_networkx`, :py:meth:`~hdbscan.plots.SingleLinkageTree.to_pandas` and :py:meth:`~hdbscan.plots.SingleLinkageTree.to_numpy`. This time the
340 | :py:meth:`~hdbscan.plots.SingleLinkageTree.to_networkx` provides a direct NetworkX version of what you see
341 | above. The NumPy and pandas results conform to the single linkage
342 | hierarchy format of ``scipy.cluster.hierarchy``, and can be passed to
343 | routines there if necessary.
344 | 
345 | If you wish to know what the clusters are at a given fixed level of the
346 | single linkage tree you can use the :py:meth:`~hdbscan.plots.SingleLinkageTree.get_clusters` method to extract
347 | a vector of cluster labels. The method takes a cut value of the level
348 | at which to cut the tree, and a ``minimum_cluster_size`` to determine
349 | noise points (any cluster smaller than the ``minimum_cluster_size``).
350 | 
351 | .. code:: python
352 | 
353 |     clusterer.single_linkage_tree_.get_clusters(0.023, min_cluster_size=2)
354 | 
355 | 
356 | 
357 | .. parsed-literal::
358 | 
359 |     array([ 0, -1,  0, ..., -1, -1,  0])
360 | 
361 | 
362 | In this way, it is possible to extract the DBSCAN clustering that would result
363 | for any given epsilon value, all from one run of hdbscan.
364 | 
365 | 


--------------------------------------------------------------------------------
/docs/api.rst:
--------------------------------------------------------------------------------
 1 | API Reference
 2 | =============
 3 | 
 4 | Major classes are :class:`HDBSCAN` and :class:`RobustSingleLinkage`.
 5 | 
 6 | HDBSCAN
 7 | -------
 8 | 
 9 | .. autoclass:: hdbscan.hdbscan_.HDBSCAN
10 |    :members:
11 | 
12 | RobustSingleLinkage
13 | -------------------
14 | 
15 | .. autoclass:: hdbscan.robust_single_linkage_.RobustSingleLinkage
16 |    :members:
17 | 
18 | 
19 | Utilities
20 | ---------
21 | 
22 | Other useful classes are contained in the plots module, the validity module,
23 | and the prediction module.
24 | 
25 | .. autoclass:: hdbscan.plots.CondensedTree
26 |    :members:
27 | 
28 | .. autoclass:: hdbscan.plots.SingleLinkageTree
29 |    :members:
30 | 
31 | .. autoclass:: hdbscan.plots.MinimumSpanningTree
32 |    :members:
33 | 
34 | .. automodule:: hdbscan.validity
35 |    :members:
36 | 
37 | .. automodule:: hdbscan.prediction
38 |    :members:
39 | 
40 | 
41 | Branch detection
42 | ----------------
43 | 
44 | The branches module contains classes for detecting branches within clusters.
45 | 
46 | .. automodule:: hdbscan.branches
47 |    :members: BranchDetector, detect_branches_in_clusters, approximate_predict_branch
48 | 
49 | .. autoclass:: hdbscan.plots.ApproximationGraph
50 |    :members:
51 | 


--------------------------------------------------------------------------------
/docs/basic_hdbscan.rst:
--------------------------------------------------------------------------------
  1 | 
  2 | Basic Usage of HDBSCAN\* for Clustering
  3 | =======================================
  4 | 
  5 | We have some data, and we want to cluster it. How exactly do we do that,
  6 | and what do the results look like? If you are very familiar with sklearn
  7 | and its API, particularly for clustering, then you can probably skip
  8 | this tutorial -- ``hdbscan`` implements exactly this API, so you can use
  9 | it just as you would any other sklearn clustering algorithm. If, on the
 10 | other hand, you aren't that familiar with sklearn, fear not, and read
 11 | on. Let's start with the simplest case first -- we have data in a nice
 12 | tidy dataframe format.
 13 | 
 14 | The Simple Case
 15 | ---------------
 16 | 
 17 | Let's generate some data with, say 2000 samples, and 10 features. We can
 18 | put it in a dataframe for a nice clean table view of it.
 19 | 
 20 | .. code:: python
 21 | 
 22 |     from sklearn.datasets import make_blobs
 23 |     import pandas as pd
 24 |     
 25 | .. code:: python
 26 | 
 27 |     blobs, labels = make_blobs(n_samples=2000, n_features=10)
 28 | 
 29 | .. code:: python
 30 | 
 31 |     pd.DataFrame(blobs).head()
 32 | 
 33 | 
 34 | .. raw:: html
 35 | 
 36 |     <div>
 37 |     <table border="1" class="dataframe">
 38 |       <thead>
 39 |         <tr style="text-align: right;">
 40 |           <th></th>
 41 |           <th>0</th>
 42 |           <th>1</th>
 43 |           <th>2</th>
 44 |           <th>3</th>
 45 |           <th>4</th>
 46 |           <th>5</th>
 47 |           <th>6</th>
 48 |           <th>7</th>
 49 |           <th>8</th>
 50 |           <th>9</th>
 51 |         </tr>
 52 |       </thead>
 53 |       <tbody>
 54 |         <tr>
 55 |           <th>0</th>
 56 |           <td>-3.370804</td>
 57 |           <td>8.487688</td>
 58 |           <td>4.631243</td>
 59 |           <td>-10.181475</td>
 60 |           <td>9.146487</td>
 61 |           <td>-8.070935</td>
 62 |           <td>-1.612017</td>
 63 |           <td>-2.418106</td>
 64 |           <td>-8.975390</td>
 65 |           <td>-1.769952</td>
 66 |         </tr>
 67 |         <tr>
 68 |           <th>1</th>
 69 |           <td>-4.092931</td>
 70 |           <td>8.409841</td>
 71 |           <td>3.362516</td>
 72 |           <td>-9.748945</td>
 73 |           <td>9.556615</td>
 74 |           <td>-9.240307</td>
 75 |           <td>-2.038291</td>
 76 |           <td>-3.129068</td>
 77 |           <td>-7.109673</td>
 78 |           <td>-0.993827</td>
 79 |         </tr>
 80 |         <tr>
 81 |           <th>2</th>
 82 |           <td>-4.604753</td>
 83 |           <td>9.616391</td>
 84 |           <td>4.631508</td>
 85 |           <td>-11.166361</td>
 86 |           <td>10.888212</td>
 87 |           <td>-8.427564</td>
 88 |           <td>-3.929517</td>
 89 |           <td>-4.563951</td>
 90 |           <td>-8.886373</td>
 91 |           <td>-1.995063</td>
 92 |         </tr>
 93 |         <tr>
 94 |           <th>3</th>
 95 |           <td>-6.889866</td>
 96 |           <td>-7.801482</td>
 97 |           <td>-6.974958</td>
 98 |           <td>-8.570025</td>
 99 |           <td>5.438101</td>
100 |           <td>-5.097457</td>
101 |           <td>-4.941206</td>
102 |           <td>-5.926394</td>
103 |           <td>-10.145152</td>
104 |           <td>0.219269</td>
105 |         </tr>
106 |         <tr>
107 |           <th>4</th>
108 |           <td>5.339728</td>
109 |           <td>2.791309</td>
110 |           <td>0.611464</td>
111 |           <td>-2.929875</td>
112 |           <td>-7.694973</td>
113 |           <td>7.776050</td>
114 |           <td>-1.218101</td>
115 |           <td>0.408141</td>
116 |           <td>-4.563975</td>
117 |           <td>-1.309128</td>
118 |         </tr>
119 |       </tbody>
120 |     </table>
121 |     </div>
122 | 
123 | 
124 | 
125 | So now we need to import the hdbscan library.
126 | 
127 | .. code:: python
128 | 
129 |     import hdbscan
130 | 
131 | Now, to cluster we need to generate a clustering object.
132 | 
133 | .. code:: python
134 | 
135 |     clusterer = hdbscan.HDBSCAN()
136 | 
137 | We can then use this clustering object and fit it to the data we have.
138 | This will return the clusterer object back to you -- just in case you
139 | want do some method chaining.
140 | 
141 | .. code:: python
142 | 
143 |     clusterer.fit(blobs)
144 | 
145 | 
146 | .. parsed-literal::
147 | 
148 |     HDBSCAN(algorithm='best', alpha=1.0, approx_min_span_tree=True,
149 |         gen_min_span_tree=False, leaf_size=40, memory=Memory(None),
150 |         metric='euclidean', min_cluster_size=5, min_samples=None, p=None)
151 | 
152 | 
153 | 
154 | At this point we are actually done! We've done the clustering! But where
155 | are the results? How do I get the clusters? The clusterer object knows,
156 | and stores the result in an attribute ``labels_``.
157 | 
158 | .. code:: python
159 | 
160 |     clusterer.labels_
161 | 
162 | 
163 | .. parsed-literal::
164 | 
165 |     array([2, 2, 2, ..., 2, 2, 0])
166 | 
167 | 
168 | 
169 | So it is an array of integers. What are we to make of that? It is an
170 | array with an integer for each data sample. Samples that are in the same
171 | cluster get assigned the same number. The cluster labels start at 0 and count
172 | up. We can thus determine the number of clusters found by finding the largest
173 | cluster label.
174 | 
175 | .. code:: python
176 | 
177 |     clusterer.labels_.max()
178 | 
179 | 
180 | .. parsed-literal::
181 | 
182 |     2
183 | 
184 | So we have a total of three clusters, with labels 0, 1, and 2.
185 | Importantly HDBSCAN is noise aware -- it has a notion of data samples
186 | that are not assigned to any cluster. This is handled by assigning these
187 | samples the label -1. But wait, there's more. The ``hdbscan`` library
188 | implements soft clustering, where each data point is assigned a cluster
189 | membership score ranging from 0.0 to 1.0. A score of 0.0 represents a
190 | sample that is not in the cluster at all (all noise points will get this
191 | score) while a score of 1.0 represents a sample that is at the heart of
192 | the cluster (note that this is not the spatial centroid notion of core).
193 | You can access these scores via the ``probabilities_`` attribute.
194 | 
195 | .. code:: python
196 | 
197 |     clusterer.probabilities_
198 | 
199 | 
200 | .. parsed-literal::
201 | 
202 |     array([ 0.83890858,  1.        ,  0.72629904, ...,  0.79456452,
203 |             0.65311137,  0.76382928])
204 | 
205 | 
206 | 
207 | What about different metrics?
208 | -----------------------------
209 | 
210 | That is all well and good, but even data that is embedded in a vector
211 | space may not want to consider distances between data points to be pure
212 | Euclidean distance. What can we do in that case? We are still in good
213 | shape, since ``hdbscan`` supports a wide variety of metrics, which you
214 | can set when creating the clusterer object. For example we can do the
215 | following:
216 | 
217 | .. code:: python
218 | 
219 |     clusterer = hdbscan.HDBSCAN(metric='manhattan')
220 |     clusterer.fit(blobs)
221 |     clusterer.labels_
222 | 
223 | 
224 | 
225 | 
226 | .. parsed-literal::
227 | 
228 |     array([1, 1, 1, ..., 1, 1, 0])
229 | 
230 | 
231 | 
232 | What metrics are supported? Because we simply steal metric computations
233 | from sklearn we get a large number of metrics readily available.
234 | 
235 | .. code:: python
236 | 
237 |     hdbscan.dist_metrics.METRIC_MAPPING
238 | 
239 | 
240 | 
241 | 
242 | .. parsed-literal::
243 | 
244 |     {'braycurtis': hdbscan.dist_metrics.BrayCurtisDistance,
245 |      'canberra': hdbscan.dist_metrics.CanberraDistance,
246 |      'chebyshev': hdbscan.dist_metrics.ChebyshevDistance,
247 |      'cityblock': hdbscan.dist_metrics.ManhattanDistance,
248 |      'dice': hdbscan.dist_metrics.DiceDistance,
249 |      'euclidean': hdbscan.dist_metrics.EuclideanDistance,
250 |      'hamming': hdbscan.dist_metrics.HammingDistance,
251 |      'haversine': hdbscan.dist_metrics.HaversineDistance,
252 |      'infinity': hdbscan.dist_metrics.ChebyshevDistance,
253 |      'jaccard': hdbscan.dist_metrics.JaccardDistance,
254 |      'kulsinski': hdbscan.dist_metrics.KulsinskiDistance,
255 |      'l1': hdbscan.dist_metrics.ManhattanDistance,
256 |      'l2': hdbscan.dist_metrics.EuclideanDistance,
257 |      'mahalanobis': hdbscan.dist_metrics.MahalanobisDistance,
258 |      'manhattan': hdbscan.dist_metrics.ManhattanDistance,
259 |      'matching': hdbscan.dist_metrics.MatchingDistance,
260 |      'minkowski': hdbscan.dist_metrics.MinkowskiDistance,
261 |      'p': hdbscan.dist_metrics.MinkowskiDistance,
262 |      'pyfunc': hdbscan.dist_metrics.PyFuncDistance,
263 |      'rogerstanimoto': hdbscan.dist_metrics.RogersTanimotoDistance,
264 |      'russellrao': hdbscan.dist_metrics.RussellRaoDistance,
265 |      'seuclidean': hdbscan.dist_metrics.SEuclideanDistance,
266 |      'sokalmichener': hdbscan.dist_metrics.SokalMichenerDistance,
267 |      'sokalsneath': hdbscan.dist_metrics.SokalSneathDistance,
268 |      'wminkowski': hdbscan.dist_metrics.WMinkowskiDistance}
269 | 
270 | 
271 | 
272 | Distance matrices
273 | -----------------
274 | 
275 | What if you don't have a nice set of points in a vector space, but only
276 | have a pairwise distance matrix providing the distance between each pair
277 | of points? This is a common situation. Perhaps you have a complex custom
278 | distance measure; perhaps you have strings and are using Levenshtein
279 | distance, etc. Again, this is all fine as ``hdbscan`` supports a special
280 | metric called ``precomputed``. If you create the clusterer with the
281 | metric set to ``precomputed`` then the clusterer will assume that,
282 | rather than being handed a vector of points in a vector space, it is
283 | receiving an all-pairs distance matrix. Missing distances can be
284 | indicated by ``numpy.inf``, which leads HDBSCAN to ignore these pairwise
285 | relationships as long as there exists a path between two points that
286 | contains defined distances (i.e. if there are too many distances
287 | missing, the clustering is going to fail).
288 | 
289 | NOTE: The input vector _must_ contain numerical data. If you have a 
290 | distance matrix for non-numerical vectors, you will need to map your
291 | input vectors to numerical vectors. (e.g use map ['A', 'G', 'C', 'T']->
292 | [ 1, 2, 3, 4] to replace input vector ['A', 'A', 'A', 'C', 'G'] with
293 | [ 1, 1, 1, 3, 2])
294 | 
295 | .. code:: python
296 | 
297 |     from sklearn.metrics.pairwise import pairwise_distances
298 | 
299 | .. code:: python
300 | 
301 |     distance_matrix = pairwise_distances(blobs)
302 |     clusterer = hdbscan.HDBSCAN(metric='precomputed')
303 |     clusterer.fit(distance_matrix)
304 |     clusterer.labels_
305 | 
306 | 
307 | 
308 | 
309 | .. parsed-literal::
310 | 
311 |     array([1, 1, 1, ..., 1, 1, 2])
312 | 
313 | 
314 | 
315 | Note that this result only appears different due to a different
316 | labelling order for the clusters.
317 | 
318 | 


--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | #
  3 | # hdbscan documentation build configuration file, created by
  4 | # sphinx-quickstart on Sat May 28 10:34:44 2016.
  5 | #
  6 | # This file is execfile()d with the current directory set to its
  7 | # containing dir.
  8 | #
  9 | # Note that not all possible configuration values are present in this
 10 | # autogenerated file.
 11 | #
 12 | # All configuration values have a default; values that are commented out
 13 | # serve to show the default.
 14 | 
 15 | import sys
 16 | import os
 17 | import sphinx_rtd_theme
 18 | 
 19 | # If extensions (or modules to document with autodoc) are in another directory,
 20 | # add these directories to sys.path here. If the directory is relative to the
 21 | # documentation root, use os.path.abspath to make it absolute, like shown here.
 22 | 
 23 | ### We now install the package in a virtualenv to build docs, so this is not needed
 24 | # sys.path.insert(0, os.path.abspath('../'))
 25 | 
 26 | # -- General configuration ------------------------------------------------
 27 | 
 28 | # If your documentation needs a minimal Sphinx version, state it here.
 29 | #needs_sphinx = '1.0'
 30 | 
 31 | # Add any Sphinx extension module names here, as strings. They can be
 32 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
 33 | # ones.
 34 | extensions = [
 35 |     'sphinx.ext.autodoc',
 36 |     'sphinx.ext.doctest',
 37 |     'sphinx.ext.todo',
 38 |     'sphinx.ext.coverage',
 39 |     'sphinx.ext.imgmath',
 40 |     'sphinx.ext.viewcode',
 41 |     # 'sphinx.ext.napoleon',
 42 |     # 'numpy_ext.numpydoc'
 43 | ]
 44 | #napoleon_google_docstring = False
 45 | #napoleon_numpy_docstring = True
 46 | 
 47 | # Add any paths that contain templates here, relative to this directory.
 48 | templates_path = ['_templates']
 49 | 
 50 | # The suffix(es) of source filenames.
 51 | # You can specify multiple suffix as a list of string:
 52 | # source_suffix = ['.rst', '.md']
 53 | source_suffix = '.rst'
 54 | 
 55 | # The encoding of source files.
 56 | #source_encoding = 'utf-8-sig'
 57 | 
 58 | # The master toctree document.
 59 | master_doc = 'index'
 60 | 
 61 | # General information about the project.
 62 | project = u'hdbscan'
 63 | copyright = u'2016, Leland McInnes, John Healy, Steve Astels'
 64 | author = u'Leland McInnes, John Healy, Steve Astels'
 65 | 
 66 | # The version info for the project you're documenting, acts as replacement for
 67 | # |version| and |release|, also used in various other places throughout the
 68 | # built documents.
 69 | #
 70 | # The short X.Y version.
 71 | version = u'0.8.1'
 72 | # The full version, including alpha/beta/rc tags.
 73 | release = u'0.8.1'
 74 | 
 75 | # The language for content autogenerated by Sphinx. Refer to documentation
 76 | # for a list of supported languages.
 77 | #
 78 | # This is also used if you do content translation via gettext catalogs.
 79 | # Usually you set "language" from the command line for these cases.
 80 | language = None
 81 | 
 82 | # There are two options for replacing |today|: either, you set today to some
 83 | # non-false value, then it is used:
 84 | #today = ''
 85 | # Else, today_fmt is used as the format for a strftime call.
 86 | #today_fmt = '%B %d, %Y'
 87 | 
 88 | # List of patterns, relative to source directory, that match files and
 89 | # directories to ignore when looking for source files.
 90 | exclude_patterns = ['_build']
 91 | 
 92 | # The reST default role (used for this markup: `text`) to use for all
 93 | # documents.
 94 | #default_role = None
 95 | 
 96 | # If true, '()' will be appended to :func: etc. cross-reference text.
 97 | #add_function_parentheses = True
 98 | 
 99 | # If true, the current module name will be prepended to all description
100 | # unit titles (such as .. function::).
101 | #add_module_names = True
102 | 
103 | # If true, sectionauthor and moduleauthor directives will be shown in the
104 | # output. They are ignored by default.
105 | #show_authors = False
106 | 
107 | # The name of the Pygments (syntax highlighting) style to use.
108 | pygments_style = 'sphinx'
109 | 
110 | # A list of ignored prefixes for module index sorting.
111 | #modindex_common_prefix = []
112 | 
113 | # If true, keep warnings as "system message" paragraphs in the built documents.
114 | #keep_warnings = False
115 | 
116 | # If true, `todo` and `todoList` produce output, else they produce nothing.
117 | todo_include_todos = True
118 | 
119 | 
120 | # -- Options for HTML output ----------------------------------------------
121 | 
122 | # The theme to use for HTML and HTML Help pages.  See the documentation for
123 | # a list of builtin themes.
124 | #html_theme = 'alabaster'
125 | html_theme = 'sphinx_rtd_theme'
126 | 
127 | # Theme options are theme-specific and customize the look and feel of a theme
128 | # further.  For a list of options available for each theme, see the
129 | # documentation.
130 | #html_theme_options = {}
131 | 
132 | # Add any paths that contain custom themes here, relative to this directory.
133 | #html_theme_path = []
134 | html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
135 | 
136 | # The name for this set of Sphinx documents.  If None, it defaults to
137 | # "<project> v<release> documentation".
138 | #html_title = None
139 | 
140 | # A shorter title for the navigation bar.  Default is the same as html_title.
141 | #html_short_title = None
142 | 
143 | # The name of an image file (relative to this directory) to place at the top
144 | # of the sidebar.
145 | #html_logo = None
146 | 
147 | # The name of an image file (within the static path) to use as favicon of the
148 | # docs.  This file should be a Windows icon file (.ico) being 16x16 or 32x32
149 | # pixels large.
150 | #html_favicon = None
151 | 
152 | # Add any paths that contain custom static files (such as style sheets) here,
153 | # relative to this directory. They are copied after the builtin static files,
154 | # so a file named "default.css" will overwrite the builtin "default.css".
155 | html_static_path = ['_static']
156 | 
157 | # Add any extra paths that contain custom files (such as robots.txt or
158 | # .htaccess) here, relative to this directory. These files are copied
159 | # directly to the root of the documentation.
160 | #html_extra_path = []
161 | 
162 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
163 | # using the given strftime format.
164 | #html_last_updated_fmt = '%b %d, %Y'
165 | 
166 | # If true, SmartyPants will be used to convert quotes and dashes to
167 | # typographically correct entities.
168 | #html_use_smartypants = True
169 | 
170 | # Custom sidebar templates, maps document names to template names.
171 | #html_sidebars = {}
172 | 
173 | # Additional templates that should be rendered to pages, maps page names to
174 | # template names.
175 | #html_additional_pages = {}
176 | 
177 | # If false, no module index is generated.
178 | #html_domain_indices = True
179 | 
180 | # If false, no index is generated.
181 | #html_use_index = True
182 | 
183 | # If true, the index is split into individual pages for each letter.
184 | #html_split_index = False
185 | 
186 | # If true, links to the reST sources are added to the pages.
187 | #html_show_sourcelink = True
188 | 
189 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
190 | #html_show_sphinx = True
191 | 
192 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
193 | #html_show_copyright = True
194 | 
195 | # If true, an OpenSearch description file will be output, and all pages will
196 | # contain a <link> tag referring to it.  The value of this option must be the
197 | # base URL from which the finished HTML is served.
198 | #html_use_opensearch = ''
199 | 
200 | # This is the file name suffix for HTML files (e.g. ".xhtml").
201 | #html_file_suffix = None
202 | 
203 | # Language to be used for generating the HTML full-text search index.
204 | # Sphinx supports the following languages:
205 | #   'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja'
206 | #   'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr'
207 | #html_search_language = 'en'
208 | 
209 | # A dictionary with options for the search language support, empty by default.
210 | # Now only 'ja' uses this config value
211 | #html_search_options = {'type': 'default'}
212 | 
213 | # The name of a javascript file (relative to the configuration directory) that
214 | # implements a search results scorer. If empty, the default will be used.
215 | #html_search_scorer = 'scorer.js'
216 | 
217 | # Output file base name for HTML help builder.
218 | htmlhelp_basename = 'hdbscandoc'
219 | 
220 | # -- Options for LaTeX output ---------------------------------------------
221 | 
222 | latex_elements = {
223 | # The paper size ('letterpaper' or 'a4paper').
224 | #'papersize': 'letterpaper',
225 | 
226 | # The font size ('10pt', '11pt' or '12pt').
227 | #'pointsize': '10pt',
228 | 
229 | # Additional stuff for the LaTeX preamble.
230 | #'preamble': '',
231 | 
232 | # Latex figure (float) alignment
233 | #'figure_align': 'htbp',
234 | }
235 | 
236 | # Grouping the document tree into LaTeX files. List of tuples
237 | # (source start file, target name, title,
238 | #  author, documentclass [howto, manual, or own class]).
239 | latex_documents = [
240 |     (master_doc, 'hdbscan.tex', u'hdbscan Documentation',
241 |      u'Leland McInnes, John Healy, Steve Astels', 'manual'),
242 | ]
243 | 
244 | # The name of an image file (relative to this directory) to place at the top of
245 | # the title page.
246 | #latex_logo = None
247 | 
248 | # For "manual" documents, if this is true, then toplevel headings are parts,
249 | # not chapters.
250 | #latex_use_parts = False
251 | 
252 | # If true, show page references after internal links.
253 | #latex_show_pagerefs = False
254 | 
255 | # If true, show URL addresses after external links.
256 | #latex_show_urls = False
257 | 
258 | # Documents to append as an appendix to all manuals.
259 | #latex_appendices = []
260 | 
261 | # If false, no module index is generated.
262 | #latex_domain_indices = True
263 | 
264 | 
265 | # -- Options for manual page output ---------------------------------------
266 | 
267 | # One entry per manual page. List of tuples
268 | # (source start file, name, description, authors, manual section).
269 | man_pages = [
270 |     (master_doc, 'hdbscan', u'hdbscan Documentation',
271 |      [author], 1)
272 | ]
273 | 
274 | # If true, show URL addresses after external links.
275 | #man_show_urls = False
276 | 
277 | 
278 | # -- Options for Texinfo output -------------------------------------------
279 | 
280 | # Grouping the document tree into Texinfo files. List of tuples
281 | # (source start file, target name, title, author,
282 | #  dir menu entry, description, category)
283 | texinfo_documents = [
284 |     (master_doc, 'hdbscan', u'hdbscan Documentation',
285 |      author, 'hdbscan', 'One line description of project.',
286 |      'Miscellaneous'),
287 | ]
288 | 
289 | # Documents to append as an appendix to all manuals.
290 | #texinfo_appendices = []
291 | 
292 | # If false, no module index is generated.
293 | #texinfo_domain_indices = True
294 | 
295 | # How to display URL addresses: 'footnote', 'no', or 'inline'.
296 | #texinfo_show_urls = 'footnote'
297 | 
298 | # If true, do not generate a @detailmenu in the "Top" node's menu.
299 | #texinfo_no_detailmenu = False
300 | 
301 | 
302 | # -- Options for Epub output ----------------------------------------------
303 | 
304 | # Bibliographic Dublin Core info.
305 | epub_title = project
306 | epub_author = author
307 | epub_publisher = author
308 | epub_copyright = copyright
309 | 
310 | # The basename for the epub file. It defaults to the project name.
311 | #epub_basename = project
312 | 
313 | # The HTML theme for the epub output. Since the default themes are not
314 | # optimized for small screen space, using the same theme for HTML and epub
315 | # output is usually not wise. This defaults to 'epub', a theme designed to save
316 | # visual space.
317 | #epub_theme = 'epub'
318 | 
319 | # The language of the text. It defaults to the language option
320 | # or 'en' if the language is not set.
321 | #epub_language = ''
322 | 
323 | # The scheme of the identifier. Typical schemes are ISBN or URL.
324 | #epub_scheme = ''
325 | 
326 | # The unique identifier of the text. This can be a ISBN number
327 | # or the project homepage.
328 | #epub_identifier = ''
329 | 
330 | # A unique identification for the text.
331 | #epub_uid = ''
332 | 
333 | # A tuple containing the cover image and cover page html template filenames.
334 | #epub_cover = ()
335 | 
336 | # A sequence of (type, uri, title) tuples for the guide element of content.opf.
337 | #epub_guide = ()
338 | 
339 | # HTML files that should be inserted before the pages created by sphinx.
340 | # The format is a list of tuples containing the path and title.
341 | #epub_pre_files = []
342 | 
343 | # HTML files that should be inserted after the pages created by sphinx.
344 | # The format is a list of tuples containing the path and title.
345 | #epub_post_files = []
346 | 
347 | # A list of files that should not be packed into the epub file.
348 | epub_exclude_files = ['search.html']
349 | 
350 | # The depth of the table of contents in toc.ncx.
351 | #epub_tocdepth = 3
352 | 
353 | # Allow duplicate toc entries.
354 | #epub_tocdup = True
355 | 
356 | # Choose between 'default' and 'includehidden'.
357 | #epub_tocscope = 'default'
358 | 
359 | # Fix unsupported image types using the Pillow.
360 | #epub_fix_images = False
361 | 
362 | # Scale large images.
363 | #epub_max_image_width = 0
364 | 
365 | # How to display URL addresses: 'footnote', 'no', or 'inline'.
366 | #epub_show_urls = 'inline'
367 | 
368 | # If false, no index is generated.
369 | #epub_use_index = True
370 | 


--------------------------------------------------------------------------------
/docs/dbscan_from_hdbscan.rst:
--------------------------------------------------------------------------------
  1 | 
  2 | Extracting DBSCAN* clustering from HDBSCAN*
  3 | ===========================================
  4 | 
  5 | There are a number of reasons that one might prefer `DBSCAN <https://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html>`__'s
  6 | clustering over that of HDBSCAN*.  The biggest difficulty many folks have with
  7 | DBSCAN is that the epsilon distance parameter can be hard to determine and often
  8 | requires a great deal of trial and error to tune.  If your data lived in a more
  9 | interpretable space and you had a good notion of distance in that space this problem
 10 | is certainly mitigated and a user might want to set a very specific epsilon distance
 11 | for their use case.  Another viable use case might be that a user is interested in a
 12 | constant density clustering.
 13 | HDBSCAN* does variable density clustering by default, looking for the clusters that persist
 14 | over a wide range of epsilon distance parameters to find a 'natural' clustering.  This might
 15 | not be the right result for your application.  A DBSCAN clustering at a particular
 16 | epsilon value might work better for your particular task.
 17 | 
 18 | HDBSCAN returns a very natural clustering of your data which is often very useful in exploring
 19 | a new data set.  That doesn't necessarily make it the right clustering algorithm or every
 20 | task.
 21 | 
 22 | HDBSCAN* can best be thought of as a DBSCAN* implementation which varies across
 23 | all epsilon values and extracts the clusters that persist over the widest range
 24 | of these parameter choices.  It is therefore able to ignore the parameter and
 25 | only needs the minimum cluster size as single input parameter.
 26 | The 'eom' (Excess of Mass) cluster selection method then returns clusters with the
 27 | best stability over epsilon.
 28 | 
 29 | There are a number of alternative ways of extracting a flat clustering from
 30 | the HDBSCAN* hierarchical tree.  If one is interested in finer resolution
 31 | clusters while still maintaining variable density one could set
 32 | ``cluster_selection_method='leaf'`` to extract the leaves of the condensed
 33 | tree instead of the most persistent clusters.  For more details on these
 34 | cluster selection methods see :ref:`leaf_clustering_label`.
 35 | 
 36 | If one wasn't interested in the variable density clustering that is the hallmark of
 37 | HDBSCAN* it is relatively easy to extract any DBSCAN* clustering from a
 38 | single run of HDBSCAN*.  This has the advantage of allowing you to perform
 39 | a single computationally efficient HDBSCAN* run and then quickly search over
 40 | the DBSCAN* parameter space by extracting clustering results from our
 41 | pre-constructed tree.  This can save significant computational time when
 42 | searching across multiple cluster parameter settings on large amounts of data.
 43 | 
 44 | Alternatively, one could make use of the ``cluster_selection_epsilon`` as a
 45 | post processing step with any ``cluster_selection_method`` in order to
 46 | return a hybrid clustering of DBSCAN* and HDBSCAN*.  For more details on
 47 | this see :doc:`how_to_use_epsilon`.
 48 | 
 49 | In order to extract a DBSCAN* clustering from an HDBSCAN run we must first train
 50 | and HDBSCAN model on our data.
 51 | 
 52 | .. code:: python
 53 | 
 54 |     import hdbscan
 55 |     h_cluster = hdbscan.HDBSCAN(min_samples=5,match_reference_implementation=True).fit(X)
 56 | 
 57 | The ``min_cluster_size`` parameter is unimportant in this case in that it is
 58 | only used in the creation of our condensed tree which we won't be using here.
 59 | Now we choose a ``cut_distance`` which is just another name for the epsilon
 60 | threshold in DBSCAN and will be passed to our
 61 | :py:meth:`~hdbscan.hdbscan_.dbscan_clustering` method.
 62 | 
 63 | .. code:: python
 64 | 
 65 |     eps = 0.2
 66 |     labels = h_cluster.dbscan_clustering(cut_distance=eps, min_cluster_size=5)
 67 |     sns.scatterplot(x=X[:,0], y=X[:,1], hue=labels.astype(str));
 68 | 
 69 | .. image:: images/dbscan_from_hdbscan_clustering.png
 70 |     :align: center
 71 | 
 72 | It should be noted that a DBSCAN* clustering extracted from our HDBSCAN* tree will
 73 | not precisely match the clustering results from sklearn's DBSCAN implementation.
 74 | Our clustering results should better match DBSCAN* (which can be thought of as
 75 | DBSCAN without the border points).  As such when comparing the two results one
 76 | should expect them to mostly differ in the points that DBSCAN considers boarder
 77 | points.  We'll deal with
 78 | this by only looking at the comparison of our clustering results based on the points identified
 79 | by DBSCAN as core points.  We can see below that the differences between these two
 80 | clusterings mostly occur in the boundaries of the clusters.  This matches our
 81 | intuition of stability within the core points.
 82 | 
 83 | .. image:: images/dbscan_from_hdbscan_comparision.png
 84 |     :align: center
 85 | 
 86 | For a slightly more empirical comparison we we make use of the `adjusted rand score <https://scikit-learn.org/stable/modules/generated/sklearn.metrics.adjusted_rand_score.html>`__
 87 | to compare the clustering of the core points between a DBSCAN cluster from sklearn and
 88 | a DBSCAN* clustering extracted from our HDBSCAN* object.
 89 | 
 90 | .. image:: images/dbscan_from_hdbscan_percentage_core.png
 91 |     :align: center
 92 | 
 93 | .. image:: images/dbscan_from_hdbscan_number_of_clusters.png
 94 |     :align: center
 95 | 
 96 | We see that for very small epsilon values our number of clusters tends to be quite
 97 | far apart, largely due to a large number of the points being considered boundary points
 98 | instead of core points.  As the epsilon value increases, more and more points are
 99 | considered core and the number of clusters generated by each algorithm converge.
100 | 
101 | Additionally, the adjusted rand score between the core points of both algorithm
102 | stays consistently high (mostly 1.0) for our entire range of epsilon.  There may be
103 | be some minor discrepancies between core point results largely due to implementation
104 | details and optimizations with the code base.
105 | 
106 | Why might one just extract the DBSCAN* clustering results from a single HDBSCAN* run
107 | instead of making use of sklearns DBSSCAN code?  The short answer is efficiency.
108 | If you aren't sure what epsilon parameter to select for DBSCAN then you may have to
109 | run the algorithm many times on your data set.  While those runs can be inexpensive for
110 | very small epsilon values they can get quite expensive for large parameter values.
111 | 
112 | In this small benchmark case of 50,000 two dimensional data points we have broken even
113 | after having only had to try two epsilon parameters from DBSCAN, or only a single
114 | run with a large parameter selected.  This trend is only exacerbated for larger
115 | data sets in higher dimensional spaces.  For more detailed scaling experiments see
116 | `Accelearted Hierarchical Density Clustering <https://arxiv.org/abs/1705.07321>`__
117 | by McInnes and Healy.
118 | 
119 | .. image:: images/dbscan_from_hdbscan_timing.png
120 |     :align: center
121 | 
122 | 
123 | 
124 | 
125 | 
126 | 
127 | 


--------------------------------------------------------------------------------
/docs/docs_requirements.txt:
--------------------------------------------------------------------------------
1 | sphinx_rtd_theme
2 | 


--------------------------------------------------------------------------------
/docs/faq.rst:
--------------------------------------------------------------------------------
  1 | Frequently Asked Questions
  2 | ==========================
  3 | 
  4 | Here we attempt to address some common questions, directing the user to some
  5 | helpful answers.
  6 | 
  7 | Q: Most of data is classified as noise; why?
  8 | --------------------------------------------
  9 | 
 10 | The amount of data classified as noise is controlled by the ``min_samples``
 11 | parameter. By default, if not otherwise set, this value is set to the same
 12 | value as ``min_cluster_size``. You can set it independently if you wish by
 13 | specifying it separately. The lower the value, the less noise you'll get, but
 14 | there are limits, and it is possible that you simply have noisy data. See
 15 | :any:`min_samples <min_samples_label>` for more details.
 16 | 
 17 | Q: I mostly just get one large cluster; I want smaller clusters.
 18 | ----------------------------------------------------------------
 19 | 
 20 | If you are getting a single large cluster and a few small outlying clusters
 21 | that means your data is essentially a large glob with some small outlying
 22 | clusters -- there may be structure to the glob, but compared to how well
 23 | separated those other small clusters are, it doesn't really show up. You may,
 24 | however, want to get at that more fine grained structure. You can do that,
 25 | and what you are looking for is :any:`leaf clustering <leaf_clustering_label>`.
 26 | 
 27 | Q: HDBSCAN is failing to separate the clusters I think it should.
 28 | -----------------------------------------------------------------
 29 | 
 30 | Density based clustering relies on having enough data to separate dense areas.
 31 | In higher dimensional spaces this becomes more difficult, and hence
 32 | requires more data. Quite possibly there is not enough data to make your
 33 | clusters clearly separable. Consider the following plots:
 34 | 
 35 | .. image:: images/generative_model_scatter.png
 36 | .. image:: images/generative_model_kde.png
 37 | 
 38 | Four different generative models, when sampled, produce results that are hard to
 39 | easily differentiate. The blue dataset is sampled from a mixture of three
 40 | standard Gaussians centered at (-2, 0), (0,0) and (2,0); the green dataset is
 41 | sampled from a mixture of two standard Gaussians centered at (-1,0) and (1,0);
 42 | the red data is sampled from a multivariate Gaussian with covariance
 43 | [2, 0; 0, 1]; the purple data is a single standard Gaussian with uniform
 44 | background noise.
 45 | 
 46 | Despite the generate model having clearly different "clusters", without more
 47 | data we simply cannot differentiate between these models, and hence no
 48 | density based clustering will manage cluster these according to the model.
 49 | 
 50 | Q: I am not getting the claimed performance. Why not?
 51 | -----------------------------------------------------
 52 | 
 53 | The most likely explanation is to do with the dimensionality of your input data.
 54 | While HDBSCAN can perform well on low to medium dimensional data the performance
 55 | tends to decrease significantly as dimension increases. In general HDBSCAN can do
 56 | well on up to around 50 or 100 dimensional data, but performance can see 
 57 | significant decreases beyond that. Of course a lot is also dataset dependent, so 
 58 | you can still get good performance even on high dimensional data, but it
 59 | is no longer guaranteed.
 60 | 
 61 | Q: I want to predict the cluster of a new unseen point. How do I do this?
 62 | -------------------------------------------------------------------------
 63 | 
 64 | This is possible via the function :func:`~hdbscan.prediction.approximate_predict`. Note that you
 65 | either need to set ``prediction_data=True`` on initialization of your
 66 | clusterer object, or run the ``generate_prediction_data`` method after
 67 | fitting. With that done you can run :func:`~hdbscan.prediction.approximate_predict` with the model
 68 | and any new data points you wish to predict. Note that this differs from
 69 | re-running HDBSCAN with the new points added since no new clusters will be
 70 | considered -- instead the new points will be labelled according to the
 71 | clusters already labelled by the model.
 72 | 
 73 | Q: Haversine metric is not clustering my Lat-Lon data correctly.
 74 | ----------------------------------------------------------------
 75 | 
 76 | The Haversine metric as implemented supports coordinates in radians. That
 77 | means you'll need to convert your latitude and longitude data into radians
 78 | before passing it in to HDBSCAN.
 79 | 
 80 | Q: I want to cite this software in my journal publication. How do I do that?
 81 | ----------------------------------------------------------------------------
 82 | 
 83 | If you have used this codebase in a scientific publication and wish to cite it, please use the `Journal of Open Source Software article <http://joss.theoj.org/papers/10.21105/joss.00205>`_.
 84 | 
 85 |     L. McInnes, J. Healy, S. Astels, *hdbscan: Hierarchical density based clustering* 
 86 |     In: Journal of Open Source Software, The Open Journal, volume 2, number 11. 
 87 |     2017
 88 |     
 89 | BibTeX::
 90 |     
 91 |     @article{McInnes2017,
 92 |       doi = {10.21105/joss.00205},
 93 |       url = {https://doi.org/10.21105%2Fjoss.00205},
 94 |       year  = {2017},
 95 |       month = {mar},
 96 |       publisher = {The Open Journal},
 97 |       volume = {2},
 98 |       number = {11},
 99 |       author = {Leland McInnes and John Healy and Steve Astels},
100 |       title = {hdbscan: Hierarchical density based clustering},
101 |       journal = {The Journal of Open Source Software}
102 |     }
103 |     
104 | ::
105 | 


--------------------------------------------------------------------------------
/docs/how_to_use_epsilon.rst:
--------------------------------------------------------------------------------
 1 | 
 2 | Combining HDBSCAN\* with DBSCAN 
 3 | =============================
 4 | 
 5 | While DBSCAN needs a minimum cluster size *and* a distance threshold epsilon as user-defined input parameters, 
 6 | HDBSCAN\* is basically a DBSCAN implementation for varying epsilon values and therefore only needs the minimum cluster size as single input parameter.
 7 | The ``'eom'`` (Excess of Mass) cluster selection method then returns clusters with the best stability over epsilon.
 8 | 
 9 | Unlike DBSCAN, this allows to it find clusters of variable densities without having to choose a suitable distance threshold first.
10 | However, there are cases where we could still benefit from the use of an epsilon threshold.
11 | 
12 | For illustration, see this map with GPS locations, representing recorded pick-up and drop-off locations for customers of a ride pooling provider.
13 | The largest (visual) data cluster can be found around the train station. Smaller clusters are placed along the streets, depending on the requested location
14 | in the form of a postal address or point of interest. Since we are considering a door-to-door system where customers are not bound to collective pick-up or
15 | drop-off locations, we are interested in both large clusters and small clusters with a minimum size of 4.  
16 | 
17 | .. image:: images/epsilon_parameter_dataset.png
18 | 	:align: center
19 | 	
20 | Clustering the given data set with `DBSCAN <https://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html>`__ and an epsilon threshold of 5 meters gives us good results, 
21 | but neglects clusters with points that are more than 5 meters apart from each other. 
22 | However, increasing epsilon would result in cluster chains along the streets, especially when working with a larger data set. 
23 | 
24 | .. image:: images/epsilon_parameter_dbscan.png
25 | 	:align: center
26 | 
27 | Unfortunately, HDBSCAN\* does not produce any better results in this case: while it discovers the clusters that DBSCAN missed, it also returns a very high number of micro-clusters around the train station, 
28 | even though we would prefer one or only few clusters representing this location. We could achieve this by increasing ``min_cluster_size`` or 
29 | the smoothing parameter ``min_samples``, but with the trade-off of losing small clusters in less dense areas or merging them into other clusters 
30 | separated by a relatively large distance.
31 | 
32 | .. image:: images/epsilon_parameter_hdbscan_eom.png
33 | 	:align: center
34 | 	
35 | This is where the parameter ``cluster_selection_epsilon`` comes into play. The cluster extraction method using this parameter, as described in detail
36 | by `Malzer and Baum <https://arxiv.org/abs/1911.02282>`__, acts like a hybrid between DBSCAN 
37 | (or, to be precise, DBSCAN\*, i.e. DBSCAN without the border points) by extracting DBSCAN results for data partitions
38 | affected by the given parameter value, and HDBSCAN\* results for all others. 
39 | 
40 | In our example, we choose to merge nested clusters below 5 meters (0.005 kilometers) and therefore set  the parameter ``cluster_selection_epsilon`` accordingly: 
41 | 
42 | .. code:: python
43 | 
44 | 	X = np.radians(coordinates) #convert the list of lat/lon coordinates to radians
45 | 	earth_radius_km = 6371
46 | 	epsilon = 0.005 / earth_radius_km #calculate 5 meter epsilon threshold
47 | 	
48 | 	clusterer = hdbscan.HDBSCAN(min_cluster_size=4, metric='haversine', 
49 | 	cluster_selection_epsilon=epsilon, cluster_selection_method = 'eom')
50 | 	clusterer.fit(X)
51 | 	
52 | And indeed, the result looks like a mix between DBSCAN and HDBSCAN(eom). We no longer lose clusters of variable densities beyond the given epsilon, but at the
53 | same time avoid the abundance of micro-clusters in the original HDBSCAN\* clustering, which was an undesired side-effect of having to choose a low ``min_cluster_size`` value.
54 | 
55 | .. image:: images/epsilon_parameter_hdbscan_eps.png
56 | 	:align: center
57 | 	
58 | Note that for the given parameter setting, running HDBSCAN\* based on ``cluster_selection_method = 'eom'`` or ``cluster_selection_method = 'leaf'`` does not make
59 | any difference: the ``cluster_selection_epsilon`` threshold neutralizes the effect of HDBSCAN(eom)'s stability calculations.
60 | When using a lower threshold, some minor differences can be noticed. For example, an epsilon value of 3 meters with ``'eom'`` produces the same results as
61 | a the 5 meter value on the given data set, but 3 meters in combination with ``'leaf'`` achieves a slightly different result:
62 | 	
63 | .. image:: images/epsilon_parameter_hdbscan_e3_leaf.png
64 | 	:align: center
65 | 
66 | A ``cluster_selection_epsilon`` value of 0 (the default value) always returns the original HDBSCAN\* results, either according to ``'eom'`` or ``'leaf'``.
67 | 	
68 | 


--------------------------------------------------------------------------------
/docs/images/advanced_hdbscan_11_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/advanced_hdbscan_11_1.png


--------------------------------------------------------------------------------
/docs/images/advanced_hdbscan_26_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/advanced_hdbscan_26_1.png


--------------------------------------------------------------------------------
/docs/images/advanced_hdbscan_3_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/advanced_hdbscan_3_1.png


--------------------------------------------------------------------------------
/docs/images/advanced_hdbscan_5_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/advanced_hdbscan_5_1.png


--------------------------------------------------------------------------------
/docs/images/advanced_hdbscan_9_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/advanced_hdbscan_9_1.png


--------------------------------------------------------------------------------
/docs/images/allow_single_cluster.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/allow_single_cluster.png


--------------------------------------------------------------------------------
/docs/images/comparing_clustering_algorithms_12_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/comparing_clustering_algorithms_12_0.png


--------------------------------------------------------------------------------
/docs/images/comparing_clustering_algorithms_15_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/comparing_clustering_algorithms_15_0.png


--------------------------------------------------------------------------------
/docs/images/comparing_clustering_algorithms_18_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/comparing_clustering_algorithms_18_0.png


--------------------------------------------------------------------------------
/docs/images/comparing_clustering_algorithms_21_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/comparing_clustering_algorithms_21_0.png


--------------------------------------------------------------------------------
/docs/images/comparing_clustering_algorithms_24_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/comparing_clustering_algorithms_24_0.png


--------------------------------------------------------------------------------
/docs/images/comparing_clustering_algorithms_27_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/comparing_clustering_algorithms_27_0.png


--------------------------------------------------------------------------------
/docs/images/comparing_clustering_algorithms_31_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/comparing_clustering_algorithms_31_0.png


--------------------------------------------------------------------------------
/docs/images/comparing_clustering_algorithms_6_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/comparing_clustering_algorithms_6_0.png


--------------------------------------------------------------------------------
/docs/images/epsilon_parameter_dataset.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/epsilon_parameter_dataset.png


--------------------------------------------------------------------------------
/docs/images/epsilon_parameter_dbscan.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/epsilon_parameter_dbscan.png


--------------------------------------------------------------------------------
/docs/images/epsilon_parameter_hdbscan_e3_leaf.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/epsilon_parameter_hdbscan_e3_leaf.png


--------------------------------------------------------------------------------
/docs/images/epsilon_parameter_hdbscan_eom.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/epsilon_parameter_hdbscan_eom.png


--------------------------------------------------------------------------------
/docs/images/epsilon_parameter_hdbscan_eps.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/epsilon_parameter_hdbscan_eps.png


--------------------------------------------------------------------------------
/docs/images/generative_model_kde.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/generative_model_kde.png


--------------------------------------------------------------------------------
/docs/images/generative_model_scatter.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/generative_model_scatter.png


--------------------------------------------------------------------------------
/docs/images/how_hdbscan_works_10_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/how_hdbscan_works_10_1.png


--------------------------------------------------------------------------------
/docs/images/how_hdbscan_works_12_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/how_hdbscan_works_12_1.png


--------------------------------------------------------------------------------
/docs/images/how_hdbscan_works_15_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/how_hdbscan_works_15_1.png


--------------------------------------------------------------------------------
/docs/images/how_hdbscan_works_18_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/how_hdbscan_works_18_1.png


--------------------------------------------------------------------------------
/docs/images/how_hdbscan_works_20_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/how_hdbscan_works_20_1.png


--------------------------------------------------------------------------------
/docs/images/how_hdbscan_works_3_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/how_hdbscan_works_3_1.png


--------------------------------------------------------------------------------
/docs/images/how_to_detect_branches_13_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/how_to_detect_branches_13_0.png


--------------------------------------------------------------------------------
/docs/images/how_to_detect_branches_15_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/how_to_detect_branches_15_0.png


--------------------------------------------------------------------------------
/docs/images/how_to_detect_branches_17_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/how_to_detect_branches_17_0.png


--------------------------------------------------------------------------------
/docs/images/how_to_detect_branches_19_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/how_to_detect_branches_19_0.png


--------------------------------------------------------------------------------
/docs/images/how_to_detect_branches_21_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/how_to_detect_branches_21_0.png


--------------------------------------------------------------------------------
/docs/images/how_to_detect_branches_23_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/how_to_detect_branches_23_0.png


--------------------------------------------------------------------------------
/docs/images/how_to_detect_branches_25_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/how_to_detect_branches_25_0.png


--------------------------------------------------------------------------------
/docs/images/how_to_detect_branches_3_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/how_to_detect_branches_3_0.png


--------------------------------------------------------------------------------
/docs/images/how_to_detect_branches_5_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/how_to_detect_branches_5_0.png


--------------------------------------------------------------------------------
/docs/images/how_to_detect_branches_7_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/how_to_detect_branches_7_0.png


--------------------------------------------------------------------------------
/docs/images/how_to_detect_branches_9_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/how_to_detect_branches_9_0.png


--------------------------------------------------------------------------------
/docs/images/outlier_detection_3_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/outlier_detection_3_1.png


--------------------------------------------------------------------------------
/docs/images/outlier_detection_7_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/outlier_detection_7_1.png


--------------------------------------------------------------------------------
/docs/images/outlier_detection_9_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/outlier_detection_9_1.png


--------------------------------------------------------------------------------
/docs/images/parameter_selection_11_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/parameter_selection_11_1.png


--------------------------------------------------------------------------------
/docs/images/parameter_selection_12_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/parameter_selection_12_1.png


--------------------------------------------------------------------------------
/docs/images/parameter_selection_15_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/parameter_selection_15_1.png


--------------------------------------------------------------------------------
/docs/images/parameter_selection_18_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/parameter_selection_18_1.png


--------------------------------------------------------------------------------
/docs/images/parameter_selection_3_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/parameter_selection_3_1.png


--------------------------------------------------------------------------------
/docs/images/parameter_selection_7_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/parameter_selection_7_1.png


--------------------------------------------------------------------------------
/docs/images/parameter_selection_9_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/parameter_selection_9_1.png


--------------------------------------------------------------------------------
/docs/images/performance_and_scalability_14_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/performance_and_scalability_14_1.png


--------------------------------------------------------------------------------
/docs/images/performance_and_scalability_20_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/performance_and_scalability_20_2.png


--------------------------------------------------------------------------------
/docs/images/performance_and_scalability_24_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/performance_and_scalability_24_1.png


--------------------------------------------------------------------------------
/docs/images/performance_and_scalability_9_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/performance_and_scalability_9_1.png


--------------------------------------------------------------------------------
/docs/images/prediction_tutorial_3_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/prediction_tutorial_3_0.png


--------------------------------------------------------------------------------
/docs/images/prediction_tutorial_5_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/prediction_tutorial_5_1.png


--------------------------------------------------------------------------------
/docs/images/prediction_tutorial_9_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/prediction_tutorial_9_1.png


--------------------------------------------------------------------------------
/docs/images/soft_clustering_10_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/soft_clustering_10_1.png


--------------------------------------------------------------------------------
/docs/images/soft_clustering_13_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/soft_clustering_13_1.png


--------------------------------------------------------------------------------
/docs/images/soft_clustering_15_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/soft_clustering_15_0.png


--------------------------------------------------------------------------------
/docs/images/soft_clustering_3_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/soft_clustering_3_1.png


--------------------------------------------------------------------------------
/docs/images/soft_clustering_6_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/soft_clustering_6_1.png


--------------------------------------------------------------------------------
/docs/images/soft_clustering_8_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/soft_clustering_8_1.png


--------------------------------------------------------------------------------
/docs/images/soft_clustering_explanation_11_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/soft_clustering_explanation_11_0.png


--------------------------------------------------------------------------------
/docs/images/soft_clustering_explanation_15_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/soft_clustering_explanation_15_0.png


--------------------------------------------------------------------------------
/docs/images/soft_clustering_explanation_26_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/soft_clustering_explanation_26_0.png


--------------------------------------------------------------------------------
/docs/images/soft_clustering_explanation_2_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/soft_clustering_explanation_2_0.png


--------------------------------------------------------------------------------
/docs/images/soft_clustering_explanation_31_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/soft_clustering_explanation_31_0.png


--------------------------------------------------------------------------------
/docs/images/soft_clustering_explanation_36_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/soft_clustering_explanation_36_0.png


--------------------------------------------------------------------------------
/docs/images/soft_clustering_explanation_6_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/soft_clustering_explanation_6_0.png


--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
 1 | .. hdbscan documentation master file, created by
 2 |    sphinx-quickstart on Sat May 28 10:34:44 2016.
 3 |    You can adapt this file completely to your liking, but it should at least
 4 |    contain the root `toctree` directive.
 5 | 
 6 | The hdbscan Clustering Library
 7 | ==============================
 8 | 
 9 | The hdbscan library is a suite of tools to use unsupervised learning to find clusters, or
10 | dense regions, of a dataset. The primary algorithm is HDBSCAN* as proposed by Campello,
11 | Moulavi, and Sander. The library provides a high performance implementation of this algorithm,
12 | along with tools for analysing the resulting clustering.
13 | 
14 | 
15 | User Guide / Tutorial
16 | ---------------------
17 | 
18 | .. toctree::
19 |    :maxdepth: 2
20 | 
21 |    basic_hdbscan
22 |    advanced_hdbscan
23 |    parameter_selection
24 |    outlier_detection
25 |    prediction_tutorial
26 |    soft_clustering
27 |    how_to_use_epsilon
28 |    dbscan_from_hdbscan
29 |    how_to_detect_branches
30 |    faq
31 | 
32 | Background on Clustering with HDBSCAN
33 | -------------------------------------
34 | 
35 | .. toctree::
36 |    :maxdepth: 2
37 | 
38 |    how_hdbscan_works
39 |    comparing_clustering_algorithms
40 |    performance_and_scalability
41 |    soft_clustering_explanation
42 | 
43 | API Reference
44 | -------------
45 | 
46 | .. toctree::
47 | 
48 |    api
49 | 
50 | Indices and tables
51 | ==================
52 | 
53 | * :ref:`genindex`
54 | * :ref:`modindex`
55 | * :ref:`search`
56 | 
57 | 


--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
  1 | @ECHO OFF
  2 | 
  3 | REM Command file for Sphinx documentation
  4 | 
  5 | if "%SPHINXBUILD%" == "" (
  6 | 	set SPHINXBUILD=sphinx-build
  7 | )
  8 | set BUILDDIR=_build
  9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% .
 10 | set I18NSPHINXOPTS=%SPHINXOPTS% .
 11 | if NOT "%PAPER%" == "" (
 12 | 	set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS%
 13 | 	set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS%
 14 | )
 15 | 
 16 | if "%1" == "" goto help
 17 | 
 18 | if "%1" == "help" (
 19 | 	:help
 20 | 	echo.Please use `make ^<target^>` where ^<target^> is one of
 21 | 	echo.  html       to make standalone HTML files
 22 | 	echo.  dirhtml    to make HTML files named index.html in directories
 23 | 	echo.  singlehtml to make a single large HTML file
 24 | 	echo.  pickle     to make pickle files
 25 | 	echo.  json       to make JSON files
 26 | 	echo.  htmlhelp   to make HTML files and a HTML help project
 27 | 	echo.  qthelp     to make HTML files and a qthelp project
 28 | 	echo.  devhelp    to make HTML files and a Devhelp project
 29 | 	echo.  epub       to make an epub
 30 | 	echo.  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter
 31 | 	echo.  text       to make text files
 32 | 	echo.  man        to make manual pages
 33 | 	echo.  texinfo    to make Texinfo files
 34 | 	echo.  gettext    to make PO message catalogs
 35 | 	echo.  changes    to make an overview over all changed/added/deprecated items
 36 | 	echo.  xml        to make Docutils-native XML files
 37 | 	echo.  pseudoxml  to make pseudoxml-XML files for display purposes
 38 | 	echo.  linkcheck  to check all external links for integrity
 39 | 	echo.  doctest    to run all doctests embedded in the documentation if enabled
 40 | 	echo.  coverage   to run coverage check of the documentation if enabled
 41 | 	goto end
 42 | )
 43 | 
 44 | if "%1" == "clean" (
 45 | 	for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i
 46 | 	del /q /s %BUILDDIR%\*
 47 | 	goto end
 48 | )
 49 | 
 50 | 
 51 | REM Check if sphinx-build is available and fallback to Python version if any
 52 | %SPHINXBUILD% 1>NUL 2>NUL
 53 | if errorlevel 9009 goto sphinx_python
 54 | goto sphinx_ok
 55 | 
 56 | :sphinx_python
 57 | 
 58 | set SPHINXBUILD=python -m sphinx.__init__
 59 | %SPHINXBUILD% 2> nul
 60 | if errorlevel 9009 (
 61 | 	echo.
 62 | 	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
 63 | 	echo.installed, then set the SPHINXBUILD environment variable to point
 64 | 	echo.to the full path of the 'sphinx-build' executable. Alternatively you
 65 | 	echo.may add the Sphinx directory to PATH.
 66 | 	echo.
 67 | 	echo.If you don't have Sphinx installed, grab it from
 68 | 	echo.http://sphinx-doc.org/
 69 | 	exit /b 1
 70 | )
 71 | 
 72 | :sphinx_ok
 73 | 
 74 | 
 75 | if "%1" == "html" (
 76 | 	%SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html
 77 | 	if errorlevel 1 exit /b 1
 78 | 	echo.
 79 | 	echo.Build finished. The HTML pages are in %BUILDDIR%/html.
 80 | 	goto end
 81 | )
 82 | 
 83 | if "%1" == "dirhtml" (
 84 | 	%SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml
 85 | 	if errorlevel 1 exit /b 1
 86 | 	echo.
 87 | 	echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml.
 88 | 	goto end
 89 | )
 90 | 
 91 | if "%1" == "singlehtml" (
 92 | 	%SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml
 93 | 	if errorlevel 1 exit /b 1
 94 | 	echo.
 95 | 	echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml.
 96 | 	goto end
 97 | )
 98 | 
 99 | if "%1" == "pickle" (
100 | 	%SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle
101 | 	if errorlevel 1 exit /b 1
102 | 	echo.
103 | 	echo.Build finished; now you can process the pickle files.
104 | 	goto end
105 | )
106 | 
107 | if "%1" == "json" (
108 | 	%SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json
109 | 	if errorlevel 1 exit /b 1
110 | 	echo.
111 | 	echo.Build finished; now you can process the JSON files.
112 | 	goto end
113 | )
114 | 
115 | if "%1" == "htmlhelp" (
116 | 	%SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp
117 | 	if errorlevel 1 exit /b 1
118 | 	echo.
119 | 	echo.Build finished; now you can run HTML Help Workshop with the ^
120 | .hhp project file in %BUILDDIR%/htmlhelp.
121 | 	goto end
122 | )
123 | 
124 | if "%1" == "qthelp" (
125 | 	%SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp
126 | 	if errorlevel 1 exit /b 1
127 | 	echo.
128 | 	echo.Build finished; now you can run "qcollectiongenerator" with the ^
129 | .qhcp project file in %BUILDDIR%/qthelp, like this:
130 | 	echo.^> qcollectiongenerator %BUILDDIR%\qthelp\hdbscan.qhcp
131 | 	echo.To view the help file:
132 | 	echo.^> assistant -collectionFile %BUILDDIR%\qthelp\hdbscan.ghc
133 | 	goto end
134 | )
135 | 
136 | if "%1" == "devhelp" (
137 | 	%SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp
138 | 	if errorlevel 1 exit /b 1
139 | 	echo.
140 | 	echo.Build finished.
141 | 	goto end
142 | )
143 | 
144 | if "%1" == "epub" (
145 | 	%SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub
146 | 	if errorlevel 1 exit /b 1
147 | 	echo.
148 | 	echo.Build finished. The epub file is in %BUILDDIR%/epub.
149 | 	goto end
150 | )
151 | 
152 | if "%1" == "latex" (
153 | 	%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
154 | 	if errorlevel 1 exit /b 1
155 | 	echo.
156 | 	echo.Build finished; the LaTeX files are in %BUILDDIR%/latex.
157 | 	goto end
158 | )
159 | 
160 | if "%1" == "latexpdf" (
161 | 	%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
162 | 	cd %BUILDDIR%/latex
163 | 	make all-pdf
164 | 	cd %~dp0
165 | 	echo.
166 | 	echo.Build finished; the PDF files are in %BUILDDIR%/latex.
167 | 	goto end
168 | )
169 | 
170 | if "%1" == "latexpdfja" (
171 | 	%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
172 | 	cd %BUILDDIR%/latex
173 | 	make all-pdf-ja
174 | 	cd %~dp0
175 | 	echo.
176 | 	echo.Build finished; the PDF files are in %BUILDDIR%/latex.
177 | 	goto end
178 | )
179 | 
180 | if "%1" == "text" (
181 | 	%SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text
182 | 	if errorlevel 1 exit /b 1
183 | 	echo.
184 | 	echo.Build finished. The text files are in %BUILDDIR%/text.
185 | 	goto end
186 | )
187 | 
188 | if "%1" == "man" (
189 | 	%SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man
190 | 	if errorlevel 1 exit /b 1
191 | 	echo.
192 | 	echo.Build finished. The manual pages are in %BUILDDIR%/man.
193 | 	goto end
194 | )
195 | 
196 | if "%1" == "texinfo" (
197 | 	%SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo
198 | 	if errorlevel 1 exit /b 1
199 | 	echo.
200 | 	echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo.
201 | 	goto end
202 | )
203 | 
204 | if "%1" == "gettext" (
205 | 	%SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale
206 | 	if errorlevel 1 exit /b 1
207 | 	echo.
208 | 	echo.Build finished. The message catalogs are in %BUILDDIR%/locale.
209 | 	goto end
210 | )
211 | 
212 | if "%1" == "changes" (
213 | 	%SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes
214 | 	if errorlevel 1 exit /b 1
215 | 	echo.
216 | 	echo.The overview file is in %BUILDDIR%/changes.
217 | 	goto end
218 | )
219 | 
220 | if "%1" == "linkcheck" (
221 | 	%SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck
222 | 	if errorlevel 1 exit /b 1
223 | 	echo.
224 | 	echo.Link check complete; look for any errors in the above output ^
225 | or in %BUILDDIR%/linkcheck/output.txt.
226 | 	goto end
227 | )
228 | 
229 | if "%1" == "doctest" (
230 | 	%SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest
231 | 	if errorlevel 1 exit /b 1
232 | 	echo.
233 | 	echo.Testing of doctests in the sources finished, look at the ^
234 | results in %BUILDDIR%/doctest/output.txt.
235 | 	goto end
236 | )
237 | 
238 | if "%1" == "coverage" (
239 | 	%SPHINXBUILD% -b coverage %ALLSPHINXOPTS% %BUILDDIR%/coverage
240 | 	if errorlevel 1 exit /b 1
241 | 	echo.
242 | 	echo.Testing of coverage in the sources finished, look at the ^
243 | results in %BUILDDIR%/coverage/python.txt.
244 | 	goto end
245 | )
246 | 
247 | if "%1" == "xml" (
248 | 	%SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml
249 | 	if errorlevel 1 exit /b 1
250 | 	echo.
251 | 	echo.Build finished. The XML files are in %BUILDDIR%/xml.
252 | 	goto end
253 | )
254 | 
255 | if "%1" == "pseudoxml" (
256 | 	%SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml
257 | 	if errorlevel 1 exit /b 1
258 | 	echo.
259 | 	echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml.
260 | 	goto end
261 | )
262 | 
263 | :end
264 | 


--------------------------------------------------------------------------------
/docs/outlier_detection.rst:
--------------------------------------------------------------------------------
 1 | 
 2 | Outlier Detection
 3 | =================
 4 | 
 5 | The hdbscan library supports the GLOSH outlier detection algorithm, and
 6 | does so within the HDBSCAN clustering class. The GLOSH outlier detection
 7 | algorithm is related to older outlier detection methods such as
 8 | `LOF <https://en.wikipedia.org/wiki/Local_outlier_factor>`__ and
 9 | `LOCI <http://www.informedia.cs.cmu.edu/documents/loci_icde03.pdf>`__.
10 | It is a fast and flexible outlier detection system, and supports a
11 | notion of local outliers. This means that it can detect outliers that
12 | may be noticeably different from points in its local region (for example
13 | points not on a local submanifold) but that are not necessarily outliers
14 | globally. So how do we find outliers? We proceed identically to the
15 | basic use of HDBSCAN\*. We start with some data, and fit it with an
16 | HDBSCAN object.
17 | 
18 | .. code:: python
19 | 
20 |     plt.scatter(*data.T, s=50, linewidth=0, c='b', alpha=0.25)
21 | 
22 | 
23 | .. image:: images/outlier_detection_3_1.png
24 | 
25 | 
26 | .. code:: python
27 | 
28 |     clusterer = hdbscan.HDBSCAN(min_cluster_size=15).fit(data)
29 | 
30 | The ``clusterer`` object now has an attribute (computed when first accessed)
31 | called ``outlier_scores_``. This provides a numpy array with a value for
32 | each sample in the original dataset that was fit with the ``clusterer``. The
33 | higher the score, the more likely the point is to be an outlier. In
34 | practice it is often best to look at the distributions of outlier
35 | scores.
36 | 
37 | .. code:: python
38 | 
39 |     clusterer.outlier_scores_
40 | 
41 | 
42 | 
43 | 
44 | .. parsed-literal::
45 | 
46 |     array([ 0.14791852,  0.14116731,  0.09171929, ...,  0.62050534,
47 |             0.56749298,  0.20681685])
48 | 
49 | 
50 | 
51 | .. code:: python
52 | 
53 |     sns.distplot(clusterer.outlier_scores_[np.isfinite(clusterer.outlier_scores_)], rug=True)
54 | 
55 | .. image:: images/outlier_detection_7_1.png
56 | 
57 | 
58 | We can pull off upper quantiles to detect outliers, which we can then
59 | plot.
60 | 
61 | .. code:: python
62 | 
63 |     threshold = pd.Series(clusterer.outlier_scores_).quantile(0.9)
64 |     outliers = np.where(clusterer.outlier_scores_ > threshold)[0]
65 |     plt.scatter(*data.T, s=50, linewidth=0, c='gray', alpha=0.25)
66 |     plt.scatter(*data[outliers].T, s=50, linewidth=0, c='red', alpha=0.5)
67 | 
68 | .. image:: images/outlier_detection_9_1.png
69 | 
70 | 
71 | Note that not only are the outlying border points highlighted as
72 | outliers, but points at the edge of the central ball like cluster, and
73 | just below the vertical band cluster, are also designated as outliers.
74 | This is because those two clusters are extremely dense, and the points
75 | at the edge of this cluster are close enough to the cluster that they
76 | should be part of it, but far enough from the being core parts of the
77 | cluster that they are extremely unlikely and hence anomalous.
78 | 
79 | 


--------------------------------------------------------------------------------
/docs/parameter_selection.rst:
--------------------------------------------------------------------------------
  1 | 
  2 | Parameter Selection for HDBSCAN\*
  3 | =================================
  4 | 
  5 | While the HDBSCAN class has a large number of parameters that can be set
  6 | on initialization, in practice there are a very small number of
  7 | parameters that have significant practical effect on clustering. We will
  8 | consider those major parameters, and consider how one may go about
  9 | choosing them effectively.
 10 | 
 11 | .. _min_cluster_size_label:
 12 | 
 13 | Selecting ``min_cluster_size``
 14 | ------------------------------
 15 | 
 16 | The primary parameter to effect the resulting clustering is
 17 | ``min_cluster_size``. Ideally this is a relatively intuitive parameter
 18 | to select -- set it to the smallest size grouping that you wish to
 19 | consider a cluster. It can have slightly non-obvious effects however.
 20 | Let's consider the digits dataset from sklearn. We can project the data
 21 | into two dimensions to visualize it via t-SNE.
 22 | 
 23 | .. code:: python
 24 | 
 25 |     digits = datasets.load_digits()
 26 |     data = digits.data
 27 |     projection = TSNE().fit_transform(data)
 28 |     plt.scatter(*projection.T, **plot_kwds)
 29 | 
 30 | 
 31 | .. image:: images/parameter_selection_3_1.png
 32 | 
 33 | 
 34 | If we cluster this data in the full 64 dimensional space with HDBSCAN\* we
 35 | can see some effects from varying the ``min_cluster_size``.
 36 | 
 37 | We start with a ``min_cluster_size`` of 15.
 38 | 
 39 | .. code:: python
 40 | 
 41 |     clusterer = hdbscan.HDBSCAN(min_cluster_size=15).fit(data)
 42 |     color_palette = sns.color_palette('Paired', 12)
 43 |     cluster_colors = [color_palette[x] if x >= 0 
 44 |                       else (0.5, 0.5, 0.5) 
 45 |                       for x in clusterer.labels_]
 46 |     cluster_member_colors = [sns.desaturate(x, p) for x, p in 
 47 |                              zip(cluster_colors, clusterer.probabilities_)]
 48 |     plt.scatter(*projection.T, s=50, linewidth=0, c=cluster_member_colors, alpha=0.25)
 49 | 
 50 | 
 51 | .. image:: images/parameter_selection_7_1.png
 52 | 
 53 | 
 54 | Increasing the ``min_cluster_size`` to 30 reduces the number of
 55 | clusters, merging some together. This is a result of HDBSCAN\*
 56 | reoptimizing which flat clustering provides greater stability under a
 57 | slightly different notion of what constitutes a cluster.
 58 | 
 59 | .. code:: python
 60 | 
 61 |     clusterer = hdbscan.HDBSCAN(min_cluster_size=30).fit(data)
 62 |     color_palette = sns.color_palette('Paired', 12)
 63 |     cluster_colors = [color_palette[x] if x >= 0 
 64 |                       else (0.5, 0.5, 0.5) 
 65 |                       for x in clusterer.labels_]
 66 |     cluster_member_colors = [sns.desaturate(x, p) for x, p in 
 67 |                              zip(cluster_colors, clusterer.probabilities_)]
 68 |     plt.scatter(*projection.T, s=50, linewidth=0, c=cluster_member_colors, alpha=0.25)
 69 | 
 70 | .. image:: images/parameter_selection_9_1.png
 71 | 
 72 | 
 73 | Doubling the ``min_cluster_size`` again to 60 gives us just two clusters
 74 | -- the really core clusters. This is somewhat as expected, but surely
 75 | some of the other clusters that we had previously had more than 60
 76 | members? Why are they being considered noise? The answer is that
 77 | HDBSCAN\* has a second parameter ``min_samples``. The implementation
 78 | defaults this value (if it is unspecified) to whatever
 79 | ``min_cluster_size`` is set to. We can recover some of our original
 80 | clusters by explicitly providing ``min_samples`` at the original value
 81 | of 15.
 82 | 
 83 | .. code:: python
 84 | 
 85 |     clusterer = hdbscan.HDBSCAN(min_cluster_size=60).fit(data)
 86 |     color_palette = sns.color_palette('Paired', 12)
 87 |     cluster_colors = [color_palette[x] if x >= 0 
 88 |                       else (0.5, 0.5, 0.5) 
 89 |                       for x in clusterer.labels_]
 90 |     cluster_member_colors = [sns.desaturate(x, p) for x, p in 
 91 |                              zip(cluster_colors, clusterer.probabilities_)]
 92 |     plt.scatter(*projection.T, s=50, linewidth=0, c=cluster_member_colors, alpha=0.25)
 93 | 
 94 | 
 95 | .. image:: images/parameter_selection_11_1.png
 96 | 
 97 | 
 98 | .. code:: python
 99 | 
100 |     clusterer = hdbscan.HDBSCAN(min_cluster_size=60, min_samples=15).fit(data)
101 |     color_palette = sns.color_palette('Paired', 12)
102 |     cluster_colors = [color_palette[x] if x >= 0 
103 |                       else (0.5, 0.5, 0.5) 
104 |                       for x in clusterer.labels_]
105 |     cluster_member_colors = [sns.desaturate(x, p) for x, p in 
106 |                              zip(cluster_colors, clusterer.probabilities_)]
107 |     plt.scatter(*projection.T, s=50, linewidth=0, c=cluster_member_colors, alpha=0.25)
108 | 
109 | .. image:: images/parameter_selection_12_1.png
110 | 
111 | 
112 | As you can see this results in us recovering something much closer to
113 | our original clustering, only now with some of the smaller clusters
114 | pruned out. Thus ``min_cluster_size`` does behave more closely to our
115 | intuitions, but only if we fix ``min_samples``. 
116 | 
117 |     If you wish to explore different ``min_cluster_size`` settings with 
118 |     a fixed ``min_samples`` value, especially for larger dataset sizes, 
119 |     you can cache the hard computation, and recompute only the relatively
120 |     cheap flat cluster extraction using the ``memory`` parameter, which 
121 |     makes use of `joblib <https://pythonhosted.org/joblib/>`_
122 | 
123 | .. _min_samples_label:
124 | 
125 | Selecting ``min_samples``
126 | -----------------------
127 | 
128 | Since we have seen that ``min_samples`` clearly has a dramatic effect on
129 | clustering, the question becomes: how do we select this parameter? The
130 | simplest intuition for what ``min_samples`` does is provide a measure of
131 | how conservative you want your clustering to be. The larger the value of
132 | ``min_samples`` you provide, the more conservative the clustering --
133 | more points will be declared as noise, and clusters will be restricted
134 | to progressively more dense areas. We can see this in practice by
135 | leaving the ``min_cluster_size`` at 60, but reducing ``min_samples`` to
136 | 1.
137 | 
138 |     Note: adjusting ``min_samples`` will result in recomputing the **hard 
139 |     comptuation** of the single linkage tree.
140 |     
141 | .. code:: python
142 | 
143 |     clusterer = hdbscan.HDBSCAN(min_cluster_size=60, min_samples=1).fit(data)
144 |     color_palette = sns.color_palette('Paired', 12)
145 |     cluster_colors = [color_palette[x] if x >= 0 
146 |                       else (0.5, 0.5, 0.5) 
147 |                       for x in clusterer.labels_]
148 |     cluster_member_colors = [sns.desaturate(x, p) for x, p in 
149 |                              zip(cluster_colors, clusterer.probabilities_)]
150 |     plt.scatter(*projection.T, s=50, linewidth=0, c=cluster_member_colors, alpha=0.25)
151 | 
152 | 
153 | 
154 | 
155 | .. parsed-literal::
156 | 
157 |     <matplotlib.collections.PathCollection at 0x120978438>
158 | 
159 | 
160 | 
161 | 
162 | .. image:: images/parameter_selection_15_1.png
163 | 
164 | 
165 | Now most points are clustered, and there are much fewer noise points.
166 | Steadily increasing ``min_samples`` will, as we saw in the examples
167 | above, make the clustering progressively more conservative, culminating
168 | in the example above where ``min_samples`` was set to 60 and we had only
169 | two clusters with most points declared as noise.
170 | 
171 | .. _epsilon_label:
172 | 
173 | Selecting ``cluster_selection_epsilon``
174 | ---------------------------------------
175 | 
176 | In some cases, we want to choose a small ``min_cluster_size`` because even groups of few points might be of interest to us.
177 | However, if our data set also contains partitions with high concentrations of objects, this parameter setting can result in
178 | a large number of micro-clusters. Selecting a value for ``cluster_selection_epsilon`` helps us to merge clusters in these regions.
179 | Or in other words, it ensures that clusters below the given threshold are not split up any further.
180 | 
181 | The choice of ``cluster_selection_epsilon`` depends on the given distances between your data points. For example, set the value to 0.5 if you don't want to
182 | separate clusters that are less than 0.5 units apart. This will basically extract DBSCAN* clusters for epsilon = 0.5 from the condensed cluster tree, but leave
183 | HDBSCAN* clusters that emerged at distances greater than 0.5 untouched. See :doc:`how_to_use_epsilon` for a more detailed demonstration of the effect this parameter
184 | has on the resulting clustering.
185 | 
186 | .. _alpha_label:
187 | 
188 | Selecting ``alpha``
189 | -----------------
190 | 
191 | A further parameter that effects the resulting clustering is ``alpha``.
192 | In practice it is best not to mess with this parameter -- ultimately it
193 | is part of the ``RobustSingleLinkage`` code, but flows naturally into
194 | HDBSCAN\*. If, for some reason, ``min_samples`` or ``cluster_selection_epsilon`` is not providing you
195 | what you need, stop, rethink things, and try again with ``min_samples`` or ``cluster_selection_epsilon``.
196 | If you still need to play with another parameter (and you shouldn't),
197 | then you can try setting ``alpha``. The ``alpha`` parameter provides a
198 | slightly different approach to determining how conservative the
199 | clustering is. By default ``alpha`` is set to 1.0. Increasing ``alpha``
200 | will make the clustering more conservative, but on a much tighter scale,
201 | as we can see by setting ``alpha`` to 1.3.
202 | 
203 |     Note: adjusting ``alpha`` will result in recomputing the **hard 
204 |     comptuation** of the single linkage tree.
205 | 
206 | .. code:: python
207 | 
208 |     clusterer = hdbscan.HDBSCAN(min_cluster_size=60, min_samples=15, alpha=1.3).fit(data)
209 |     color_palette = sns.color_palette('Paired', 12)
210 |     cluster_colors = [color_palette[x] if x >= 0 
211 |                       else (0.5, 0.5, 0.5) 
212 |                       for x in clusterer.labels_]
213 |     cluster_member_colors = [sns.desaturate(x, p) for x, p in 
214 |                              zip(cluster_colors, clusterer.probabilities_)]
215 |     plt.scatter(*projection.T, s=50, linewidth=0, c=cluster_member_colors, alpha=0.25)
216 | 
217 | .. image:: images/parameter_selection_18_1.png
218 | 
219 | 
220 | .. _leaf_clustering_label:
221 | 
222 | Leaf clustering
223 | ---------------
224 | 
225 | HDBSCAN supports an extra parameter ``cluster_selection_method`` to determine
226 | how it selects flat clusters from the cluster tree hierarchy. The default
227 | method is ``'eom'`` for Excess of Mass, the algorithm described in
228 | :doc:`how_hdbscan_works`. This is not always the most desireable approach to
229 | cluster selection. If you are more interested in having small homogeneous
230 | clusters then you may find Excess of Mass has a tendency to pick one or two
231 | large clusters and then a number of small extra clusters. In this situation
232 | you may be tempted to recluster just the data in the single large cluster.
233 | Instead, a better option is to select ``'leaf'`` as a cluster selection
234 | method. This will select leaf nodes from the tree, producing many small
235 | homogeneous clusters. Note that you can still get variable density clusters
236 | via this method, and it is also still possible to get large clusters, but
237 | there will be a tendency to produce a more fine grained clustering than
238 | Excess of Mass can provide.
239 | 
240 | .. _single_cluster_label:
241 | 
242 | Allowing a single cluster
243 | -------------------------
244 | 
245 | In contrast, if you are getting lots of small clusters, but believe there
246 | should be some larger scale structure (or the possibility of no structure),
247 | consider the ``allow_single_cluster`` option. By default HDBSCAN\* does not
248 | allow a single cluster to be returned -- this is due to how the Excess of
249 | Mass algorithm works, and a bias towards the root cluster that may occur. You
250 | can override this behaviour and see what clustering would look like if you
251 | allow a single cluster to be returned. This can alleviate issue caused by
252 | there only being a single large cluster, or by data that is essentially just
253 | noise. For example, the image below shows the effects of setting
254 | ``allow_single_cluster=True`` in the bottom row, compared to the top row
255 | which used default settings.
256 | 
257 | .. image:: images/allow_single_cluster.png
258 | 


--------------------------------------------------------------------------------
/docs/prediction_tutorial.rst:
--------------------------------------------------------------------------------
  1 | 
  2 | Predicting clusters for new points
  3 | ==================================
  4 | 
  5 | Often it is useful to train a model once on a large amount of data, and
  6 | then query the model repeatedly with small amounts of new data. This is
  7 | hard for HDBSCAN\* as it is a transductive method -- new data points
  8 | can (and should!) be able to alter the underlying clustering. That is,
  9 | given new information it might make sense to create a new cluster, split
 10 | an existing cluster, or merge two previously separate clusters. If the
 11 | actual clusters (and hence their labels) change with each new data point
 12 | it becomes impossible to compare the cluster assignments between such
 13 | queries.
 14 | 
 15 | We can accommodate this by effectively holding a clustering fixed (after
 16 | a potentially expensive training run) and then asking: *if we do not
 17 | change the existing clusters* which cluster would HDBSCAN\* assign a new
 18 | data point to. In practice this amounts to determining where in the
 19 | condensed tree the new data point would fall (see
 20 | :any:`how_hdbscan_works`) assuming we do not change the condensed
 21 | tree. This allows for a very inexpensive operation to compute a
 22 | predicted cluster for the new data point.
 23 | 
 24 | This has been implemented in ``hdbscan`` as the
 25 | :py:func:`~hdbscan.predict.approximate_predict` function. We'll look
 26 | at how this works below.
 27 | 
 28 | As usual we begin with our test synthetic data set, and cluster it with
 29 | HDBSCAN. The primary point to note here, however, is the use of the
 30 | ``prediction_data=True`` keyword argument. This ensures that HDBSCAN
 31 | does a little extra computation when fitting the model that can
 32 | dramatically speed up the prediction queries later.
 33 | 
 34 | You can also get an HDBSCAN object to create this data after the fact
 35 | via the :py:meth:`~hdbscan.HDBSCAN.generate_prediction_data` method.
 36 | 
 37 | .. code:: python
 38 | 
 39 |     data = np.load('clusterable_data.npy')
 40 |     clusterer = hdbscan.HDBSCAN(min_cluster_size=15, prediction_data=True).fit(data)
 41 |     pal = sns.color_palette('deep', 8)
 42 |     colors = [sns.desaturate(pal[col], sat) for col, sat in zip(clusterer.labels_, 
 43 |                                                                 clusterer.probabilities_)]
 44 |     plt.scatter(data.T[0], data.T[1], c=colors, **plot_kwds);
 45 | 
 46 | 
 47 | 
 48 | .. image:: images/prediction_tutorial_3_0.png
 49 | 
 50 | 
 51 | Now to make things a little more interesting let's generate 50 new data
 52 | points scattered across the data. We can plot them in black to see where
 53 | they happen to fall.
 54 | 
 55 | .. code:: python
 56 | 
 57 |     test_points = np.random.random(size=(50, 2)) - 0.5
 58 |     
 59 |     colors = [sns.desaturate(pal[col], sat) for col, sat in zip(clusterer.labels_, 
 60 |                                                                 clusterer.probabilities_)]
 61 |     plt.scatter(data.T[0], data.T[1], c=colors, **plot_kwds);
 62 |     plt.scatter(*test_points.T, c='k', s=50)
 63 | 
 64 | 
 65 | 
 66 | .. image:: images/prediction_tutorial_5_1.png
 67 | 
 68 | 
 69 | We can use the predict API on this data, calling
 70 | :py:func:`~hdbscan.predict.approximate_predict` with the HDBSCAN object,
 71 | and the numpy array of new points. Note that
 72 | :py:func:`~hdbscan.predict.approximate_predict` takes an *array* of new
 73 | points. If you have a single point be sure to wrap it in a list.
 74 | 
 75 | .. code:: python
 76 | 
 77 |     test_labels, strengths = hdbscan.approximate_predict(clusterer, test_points)
 78 |     test_labels
 79 | 
 80 | 
 81 | 
 82 | 
 83 | .. parsed-literal::
 84 | 
 85 |     array([ 2, -1, -1, -1, -1, -1,  1,  5, -1, -1,  5, -1, -1, -1, -1,  4, -1,
 86 |            -1, -1, -1, -1,  4, -1, -1, -1, -1,  2, -1, -1,  1, -1, -1, -1,  0,
 87 |            -1,  2, -1, -1,  3, -1, -1,  1, -1, -1, -1, -1, -1,  5,  3,  2])
 88 | 
 89 | 
 90 | 
 91 | The result is a set of labels as you can see. Many of the points as
 92 | classified as noise, but several are also assigned to clusters. This is
 93 | a very fast operation, even with large datasets, as long the HDBSCAN
 94 | object has the prediction data generated beforehand.
 95 | 
 96 | We can also visualize how this worked, coloring the new data points by
 97 | the cluster to which they were assigned. I have added black border
 98 | around the points so they don't get lost inside the clusters they fall
 99 | into.
100 | 
101 | .. code:: python
102 | 
103 |     colors = [sns.desaturate(pal[col], sat) for col, sat in zip(clusterer.labels_, 
104 |                                                                 clusterer.probabilities_)]
105 |     test_colors = [pal[col] if col >= 0 else (0.1, 0.1, 0.1) for col in test_labels]
106 |     plt.scatter(data.T[0], data.T[1], c=colors, **plot_kwds);
107 |     plt.scatter(*test_points.T, c=test_colors, s=80, linewidths=1, edgecolors='k')
108 | 
109 | 
110 | 
111 | .. image:: images/prediction_tutorial_9_1.png
112 | 
113 | 
114 | It is as simple as that. So now you can get started using HDBSCAN as a
115 | streaming clustering service -- just be sure to cache your data and
116 | retrain your model periodically to avoid drift!
117 | 
118 | 


--------------------------------------------------------------------------------
/docs/soft_clustering.rst:
--------------------------------------------------------------------------------
  1 | 
  2 | Soft Clustering for HDBSCAN\*
  3 | =============================
  4 | 
  5 | Soft clustering is a new (and still somewhat experimental) feature of
  6 | the hdbscan library. It takes advantage of the fact that the condensed
  7 | tree is a kind of smoothed density function over data points, and the
  8 | notion of exemplars for clusters. If you want to better understand how
  9 | soft clustering works please refer to :any:`soft_clustering_explanation`.
 10 | 
 11 | Let's consider the digits dataset from sklearn. We can project the data
 12 | into two dimensions to visualize it via t-SNE.
 13 | 
 14 | .. code:: python
 15 | 
 16 |     from sklearn import datasets
 17 |     from sklearn.manifold import TSNE
 18 |     import matplotlib.pyplot as plt
 19 |     import seaborn as sns
 20 |     import numpy as np
 21 | 
 22 | .. code:: python
 23 | 
 24 |     digits = datasets.load_digits()
 25 |     data = digits.data
 26 |     projection = TSNE().fit_transform(data)
 27 |     plt.scatter(*projection.T, **plot_kwds)
 28 | 
 29 | 
 30 | .. image:: images/soft_clustering_3_1.png
 31 | 
 32 | 
 33 | Now we import hdbscan and then cluster in the full 64 dimensional space.
 34 | It is important to note that, if we wish to use the soft clustering we
 35 | should use the ``prediction_data=True`` option for HDBSCAN. This will
 36 | ensure we generate the extra data required that will allow soft
 37 | clustering to work.
 38 | 
 39 | .. code:: python
 40 | 
 41 |     import hdbscan
 42 | 
 43 | .. code:: python
 44 | 
 45 |     clusterer = hdbscan.HDBSCAN(min_cluster_size=10, prediction_data=True).fit(data)
 46 |     color_palette = sns.color_palette('Paired', 12)
 47 |     cluster_colors = [color_palette[x] if x >= 0 
 48 |                       else (0.5, 0.5, 0.5) 
 49 |                       for x in clusterer.labels_]
 50 |     cluster_member_colors = [sns.desaturate(x, p) for x, p in 
 51 |                              zip(cluster_colors, clusterer.probabilities_)]
 52 |     plt.scatter(*projection.T, s=50, linewidth=0, c=cluster_member_colors, alpha=0.25)
 53 | 
 54 | 
 55 | 
 56 | .. image:: images/soft_clustering_6_1.png
 57 | 
 58 | 
 59 | Certainly a number of clusters were found, but the data is fairly noisy
 60 | in 64 dimensions, so there are a number of points that have been
 61 | classified as noise. We can generate a soft clustering to get more
 62 | information about some of these noise points.
 63 | 
 64 | To generate a soft clustering for all the points in the original dataset
 65 | we use the
 66 | :py:func:`~hdbscan.prediction.all_points_membership_vectors` function
 67 | which takes a clusterer object. If we wanted to get soft cluster
 68 | membership values for a set of new unseen points we could use
 69 | :py:func:`~hdbscan.prediction.membership_vector` instead.
 70 | 
 71 | The return value is a two-dimensional numpy array. Each point of the
 72 | input data is assigned a vector of probabilities of being in a cluster.
 73 | For a first pass we can visualize the data looking at what the *most
 74 | likely* cluster was, by coloring according to the ``argmax`` of the
 75 | probability vector (i.e. the cluster for which a given point has the
 76 | highest probability of being in).
 77 | 
 78 | .. code:: python
 79 | 
 80 |     soft_clusters = hdbscan.all_points_membership_vectors(clusterer)
 81 |     color_palette = sns.color_palette('Paired', 12)
 82 |     cluster_colors = [color_palette[np.argmax(x)]
 83 |                       for x in soft_clusters]
 84 |     plt.scatter(*projection.T, s=50, linewidth=0, c=cluster_colors, alpha=0.25)
 85 | 
 86 | 
 87 | 
 88 | .. image:: images/soft_clustering_8_1.png
 89 | 
 90 | 
 91 | This fills out the clusters nicely -- we see that there were many noise
 92 | points that are most likely to belong to the clusters we would expect;
 93 | we can also see where things have gotten confused in the middle, and
 94 | there is a mix of cluster assignments.
 95 | 
 96 | We are still only using part of the information however; we can
 97 | desaturate according to the actual probability value for the most likely
 98 | cluster.
 99 | 
100 | .. code:: python
101 | 
102 |     color_palette = sns.color_palette('Paired', 12)
103 |     cluster_colors = [sns.desaturate(color_palette[np.argmax(x)], np.max(x))
104 |                       for x in soft_clusters]
105 |     plt.scatter(*projection.T, s=50, linewidth=0, c=cluster_colors, alpha=0.25)
106 | 
107 | 
108 | 
109 | .. image:: images/soft_clustering_10_1.png
110 | 
111 | 
112 | We see that many points actually have a low probability of being in the
113 | cluster -- indeed the soft clustering applies *within* a cluster, so
114 | only the very cores of each cluster have high probabilities. In practice
115 | desaturating is a fairly string treatment; visually a lot will look
116 | gray. We could apply a function and put a lower limit on the
117 | desaturation that meets better with human visual perception, but that is
118 | left as an exercise for the reader.
119 | 
120 | Instead we'll explore what else we can learn about the data from these
121 | cluster membership probabilities. An interesting question is which
122 | points have high likelihoods for *two* clusters (and low likelihoods for
123 | the other clusters).
124 | 
125 | .. code:: python
126 | 
127 |     def top_two_probs_diff(probs):
128 |         sorted_probs = np.sort(probs)
129 |         return sorted_probs[-1] - sorted_probs[-2]
130 |     
131 |     # Compute the differences between the top two probabilities
132 |     diffs = np.array([top_two_probs_diff(x) for x in soft_clusters])
133 |     # Select out the indices that have a small difference, and a larger total probability
134 |     mixed_points = np.where((diffs < 0.001) & (np.sum(soft_clusters, axis=1) > 0.5))[0]
135 | 
136 | .. code:: python
137 | 
138 |     colors = [(0.75, 0.1, 0.1) if x in mixed_points 
139 |               else (0.5, 0.5, 0.5) for x in range(data.shape[0])]
140 |     plt.scatter(*projection.T, s=50, linewidth=0, c=colors, alpha=0.5)
141 | 
142 | 
143 | 
144 | 
145 | .. image:: images/soft_clustering_13_1.png
146 | 
147 | 
148 | We can look at a few of these and see that many are, indeed, hard to
149 | classify (even for humans). It also seems that 8 was not assigned a
150 | cluster and is seen as a mixture of other clusters.
151 | 
152 | .. code:: python
153 | 
154 |     fig = plt.figure()
155 |     for i, image in enumerate(digits.images[mixed_points][:16]):
156 |         ax = fig.add_subplot(4,4,i+1)
157 |         ax.imshow(image)
158 |     plt.tight_layout()
159 | 
160 | 
161 | 
162 | .. image:: images/soft_clustering_15_0.png
163 | 
164 | 
165 | There is, of course, a lot more analysis that can be done from here, but
166 | hopefully this provides sufficient introduction to what can be achieved
167 | with soft clustering.
168 | 
169 | 


--------------------------------------------------------------------------------
/environment.yml:
--------------------------------------------------------------------------------
1 | name: hdbscan
2 | dependencies:
3 | - python>=3.5
4 | - scikit-learn>=0.19
5 | - matplotlib>=2.0
6 | - seaborn>=0.8
7 | - hdbscan>=0.8.11
8 | 


--------------------------------------------------------------------------------
/examples/plot_cluster_comparison.py:
--------------------------------------------------------------------------------
  1 | """
  2 | =========================================================
  3 | Comparing different clustering algorithms on toy datasets
  4 | =========================================================
  5 | 
  6 | This example aims at showing characteristics of different
  7 | clustering algorithms on datasets that are "interesting"
  8 | but still in 2D. The last dataset is an example of a 'null'
  9 | situation for clustering: the data is homogeneous, and
 10 | there is no good clustering.
 11 | 
 12 | While these examples give some intuition about the algorithms,
 13 | this intuition might not apply to very high dimensional data.
 14 | 
 15 | The results could be improved by tweaking the parameters for
 16 | each clustering strategy, for instance setting the number of
 17 | clusters for the methods that needs this parameter
 18 | specified. Note that affinity propagation has a tendency to
 19 | create many clusters. Thus in this example its two parameters
 20 | (damping and per-point preference) were set to to mitigate this
 21 | behavior.
 22 | """
 23 | print(__doc__)
 24 | 
 25 | import time
 26 | 
 27 | import numpy as np
 28 | import matplotlib.pyplot as plt
 29 | 
 30 | from sklearn import cluster, datasets
 31 | from sklearn.neighbors import kneighbors_graph
 32 | from sklearn.preprocessing import StandardScaler
 33 | 
 34 | import hdbscan
 35 | 
 36 | np.random.seed(0)
 37 | plt.style.use('fivethirtyeight')
 38 | 
 39 | def make_var_density_blobs(n_samples=750, centers=[[0,0]], cluster_std=[0.5], random_state=0):
 40 |     samples_per_blob = n_samples // len(centers)
 41 |     blobs = [datasets.make_blobs(n_samples=samples_per_blob, centers=[c], cluster_std=cluster_std[i])[0]
 42 |              for i, c in enumerate(centers)]
 43 |     labels = [i * np.ones(samples_per_blob) for i in range(len(centers))]
 44 |     return np.vstack(blobs), np.hstack(labels)
 45 | 
 46 | # Generate datasets. We choose the size big enough to see the scalability
 47 | # of the algorithms, but not too big to avoid too long running times
 48 | n_samples = 1500
 49 | noisy_circles = datasets.make_circles(n_samples=n_samples, factor=.5,
 50 |                                       noise=.08)
 51 | noisy_moons = datasets.make_moons(n_samples=n_samples, noise=.10)
 52 | blobs = datasets.make_blobs(n_samples=n_samples-200, random_state=8)
 53 | noisy_blobs = np.vstack((blobs[0], 25.0*np.random.rand(200, 2)-[10.0,10.0])), np.hstack((blobs[1], -1*np.ones(200))) 
 54 | varying_blobs = make_var_density_blobs(n_samples,
 55 |                                        centers=[[1, 1],
 56 |                                                 [-1, -1],
 57 |                                                 [1, -1]],
 58 |                                        cluster_std=[0.2, 0.35, 0.5])
 59 | no_structure = np.random.rand(n_samples, 2), None
 60 | 
 61 | colors = np.array([x for x in 'bgrcmykbgrcmykbgrcmykbgrcmyk'])
 62 | colors = np.hstack([colors] * 20)
 63 | 
 64 | clustering_names = [
 65 |     'MiniBatchKMeans', 'AffinityPropagation',
 66 |     'SpectralClustering', 'AgglomerativeClustering',
 67 |     'DBSCAN', 'HDBSCAN']
 68 | 
 69 | plt.figure(figsize=(len(clustering_names) * 2 + 3, 9.5))
 70 | plt.subplots_adjust(left=.02, right=.98, bottom=.001, top=.96, wspace=.05,
 71 |                     hspace=.01)
 72 | 
 73 | plot_num = 1
 74 | 
 75 | datasets = [noisy_circles, noisy_moons, noisy_blobs, varying_blobs, no_structure]
 76 | for i_dataset, dataset in enumerate(datasets):
 77 |     X, y = dataset
 78 |     # normalize dataset for easier parameter selection
 79 |     X = StandardScaler().fit_transform(X)
 80 | 
 81 |     # estimate bandwidth for mean shift
 82 |     bandwidth = cluster.estimate_bandwidth(X, quantile=0.3)
 83 | 
 84 |     # connectivity matrix for structured Ward
 85 |     connectivity = kneighbors_graph(X, n_neighbors=10, include_self=False)
 86 |     # make connectivity symmetric
 87 |     connectivity = 0.5 * (connectivity + connectivity.T)
 88 | 
 89 |     # create clustering estimators
 90 |     two_means = cluster.MiniBatchKMeans(n_clusters=2)
 91 |     spectral = cluster.SpectralClustering(n_clusters=2,
 92 |                                           eigen_solver='arpack',
 93 |                                           affinity="nearest_neighbors")
 94 |     dbscan = cluster.DBSCAN(eps=.2)
 95 |     affinity_propagation = cluster.AffinityPropagation(damping=.9,
 96 |                                                        preference=-200)
 97 | 
 98 |     average_linkage = cluster.AgglomerativeClustering(
 99 |         linkage="average", affinity="cityblock", n_clusters=2,
100 |         connectivity=connectivity)
101 | 
102 |     hdbscanner = hdbscan.HDBSCAN()
103 |     clustering_algorithms = [
104 |         two_means, affinity_propagation, spectral, average_linkage,
105 |         dbscan, hdbscanner]
106 | 
107 |     for name, algorithm in zip(clustering_names, clustering_algorithms):
108 |         # predict cluster memberships
109 |         t0 = time.time()
110 |         algorithm.fit(X)
111 |         t1 = time.time()
112 |         if hasattr(algorithm, 'labels_'):
113 |             y_pred = algorithm.labels_.astype(np.int)
114 |         else:
115 |             y_pred = algorithm.predict(X)
116 | 
117 |         # plot
118 |         plt.subplot(5, len(clustering_algorithms), plot_num)
119 |         if i_dataset == 0:
120 |             plt.title(name, size=18)
121 |         plt.scatter(X[:, 0], X[:, 1], color=colors[y_pred].tolist(), s=10)
122 | 
123 |         if hasattr(algorithm, 'cluster_centers_'):
124 |             centers = algorithm.cluster_centers_
125 |             center_colors = colors[:len(centers)]
126 |             plt.scatter(centers[:, 0], centers[:, 1], s=100, c=center_colors)
127 |         plt.xlim(-2, 2)
128 |         plt.ylim(-2, 2)
129 |         plt.xticks(())
130 |         plt.yticks(())
131 |         plt.text(.99, .01, ('%.2fs' % (t1 - t0)).lstrip('0'),
132 |                  transform=plt.gca().transAxes, size=15,
133 |                  horizontalalignment='right')
134 |         plot_num += 1
135 | 
136 | plt.show()
137 | 


--------------------------------------------------------------------------------
/examples/plot_hdbscan.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | ===================================
  4 | Demo of HDBSCAN clustering algorithm
  5 | ===================================
  6 | 
  7 | Finds a clustering that has the greatest stability over a range
  8 | of epsilon values for standard DBSCAN. This allows clusterings
  9 | of different densities unlike DBSCAN.
 10 | 
 11 | """
 12 | print(__doc__)
 13 | 
 14 | import numpy as np
 15 | 
 16 | from hdbscan import HDBSCAN
 17 | from sklearn.cluster import DBSCAN
 18 | from sklearn import metrics
 19 | from sklearn.datasets.samples_generator import make_blobs
 20 | from sklearn.preprocessing import StandardScaler
 21 | 
 22 | import time
 23 | 
 24 | def make_var_density_blobs(n_samples=750, centers=[[0,0]], cluster_std=[0.5], random_state=0):
 25 |     samples_per_blob = n_samples // len(centers)
 26 |     blobs = [make_blobs(n_samples=samples_per_blob, centers=[c], cluster_std=cluster_std[i])[0]
 27 |              for i, c in enumerate(centers)]
 28 |     labels = [i * np.ones(samples_per_blob) for i in range(len(centers))]
 29 |     return np.vstack(blobs), np.hstack(labels)
 30 |         
 31 | 
 32 | ##############################################################################
 33 | # Generate sample data
 34 | centers = [[1, 1], [-1, -1], [1, -1]]
 35 | densities = [0.2, 0.35, 0.5]
 36 | X, labels_true = make_var_density_blobs(n_samples=750, centers=centers, cluster_std=densities,
 37 |                             random_state=0)
 38 | 
 39 | X = StandardScaler().fit_transform(X)
 40 | 
 41 | ##############################################################################
 42 | # Compute DBSCAN
 43 | hdb_t1 = time.time()
 44 | hdb = HDBSCAN(min_cluster_size=10).fit(X)
 45 | hdb_labels = hdb.labels_
 46 | hdb_elapsed_time = time.time() - hdb_t1
 47 | 
 48 | db_t1 = time.time()
 49 | db = DBSCAN(eps=0.1).fit(X)
 50 | db_labels = db.labels_
 51 | db_elapsed_time = time.time() - db_t1
 52 | 
 53 | # Number of clusters in labels, ignoring noise if present.
 54 | n_clusters_hdb_ = len(set(hdb_labels)) - (1 if -1 in hdb_labels else 0)
 55 | 
 56 | print('\n\n++ HDBSCAN Results')
 57 | print('Estimated number of clusters: %d' % n_clusters_hdb_)
 58 | print('Elapsed time to cluster: %.4f s' % hdb_elapsed_time)
 59 | print('Homogeneity: %0.3f' % metrics.homogeneity_score(labels_true, hdb_labels))
 60 | print('Completeness: %0.3f' % metrics.completeness_score(labels_true, hdb_labels))
 61 | print('V-measure: %0.3f' % metrics.v_measure_score(labels_true, hdb_labels))
 62 | print('Adjusted Rand Index: %0.3f'
 63 |       % metrics.adjusted_rand_score(labels_true, hdb_labels))
 64 | print('Adjusted Mutual Information: %0.3f'
 65 |       % metrics.adjusted_mutual_info_score(labels_true, hdb_labels))
 66 | print('Silhouette Coefficient: %0.3f'
 67 |       % metrics.silhouette_score(X, hdb_labels))
 68 | 
 69 | n_clusters_db_ = len(set(db_labels)) - (1 if -1 in db_labels else 0)
 70 | 
 71 | print('\n\n++ DBSCAN Results')
 72 | print('Estimated number of clusters: %d' % n_clusters_db_)
 73 | print('Elapsed time to cluster: %.4f s' % db_elapsed_time)
 74 | print('Homogeneity: %0.3f' % metrics.homogeneity_score(labels_true, db_labels))
 75 | print('Completeness: %0.3f' % metrics.completeness_score(labels_true, db_labels))
 76 | print('V-measure: %0.3f' % metrics.v_measure_score(labels_true, db_labels))
 77 | print('Adjusted Rand Index: %0.3f'
 78 |       % metrics.adjusted_rand_score(labels_true, db_labels))
 79 | print('Adjusted Mutual Information: %0.3f'
 80 |       % metrics.adjusted_mutual_info_score(labels_true, db_labels))
 81 | if n_clusters_db_ > 1:
 82 |     print('Silhouette Coefficient: %0.3f'
 83 |           % metrics.silhouette_score(X, db_labels))
 84 | else:
 85 |     print('Silhouette Coefficient: NaN (too few clusters)')
 86 | 
 87 | ##############################################################################
 88 | # Plot result
 89 | import matplotlib.pyplot as plt
 90 | 
 91 | # Black removed and is used for noise instead.
 92 | hdb_unique_labels = set(hdb_labels)
 93 | db_unique_labels = set(db_labels)
 94 | hdb_colors = plt.cm.Spectral(np.linspace(0, 1, len(hdb_unique_labels)))
 95 | db_colors = plt.cm.Spectral(np.linspace(0, 1, len(db_unique_labels)))
 96 | fig = plt.figure(figsize=plt.figaspect(0.5))
 97 | hdb_axis = fig.add_subplot('121')
 98 | db_axis = fig.add_subplot('122')
 99 | for k, col in zip(hdb_unique_labels, hdb_colors):
100 |     if k == -1:
101 |         # Black used for noise.
102 |         col = 'k'
103 | 
104 |     hdb_axis.plot(X[hdb_labels == k, 0], X[hdb_labels == k, 1], 'o', markerfacecolor=col,
105 |                   markeredgecolor='k', markersize=6)
106 | for k, col in zip(db_unique_labels, db_colors):
107 |     if k == -1:
108 |         # Black used for noise.
109 |         col = 'k'
110 | 
111 |     db_axis.plot(X[db_labels == k, 0], X[db_labels == k, 1], 'o', markerfacecolor=col,
112 |                   markeredgecolor='k', markersize=6)
113 | 
114 | hdb_axis.set_title('HDBSCAN\nEstimated number of clusters: %d' % n_clusters_hdb_)
115 | db_axis.set_title('DBSCAN\nEstimated number of clusters: %d' % n_clusters_db_)
116 | plt.show()
117 | 


--------------------------------------------------------------------------------
/hdbscan/__init__.py:
--------------------------------------------------------------------------------
 1 | from .hdbscan_ import HDBSCAN, hdbscan
 2 | from .robust_single_linkage_ import RobustSingleLinkage, robust_single_linkage
 3 | from .validity import validity_index
 4 | from .prediction import (approximate_predict,
 5 |                          membership_vector,
 6 |                          all_points_membership_vectors,
 7 |                          approximate_predict_scores)
 8 | from .branches import (BranchDetector, 
 9 |                        detect_branches_in_clusters, 
10 |                        approximate_predict_branch)
11 | 
12 | 
13 | 


--------------------------------------------------------------------------------
/hdbscan/_hdbscan_linkage.pyx:
--------------------------------------------------------------------------------
  1 | # cython: boundscheck=False
  2 | # cython: nonecheck=False
  3 | # Minimum spanning tree single linkage implementation for hdbscan
  4 | # Authors: Leland McInnes, Steve Astels
  5 | # License: 3-clause BSD
  6 | 
  7 | import numpy as np
  8 | cimport numpy as np
  9 | 
 10 | from libc.float cimport DBL_MAX
 11 | 
 12 | from hdbscan.dist_metrics cimport DistanceMetric
 13 | 
 14 | 
 15 | cpdef np.ndarray[np.double_t, ndim=2] mst_linkage_core(
 16 |                                np.ndarray[np.double_t,
 17 |                                           ndim=2] distance_matrix):
 18 | 
 19 |     cdef np.ndarray[np.intp_t, ndim=1] node_labels
 20 |     cdef np.ndarray[np.intp_t, ndim=1] current_labels
 21 |     cdef np.ndarray[np.double_t, ndim=1] current_distances
 22 |     cdef np.ndarray[np.double_t, ndim=1] left
 23 |     cdef np.ndarray[np.double_t, ndim=1] right
 24 |     cdef np.ndarray[np.double_t, ndim=2] result
 25 | 
 26 |     cdef np.ndarray label_filter
 27 | 
 28 |     cdef np.intp_t current_node
 29 |     cdef np.intp_t new_node_index
 30 |     cdef np.intp_t new_node
 31 |     cdef np.intp_t i
 32 | 
 33 |     result = np.zeros((distance_matrix.shape[0] - 1, 3))
 34 |     node_labels = np.arange(distance_matrix.shape[0], dtype=np.intp)
 35 |     current_node = 0
 36 |     current_distances = np.inf * np.ones(distance_matrix.shape[0])
 37 |     current_labels = node_labels
 38 |     for i in range(1, node_labels.shape[0]):
 39 |         label_filter = current_labels != current_node
 40 |         current_labels = current_labels[label_filter]
 41 |         left = current_distances[label_filter]
 42 |         right = distance_matrix[current_node][current_labels]
 43 |         current_distances = np.where(left < right, left, right)
 44 | 
 45 |         new_node_index = np.argmin(current_distances)
 46 |         new_node = current_labels[new_node_index]
 47 |         result[i - 1, 0] = <double> current_node
 48 |         result[i - 1, 1] = <double> new_node
 49 |         result[i - 1, 2] = current_distances[new_node_index]
 50 |         current_node = new_node
 51 | 
 52 |     return result
 53 | 
 54 | 
 55 | cpdef np.ndarray[np.double_t, ndim=2] mst_linkage_core_vector(
 56 |         np.ndarray[np.double_t, ndim=2, mode='c'] raw_data,
 57 |         np.ndarray[np.double_t, ndim=1, mode='c'] core_distances,
 58 |         DistanceMetric dist_metric,
 59 |         np.double_t alpha=1.0):
 60 | 
 61 |     # Add a comment
 62 |     cdef np.ndarray[np.double_t, ndim=1] current_distances_arr
 63 |     cdef np.ndarray[np.double_t, ndim=1] current_sources_arr
 64 |     cdef np.ndarray[np.int8_t, ndim=1] in_tree_arr
 65 |     cdef np.ndarray[np.double_t, ndim=2] result_arr
 66 | 
 67 |     cdef np.double_t * current_distances
 68 |     cdef np.double_t * current_sources
 69 |     cdef np.double_t * current_core_distances
 70 |     cdef np.double_t * raw_data_ptr
 71 |     cdef np.int8_t * in_tree
 72 |     cdef np.double_t[:, ::1] raw_data_view
 73 |     cdef np.double_t[:, ::1] result
 74 | 
 75 |     cdef np.ndarray label_filter
 76 | 
 77 |     cdef np.intp_t current_node
 78 |     cdef np.intp_t source_node
 79 |     cdef np.intp_t right_node, right_source
 80 |     cdef np.intp_t left_node, left_source
 81 |     cdef np.intp_t new_node
 82 |     cdef np.intp_t i
 83 |     cdef np.intp_t j
 84 |     cdef np.intp_t dim
 85 |     cdef np.intp_t num_features
 86 | 
 87 |     cdef double current_node_core_distance
 88 |     cdef double right_value
 89 |     cdef double left_value
 90 |     cdef double core_value
 91 |     cdef double new_distance
 92 | 
 93 |     dim = raw_data.shape[0]
 94 |     num_features = raw_data.shape[1]
 95 | 
 96 |     raw_data_view = (<np.double_t[:raw_data.shape[0], :raw_data.shape[1]:1]> (
 97 |         <np.double_t *> raw_data.data))
 98 |     raw_data_ptr = (<np.double_t *> &raw_data_view[0, 0])
 99 | 
100 |     result_arr = np.zeros((dim - 1, 3))
101 |     in_tree_arr = np.zeros(dim, dtype=np.int8)
102 |     current_node = 0
103 |     current_distances_arr = np.inf * np.ones(dim)
104 |     current_sources_arr = np.ones(dim)
105 | 
106 |     result = (<np.double_t[:dim - 1, :3:1]> (<np.double_t *> result_arr.data))
107 |     in_tree = (<np.int8_t *> in_tree_arr.data)
108 |     current_distances = (<np.double_t *> current_distances_arr.data)
109 |     current_sources = (<np.double_t *> current_sources_arr.data)
110 |     current_core_distances = (<np.double_t *> core_distances.data)
111 | 
112 |     for i in range(1, dim):
113 | 
114 |         in_tree[current_node] = 1
115 | 
116 |         current_node_core_distance = current_core_distances[current_node]
117 | 
118 |         new_distance = DBL_MAX
119 |         source_node = 0
120 |         new_node = 0
121 | 
122 |         for j in range(dim):
123 |             if in_tree[j]:
124 |                 continue
125 | 
126 |             right_value = current_distances[j]
127 |             right_source = <np.intp_t> current_sources[j]
128 | 
129 |             left_value = dist_metric.dist(&raw_data_ptr[num_features *
130 |                                                         current_node],
131 |                                           &raw_data_ptr[num_features * j],
132 |                                           num_features)
133 |             left_source = current_node
134 | 
135 |             if alpha != 1.0:
136 |                 left_value /= alpha
137 | 
138 |             core_value = core_distances[j]
139 |             if (current_node_core_distance > right_value or
140 |                     core_value > right_value or
141 |                     left_value > right_value):
142 |                 if right_value < new_distance:
143 |                     new_distance = right_value
144 |                     source_node = right_source
145 |                     new_node = j
146 |                 continue
147 | 
148 |             if core_value > current_node_core_distance:
149 |                 if core_value > left_value:
150 |                     left_value = core_value
151 |             else:
152 |                 if current_node_core_distance > left_value:
153 |                     left_value = current_node_core_distance
154 | 
155 |             if left_value < right_value:
156 |                 current_distances[j] = left_value
157 |                 current_sources[j] = left_source
158 |                 if left_value < new_distance:
159 |                     new_distance = left_value
160 |                     source_node = left_source
161 |                     new_node = j
162 |             else:
163 |                 if right_value < new_distance:
164 |                     new_distance = right_value
165 |                     source_node = right_source
166 |                     new_node = j
167 | 
168 |         result[i - 1, 0] = <double> source_node
169 |         result[i - 1, 1] = <double> new_node
170 |         result[i - 1, 2] = new_distance
171 |         current_node = new_node
172 | 
173 |     return result_arr
174 | 
175 | 
176 | cdef class UnionFind (object):
177 | 
178 |     cdef np.ndarray parent_arr
179 |     cdef np.ndarray size_arr
180 |     cdef np.intp_t next_label
181 |     cdef np.intp_t *parent
182 |     cdef np.intp_t *size
183 | 
184 |     def __init__(self, N):
185 |         self.parent_arr = -1 * np.ones(2 * N - 1, dtype=np.intp, order='C')
186 |         self.next_label = N
187 |         self.size_arr = np.hstack((np.ones(N, dtype=np.intp),
188 |                                    np.zeros(N-1, dtype=np.intp)))
189 |         self.parent = (<np.intp_t *> self.parent_arr.data)
190 |         self.size = (<np.intp_t *> self.size_arr.data)
191 | 
192 |     cdef void union(self, np.intp_t m, np.intp_t n):
193 |         self.size[self.next_label] = self.size[m] + self.size[n]
194 |         self.parent[m] = self.next_label
195 |         self.parent[n] = self.next_label
196 |         self.size[self.next_label] = self.size[m] + self.size[n]
197 |         self.next_label += 1
198 | 
199 |         return
200 | 
201 |     cdef np.intp_t fast_find(self, np.intp_t n):
202 |         cdef np.intp_t p
203 |         p = n
204 |         while self.parent_arr[n] != -1:
205 |             n = self.parent_arr[n]
206 |         # label up to the root
207 |         while self.parent_arr[p] != n:
208 |             p, self.parent_arr[p] = self.parent_arr[p], n
209 |         return n
210 | 
211 | 
212 | cpdef np.ndarray[np.double_t, ndim=2] label(np.ndarray[np.double_t, ndim=2] L):
213 | 
214 |     cdef np.ndarray[np.double_t, ndim=2] result_arr
215 |     cdef np.double_t[:, ::1] result
216 | 
217 |     cdef np.intp_t N, a, aa, b, bb, index
218 |     cdef np.double_t delta
219 | 
220 |     result_arr = np.zeros((L.shape[0], L.shape[1] + 1))
221 |     result = (<np.double_t[:L.shape[0], :4:1]> (
222 |         <np.double_t *> result_arr.data))
223 |     N = L.shape[0] + 1
224 |     U = UnionFind(N)
225 | 
226 |     for index in range(L.shape[0]):
227 | 
228 |         a = <np.intp_t> L[index, 0]
229 |         b = <np.intp_t> L[index, 1]
230 |         delta = L[index, 2]
231 | 
232 |         aa, bb = U.fast_find(a), U.fast_find(b)
233 | 
234 |         result[index][0] = aa
235 |         result[index][1] = bb
236 |         result[index][2] = delta
237 |         result[index][3] = U.size[aa] + U.size[bb]
238 | 
239 |         U.union(aa, bb)
240 | 
241 |     return result_arr
242 | 
243 | 
244 | cpdef np.ndarray[np.double_t, ndim=2] single_linkage(distance_matrix):
245 | 
246 |     cdef np.ndarray[np.double_t, ndim=2] hierarchy
247 |     cdef np.ndarray[np.double_t, ndim=2] for_labelling
248 | 
249 |     hierarchy = mst_linkage_core(distance_matrix)
250 |     for_labelling = hierarchy[np.argsort(hierarchy.T[2]), :]
251 | 
252 |     return label(for_labelling)
253 | 


--------------------------------------------------------------------------------
/hdbscan/_hdbscan_reachability.pyx:
--------------------------------------------------------------------------------
  1 | # cython: boundscheck=False
  2 | # cython: nonecheck=False
  3 | # cython: initializedcheck=False
  4 | # mutual reachability distance compiutations
  5 | # Authors: Leland McInnes
  6 | # License: 3-clause BSD
  7 | 
  8 | import numpy as np
  9 | cimport numpy as np
 10 | 
 11 | from scipy.spatial.distance import pdist, squareform
 12 | from scipy.sparse import lil_matrix as sparse_matrix
 13 | from sklearn.neighbors import KDTree, BallTree
 14 | import gc
 15 | 
 16 | 
 17 | def mutual_reachability(distance_matrix, min_points=5, alpha=1.0):
 18 |     """Compute the weighted adjacency matrix of the mutual reachability
 19 |     graph of a distance matrix.
 20 | 
 21 |     Parameters
 22 |     ----------
 23 |     distance_matrix : ndarray, shape (n_samples, n_samples)
 24 |         Array of distances between samples.
 25 | 
 26 |     min_points : int, optional (default=5)
 27 |         The number of points in a neighbourhood for a point to be considered
 28 |         a core point.
 29 | 
 30 |     Returns
 31 |     -------
 32 |     mututal_reachability: ndarray, shape (n_samples, n_samples)
 33 |         Weighted adjacency matrix of the mutual reachability graph.
 34 | 
 35 |     References
 36 |     ----------
 37 |     .. [1] Campello, R. J., Moulavi, D., & Sander, J. (2013, April).
 38 |        Density-based clustering based on hierarchical density estimates.
 39 |        In Pacific-Asia Conference on Knowledge Discovery and Data Mining
 40 |        (pp. 160-172). Springer Berlin Heidelberg.
 41 |     """
 42 |     size = distance_matrix.shape[0]
 43 |     min_points = min(size - 1, min_points)
 44 |     try:
 45 |         core_distances = np.partition(distance_matrix,
 46 |                                       min_points,
 47 |                                       axis=0)[min_points]
 48 |     except AttributeError:
 49 |         core_distances = np.sort(distance_matrix,
 50 |                                  axis=0)[min_points]
 51 | 
 52 |     if alpha != 1.0:
 53 |         distance_matrix = distance_matrix / alpha
 54 | 
 55 |     stage1 = np.where(core_distances > distance_matrix,
 56 |                       core_distances, distance_matrix)
 57 |     result = np.where(core_distances > stage1.T,
 58 |                       core_distances.T, stage1.T).T
 59 |     return result
 60 | 
 61 | 
 62 | cpdef sparse_mutual_reachability(object lil_matrix, np.intp_t min_points=5,
 63 |                                  float alpha=1.0, float max_dist=0.):
 64 | 
 65 |     cdef np.intp_t i
 66 |     cdef np.intp_t j
 67 |     cdef np.intp_t n
 68 |     cdef np.double_t mr_dist
 69 |     cdef list sorted_row_data
 70 |     cdef np.ndarray[dtype=np.double_t, ndim=1] core_distance
 71 |     cdef np.ndarray[dtype=np.int32_t, ndim=1] nz_row_data
 72 |     cdef np.ndarray[dtype=np.int32_t, ndim=1] nz_col_data
 73 | 
 74 |     result = sparse_matrix(lil_matrix.shape)
 75 |     core_distance = np.empty(lil_matrix.shape[0], dtype=np.double)
 76 | 
 77 |     for i in range(lil_matrix.shape[0]):
 78 |         sorted_row_data = sorted(lil_matrix.data[i])
 79 |         if min_points - 1 < len(sorted_row_data):
 80 |             core_distance[i] = sorted_row_data[min_points - 1]
 81 |         else:
 82 |             core_distance[i] = np.inf
 83 | 
 84 |     if alpha != 1.0:
 85 |         lil_matrix = lil_matrix / alpha
 86 | 
 87 |     nz_row_data, nz_col_data = lil_matrix.nonzero()
 88 | 
 89 |     for n in range(nz_row_data.shape[0]):
 90 |         i = nz_row_data[n]
 91 |         j = nz_col_data[n]
 92 | 
 93 |         mr_dist = max(core_distance[i], core_distance[j], lil_matrix[i, j])
 94 |         if np.isfinite(mr_dist):
 95 |             result[i, j] = mr_dist
 96 |         elif max_dist > 0:
 97 |             result[i, j] = max_dist
 98 | 
 99 |     return result.tocsr()
100 | 
101 | 
102 | def kdtree_mutual_reachability(X, distance_matrix, metric, p=2, min_points=5,
103 |                                alpha=1.0, **kwargs):
104 |     dim = distance_matrix.shape[0]
105 |     min_points = min(dim - 1, min_points)
106 | 
107 |     if metric == 'minkowski':
108 |         tree = KDTree(X, metric=metric, p=p)
109 |     else:
110 |         tree = KDTree(X, metric=metric, **kwargs)
111 | 
112 |     core_distances = tree.query(X, k=min_points)[0][:, -1]
113 | 
114 |     if alpha != 1.0:
115 |         distance_matrix = distance_matrix / alpha
116 | 
117 |     stage1 = np.where(core_distances > distance_matrix,
118 |                       core_distances, distance_matrix)
119 |     result = np.where(core_distances > stage1.T,
120 |                       core_distances.T, stage1.T).T
121 |     return result
122 | 
123 | 
124 | def balltree_mutual_reachability(X, distance_matrix, metric, p=2, min_points=5,
125 |                                  alpha=1.0, **kwargs):
126 |     dim = distance_matrix.shape[0]
127 |     min_points = min(dim - 1, min_points)
128 | 
129 |     tree = BallTree(X, metric=metric, **kwargs)
130 | 
131 |     core_distances = tree.query(X, k=min_points)[0][:, -1]
132 | 
133 |     if alpha != 1.0:
134 |         distance_matrix = distance_matrix / alpha
135 | 
136 |     stage1 = np.where(core_distances > distance_matrix,
137 |                       core_distances, distance_matrix)
138 |     result = np.where(core_distances > stage1.T,
139 |                       core_distances.T, stage1.T).T
140 |     return result
141 | 
142 | 
143 | cdef np.ndarray[np.double_t, ndim=1] mutual_reachability_from_pdist(
144 |         np.ndarray[np.double_t, ndim=1] core_distances,
145 |         np.ndarray[np.double_t, ndim=1] dists, np.intp_t dim):
146 | 
147 |     cdef np.intp_t i
148 |     cdef np.intp_t j
149 |     cdef np.intp_t result_pos
150 | 
151 |     result_pos = 0
152 |     for i in range(dim):
153 |         for j in range(i + 1, dim):
154 |             if core_distances[i] > core_distances[j]:
155 |                 if core_distances[i] > dists[result_pos]:
156 |                     dists[result_pos] = core_distances[i]
157 | 
158 |             else:
159 |                 if core_distances[j] > dists[result_pos]:
160 |                     dists[result_pos] = core_distances[j]
161 | 
162 |             result_pos += 1
163 | 
164 |     return dists
165 | 
166 | 
167 | def kdtree_pdist_mutual_reachability(X,  metric, p=2, min_points=5, alpha=1.0,
168 |                                      **kwargs):
169 | 
170 |     dim = X.shape[0]
171 |     min_points = min(dim - 1, min_points)
172 | 
173 |     if metric == 'minkowski':
174 |         tree = KDTree(X, metric=metric, p=p)
175 |     else:
176 |         tree = KDTree(X, metric=metric, **kwargs)
177 | 
178 |     core_distances = tree.query(X, k=min_points)[0][:, -1]
179 | 
180 |     del tree
181 |     gc.collect()
182 | 
183 |     dists = pdist(X, metric=metric, p=p, **kwargs)
184 | 
185 |     if alpha != 1.0:
186 |         dists /= alpha
187 | 
188 |     dists = mutual_reachability_from_pdist(core_distances, dists, dim)
189 | 
190 |     return dists
191 | 
192 | 
193 | def balltree_pdist_mutual_reachability(X, metric, p=2, min_points=5, alpha=1.0,
194 |                                        **kwargs):
195 | 
196 |     dim = X.shape[0]
197 |     min_points = min(dim - 1, min_points)
198 | 
199 |     tree = BallTree(X, metric=metric, **kwargs)
200 | 
201 |     core_distances = tree.query(X, k=min_points)[0][:, -1]
202 | 
203 |     del tree
204 |     gc.collect()
205 | 
206 |     dists = pdist(X, metric=metric, p=p, **kwargs)
207 | 
208 |     if alpha != 1.0:
209 |         dists /= alpha
210 | 
211 |     dists = mutual_reachability_from_pdist(core_distances, dists, dim)
212 | 
213 |     return dists
214 | 


--------------------------------------------------------------------------------
/hdbscan/branch_data.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | from sklearn.neighbors import KDTree, BallTree
  3 | from .dist_metrics import DistanceMetric
  4 | 
  5 | 
  6 | class BranchDetectionData(object):
  7 |     """Input data for branch detection functionality.
  8 | 
  9 |     Recreates and caches internal data structures from the clustering stage.
 10 | 
 11 |     Parameters
 12 |     ----------
 13 | 
 14 |     data : array (n_samples, n_features)
 15 |         The original data set that was clustered.
 16 | 
 17 |     labels : array (n_samples)
 18 |         The cluster labels for every point in the data set.
 19 | 
 20 |     condensed_tree : array (n_points + n_merges, 4)
 21 |         The condensed tree produced during clustering, used to extract outliers.
 22 | 
 23 |     min_samples : int
 24 |         The min_samples value used in clustering.
 25 | 
 26 |     tree_type : string, optional
 27 |         Which type of space tree to use for core distance computation.
 28 |         One of:
 29 |             * ``kdtree``
 30 |             * ``balltree``
 31 | 
 32 |     metric : string, optional
 33 |         The metric used to determine distance for the clustering.
 34 |         This is the metric that will be used for the space tree to determine
 35 |         core distances etc.
 36 | 
 37 |     **kwargs :
 38 |         Any further arguments to the metric.
 39 | 
 40 |     Attributes
 41 |     ----------
 42 | 
 43 |     all_finite : bool
 44 |         Whether the data set contains any infinite or NaN values.
 45 | 
 46 |     finite_index : array (n_samples)
 47 |         The indices of the finite data points in the original data set.
 48 | 
 49 |     internal_to_raw : dict
 50 |         A mapping from the finite data set indices to the original data set.
 51 | 
 52 |     tree : KDTree or BallTree
 53 |         A space partitioning tree that can be queried for nearest neighbors if
 54 |         the metric is supported by a KDTree or BallTree.
 55 | 
 56 |     neighbors : array (n_samples, min_samples)
 57 |         The nearest neighbor for every non-noise point in the original data set.
 58 | 
 59 |     core_distances : array (n_samples)
 60 |         The core distance for every non-noise point in the original data set.
 61 | 
 62 |     dist_metric : callable
 63 |         Accelerated distance metric function.
 64 |     """
 65 | 
 66 |     _tree_type_map = {"kdtree": KDTree, "balltree": BallTree}
 67 | 
 68 |     def __init__(
 69 |         self,
 70 |         data,
 71 |         labels,
 72 |         condensed_tree,
 73 |         min_samples,
 74 |         tree_type="kdtree",
 75 |         metric="euclidean",
 76 |         **kwargs,
 77 |     ):
 78 |         clean_data = data.astype(np.float64)
 79 |         last_outlier = np.searchsorted(condensed_tree["lambda_val"], 0.0, side="right")
 80 |         if last_outlier == 0:
 81 |             self.all_finite = True
 82 |             self.internal_to_raw = None
 83 |             self.finite_index = None
 84 |         else:
 85 |             self.all_finite = False
 86 |             self.finite_index = np.setdiff1d(
 87 |                 np.arange(data.shape[0]),
 88 |                 condensed_tree["child"][:last_outlier]
 89 |             )
 90 |             labels = labels[self.finite_index]
 91 |             clean_data = clean_data[self.finite_index]
 92 |             self.internal_to_raw = {
 93 |                 x: y for x, y in enumerate(self.finite_index)
 94 |             }
 95 | 
 96 |         # Construct tree
 97 |         self.tree = self._tree_type_map[tree_type](clean_data, metric=metric, **kwargs)
 98 |         self.dist_metric = DistanceMetric.get_metric(metric, **kwargs)
 99 | 
100 |         # Allocate to maintain data point indices
101 |         self.core_distances = np.full(clean_data.shape[0], np.nan)
102 |         self.neighbors = np.full((clean_data.shape[0], min_samples), -1, dtype=np.int64)
103 | 
104 |         # Find neighbors for non-noise points
105 |         noise_mask = labels != -1
106 |         if noise_mask.any():
107 |             distances, self.neighbors[noise_mask, :] = self.tree.query(
108 |                 clean_data[noise_mask], k=min_samples
109 |             )
110 |             self.core_distances[noise_mask] = distances[:, -1]
111 | 
112 | 


--------------------------------------------------------------------------------
/hdbscan/dist_metrics.pxd:
--------------------------------------------------------------------------------
 1 | #!python
 2 | #cython: boundscheck=False
 3 | #cython: wraparound=False
 4 | #cython: cdivision=True
 5 | 
 6 | import cython
 7 | cimport cython
 8 | 
 9 | import numpy as np
10 | cimport numpy as np
11 | 
12 | from libc.math cimport fabs, sqrt, exp, cos, pow
13 | 
14 | ctypedef np.double_t DTYPE_t
15 | ctypedef np.intp_t ITYPE_t
16 | 
17 | cdef enum:
18 |     DTYPECODE = np.NPY_FLOAT64
19 |     ITYPECODE = np.NPY_INTP
20 | 
21 | # Fused type for certain operations
22 | ctypedef fused DITYPE_t:
23 |     ITYPE_t
24 |     DTYPE_t
25 | 
26 | ITYPE = np.intp
27 | 
28 | DTYPE = np.double
29 | 
30 | ######################################################################
31 | # Inline distance functions
32 | #
33 | #  We use these for the default (euclidean) case so that they can be
34 | #  inlined.  This leads to faster computation for the most common case
35 | cdef inline DTYPE_t euclidean_dist(DTYPE_t* x1, DTYPE_t* x2,
36 |                                    ITYPE_t size) nogil except -1:
37 |     cdef DTYPE_t tmp, d=0
38 |     cdef np.intp_t j
39 |     for j in range(size):
40 |         tmp = x1[j] - x2[j]
41 |         d += tmp * tmp
42 |     return sqrt(d)
43 | 
44 | 
45 | cdef inline DTYPE_t euclidean_rdist(DTYPE_t* x1, DTYPE_t* x2,
46 |                                     ITYPE_t size) nogil except -1:
47 |     cdef DTYPE_t tmp, d=0
48 |     cdef np.intp_t j
49 |     for j in range(size):
50 |         tmp = x1[j] - x2[j]
51 |         d += tmp * tmp
52 |     return d
53 | 
54 | 
55 | cdef inline DTYPE_t euclidean_dist_to_rdist(DTYPE_t dist) nogil except -1:
56 |     return dist * dist
57 | 
58 | 
59 | cdef inline DTYPE_t euclidean_rdist_to_dist(DTYPE_t dist) except -1:
60 |     return sqrt(dist)
61 | 
62 | 
63 | ######################################################################
64 | # DistanceMetric base class
65 | cdef class DistanceMetric:
66 |     # The following attributes are required for a few of the subclasses.
67 |     # we must define them here so that cython's limited polymorphism will work.
68 |     # Because we don't expect to instantiate a lot of these objects, the
69 |     # extra memory overhead of this setup should not be an issue.
70 |     cdef DTYPE_t p
71 |     #cdef DTYPE_t[::1] vec
72 |     #cdef DTYPE_t[:, ::1] mat
73 |     cdef np.ndarray vec
74 |     cdef np.ndarray mat
75 |     cdef DTYPE_t* vec_ptr
76 |     cdef DTYPE_t* mat_ptr
77 |     cdef ITYPE_t size
78 |     cdef object func
79 |     cdef object kwargs
80 | 
81 |     cdef DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2,
82 |                       ITYPE_t size) nogil except -1
83 | 
84 |     cdef DTYPE_t rdist(self, DTYPE_t* x1, DTYPE_t* x2,
85 |                        ITYPE_t size) nogil except -1
86 | 
87 |     cdef int pdist(self, DTYPE_t[:, ::1] X, DTYPE_t[:, ::1] D) except -1
88 | 
89 |     cdef int cdist(self, DTYPE_t[:, ::1] X, DTYPE_t[:, ::1] Y,
90 |                    DTYPE_t[:, ::1] D) except -1
91 | 
92 |     cdef DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) except -1
93 | 
94 |     cdef DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1
95 | 


--------------------------------------------------------------------------------
/hdbscan/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/hdbscan/tests/__init__.py


--------------------------------------------------------------------------------
/hdbscan/tests/test_prediction_utils.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | 
 3 | from hdbscan._prediction_utils import safe_always_positive_division
 4 | 
 5 | 
 6 | @pytest.mark.parametrize('denominator', [-1, 0, 1])
 7 | def test_safe_always_positive_division(denominator):
 8 |     numerator = 1
 9 |     # Given negative, zero and positive denominator and positive numerator
10 |     value = safe_always_positive_division(numerator, 0)
11 |     # Make sure safe division is always positive and doesn't raise ZeroDivision error
12 |     assert value >= 0
13 | 


--------------------------------------------------------------------------------
/hdbscan/tests/test_rsl.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Tests for Robust Single Linkage clustering algorithm
  3 | """
  4 | # import pickle
  5 | import numpy as np
  6 | from scipy.spatial import distance
  7 | from scipy import sparse
  8 | from sklearn.utils.estimator_checks import check_estimator
  9 | from hdbscan import RobustSingleLinkage, robust_single_linkage
 10 | 
 11 | # from sklearn.cluster.tests.common import generate_clustered_data
 12 | 
 13 | from sklearn import datasets
 14 | import warnings
 15 | 
 16 | from sklearn.datasets import make_blobs
 17 | from sklearn.utils import shuffle
 18 | from sklearn.preprocessing import StandardScaler
 19 | 
 20 | import pytest
 21 | 
 22 | n_clusters = 3
 23 | X, y = make_blobs(n_samples=50, random_state=1)
 24 | X, y = shuffle(X, y, random_state=7)
 25 | X = StandardScaler().fit_transform(X)
 26 | # X = generate_clustered_data(n_clusters=n_clusters, n_samples_per_cluster=50)
 27 | 
 28 | def test_rsl_distance_matrix():
 29 |     D = distance.squareform(distance.pdist(X))
 30 |     D /= np.max(D)
 31 | 
 32 |     labels, tree = robust_single_linkage(D, 0.4, metric='precomputed')
 33 |     # number of clusters, ignoring noise if present
 34 |     n_clusters_1 = len(set(labels)) - int(-1 in labels)  # ignore noise
 35 |     assert(n_clusters_1 == 2)
 36 | 
 37 |     labels = RobustSingleLinkage(metric="precomputed").fit(D).labels_
 38 |     n_clusters_2 = len(set(labels)) - int(-1 in labels)
 39 |     assert(n_clusters_2 == 2)
 40 | 
 41 | 
 42 | def test_rsl_feature_vector():
 43 |     labels, tree = robust_single_linkage(X, 0.4)
 44 |     n_clusters_1 = len(set(labels)) - int(-1 in labels)
 45 |     assert(n_clusters_1 == n_clusters)
 46 | 
 47 |     labels = RobustSingleLinkage().fit(X).labels_
 48 |     n_clusters_2 = len(set(labels)) - int(-1 in labels)
 49 |     assert(n_clusters_2 == n_clusters)
 50 | 
 51 | 
 52 | def test_rsl_callable_metric():
 53 |     # metric is the function reference, not the string key.
 54 |     metric = distance.euclidean
 55 | 
 56 |     labels, tree = robust_single_linkage(X, 0.4, metric=metric)
 57 |     n_clusters_1 = len(set(labels)) - int(-1 in labels)
 58 |     assert(n_clusters_1 == n_clusters)
 59 | 
 60 |     labels = RobustSingleLinkage(metric=metric).fit(X).labels_
 61 |     n_clusters_2 = len(set(labels)) - int(-1 in labels)
 62 |     assert(n_clusters_2 == n_clusters)
 63 | 
 64 | 
 65 | def test_rsl_input_lists():
 66 |     X = [[1., 2.], [3., 4.]]
 67 |     RobustSingleLinkage().fit(X)  # must not raise exception
 68 | 
 69 | 
 70 | def test_rsl_boruvka_balltree():
 71 |     labels, tree = robust_single_linkage(X, 0.45, algorithm='boruvka_balltree')
 72 |     n_clusters_1 = len(set(labels)) - int(-1 in labels)
 73 |     assert(n_clusters_1 == n_clusters)
 74 | 
 75 |     labels = RobustSingleLinkage(cut=0.45,
 76 |                                  algorithm='boruvka_balltree').fit(X).labels_
 77 |     n_clusters_2 = len(set(labels)) - int(-1 in labels)
 78 |     assert(n_clusters_2 == n_clusters)
 79 | 
 80 | 
 81 | def test_rsl_prims_balltree():
 82 |     labels, tree = robust_single_linkage(X, 0.4, algorithm='prims_balltree')
 83 |     n_clusters_1 = len(set(labels)) - int(-1 in labels)
 84 |     assert(n_clusters_1 == n_clusters)
 85 | 
 86 |     labels = RobustSingleLinkage(algorithm='prims_balltree').fit(X).labels_
 87 |     n_clusters_2 = len(set(labels)) - int(-1 in labels)
 88 |     assert(n_clusters_2 == n_clusters)
 89 | 
 90 | 
 91 | def test_rsl_prims_kdtree():
 92 |     labels, tree = robust_single_linkage(X, 0.4, algorithm='prims_kdtree')
 93 |     n_clusters_1 = len(set(labels)) - int(-1 in labels)
 94 |     assert(n_clusters_1 == n_clusters)
 95 | 
 96 |     labels = RobustSingleLinkage(algorithm='prims_kdtree').fit(X).labels_
 97 |     n_clusters_2 = len(set(labels)) - int(-1 in labels)
 98 |     assert(n_clusters_2 == n_clusters)
 99 | 
100 | 
101 | # def test_rsl_unavailable_hierarchy():
102 | #     clusterer = RobustSingleLinkage()
103 | #     with warnings.catch_warnings(record=True) as w:
104 | #         tree = clusterer.cluster_hierarchy_
105 | #         assert len(w) > 0
106 | #         assert tree is None
107 | 
108 | 
109 | def test_rsl_hierarchy():
110 |     clusterer = RobustSingleLinkage().fit(X)
111 |     assert clusterer.cluster_hierarchy_ is not None
112 | 
113 | 
114 | def test_rsl_high_dimensional():
115 |     H, y = make_blobs(n_samples=50, random_state=0, n_features=64)
116 |     # H, y = shuffle(X, y, random_state=7)
117 |     H = StandardScaler().fit_transform(H)
118 |     labels, tree = robust_single_linkage(H, 5.5)
119 |     n_clusters_1 = len(set(labels)) - int(-1 in labels)
120 |     assert(n_clusters_1 == n_clusters)
121 | 
122 |     labels = RobustSingleLinkage(cut=5.5, algorithm='best',
123 |                                  metric='seuclidean',
124 |                                  metric_params={'V': np.ones(H.shape[1])}).fit(H).labels_
125 |     n_clusters_2 = len(set(labels)) - int(-1 in labels)
126 |     assert(n_clusters_2 == n_clusters)
127 | 
128 | 
129 | def test_rsl_badargs():
130 |     with pytest.raises(ValueError):
131 |         robust_single_linkage('fail', 0.4)
132 |     with pytest.raises(ValueError):
133 |         robust_single_linkage(None, 0.4)
134 |     with pytest.raises(ValueError):
135 |         robust_single_linkage(X, 0.4, k='fail')
136 |     with pytest.raises(ValueError):
137 |         robust_single_linkage(X, 0.4, k=-1)
138 |     with pytest.raises(ValueError):
139 |         robust_single_linkage(X, 0.4, metric='imperial')
140 |     with pytest.raises(ValueError):
141 |         robust_single_linkage(X, 0.4, metric=None)
142 |     with pytest.raises(ValueError):
143 |         robust_single_linkage(X, 0.4, metric='minkowski', p=-1)
144 |     with pytest.raises(ValueError):
145 |         robust_single_linkage(X, 0.4, metric='minkowski', p=-1, algorithm='prims_kdtree')
146 |     with pytest.raises(ValueError):
147 |         robust_single_linkage(X, 0.4, metric='minkowski', p=-1, algorithm='prims_balltree')
148 |     with pytest.raises(ValueError):
149 |         robust_single_linkage(X, 0.4, metric='minkowski', p=-1, algorithm='boruvka_balltree')
150 |     with pytest.raises(ValueError):
151 |         robust_single_linkage(X, 0.4, metric='precomputed', algorithm='boruvka_kdtree')
152 |     with pytest.raises(ValueError):
153 |         robust_single_linkage(X, 0.4, metric='precomputed', algorithm='prims_kdtree')
154 |     with pytest.raises(ValueError):
155 |         robust_single_linkage(X, 0.4, metric='precomputed', algorithm='prims_balltree')
156 |     with pytest.raises(ValueError):
157 |         robust_single_linkage(X, 0.4, metric='precomputed', algorithm='boruvka_balltree')
158 |     with pytest.raises(ValueError):
159 |         robust_single_linkage(X, 0.4, alpha=-1)
160 |     with pytest.raises(ValueError):
161 |         robust_single_linkage(X, 0.4, alpha='fail')
162 |     with pytest.raises(Exception):
163 |         robust_single_linkage(X, 0.4, algorithm='something_else')
164 |     with pytest.raises(TypeError):
165 |         robust_single_linkage(X, 0.4, metric='minkowski', p=None)
166 |     with pytest.raises(ValueError):
167 |         robust_single_linkage(X, 0.4, leaf_size=0)
168 |     with pytest.raises(ValueError):
169 |         robust_single_linkage(X, 0.4, gamma=0)
170 | 
171 | 
172 | # Disable for now -- need to refactor to meet newer standards
173 | @pytest.mark.skip(reason="need to refactor to meet newer standards")
174 | def test_rsl_is_sklearn_estimator():
175 |     check_estimator(RobustSingleLinkage)
176 | 


--------------------------------------------------------------------------------
/notebooks/clusterable_data.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/notebooks/clusterable_data.npy


--------------------------------------------------------------------------------
/notebooks/hdbscan01_timings.csv:
--------------------------------------------------------------------------------
 1 | 2,2000,0.203334093094
 2 | 2,4000,0.259212017059
 3 | 2,6000,0.530183076859
 4 | 2,8000,0.928155183792
 5 | 2,10000,1.33956003189
 6 | 2,12000,2.02227687836
 7 | 2,14000,2.74701404572
 8 | 2,16000,3.63934803009
 9 | 2,18000,4.60029006004
10 | 2,20000,6.09813690186
11 | 2,22000,10.7129349709
12 | 2,24000,9.08472108841
13 | 2,26000,15.8526310921
14 | 2,28000,19.4355289936
15 | 2,30000,24.5378270149
16 | 2,32000,30.3819289207
17 | 5,2000,0.21369099617
18 | 5,4000,0.255190849304
19 | 5,6000,0.527250051498
20 | 5,8000,0.93247294426
21 | 5,10000,1.47298002243
22 | 5,12000,2.07997608185
23 | 5,14000,2.84801912308
24 | 5,16000,3.78576898575
25 | 5,18000,4.60007095337
26 | 5,20000,5.82796311378
27 | 5,22000,7.35501813889
28 | 5,24000,8.69181203842
29 | 5,26000,10.3049359322
30 | 5,28000,12.5369310379
31 | 5,30000,28.7729370594
32 | 5,32000,29.6381349564
33 | 10,2000,0.174388170242
34 | 10,4000,0.296141147614
35 | 10,6000,0.662806987762
36 | 10,8000,1.17675209045
37 | 10,10000,1.79025316238
38 | 10,12000,2.48112487793
39 | 10,14000,3.44052696228
40 | 10,16000,4.44019889832
41 | 10,18000,5.61963176727
42 | 10,20000,7.39718699455
43 | 10,22000,8.64890098572
44 | 10,24000,10.4458150864
45 | 10,26000,12.8114190102
46 | 10,28000,20.3707690239
47 | 10,30000,29.7545838356
48 | 10,32000,34.2230820656
49 | 25,2000,0.198121070862
50 | 25,4000,0.452563047409
51 | 25,6000,0.94957280159
52 | 25,8000,1.62946105003
53 | 25,10000,2.49307203293
54 | 25,12000,3.63441205025
55 | 25,14000,4.78342199326
56 | 25,16000,6.30564498901
57 | 25,18000,8.03539299965
58 | 25,20000,10.3152740002
59 | 25,22000,12.7070331573
60 | 25,24000,15.693295002
61 | 25,26000,18.6774010658
62 | 25,28000,28.0319800377
63 | 25,30000,35.5377750397
64 | 25,32000,43.5508480072
65 | 50,2000,0.241183042526
66 | 50,4000,0.691927909851
67 | 50,6000,1.46878409386
68 | 50,8000,2.71946191788
69 | 50,10000,3.89164805412
70 | 50,12000,5.76127791405
71 | 50,14000,8.03004384041
72 | 50,16000,10.2894189358
73 | 50,18000,13.2365300655
74 | 50,20000,16.5973930359
75 | 50,22000,19.8884520531
76 | 50,24000,23.8139870167
77 | 50,26000,28.6661889553
78 | 50,28000,38.4153680801
79 | 50,30000,49.254393816
80 | 50,32000,58.0542850494
81 | 


--------------------------------------------------------------------------------
/notebooks/hdbscan02_timings.csv:
--------------------------------------------------------------------------------
 1 | 2,2000,0.190771818161
 2 | 2,4000,0.33536696434
 3 | 2,6000,0.475166797638
 4 | 2,8000,0.830126047134
 5 | 2,10000,1.21801495552
 6 | 2,12000,1.66791892052
 7 | 2,14000,2.25732898712
 8 | 2,16000,2.97524309158
 9 | 2,18000,3.75251483917
10 | 2,20000,4.78878498077
11 | 2,22000,5.71841812134
12 | 2,24000,6.86345005035
13 | 2,26000,8.4248509407
14 | 2,28000,10.5936911106
15 | 2,30000,12.250483036
16 | 2,32000,14.2500619888
17 | 5,2000,0.165930986404
18 | 5,4000,0.25049495697
19 | 5,6000,0.505705833435
20 | 5,8000,0.85303401947
21 | 5,10000,1.30479001999
22 | 5,12000,1.78360509872
23 | 5,14000,2.37719798088
24 | 5,16000,3.19220519066
25 | 5,18000,4.0063521862
26 | 5,20000,5.10847592354
27 | 5,22000,6.15350604057
28 | 5,24000,7.8016500473
29 | 5,26000,9.36254882812
30 | 5,28000,10.940628767
31 | 5,30000,13.0416350365
32 | 5,32000,15.0905759335
33 | 10,2000,0.171450138092
34 | 10,4000,0.306551933289
35 | 10,6000,0.609230041504
36 | 10,8000,1.01101207733
37 | 10,10000,1.56092309952
38 | 10,12000,2.25636100769
39 | 10,14000,3.02007102966
40 | 10,16000,3.85052204132
41 | 10,18000,4.90771794319
42 | 10,20000,6.28313612938
43 | 10,22000,7.84088993073
44 | 10,24000,9.35490894318
45 | 10,26000,11.2061488628
46 | 10,28000,13.258589983
47 | 10,30000,15.8290801048
48 | 10,32000,18.140255928
49 | 25,2000,0.187772035599
50 | 25,4000,0.422642946243
51 | 25,6000,0.917279958725
52 | 25,8000,1.49317598343
53 | 25,10000,2.3160700798
54 | 25,12000,3.33820199966
55 | 25,14000,4.4094080925
56 | 25,16000,5.88487386703
57 | 25,18000,7.52313017845
58 | 25,20000,9.37871217728
59 | 25,22000,11.7811200619
60 | 25,24000,14.447204113
61 | 25,26000,17.3661310673
62 | 25,28000,20.1399390697
63 | 25,30000,24.2563328743
64 | 25,32000,28.605463028
65 | 50,2000,0.230389118195
66 | 50,4000,0.681818008423
67 | 50,6000,1.39964485168
68 | 50,8000,2.48313784599
69 | 50,10000,3.77135896683
70 | 50,12000,5.48401618004
71 | 50,14000,7.19847917557
72 | 50,16000,9.64172506332
73 | 50,18000,12.4206252098
74 | 50,20000,15.4045789242
75 | 50,22000,18.8578879833
76 | 50,24000,22.6411821842
77 | 50,26000,26.6900000572
78 | 50,28000,31.2701971531
79 | 50,30000,36.5198609829
80 | 50,32000,41.7656099796
81 | 


--------------------------------------------------------------------------------
/notebooks/hdbscan03_timings.csv:
--------------------------------------------------------------------------------
 1 | 2,4000,0.254909992218
 2 | 2,8000,0.781009912491
 3 | 2,12000,1.65578794479
 4 | 2,16000,2.86548995972
 5 | 2,20000,4.5723490715
 6 | 2,24000,7.35976219177
 7 | 2,28000,10.392701149
 8 | 2,32000,9.43943691254
 9 | 2,36000,11.3052511215
10 | 2,40000,13.9955811501
11 | 2,44000,18.7241039276
12 | 2,48000,20.6580238342
13 | 2,52000,24.4679880142
14 | 2,56000,29.1394848824
15 | 2,60000,34.244658947
16 | 2,64000,39.4027280807
17 | 5,4000,0.25834608078
18 | 5,8000,0.854709863663
19 | 5,12000,1.76500201225
20 | 5,16000,3.11302685738
21 | 5,20000,5.05285406113
22 | 5,24000,7.59221887589
23 | 5,28000,11.0022101402
24 | 5,32000,11.0250749588
25 | 5,36000,14.1674640179
26 | 5,40000,17.6738820076
27 | 5,44000,22.3881859779
28 | 5,48000,26.0163779259
29 | 5,52000,30.8282210827
30 | 5,56000,35.8936729431
31 | 5,60000,41.7060689926
32 | 5,64000,48.1323189735
33 | 10,4000,0.300674915314
34 | 10,8000,1.02144503593
35 | 10,12000,2.25444197655
36 | 10,16000,3.87991809845
37 | 10,20000,6.13427686691
38 | 10,24000,9.54126405716
39 | 10,28000,13.4590039253
40 | 10,32000,17.133865118
41 | 10,36000,21.9930670261
42 | 10,40000,27.4153258801
43 | 10,44000,33.9543378353
44 | 10,48000,40.5958509445
45 | 10,52000,47.9032700062
46 | 10,56000,57.3020319939
47 | 10,60000,65.7409169674
48 | 10,64000,74.7461779118
49 | 25,4000,0.429993152618
50 | 25,8000,1.53049278259
51 | 25,12000,3.27671718597
52 | 25,16000,5.81940603256
53 | 25,20000,9.31306195259
54 | 25,24000,14.3008999825
55 | 25,28000,20.7219820023
56 | 25,32000,35.4473462105
57 | 25,36000,44.8741598129
58 | 25,40000,55.1005539894
59 | 25,44000,66.9944300652
60 | 25,48000,78.9403419495
61 | 25,52000,92.4163110256
62 | 25,56000,107.29060483
63 | 25,60000,124.042211056
64 | 25,64000,139.81782198
65 | 50,4000,0.689707040787
66 | 50,8000,2.43957304955
67 | 50,12000,5.3949701786
68 | 50,16000,9.77388811111
69 | 50,20000,15.3528060913
70 | 50,24000,22.688354969
71 | 50,28000,31.6130321026
72 | 50,32000,60.4746580124
73 | 50,36000,76.1894528866
74 | 50,40000,93.2929999828
75 | 50,44000,111.741698027
76 | 50,48000,132.439800024
77 | 50,52000,153.971266031
78 | 50,56000,177.992291927
79 | 50,60000,204.601658106
80 | 50,64000,231.908761978
81 | 


--------------------------------------------------------------------------------
/notebooks/hdbscan04_timings.csv:
--------------------------------------------------------------------------------
 1 | 2,8000,0.227055072784
 2 | 2,16000,0.532173156738
 3 | 2,24000,0.879513025284
 4 | 2,32000,1.24024891853
 5 | 2,40000,1.81793093681
 6 | 2,48000,2.22707700729
 7 | 2,56000,2.89961886406
 8 | 2,64000,3.2689011097
 9 | 2,72000,3.87070393562
10 | 2,80000,6.16474890709
11 | 2,88000,6.37934803963
12 | 2,96000,8.87552189827
13 | 2,104000,8.83126091957
14 | 2,112000,10.2158279419
15 | 2,120000,12.5876441002
16 | 2,128000,13.6096761227
17 | 5,8000,0.405529975891
18 | 5,16000,1.33872485161
19 | 5,24000,2.52023291588
20 | 5,32000,3.81210708618
21 | 5,40000,4.77973794937
22 | 5,48000,7.4870300293
23 | 5,56000,7.76650905609
24 | 5,64000,8.53143310547
25 | 5,72000,11.8250510693
26 | 5,80000,14.0402071476
27 | 5,88000,16.0629730225
28 | 5,96000,19.1256659031
29 | 5,104000,19.8361799717
30 | 5,112000,20.415594101
31 | 5,120000,21.5572421551
32 | 5,128000,24.9693388939
33 | 10,8000,0.523543119431
34 | 10,16000,1.62090706825
35 | 10,24000,3.66929006577
36 | 10,32000,5.36760091782
37 | 10,40000,7.74307012558
38 | 10,48000,13.7823400497
39 | 10,56000,15.9222350121
40 | 10,64000,19.0056459904
41 | 10,72000,22.3747861385
42 | 10,80000,31.0509710312
43 | 10,88000,49.9119548798
44 | 10,96000,47.1509799957
45 | 10,104000,58.6490371227
46 | 10,112000,72.9800539017
47 | 10,120000,68.7178759575
48 | 10,128000,60.2585930824
49 | 25,8000,0.886401891708
50 | 25,16000,2.55635499954
51 | 25,24000,10.2341220379
52 | 25,32000,10.0402569771
53 | 25,40000,16.4257571697
54 | 25,48000,23.4617791176
55 | 25,56000,32.1058709621
56 | 25,64000,35.5998060703
57 | 25,72000,51.0438849926
58 | 25,80000,53.5488469601
59 | 25,88000,74.6229739189
60 | 25,96000,87.4415640831
61 | 25,104000,103.67979002
62 | 25,112000,100.422867775
63 | 25,120000,117.445795059
64 | 25,128000,127.074856043
65 | 50,8000,2.15198493004
66 | 50,16000,6.01606011391
67 | 50,24000,15.0741400719
68 | 50,32000,24.8565030098
69 | 50,40000,32.738462925
70 | 50,48000,54.6907629967
71 | 50,56000,65.1226139069
72 | 50,64000,80.4430060387
73 | 50,72000,103.5877738
74 | 50,80000,120.219110966
75 | 50,88000,171.107203007
76 | 50,96000,201.432529926
77 | 50,104000,238.729315996
78 | 50,112000,258.13277483
79 | 50,120000,285.661708117
80 | 50,128000,316.628612041
81 | 


--------------------------------------------------------------------------------
/notebooks/hdbscan05_timings.csv:
--------------------------------------------------------------------------------
 1 | 2,8000,0.201974868774
 2 | 2,16000,0.382796049118
 3 | 2,24000,0.677625179291
 4 | 2,32000,0.857353925705
 5 | 2,40000,1.19192004204
 6 | 2,48000,1.65057206154
 7 | 2,56000,1.76224017143
 8 | 2,64000,2.09517502785
 9 | 2,72000,2.37437987328
10 | 2,80000,2.61393880844
11 | 2,88000,3.86622595787
12 | 2,96000,4.16805887222
13 | 2,104000,4.60610985756
14 | 2,112000,4.65505003929
15 | 2,120000,4.94053196907
16 | 2,128000,5.48205113411
17 | 5,8000,0.390153884888
18 | 5,16000,1.1207010746
19 | 5,24000,2.12859201431
20 | 5,32000,3.20195794106
21 | 5,40000,4.50784707069
22 | 5,48000,5.86051797867
23 | 5,56000,6.96505713463
24 | 5,64000,8.35725998878
25 | 5,72000,10.0785040855
26 | 5,80000,11.8928399086
27 | 5,88000,14.2854990959
28 | 5,96000,16.3619041443
29 | 5,104000,18.1008689404
30 | 5,112000,18.765378952
31 | 5,120000,20.262346983
32 | 5,128000,22.245456934
33 | 10,8000,0.362307071686
34 | 10,16000,1.10565090179
35 | 10,24000,2.1113088131
36 | 10,32000,3.8094599247
37 | 10,40000,5.60643601418
38 | 10,48000,8.05391407013
39 | 10,56000,12.0181820393
40 | 10,64000,14.4568071365
41 | 10,72000,17.575797081
42 | 10,80000,20.9547560215
43 | 10,88000,28.589566946
44 | 10,96000,31.5660579205
45 | 10,104000,35.0399270058
46 | 10,112000,46.7496728897
47 | 10,120000,51.5727710724
48 | 10,128000,56.6605160236
49 | 25,8000,0.503958940506
50 | 25,16000,1.15347003937
51 | 25,24000,2.52892589569
52 | 25,32000,3.7748811245
53 | 25,40000,5.54964900017
54 | 25,48000,7.7039680481
55 | 25,56000,10.2646648884
56 | 25,64000,12.3325390816
57 | 25,72000,14.4936189651
58 | 25,80000,17.8296489716
59 | 25,88000,24.9521570206
60 | 25,96000,27.6805050373
61 | 25,104000,31.0702199936
62 | 25,112000,38.4048509598
63 | 25,120000,41.4252431393
64 | 25,128000,45.7964301109
65 | 50,8000,1.46589207649
66 | 50,16000,2.91623210907
67 | 50,24000,4.17734980583
68 | 50,32000,6.72125601768
69 | 50,40000,9.49217200279
70 | 50,48000,11.0911870003
71 | 50,56000,13.4033820629
72 | 50,64000,16.9308049679
73 | 50,72000,20.2958710194
74 | 50,80000,27.0205729008
75 | 50,88000,31.7669379711
76 | 50,96000,37.2198050022
77 | 50,104000,39.0934021473
78 | 50,112000,45.5359759331
79 | 50,120000,49.7200181484
80 | 50,128000,54.0523099899
81 | 


--------------------------------------------------------------------------------
/notebooks/hdbscan06_timings.csv:
--------------------------------------------------------------------------------
 1 | 2,8000,0.175021886826
 2 | 2,16000,0.387292146683
 3 | 2,24000,0.677018880844
 4 | 2,32000,0.934924125671
 5 | 2,40000,1.17343378067
 6 | 2,48000,1.38080406189
 7 | 2,56000,1.60144400597
 8 | 2,64000,1.79244303703
 9 | 2,72000,2.1175339222
10 | 2,80000,2.43222498894
11 | 2,88000,2.75695896149
12 | 2,96000,3.10400700569
13 | 2,104000,3.41808009148
14 | 2,112000,3.49205112457
15 | 2,120000,3.87581586838
16 | 2,128000,4.19616699219
17 | 5,8000,0.372463941574
18 | 5,16000,1.05067205429
19 | 5,24000,1.93789100647
20 | 5,32000,2.74101495743
21 | 5,40000,3.80962181091
22 | 5,48000,4.98932695389
23 | 5,56000,5.92916297913
24 | 5,64000,7.09130311012
25 | 5,72000,8.22766804695
26 | 5,80000,9.74051809311
27 | 5,88000,11.0401978493
28 | 5,96000,12.6047639847
29 | 5,104000,14.0353701115
30 | 5,112000,14.6283960342
31 | 5,120000,16.2875649929
32 | 5,128000,17.4939930439
33 | 10,8000,0.349482059479
34 | 10,16000,1.09388589859
35 | 10,24000,1.87578415871
36 | 10,32000,3.21113491058
37 | 10,40000,4.35681700706
38 | 10,48000,6.19830203056
39 | 10,56000,9.55884099007
40 | 10,64000,11.4342520237
41 | 10,72000,13.2101860046
42 | 10,80000,16.1834290028
43 | 10,88000,20.0170080662
44 | 10,96000,22.5502281189
45 | 10,104000,24.9669640064
46 | 10,112000,35.226790905
47 | 10,120000,39.5434041023
48 | 10,128000,42.897605896
49 | 25,8000,0.444399118423
50 | 25,16000,1.209430933
51 | 25,24000,1.97230005264
52 | 25,32000,3.10147595406
53 | 25,40000,4.67809796333
54 | 25,48000,5.50237488747
55 | 25,56000,7.86162614822
56 | 25,64000,9.46203804016
57 | 25,72000,11.5571279526
58 | 25,80000,13.881565094
59 | 25,88000,16.1510570049
60 | 25,96000,18.3807759285
61 | 25,104000,20.2770631313
62 | 25,112000,25.9744091034
63 | 25,120000,28.6864550114
64 | 25,128000,31.9634900093
65 | 50,8000,1.42019295692
66 | 50,16000,2.98401212692
67 | 50,24000,3.57059788704
68 | 50,32000,5.97410511971
69 | 50,40000,7.985861063
70 | 50,48000,9.6884970665
71 | 50,56000,11.9059169292
72 | 50,64000,13.7416830063
73 | 50,72000,17.8067760468
74 | 50,80000,20.3124599457
75 | 50,88000,20.6006500721
76 | 50,96000,22.6325879097
77 | 50,104000,27.3392460346
78 | 50,112000,31.2804059982
79 | 50,120000,34.6195569038
80 | 50,128000,39.2653598785
81 | 


--------------------------------------------------------------------------------
/notebooks/reference_impl_external_timings.csv:
--------------------------------------------------------------------------------
 1 | 2,8000,3.59666895866
 2 | 2,16000,15.2572879791
 3 | 2,24000,31.3827497959
 4 | 2,32000,60.9953649044
 5 | 2,40000,111.264041901
 6 | 2,48000,80.2624919415
 7 | 2,56000,111.845596075
 8 | 2,64000,157.572174072
 9 | 2,72000,213.970286131
10 | 2,80000,291.316827059
11 | 2,88000,364.542631865
12 | 2,96000,330.40318799
13 | 2,104000,376.085955858
14 | 2,112000,437.023652077
15 | 2,120000,512.283486128
16 | 2,128000,639.647830963
17 | 5,8000,2.96017384529
18 | 5,16000,12.4860448837
19 | 5,24000,24.3062229156
20 | 5,32000,27.3480169773
21 | 5,40000,57.2987709045
22 | 5,48000,100.169524908
23 | 5,56000,79.1349971294
24 | 5,64000,124.066302061
25 | 5,72000,185.705877066
26 | 5,80000,266.771252155
27 | 5,88000,344.634408951
28 | 5,96000,437.551882982
29 | 5,104000,446.130121946
30 | 5,112000,365.777822018
31 | 5,120000,447.037277937
32 | 5,128000,591.354615211
33 | 10,8000,3.74887800217
34 | 10,16000,9.18430614471
35 | 10,24000,30.3249309063
36 | 10,32000,33.4931271076
37 | 10,40000,78.0882520676
38 | 10,48000,91.3173689842
39 | 10,56000,200.770553112
40 | 10,64000,158.011397839
41 | 10,72000,241.757611036
42 | 10,80000,323.283601046
43 | 10,88000,342.906905174
44 | 10,96000,354.992150068
45 | 10,104000,435.243753195
46 | 10,112000,547.999858856
47 | 10,120000,687.23850894
48 | 10,128000,572.590743065
49 | 25,8000,3.80018186569
50 | 25,16000,18.4901921749
51 | 25,24000,33.0604710579
52 | 25,32000,90.8991298676
53 | 25,40000,110.421215057
54 | 25,48000,153.691064119
55 | 25,56000,236.893220901
56 | 25,64000,371.323115826
57 | 25,72000,413.138042927
58 | 25,80000,580.538727999
59 | 25,88000,492.039662123
60 | 25,96000,665.976908922
61 | 25,104000,879.488523006
62 | 25,112000,946.649399996
63 | 25,120000,1354.74109793
64 | 25,128000,1628.48575211
65 | 50,8000,7.23535704613
66 | 50,16000,35.2021028996
67 | 50,24000,69.9486300945
68 | 50,32000,146.289216995
69 | 50,40000,234.030052185
70 | 50,48000,305.608191013
71 | 50,56000,423.300146103
72 | 50,64000,642.593301058
73 | 50,72000,703.198181152
74 | 50,80000,885.244357109
75 | 50,88000,1099.00257683
76 | 50,96000,1249.79146123
77 | 50,104000,1456.11673903
78 | 50,112000,1785.89922595
79 | 50,120000,2121.75022507
80 | 50,128000,2446.19570708
81 | 


--------------------------------------------------------------------------------
/notebooks/reference_impl_internal_timings.csv:
--------------------------------------------------------------------------------
 1 | ,,calculate MST,compute core distances,compute hierarchy and cluster tree,compute outlier scores,find flat result,runtime
 2 | 2,8000,624,622,1492,22,243,3060
 3 | 2,16000,3422,4744,5711,40,542,14514
 4 | 2,24000,8290,13080,9732,56,12,31246
 5 | 2,32000,18030,24890,16561,90,1192,60865
 6 | 2,40000,35571,50032,25340,71,20,111135
 7 | 2,48000,20298,15773,38696,118,5086,80123
 8 | 2,56000,30316,24342,50220,99,6582,111702
 9 | 2,64000,42993,41946,63860,102,8401,157433
10 | 2,72000,61349,62803,78890,136,10489,213827
11 | 2,80000,86100,87121,104851,154,12777,291163
12 | 2,88000,107276,119212,121918,182,15636,364407
13 | 2,96000,80565,88392,142386,122,18594,330248
14 | 2,104000,99502,85378,168434,171,22258,375928
15 | 2,112000,118551,90896,199844,226,27171,436855
16 | 2,120000,147139,121902,210340,158,32281,512020
17 | 2,128000,182831,158954,260799,228,36374,639384
18 | 5,8000,803,907,640,24,17,2468
19 | 5,16000,4025,6237,1609,37,19,12025
20 | 5,24000,5345,15126,3249,60,26,23952
21 | 5,32000,10404,10845,5548,77,29,27043
22 | 5,40000,22235,27360,7303,84,33,57168
23 | 5,48000,39939,49988,9814,93,39,100041
24 | 5,56000,33073,30384,15212,107,38,78986
25 | 5,64000,52653,53860,17047,127,46,123917
26 | 5,72000,77242,84245,23702,118,36,185555
27 | 5,80000,108770,125721,31750,140,34,266622
28 | 5,88000,140356,170121,33576,149,53,344486
29 | 5,96000,186953,203925,46083,185,40,437405
30 | 5,104000,146302,248246,50781,353,73,445989
31 | 5,112000,159718,150778,54602,175,53,365611
32 | 5,120000,194952,180324,71056,226,69,446893
33 | 5,128000,250133,255315,85176,241,46,591201
34 | 10,8000,1074,1763,373,28,14,3347
35 | 10,16000,3615,3858,1245,50,24,8914
36 | 10,24000,10598,17317,1938,68,30,30144
37 | 10,32000,13761,15707,3413,82,39,33204
38 | 10,40000,29709,39094,8769,101,57,77928
39 | 10,48000,34325,37811,18517,133,49,91048
40 | 10,56000,81323,108916,9915,127,50,200561
41 | 10,64000,64556,74037,18723,139,61,157770
42 | 10,72000,111252,114018,15777,151,72,241532
43 | 10,80000,136247,165805,20230,197,80,322847
44 | 10,88000,122443,204552,15006,203,80,342573
45 | 10,96000,160015,173657,20586,179,73,354798
46 | 10,104000,199701,215870,18923,202,74,435081
47 | 10,112000,244868,279603,22277,265,104,547449
48 | 10,120000,306516,355204,24496,265,115,686922
49 | 10,128000,271135,269269,31109,206,136,572215
50 | 25,8000,1195,1829,350,31,43,3585
51 | 25,16000,6410,10653,924,46,36,18263
52 | 25,24000,13252,17456,1826,63,63,32881
53 | 25,32000,34042,50735,5504,78,76,90680
54 | 25,40000,46438,58907,4457,109,82,110275
55 | 25,48000,64879,74894,13223,124,107,153555
56 | 25,56000,98906,128676,8398,123,138,236582
57 | 25,64000,147256,207237,16032,137,117,371163
58 | 25,72000,175076,226689,10241,156,157,412771
59 | 25,80000,232469,327623,19225,174,227,580180
60 | 25,88000,218678,253468,18846,176,224,491880
61 | 25,96000,287941,362339,14325,197,189,665513
62 | 25,104000,368983,488749,20733,163,135,879320
63 | 25,112000,407948,512364,25227,175,200,946495
64 | 25,120000,550593,778372,24603,205,176,1354579
65 | 25,128000,647824,945765,33639,224,174,1628338
66 | 50,8000,2313,4115,441,28,43,7119
67 | 50,16000,12828,20685,1163,50,88,35048
68 | 50,24000,27541,39661,1988,65,113,69679
69 | 50,32000,56303,85807,3371,73,159,146071
70 | 50,40000,90720,137414,4801,102,197,233684
71 | 50,48000,122137,173262,9205,104,178,305392
72 | 50,56000,168479,242965,10462,117,166,422749
73 | 50,64000,246315,379799,14992,132,229,642115
74 | 50,72000,282092,405504,14290,165,244,703033
75 | 50,80000,352981,512679,17941,177,330,884889
76 | 50,88000,438449,643205,15735,179,353,1098758
77 | 50,96000,504656,724577,19070,207,219,1249631
78 | 50,104000,592942,836562,24831,189,317,1455788
79 | 50,112000,714665,1047422,21969,239,428,1785726
80 | 50,120000,847335,1254820,17559,209,548,2121531
81 | 50,128000,969114,1450654,24317,259,424,2445925
82 | 


--------------------------------------------------------------------------------
/paper/hdbscan_clustering_result.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/paper/hdbscan_clustering_result.png


--------------------------------------------------------------------------------
/paper/hdbscan_condensed_tree.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/paper/hdbscan_condensed_tree.png


--------------------------------------------------------------------------------
/paper/paper.bib:
--------------------------------------------------------------------------------
 1 | @inproceedings{campello2013density,
 2 |   title={Density-based clustering based on hierarchical density estimates},
 3 |   author={Campello, Ricardo JGB and Moulavi, Davoud and Sander, Joerg},
 4 |   booktitle={Pacific-Asia Conference on Knowledge Discovery and Data Mining},
 5 |   pages={160--172},
 6 |   year={2013},
 7 |   organization={Springer},
 8 |   doi={10.1007/978-3-642-37456-2_14},
 9 |   url={http://dx.doi.org/10.1007/978-3-642-37456-2_14}
10 | }
11 | 
12 | @article{campello2015hierarchical,
13 |   title={Hierarchical density estimates for data clustering, visualization, and outlier detection},
14 |   author={Campello, Ricardo JGB and Moulavi, Davoud and Zimek, Arthur and Sander, J{\"o}rg},
15 |   journal={ACM Transactions on Knowledge Discovery from Data (TKDD)},
16 |   volume={10},
17 |   number={1},
18 |   pages={5},
19 |   year={2015},
20 |   publisher={ACM},
21 |   url = {http://doi.acm.org/10.1145/2733381},
22 |   doi = {10.1145/2733381}
23 | }
24 | 
25 | @article{chaudhuri2014consistent,
26 |   title={Consistent procedures for cluster tree estimation and pruning},
27 |   author={Chaudhuri, Kamalika and Dasgupta, Sanjoy and Kpotufe, Samory and von Luxburg, Ulrike},
28 |   journal={IEEE Transactions on Information Theory},
29 |   volume={60},
30 |   number={12},
31 |   pages={7900--7912},
32 |   year={2014},
33 |   publisher={IEEE},
34 |   doi={10.1109/TIT.2014.2361055}
35 | }
36 | 
37 | @inproceedings{chaudhuri2010rates,
38 |  author = {Chaudhuri, Kamalika and Dasgupta, Sanjoy},
39 |  title = {Rates of Convergence for the Cluster Tree},
40 |  booktitle = {Proceedings of the 23rd International Conference on Neural Information Processing Systems},
41 |  series = {NIPS'10},
42 |  year = {2010},
43 |  location = {Vancouver, British Columbia, Canada},
44 |  pages = {343--351},
45 |  numpages = {9},
46 |  url = {https://papers.nips.cc/paper/4068-rates-of-convergence-for-the-cluster-tree},
47 |  acmid = {2997228},
48 |  publisher = {Curran Associates Inc.},
49 |  address = {USA},
50 | } 
51 | 


--------------------------------------------------------------------------------
/paper/paper.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: 'hdbscan: Hierarchical density based clustering'
 3 | tags:
 4 |   - clustering
 5 |   - unsupervised learning
 6 |   - machine learning
 7 | authors:
 8 |  - name: Leland McInnes
 9 |    orcid: 0000-0003-2143-6834
10 |    affiliation: 1
11 |  - name: John Healy
12 |    affiliation: 1
13 |  - name: Steve Astels
14 |    affiliation: 2
15 | affiliations:
16 |  - name: Tutte Institute for Mathematics and Computing
17 |    index: 1
18 |  - name: Shopify
19 |    index: 2
20 | date: 26 February 2017
21 | bibliography: paper.bib
22 | ---
23 | 
24 | # Summary
25 | 
26 | HDBSCAN: Hierarchical Density-Based Spatial Clustering of Applications with Noise 
27 | [@campello2013density], [@campello2015hierarchical]. 
28 | Performs DBSCAN over varying epsilon values and integrates the result to find a 
29 | clustering that gives the best stability over epsilon. This allows HDBSCAN to 
30 | find clusters of varying densities (unlike DBSCAN), and be more robust to parameter 
31 | selection. The library also includes support for Robust Single Linkage clustering
32 | [@chaudhuri2014consistent], [@chaudhuri2010rates],
33 | GLOSH outlier detection [@campello2015hierarchical], and tools for visualizing 
34 | and exploring cluster structures.
35 | Finally support for prediction and soft clustering is also available.
36 | 
37 | -![Example clustering results.](hdbscan_clustering_result.png)
38 | -![Hierarchical tree structure.](hdbscan_condensed_tree.png)
39 | 
40 | # References
41 |   
42 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = [
3 |   "setuptools",
4 |   "wheel",
5 |   "cython<4",
6 |   "numpy<3"
7 | ]
8 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy>=1.20,<3
2 | scipy>= 1.0
3 | scikit-learn>=0.20
4 | joblib>=1.0
5 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
  1 | import warnings
  2 | 
  3 | try:
  4 |     # from Cython.Distutils import build_ext
  5 |     from Cython.Build import cythonize
  6 |     from setuptools import setup, Extension
  7 |     from setuptools.command.build_ext import build_ext
  8 |     HAVE_CYTHON = True
  9 | except ImportError as e:
 10 |     warnings.warn(e.args[0])
 11 |     cythonize = lambda ext: ext
 12 |     from setuptools import setup, Extension
 13 |     from setuptools.command.build_ext import build_ext
 14 |     HAVE_CYTHON = False
 15 | 
 16 | 
 17 | class CustomBuildExtCommand(build_ext):
 18 |     """build_ext command for use when numpy headers are needed."""
 19 | 
 20 |     def run(self):
 21 | 
 22 |         # Import numpy here, only when headers are needed
 23 |         import numpy
 24 | 
 25 |         # Add numpy headers to include_dirs
 26 |         self.include_dirs.append(numpy.get_include())
 27 | 
 28 |         # Call original build_ext command
 29 |         build_ext.run(self)
 30 | 
 31 | 
 32 | _hdbscan_tree = Extension('hdbscan._hdbscan_tree',
 33 |                           sources=['hdbscan/_hdbscan_tree.pyx'])
 34 | _hdbscan_linkage = Extension('hdbscan._hdbscan_linkage',
 35 |                              sources=['hdbscan/_hdbscan_linkage.pyx'])
 36 | _hdbscan_boruvka = Extension('hdbscan._hdbscan_boruvka',
 37 |                              sources=['hdbscan/_hdbscan_boruvka.pyx'])
 38 | _hdbscan_reachability = Extension('hdbscan._hdbscan_reachability',
 39 |                                   sources=['hdbscan/_hdbscan_reachability.pyx'])
 40 | _prediction_utils = Extension('hdbscan._prediction_utils',
 41 |                               sources=['hdbscan/_prediction_utils.pyx'])
 42 | dist_metrics = Extension('hdbscan.dist_metrics',
 43 |                          sources=['hdbscan/dist_metrics.pyx'])
 44 | 
 45 | 
 46 | 
 47 | def readme():
 48 |     with open('README.rst') as readme_file:
 49 |         return readme_file.read()
 50 | 
 51 | def requirements():
 52 |     # The dependencies are the same as the contents of requirements.txt
 53 |     with open('requirements.txt') as f:
 54 |         return [line.strip() for line in f if line.strip()]
 55 | 
 56 | configuration = {
 57 |     'name': 'hdbscan',
 58 |     'version': '0.8.40',
 59 |     'description': 'Clustering based on density with variable density clusters',
 60 |     'long_description': readme(),
 61 |     'classifiers': [
 62 |         'Development Status :: 4 - Beta',
 63 |         'Intended Audience :: Science/Research',
 64 |         'Intended Audience :: Developers',
 65 |         'License :: OSI Approved',
 66 |         'Programming Language :: C',
 67 |         'Programming Language :: Python',
 68 |         'Topic :: Software Development',
 69 |         'Topic :: Scientific/Engineering',
 70 |         'Operating System :: Microsoft :: Windows',
 71 |         'Operating System :: POSIX',
 72 |         'Operating System :: Unix',
 73 |         'Operating System :: MacOS',
 74 |         'Programming Language :: Python :: 3.9',
 75 |         'Programming Language :: Python :: 3.10',
 76 |         'Programming Language :: Python :: 3.11',
 77 |         'Programming Language :: Python :: 3.12',
 78 |     ],
 79 |     'keywords': 'cluster clustering density hierarchical',
 80 |     'url': 'http://github.com/scikit-learn-contrib/hdbscan',
 81 |     'maintainer': 'Leland McInnes',
 82 |     'maintainer_email': 'leland.mcinnes@gmail.com',
 83 |     'license': 'BSD',
 84 |     'packages': ['hdbscan', 'hdbscan.tests'],
 85 |     'install_requires': requirements(),
 86 |     'ext_modules': cythonize([
 87 |                     _hdbscan_tree,
 88 |                     _hdbscan_linkage,
 89 |                     _hdbscan_boruvka,
 90 |                     _hdbscan_reachability,
 91 |                     _prediction_utils,
 92 |                     dist_metrics]),
 93 |     'cmdclass': {'build_ext': CustomBuildExtCommand},
 94 |     'test_suite': 'nose.collector',
 95 |     'tests_require': ['nose'],
 96 |     'data_files': ('hdbscan/dist_metrics.pxd',)
 97 | }
 98 | 
 99 | if not HAVE_CYTHON:
100 |     warnings.warn('Due to incompatibilities with Python 3.7 hdbscan now'
101 |                   'requires Cython to be installed in order to build it')
102 |     raise ImportError('Cython not found! Please install cython and try again')
103 | 
104 | setup(**configuration)
105 | 


--------------------------------------------------------------------------------