├── .github └── workflows │ ├── pythonpublish.yml │ ├── pythonpublish_wheel.yml │ └── pythonpublish_windows.yml ├── .gitignore ├── .idea ├── .gitignore ├── codeStyleSettings.xml ├── codeStyles │ ├── Project.xml │ └── codeStyleConfig.xml ├── hdbscan.iml ├── inspectionProfiles │ └── Project_Default.xml ├── misc.xml ├── modules.xml └── vcs.xml ├── .nojekyll ├── .pep8speaks.yml ├── .readthedocs.yaml ├── .travis.yml ├── CODE_OF_CONDUCT.md ├── LICENSE ├── MANIFEST.in ├── README.rst ├── azure-pipelines.yml ├── ci_scripts └── push_doc.sh ├── circle.yml ├── docs ├── Makefile ├── advanced_hdbscan.rst ├── api.rst ├── basic_hdbscan.rst ├── comparing_clustering_algorithms.rst ├── conf.py ├── dbscan_from_hdbscan.rst ├── docs_requirements.txt ├── faq.rst ├── how_hdbscan_works.rst ├── how_to_detect_branches.rst ├── how_to_use_epsilon.rst ├── images │ ├── advanced_hdbscan_11_1.png │ ├── advanced_hdbscan_26_1.png │ ├── advanced_hdbscan_3_1.png │ ├── advanced_hdbscan_5_1.png │ ├── advanced_hdbscan_9_1.png │ ├── allow_single_cluster.png │ ├── comparing_clustering_algorithms_12_0.png │ ├── comparing_clustering_algorithms_15_0.png │ ├── comparing_clustering_algorithms_18_0.png │ ├── comparing_clustering_algorithms_21_0.png │ ├── comparing_clustering_algorithms_24_0.png │ ├── comparing_clustering_algorithms_27_0.png │ ├── comparing_clustering_algorithms_31_0.png │ ├── comparing_clustering_algorithms_6_0.png │ ├── distance1.svg │ ├── distance2.svg │ ├── distance3.svg │ ├── distance4.svg │ ├── distance4a.svg │ ├── distance5.svg │ ├── epsilon_parameter_dataset.png │ ├── epsilon_parameter_dbscan.png │ ├── epsilon_parameter_hdbscan_e3_leaf.png │ ├── epsilon_parameter_hdbscan_eom.png │ ├── epsilon_parameter_hdbscan_eps.png │ ├── generative_model_kde.png │ ├── generative_model_scatter.png │ ├── how_hdbscan_works_10_1.png │ ├── how_hdbscan_works_12_1.png │ ├── how_hdbscan_works_15_1.png │ ├── how_hdbscan_works_18_1.png │ ├── how_hdbscan_works_20_1.png │ ├── how_hdbscan_works_3_1.png │ ├── how_to_detect_branches_13_0.png │ ├── how_to_detect_branches_15_0.png │ ├── how_to_detect_branches_17_0.png │ ├── how_to_detect_branches_19_0.png │ ├── how_to_detect_branches_21_0.png │ ├── how_to_detect_branches_23_0.png │ ├── how_to_detect_branches_25_0.png │ ├── how_to_detect_branches_3_0.png │ ├── how_to_detect_branches_5_0.png │ ├── how_to_detect_branches_7_0.png │ ├── how_to_detect_branches_9_0.png │ ├── outlier_detection_3_1.png │ ├── outlier_detection_7_1.png │ ├── outlier_detection_9_1.png │ ├── parameter_selection_11_1.png │ ├── parameter_selection_12_1.png │ ├── parameter_selection_15_1.png │ ├── parameter_selection_18_1.png │ ├── parameter_selection_3_1.png │ ├── parameter_selection_7_1.png │ ├── parameter_selection_9_1.png │ ├── performance_and_scalability_14_1.png │ ├── performance_and_scalability_20_2.png │ ├── performance_and_scalability_24_1.png │ ├── performance_and_scalability_9_1.png │ ├── prediction_tutorial_3_0.png │ ├── prediction_tutorial_5_1.png │ ├── prediction_tutorial_9_1.png │ ├── soft_clustering_10_1.png │ ├── soft_clustering_13_1.png │ ├── soft_clustering_15_0.png │ ├── soft_clustering_3_1.png │ ├── soft_clustering_6_1.png │ ├── soft_clustering_8_1.png │ ├── soft_clustering_explanation_11_0.png │ ├── soft_clustering_explanation_15_0.png │ ├── soft_clustering_explanation_26_0.png │ ├── soft_clustering_explanation_2_0.png │ ├── soft_clustering_explanation_31_0.png │ ├── soft_clustering_explanation_36_0.png │ └── soft_clustering_explanation_6_0.png ├── index.rst ├── make.bat ├── outlier_detection.rst ├── 
parameter_selection.rst ├── performance_and_scalability.rst ├── prediction_tutorial.rst ├── soft_clustering.rst └── soft_clustering_explanation.rst ├── environment.yml ├── examples ├── plot_cluster_comparison.py └── plot_hdbscan.py ├── hdbscan ├── __init__.py ├── _hdbscan_boruvka.pyx ├── _hdbscan_linkage.pyx ├── _hdbscan_reachability.pyx ├── _hdbscan_tree.pyx ├── _prediction_utils.pyx ├── branch_data.py ├── branches.py ├── dist_metrics.pxd ├── dist_metrics.pyx ├── flat.py ├── hdbscan_.py ├── plots.py ├── prediction.py ├── robust_single_linkage_.py ├── tests │ ├── __init__.py │ ├── test_branches.py │ ├── test_flat.py │ ├── test_hdbscan.py │ ├── test_prediction_utils.py │ └── test_rsl.py └── validity.py ├── notebooks ├── Benchmarking scalability of clustering implementations 2D v0.7.ipynb ├── Benchmarking scalability of clustering implementations-v0.7.ipynb ├── Comparing Clustering Algorithms.ipynb ├── Flat clustering.ipynb ├── How HDBSCAN Works.ipynb ├── How Soft Clustering for HDBSCAN Works.ipynb ├── How to detect branches.ipynb ├── Looking at cluster consistency.ipynb ├── Performance data generation .ipynb ├── Python vs Java.ipynb ├── clusterable_data.npy ├── distance1.svg ├── distance2.svg ├── distance3.svg ├── distance4.svg ├── distance4a.svg ├── distance5.svg ├── hdbscan01_timings.csv ├── hdbscan02_timings.csv ├── hdbscan03_timings.csv ├── hdbscan04_timings.csv ├── hdbscan05_timings.csv ├── hdbscan06_timings.csv ├── reference_impl_external_timings.csv └── reference_impl_internal_timings.csv ├── paper ├── hdbscan_clustering_result.png ├── hdbscan_condensed_tree.png ├── paper.bib └── paper.md ├── pyproject.toml ├── requirements.txt └── setup.py /.github/workflows/pythonpublish.yml: -------------------------------------------------------------------------------- 1 | name: Upload Python Package 2 | 3 | on: 4 | release: 5 | types: [created] 6 | 7 | jobs: 8 | deploy: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v1 12 | - name: Set up Python 13 | uses: actions/setup-python@v1 14 | with: 15 | python-version: '3.x' 16 | - name: Install dependencies 17 | run: | 18 | python -m pip install --upgrade pip 19 | pip install setuptools wheel twine 20 | pip install scikit-learn cython 21 | pip install auditwheel 22 | - name: Build and publish 23 | env: 24 | TWINE_USERNAME: __token__ 25 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 26 | run: | 27 | python setup.py sdist bdist_wheel 28 | twine upload dist/*.tar.gz 29 | auditwheel repair dist/*linux_x86_64.whl 30 | twine upload wheelhouse/*.whl 31 | -------------------------------------------------------------------------------- /.github/workflows/pythonpublish_wheel.yml: -------------------------------------------------------------------------------- 1 | name: Upload Python Wheels 2 | 3 | on: 4 | workflow_dispatch: 5 | inputs: 6 | checkout_ref: 7 | description: "The branch, tag or SHA to checkout." 
8 | required: true 9 | default: "master" 10 | 11 | jobs: 12 | linux-deploy: 13 | runs-on: ubuntu-latest 14 | container: quay.io/pypa/manylinux2014_x86_64 15 | strategy: 16 | matrix: 17 | python: ["cp38-cp38", "cp39-cp39", "cp310-cp310", "cp311-cp311"] 18 | steps: 19 | - uses: actions/checkout@v1 20 | with: 21 | ref: ${{ inputs.checkout_ref }} 22 | - name: Build wheel 23 | env: 24 | PYTHON: /opt/python/${{ matrix.python }}/bin/python 25 | run: | 26 | $PYTHON -m pip install "cython<3" oldest-supported-numpy 27 | $PYTHON -m build --no-isolation 28 | auditwheel repair dist/*linux_x86_64.whl 29 | - name: Publish to pypi 30 | env: 31 | TWINE_USERNAME: __token__ 32 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 33 | run: | 34 | pipx install twine 35 | twine upload dist/*.tar.gz --skip-existing 36 | twine upload wheelhouse/*.whl --skip-existing 37 | other-deploy: 38 | strategy: 39 | matrix: 40 | python: ["3.9", "3.10", "3.11", "3.12"] 41 | os: [windows-2019, macos-11] 42 | runs-on: ${{ matrix.os }} 43 | steps: 44 | - uses: actions/checkout@v1 45 | with: 46 | ref: ${{ inputs.checkout_ref }} 47 | - name: Set up Python 48 | uses: actions/setup-python@v1 49 | with: 50 | python-version: ${{ matrix.python }} 51 | - name: Install dependencies 52 | run: | 53 | python -m pip install --upgrade pip 54 | pip install setuptools build wheel twine 55 | pip install cython "numpy>=2" 56 | - name: Build wheel 57 | run: | 58 | python -m build --no-isolation 59 | - name: Publish to pypi 60 | env: 61 | TWINE_USERNAME: __token__ 62 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 63 | run: | 64 | twine upload dist/*.whl --skip-existing 65 | -------------------------------------------------------------------------------- /.github/workflows/pythonpublish_windows.yml: -------------------------------------------------------------------------------- 1 | name: Upload Python Package 2 | 3 | on: 4 | release: 5 | types: [created] 6 | 7 | jobs: 8 | deploy: 9 | runs-on: windows-2019 10 | steps: 11 | - uses: actions/checkout@v1 12 | - name: Set up Python 13 | uses: actions/setup-python@v1 14 | with: 15 | python-version: '3.x' 16 | - name: Install dependencies 17 | run: | 18 | python -m pip install --upgrade pip 19 | pip install setuptools wheel twine 20 | pip install scikit-learn cython 21 | pip install auditwheel 22 | - name: Build and publish 23 | env: 24 | TWINE_USERNAME: __token__ 25 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 26 | run: | 27 | python setup.py sdist bdist_wheel 28 | twine upload dist/* 29 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | build/* 2 | hdbscan/*.pyc 3 | hdbscan/tests/*.pyc 4 | hdbscan/*.pyd 5 | hdbscan/tests/*.pyd 6 | hdbscan/__pycache__/* 7 | dist/* 8 | *egg-info 9 | notebooks/.ipynb_checkpoints/* 10 | __pycache__/ 11 | 12 | -------------------------------------------------------------------------------- /.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # Default ignored files 2 | /workspace.xml -------------------------------------------------------------------------------- /.idea/codeStyleSettings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 11 | 13 | -------------------------------------------------------------------------------- /.idea/codeStyles/Project.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 
6 | 7 | -------------------------------------------------------------------------------- /.idea/codeStyles/codeStyleConfig.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | -------------------------------------------------------------------------------- /.idea/hdbscan.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/Project_Default.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 17 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /.nojekyll: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /.pep8speaks.yml: -------------------------------------------------------------------------------- 1 | # File : .pep8speaks.yml 2 | 3 | message: # Customize the comment made by the bot 4 | opened: # Messages when a new PR is submitted 5 | header: "Hello @{name}, Thank you for submitting the Pull Request !" 6 | # The keyword {name} is converted into the author's username 7 | footer: "" 8 | # The messages can be written as they would over GitHub 9 | updated: # Messages when new commits are added to the PR 10 | header: "Hello @{name}, Thank you for updating !" 11 | footer: "" # Why to comment the link to the style guide everytime? :) 12 | no_errors: "Cheers ! There are no PEP8 issues in this Pull Request. :beers: " 13 | 14 | scanner: 15 | diff_only: False # If True, errors caused by only the patch are shown 16 | 17 | pycodestyle: 18 | max-line-length: 100 # Default is 79 in PEP8 19 | ignore: # Errors and warnings to ignore 20 | - W391 21 | - E203 22 | 23 | only_mention_files_with_errors: True # If False, a separate status comment for each file is made. 24 | descending_issues_order: False # If True, PEP8 issues in message will be displayed in descending order of line numbers in the file 25 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | build: 4 | os: ubuntu-22.04 5 | 6 | tools: 7 | python: "3.12" 8 | 9 | 10 | python: 11 | install: 12 | - requirements: docs/docs_requirements.txt 13 | - method: pip 14 | path: . 
15 | 16 | sphinx: 17 | 18 | configuration: docs/conf.py 19 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | 3 | python: 4 | - "3.6" 5 | - "3.7" 6 | - "3.8" 7 | - "3.9" 8 | 9 | cache: 10 | apt: true 11 | # We use three different cache directory 12 | # to work around a Travis bug with multi-platform cache 13 | directories: 14 | - $HOME/.cache/pip 15 | - $HOME/download 16 | env: 17 | global: 18 | # Directory where tests are run from 19 | - TEST_DIR=/tmp/test_dir/ 20 | - MODULE=hdbscan 21 | matrix: 22 | - DISTRIB="conda" 23 | 24 | install: 25 | - wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh; 26 | - bash miniconda.sh -b -p $HOME/miniconda 27 | - source "$HOME/miniconda/etc/profile.d/conda.sh" 28 | - hash -r 29 | - conda config --set always_yes yes --set changeps1 no 30 | - conda update -q conda 31 | - conda info -a 32 | - conda create -q -n testenv python=$TRAVIS_PYTHON_VERSION numpy scipy matplotlib pandas networkx scikit-learn pytest pytest-cov codecov coverage cython 33 | - conda activate testenv 34 | - python -c "import numpy; print('numpy %s' % numpy.__version__)" 35 | - python -c "import scipy; print('scipy %s' % scipy.__version__)" 36 | - python setup.py develop 37 | 38 | script: 39 | - conda activate testenv 40 | - pytest --cov=./ 41 | 42 | after_success: 43 | - bash <(curl -s https://codecov.io/bash) 44 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, gender identity and expression, level of experience, nationality, personal appearance, race, religion, or sexual identity and orientation. 6 | 7 | ## Our Standards 8 | 9 | Examples of behavior that contributes to creating a positive environment include: 10 | 11 | * Using welcoming and inclusive language 12 | * Being respectful of differing viewpoints and experiences 13 | * Gracefully accepting constructive criticism 14 | * Focusing on what is best for the community 15 | * Showing empathy towards other community members 16 | 17 | Examples of unacceptable behavior by participants include: 18 | 19 | * The use of sexualized language or imagery and unwelcome sexual attention or advances 20 | * Trolling, insulting/derogatory comments, and personal or political attacks 21 | * Public or private harassment 22 | * Publishing others' private information, such as a physical or electronic address, without explicit permission 23 | * Other conduct which could reasonably be considered inappropriate in a professional setting 24 | 25 | ## Our Responsibilities 26 | 27 | Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. 
28 | 29 | Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. 30 | 31 | ## Scope 32 | 33 | This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers. 34 | 35 | ## Enforcement 36 | 37 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at leland.mcinnes@gmail.com. The project team will review and investigate all complaints, and will respond in a way that it deems appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. 38 | 39 | Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership. 40 | 41 | ## Attribution 42 | 43 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at [http://contributor-covenant.org/version/1/4][version] 44 | 45 | [homepage]: http://contributor-covenant.org 46 | [version]: http://contributor-covenant.org/version/1/4/ 47 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015, Leland McInnes 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 5 | 6 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 7 | 8 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 9 | 10 | 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 11 | 12 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.rst *.txt pyproject.toml LICENSE 2 | recursive-include hdbscan *.py *.pyx *.pxd *.c 3 | recursive-include notebooks *.ipynb *.npy *.svg 4 | recursive-include examples *.py 5 | -------------------------------------------------------------------------------- /azure-pipelines.yml: -------------------------------------------------------------------------------- 1 | # Trigger a build when there is a push to the main branch or a tag starts with release- 2 | trigger: 3 | branches: 4 | include: 5 | - master 6 | tags: 7 | include: 8 | - release-* 9 | 10 | # Trigger a build when there is a pull request to the main branch 11 | # Ignore PRs that are just updating the docs 12 | pr: 13 | branches: 14 | include: 15 | - master 16 | exclude: 17 | - doc/* 18 | - README.rst 19 | 20 | variables: 21 | triggeredByPullRequest: $[eq(variables['Build.Reason'], 'PullRequest')] 22 | 23 | stages: 24 | - stage: RunAllTests 25 | displayName: Run test suite 26 | jobs: 27 | - job: run_platform_tests 28 | strategy: 29 | matrix: 30 | mac_py39: 31 | imageName: 'macOS-latest' 32 | python.version: '3.9' 33 | linux_py39: 34 | imageName: 'ubuntu-latest' 35 | python.version: '3.9' 36 | windows_py39: 37 | imageName: 'windows-latest' 38 | python.version: '3.9' 39 | mac_py310: 40 | imageName: 'macOS-latest' 41 | python.version: '3.10' 42 | linux_py310: 43 | imageName: 'ubuntu-latest' 44 | python.version: '3.10' 45 | windows_py310: 46 | imageName: 'windows-latest' 47 | python.version: '3.10' 48 | mac_py311: 49 | imageName: 'macOS-latest' 50 | python.version: '3.11' 51 | linux_py311: 52 | imageName: 'ubuntu-latest' 53 | python.version: '3.11' 54 | windows_py311: 55 | imageName: 'windows-latest' 56 | python.version: '3.11' 57 | mac_py312: 58 | imageName: 'macOS-latest' 59 | python.version: '3.12' 60 | linux_py312: 61 | imageName: 'ubuntu-latest' 62 | python.version: '3.12' 63 | windows_py312: 64 | imageName: 'windows-latest' 65 | python.version: '3.12' 66 | pool: 67 | vmImage: $(imageName) 68 | 69 | steps: 70 | - task: UsePythonVersion@0 71 | inputs: 72 | versionSpec: '$(python.version)' 73 | displayName: 'Use Python $(python.version)' 74 | 75 | - script: | 76 | python -m pip install --upgrade pip 77 | pip install -r requirements.txt 78 | displayName: 'Install dependencies' 79 | 80 | - script: | 81 | pip install -e . 
82 | pip install pytest pytest-azurepipelines 83 | pip install pytest-cov 84 | pip install coveralls 85 | displayName: 'Install package' 86 | 87 | - script: | 88 | pytest hdbscan/tests --show-capture=no -v --disable-warnings --junitxml=junit/test-results.xml --cov=hdbscan/ --cov-report=xml --cov-report=html 89 | displayName: 'Run tests' 90 | 91 | - bash: | 92 | coveralls 93 | displayName: 'Publish to coveralls' 94 | condition: and(succeeded(), eq(variables.triggeredByPullRequest, false)) # Don't run this for PRs because they can't access pipeline secrets 95 | env: 96 | COVERALLS_REPO_TOKEN: $(COVERALLS_TOKEN) 97 | 98 | - task: PublishTestResults@2 99 | inputs: 100 | testResultsFiles: '$(System.DefaultWorkingDirectory)/**/coverage.xml' 101 | testRunTitle: '$(Agent.OS) - $(Build.BuildNumber)[$(Agent.JobName)] - Python $(python.version)' 102 | condition: succeededOrFailed() 103 | 104 | - stage: BuildPublishArtifact 105 | dependsOn: RunAllTests 106 | condition: and(succeeded(), startsWith(variables['Build.SourceBranch'], 'refs/tags/release-'), eq(variables.triggeredByPullRequest, false)) 107 | jobs: 108 | # Need to use manylinux as ubuntu-latest is too new 109 | - job: Manylinux2014Build 110 | pool: 111 | vmImage: 'ubuntu-latest' 112 | container: quay.io/pypa/manylinux2014_x86_64:latest 113 | strategy: 114 | matrix: 115 | linux_py38: 116 | python.version: 'cp38-cp38' 117 | linux_py39: 118 | python.version: 'cp39-cp39' 119 | linux_py310: 120 | python.version: 'cp310-cp310' 121 | linux_py311: 122 | python.version: 'cp311-cp311' 123 | linux_py312: 124 | python.version: 'cp312-cp312' 125 | steps: 126 | - script: | 127 | "${PYBIN}/python" -m pip install --upgrade pip 128 | "${PYBIN}/python" -m pip install wheel 129 | "${PYBIN}/python" -m pip install -r requirements.txt 130 | "${PYBIN}/python" -m pip install cython 131 | displayName: 'Install dependencies and build tools' 132 | env: 133 | PYBIN: /opt/python/$(python.version)/bin 134 | - script: | 135 | "${PYBIN}/python" setup.py sdist bdist_wheel 136 | displayName: 'Build wheels' 137 | env: 138 | PYBIN: /opt/python/$(python.version)/bin 139 | - bash: | 140 | auditwheel repair dist/*linux_x86_64.whl --plat manylinux2014_x86_64 -w wheelhouse-manylinux/ 141 | displayName: 'Audit wheels' 142 | 143 | - task: DownloadSecureFile@1 144 | name: PYPIRC_CONFIG 145 | displayName: 'Download pypirc' 146 | inputs: 147 | secureFile: 'pypirc' 148 | 149 | - bash: | 150 | "${PYBIN}/python" -m pip install twine 151 | "${PYBIN}/python" -m twine upload -r pypi --config-file $(PYPIRC_CONFIG.secureFilePath) --skip-existing --disable-progress-bar wheelhouse-manylinux/* 152 | "${PYBIN}/python" -m twine upload -r pypi --config-file $(PYPIRC_CONFIG.secureFilePath) --skip-existing --disable-progress-bar dist/*.tar.gz 153 | displayName: 'Publish wheel to PyPi' 154 | env: 155 | PYBIN: /opt/python/$(python.version)/bin 156 | 157 | - job: BuildWindowsAndMacOSArtifacts 158 | displayName: Build source dists and wheels for windows and macOS 159 | strategy: 160 | matrix: 161 | mac_py38: 162 | imageName: 'macOS-latest' 163 | python.version: '3.8' 164 | windows_py38: 165 | imageName: 'windows-latest' 166 | python.version: '3.8' 167 | mac_py39: 168 | imageName: 'macOS-latest' 169 | python.version: '3.9' 170 | windows_py39: 171 | imageName: 'windows-latest' 172 | python.version: '3.9' 173 | mac_py310: 174 | imageName: 'macOS-latest' 175 | python.version: '3.10' 176 | windows_py310: 177 | imageName: 'windows-latest' 178 | python.version: '3.10' 179 | mac_py311: 180 | imageName: 
'macOS-latest' 181 | python.version: '3.11' 182 | windows_py311: 183 | imageName: 'windows-latest' 184 | python.version: '3.11' 185 | mac_py312: 186 | imageName: 'macOS-latest' 187 | python.version: '3.12' 188 | windows_py312: 189 | imageName: 'windows-latest' 190 | python.version: '3.12' 191 | pool: 192 | vmImage: $(imageName) 193 | 194 | steps: 195 | - task: UsePythonVersion@0 196 | inputs: 197 | versionSpec: '$(python.version)' 198 | displayName: 'Use Python $(python.version)' 199 | 200 | - script: | 201 | python -m pip install --upgrade pip 202 | pip install wheel 203 | pip install -r requirements.txt 204 | pip install cython 205 | pip install setuptools 206 | displayName: 'Install dependencies' 207 | 208 | - script: | 209 | pip install -e . 210 | displayName: 'Install package locally' 211 | 212 | - bash: | 213 | python setup.py sdist bdist_wheel 214 | displayName: 'Build package' 215 | 216 | - bash: | 217 | export PACKAGE_VERSION="$(python setup.py --version)" 218 | echo "Package Version: ${PACKAGE_VERSION}" 219 | echo "##vso[task.setvariable variable=packageVersionFormatted;]release-${PACKAGE_VERSION}" 220 | displayName: 'Get package version' 221 | 222 | - script: | 223 | echo "Version in git tag $(Build.SourceBranchName) does not match version derived from setup.py $(packageVersionFormatted)" 224 | exit 1 225 | displayName: Raise error if version doesnt match tag 226 | condition: and(succeeded(), ne(variables['Build.SourceBranchName'], variables['packageVersionFormatted'])) 227 | 228 | - task: DownloadSecureFile@1 229 | name: PYPIRC_CONFIG 230 | displayName: 'Download pypirc' 231 | inputs: 232 | secureFile: 'pypirc' 233 | 234 | - script: | 235 | pip install twine 236 | twine upload -r pypi --config-file $(PYPIRC_CONFIG.secureFilePath) --skip-existing dist/* 237 | displayName: 'Upload to PyPI' 238 | condition: and(succeeded(), eq(variables['Build.SourceBranchName'], variables['packageVersionFormatted'])) 239 | -------------------------------------------------------------------------------- /ci_scripts/push_doc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This script is meant to be called in the "deploy" step defined in 3 | # circle.yml. See https://circleci.com/docs/ for more details. 4 | # The behavior of the script is controlled by environment variable defined 5 | # in the circle.yml in the top level folder of the project. 6 | 7 | MSG="Pushing the docs for revision for branch: $CIRCLE_BRANCH, commit $CIRCLE_SHA1" 8 | 9 | cd $HOME 10 | # Copy the build docs to a temporary folder 11 | rm -rf tmp 12 | mkdir tmp 13 | cp -R $HOME/$DOC_REPO/doc/_build/html/* ./tmp/ 14 | 15 | # Clone the docs repo if it isnt already there 16 | if [ ! -d $DOC_REPO ]; 17 | then git clone "git@github.com:$USERNAME/"$DOC_REPO".git"; 18 | fi 19 | 20 | cd $DOC_REPO 21 | git branch gh-pages 22 | git checkout -f gh-pages 23 | git reset --hard origin/gh-pages 24 | git clean -dfx 25 | 26 | for name in $(ls -A $HOME/$DOC_REPO); do 27 | case $name in 28 | .nojekyll) # So that github does not build this as a Jekyll website. 29 | ;; 30 | circle.yml) # Config so that build gh-pages branch. 31 | ;; 32 | *) 33 | git rm -rf $name 34 | ;; 35 | esac 36 | done 37 | 38 | # Copy the new build docs 39 | mkdir $DOC_URL 40 | cp -R $HOME/tmp/* ./$DOC_URL/ 41 | 42 | git config --global user.email $EMAIL 43 | git config --global user.name $USERNAME 44 | git add -f ./$DOC_URL/ 45 | git commit -m "$MSG" 46 | git push -f origin gh-pages 47 | if [ $? 
-ne 0 ]; then 48 | echo "Pushing docs failed" 49 | echo 50 | exit 1 51 | fi 52 | 53 | echo $MSG 54 | -------------------------------------------------------------------------------- /circle.yml: -------------------------------------------------------------------------------- 1 | machine: 2 | environment: 3 | # The github organization or username of the repository which hosts the 4 | # project and documentation. 5 | USERNAME: "scikit-learn-contrib" 6 | 7 | # The repository where the documentation will be hosted 8 | DOC_REPO: "hdbscan" 9 | 10 | # The base URL for the Github page where the documentation will be hosted 11 | DOC_URL: "" 12 | 13 | # The email is to be used for commits in the Github Page 14 | EMAIL: "leland.mcinnes+ci@gmail.com" 15 | 16 | dependencies: 17 | 18 | # Various dependencies 19 | pre: 20 | - sudo -E apt-get -yq remove texlive-binaries --purge 21 | - sudo apt-get update 22 | - sudo apt-get install libatlas-dev libatlas3gf-base 23 | - sudo apt-get install build-essential python-dev python-setuptools 24 | # install numpy first as it is a compile time dependency for other packages 25 | - pip install --upgrade numpy 26 | - pip install --upgrade scipy matplotlib setuptools nose coverage sphinx pillow sphinx-gallery sphinx_rtd_theme 27 | # Installing required packages for `make -C doc check command` to work. 28 | - sudo -E apt-get -yq update 29 | - sudo -E apt-get -yq --no-install-suggests --no-install-recommends --force-yes install dvipng texlive-latex-base texlive-latex-extra 30 | - pip install --upgrade cython numpydoc 31 | - pip install --upgrade scikit-learn 32 | 33 | # The --user is needed to let sphinx see the source and the binaries 34 | # The pipefail is requested to propagate exit code 35 | override: 36 | - python setup.py clean 37 | - python setup.py develop 38 | - set -o pipefail && cd doc && make html 2>&1 | tee ~/log.txt 39 | test: 40 | # Grep error on the documentation 41 | override: 42 | - cat ~/log.txt && if grep -q "Traceback (most recent call last):" ~/log.txt; then false; else true; fi 43 | deployment: 44 | push: 45 | branch: master 46 | commands: 47 | - bash ci_scripts/push_doc.sh 48 | general: 49 | # Open the doc to the API 50 | artifacts: 51 | - "doc/_build/html" 52 | - "~/log.txt" 53 | # Restric the build to the branch master only 54 | branches: 55 | ignore: 56 | - gh-pages 57 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # User-friendly check for sphinx-build 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) 13 | endif 14 | 15 | # Internal variables. 16 | PAPEROPT_a4 = -D latex_paper_size=a4 17 | PAPEROPT_letter = -D latex_paper_size=letter 18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 19 | # the i18n builder cannot share the environment and doctrees with the others 20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
21 | 22 | .PHONY: help 23 | help: 24 | @echo "Please use \`make ' where is one of" 25 | @echo " html to make standalone HTML files" 26 | @echo " dirhtml to make HTML files named index.html in directories" 27 | @echo " singlehtml to make a single large HTML file" 28 | @echo " pickle to make pickle files" 29 | @echo " json to make JSON files" 30 | @echo " htmlhelp to make HTML files and a HTML help project" 31 | @echo " qthelp to make HTML files and a qthelp project" 32 | @echo " applehelp to make an Apple Help Book" 33 | @echo " devhelp to make HTML files and a Devhelp project" 34 | @echo " epub to make an epub" 35 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 36 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 37 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 38 | @echo " text to make text files" 39 | @echo " man to make manual pages" 40 | @echo " texinfo to make Texinfo files" 41 | @echo " info to make Texinfo files and run them through makeinfo" 42 | @echo " gettext to make PO message catalogs" 43 | @echo " changes to make an overview of all changed/added/deprecated items" 44 | @echo " xml to make Docutils-native XML files" 45 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 46 | @echo " linkcheck to check all external links for integrity" 47 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 48 | @echo " coverage to run coverage check of the documentation (if enabled)" 49 | 50 | .PHONY: clean 51 | clean: 52 | rm -rf $(BUILDDIR)/* 53 | 54 | .PHONY: html 55 | html: 56 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 57 | @echo 58 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 59 | 60 | .PHONY: dirhtml 61 | dirhtml: 62 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 63 | @echo 64 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 65 | 66 | .PHONY: singlehtml 67 | singlehtml: 68 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 69 | @echo 70 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 71 | 72 | .PHONY: pickle 73 | pickle: 74 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 75 | @echo 76 | @echo "Build finished; now you can process the pickle files." 77 | 78 | .PHONY: json 79 | json: 80 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 81 | @echo 82 | @echo "Build finished; now you can process the JSON files." 83 | 84 | .PHONY: htmlhelp 85 | htmlhelp: 86 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 87 | @echo 88 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 89 | ".hhp project file in $(BUILDDIR)/htmlhelp." 90 | 91 | .PHONY: qthelp 92 | qthelp: 93 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 94 | @echo 95 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 96 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 97 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/hdbscan.qhcp" 98 | @echo "To view the help file:" 99 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/hdbscan.qhc" 100 | 101 | .PHONY: applehelp 102 | applehelp: 103 | $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp 104 | @echo 105 | @echo "Build finished. The help book is in $(BUILDDIR)/applehelp." 106 | @echo "N.B. 
You won't be able to view it unless you put it in" \ 107 | "~/Library/Documentation/Help or install it in your application" \ 108 | "bundle." 109 | 110 | .PHONY: devhelp 111 | devhelp: 112 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 113 | @echo 114 | @echo "Build finished." 115 | @echo "To view the help file:" 116 | @echo "# mkdir -p $$HOME/.local/share/devhelp/hdbscan" 117 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/hdbscan" 118 | @echo "# devhelp" 119 | 120 | .PHONY: epub 121 | epub: 122 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 123 | @echo 124 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 125 | 126 | .PHONY: latex 127 | latex: 128 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 129 | @echo 130 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 131 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 132 | "(use \`make latexpdf' here to do that automatically)." 133 | 134 | .PHONY: latexpdf 135 | latexpdf: 136 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 137 | @echo "Running LaTeX files through pdflatex..." 138 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 139 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 140 | 141 | .PHONY: latexpdfja 142 | latexpdfja: 143 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 144 | @echo "Running LaTeX files through platex and dvipdfmx..." 145 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 146 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 147 | 148 | .PHONY: text 149 | text: 150 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 151 | @echo 152 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 153 | 154 | .PHONY: man 155 | man: 156 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 157 | @echo 158 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 159 | 160 | .PHONY: texinfo 161 | texinfo: 162 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 163 | @echo 164 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 165 | @echo "Run \`make' in that directory to run these through makeinfo" \ 166 | "(use \`make info' here to do that automatically)." 167 | 168 | .PHONY: info 169 | info: 170 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 171 | @echo "Running Texinfo files through makeinfo..." 172 | make -C $(BUILDDIR)/texinfo info 173 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 174 | 175 | .PHONY: gettext 176 | gettext: 177 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 178 | @echo 179 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 180 | 181 | .PHONY: changes 182 | changes: 183 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 184 | @echo 185 | @echo "The overview file is in $(BUILDDIR)/changes." 186 | 187 | .PHONY: linkcheck 188 | linkcheck: 189 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 190 | @echo 191 | @echo "Link check complete; look for any errors in the above output " \ 192 | "or in $(BUILDDIR)/linkcheck/output.txt." 193 | 194 | .PHONY: doctest 195 | doctest: 196 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 197 | @echo "Testing of doctests in the sources finished, look at the " \ 198 | "results in $(BUILDDIR)/doctest/output.txt." 
199 | 200 | .PHONY: coverage 201 | coverage: 202 | $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage 203 | @echo "Testing of coverage in the sources finished, look at the " \ 204 | "results in $(BUILDDIR)/coverage/python.txt." 205 | 206 | .PHONY: xml 207 | xml: 208 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 209 | @echo 210 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 211 | 212 | .PHONY: pseudoxml 213 | pseudoxml: 214 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 215 | @echo 216 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 217 | -------------------------------------------------------------------------------- /docs/advanced_hdbscan.rst: -------------------------------------------------------------------------------- 1 | 2 | Getting More Information About a Clustering 3 | =========================================== 4 | 5 | Once you have the basics of clustering sorted you may want to dig a 6 | little deeper than just the cluster labels returned to you. Fortunately, the hdbscan library provides you with the facilities to do this. During 7 | processing HDBSCAN\* builds a hierarchy of potential clusters, from 8 | which it extracts the flat clustering returned. It can be informative to 9 | look at that hierarchy, and potentially make use of the extra 10 | information contained therein. 11 | 12 | Suppose we have a dataset for clustering. It is a binary file in NumPy format and it can be found at https://github.com/lmcinnes/hdbscan/blob/master/notebooks/clusterable_data.npy. 13 | 14 | .. code:: python 15 | 16 | import hdbscan 17 | import numpy as np 18 | import matplotlib.pyplot as plt 19 | import seaborn as sns 20 | %matplotlib inline 21 | 22 | .. code:: python 23 | 24 | data = np.load('clusterable_data.bin') 25 | #or 26 | data = np.load('clusterable_data.npy') 27 | #depending on the format of the file 28 | 29 | .. code:: python 30 | 31 | data.shape 32 | 33 | .. parsed-literal:: 34 | 35 | (2309, 2) 36 | 37 | .. code:: python 38 | 39 | data 40 | 41 | .. parsed-literal:: 42 | 43 | array([[-0.12153499, -0.22876337], 44 | [-0.22093687, -0.25251088], 45 | [ 0.1259037 , -0.27314321], 46 | ..., 47 | [ 0.50243143, -0.3002958 ], 48 | [ 0.53822256, 0.19412199], 49 | [-0.08688887, -0.2092721 ]]) 50 | 51 | 52 | .. code:: python 53 | 54 | plt.scatter(*data.T, s=50, linewidth=0, c='b', alpha=0.25) 55 | 56 | .. parsed-literal:: 57 | 58 | 59 | 60 | .. image:: images/advanced_hdbscan_3_1.png 61 | 62 | 63 | We can cluster the data as normal, and visualize the labels with 64 | different colors (and even the cluster membership strengths as levels of 65 | saturation) 66 | 67 | .. code:: python 68 | 69 | clusterer = hdbscan.HDBSCAN(min_cluster_size=15).fit(data) 70 | color_palette = sns.color_palette('deep', 8) 71 | cluster_colors = [color_palette[x] if x >= 0 72 | else (0.5, 0.5, 0.5) 73 | for x in clusterer.labels_] 74 | cluster_member_colors = [sns.desaturate(x, p) for x, p in 75 | zip(cluster_colors, clusterer.probabilities_)] 76 | plt.scatter(*data.T, s=50, linewidth=0, c=cluster_member_colors, alpha=0.25) 77 | 78 | 79 | .. image:: images/advanced_hdbscan_5_1.png 80 | 81 | Condensed Trees 82 | --------------- 83 | 84 | The question now is what does the cluster hierarchy look like -- which 85 | clusters are near each other, or could perhaps be merged, and which are 86 | far apart. We can access the basic hierarchy via the :py:attr:`~hdbscan.HDBSCAN.condensed_tree_` 87 | attribute of the clusterer object. 
88 | 89 | .. code:: python 90 | 91 | clusterer.condensed_tree_ 92 | 93 | 94 | 95 | 96 | .. parsed-literal:: 97 | 98 | 99 | 100 | 101 | 102 | This merely gives us a :class:`~hdbscan.plots.CondensedTree` object. If we want to visualize the 103 | hierarchy we can call the :py:meth:`~hdbscan.plots.CondensedTree.plot` method: 104 | 105 | .. code:: python 106 | 107 | clusterer.condensed_tree_.plot() 108 | 109 | 110 | .. image:: images/advanced_hdbscan_9_1.png 111 | 112 | 113 | We can now see the hierarchy as a dendrogram, the width (and color) of 114 | each branch representing the number of points in the cluster at that 115 | level. If we wish to know which branches were selected by the HDBSCAN\* 116 | algorithm we can pass ``select_clusters=True``. You can even pass a 117 | selection palette to color the selections according to the cluster 118 | labeling. 119 | 120 | .. code:: python 121 | 122 | clusterer.condensed_tree_.plot(select_clusters=True, 123 | selection_palette=sns.color_palette('deep', 8)) 124 | 125 | 126 | .. image:: images/advanced_hdbscan_11_1.png 127 | 128 | 129 | From this, we can see, for example, that the yellow cluster at the 130 | center of the plot forms early (breaking off from the pale blue and 131 | purple clusters) and persists for a long time. By comparison the green 132 | cluster, which also forms early, quickly breaks apart and then 133 | vanishes altogether (shattering into clusters all smaller than the 134 | ``min_cluster_size`` of 15). 135 | 136 | You can also see that the pale blue cluster breaks apart into several 137 | subclusters that in turn persist for quite some time -- so there is some 138 | interesting substructure to the pale blue cluster that is not present, 139 | for example, in the dark blue cluster. 140 | 141 | Even this simple visual analysis of the condensed tree can tell you 142 | a lot more about the structure of your data. This is not all we can do 143 | with condensed trees, however. For larger and more complex datasets the 144 | tree itself may be very complex, and it may be desirable to run more 145 | interesting analytics over the tree itself. This can be achieved via 146 | several converter methods: :py:meth:`~hdbscan.plots.CondensedTree.to_networkx`, :py:meth:`~hdbscan.plots.CondensedTree.to_pandas`, and 147 | :py:meth:`~hdbscan.plots.CondensedTree.to_numpy`. 148 | 149 | First we'll consider :py:meth:`~hdbscan.plots.CondensedTree.to_networkx` 150 | 151 | .. code:: python 152 | 153 | clusterer.condensed_tree_.to_networkx() 154 | 155 | 156 | 157 | 158 | .. parsed-literal:: 159 | 160 | 161 | 162 | 163 | 164 | As you can see we get a NetworkX directed graph, on which we can then use 165 | all the regular NetworkX tools and analytics. The graph is richer 166 | than the visual plot above may lead you to believe, however: 167 | 168 | .. code:: python 169 | 170 | g = clusterer.condensed_tree_.to_networkx() 171 | g.number_of_nodes() 172 | 173 | 174 | 175 | 176 | .. parsed-literal:: 177 | 178 | 2338 179 | 180 | 181 | 182 | The graph actually contains nodes for all the points falling out of 183 | clusters as well as the clusters themselves. Each node has an associated 184 | ``size`` attribute and each edge has a ``weight`` of the lambda value 185 | at which that edge forms. This allows for much more interesting 186 | analyses. 187 | 188 | Next, we have the :py:meth:`~hdbscan.plots.CondensedTree.to_pandas` method, which returns a pandas DataFrame 189 | where each row corresponds to an edge of the NetworkX graph: 190 | 191 | ..
code:: python 192 | 193 | clusterer.condensed_tree_.to_pandas().head() 194 | 195 | 196 | 197 | 198 | .. raw:: html 199 | 200 |
201 |     <div>
202 |     <table border="1" class="dataframe">
203 |       <thead>
204 |         <tr style="text-align: right;">
205 |           <th></th><th>parent</th><th>child</th><th>lambda_val</th><th>child_size</th>
206 |         </tr>
207 |       </thead>
208 |       <tbody>
209 |         <tr><th>0</th><td>2309</td><td>2048</td><td>5.016526</td><td>1</td></tr>
210 |         <tr><th>1</th><td>2309</td><td>2006</td><td>5.076503</td><td>1</td></tr>
211 |         <tr><th>2</th><td>2309</td><td>2024</td><td>5.279133</td><td>1</td></tr>
212 |         <tr><th>3</th><td>2309</td><td>2050</td><td>5.347332</td><td>1</td></tr>
213 |         <tr><th>4</th><td>2309</td><td>1992</td><td>5.381930</td><td>1</td></tr>
214 |       </tbody>
215 |     </table>
216 |     </div>
250 | 251 | 252 | 253 | 254 | 255 | Here the ``parent`` denotes the id of the parent cluster, the ``child`` 256 | the id of the child cluster (or, if the child is a single data point 257 | rather than a cluster, the index in the dataset of that point), the 258 | ``lambda_val`` provides the lambda value at which the edge forms, and 259 | the ``child_size`` provides the number of points in the child cluster. 260 | As you can see the start of the DataFrame has singleton points falling 261 | out of the root cluster, with each ``child_size`` equal to 1. 262 | 263 | If you want just the clusters, rather than all the individual points 264 | as well, simply select the rows of the DataFrame with ``child_size`` 265 | greater than 1. 266 | 267 | .. code:: python 268 | 269 | tree = clusterer.condensed_tree_.to_pandas() 270 | cluster_tree = tree[tree.child_size > 1] 271 | 272 | 273 | 274 | Finally we have the :py:meth:`~hdbscan.plots.CondensedTree.to_numpy` function, which returns a numpy record 275 | array: 276 | 277 | .. code:: python 278 | 279 | clusterer.condensed_tree_.to_numpy() 280 | 281 | 282 | 283 | 284 | .. parsed-literal:: 285 | 286 | array([(2309, 2048, 5.016525967983049, 1), 287 | (2309, 2006, 5.076503128308643, 1), 288 | (2309, 2024, 5.279133057912248, 1), ..., 289 | (2318, 1105, 86.5507370650292, 1), (2318, 965, 86.5507370650292, 1), 290 | (2318, 954, 86.5507370650292, 1)], 291 | dtype=[('parent', '<i8'), ('child', '<i8'), ('lambda_val', '<f8'), ('child_size', '<i8')]) 292 | 293 | 294 | Single Linkage Trees 295 | -------------------- 296 | 297 | We can also access the raw single linkage tree that HDBSCAN\* builds over the 298 | mutual reachability distances, via the :py:attr:`~hdbscan.HDBSCAN.single_linkage_tree_` 299 | attribute of the clusterer object. 300 | 301 | .. code:: python 302 | 303 | clusterer.single_linkage_tree_ 304 | 305 | 306 | .. parsed-literal:: 307 | 308 | 309 | 310 | 316 | 317 | 318 | 319 | Again we have an object which we can then query for relevant 320 | information. The most basic approach is the :py:meth:`~hdbscan.plots.SingleLinkageTree.plot` method, just like 321 | the condensed tree. 322 | 323 | .. code:: python 324 | 325 | clusterer.single_linkage_tree_.plot() 326 | 327 | 328 | .. image:: images/advanced_hdbscan_26_1.png 329 | 330 | 331 | As you can see we gain a lot from condensing the tree in terms of better 332 | presenting and summarising the data. There is a lot less to be gained 333 | from visual inspection of a plot like this (and it only gets worse for 334 | larger datasets). The plot function supports most of the same 335 | functionality as the dendrogram plotting from 336 | ``scipy.cluster.hierarchy``, so you can view various truncations of the 337 | tree if necessary. In practice, however, you are more likely to be 338 | interested in accessing the raw data for further analysis. Again we have 339 | :py:meth:`~hdbscan.plots.SingleLinkageTree.to_networkx`, :py:meth:`~hdbscan.plots.SingleLinkageTree.to_pandas` and :py:meth:`~hdbscan.plots.SingleLinkageTree.to_numpy`. This time the 340 | :py:meth:`~hdbscan.plots.SingleLinkageTree.to_networkx` provides a direct NetworkX version of what you see 341 | above. The NumPy and pandas results conform to the single linkage 342 | hierarchy format of ``scipy.cluster.hierarchy``, and can be passed to 343 | routines there if necessary. 344 | 345 | If you wish to know what the clusters are at a given fixed level of the 346 | single linkage tree you can use the :py:meth:`~hdbscan.plots.SingleLinkageTree.get_clusters` method to extract 347 | a vector of cluster labels. The method takes a cut value of the level 348 | at which to cut the tree, and a ``minimum_cluster_size`` to determine 349 | noise points (any cluster smaller than the ``minimum_cluster_size``). 350 | 351 | .. code:: python 352 | 353 | clusterer.single_linkage_tree_.get_clusters(0.023, min_cluster_size=2) 354 | 355 | 356 | 357 | ..
parsed-literal:: 358 | 359 | array([ 0, -1, 0, ..., -1, -1, 0]) 360 | 361 | 362 | In this way, it is possible to extract the DBSCAN clustering that would result 363 | for any given epsilon value, all from one run of hdbscan. 364 | 365 | -------------------------------------------------------------------------------- /docs/api.rst: -------------------------------------------------------------------------------- 1 | API Reference 2 | ============= 3 | 4 | Major classes are :class:`HDBSCAN` and :class:`RobustSingleLinkage`. 5 | 6 | HDBSCAN 7 | ------- 8 | 9 | .. autoclass:: hdbscan.hdbscan_.HDBSCAN 10 | :members: 11 | 12 | RobustSingleLinkage 13 | ------------------- 14 | 15 | .. autoclass:: hdbscan.robust_single_linkage_.RobustSingleLinkage 16 | :members: 17 | 18 | 19 | Utilities 20 | --------- 21 | 22 | Other useful classes are contained in the plots module, the validity module, 23 | and the prediction module. 24 | 25 | .. autoclass:: hdbscan.plots.CondensedTree 26 | :members: 27 | 28 | .. autoclass:: hdbscan.plots.SingleLinkageTree 29 | :members: 30 | 31 | .. autoclass:: hdbscan.plots.MinimumSpanningTree 32 | :members: 33 | 34 | .. automodule:: hdbscan.validity 35 | :members: 36 | 37 | .. automodule:: hdbscan.prediction 38 | :members: 39 | 40 | 41 | Branch detection 42 | ---------------- 43 | 44 | The branches module contains classes for detecting branches within clusters. 45 | 46 | .. automodule:: hdbscan.branches 47 | :members: BranchDetector, detect_branches_in_clusters, approximate_predict_branch 48 | 49 | .. autoclass:: hdbscan.plots.ApproximationGraph 50 | :members: 51 | -------------------------------------------------------------------------------- /docs/basic_hdbscan.rst: -------------------------------------------------------------------------------- 1 | 2 | Basic Usage of HDBSCAN\* for Clustering 3 | ======================================= 4 | 5 | We have some data, and we want to cluster it. How exactly do we do that, 6 | and what do the results look like? If you are very familiar with sklearn 7 | and its API, particularly for clustering, then you can probably skip 8 | this tutorial -- ``hdbscan`` implements exactly this API, so you can use 9 | it just as you would any other sklearn clustering algorithm. If, on the 10 | other hand, you aren't that familiar with sklearn, fear not, and read 11 | on. Let's start with the simplest case first -- we have data in a nice 12 | tidy dataframe format. 13 | 14 | The Simple Case 15 | --------------- 16 | 17 | Let's generate some data with, say 2000 samples, and 10 features. We can 18 | put it in a dataframe for a nice clean table view of it. 19 | 20 | .. code:: python 21 | 22 | from sklearn.datasets import make_blobs 23 | import pandas as pd 24 | 25 | .. code:: python 26 | 27 | blobs, labels = make_blobs(n_samples=2000, n_features=10) 28 | 29 | .. code:: python 30 | 31 | pd.DataFrame(blobs).head() 32 | 33 | 34 | .. raw:: html 35 | 36 |
37 |     <div>
38 |     <table border="1" class="dataframe">
39 |       <thead>
40 |         <tr style="text-align: right;">
41 |           <th></th><th>0</th><th>1</th><th>2</th><th>3</th><th>4</th><th>5</th><th>6</th><th>7</th><th>8</th><th>9</th>
42 |         </tr>
43 |       </thead>
44 |       <tbody>
45 |         <tr><th>0</th><td>-3.370804</td><td>8.487688</td><td>4.631243</td><td>-10.181475</td><td>9.146487</td><td>-8.070935</td><td>-1.612017</td><td>-2.418106</td><td>-8.975390</td><td>-1.769952</td></tr>
46 |         <tr><th>1</th><td>-4.092931</td><td>8.409841</td><td>3.362516</td><td>-9.748945</td><td>9.556615</td><td>-9.240307</td><td>-2.038291</td><td>-3.129068</td><td>-7.109673</td><td>-0.993827</td></tr>
47 |         <tr><th>2</th><td>-4.604753</td><td>9.616391</td><td>4.631508</td><td>-11.166361</td><td>10.888212</td><td>-8.427564</td><td>-3.929517</td><td>-4.563951</td><td>-8.886373</td><td>-1.995063</td></tr>
48 |         <tr><th>3</th><td>-6.889866</td><td>-7.801482</td><td>-6.974958</td><td>-8.570025</td><td>5.438101</td><td>-5.097457</td><td>-4.941206</td><td>-5.926394</td><td>-10.145152</td><td>0.219269</td></tr>
49 |         <tr><th>4</th><td>5.339728</td><td>2.791309</td><td>0.611464</td><td>-2.929875</td><td>-7.694973</td><td>7.776050</td><td>-1.218101</td><td>0.408141</td><td>-4.563975</td><td>-1.309128</td></tr>
50 |       </tbody>
51 |     </table>
52 |     </div>
122 | 123 | 124 | 125 | So now we need to import the hdbscan library. 126 | 127 | .. code:: python 128 | 129 | import hdbscan 130 | 131 | Now, to cluster we need to generate a clustering object. 132 | 133 | .. code:: python 134 | 135 | clusterer = hdbscan.HDBSCAN() 136 | 137 | We can then use this clustering object and fit it to the data we have. 138 | This will return the clusterer object back to you -- just in case you 139 | want do some method chaining. 140 | 141 | .. code:: python 142 | 143 | clusterer.fit(blobs) 144 | 145 | 146 | .. parsed-literal:: 147 | 148 | HDBSCAN(algorithm='best', alpha=1.0, approx_min_span_tree=True, 149 | gen_min_span_tree=False, leaf_size=40, memory=Memory(None), 150 | metric='euclidean', min_cluster_size=5, min_samples=None, p=None) 151 | 152 | 153 | 154 | At this point we are actually done! We've done the clustering! But where 155 | are the results? How do I get the clusters? The clusterer object knows, 156 | and stores the result in an attribute ``labels_``. 157 | 158 | .. code:: python 159 | 160 | clusterer.labels_ 161 | 162 | 163 | .. parsed-literal:: 164 | 165 | array([2, 2, 2, ..., 2, 2, 0]) 166 | 167 | 168 | 169 | So it is an array of integers. What are we to make of that? It is an 170 | array with an integer for each data sample. Samples that are in the same 171 | cluster get assigned the same number. The cluster labels start at 0 and count 172 | up. We can thus determine the number of clusters found by finding the largest 173 | cluster label. 174 | 175 | .. code:: python 176 | 177 | clusterer.labels_.max() 178 | 179 | 180 | .. parsed-literal:: 181 | 182 | 2 183 | 184 | So we have a total of three clusters, with labels 0, 1, and 2. 185 | Importantly HDBSCAN is noise aware -- it has a notion of data samples 186 | that are not assigned to any cluster. This is handled by assigning these 187 | samples the label -1. But wait, there's more. The ``hdbscan`` library 188 | implements soft clustering, where each data point is assigned a cluster 189 | membership score ranging from 0.0 to 1.0. A score of 0.0 represents a 190 | sample that is not in the cluster at all (all noise points will get this 191 | score) while a score of 1.0 represents a sample that is at the heart of 192 | the cluster (note that this is not the spatial centroid notion of core). 193 | You can access these scores via the ``probabilities_`` attribute. 194 | 195 | .. code:: python 196 | 197 | clusterer.probabilities_ 198 | 199 | 200 | .. parsed-literal:: 201 | 202 | array([ 0.83890858, 1. , 0.72629904, ..., 0.79456452, 203 | 0.65311137, 0.76382928]) 204 | 205 | 206 | 207 | What about different metrics? 208 | ----------------------------- 209 | 210 | That is all well and good, but even data that is embedded in a vector 211 | space may not want to consider distances between data points to be pure 212 | Euclidean distance. What can we do in that case? We are still in good 213 | shape, since ``hdbscan`` supports a wide variety of metrics, which you 214 | can set when creating the clusterer object. For example we can do the 215 | following: 216 | 217 | .. code:: python 218 | 219 | clusterer = hdbscan.HDBSCAN(metric='manhattan') 220 | clusterer.fit(blobs) 221 | clusterer.labels_ 222 | 223 | 224 | 225 | 226 | .. parsed-literal:: 227 | 228 | array([1, 1, 1, ..., 1, 1, 0]) 229 | 230 | 231 | 232 | What metrics are supported? Because we simply steal metric computations 233 | from sklearn we get a large number of metrics readily available. 234 | 235 | .. 
code:: python 236 | 237 | hdbscan.dist_metrics.METRIC_MAPPING 238 | 239 | 240 | 241 | 242 | .. parsed-literal:: 243 | 244 | {'braycurtis': hdbscan.dist_metrics.BrayCurtisDistance, 245 | 'canberra': hdbscan.dist_metrics.CanberraDistance, 246 | 'chebyshev': hdbscan.dist_metrics.ChebyshevDistance, 247 | 'cityblock': hdbscan.dist_metrics.ManhattanDistance, 248 | 'dice': hdbscan.dist_metrics.DiceDistance, 249 | 'euclidean': hdbscan.dist_metrics.EuclideanDistance, 250 | 'hamming': hdbscan.dist_metrics.HammingDistance, 251 | 'haversine': hdbscan.dist_metrics.HaversineDistance, 252 | 'infinity': hdbscan.dist_metrics.ChebyshevDistance, 253 | 'jaccard': hdbscan.dist_metrics.JaccardDistance, 254 | 'kulsinski': hdbscan.dist_metrics.KulsinskiDistance, 255 | 'l1': hdbscan.dist_metrics.ManhattanDistance, 256 | 'l2': hdbscan.dist_metrics.EuclideanDistance, 257 | 'mahalanobis': hdbscan.dist_metrics.MahalanobisDistance, 258 | 'manhattan': hdbscan.dist_metrics.ManhattanDistance, 259 | 'matching': hdbscan.dist_metrics.MatchingDistance, 260 | 'minkowski': hdbscan.dist_metrics.MinkowskiDistance, 261 | 'p': hdbscan.dist_metrics.MinkowskiDistance, 262 | 'pyfunc': hdbscan.dist_metrics.PyFuncDistance, 263 | 'rogerstanimoto': hdbscan.dist_metrics.RogersTanimotoDistance, 264 | 'russellrao': hdbscan.dist_metrics.RussellRaoDistance, 265 | 'seuclidean': hdbscan.dist_metrics.SEuclideanDistance, 266 | 'sokalmichener': hdbscan.dist_metrics.SokalMichenerDistance, 267 | 'sokalsneath': hdbscan.dist_metrics.SokalSneathDistance, 268 | 'wminkowski': hdbscan.dist_metrics.WMinkowskiDistance} 269 | 270 | 271 | 272 | Distance matrices 273 | ----------------- 274 | 275 | What if you don't have a nice set of points in a vector space, but only 276 | have a pairwise distance matrix providing the distance between each pair 277 | of points? This is a common situation. Perhaps you have a complex custom 278 | distance measure; perhaps you have strings and are using Levenshtein 279 | distance, etc. Again, this is all fine as ``hdbscan`` supports a special 280 | metric called ``precomputed``. If you create the clusterer with the 281 | metric set to ``precomputed`` then the clusterer will assume that, 282 | rather than being handed a vector of points in a vector space, it is 283 | receiving an all-pairs distance matrix. Missing distances can be 284 | indicated by ``numpy.inf``, which leads HDBSCAN to ignore these pairwise 285 | relationships as long as there exists a path between two points that 286 | contains defined distances (i.e. if there are too many distances 287 | missing, the clustering is going to fail). 288 | 289 | NOTE: The input vector *must* contain numerical data. If you have a 290 | distance matrix for non-numerical vectors, you will need to map your 291 | input vectors to numerical vectors. (e.g. use the map ['A', 'G', 'C', 'T'] -> 292 | [1, 2, 3, 4] to replace the input vector ['A', 'A', 'A', 'C', 'G'] with 293 | [1, 1, 1, 3, 2]) 294 | 295 | .. code:: python 296 | 297 | from sklearn.metrics.pairwise import pairwise_distances 298 | 299 | .. code:: python 300 | 301 | distance_matrix = pairwise_distances(blobs) 302 | clusterer = hdbscan.HDBSCAN(metric='precomputed') 303 | clusterer.fit(distance_matrix) 304 | clusterer.labels_ 305 | 306 | 307 | 308 | 309 | .. parsed-literal:: 310 | 311 | array([1, 1, 1, ..., 1, 1, 2]) 312 | 313 | 314 | 315 | Note that this result only appears different due to a different 316 | labelling order for the clusters. 
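As a brief aside on the note above, here is a minimal sketch of how the non-numerical case might look in practice. The alphabet mapping, the randomly generated sequences, and the use of Hamming distance are all hypothetical choices made up for this illustration; the only behaviour taken from the discussion above is passing ``metric='precomputed'`` together with a square all-pairs distance matrix.

.. code:: python

    import numpy as np
    import hdbscan
    from sklearn.metrics.pairwise import pairwise_distances

    # Hypothetical sketch: encode categorical sequences as numeric codes,
    # build an all-pairs distance matrix, and hand it to HDBSCAN as a
    # precomputed metric.
    rng = np.random.RandomState(0)
    mapping = {'A': 1, 'G': 2, 'C': 3, 'T': 4}
    sequences = rng.choice(list('AGCT'), size=(100, 20))
    numeric = np.vectorize(mapping.get)(sequences)

    # Any pairwise distance will do here; Hamming is just an example choice.
    distance_matrix = pairwise_distances(numeric, metric='hamming')

    clusterer = hdbscan.HDBSCAN(metric='precomputed')
    clusterer.fit(distance_matrix)
    clusterer.labels_

If some pairwise distances are genuinely unknown, they could be set to ``numpy.inf`` in this matrix, which, as described above, HDBSCAN will tolerate as long as enough finite entries remain to connect the points.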
317 | 318 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # hdbscan documentation build configuration file, created by 4 | # sphinx-quickstart on Sat May 28 10:34:44 2016. 5 | # 6 | # This file is execfile()d with the current directory set to its 7 | # containing dir. 8 | # 9 | # Note that not all possible configuration values are present in this 10 | # autogenerated file. 11 | # 12 | # All configuration values have a default; values that are commented out 13 | # serve to show the default. 14 | 15 | import sys 16 | import os 17 | import sphinx_rtd_theme 18 | 19 | # If extensions (or modules to document with autodoc) are in another directory, 20 | # add these directories to sys.path here. If the directory is relative to the 21 | # documentation root, use os.path.abspath to make it absolute, like shown here. 22 | 23 | ### We now install the package in a virtualenv to build docs, so this is not needed 24 | # sys.path.insert(0, os.path.abspath('../')) 25 | 26 | # -- General configuration ------------------------------------------------ 27 | 28 | # If your documentation needs a minimal Sphinx version, state it here. 29 | #needs_sphinx = '1.0' 30 | 31 | # Add any Sphinx extension module names here, as strings. They can be 32 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 33 | # ones. 34 | extensions = [ 35 | 'sphinx.ext.autodoc', 36 | 'sphinx.ext.doctest', 37 | 'sphinx.ext.todo', 38 | 'sphinx.ext.coverage', 39 | 'sphinx.ext.imgmath', 40 | 'sphinx.ext.viewcode', 41 | # 'sphinx.ext.napoleon', 42 | # 'numpy_ext.numpydoc' 43 | ] 44 | #napoleon_google_docstring = False 45 | #napoleon_numpy_docstring = True 46 | 47 | # Add any paths that contain templates here, relative to this directory. 48 | templates_path = ['_templates'] 49 | 50 | # The suffix(es) of source filenames. 51 | # You can specify multiple suffix as a list of string: 52 | # source_suffix = ['.rst', '.md'] 53 | source_suffix = '.rst' 54 | 55 | # The encoding of source files. 56 | #source_encoding = 'utf-8-sig' 57 | 58 | # The master toctree document. 59 | master_doc = 'index' 60 | 61 | # General information about the project. 62 | project = u'hdbscan' 63 | copyright = u'2016, Leland McInnes, John Healy, Steve Astels' 64 | author = u'Leland McInnes, John Healy, Steve Astels' 65 | 66 | # The version info for the project you're documenting, acts as replacement for 67 | # |version| and |release|, also used in various other places throughout the 68 | # built documents. 69 | # 70 | # The short X.Y version. 71 | version = u'0.8.1' 72 | # The full version, including alpha/beta/rc tags. 73 | release = u'0.8.1' 74 | 75 | # The language for content autogenerated by Sphinx. Refer to documentation 76 | # for a list of supported languages. 77 | # 78 | # This is also used if you do content translation via gettext catalogs. 79 | # Usually you set "language" from the command line for these cases. 80 | language = None 81 | 82 | # There are two options for replacing |today|: either, you set today to some 83 | # non-false value, then it is used: 84 | #today = '' 85 | # Else, today_fmt is used as the format for a strftime call. 86 | #today_fmt = '%B %d, %Y' 87 | 88 | # List of patterns, relative to source directory, that match files and 89 | # directories to ignore when looking for source files. 
90 | exclude_patterns = ['_build'] 91 | 92 | # The reST default role (used for this markup: `text`) to use for all 93 | # documents. 94 | #default_role = None 95 | 96 | # If true, '()' will be appended to :func: etc. cross-reference text. 97 | #add_function_parentheses = True 98 | 99 | # If true, the current module name will be prepended to all description 100 | # unit titles (such as .. function::). 101 | #add_module_names = True 102 | 103 | # If true, sectionauthor and moduleauthor directives will be shown in the 104 | # output. They are ignored by default. 105 | #show_authors = False 106 | 107 | # The name of the Pygments (syntax highlighting) style to use. 108 | pygments_style = 'sphinx' 109 | 110 | # A list of ignored prefixes for module index sorting. 111 | #modindex_common_prefix = [] 112 | 113 | # If true, keep warnings as "system message" paragraphs in the built documents. 114 | #keep_warnings = False 115 | 116 | # If true, `todo` and `todoList` produce output, else they produce nothing. 117 | todo_include_todos = True 118 | 119 | 120 | # -- Options for HTML output ---------------------------------------------- 121 | 122 | # The theme to use for HTML and HTML Help pages. See the documentation for 123 | # a list of builtin themes. 124 | #html_theme = 'alabaster' 125 | html_theme = 'sphinx_rtd_theme' 126 | 127 | # Theme options are theme-specific and customize the look and feel of a theme 128 | # further. For a list of options available for each theme, see the 129 | # documentation. 130 | #html_theme_options = {} 131 | 132 | # Add any paths that contain custom themes here, relative to this directory. 133 | #html_theme_path = [] 134 | html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] 135 | 136 | # The name for this set of Sphinx documents. If None, it defaults to 137 | # " v documentation". 138 | #html_title = None 139 | 140 | # A shorter title for the navigation bar. Default is the same as html_title. 141 | #html_short_title = None 142 | 143 | # The name of an image file (relative to this directory) to place at the top 144 | # of the sidebar. 145 | #html_logo = None 146 | 147 | # The name of an image file (within the static path) to use as favicon of the 148 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 149 | # pixels large. 150 | #html_favicon = None 151 | 152 | # Add any paths that contain custom static files (such as style sheets) here, 153 | # relative to this directory. They are copied after the builtin static files, 154 | # so a file named "default.css" will overwrite the builtin "default.css". 155 | html_static_path = ['_static'] 156 | 157 | # Add any extra paths that contain custom files (such as robots.txt or 158 | # .htaccess) here, relative to this directory. These files are copied 159 | # directly to the root of the documentation. 160 | #html_extra_path = [] 161 | 162 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 163 | # using the given strftime format. 164 | #html_last_updated_fmt = '%b %d, %Y' 165 | 166 | # If true, SmartyPants will be used to convert quotes and dashes to 167 | # typographically correct entities. 168 | #html_use_smartypants = True 169 | 170 | # Custom sidebar templates, maps document names to template names. 171 | #html_sidebars = {} 172 | 173 | # Additional templates that should be rendered to pages, maps page names to 174 | # template names. 175 | #html_additional_pages = {} 176 | 177 | # If false, no module index is generated. 
178 | #html_domain_indices = True 179 | 180 | # If false, no index is generated. 181 | #html_use_index = True 182 | 183 | # If true, the index is split into individual pages for each letter. 184 | #html_split_index = False 185 | 186 | # If true, links to the reST sources are added to the pages. 187 | #html_show_sourcelink = True 188 | 189 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 190 | #html_show_sphinx = True 191 | 192 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 193 | #html_show_copyright = True 194 | 195 | # If true, an OpenSearch description file will be output, and all pages will 196 | # contain a tag referring to it. The value of this option must be the 197 | # base URL from which the finished HTML is served. 198 | #html_use_opensearch = '' 199 | 200 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 201 | #html_file_suffix = None 202 | 203 | # Language to be used for generating the HTML full-text search index. 204 | # Sphinx supports the following languages: 205 | # 'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja' 206 | # 'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr' 207 | #html_search_language = 'en' 208 | 209 | # A dictionary with options for the search language support, empty by default. 210 | # Now only 'ja' uses this config value 211 | #html_search_options = {'type': 'default'} 212 | 213 | # The name of a javascript file (relative to the configuration directory) that 214 | # implements a search results scorer. If empty, the default will be used. 215 | #html_search_scorer = 'scorer.js' 216 | 217 | # Output file base name for HTML help builder. 218 | htmlhelp_basename = 'hdbscandoc' 219 | 220 | # -- Options for LaTeX output --------------------------------------------- 221 | 222 | latex_elements = { 223 | # The paper size ('letterpaper' or 'a4paper'). 224 | #'papersize': 'letterpaper', 225 | 226 | # The font size ('10pt', '11pt' or '12pt'). 227 | #'pointsize': '10pt', 228 | 229 | # Additional stuff for the LaTeX preamble. 230 | #'preamble': '', 231 | 232 | # Latex figure (float) alignment 233 | #'figure_align': 'htbp', 234 | } 235 | 236 | # Grouping the document tree into LaTeX files. List of tuples 237 | # (source start file, target name, title, 238 | # author, documentclass [howto, manual, or own class]). 239 | latex_documents = [ 240 | (master_doc, 'hdbscan.tex', u'hdbscan Documentation', 241 | u'Leland McInnes, John Healy, Steve Astels', 'manual'), 242 | ] 243 | 244 | # The name of an image file (relative to this directory) to place at the top of 245 | # the title page. 246 | #latex_logo = None 247 | 248 | # For "manual" documents, if this is true, then toplevel headings are parts, 249 | # not chapters. 250 | #latex_use_parts = False 251 | 252 | # If true, show page references after internal links. 253 | #latex_show_pagerefs = False 254 | 255 | # If true, show URL addresses after external links. 256 | #latex_show_urls = False 257 | 258 | # Documents to append as an appendix to all manuals. 259 | #latex_appendices = [] 260 | 261 | # If false, no module index is generated. 262 | #latex_domain_indices = True 263 | 264 | 265 | # -- Options for manual page output --------------------------------------- 266 | 267 | # One entry per manual page. List of tuples 268 | # (source start file, name, description, authors, manual section). 269 | man_pages = [ 270 | (master_doc, 'hdbscan', u'hdbscan Documentation', 271 | [author], 1) 272 | ] 273 | 274 | # If true, show URL addresses after external links. 
275 | #man_show_urls = False 276 | 277 | 278 | # -- Options for Texinfo output ------------------------------------------- 279 | 280 | # Grouping the document tree into Texinfo files. List of tuples 281 | # (source start file, target name, title, author, 282 | # dir menu entry, description, category) 283 | texinfo_documents = [ 284 | (master_doc, 'hdbscan', u'hdbscan Documentation', 285 | author, 'hdbscan', 'One line description of project.', 286 | 'Miscellaneous'), 287 | ] 288 | 289 | # Documents to append as an appendix to all manuals. 290 | #texinfo_appendices = [] 291 | 292 | # If false, no module index is generated. 293 | #texinfo_domain_indices = True 294 | 295 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 296 | #texinfo_show_urls = 'footnote' 297 | 298 | # If true, do not generate a @detailmenu in the "Top" node's menu. 299 | #texinfo_no_detailmenu = False 300 | 301 | 302 | # -- Options for Epub output ---------------------------------------------- 303 | 304 | # Bibliographic Dublin Core info. 305 | epub_title = project 306 | epub_author = author 307 | epub_publisher = author 308 | epub_copyright = copyright 309 | 310 | # The basename for the epub file. It defaults to the project name. 311 | #epub_basename = project 312 | 313 | # The HTML theme for the epub output. Since the default themes are not 314 | # optimized for small screen space, using the same theme for HTML and epub 315 | # output is usually not wise. This defaults to 'epub', a theme designed to save 316 | # visual space. 317 | #epub_theme = 'epub' 318 | 319 | # The language of the text. It defaults to the language option 320 | # or 'en' if the language is not set. 321 | #epub_language = '' 322 | 323 | # The scheme of the identifier. Typical schemes are ISBN or URL. 324 | #epub_scheme = '' 325 | 326 | # The unique identifier of the text. This can be a ISBN number 327 | # or the project homepage. 328 | #epub_identifier = '' 329 | 330 | # A unique identification for the text. 331 | #epub_uid = '' 332 | 333 | # A tuple containing the cover image and cover page html template filenames. 334 | #epub_cover = () 335 | 336 | # A sequence of (type, uri, title) tuples for the guide element of content.opf. 337 | #epub_guide = () 338 | 339 | # HTML files that should be inserted before the pages created by sphinx. 340 | # The format is a list of tuples containing the path and title. 341 | #epub_pre_files = [] 342 | 343 | # HTML files that should be inserted after the pages created by sphinx. 344 | # The format is a list of tuples containing the path and title. 345 | #epub_post_files = [] 346 | 347 | # A list of files that should not be packed into the epub file. 348 | epub_exclude_files = ['search.html'] 349 | 350 | # The depth of the table of contents in toc.ncx. 351 | #epub_tocdepth = 3 352 | 353 | # Allow duplicate toc entries. 354 | #epub_tocdup = True 355 | 356 | # Choose between 'default' and 'includehidden'. 357 | #epub_tocscope = 'default' 358 | 359 | # Fix unsupported image types using the Pillow. 360 | #epub_fix_images = False 361 | 362 | # Scale large images. 363 | #epub_max_image_width = 0 364 | 365 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 366 | #epub_show_urls = 'inline' 367 | 368 | # If false, no index is generated. 
369 | #epub_use_index = True 370 | -------------------------------------------------------------------------------- /docs/dbscan_from_hdbscan.rst: -------------------------------------------------------------------------------- 1 | 2 | Extracting DBSCAN* clustering from HDBSCAN* 3 | ============================================ 4 | 5 | There are a number of reasons that one might prefer `DBSCAN `__'s 6 | clustering over that of HDBSCAN*. The biggest difficulty many folks have with 7 | DBSCAN is that the epsilon distance parameter can be hard to determine and often 8 | requires a great deal of trial and error to tune. If your data lived in a more 9 | interpretable space and you had a good notion of distance in that space, this problem 10 | is certainly mitigated, and a user might want to set a very specific epsilon distance 11 | for their use case. Another viable use case might be that a user is interested in a 12 | constant density clustering. 13 | HDBSCAN* does variable density clustering by default, looking for the clusters that persist 14 | over a wide range of epsilon distance parameters to find a 'natural' clustering. This might 15 | not be the right result for your application. A DBSCAN clustering at a particular 16 | epsilon value might work better for your particular task. 17 | 18 | HDBSCAN returns a very natural clustering of your data which is often very useful in exploring 19 | a new data set. That doesn't necessarily make it the right clustering algorithm for every 20 | task. 21 | 22 | HDBSCAN* can best be thought of as a DBSCAN* implementation which varies across 23 | all epsilon values and extracts the clusters that persist over the widest range 24 | of these parameter choices. It is therefore able to ignore the epsilon parameter and 25 | only needs the minimum cluster size as its single input parameter. 26 | The 'eom' (Excess of Mass) cluster selection method then returns clusters with the 27 | best stability over epsilon. 28 | 29 | There are a number of alternative ways of extracting a flat clustering from 30 | the HDBSCAN* hierarchical tree. If one is interested in finer resolution 31 | clusters while still maintaining variable density one could set 32 | ``cluster_selection_method='leaf'`` to extract the leaves of the condensed 33 | tree instead of the most persistent clusters. For more details on these 34 | cluster selection methods see :ref:`leaf_clustering_label`. 35 | 36 | If one wasn't interested in the variable density clustering that is the hallmark of 37 | HDBSCAN* it is relatively easy to extract any DBSCAN* clustering from a 38 | single run of HDBSCAN*. This has the advantage of allowing you to perform 39 | a single computationally efficient HDBSCAN* run and then quickly search over 40 | the DBSCAN* parameter space by extracting clustering results from our 41 | pre-constructed tree. This can save significant computational time when 42 | searching across multiple cluster parameter settings on large amounts of data. 43 | 44 | Alternatively, one could make use of ``cluster_selection_epsilon`` as a 45 | post-processing step with any ``cluster_selection_method`` in order to 46 | return a hybrid clustering of DBSCAN* and HDBSCAN*. For more details on 47 | this see :doc:`how_to_use_epsilon`. 48 | 49 | In order to extract a DBSCAN* clustering from an HDBSCAN run we must first train 50 | an HDBSCAN model on our data. 51 | 52 | .. 
code:: python 53 | 54 | import hdbscan 55 | h_cluster = hdbscan.HDBSCAN(min_samples=5, match_reference_implementation=True).fit(X) 56 | 57 | The ``min_cluster_size`` parameter is unimportant in this case: it is 58 | only used in the creation of our condensed tree, which we won't be using here. 59 | Now we choose a ``cut_distance``, which is just another name for the epsilon 60 | threshold in DBSCAN; it will be passed to our 61 | :py:meth:`~hdbscan.hdbscan_.dbscan_clustering` method. 62 | 63 | .. code:: python 64 | import seaborn as sns 65 | eps = 0.2 66 | labels = h_cluster.dbscan_clustering(cut_distance=eps, min_cluster_size=5) 67 | sns.scatterplot(x=X[:,0], y=X[:,1], hue=labels.astype(str)); 68 | 69 | .. image:: images/dbscan_from_hdbscan_clustering.png 70 | :align: center 71 | 72 | It should be noted that a DBSCAN* clustering extracted from our HDBSCAN* tree will 73 | not precisely match the clustering results from sklearn's DBSCAN implementation. 74 | Our clustering results should better match DBSCAN* (which can be thought of as 75 | DBSCAN without the border points). As such, when comparing the two results one 76 | should expect them to mostly differ in the points that DBSCAN considers border 77 | points. We'll deal with 78 | this by comparing our clustering results only on the points identified 79 | by DBSCAN as core points. We can see below that the differences between these two 80 | clusterings mostly occur in the boundaries of the clusters. This matches our 81 | intuition of stability within the core points. 82 | 83 | .. image:: images/dbscan_from_hdbscan_comparision.png 84 | :align: center 85 | 86 | For a slightly more empirical comparison we make use of the `adjusted rand score `__ 87 | to compare the clustering of the core points between a DBSCAN clustering from sklearn and 88 | a DBSCAN* clustering extracted from our HDBSCAN* object. 89 | 90 | .. image:: images/dbscan_from_hdbscan_percentage_core.png 91 | :align: center 92 | 93 | .. image:: images/dbscan_from_hdbscan_number_of_clusters.png 94 | :align: center 95 | 96 | We see that for very small epsilon values our number of clusters tends to be quite 97 | far apart, largely due to a large number of the points being considered boundary points 98 | instead of core points. As the epsilon value increases, more and more points are 99 | considered core and the number of clusters generated by each algorithm converges. 100 | 101 | Additionally, the adjusted rand score between the core points of both algorithms 102 | stays consistently high (mostly 1.0) for our entire range of epsilon. There may 103 | be some minor discrepancies between core point results largely due to implementation 104 | details and optimizations within the code base. 105 | 106 | Why might one just extract the DBSCAN* clustering results from a single HDBSCAN* run 107 | instead of making use of sklearn's DBSCAN code? The short answer is efficiency. 108 | If you aren't sure what epsilon parameter to select for DBSCAN then you may have to 109 | run the algorithm many times on your data set. While those runs can be inexpensive for 110 | very small epsilon values, they can get quite expensive for large parameter values. 111 | 112 | In this small benchmark case of 50,000 two dimensional data points we have broken even 113 | after having had to try only two epsilon parameters with DBSCAN, or only a single 114 | run with a large parameter selected. This trend is only exacerbated for larger 115 | data sets in higher dimensional spaces. 
For more detailed scaling experiments see 116 | `Accelerated Hierarchical Density Clustering `__ 117 | by McInnes and Healy. 118 | 119 | .. image:: images/dbscan_from_hdbscan_timing.png 120 | :align: center 121 | 122 | 123 | 124 | 125 | 126 | 127 | -------------------------------------------------------------------------------- /docs/docs_requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx_rtd_theme 2 | -------------------------------------------------------------------------------- /docs/faq.rst: -------------------------------------------------------------------------------- 1 | Frequently Asked Questions 2 | ========================== 3 | 4 | Here we attempt to address some common questions, directing the user to some 5 | helpful answers. 6 | 7 | Q: Most of my data is classified as noise; why? 8 | ------------------------------------------------ 9 | 10 | The amount of data classified as noise is controlled by the ``min_samples`` 11 | parameter. By default, if not otherwise set, this value is set to the same 12 | value as ``min_cluster_size``. You can set it independently if you wish by 13 | specifying it separately. The lower the value, the less noise you'll get, but 14 | there are limits, and it is possible that you simply have noisy data. See 15 | :any:`min_samples ` for more details. 16 | 17 | Q: I mostly just get one large cluster; I want smaller clusters. 18 | ----------------------------------------------------------------- 19 | 20 | If you are getting a single large cluster and a few small outlying clusters 21 | that means your data is essentially a large glob with some small outlying 22 | clusters -- there may be structure to the glob, but compared to how well 23 | separated those other small clusters are, it doesn't really show up. You may, 24 | however, want to get at that more fine-grained structure. You can do that, 25 | and what you are looking for is :any:`leaf clustering `. 26 | 27 | Q: HDBSCAN is failing to separate the clusters I think it should. 28 | ------------------------------------------------------------------- 29 | 30 | Density based clustering relies on having enough data to separate dense areas. 31 | In higher dimensional spaces this becomes more difficult, and hence 32 | requires more data. Quite possibly there is not enough data to make your 33 | clusters clearly separable. Consider the following plots: 34 | 35 | .. image:: images/generative_model_scatter.png 36 | .. image:: images/generative_model_kde.png 37 | 38 | Four different generative models, when sampled, produce results that are hard to 39 | differentiate easily. The blue dataset is sampled from a mixture of three 40 | standard Gaussians centered at (-2, 0), (0,0) and (2,0); the green dataset is 41 | sampled from a mixture of two standard Gaussians centered at (-1,0) and (1,0); 42 | the red data is sampled from a multivariate Gaussian with covariance 43 | [2, 0; 0, 1]; the purple data is a single standard Gaussian with uniform 44 | background noise. 45 | 46 | Despite the generative models having clearly different "clusters", without more 47 | data we simply cannot differentiate between them, and hence no 48 | density based clustering will manage to cluster these according to the model. 49 | 50 | Q: I am not getting the claimed performance. Why not? 51 | ------------------------------------------------------- 52 | 53 | The most likely explanation has to do with the dimensionality of your input data. 
54 | While HDBSCAN can perform well on low to medium dimensional data, the performance 55 | tends to decrease significantly as dimension increases. In general HDBSCAN can do 56 | well on up to around 50 or 100 dimensional data, but performance can see 57 | significant decreases beyond that. Of course, a lot is also dataset dependent, so 58 | you can still get good performance even on high dimensional data, but it 59 | is no longer guaranteed. 60 | 61 | Q: I want to predict the cluster of a new unseen point. How do I do this? 62 | ---------------------------------------------------------------------------- 63 | 64 | This is possible via the function :func:`~hdbscan.prediction.approximate_predict`. Note that you 65 | either need to set ``prediction_data=True`` on initialization of your 66 | clusterer object, or run the ``generate_prediction_data`` method after 67 | fitting. With that done you can run :func:`~hdbscan.prediction.approximate_predict` with the model 68 | and any new data points you wish to predict. Note that this differs from 69 | re-running HDBSCAN with the new points added since no new clusters will be 70 | considered -- instead the new points will be labelled according to the 71 | clusters already labelled by the model. 72 | 73 | Q: Haversine metric is not clustering my Lat-Lon data correctly. 74 | ------------------------------------------------------------------- 75 | 76 | The Haversine metric as implemented expects coordinates in radians. That 77 | means you'll need to convert your latitude and longitude data into radians 78 | before passing it in to HDBSCAN. 79 | 80 | Q: I want to cite this software in my journal publication. How do I do that? 81 | -------------------------------------------------------------------------------- 82 | 83 | If you have used this codebase in a scientific publication and wish to cite it, please use the `Journal of Open Source Software article `_. 84 | 85 | L. McInnes, J. Healy, S. Astels, *hdbscan: Hierarchical density based clustering* 86 | In: Journal of Open Source Software, The Open Journal, volume 2, number 11. 87 | 2017 88 | 89 | BibTeX:: 90 | 91 | @article{McInnes2017, 92 | doi = {10.21105/joss.00205}, 93 | url = {https://doi.org/10.21105%2Fjoss.00205}, 94 | year = {2017}, 95 | month = {mar}, 96 | publisher = {The Open Journal}, 97 | volume = {2}, 98 | number = {11}, 99 | author = {Leland McInnes and John Healy and Steve Astels}, 100 | title = {hdbscan: Hierarchical density based clustering}, 101 | journal = {The Journal of Open Source Software} 102 | } 103 | 104 | 105 | -------------------------------------------------------------------------------- /docs/how_to_use_epsilon.rst: -------------------------------------------------------------------------------- 1 | 2 | Combining HDBSCAN\* with DBSCAN 3 | =============================== 4 | 5 | While DBSCAN needs a minimum cluster size *and* a distance threshold epsilon as user-defined input parameters, 6 | HDBSCAN\* is basically a DBSCAN implementation for varying epsilon values and therefore only needs the minimum cluster size as its single input parameter. 7 | The ``'eom'`` (Excess of Mass) cluster selection method then returns clusters with the best stability over epsilon. 8 | 9 | Unlike DBSCAN, this allows it to find clusters of variable densities without having to choose a suitable distance threshold first. 10 | However, there are cases where we could still benefit from the use of an epsilon threshold. 
11 | 12 | For illustration, see this map with GPS locations, representing recorded pick-up and drop-off locations for customers of a ride pooling provider. 13 | The largest (visual) data cluster can be found around the train station. Smaller clusters are placed along the streets, depending on the requested location 14 | in the form of a postal address or point of interest. Since we are considering a door-to-door system where customers are not bound to collective pick-up or 15 | drop-off locations, we are interested in both large clusters and small clusters with a minimum size of 4. 16 | 17 | .. image:: images/epsilon_parameter_dataset.png 18 | :align: center 19 | 20 | Clustering the given data set with `DBSCAN `__ and an epsilon threshold of 5 meters gives us good results, 21 | but neglects clusters with points that are more than 5 meters apart from each other. 22 | However, increasing epsilon would result in cluster chains along the streets, especially when working with a larger data set. 23 | 24 | .. image:: images/epsilon_parameter_dbscan.png 25 | :align: center 26 | 27 | Unfortunately, HDBSCAN\* does not produce any better results in this case: while it discovers the clusters that DBSCAN missed, it also returns a very high number of micro-clusters around the train station, 28 | even though we would prefer one or only few clusters representing this location. We could achieve this by increasing ``min_cluster_size`` or 29 | the smoothing parameter ``min_samples``, but with the trade-off of losing small clusters in less dense areas or merging them into other clusters 30 | separated by a relatively large distance. 31 | 32 | .. image:: images/epsilon_parameter_hdbscan_eom.png 33 | :align: center 34 | 35 | This is where the parameter ``cluster_selection_epsilon`` comes into play. The cluster extraction method using this parameter, as described in detail 36 | by `Malzer and Baum `__, acts like a hybrid between DBSCAN 37 | (or, to be precise, DBSCAN\*, i.e. DBSCAN without the border points) by extracting DBSCAN results for data partitions 38 | affected by the given parameter value, and HDBSCAN\* results for all others. 39 | 40 | In our example, we choose to merge nested clusters below 5 meters (0.005 kilometers) and therefore set the parameter ``cluster_selection_epsilon`` accordingly: 41 | 42 | .. code:: python 43 | 44 | X = np.radians(coordinates) #convert the list of lat/lon coordinates to radians 45 | earth_radius_km = 6371 46 | epsilon = 0.005 / earth_radius_km #calculate 5 meter epsilon threshold 47 | 48 | clusterer = hdbscan.HDBSCAN(min_cluster_size=4, metric='haversine', 49 | cluster_selection_epsilon=epsilon, cluster_selection_method = 'eom') 50 | clusterer.fit(X) 51 | 52 | And indeed, the result looks like a mix between DBSCAN and HDBSCAN(eom). We no longer lose clusters of variable densities beyond the given epsilon, but at the 53 | same time avoid the abundance of micro-clusters in the original HDBSCAN\* clustering, which was an undesired side-effect of having to choose a low ``min_cluster_size`` value. 54 | 55 | .. image:: images/epsilon_parameter_hdbscan_eps.png 56 | :align: center 57 | 58 | Note that for the given parameter setting, running HDBSCAN\* based on ``cluster_selection_method = 'eom'`` or ``cluster_selection_method = 'leaf'`` does not make 59 | any difference: the ``cluster_selection_epsilon`` threshold neutralizes the effect of HDBSCAN(eom)'s stability calculations. 60 | When using a lower threshold, some minor differences can be noticed. 
For example, an epsilon value of 3 meters with ``'eom'`` produces the same results as 61 | a the 5 meter value on the given data set, but 3 meters in combination with ``'leaf'`` achieves a slightly different result: 62 | 63 | .. image:: images/epsilon_parameter_hdbscan_e3_leaf.png 64 | :align: center 65 | 66 | A ``cluster_selection_epsilon`` value of 0 (the default value) always returns the original HDBSCAN\* results, either according to ``'eom'`` or ``'leaf'``. 67 | 68 | -------------------------------------------------------------------------------- /docs/images/advanced_hdbscan_11_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/advanced_hdbscan_11_1.png -------------------------------------------------------------------------------- /docs/images/advanced_hdbscan_26_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/advanced_hdbscan_26_1.png -------------------------------------------------------------------------------- /docs/images/advanced_hdbscan_3_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/advanced_hdbscan_3_1.png -------------------------------------------------------------------------------- /docs/images/advanced_hdbscan_5_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/advanced_hdbscan_5_1.png -------------------------------------------------------------------------------- /docs/images/advanced_hdbscan_9_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/advanced_hdbscan_9_1.png -------------------------------------------------------------------------------- /docs/images/allow_single_cluster.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/allow_single_cluster.png -------------------------------------------------------------------------------- /docs/images/comparing_clustering_algorithms_12_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/comparing_clustering_algorithms_12_0.png -------------------------------------------------------------------------------- /docs/images/comparing_clustering_algorithms_15_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/comparing_clustering_algorithms_15_0.png -------------------------------------------------------------------------------- /docs/images/comparing_clustering_algorithms_18_0.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/comparing_clustering_algorithms_18_0.png -------------------------------------------------------------------------------- /docs/images/comparing_clustering_algorithms_21_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/comparing_clustering_algorithms_21_0.png -------------------------------------------------------------------------------- /docs/images/comparing_clustering_algorithms_24_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/comparing_clustering_algorithms_24_0.png -------------------------------------------------------------------------------- /docs/images/comparing_clustering_algorithms_27_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/comparing_clustering_algorithms_27_0.png -------------------------------------------------------------------------------- /docs/images/comparing_clustering_algorithms_31_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/comparing_clustering_algorithms_31_0.png -------------------------------------------------------------------------------- /docs/images/comparing_clustering_algorithms_6_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/comparing_clustering_algorithms_6_0.png -------------------------------------------------------------------------------- /docs/images/epsilon_parameter_dataset.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/epsilon_parameter_dataset.png -------------------------------------------------------------------------------- /docs/images/epsilon_parameter_dbscan.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/epsilon_parameter_dbscan.png -------------------------------------------------------------------------------- /docs/images/epsilon_parameter_hdbscan_e3_leaf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/epsilon_parameter_hdbscan_e3_leaf.png -------------------------------------------------------------------------------- /docs/images/epsilon_parameter_hdbscan_eom.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/epsilon_parameter_hdbscan_eom.png -------------------------------------------------------------------------------- 
/docs/images/epsilon_parameter_hdbscan_eps.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/epsilon_parameter_hdbscan_eps.png -------------------------------------------------------------------------------- /docs/images/generative_model_kde.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/generative_model_kde.png -------------------------------------------------------------------------------- /docs/images/generative_model_scatter.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/generative_model_scatter.png -------------------------------------------------------------------------------- /docs/images/how_hdbscan_works_10_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/how_hdbscan_works_10_1.png -------------------------------------------------------------------------------- /docs/images/how_hdbscan_works_12_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/how_hdbscan_works_12_1.png -------------------------------------------------------------------------------- /docs/images/how_hdbscan_works_15_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/how_hdbscan_works_15_1.png -------------------------------------------------------------------------------- /docs/images/how_hdbscan_works_18_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/how_hdbscan_works_18_1.png -------------------------------------------------------------------------------- /docs/images/how_hdbscan_works_20_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/how_hdbscan_works_20_1.png -------------------------------------------------------------------------------- /docs/images/how_hdbscan_works_3_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/how_hdbscan_works_3_1.png -------------------------------------------------------------------------------- /docs/images/how_to_detect_branches_13_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/how_to_detect_branches_13_0.png -------------------------------------------------------------------------------- /docs/images/how_to_detect_branches_15_0.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/how_to_detect_branches_15_0.png -------------------------------------------------------------------------------- /docs/images/how_to_detect_branches_17_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/how_to_detect_branches_17_0.png -------------------------------------------------------------------------------- /docs/images/how_to_detect_branches_19_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/how_to_detect_branches_19_0.png -------------------------------------------------------------------------------- /docs/images/how_to_detect_branches_21_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/how_to_detect_branches_21_0.png -------------------------------------------------------------------------------- /docs/images/how_to_detect_branches_23_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/how_to_detect_branches_23_0.png -------------------------------------------------------------------------------- /docs/images/how_to_detect_branches_25_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/how_to_detect_branches_25_0.png -------------------------------------------------------------------------------- /docs/images/how_to_detect_branches_3_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/how_to_detect_branches_3_0.png -------------------------------------------------------------------------------- /docs/images/how_to_detect_branches_5_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/how_to_detect_branches_5_0.png -------------------------------------------------------------------------------- /docs/images/how_to_detect_branches_7_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/how_to_detect_branches_7_0.png -------------------------------------------------------------------------------- /docs/images/how_to_detect_branches_9_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/how_to_detect_branches_9_0.png -------------------------------------------------------------------------------- /docs/images/outlier_detection_3_1.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/outlier_detection_3_1.png -------------------------------------------------------------------------------- /docs/images/outlier_detection_7_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/outlier_detection_7_1.png -------------------------------------------------------------------------------- /docs/images/outlier_detection_9_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/outlier_detection_9_1.png -------------------------------------------------------------------------------- /docs/images/parameter_selection_11_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/parameter_selection_11_1.png -------------------------------------------------------------------------------- /docs/images/parameter_selection_12_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/parameter_selection_12_1.png -------------------------------------------------------------------------------- /docs/images/parameter_selection_15_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/parameter_selection_15_1.png -------------------------------------------------------------------------------- /docs/images/parameter_selection_18_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/parameter_selection_18_1.png -------------------------------------------------------------------------------- /docs/images/parameter_selection_3_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/parameter_selection_3_1.png -------------------------------------------------------------------------------- /docs/images/parameter_selection_7_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/parameter_selection_7_1.png -------------------------------------------------------------------------------- /docs/images/parameter_selection_9_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/parameter_selection_9_1.png -------------------------------------------------------------------------------- /docs/images/performance_and_scalability_14_1.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/performance_and_scalability_14_1.png -------------------------------------------------------------------------------- /docs/images/performance_and_scalability_20_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/performance_and_scalability_20_2.png -------------------------------------------------------------------------------- /docs/images/performance_and_scalability_24_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/performance_and_scalability_24_1.png -------------------------------------------------------------------------------- /docs/images/performance_and_scalability_9_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/performance_and_scalability_9_1.png -------------------------------------------------------------------------------- /docs/images/prediction_tutorial_3_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/prediction_tutorial_3_0.png -------------------------------------------------------------------------------- /docs/images/prediction_tutorial_5_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/prediction_tutorial_5_1.png -------------------------------------------------------------------------------- /docs/images/prediction_tutorial_9_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/prediction_tutorial_9_1.png -------------------------------------------------------------------------------- /docs/images/soft_clustering_10_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/soft_clustering_10_1.png -------------------------------------------------------------------------------- /docs/images/soft_clustering_13_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/soft_clustering_13_1.png -------------------------------------------------------------------------------- /docs/images/soft_clustering_15_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/soft_clustering_15_0.png -------------------------------------------------------------------------------- /docs/images/soft_clustering_3_1.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/soft_clustering_3_1.png -------------------------------------------------------------------------------- /docs/images/soft_clustering_6_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/soft_clustering_6_1.png -------------------------------------------------------------------------------- /docs/images/soft_clustering_8_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/soft_clustering_8_1.png -------------------------------------------------------------------------------- /docs/images/soft_clustering_explanation_11_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/soft_clustering_explanation_11_0.png -------------------------------------------------------------------------------- /docs/images/soft_clustering_explanation_15_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/soft_clustering_explanation_15_0.png -------------------------------------------------------------------------------- /docs/images/soft_clustering_explanation_26_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/soft_clustering_explanation_26_0.png -------------------------------------------------------------------------------- /docs/images/soft_clustering_explanation_2_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/soft_clustering_explanation_2_0.png -------------------------------------------------------------------------------- /docs/images/soft_clustering_explanation_31_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/soft_clustering_explanation_31_0.png -------------------------------------------------------------------------------- /docs/images/soft_clustering_explanation_36_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/soft_clustering_explanation_36_0.png -------------------------------------------------------------------------------- /docs/images/soft_clustering_explanation_6_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/docs/images/soft_clustering_explanation_6_0.png -------------------------------------------------------------------------------- 
/docs/index.rst: -------------------------------------------------------------------------------- 1 | .. hdbscan documentation master file, created by 2 | sphinx-quickstart on Sat May 28 10:34:44 2016. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | The hdbscan Clustering Library 7 | ============================== 8 | 9 | The hdbscan library is a suite of tools to use unsupervised learning to find clusters, or 10 | dense regions, of a dataset. The primary algorithm is HDBSCAN* as proposed by Campello, 11 | Moulavi, and Sander. The library provides a high performance implementation of this algorithm, 12 | along with tools for analysing the resulting clustering. 13 | 14 | 15 | User Guide / Tutorial 16 | --------------------- 17 | 18 | .. toctree:: 19 | :maxdepth: 2 20 | 21 | basic_hdbscan 22 | advanced_hdbscan 23 | parameter_selection 24 | outlier_detection 25 | prediction_tutorial 26 | soft_clustering 27 | how_to_use_epsilon 28 | dbscan_from_hdbscan 29 | how_to_detect_branches 30 | faq 31 | 32 | Background on Clustering with HDBSCAN 33 | ------------------------------------- 34 | 35 | .. toctree:: 36 | :maxdepth: 2 37 | 38 | how_hdbscan_works 39 | comparing_clustering_algorithms 40 | performance_and_scalability 41 | soft_clustering_explanation 42 | 43 | API Reference 44 | ------------- 45 | 46 | .. toctree:: 47 | 48 | api 49 | 50 | Indices and tables 51 | ================== 52 | 53 | * :ref:`genindex` 54 | * :ref:`modindex` 55 | * :ref:`search` 56 | 57 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | REM Command file for Sphinx documentation 4 | 5 | if "%SPHINXBUILD%" == "" ( 6 | set SPHINXBUILD=sphinx-build 7 | ) 8 | set BUILDDIR=_build 9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . 10 | set I18NSPHINXOPTS=%SPHINXOPTS% . 11 | if NOT "%PAPER%" == "" ( 12 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% 13 | set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% 14 | ) 15 | 16 | if "%1" == "" goto help 17 | 18 | if "%1" == "help" ( 19 | :help 20 | echo.Please use `make ^` where ^ is one of 21 | echo. html to make standalone HTML files 22 | echo. dirhtml to make HTML files named index.html in directories 23 | echo. singlehtml to make a single large HTML file 24 | echo. pickle to make pickle files 25 | echo. json to make JSON files 26 | echo. htmlhelp to make HTML files and a HTML help project 27 | echo. qthelp to make HTML files and a qthelp project 28 | echo. devhelp to make HTML files and a Devhelp project 29 | echo. epub to make an epub 30 | echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter 31 | echo. text to make text files 32 | echo. man to make manual pages 33 | echo. texinfo to make Texinfo files 34 | echo. gettext to make PO message catalogs 35 | echo. changes to make an overview over all changed/added/deprecated items 36 | echo. xml to make Docutils-native XML files 37 | echo. pseudoxml to make pseudoxml-XML files for display purposes 38 | echo. linkcheck to check all external links for integrity 39 | echo. doctest to run all doctests embedded in the documentation if enabled 40 | echo. 
coverage to run coverage check of the documentation if enabled 41 | goto end 42 | ) 43 | 44 | if "%1" == "clean" ( 45 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i 46 | del /q /s %BUILDDIR%\* 47 | goto end 48 | ) 49 | 50 | 51 | REM Check if sphinx-build is available and fallback to Python version if any 52 | %SPHINXBUILD% 1>NUL 2>NUL 53 | if errorlevel 9009 goto sphinx_python 54 | goto sphinx_ok 55 | 56 | :sphinx_python 57 | 58 | set SPHINXBUILD=python -m sphinx.__init__ 59 | %SPHINXBUILD% 2> nul 60 | if errorlevel 9009 ( 61 | echo. 62 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 63 | echo.installed, then set the SPHINXBUILD environment variable to point 64 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 65 | echo.may add the Sphinx directory to PATH. 66 | echo. 67 | echo.If you don't have Sphinx installed, grab it from 68 | echo.http://sphinx-doc.org/ 69 | exit /b 1 70 | ) 71 | 72 | :sphinx_ok 73 | 74 | 75 | if "%1" == "html" ( 76 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html 77 | if errorlevel 1 exit /b 1 78 | echo. 79 | echo.Build finished. The HTML pages are in %BUILDDIR%/html. 80 | goto end 81 | ) 82 | 83 | if "%1" == "dirhtml" ( 84 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml 85 | if errorlevel 1 exit /b 1 86 | echo. 87 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. 88 | goto end 89 | ) 90 | 91 | if "%1" == "singlehtml" ( 92 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml 93 | if errorlevel 1 exit /b 1 94 | echo. 95 | echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. 96 | goto end 97 | ) 98 | 99 | if "%1" == "pickle" ( 100 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle 101 | if errorlevel 1 exit /b 1 102 | echo. 103 | echo.Build finished; now you can process the pickle files. 104 | goto end 105 | ) 106 | 107 | if "%1" == "json" ( 108 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json 109 | if errorlevel 1 exit /b 1 110 | echo. 111 | echo.Build finished; now you can process the JSON files. 112 | goto end 113 | ) 114 | 115 | if "%1" == "htmlhelp" ( 116 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp 117 | if errorlevel 1 exit /b 1 118 | echo. 119 | echo.Build finished; now you can run HTML Help Workshop with the ^ 120 | .hhp project file in %BUILDDIR%/htmlhelp. 121 | goto end 122 | ) 123 | 124 | if "%1" == "qthelp" ( 125 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp 126 | if errorlevel 1 exit /b 1 127 | echo. 128 | echo.Build finished; now you can run "qcollectiongenerator" with the ^ 129 | .qhcp project file in %BUILDDIR%/qthelp, like this: 130 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\hdbscan.qhcp 131 | echo.To view the help file: 132 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\hdbscan.ghc 133 | goto end 134 | ) 135 | 136 | if "%1" == "devhelp" ( 137 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp 138 | if errorlevel 1 exit /b 1 139 | echo. 140 | echo.Build finished. 141 | goto end 142 | ) 143 | 144 | if "%1" == "epub" ( 145 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub 146 | if errorlevel 1 exit /b 1 147 | echo. 148 | echo.Build finished. The epub file is in %BUILDDIR%/epub. 149 | goto end 150 | ) 151 | 152 | if "%1" == "latex" ( 153 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 154 | if errorlevel 1 exit /b 1 155 | echo. 156 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 
157 | goto end 158 | ) 159 | 160 | if "%1" == "latexpdf" ( 161 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 162 | cd %BUILDDIR%/latex 163 | make all-pdf 164 | cd %~dp0 165 | echo. 166 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 167 | goto end 168 | ) 169 | 170 | if "%1" == "latexpdfja" ( 171 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 172 | cd %BUILDDIR%/latex 173 | make all-pdf-ja 174 | cd %~dp0 175 | echo. 176 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 177 | goto end 178 | ) 179 | 180 | if "%1" == "text" ( 181 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text 182 | if errorlevel 1 exit /b 1 183 | echo. 184 | echo.Build finished. The text files are in %BUILDDIR%/text. 185 | goto end 186 | ) 187 | 188 | if "%1" == "man" ( 189 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man 190 | if errorlevel 1 exit /b 1 191 | echo. 192 | echo.Build finished. The manual pages are in %BUILDDIR%/man. 193 | goto end 194 | ) 195 | 196 | if "%1" == "texinfo" ( 197 | %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo 198 | if errorlevel 1 exit /b 1 199 | echo. 200 | echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. 201 | goto end 202 | ) 203 | 204 | if "%1" == "gettext" ( 205 | %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale 206 | if errorlevel 1 exit /b 1 207 | echo. 208 | echo.Build finished. The message catalogs are in %BUILDDIR%/locale. 209 | goto end 210 | ) 211 | 212 | if "%1" == "changes" ( 213 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes 214 | if errorlevel 1 exit /b 1 215 | echo. 216 | echo.The overview file is in %BUILDDIR%/changes. 217 | goto end 218 | ) 219 | 220 | if "%1" == "linkcheck" ( 221 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck 222 | if errorlevel 1 exit /b 1 223 | echo. 224 | echo.Link check complete; look for any errors in the above output ^ 225 | or in %BUILDDIR%/linkcheck/output.txt. 226 | goto end 227 | ) 228 | 229 | if "%1" == "doctest" ( 230 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest 231 | if errorlevel 1 exit /b 1 232 | echo. 233 | echo.Testing of doctests in the sources finished, look at the ^ 234 | results in %BUILDDIR%/doctest/output.txt. 235 | goto end 236 | ) 237 | 238 | if "%1" == "coverage" ( 239 | %SPHINXBUILD% -b coverage %ALLSPHINXOPTS% %BUILDDIR%/coverage 240 | if errorlevel 1 exit /b 1 241 | echo. 242 | echo.Testing of coverage in the sources finished, look at the ^ 243 | results in %BUILDDIR%/coverage/python.txt. 244 | goto end 245 | ) 246 | 247 | if "%1" == "xml" ( 248 | %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml 249 | if errorlevel 1 exit /b 1 250 | echo. 251 | echo.Build finished. The XML files are in %BUILDDIR%/xml. 252 | goto end 253 | ) 254 | 255 | if "%1" == "pseudoxml" ( 256 | %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml 257 | if errorlevel 1 exit /b 1 258 | echo. 259 | echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. 260 | goto end 261 | ) 262 | 263 | :end 264 | -------------------------------------------------------------------------------- /docs/outlier_detection.rst: -------------------------------------------------------------------------------- 1 | 2 | Outlier Detection 3 | ================= 4 | 5 | The hdbscan library supports the GLOSH outlier detection algorithm, and 6 | does so within the HDBSCAN clustering class. 
The GLOSH outlier detection 7 | algorithm is related to older outlier detection methods such as 8 | `LOF `__ and 9 | `LOCI `__. 10 | It is a fast and flexible outlier detection system that supports a 11 | notion of local outliers. This means that it can detect outliers that 12 | may be noticeably different from points in their local region (for example 13 | points not on a local submanifold) but that are not necessarily outliers 14 | globally. So how do we find outliers? We proceed identically to the 15 | basic use of HDBSCAN\*. We start with some data, and fit it with an 16 | HDBSCAN object. 17 | 18 | .. code:: python 19 | 20 | plt.scatter(*data.T, s=50, linewidth=0, c='b', alpha=0.25) 21 | 22 | 23 | .. image:: images/outlier_detection_3_1.png 24 | 25 | 26 | .. code:: python 27 | 28 | clusterer = hdbscan.HDBSCAN(min_cluster_size=15).fit(data) 29 | 30 | The ``clusterer`` object now has an attribute (computed when first accessed) 31 | called ``outlier_scores_``. This provides a numpy array with a value for 32 | each sample in the original dataset that was fit with the ``clusterer``. The 33 | higher the score, the more likely the point is to be an outlier. In 34 | practice it is often best to look at the distribution of outlier 35 | scores. 36 | 37 | .. code:: python 38 | 39 | clusterer.outlier_scores_ 40 | 41 | 42 | 43 | 44 | .. parsed-literal:: 45 | 46 | array([ 0.14791852, 0.14116731, 0.09171929, ..., 0.62050534, 47 | 0.56749298, 0.20681685]) 48 | 49 | 50 | 51 | .. code:: python 52 | 53 | sns.distplot(clusterer.outlier_scores_[np.isfinite(clusterer.outlier_scores_)], rug=True) 54 | 55 | .. image:: images/outlier_detection_7_1.png 56 | 57 | 58 | We can pull off upper quantiles of the score distribution to detect outliers, which we can then 59 | plot. 60 | 61 | .. code:: python 62 | 63 | threshold = pd.Series(clusterer.outlier_scores_).quantile(0.9) 64 | outliers = np.where(clusterer.outlier_scores_ > threshold)[0] 65 | plt.scatter(*data.T, s=50, linewidth=0, c='gray', alpha=0.25) 66 | plt.scatter(*data[outliers].T, s=50, linewidth=0, c='red', alpha=0.5) 67 | 68 | .. image:: images/outlier_detection_9_1.png 69 | 70 | 71 | Note that not only are the outlying border points highlighted as 72 | outliers, but points at the edge of the central ball-like cluster, and 73 | just below the vertical band cluster, are also designated as outliers. 74 | This is because those two clusters are extremely dense, and points 75 | at their edges are close enough to the cluster that they 76 | should be part of it, but far enough from being core parts of the 77 | cluster that they are highly improbable under the cluster's density, and hence anomalous. 78 | 79 | -------------------------------------------------------------------------------- /docs/parameter_selection.rst: -------------------------------------------------------------------------------- 1 | 2 | Parameter Selection for HDBSCAN\* 3 | ================================= 4 | 5 | While the HDBSCAN class has a large number of parameters that can be set 6 | on initialization, in practice only a small number of 7 | parameters have a significant practical effect on clustering. We will 8 | consider those major parameters, and how one may go about 9 | choosing them effectively. 10 | 11 | .. _min_cluster_size_label: 12 | 13 | Selecting ``min_cluster_size`` 14 | ------------------------------ 15 | 16 | The primary parameter to affect the resulting clustering is 17 | ``min_cluster_size``.
Ideally this is a relatively intuitive parameter 18 | to select -- set it to the smallest size grouping that you wish to 19 | consider a cluster. It can have slightly non-obvious effects however. 20 | Let's consider the digits dataset from sklearn. We can project the data 21 | into two dimensions to visualize it via t-SNE. 22 | 23 | .. code:: python 24 | 25 | digits = datasets.load_digits() 26 | data = digits.data 27 | projection = TSNE().fit_transform(data) 28 | plt.scatter(*projection.T, **plot_kwds) 29 | 30 | 31 | .. image:: images/parameter_selection_3_1.png 32 | 33 | 34 | If we cluster this data in the full 64 dimensional space with HDBSCAN\* we 35 | can see some effects from varying the ``min_cluster_size``. 36 | 37 | We start with a ``min_cluster_size`` of 15. 38 | 39 | .. code:: python 40 | 41 | clusterer = hdbscan.HDBSCAN(min_cluster_size=15).fit(data) 42 | color_palette = sns.color_palette('Paired', 12) 43 | cluster_colors = [color_palette[x] if x >= 0 44 | else (0.5, 0.5, 0.5) 45 | for x in clusterer.labels_] 46 | cluster_member_colors = [sns.desaturate(x, p) for x, p in 47 | zip(cluster_colors, clusterer.probabilities_)] 48 | plt.scatter(*projection.T, s=50, linewidth=0, c=cluster_member_colors, alpha=0.25) 49 | 50 | 51 | .. image:: images/parameter_selection_7_1.png 52 | 53 | 54 | Increasing the ``min_cluster_size`` to 30 reduces the number of 55 | clusters, merging some together. This is a result of HDBSCAN\* 56 | reoptimizing which flat clustering provides greater stability under a 57 | slightly different notion of what constitutes a cluster. 58 | 59 | .. code:: python 60 | 61 | clusterer = hdbscan.HDBSCAN(min_cluster_size=30).fit(data) 62 | color_palette = sns.color_palette('Paired', 12) 63 | cluster_colors = [color_palette[x] if x >= 0 64 | else (0.5, 0.5, 0.5) 65 | for x in clusterer.labels_] 66 | cluster_member_colors = [sns.desaturate(x, p) for x, p in 67 | zip(cluster_colors, clusterer.probabilities_)] 68 | plt.scatter(*projection.T, s=50, linewidth=0, c=cluster_member_colors, alpha=0.25) 69 | 70 | .. image:: images/parameter_selection_9_1.png 71 | 72 | 73 | Doubling the ``min_cluster_size`` again to 60 gives us just two clusters 74 | -- the really core clusters. This is somewhat as expected, but surely 75 | some of the other clusters that we had previously had more than 60 76 | members? Why are they being considered noise? The answer is that 77 | HDBSCAN\* has a second parameter ``min_samples``. The implementation 78 | defaults this value (if it is unspecified) to whatever 79 | ``min_cluster_size`` is set to. We can recover some of our original 80 | clusters by explicitly providing ``min_samples`` at the original value 81 | of 15. 82 | 83 | .. code:: python 84 | 85 | clusterer = hdbscan.HDBSCAN(min_cluster_size=60).fit(data) 86 | color_palette = sns.color_palette('Paired', 12) 87 | cluster_colors = [color_palette[x] if x >= 0 88 | else (0.5, 0.5, 0.5) 89 | for x in clusterer.labels_] 90 | cluster_member_colors = [sns.desaturate(x, p) for x, p in 91 | zip(cluster_colors, clusterer.probabilities_)] 92 | plt.scatter(*projection.T, s=50, linewidth=0, c=cluster_member_colors, alpha=0.25) 93 | 94 | 95 | .. image:: images/parameter_selection_11_1.png 96 | 97 | 98 | .. 
code:: python 99 | 100 | clusterer = hdbscan.HDBSCAN(min_cluster_size=60, min_samples=15).fit(data) 101 | color_palette = sns.color_palette('Paired', 12) 102 | cluster_colors = [color_palette[x] if x >= 0 103 | else (0.5, 0.5, 0.5) 104 | for x in clusterer.labels_] 105 | cluster_member_colors = [sns.desaturate(x, p) for x, p in 106 | zip(cluster_colors, clusterer.probabilities_)] 107 | plt.scatter(*projection.T, s=50, linewidth=0, c=cluster_member_colors, alpha=0.25) 108 | 109 | .. image:: images/parameter_selection_12_1.png 110 | 111 | 112 | As you can see this results in us recovering something much closer to 113 | our original clustering, only now with some of the smaller clusters 114 | pruned out. Thus ``min_cluster_size`` does behave more in line with our 115 | intuition, but only if we fix ``min_samples``. 116 | 117 | If you wish to explore different ``min_cluster_size`` settings with 118 | a fixed ``min_samples`` value, especially for larger datasets, 119 | you can cache the hard computation, and recompute only the relatively 120 | cheap flat cluster extraction, using the ``memory`` parameter, which 121 | makes use of `joblib `_. 122 | 123 | .. _min_samples_label: 124 | 125 | Selecting ``min_samples`` 126 | ------------------------- 127 | 128 | Since we have seen that ``min_samples`` clearly has a dramatic effect on 129 | clustering, the question becomes: how do we select this parameter? The 130 | simplest intuition for what ``min_samples`` does is provide a measure of 131 | how conservative you want your clustering to be. The larger the value of 132 | ``min_samples`` you provide, the more conservative the clustering -- 133 | more points will be declared as noise, and clusters will be restricted 134 | to progressively more dense areas. We can see this in practice by 135 | leaving the ``min_cluster_size`` at 60, but reducing ``min_samples`` to 136 | 1. 137 | 138 | Note: adjusting ``min_samples`` will result in recomputing the **hard 139 | computation** of the single linkage tree. 140 | 141 | .. code:: python 142 | 143 | clusterer = hdbscan.HDBSCAN(min_cluster_size=60, min_samples=1).fit(data) 144 | color_palette = sns.color_palette('Paired', 12) 145 | cluster_colors = [color_palette[x] if x >= 0 146 | else (0.5, 0.5, 0.5) 147 | for x in clusterer.labels_] 148 | cluster_member_colors = [sns.desaturate(x, p) for x, p in 149 | zip(cluster_colors, clusterer.probabilities_)] 150 | plt.scatter(*projection.T, s=50, linewidth=0, c=cluster_member_colors, alpha=0.25) 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | .. image:: images/parameter_selection_15_1.png 163 | 164 | 165 | Now most points are clustered, and there are far fewer noise points. 166 | Steadily increasing ``min_samples`` will, as we saw in the examples 167 | above, make the clustering progressively more conservative, culminating 168 | in the example above where ``min_samples`` was set to 60 and we had only 169 | two clusters with most points declared as noise. 170 | 171 | .. _epsilon_label: 172 | 173 | Selecting ``cluster_selection_epsilon`` 174 | --------------------------------------- 175 | 176 | In some cases, we want to choose a small ``min_cluster_size`` because even groups of few points might be of interest to us. 177 | However, if our data set also contains partitions with high concentrations of objects, this parameter setting can result in 178 | a large number of micro-clusters. Selecting a value for ``cluster_selection_epsilon`` helps us to merge clusters in these regions. 179 | Or in other words, it ensures that clusters below the given threshold are not split up any further. 180 | 181 | The choice of ``cluster_selection_epsilon`` depends on the given distances between your data points. For example, set the value to 0.5 if you don't want to 182 | separate clusters that are less than 0.5 units apart. This will basically extract DBSCAN* clusters for epsilon = 0.5 from the condensed cluster tree, but leave 183 | HDBSCAN* clusters that emerged at distances greater than 0.5 untouched. See :doc:`how_to_use_epsilon` for a more detailed demonstration of the effect this parameter 184 | has on the resulting clustering. 185 | 186 | .. _alpha_label: 187 | 188 | Selecting ``alpha`` 189 | ------------------- 190 | 191 | A further parameter that affects the resulting clustering is ``alpha``. 192 | In practice it is best not to mess with this parameter -- ultimately it 193 | is part of the ``RobustSingleLinkage`` code, but flows naturally into 194 | HDBSCAN\*. If, for some reason, ``min_samples`` or ``cluster_selection_epsilon`` is not providing you 195 | what you need, stop, rethink things, and try again with ``min_samples`` or ``cluster_selection_epsilon``. 196 | If you still need to play with another parameter (and you shouldn't), 197 | then you can try setting ``alpha``. The ``alpha`` parameter provides a 198 | slightly different approach to determining how conservative the 199 | clustering is. By default ``alpha`` is set to 1.0. Increasing ``alpha`` 200 | will make the clustering more conservative, but on a much tighter scale, 201 | as we can see by setting ``alpha`` to 1.3. 202 | 203 | Note: adjusting ``alpha`` will result in recomputing the **hard 204 | computation** of the single linkage tree. 205 | 206 | .. code:: python 207 | 208 | clusterer = hdbscan.HDBSCAN(min_cluster_size=60, min_samples=15, alpha=1.3).fit(data) 209 | color_palette = sns.color_palette('Paired', 12) 210 | cluster_colors = [color_palette[x] if x >= 0 211 | else (0.5, 0.5, 0.5) 212 | for x in clusterer.labels_] 213 | cluster_member_colors = [sns.desaturate(x, p) for x, p in 214 | zip(cluster_colors, clusterer.probabilities_)] 215 | plt.scatter(*projection.T, s=50, linewidth=0, c=cluster_member_colors, alpha=0.25) 216 | 217 | .. image:: images/parameter_selection_18_1.png 218 | 219 | 220 | .. _leaf_clustering_label: 221 | 222 | Leaf clustering 223 | --------------- 224 | 225 | HDBSCAN supports an extra parameter ``cluster_selection_method`` to determine 226 | how it selects flat clusters from the cluster tree hierarchy. The default 227 | method is ``'eom'`` for Excess of Mass, the algorithm described in 228 | :doc:`how_hdbscan_works`. This is not always the most desirable approach to 229 | cluster selection. If you are more interested in having small homogeneous 230 | clusters then you may find Excess of Mass has a tendency to pick one or two 231 | large clusters and then a number of small extra clusters. In this situation 232 | you may be tempted to recluster just the data in the single large cluster. 233 | Instead, a better option is to select ``'leaf'`` as a cluster selection 234 | method. This will select leaf nodes from the tree, producing many small 235 | homogeneous clusters. Note that you can still get variable density clusters 236 | via this method, and it is also still possible to get large clusters, but 237 | there will be a tendency to produce a more fine-grained clustering than 238 | Excess of Mass can provide. 239 | 240 | ..
_single_cluster_label: 241 | 242 | Allowing a single cluster 243 | ------------------------- 244 | 245 | In contrast, if you are getting lots of small clusters, but believe there 246 | should be some larger scale structure (or the possibility of no structure), 247 | consider the ``allow_single_cluster`` option. By default HDBSCAN\* does not 248 | allow a single cluster to be returned -- this is due to how the Excess of 249 | Mass algorithm works, and a bias towards the root cluster that may occur. You 250 | can override this behaviour and see what clustering would look like if you 251 | allow a single cluster to be returned. This can alleviate issue caused by 252 | there only being a single large cluster, or by data that is essentially just 253 | noise. For example, the image below shows the effects of setting 254 | ``allow_single_cluster=True`` in the bottom row, compared to the top row 255 | which used default settings. 256 | 257 | .. image:: images/allow_single_cluster.png 258 | -------------------------------------------------------------------------------- /docs/prediction_tutorial.rst: -------------------------------------------------------------------------------- 1 | 2 | Predicting clusters for new points 3 | ================================== 4 | 5 | Often it is useful to train a model once on a large amount of data, and 6 | then query the model repeatedly with small amounts of new data. This is 7 | hard for HDBSCAN\* as it is a transductive method -- new data points 8 | can (and should!) be able to alter the underlying clustering. That is, 9 | given new information it might make sense to create a new cluster, split 10 | an existing cluster, or merge two previously separate clusters. If the 11 | actual clusters (and hence their labels) change with each new data point 12 | it becomes impossible to compare the cluster assignments between such 13 | queries. 14 | 15 | We can accommodate this by effectively holding a clustering fixed (after 16 | a potentially expensive training run) and then asking: *if we do not 17 | change the existing clusters* which cluster would HDBSCAN\* assign a new 18 | data point to. In practice this amounts to determining where in the 19 | condensed tree the new data point would fall (see 20 | :any:`how_hdbscan_works`) assuming we do not change the condensed 21 | tree. This allows for a very inexpensive operation to compute a 22 | predicted cluster for the new data point. 23 | 24 | This has been implemented in ``hdbscan`` as the 25 | :py:func:`~hdbscan.predict.approximate_predict` function. We'll look 26 | at how this works below. 27 | 28 | As usual we begin with our test synthetic data set, and cluster it with 29 | HDBSCAN. The primary point to note here, however, is the use of the 30 | ``prediction_data=True`` keyword argument. This ensures that HDBSCAN 31 | does a little extra computation when fitting the model that can 32 | dramatically speed up the prediction queries later. 33 | 34 | You can also get an HDBSCAN object to create this data after the fact 35 | via the :py:meth:`~hdbscan.HDBSCAN.generate_prediction_data` method. 36 | 37 | .. code:: python 38 | 39 | data = np.load('clusterable_data.npy') 40 | clusterer = hdbscan.HDBSCAN(min_cluster_size=15, prediction_data=True).fit(data) 41 | pal = sns.color_palette('deep', 8) 42 | colors = [sns.desaturate(pal[col], sat) for col, sat in zip(clusterer.labels_, 43 | clusterer.probabilities_)] 44 | plt.scatter(data.T[0], data.T[1], c=colors, **plot_kwds); 45 | 46 | 47 | 48 | .. 
image:: images/prediction_tutorial_3_0.png 49 | 50 | 51 | Now to make things a little more interesting let's generate 50 new data 52 | points scattered across the data. We can plot them in black to see where 53 | they happen to fall. 54 | 55 | .. code:: python 56 | 57 | test_points = np.random.random(size=(50, 2)) - 0.5 58 | 59 | colors = [sns.desaturate(pal[col], sat) for col, sat in zip(clusterer.labels_, 60 | clusterer.probabilities_)] 61 | plt.scatter(data.T[0], data.T[1], c=colors, **plot_kwds); 62 | plt.scatter(*test_points.T, c='k', s=50) 63 | 64 | 65 | 66 | .. image:: images/prediction_tutorial_5_1.png 67 | 68 | 69 | We can use the predict API on this data, calling 70 | :py:func:`~hdbscan.predict.approximate_predict` with the HDBSCAN object, 71 | and the numpy array of new points. Note that 72 | :py:func:`~hdbscan.predict.approximate_predict` takes an *array* of new 73 | points. If you have a single point be sure to wrap it in a list. 74 | 75 | .. code:: python 76 | 77 | test_labels, strengths = hdbscan.approximate_predict(clusterer, test_points) 78 | test_labels 79 | 80 | 81 | 82 | 83 | .. parsed-literal:: 84 | 85 | array([ 2, -1, -1, -1, -1, -1, 1, 5, -1, -1, 5, -1, -1, -1, -1, 4, -1, 86 | -1, -1, -1, -1, 4, -1, -1, -1, -1, 2, -1, -1, 1, -1, -1, -1, 0, 87 | -1, 2, -1, -1, 3, -1, -1, 1, -1, -1, -1, -1, -1, 5, 3, 2]) 88 | 89 | 90 | 91 | The result is a set of labels, as you can see. Many of the points are 92 | classified as noise, but several are also assigned to clusters. This is 93 | a very fast operation, even with large datasets, as long as the HDBSCAN 94 | object has the prediction data generated beforehand. 95 | 96 | We can also visualize how this worked, coloring the new data points by 97 | the cluster to which they were assigned. I have added a black border 98 | around the points so they don't get lost inside the clusters they fall 99 | into. 100 | 101 | .. code:: python 102 | 103 | colors = [sns.desaturate(pal[col], sat) for col, sat in zip(clusterer.labels_, 104 | clusterer.probabilities_)] 105 | test_colors = [pal[col] if col >= 0 else (0.1, 0.1, 0.1) for col in test_labels] 106 | plt.scatter(data.T[0], data.T[1], c=colors, **plot_kwds); 107 | plt.scatter(*test_points.T, c=test_colors, s=80, linewidths=1, edgecolors='k') 108 | 109 | 110 | 111 | .. image:: images/prediction_tutorial_9_1.png 112 | 113 | 114 | It is as simple as that. So now you can get started using HDBSCAN as a 115 | streaming clustering service -- just be sure to cache your data and 116 | retrain your model periodically to avoid drift! 117 | 118 | -------------------------------------------------------------------------------- /docs/soft_clustering.rst: -------------------------------------------------------------------------------- 1 | 2 | Soft Clustering for HDBSCAN\* 3 | ============================= 4 | 5 | Soft clustering is a new (and still somewhat experimental) feature of 6 | the hdbscan library. It takes advantage of the fact that the condensed 7 | tree is a kind of smoothed density function over data points, and the 8 | notion of exemplars for clusters. If you want to better understand how 9 | soft clustering works please refer to :any:`soft_clustering_explanation`. 10 | 11 | Let's consider the digits dataset from sklearn. We can project the data 12 | into two dimensions to visualize it via t-SNE. 13 | 14 | .. code:: python 15 | 16 | from sklearn import datasets 17 | from sklearn.manifold import TSNE 18 | import matplotlib.pyplot as plt 19 | import seaborn as sns 20 | import numpy as np 21 | 22 | ..
code:: python 23 | 24 | digits = datasets.load_digits() 25 | data = digits.data 26 | projection = TSNE().fit_transform(data) 27 | plt.scatter(*projection.T, **plot_kwds) 28 | 29 | 30 | .. image:: images/soft_clustering_3_1.png 31 | 32 | 33 | Now we import hdbscan and then cluster in the full 64 dimensional space. 34 | It is important to note that, if we wish to use the soft clustering we 35 | should use the ``prediction_data=True`` option for HDBSCAN. This will 36 | ensure we generate the extra data required that will allow soft 37 | clustering to work. 38 | 39 | .. code:: python 40 | 41 | import hdbscan 42 | 43 | .. code:: python 44 | 45 | clusterer = hdbscan.HDBSCAN(min_cluster_size=10, prediction_data=True).fit(data) 46 | color_palette = sns.color_palette('Paired', 12) 47 | cluster_colors = [color_palette[x] if x >= 0 48 | else (0.5, 0.5, 0.5) 49 | for x in clusterer.labels_] 50 | cluster_member_colors = [sns.desaturate(x, p) for x, p in 51 | zip(cluster_colors, clusterer.probabilities_)] 52 | plt.scatter(*projection.T, s=50, linewidth=0, c=cluster_member_colors, alpha=0.25) 53 | 54 | 55 | 56 | .. image:: images/soft_clustering_6_1.png 57 | 58 | 59 | Certainly a number of clusters were found, but the data is fairly noisy 60 | in 64 dimensions, so there are a number of points that have been 61 | classified as noise. We can generate a soft clustering to get more 62 | information about some of these noise points. 63 | 64 | To generate a soft clustering for all the points in the original dataset 65 | we use the 66 | :py:func:`~hdbscan.prediction.all_points_membership_vectors` function 67 | which takes a clusterer object. If we wanted to get soft cluster 68 | membership values for a set of new unseen points we could use 69 | :py:func:`~hdbscan.prediction.membership_vector` instead. 70 | 71 | The return value is a two-dimensional numpy array. Each point of the 72 | input data is assigned a vector of probabilities of being in a cluster. 73 | For a first pass we can visualize the data looking at what the *most 74 | likely* cluster was, by coloring according to the ``argmax`` of the 75 | probability vector (i.e. the cluster for which a given point has the 76 | highest probability of being in). 77 | 78 | .. code:: python 79 | 80 | soft_clusters = hdbscan.all_points_membership_vectors(clusterer) 81 | color_palette = sns.color_palette('Paired', 12) 82 | cluster_colors = [color_palette[np.argmax(x)] 83 | for x in soft_clusters] 84 | plt.scatter(*projection.T, s=50, linewidth=0, c=cluster_colors, alpha=0.25) 85 | 86 | 87 | 88 | .. image:: images/soft_clustering_8_1.png 89 | 90 | 91 | This fills out the clusters nicely -- we see that there were many noise 92 | points that are most likely to belong to the clusters we would expect; 93 | we can also see where things have gotten confused in the middle, and 94 | there is a mix of cluster assignments. 95 | 96 | We are still only using part of the information however; we can 97 | desaturate according to the actual probability value for the most likely 98 | cluster. 99 | 100 | .. code:: python 101 | 102 | color_palette = sns.color_palette('Paired', 12) 103 | cluster_colors = [sns.desaturate(color_palette[np.argmax(x)], np.max(x)) 104 | for x in soft_clusters] 105 | plt.scatter(*projection.T, s=50, linewidth=0, c=cluster_colors, alpha=0.25) 106 | 107 | 108 | 109 | .. 
image:: images/soft_clustering_10_1.png 110 | 111 | 112 | We see that many points actually have a low probability of being in the 113 | cluster -- indeed the soft clustering applies *within* a cluster, so 114 | only the very cores of each cluster have high probabilities. In practice 115 | desaturating is a fairly string treatment; visually a lot will look 116 | gray. We could apply a function and put a lower limit on the 117 | desaturation that meets better with human visual perception, but that is 118 | left as an exercise for the reader. 119 | 120 | Instead we'll explore what else we can learn about the data from these 121 | cluster membership probabilities. An interesting question is which 122 | points have high likelihoods for *two* clusters (and low likelihoods for 123 | the other clusters). 124 | 125 | .. code:: python 126 | 127 | def top_two_probs_diff(probs): 128 | sorted_probs = np.sort(probs) 129 | return sorted_probs[-1] - sorted_probs[-2] 130 | 131 | # Compute the differences between the top two probabilities 132 | diffs = np.array([top_two_probs_diff(x) for x in soft_clusters]) 133 | # Select out the indices that have a small difference, and a larger total probability 134 | mixed_points = np.where((diffs < 0.001) & (np.sum(soft_clusters, axis=1) > 0.5))[0] 135 | 136 | .. code:: python 137 | 138 | colors = [(0.75, 0.1, 0.1) if x in mixed_points 139 | else (0.5, 0.5, 0.5) for x in range(data.shape[0])] 140 | plt.scatter(*projection.T, s=50, linewidth=0, c=colors, alpha=0.5) 141 | 142 | 143 | 144 | 145 | .. image:: images/soft_clustering_13_1.png 146 | 147 | 148 | We can look at a few of these and see that many are, indeed, hard to 149 | classify (even for humans). It also seems that 8 was not assigned a 150 | cluster and is seen as a mixture of other clusters. 151 | 152 | .. code:: python 153 | 154 | fig = plt.figure() 155 | for i, image in enumerate(digits.images[mixed_points][:16]): 156 | ax = fig.add_subplot(4,4,i+1) 157 | ax.imshow(image) 158 | plt.tight_layout() 159 | 160 | 161 | 162 | .. image:: images/soft_clustering_15_0.png 163 | 164 | 165 | There is, of course, a lot more analysis that can be done from here, but 166 | hopefully this provides sufficient introduction to what can be achieved 167 | with soft clustering. 168 | 169 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: hdbscan 2 | dependencies: 3 | - python>=3.5 4 | - scikit-learn>=0.19 5 | - matplotlib>=2.0 6 | - seaborn>=0.8 7 | - hdbscan>=0.8.11 8 | -------------------------------------------------------------------------------- /examples/plot_cluster_comparison.py: -------------------------------------------------------------------------------- 1 | """ 2 | ========================================================= 3 | Comparing different clustering algorithms on toy datasets 4 | ========================================================= 5 | 6 | This example aims at showing characteristics of different 7 | clustering algorithms on datasets that are "interesting" 8 | but still in 2D. The last dataset is an example of a 'null' 9 | situation for clustering: the data is homogeneous, and 10 | there is no good clustering. 11 | 12 | While these examples give some intuition about the algorithms, 13 | this intuition might not apply to very high dimensional data. 
14 | 15 | The results could be improved by tweaking the parameters for 16 | each clustering strategy, for instance setting the number of 17 | clusters for the methods that needs this parameter 18 | specified. Note that affinity propagation has a tendency to 19 | create many clusters. Thus in this example its two parameters 20 | (damping and per-point preference) were set to to mitigate this 21 | behavior. 22 | """ 23 | print(__doc__) 24 | 25 | import time 26 | 27 | import numpy as np 28 | import matplotlib.pyplot as plt 29 | 30 | from sklearn import cluster, datasets 31 | from sklearn.neighbors import kneighbors_graph 32 | from sklearn.preprocessing import StandardScaler 33 | 34 | import hdbscan 35 | 36 | np.random.seed(0) 37 | plt.style.use('fivethirtyeight') 38 | 39 | def make_var_density_blobs(n_samples=750, centers=[[0,0]], cluster_std=[0.5], random_state=0): 40 | samples_per_blob = n_samples // len(centers) 41 | blobs = [datasets.make_blobs(n_samples=samples_per_blob, centers=[c], cluster_std=cluster_std[i])[0] 42 | for i, c in enumerate(centers)] 43 | labels = [i * np.ones(samples_per_blob) for i in range(len(centers))] 44 | return np.vstack(blobs), np.hstack(labels) 45 | 46 | # Generate datasets. We choose the size big enough to see the scalability 47 | # of the algorithms, but not too big to avoid too long running times 48 | n_samples = 1500 49 | noisy_circles = datasets.make_circles(n_samples=n_samples, factor=.5, 50 | noise=.08) 51 | noisy_moons = datasets.make_moons(n_samples=n_samples, noise=.10) 52 | blobs = datasets.make_blobs(n_samples=n_samples-200, random_state=8) 53 | noisy_blobs = np.vstack((blobs[0], 25.0*np.random.rand(200, 2)-[10.0,10.0])), np.hstack((blobs[1], -1*np.ones(200))) 54 | varying_blobs = make_var_density_blobs(n_samples, 55 | centers=[[1, 1], 56 | [-1, -1], 57 | [1, -1]], 58 | cluster_std=[0.2, 0.35, 0.5]) 59 | no_structure = np.random.rand(n_samples, 2), None 60 | 61 | colors = np.array([x for x in 'bgrcmykbgrcmykbgrcmykbgrcmyk']) 62 | colors = np.hstack([colors] * 20) 63 | 64 | clustering_names = [ 65 | 'MiniBatchKMeans', 'AffinityPropagation', 66 | 'SpectralClustering', 'AgglomerativeClustering', 67 | 'DBSCAN', 'HDBSCAN'] 68 | 69 | plt.figure(figsize=(len(clustering_names) * 2 + 3, 9.5)) 70 | plt.subplots_adjust(left=.02, right=.98, bottom=.001, top=.96, wspace=.05, 71 | hspace=.01) 72 | 73 | plot_num = 1 74 | 75 | datasets = [noisy_circles, noisy_moons, noisy_blobs, varying_blobs, no_structure] 76 | for i_dataset, dataset in enumerate(datasets): 77 | X, y = dataset 78 | # normalize dataset for easier parameter selection 79 | X = StandardScaler().fit_transform(X) 80 | 81 | # estimate bandwidth for mean shift 82 | bandwidth = cluster.estimate_bandwidth(X, quantile=0.3) 83 | 84 | # connectivity matrix for structured Ward 85 | connectivity = kneighbors_graph(X, n_neighbors=10, include_self=False) 86 | # make connectivity symmetric 87 | connectivity = 0.5 * (connectivity + connectivity.T) 88 | 89 | # create clustering estimators 90 | two_means = cluster.MiniBatchKMeans(n_clusters=2) 91 | spectral = cluster.SpectralClustering(n_clusters=2, 92 | eigen_solver='arpack', 93 | affinity="nearest_neighbors") 94 | dbscan = cluster.DBSCAN(eps=.2) 95 | affinity_propagation = cluster.AffinityPropagation(damping=.9, 96 | preference=-200) 97 | 98 | average_linkage = cluster.AgglomerativeClustering( 99 | linkage="average", affinity="cityblock", n_clusters=2, 100 | connectivity=connectivity) 101 | 102 | hdbscanner = hdbscan.HDBSCAN() 103 | clustering_algorithms = [ 104 | 
two_means, affinity_propagation, spectral, average_linkage, 105 | dbscan, hdbscanner] 106 | 107 | for name, algorithm in zip(clustering_names, clustering_algorithms): 108 | # predict cluster memberships 109 | t0 = time.time() 110 | algorithm.fit(X) 111 | t1 = time.time() 112 | if hasattr(algorithm, 'labels_'): 113 | y_pred = algorithm.labels_.astype(np.int) 114 | else: 115 | y_pred = algorithm.predict(X) 116 | 117 | # plot 118 | plt.subplot(5, len(clustering_algorithms), plot_num) 119 | if i_dataset == 0: 120 | plt.title(name, size=18) 121 | plt.scatter(X[:, 0], X[:, 1], color=colors[y_pred].tolist(), s=10) 122 | 123 | if hasattr(algorithm, 'cluster_centers_'): 124 | centers = algorithm.cluster_centers_ 125 | center_colors = colors[:len(centers)] 126 | plt.scatter(centers[:, 0], centers[:, 1], s=100, c=center_colors) 127 | plt.xlim(-2, 2) 128 | plt.ylim(-2, 2) 129 | plt.xticks(()) 130 | plt.yticks(()) 131 | plt.text(.99, .01, ('%.2fs' % (t1 - t0)).lstrip('0'), 132 | transform=plt.gca().transAxes, size=15, 133 | horizontalalignment='right') 134 | plot_num += 1 135 | 136 | plt.show() 137 | -------------------------------------------------------------------------------- /examples/plot_hdbscan.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | =================================== 4 | Demo of HDBSCAN clustering algorithm 5 | =================================== 6 | 7 | Finds a clustering that has the greatest stability over a range 8 | of epsilon values for standard DBSCAN. This allows clusterings 9 | of different densities unlike DBSCAN. 10 | 11 | """ 12 | print(__doc__) 13 | 14 | import numpy as np 15 | 16 | from hdbscan import HDBSCAN 17 | from sklearn.cluster import DBSCAN 18 | from sklearn import metrics 19 | from sklearn.datasets.samples_generator import make_blobs 20 | from sklearn.preprocessing import StandardScaler 21 | 22 | import time 23 | 24 | def make_var_density_blobs(n_samples=750, centers=[[0,0]], cluster_std=[0.5], random_state=0): 25 | samples_per_blob = n_samples // len(centers) 26 | blobs = [make_blobs(n_samples=samples_per_blob, centers=[c], cluster_std=cluster_std[i])[0] 27 | for i, c in enumerate(centers)] 28 | labels = [i * np.ones(samples_per_blob) for i in range(len(centers))] 29 | return np.vstack(blobs), np.hstack(labels) 30 | 31 | 32 | ############################################################################## 33 | # Generate sample data 34 | centers = [[1, 1], [-1, -1], [1, -1]] 35 | densities = [0.2, 0.35, 0.5] 36 | X, labels_true = make_var_density_blobs(n_samples=750, centers=centers, cluster_std=densities, 37 | random_state=0) 38 | 39 | X = StandardScaler().fit_transform(X) 40 | 41 | ############################################################################## 42 | # Compute DBSCAN 43 | hdb_t1 = time.time() 44 | hdb = HDBSCAN(min_cluster_size=10).fit(X) 45 | hdb_labels = hdb.labels_ 46 | hdb_elapsed_time = time.time() - hdb_t1 47 | 48 | db_t1 = time.time() 49 | db = DBSCAN(eps=0.1).fit(X) 50 | db_labels = db.labels_ 51 | db_elapsed_time = time.time() - db_t1 52 | 53 | # Number of clusters in labels, ignoring noise if present. 
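# (HDBSCAN and DBSCAN both use the label -1 for noise, so one label is subtracted from the cluster count whenever -1 is present.)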
54 | n_clusters_hdb_ = len(set(hdb_labels)) - (1 if -1 in hdb_labels else 0) 55 | 56 | print('\n\n++ HDBSCAN Results') 57 | print('Estimated number of clusters: %d' % n_clusters_hdb_) 58 | print('Elapsed time to cluster: %.4f s' % hdb_elapsed_time) 59 | print('Homogeneity: %0.3f' % metrics.homogeneity_score(labels_true, hdb_labels)) 60 | print('Completeness: %0.3f' % metrics.completeness_score(labels_true, hdb_labels)) 61 | print('V-measure: %0.3f' % metrics.v_measure_score(labels_true, hdb_labels)) 62 | print('Adjusted Rand Index: %0.3f' 63 | % metrics.adjusted_rand_score(labels_true, hdb_labels)) 64 | print('Adjusted Mutual Information: %0.3f' 65 | % metrics.adjusted_mutual_info_score(labels_true, hdb_labels)) 66 | print('Silhouette Coefficient: %0.3f' 67 | % metrics.silhouette_score(X, hdb_labels)) 68 | 69 | n_clusters_db_ = len(set(db_labels)) - (1 if -1 in db_labels else 0) 70 | 71 | print('\n\n++ DBSCAN Results') 72 | print('Estimated number of clusters: %d' % n_clusters_db_) 73 | print('Elapsed time to cluster: %.4f s' % db_elapsed_time) 74 | print('Homogeneity: %0.3f' % metrics.homogeneity_score(labels_true, db_labels)) 75 | print('Completeness: %0.3f' % metrics.completeness_score(labels_true, db_labels)) 76 | print('V-measure: %0.3f' % metrics.v_measure_score(labels_true, db_labels)) 77 | print('Adjusted Rand Index: %0.3f' 78 | % metrics.adjusted_rand_score(labels_true, db_labels)) 79 | print('Adjusted Mutual Information: %0.3f' 80 | % metrics.adjusted_mutual_info_score(labels_true, db_labels)) 81 | if n_clusters_db_ > 1: 82 | print('Silhouette Coefficient: %0.3f' 83 | % metrics.silhouette_score(X, db_labels)) 84 | else: 85 | print('Silhouette Coefficient: NaN (too few clusters)') 86 | 87 | ############################################################################## 88 | # Plot result 89 | import matplotlib.pyplot as plt 90 | 91 | # Black removed and is used for noise instead. 92 | hdb_unique_labels = set(hdb_labels) 93 | db_unique_labels = set(db_labels) 94 | hdb_colors = plt.cm.Spectral(np.linspace(0, 1, len(hdb_unique_labels))) 95 | db_colors = plt.cm.Spectral(np.linspace(0, 1, len(db_unique_labels))) 96 | fig = plt.figure(figsize=plt.figaspect(0.5)) 97 | hdb_axis = fig.add_subplot('121') 98 | db_axis = fig.add_subplot('122') 99 | for k, col in zip(hdb_unique_labels, hdb_colors): 100 | if k == -1: 101 | # Black used for noise. 102 | col = 'k' 103 | 104 | hdb_axis.plot(X[hdb_labels == k, 0], X[hdb_labels == k, 1], 'o', markerfacecolor=col, 105 | markeredgecolor='k', markersize=6) 106 | for k, col in zip(db_unique_labels, db_colors): 107 | if k == -1: 108 | # Black used for noise. 
109 | col = 'k' 110 | 111 | db_axis.plot(X[db_labels == k, 0], X[db_labels == k, 1], 'o', markerfacecolor=col, 112 | markeredgecolor='k', markersize=6) 113 | 114 | hdb_axis.set_title('HDBSCAN\nEstimated number of clusters: %d' % n_clusters_hdb_) 115 | db_axis.set_title('DBSCAN\nEstimated number of clusters: %d' % n_clusters_db_) 116 | plt.show() 117 | -------------------------------------------------------------------------------- /hdbscan/__init__.py: -------------------------------------------------------------------------------- 1 | from .hdbscan_ import HDBSCAN, hdbscan 2 | from .robust_single_linkage_ import RobustSingleLinkage, robust_single_linkage 3 | from .validity import validity_index 4 | from .prediction import (approximate_predict, 5 | membership_vector, 6 | all_points_membership_vectors, 7 | approximate_predict_scores) 8 | from .branches import (BranchDetector, 9 | detect_branches_in_clusters, 10 | approximate_predict_branch) 11 | 12 | 13 | -------------------------------------------------------------------------------- /hdbscan/_hdbscan_linkage.pyx: -------------------------------------------------------------------------------- 1 | # cython: boundscheck=False 2 | # cython: nonecheck=False 3 | # Minimum spanning tree single linkage implementation for hdbscan 4 | # Authors: Leland McInnes, Steve Astels 5 | # License: 3-clause BSD 6 | 7 | import numpy as np 8 | cimport numpy as np 9 | 10 | from libc.float cimport DBL_MAX 11 | 12 | from hdbscan.dist_metrics cimport DistanceMetric 13 | 14 | 15 | cpdef np.ndarray[np.double_t, ndim=2] mst_linkage_core( 16 | np.ndarray[np.double_t, 17 | ndim=2] distance_matrix): 18 | 19 | cdef np.ndarray[np.intp_t, ndim=1] node_labels 20 | cdef np.ndarray[np.intp_t, ndim=1] current_labels 21 | cdef np.ndarray[np.double_t, ndim=1] current_distances 22 | cdef np.ndarray[np.double_t, ndim=1] left 23 | cdef np.ndarray[np.double_t, ndim=1] right 24 | cdef np.ndarray[np.double_t, ndim=2] result 25 | 26 | cdef np.ndarray label_filter 27 | 28 | cdef np.intp_t current_node 29 | cdef np.intp_t new_node_index 30 | cdef np.intp_t new_node 31 | cdef np.intp_t i 32 | 33 | result = np.zeros((distance_matrix.shape[0] - 1, 3)) 34 | node_labels = np.arange(distance_matrix.shape[0], dtype=np.intp) 35 | current_node = 0 36 | current_distances = np.inf * np.ones(distance_matrix.shape[0]) 37 | current_labels = node_labels 38 | for i in range(1, node_labels.shape[0]): 39 | label_filter = current_labels != current_node 40 | current_labels = current_labels[label_filter] 41 | left = current_distances[label_filter] 42 | right = distance_matrix[current_node][current_labels] 43 | current_distances = np.where(left < right, left, right) 44 | 45 | new_node_index = np.argmin(current_distances) 46 | new_node = current_labels[new_node_index] 47 | result[i - 1, 0] = current_node 48 | result[i - 1, 1] = new_node 49 | result[i - 1, 2] = current_distances[new_node_index] 50 | current_node = new_node 51 | 52 | return result 53 | 54 | 55 | cpdef np.ndarray[np.double_t, ndim=2] mst_linkage_core_vector( 56 | np.ndarray[np.double_t, ndim=2, mode='c'] raw_data, 57 | np.ndarray[np.double_t, ndim=1, mode='c'] core_distances, 58 | DistanceMetric dist_metric, 59 | np.double_t alpha=1.0): 60 | 61 | # Add a comment 62 | cdef np.ndarray[np.double_t, ndim=1] current_distances_arr 63 | cdef np.ndarray[np.double_t, ndim=1] current_sources_arr 64 | cdef np.ndarray[np.int8_t, ndim=1] in_tree_arr 65 | cdef np.ndarray[np.double_t, ndim=2] result_arr 66 | 67 | cdef np.double_t * current_distances 68 | cdef 
np.double_t * current_sources 69 | cdef np.double_t * current_core_distances 70 | cdef np.double_t * raw_data_ptr 71 | cdef np.int8_t * in_tree 72 | cdef np.double_t[:, ::1] raw_data_view 73 | cdef np.double_t[:, ::1] result 74 | 75 | cdef np.ndarray label_filter 76 | 77 | cdef np.intp_t current_node 78 | cdef np.intp_t source_node 79 | cdef np.intp_t right_node, right_source 80 | cdef np.intp_t left_node, left_source 81 | cdef np.intp_t new_node 82 | cdef np.intp_t i 83 | cdef np.intp_t j 84 | cdef np.intp_t dim 85 | cdef np.intp_t num_features 86 | 87 | cdef double current_node_core_distance 88 | cdef double right_value 89 | cdef double left_value 90 | cdef double core_value 91 | cdef double new_distance 92 | 93 | dim = raw_data.shape[0] 94 | num_features = raw_data.shape[1] 95 | 96 | raw_data_view = ( ( 97 | raw_data.data)) 98 | raw_data_ptr = ( &raw_data_view[0, 0]) 99 | 100 | result_arr = np.zeros((dim - 1, 3)) 101 | in_tree_arr = np.zeros(dim, dtype=np.int8) 102 | current_node = 0 103 | current_distances_arr = np.inf * np.ones(dim) 104 | current_sources_arr = np.ones(dim) 105 | 106 | result = ( ( result_arr.data)) 107 | in_tree = ( in_tree_arr.data) 108 | current_distances = ( current_distances_arr.data) 109 | current_sources = ( current_sources_arr.data) 110 | current_core_distances = ( core_distances.data) 111 | 112 | for i in range(1, dim): 113 | 114 | in_tree[current_node] = 1 115 | 116 | current_node_core_distance = current_core_distances[current_node] 117 | 118 | new_distance = DBL_MAX 119 | source_node = 0 120 | new_node = 0 121 | 122 | for j in range(dim): 123 | if in_tree[j]: 124 | continue 125 | 126 | right_value = current_distances[j] 127 | right_source = current_sources[j] 128 | 129 | left_value = dist_metric.dist(&raw_data_ptr[num_features * 130 | current_node], 131 | &raw_data_ptr[num_features * j], 132 | num_features) 133 | left_source = current_node 134 | 135 | if alpha != 1.0: 136 | left_value /= alpha 137 | 138 | core_value = core_distances[j] 139 | if (current_node_core_distance > right_value or 140 | core_value > right_value or 141 | left_value > right_value): 142 | if right_value < new_distance: 143 | new_distance = right_value 144 | source_node = right_source 145 | new_node = j 146 | continue 147 | 148 | if core_value > current_node_core_distance: 149 | if core_value > left_value: 150 | left_value = core_value 151 | else: 152 | if current_node_core_distance > left_value: 153 | left_value = current_node_core_distance 154 | 155 | if left_value < right_value: 156 | current_distances[j] = left_value 157 | current_sources[j] = left_source 158 | if left_value < new_distance: 159 | new_distance = left_value 160 | source_node = left_source 161 | new_node = j 162 | else: 163 | if right_value < new_distance: 164 | new_distance = right_value 165 | source_node = right_source 166 | new_node = j 167 | 168 | result[i - 1, 0] = source_node 169 | result[i - 1, 1] = new_node 170 | result[i - 1, 2] = new_distance 171 | current_node = new_node 172 | 173 | return result_arr 174 | 175 | 176 | cdef class UnionFind (object): 177 | 178 | cdef np.ndarray parent_arr 179 | cdef np.ndarray size_arr 180 | cdef np.intp_t next_label 181 | cdef np.intp_t *parent 182 | cdef np.intp_t *size 183 | 184 | def __init__(self, N): 185 | self.parent_arr = -1 * np.ones(2 * N - 1, dtype=np.intp, order='C') 186 | self.next_label = N 187 | self.size_arr = np.hstack((np.ones(N, dtype=np.intp), 188 | np.zeros(N-1, dtype=np.intp))) 189 | self.parent = ( self.parent_arr.data) 190 | self.size = ( 
self.size_arr.data) 191 | 192 | cdef void union(self, np.intp_t m, np.intp_t n): 193 | self.size[self.next_label] = self.size[m] + self.size[n] 194 | self.parent[m] = self.next_label 195 | self.parent[n] = self.next_label 196 | self.size[self.next_label] = self.size[m] + self.size[n] 197 | self.next_label += 1 198 | 199 | return 200 | 201 | cdef np.intp_t fast_find(self, np.intp_t n): 202 | cdef np.intp_t p 203 | p = n 204 | while self.parent_arr[n] != -1: 205 | n = self.parent_arr[n] 206 | # label up to the root 207 | while self.parent_arr[p] != n: 208 | p, self.parent_arr[p] = self.parent_arr[p], n 209 | return n 210 | 211 | 212 | cpdef np.ndarray[np.double_t, ndim=2] label(np.ndarray[np.double_t, ndim=2] L): 213 | 214 | cdef np.ndarray[np.double_t, ndim=2] result_arr 215 | cdef np.double_t[:, ::1] result 216 | 217 | cdef np.intp_t N, a, aa, b, bb, index 218 | cdef np.double_t delta 219 | 220 | result_arr = np.zeros((L.shape[0], L.shape[1] + 1)) 221 | result = ( ( 222 | result_arr.data)) 223 | N = L.shape[0] + 1 224 | U = UnionFind(N) 225 | 226 | for index in range(L.shape[0]): 227 | 228 | a = L[index, 0] 229 | b = L[index, 1] 230 | delta = L[index, 2] 231 | 232 | aa, bb = U.fast_find(a), U.fast_find(b) 233 | 234 | result[index][0] = aa 235 | result[index][1] = bb 236 | result[index][2] = delta 237 | result[index][3] = U.size[aa] + U.size[bb] 238 | 239 | U.union(aa, bb) 240 | 241 | return result_arr 242 | 243 | 244 | cpdef np.ndarray[np.double_t, ndim=2] single_linkage(distance_matrix): 245 | 246 | cdef np.ndarray[np.double_t, ndim=2] hierarchy 247 | cdef np.ndarray[np.double_t, ndim=2] for_labelling 248 | 249 | hierarchy = mst_linkage_core(distance_matrix) 250 | for_labelling = hierarchy[np.argsort(hierarchy.T[2]), :] 251 | 252 | return label(for_labelling) 253 | -------------------------------------------------------------------------------- /hdbscan/_hdbscan_reachability.pyx: -------------------------------------------------------------------------------- 1 | # cython: boundscheck=False 2 | # cython: nonecheck=False 3 | # cython: initializedcheck=False 4 | # mutual reachability distance compiutations 5 | # Authors: Leland McInnes 6 | # License: 3-clause BSD 7 | 8 | import numpy as np 9 | cimport numpy as np 10 | 11 | from scipy.spatial.distance import pdist, squareform 12 | from scipy.sparse import lil_matrix as sparse_matrix 13 | from sklearn.neighbors import KDTree, BallTree 14 | import gc 15 | 16 | 17 | def mutual_reachability(distance_matrix, min_points=5, alpha=1.0): 18 | """Compute the weighted adjacency matrix of the mutual reachability 19 | graph of a distance matrix. 20 | 21 | Parameters 22 | ---------- 23 | distance_matrix : ndarray, shape (n_samples, n_samples) 24 | Array of distances between samples. 25 | 26 | min_points : int, optional (default=5) 27 | The number of points in a neighbourhood for a point to be considered 28 | a core point. 29 | 30 | Returns 31 | ------- 32 | mututal_reachability: ndarray, shape (n_samples, n_samples) 33 | Weighted adjacency matrix of the mutual reachability graph. 34 | 35 | References 36 | ---------- 37 | .. [1] Campello, R. J., Moulavi, D., & Sander, J. (2013, April). 38 | Density-based clustering based on hierarchical density estimates. 39 | In Pacific-Asia Conference on Knowledge Discovery and Data Mining 40 | (pp. 160-172). Springer Berlin Heidelberg. 
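Examples
--------
A minimal usage sketch (illustrative only; ``X`` is random toy data). Each entry of
the result is ``max(core_i, core_j, D[i, j])``, where the core distance is roughly
the distance to the ``min_points``-th nearest neighbour.

>>> import numpy as np
>>> from scipy.spatial.distance import pdist, squareform
>>> X = np.random.rand(10, 2)
>>> D = squareform(pdist(X))
>>> mutual_reachability(D, min_points=3).shape
(10, 10)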
41 | """ 42 | size = distance_matrix.shape[0] 43 | min_points = min(size - 1, min_points) 44 | try: 45 | core_distances = np.partition(distance_matrix, 46 | min_points, 47 | axis=0)[min_points] 48 | except AttributeError: 49 | core_distances = np.sort(distance_matrix, 50 | axis=0)[min_points] 51 | 52 | if alpha != 1.0: 53 | distance_matrix = distance_matrix / alpha 54 | 55 | stage1 = np.where(core_distances > distance_matrix, 56 | core_distances, distance_matrix) 57 | result = np.where(core_distances > stage1.T, 58 | core_distances.T, stage1.T).T 59 | return result 60 | 61 | 62 | cpdef sparse_mutual_reachability(object lil_matrix, np.intp_t min_points=5, 63 | float alpha=1.0, float max_dist=0.): 64 | 65 | cdef np.intp_t i 66 | cdef np.intp_t j 67 | cdef np.intp_t n 68 | cdef np.double_t mr_dist 69 | cdef list sorted_row_data 70 | cdef np.ndarray[dtype=np.double_t, ndim=1] core_distance 71 | cdef np.ndarray[dtype=np.int32_t, ndim=1] nz_row_data 72 | cdef np.ndarray[dtype=np.int32_t, ndim=1] nz_col_data 73 | 74 | result = sparse_matrix(lil_matrix.shape) 75 | core_distance = np.empty(lil_matrix.shape[0], dtype=np.double) 76 | 77 | for i in range(lil_matrix.shape[0]): 78 | sorted_row_data = sorted(lil_matrix.data[i]) 79 | if min_points - 1 < len(sorted_row_data): 80 | core_distance[i] = sorted_row_data[min_points - 1] 81 | else: 82 | core_distance[i] = np.inf 83 | 84 | if alpha != 1.0: 85 | lil_matrix = lil_matrix / alpha 86 | 87 | nz_row_data, nz_col_data = lil_matrix.nonzero() 88 | 89 | for n in range(nz_row_data.shape[0]): 90 | i = nz_row_data[n] 91 | j = nz_col_data[n] 92 | 93 | mr_dist = max(core_distance[i], core_distance[j], lil_matrix[i, j]) 94 | if np.isfinite(mr_dist): 95 | result[i, j] = mr_dist 96 | elif max_dist > 0: 97 | result[i, j] = max_dist 98 | 99 | return result.tocsr() 100 | 101 | 102 | def kdtree_mutual_reachability(X, distance_matrix, metric, p=2, min_points=5, 103 | alpha=1.0, **kwargs): 104 | dim = distance_matrix.shape[0] 105 | min_points = min(dim - 1, min_points) 106 | 107 | if metric == 'minkowski': 108 | tree = KDTree(X, metric=metric, p=p) 109 | else: 110 | tree = KDTree(X, metric=metric, **kwargs) 111 | 112 | core_distances = tree.query(X, k=min_points)[0][:, -1] 113 | 114 | if alpha != 1.0: 115 | distance_matrix = distance_matrix / alpha 116 | 117 | stage1 = np.where(core_distances > distance_matrix, 118 | core_distances, distance_matrix) 119 | result = np.where(core_distances > stage1.T, 120 | core_distances.T, stage1.T).T 121 | return result 122 | 123 | 124 | def balltree_mutual_reachability(X, distance_matrix, metric, p=2, min_points=5, 125 | alpha=1.0, **kwargs): 126 | dim = distance_matrix.shape[0] 127 | min_points = min(dim - 1, min_points) 128 | 129 | tree = BallTree(X, metric=metric, **kwargs) 130 | 131 | core_distances = tree.query(X, k=min_points)[0][:, -1] 132 | 133 | if alpha != 1.0: 134 | distance_matrix = distance_matrix / alpha 135 | 136 | stage1 = np.where(core_distances > distance_matrix, 137 | core_distances, distance_matrix) 138 | result = np.where(core_distances > stage1.T, 139 | core_distances.T, stage1.T).T 140 | return result 141 | 142 | 143 | cdef np.ndarray[np.double_t, ndim=1] mutual_reachability_from_pdist( 144 | np.ndarray[np.double_t, ndim=1] core_distances, 145 | np.ndarray[np.double_t, ndim=1] dists, np.intp_t dim): 146 | 147 | cdef np.intp_t i 148 | cdef np.intp_t j 149 | cdef np.intp_t result_pos 150 | 151 | result_pos = 0 152 | for i in range(dim): 153 | for j in range(i + 1, dim): 154 | if core_distances[i] > 
core_distances[j]: 155 | if core_distances[i] > dists[result_pos]: 156 | dists[result_pos] = core_distances[i] 157 | 158 | else: 159 | if core_distances[j] > dists[result_pos]: 160 | dists[result_pos] = core_distances[j] 161 | 162 | result_pos += 1 163 | 164 | return dists 165 | 166 | 167 | def kdtree_pdist_mutual_reachability(X, metric, p=2, min_points=5, alpha=1.0, 168 | **kwargs): 169 | 170 | dim = X.shape[0] 171 | min_points = min(dim - 1, min_points) 172 | 173 | if metric == 'minkowski': 174 | tree = KDTree(X, metric=metric, p=p) 175 | else: 176 | tree = KDTree(X, metric=metric, **kwargs) 177 | 178 | core_distances = tree.query(X, k=min_points)[0][:, -1] 179 | 180 | del tree 181 | gc.collect() 182 | 183 | dists = pdist(X, metric=metric, p=p, **kwargs) 184 | 185 | if alpha != 1.0: 186 | dists /= alpha 187 | 188 | dists = mutual_reachability_from_pdist(core_distances, dists, dim) 189 | 190 | return dists 191 | 192 | 193 | def balltree_pdist_mutual_reachability(X, metric, p=2, min_points=5, alpha=1.0, 194 | **kwargs): 195 | 196 | dim = X.shape[0] 197 | min_points = min(dim - 1, min_points) 198 | 199 | tree = BallTree(X, metric=metric, **kwargs) 200 | 201 | core_distances = tree.query(X, k=min_points)[0][:, -1] 202 | 203 | del tree 204 | gc.collect() 205 | 206 | dists = pdist(X, metric=metric, p=p, **kwargs) 207 | 208 | if alpha != 1.0: 209 | dists /= alpha 210 | 211 | dists = mutual_reachability_from_pdist(core_distances, dists, dim) 212 | 213 | return dists 214 | -------------------------------------------------------------------------------- /hdbscan/branch_data.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.neighbors import KDTree, BallTree 3 | from .dist_metrics import DistanceMetric 4 | 5 | 6 | class BranchDetectionData(object): 7 | """Input data for branch detection functionality. 8 | 9 | Recreates and caches internal data structures from the clustering stage. 10 | 11 | Parameters 12 | ---------- 13 | 14 | data : array (n_samples, n_features) 15 | The original data set that was clustered. 16 | 17 | labels : array (n_samples) 18 | The cluster labels for every point in the data set. 19 | 20 | condensed_tree : array (n_points + n_merges, 4) 21 | The condensed tree produced during clustering, used to extract outliers. 22 | 23 | min_samples : int 24 | The min_samples value used in clustering. 25 | 26 | tree_type : string, optional 27 | Which type of space tree to use for core distance computation. 28 | One of: 29 | * ``kdtree`` 30 | * ``balltree`` 31 | 32 | metric : string, optional 33 | The metric used to determine distance for the clustering. 34 | This is the metric that will be used for the space tree to determine 35 | core distances etc. 36 | 37 | **kwargs : 38 | Any further arguments to the metric. 39 | 40 | Attributes 41 | ---------- 42 | 43 | all_finite : bool 44 | Whether the data set contains any infinite or NaN values. 45 | 46 | finite_index : array (n_samples) 47 | The indices of the finite data points in the original data set. 48 | 49 | internal_to_raw : dict 50 | A mapping from the finite data set indices to the original data set. 51 | 52 | tree : KDTree or BallTree 53 | A space partitioning tree that can be queried for nearest neighbors if 54 | the metric is supported by a KDTree or BallTree. 55 | 56 | neighbors : array (n_samples, min_samples) 57 | The nearest neighbor for every non-noise point in the original data set. 
58 | 59 | core_distances : array (n_samples) 60 | The core distance for every non-noise point in the original data set. 61 | 62 | dist_metric : callable 63 | Accelerated distance metric function. 64 | """ 65 | 66 | _tree_type_map = {"kdtree": KDTree, "balltree": BallTree} 67 | 68 | def __init__( 69 | self, 70 | data, 71 | labels, 72 | condensed_tree, 73 | min_samples, 74 | tree_type="kdtree", 75 | metric="euclidean", 76 | **kwargs, 77 | ): 78 | clean_data = data.astype(np.float64) 79 | last_outlier = np.searchsorted(condensed_tree["lambda_val"], 0.0, side="right") 80 | if last_outlier == 0: 81 | self.all_finite = True 82 | self.internal_to_raw = None 83 | self.finite_index = None 84 | else: 85 | self.all_finite = False 86 | self.finite_index = np.setdiff1d( 87 | np.arange(data.shape[0]), 88 | condensed_tree["child"][:last_outlier] 89 | ) 90 | labels = labels[self.finite_index] 91 | clean_data = clean_data[self.finite_index] 92 | self.internal_to_raw = { 93 | x: y for x, y in enumerate(self.finite_index) 94 | } 95 | 96 | # Construct tree 97 | self.tree = self._tree_type_map[tree_type](clean_data, metric=metric, **kwargs) 98 | self.dist_metric = DistanceMetric.get_metric(metric, **kwargs) 99 | 100 | # Allocate to maintain data point indices 101 | self.core_distances = np.full(clean_data.shape[0], np.nan) 102 | self.neighbors = np.full((clean_data.shape[0], min_samples), -1, dtype=np.int64) 103 | 104 | # Find neighbors for non-noise points 105 | noise_mask = labels != -1 106 | if noise_mask.any(): 107 | distances, self.neighbors[noise_mask, :] = self.tree.query( 108 | clean_data[noise_mask], k=min_samples 109 | ) 110 | self.core_distances[noise_mask] = distances[:, -1] 111 | 112 | -------------------------------------------------------------------------------- /hdbscan/dist_metrics.pxd: -------------------------------------------------------------------------------- 1 | #!python 2 | #cython: boundscheck=False 3 | #cython: wraparound=False 4 | #cython: cdivision=True 5 | 6 | import cython 7 | cimport cython 8 | 9 | import numpy as np 10 | cimport numpy as np 11 | 12 | from libc.math cimport fabs, sqrt, exp, cos, pow 13 | 14 | ctypedef np.double_t DTYPE_t 15 | ctypedef np.intp_t ITYPE_t 16 | 17 | cdef enum: 18 | DTYPECODE = np.NPY_FLOAT64 19 | ITYPECODE = np.NPY_INTP 20 | 21 | # Fused type for certain operations 22 | ctypedef fused DITYPE_t: 23 | ITYPE_t 24 | DTYPE_t 25 | 26 | ITYPE = np.intp 27 | 28 | DTYPE = np.double 29 | 30 | ###################################################################### 31 | # Inline distance functions 32 | # 33 | # We use these for the default (euclidean) case so that they can be 34 | # inlined. 
This leads to faster computation for the most common case 35 | cdef inline DTYPE_t euclidean_dist(DTYPE_t* x1, DTYPE_t* x2, 36 | ITYPE_t size) nogil except -1: 37 | cdef DTYPE_t tmp, d=0 38 | cdef np.intp_t j 39 | for j in range(size): 40 | tmp = x1[j] - x2[j] 41 | d += tmp * tmp 42 | return sqrt(d) 43 | 44 | 45 | cdef inline DTYPE_t euclidean_rdist(DTYPE_t* x1, DTYPE_t* x2, 46 | ITYPE_t size) nogil except -1: 47 | cdef DTYPE_t tmp, d=0 48 | cdef np.intp_t j 49 | for j in range(size): 50 | tmp = x1[j] - x2[j] 51 | d += tmp * tmp 52 | return d 53 | 54 | 55 | cdef inline DTYPE_t euclidean_dist_to_rdist(DTYPE_t dist) nogil except -1: 56 | return dist * dist 57 | 58 | 59 | cdef inline DTYPE_t euclidean_rdist_to_dist(DTYPE_t dist) except -1: 60 | return sqrt(dist) 61 | 62 | 63 | ###################################################################### 64 | # DistanceMetric base class 65 | cdef class DistanceMetric: 66 | # The following attributes are required for a few of the subclasses. 67 | # we must define them here so that cython's limited polymorphism will work. 68 | # Because we don't expect to instantiate a lot of these objects, the 69 | # extra memory overhead of this setup should not be an issue. 70 | cdef DTYPE_t p 71 | #cdef DTYPE_t[::1] vec 72 | #cdef DTYPE_t[:, ::1] mat 73 | cdef np.ndarray vec 74 | cdef np.ndarray mat 75 | cdef DTYPE_t* vec_ptr 76 | cdef DTYPE_t* mat_ptr 77 | cdef ITYPE_t size 78 | cdef object func 79 | cdef object kwargs 80 | 81 | cdef DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, 82 | ITYPE_t size) nogil except -1 83 | 84 | cdef DTYPE_t rdist(self, DTYPE_t* x1, DTYPE_t* x2, 85 | ITYPE_t size) nogil except -1 86 | 87 | cdef int pdist(self, DTYPE_t[:, ::1] X, DTYPE_t[:, ::1] D) except -1 88 | 89 | cdef int cdist(self, DTYPE_t[:, ::1] X, DTYPE_t[:, ::1] Y, 90 | DTYPE_t[:, ::1] D) except -1 91 | 92 | cdef DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) except -1 93 | 94 | cdef DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1 95 | -------------------------------------------------------------------------------- /hdbscan/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/hdbscan/tests/__init__.py -------------------------------------------------------------------------------- /hdbscan/tests/test_prediction_utils.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from hdbscan._prediction_utils import safe_always_positive_division 4 | 5 | 6 | @pytest.mark.parametrize('denominator', [-1, 0, 1]) 7 | def test_safe_always_positive_division(denominator): 8 | numerator = 1 9 | # Given negative, zero and positive denominator and positive numerator 10 | value = safe_always_positive_division(numerator, 0) 11 | # Make sure safe division is always positive and doesn't raise ZeroDivision error 12 | assert value >= 0 13 | -------------------------------------------------------------------------------- /hdbscan/tests/test_rsl.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests for Robust Single Linkage clustering algorithm 3 | """ 4 | # import pickle 5 | import numpy as np 6 | from scipy.spatial import distance 7 | from scipy import sparse 8 | from sklearn.utils.estimator_checks import check_estimator 9 | from hdbscan import RobustSingleLinkage, robust_single_linkage 10 | 11 | # from sklearn.cluster.tests.common import 
generate_clustered_data 12 | 13 | from sklearn import datasets 14 | import warnings 15 | 16 | from sklearn.datasets import make_blobs 17 | from sklearn.utils import shuffle 18 | from sklearn.preprocessing import StandardScaler 19 | 20 | import pytest 21 | 22 | n_clusters = 3 23 | X, y = make_blobs(n_samples=50, random_state=1) 24 | X, y = shuffle(X, y, random_state=7) 25 | X = StandardScaler().fit_transform(X) 26 | # X = generate_clustered_data(n_clusters=n_clusters, n_samples_per_cluster=50) 27 | 28 | def test_rsl_distance_matrix(): 29 | D = distance.squareform(distance.pdist(X)) 30 | D /= np.max(D) 31 | 32 | labels, tree = robust_single_linkage(D, 0.4, metric='precomputed') 33 | # number of clusters, ignoring noise if present 34 | n_clusters_1 = len(set(labels)) - int(-1 in labels) # ignore noise 35 | assert(n_clusters_1 == 2) 36 | 37 | labels = RobustSingleLinkage(metric="precomputed").fit(D).labels_ 38 | n_clusters_2 = len(set(labels)) - int(-1 in labels) 39 | assert(n_clusters_2 == 2) 40 | 41 | 42 | def test_rsl_feature_vector(): 43 | labels, tree = robust_single_linkage(X, 0.4) 44 | n_clusters_1 = len(set(labels)) - int(-1 in labels) 45 | assert(n_clusters_1 == n_clusters) 46 | 47 | labels = RobustSingleLinkage().fit(X).labels_ 48 | n_clusters_2 = len(set(labels)) - int(-1 in labels) 49 | assert(n_clusters_2 == n_clusters) 50 | 51 | 52 | def test_rsl_callable_metric(): 53 | # metric is the function reference, not the string key. 54 | metric = distance.euclidean 55 | 56 | labels, tree = robust_single_linkage(X, 0.4, metric=metric) 57 | n_clusters_1 = len(set(labels)) - int(-1 in labels) 58 | assert(n_clusters_1 == n_clusters) 59 | 60 | labels = RobustSingleLinkage(metric=metric).fit(X).labels_ 61 | n_clusters_2 = len(set(labels)) - int(-1 in labels) 62 | assert(n_clusters_2 == n_clusters) 63 | 64 | 65 | def test_rsl_input_lists(): 66 | X = [[1., 2.], [3., 4.]] 67 | RobustSingleLinkage().fit(X) # must not raise exception 68 | 69 | 70 | def test_rsl_boruvka_balltree(): 71 | labels, tree = robust_single_linkage(X, 0.45, algorithm='boruvka_balltree') 72 | n_clusters_1 = len(set(labels)) - int(-1 in labels) 73 | assert(n_clusters_1 == n_clusters) 74 | 75 | labels = RobustSingleLinkage(cut=0.45, 76 | algorithm='boruvka_balltree').fit(X).labels_ 77 | n_clusters_2 = len(set(labels)) - int(-1 in labels) 78 | assert(n_clusters_2 == n_clusters) 79 | 80 | 81 | def test_rsl_prims_balltree(): 82 | labels, tree = robust_single_linkage(X, 0.4, algorithm='prims_balltree') 83 | n_clusters_1 = len(set(labels)) - int(-1 in labels) 84 | assert(n_clusters_1 == n_clusters) 85 | 86 | labels = RobustSingleLinkage(algorithm='prims_balltree').fit(X).labels_ 87 | n_clusters_2 = len(set(labels)) - int(-1 in labels) 88 | assert(n_clusters_2 == n_clusters) 89 | 90 | 91 | def test_rsl_prims_kdtree(): 92 | labels, tree = robust_single_linkage(X, 0.4, algorithm='prims_kdtree') 93 | n_clusters_1 = len(set(labels)) - int(-1 in labels) 94 | assert(n_clusters_1 == n_clusters) 95 | 96 | labels = RobustSingleLinkage(algorithm='prims_kdtree').fit(X).labels_ 97 | n_clusters_2 = len(set(labels)) - int(-1 in labels) 98 | assert(n_clusters_2 == n_clusters) 99 | 100 | 101 | # def test_rsl_unavailable_hierarchy(): 102 | # clusterer = RobustSingleLinkage() 103 | # with warnings.catch_warnings(record=True) as w: 104 | # tree = clusterer.cluster_hierarchy_ 105 | # assert len(w) > 0 106 | # assert tree is None 107 | 108 | 109 | def test_rsl_hierarchy(): 110 | clusterer = RobustSingleLinkage().fit(X) 111 | assert 
clusterer.cluster_hierarchy_ is not None 112 | 113 | 114 | def test_rsl_high_dimensional(): 115 | H, y = make_blobs(n_samples=50, random_state=0, n_features=64) 116 | # H, y = shuffle(X, y, random_state=7) 117 | H = StandardScaler().fit_transform(H) 118 | labels, tree = robust_single_linkage(H, 5.5) 119 | n_clusters_1 = len(set(labels)) - int(-1 in labels) 120 | assert(n_clusters_1 == n_clusters) 121 | 122 | labels = RobustSingleLinkage(cut=5.5, algorithm='best', 123 | metric='seuclidean', 124 | metric_params={'V': np.ones(H.shape[1])}).fit(H).labels_ 125 | n_clusters_2 = len(set(labels)) - int(-1 in labels) 126 | assert(n_clusters_2 == n_clusters) 127 | 128 | 129 | def test_rsl_badargs(): 130 | with pytest.raises(ValueError): 131 | robust_single_linkage('fail', 0.4) 132 | with pytest.raises(ValueError): 133 | robust_single_linkage(None, 0.4) 134 | with pytest.raises(ValueError): 135 | robust_single_linkage(X, 0.4, k='fail') 136 | with pytest.raises(ValueError): 137 | robust_single_linkage(X, 0.4, k=-1) 138 | with pytest.raises(ValueError): 139 | robust_single_linkage(X, 0.4, metric='imperial') 140 | with pytest.raises(ValueError): 141 | robust_single_linkage(X, 0.4, metric=None) 142 | with pytest.raises(ValueError): 143 | robust_single_linkage(X, 0.4, metric='minkowski', p=-1) 144 | with pytest.raises(ValueError): 145 | robust_single_linkage(X, 0.4, metric='minkowski', p=-1, algorithm='prims_kdtree') 146 | with pytest.raises(ValueError): 147 | robust_single_linkage(X, 0.4, metric='minkowski', p=-1, algorithm='prims_balltree') 148 | with pytest.raises(ValueError): 149 | robust_single_linkage(X, 0.4, metric='minkowski', p=-1, algorithm='boruvka_balltree') 150 | with pytest.raises(ValueError): 151 | robust_single_linkage(X, 0.4, metric='precomputed', algorithm='boruvka_kdtree') 152 | with pytest.raises(ValueError): 153 | robust_single_linkage(X, 0.4, metric='precomputed', algorithm='prims_kdtree') 154 | with pytest.raises(ValueError): 155 | robust_single_linkage(X, 0.4, metric='precomputed', algorithm='prims_balltree') 156 | with pytest.raises(ValueError): 157 | robust_single_linkage(X, 0.4, metric='precomputed', algorithm='boruvka_balltree') 158 | with pytest.raises(ValueError): 159 | robust_single_linkage(X, 0.4, alpha=-1) 160 | with pytest.raises(ValueError): 161 | robust_single_linkage(X, 0.4, alpha='fail') 162 | with pytest.raises(Exception): 163 | robust_single_linkage(X, 0.4, algorithm='something_else') 164 | with pytest.raises(TypeError): 165 | robust_single_linkage(X, 0.4, metric='minkowski', p=None) 166 | with pytest.raises(ValueError): 167 | robust_single_linkage(X, 0.4, leaf_size=0) 168 | with pytest.raises(ValueError): 169 | robust_single_linkage(X, 0.4, gamma=0) 170 | 171 | 172 | # Disable for now -- need to refactor to meet newer standards 173 | @pytest.mark.skip(reason="need to refactor to meet newer standards") 174 | def test_rsl_is_sklearn_estimator(): 175 | check_estimator(RobustSingleLinkage) 176 | -------------------------------------------------------------------------------- /notebooks/clusterable_data.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/notebooks/clusterable_data.npy -------------------------------------------------------------------------------- /notebooks/hdbscan01_timings.csv: -------------------------------------------------------------------------------- 1 | 2,2000,0.203334093094 2 | 2,4000,0.259212017059 3 | 
2,6000,0.530183076859 4 | 2,8000,0.928155183792 5 | 2,10000,1.33956003189 6 | 2,12000,2.02227687836 7 | 2,14000,2.74701404572 8 | 2,16000,3.63934803009 9 | 2,18000,4.60029006004 10 | 2,20000,6.09813690186 11 | 2,22000,10.7129349709 12 | 2,24000,9.08472108841 13 | 2,26000,15.8526310921 14 | 2,28000,19.4355289936 15 | 2,30000,24.5378270149 16 | 2,32000,30.3819289207 17 | 5,2000,0.21369099617 18 | 5,4000,0.255190849304 19 | 5,6000,0.527250051498 20 | 5,8000,0.93247294426 21 | 5,10000,1.47298002243 22 | 5,12000,2.07997608185 23 | 5,14000,2.84801912308 24 | 5,16000,3.78576898575 25 | 5,18000,4.60007095337 26 | 5,20000,5.82796311378 27 | 5,22000,7.35501813889 28 | 5,24000,8.69181203842 29 | 5,26000,10.3049359322 30 | 5,28000,12.5369310379 31 | 5,30000,28.7729370594 32 | 5,32000,29.6381349564 33 | 10,2000,0.174388170242 34 | 10,4000,0.296141147614 35 | 10,6000,0.662806987762 36 | 10,8000,1.17675209045 37 | 10,10000,1.79025316238 38 | 10,12000,2.48112487793 39 | 10,14000,3.44052696228 40 | 10,16000,4.44019889832 41 | 10,18000,5.61963176727 42 | 10,20000,7.39718699455 43 | 10,22000,8.64890098572 44 | 10,24000,10.4458150864 45 | 10,26000,12.8114190102 46 | 10,28000,20.3707690239 47 | 10,30000,29.7545838356 48 | 10,32000,34.2230820656 49 | 25,2000,0.198121070862 50 | 25,4000,0.452563047409 51 | 25,6000,0.94957280159 52 | 25,8000,1.62946105003 53 | 25,10000,2.49307203293 54 | 25,12000,3.63441205025 55 | 25,14000,4.78342199326 56 | 25,16000,6.30564498901 57 | 25,18000,8.03539299965 58 | 25,20000,10.3152740002 59 | 25,22000,12.7070331573 60 | 25,24000,15.693295002 61 | 25,26000,18.6774010658 62 | 25,28000,28.0319800377 63 | 25,30000,35.5377750397 64 | 25,32000,43.5508480072 65 | 50,2000,0.241183042526 66 | 50,4000,0.691927909851 67 | 50,6000,1.46878409386 68 | 50,8000,2.71946191788 69 | 50,10000,3.89164805412 70 | 50,12000,5.76127791405 71 | 50,14000,8.03004384041 72 | 50,16000,10.2894189358 73 | 50,18000,13.2365300655 74 | 50,20000,16.5973930359 75 | 50,22000,19.8884520531 76 | 50,24000,23.8139870167 77 | 50,26000,28.6661889553 78 | 50,28000,38.4153680801 79 | 50,30000,49.254393816 80 | 50,32000,58.0542850494 81 | -------------------------------------------------------------------------------- /notebooks/hdbscan02_timings.csv: -------------------------------------------------------------------------------- 1 | 2,2000,0.190771818161 2 | 2,4000,0.33536696434 3 | 2,6000,0.475166797638 4 | 2,8000,0.830126047134 5 | 2,10000,1.21801495552 6 | 2,12000,1.66791892052 7 | 2,14000,2.25732898712 8 | 2,16000,2.97524309158 9 | 2,18000,3.75251483917 10 | 2,20000,4.78878498077 11 | 2,22000,5.71841812134 12 | 2,24000,6.86345005035 13 | 2,26000,8.4248509407 14 | 2,28000,10.5936911106 15 | 2,30000,12.250483036 16 | 2,32000,14.2500619888 17 | 5,2000,0.165930986404 18 | 5,4000,0.25049495697 19 | 5,6000,0.505705833435 20 | 5,8000,0.85303401947 21 | 5,10000,1.30479001999 22 | 5,12000,1.78360509872 23 | 5,14000,2.37719798088 24 | 5,16000,3.19220519066 25 | 5,18000,4.0063521862 26 | 5,20000,5.10847592354 27 | 5,22000,6.15350604057 28 | 5,24000,7.8016500473 29 | 5,26000,9.36254882812 30 | 5,28000,10.940628767 31 | 5,30000,13.0416350365 32 | 5,32000,15.0905759335 33 | 10,2000,0.171450138092 34 | 10,4000,0.306551933289 35 | 10,6000,0.609230041504 36 | 10,8000,1.01101207733 37 | 10,10000,1.56092309952 38 | 10,12000,2.25636100769 39 | 10,14000,3.02007102966 40 | 10,16000,3.85052204132 41 | 10,18000,4.90771794319 42 | 10,20000,6.28313612938 43 | 10,22000,7.84088993073 44 | 10,24000,9.35490894318 45 | 10,26000,11.2061488628 46 | 
10,28000,13.258589983 47 | 10,30000,15.8290801048 48 | 10,32000,18.140255928 49 | 25,2000,0.187772035599 50 | 25,4000,0.422642946243 51 | 25,6000,0.917279958725 52 | 25,8000,1.49317598343 53 | 25,10000,2.3160700798 54 | 25,12000,3.33820199966 55 | 25,14000,4.4094080925 56 | 25,16000,5.88487386703 57 | 25,18000,7.52313017845 58 | 25,20000,9.37871217728 59 | 25,22000,11.7811200619 60 | 25,24000,14.447204113 61 | 25,26000,17.3661310673 62 | 25,28000,20.1399390697 63 | 25,30000,24.2563328743 64 | 25,32000,28.605463028 65 | 50,2000,0.230389118195 66 | 50,4000,0.681818008423 67 | 50,6000,1.39964485168 68 | 50,8000,2.48313784599 69 | 50,10000,3.77135896683 70 | 50,12000,5.48401618004 71 | 50,14000,7.19847917557 72 | 50,16000,9.64172506332 73 | 50,18000,12.4206252098 74 | 50,20000,15.4045789242 75 | 50,22000,18.8578879833 76 | 50,24000,22.6411821842 77 | 50,26000,26.6900000572 78 | 50,28000,31.2701971531 79 | 50,30000,36.5198609829 80 | 50,32000,41.7656099796 81 | -------------------------------------------------------------------------------- /notebooks/hdbscan03_timings.csv: -------------------------------------------------------------------------------- 1 | 2,4000,0.254909992218 2 | 2,8000,0.781009912491 3 | 2,12000,1.65578794479 4 | 2,16000,2.86548995972 5 | 2,20000,4.5723490715 6 | 2,24000,7.35976219177 7 | 2,28000,10.392701149 8 | 2,32000,9.43943691254 9 | 2,36000,11.3052511215 10 | 2,40000,13.9955811501 11 | 2,44000,18.7241039276 12 | 2,48000,20.6580238342 13 | 2,52000,24.4679880142 14 | 2,56000,29.1394848824 15 | 2,60000,34.244658947 16 | 2,64000,39.4027280807 17 | 5,4000,0.25834608078 18 | 5,8000,0.854709863663 19 | 5,12000,1.76500201225 20 | 5,16000,3.11302685738 21 | 5,20000,5.05285406113 22 | 5,24000,7.59221887589 23 | 5,28000,11.0022101402 24 | 5,32000,11.0250749588 25 | 5,36000,14.1674640179 26 | 5,40000,17.6738820076 27 | 5,44000,22.3881859779 28 | 5,48000,26.0163779259 29 | 5,52000,30.8282210827 30 | 5,56000,35.8936729431 31 | 5,60000,41.7060689926 32 | 5,64000,48.1323189735 33 | 10,4000,0.300674915314 34 | 10,8000,1.02144503593 35 | 10,12000,2.25444197655 36 | 10,16000,3.87991809845 37 | 10,20000,6.13427686691 38 | 10,24000,9.54126405716 39 | 10,28000,13.4590039253 40 | 10,32000,17.133865118 41 | 10,36000,21.9930670261 42 | 10,40000,27.4153258801 43 | 10,44000,33.9543378353 44 | 10,48000,40.5958509445 45 | 10,52000,47.9032700062 46 | 10,56000,57.3020319939 47 | 10,60000,65.7409169674 48 | 10,64000,74.7461779118 49 | 25,4000,0.429993152618 50 | 25,8000,1.53049278259 51 | 25,12000,3.27671718597 52 | 25,16000,5.81940603256 53 | 25,20000,9.31306195259 54 | 25,24000,14.3008999825 55 | 25,28000,20.7219820023 56 | 25,32000,35.4473462105 57 | 25,36000,44.8741598129 58 | 25,40000,55.1005539894 59 | 25,44000,66.9944300652 60 | 25,48000,78.9403419495 61 | 25,52000,92.4163110256 62 | 25,56000,107.29060483 63 | 25,60000,124.042211056 64 | 25,64000,139.81782198 65 | 50,4000,0.689707040787 66 | 50,8000,2.43957304955 67 | 50,12000,5.3949701786 68 | 50,16000,9.77388811111 69 | 50,20000,15.3528060913 70 | 50,24000,22.688354969 71 | 50,28000,31.6130321026 72 | 50,32000,60.4746580124 73 | 50,36000,76.1894528866 74 | 50,40000,93.2929999828 75 | 50,44000,111.741698027 76 | 50,48000,132.439800024 77 | 50,52000,153.971266031 78 | 50,56000,177.992291927 79 | 50,60000,204.601658106 80 | 50,64000,231.908761978 81 | -------------------------------------------------------------------------------- /notebooks/hdbscan04_timings.csv: -------------------------------------------------------------------------------- 
1 | 2,8000,0.227055072784 2 | 2,16000,0.532173156738 3 | 2,24000,0.879513025284 4 | 2,32000,1.24024891853 5 | 2,40000,1.81793093681 6 | 2,48000,2.22707700729 7 | 2,56000,2.89961886406 8 | 2,64000,3.2689011097 9 | 2,72000,3.87070393562 10 | 2,80000,6.16474890709 11 | 2,88000,6.37934803963 12 | 2,96000,8.87552189827 13 | 2,104000,8.83126091957 14 | 2,112000,10.2158279419 15 | 2,120000,12.5876441002 16 | 2,128000,13.6096761227 17 | 5,8000,0.405529975891 18 | 5,16000,1.33872485161 19 | 5,24000,2.52023291588 20 | 5,32000,3.81210708618 21 | 5,40000,4.77973794937 22 | 5,48000,7.4870300293 23 | 5,56000,7.76650905609 24 | 5,64000,8.53143310547 25 | 5,72000,11.8250510693 26 | 5,80000,14.0402071476 27 | 5,88000,16.0629730225 28 | 5,96000,19.1256659031 29 | 5,104000,19.8361799717 30 | 5,112000,20.415594101 31 | 5,120000,21.5572421551 32 | 5,128000,24.9693388939 33 | 10,8000,0.523543119431 34 | 10,16000,1.62090706825 35 | 10,24000,3.66929006577 36 | 10,32000,5.36760091782 37 | 10,40000,7.74307012558 38 | 10,48000,13.7823400497 39 | 10,56000,15.9222350121 40 | 10,64000,19.0056459904 41 | 10,72000,22.3747861385 42 | 10,80000,31.0509710312 43 | 10,88000,49.9119548798 44 | 10,96000,47.1509799957 45 | 10,104000,58.6490371227 46 | 10,112000,72.9800539017 47 | 10,120000,68.7178759575 48 | 10,128000,60.2585930824 49 | 25,8000,0.886401891708 50 | 25,16000,2.55635499954 51 | 25,24000,10.2341220379 52 | 25,32000,10.0402569771 53 | 25,40000,16.4257571697 54 | 25,48000,23.4617791176 55 | 25,56000,32.1058709621 56 | 25,64000,35.5998060703 57 | 25,72000,51.0438849926 58 | 25,80000,53.5488469601 59 | 25,88000,74.6229739189 60 | 25,96000,87.4415640831 61 | 25,104000,103.67979002 62 | 25,112000,100.422867775 63 | 25,120000,117.445795059 64 | 25,128000,127.074856043 65 | 50,8000,2.15198493004 66 | 50,16000,6.01606011391 67 | 50,24000,15.0741400719 68 | 50,32000,24.8565030098 69 | 50,40000,32.738462925 70 | 50,48000,54.6907629967 71 | 50,56000,65.1226139069 72 | 50,64000,80.4430060387 73 | 50,72000,103.5877738 74 | 50,80000,120.219110966 75 | 50,88000,171.107203007 76 | 50,96000,201.432529926 77 | 50,104000,238.729315996 78 | 50,112000,258.13277483 79 | 50,120000,285.661708117 80 | 50,128000,316.628612041 81 | -------------------------------------------------------------------------------- /notebooks/hdbscan05_timings.csv: -------------------------------------------------------------------------------- 1 | 2,8000,0.201974868774 2 | 2,16000,0.382796049118 3 | 2,24000,0.677625179291 4 | 2,32000,0.857353925705 5 | 2,40000,1.19192004204 6 | 2,48000,1.65057206154 7 | 2,56000,1.76224017143 8 | 2,64000,2.09517502785 9 | 2,72000,2.37437987328 10 | 2,80000,2.61393880844 11 | 2,88000,3.86622595787 12 | 2,96000,4.16805887222 13 | 2,104000,4.60610985756 14 | 2,112000,4.65505003929 15 | 2,120000,4.94053196907 16 | 2,128000,5.48205113411 17 | 5,8000,0.390153884888 18 | 5,16000,1.1207010746 19 | 5,24000,2.12859201431 20 | 5,32000,3.20195794106 21 | 5,40000,4.50784707069 22 | 5,48000,5.86051797867 23 | 5,56000,6.96505713463 24 | 5,64000,8.35725998878 25 | 5,72000,10.0785040855 26 | 5,80000,11.8928399086 27 | 5,88000,14.2854990959 28 | 5,96000,16.3619041443 29 | 5,104000,18.1008689404 30 | 5,112000,18.765378952 31 | 5,120000,20.262346983 32 | 5,128000,22.245456934 33 | 10,8000,0.362307071686 34 | 10,16000,1.10565090179 35 | 10,24000,2.1113088131 36 | 10,32000,3.8094599247 37 | 10,40000,5.60643601418 38 | 10,48000,8.05391407013 39 | 10,56000,12.0181820393 40 | 10,64000,14.4568071365 41 | 10,72000,17.575797081 42 | 10,80000,20.9547560215 43 
| 10,88000,28.589566946 44 | 10,96000,31.5660579205 45 | 10,104000,35.0399270058 46 | 10,112000,46.7496728897 47 | 10,120000,51.5727710724 48 | 10,128000,56.6605160236 49 | 25,8000,0.503958940506 50 | 25,16000,1.15347003937 51 | 25,24000,2.52892589569 52 | 25,32000,3.7748811245 53 | 25,40000,5.54964900017 54 | 25,48000,7.7039680481 55 | 25,56000,10.2646648884 56 | 25,64000,12.3325390816 57 | 25,72000,14.4936189651 58 | 25,80000,17.8296489716 59 | 25,88000,24.9521570206 60 | 25,96000,27.6805050373 61 | 25,104000,31.0702199936 62 | 25,112000,38.4048509598 63 | 25,120000,41.4252431393 64 | 25,128000,45.7964301109 65 | 50,8000,1.46589207649 66 | 50,16000,2.91623210907 67 | 50,24000,4.17734980583 68 | 50,32000,6.72125601768 69 | 50,40000,9.49217200279 70 | 50,48000,11.0911870003 71 | 50,56000,13.4033820629 72 | 50,64000,16.9308049679 73 | 50,72000,20.2958710194 74 | 50,80000,27.0205729008 75 | 50,88000,31.7669379711 76 | 50,96000,37.2198050022 77 | 50,104000,39.0934021473 78 | 50,112000,45.5359759331 79 | 50,120000,49.7200181484 80 | 50,128000,54.0523099899 81 | -------------------------------------------------------------------------------- /notebooks/hdbscan06_timings.csv: -------------------------------------------------------------------------------- 1 | 2,8000,0.175021886826 2 | 2,16000,0.387292146683 3 | 2,24000,0.677018880844 4 | 2,32000,0.934924125671 5 | 2,40000,1.17343378067 6 | 2,48000,1.38080406189 7 | 2,56000,1.60144400597 8 | 2,64000,1.79244303703 9 | 2,72000,2.1175339222 10 | 2,80000,2.43222498894 11 | 2,88000,2.75695896149 12 | 2,96000,3.10400700569 13 | 2,104000,3.41808009148 14 | 2,112000,3.49205112457 15 | 2,120000,3.87581586838 16 | 2,128000,4.19616699219 17 | 5,8000,0.372463941574 18 | 5,16000,1.05067205429 19 | 5,24000,1.93789100647 20 | 5,32000,2.74101495743 21 | 5,40000,3.80962181091 22 | 5,48000,4.98932695389 23 | 5,56000,5.92916297913 24 | 5,64000,7.09130311012 25 | 5,72000,8.22766804695 26 | 5,80000,9.74051809311 27 | 5,88000,11.0401978493 28 | 5,96000,12.6047639847 29 | 5,104000,14.0353701115 30 | 5,112000,14.6283960342 31 | 5,120000,16.2875649929 32 | 5,128000,17.4939930439 33 | 10,8000,0.349482059479 34 | 10,16000,1.09388589859 35 | 10,24000,1.87578415871 36 | 10,32000,3.21113491058 37 | 10,40000,4.35681700706 38 | 10,48000,6.19830203056 39 | 10,56000,9.55884099007 40 | 10,64000,11.4342520237 41 | 10,72000,13.2101860046 42 | 10,80000,16.1834290028 43 | 10,88000,20.0170080662 44 | 10,96000,22.5502281189 45 | 10,104000,24.9669640064 46 | 10,112000,35.226790905 47 | 10,120000,39.5434041023 48 | 10,128000,42.897605896 49 | 25,8000,0.444399118423 50 | 25,16000,1.209430933 51 | 25,24000,1.97230005264 52 | 25,32000,3.10147595406 53 | 25,40000,4.67809796333 54 | 25,48000,5.50237488747 55 | 25,56000,7.86162614822 56 | 25,64000,9.46203804016 57 | 25,72000,11.5571279526 58 | 25,80000,13.881565094 59 | 25,88000,16.1510570049 60 | 25,96000,18.3807759285 61 | 25,104000,20.2770631313 62 | 25,112000,25.9744091034 63 | 25,120000,28.6864550114 64 | 25,128000,31.9634900093 65 | 50,8000,1.42019295692 66 | 50,16000,2.98401212692 67 | 50,24000,3.57059788704 68 | 50,32000,5.97410511971 69 | 50,40000,7.985861063 70 | 50,48000,9.6884970665 71 | 50,56000,11.9059169292 72 | 50,64000,13.7416830063 73 | 50,72000,17.8067760468 74 | 50,80000,20.3124599457 75 | 50,88000,20.6006500721 76 | 50,96000,22.6325879097 77 | 50,104000,27.3392460346 78 | 50,112000,31.2804059982 79 | 50,120000,34.6195569038 80 | 50,128000,39.2653598785 81 | 
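
A minimal sketch for exploring the timing files above, assuming the three
unlabelled columns are the benchmark dimension, the dataset size, and the
runtime in seconds (the file name and column names here are illustrative
assumptions, not part of the repository):

    import pandas as pd
    import matplotlib.pyplot as plt

    # The CSVs ship without a header row; these column names are assumptions.
    timings = pd.read_csv("notebooks/hdbscan06_timings.csv", header=None,
                          names=["dimension", "n_samples", "seconds"])

    fig, ax = plt.subplots()
    for dim, group in timings.groupby("dimension"):
        ax.plot(group["n_samples"], group["seconds"], marker="o",
                label="dim={}".format(dim))
    ax.set(xlabel="dataset size", ylabel="runtime (seconds)")
    ax.legend()
    plt.show()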
-------------------------------------------------------------------------------- /notebooks/reference_impl_external_timings.csv: -------------------------------------------------------------------------------- 1 | 2,8000,3.59666895866 2 | 2,16000,15.2572879791 3 | 2,24000,31.3827497959 4 | 2,32000,60.9953649044 5 | 2,40000,111.264041901 6 | 2,48000,80.2624919415 7 | 2,56000,111.845596075 8 | 2,64000,157.572174072 9 | 2,72000,213.970286131 10 | 2,80000,291.316827059 11 | 2,88000,364.542631865 12 | 2,96000,330.40318799 13 | 2,104000,376.085955858 14 | 2,112000,437.023652077 15 | 2,120000,512.283486128 16 | 2,128000,639.647830963 17 | 5,8000,2.96017384529 18 | 5,16000,12.4860448837 19 | 5,24000,24.3062229156 20 | 5,32000,27.3480169773 21 | 5,40000,57.2987709045 22 | 5,48000,100.169524908 23 | 5,56000,79.1349971294 24 | 5,64000,124.066302061 25 | 5,72000,185.705877066 26 | 5,80000,266.771252155 27 | 5,88000,344.634408951 28 | 5,96000,437.551882982 29 | 5,104000,446.130121946 30 | 5,112000,365.777822018 31 | 5,120000,447.037277937 32 | 5,128000,591.354615211 33 | 10,8000,3.74887800217 34 | 10,16000,9.18430614471 35 | 10,24000,30.3249309063 36 | 10,32000,33.4931271076 37 | 10,40000,78.0882520676 38 | 10,48000,91.3173689842 39 | 10,56000,200.770553112 40 | 10,64000,158.011397839 41 | 10,72000,241.757611036 42 | 10,80000,323.283601046 43 | 10,88000,342.906905174 44 | 10,96000,354.992150068 45 | 10,104000,435.243753195 46 | 10,112000,547.999858856 47 | 10,120000,687.23850894 48 | 10,128000,572.590743065 49 | 25,8000,3.80018186569 50 | 25,16000,18.4901921749 51 | 25,24000,33.0604710579 52 | 25,32000,90.8991298676 53 | 25,40000,110.421215057 54 | 25,48000,153.691064119 55 | 25,56000,236.893220901 56 | 25,64000,371.323115826 57 | 25,72000,413.138042927 58 | 25,80000,580.538727999 59 | 25,88000,492.039662123 60 | 25,96000,665.976908922 61 | 25,104000,879.488523006 62 | 25,112000,946.649399996 63 | 25,120000,1354.74109793 64 | 25,128000,1628.48575211 65 | 50,8000,7.23535704613 66 | 50,16000,35.2021028996 67 | 50,24000,69.9486300945 68 | 50,32000,146.289216995 69 | 50,40000,234.030052185 70 | 50,48000,305.608191013 71 | 50,56000,423.300146103 72 | 50,64000,642.593301058 73 | 50,72000,703.198181152 74 | 50,80000,885.244357109 75 | 50,88000,1099.00257683 76 | 50,96000,1249.79146123 77 | 50,104000,1456.11673903 78 | 50,112000,1785.89922595 79 | 50,120000,2121.75022507 80 | 50,128000,2446.19570708 81 | -------------------------------------------------------------------------------- /notebooks/reference_impl_internal_timings.csv: -------------------------------------------------------------------------------- 1 | ,,calculate MST,compute core distances,compute hierarchy and cluster tree,compute outlier scores,find flat result,runtime 2 | 2,8000,624,622,1492,22,243,3060 3 | 2,16000,3422,4744,5711,40,542,14514 4 | 2,24000,8290,13080,9732,56,12,31246 5 | 2,32000,18030,24890,16561,90,1192,60865 6 | 2,40000,35571,50032,25340,71,20,111135 7 | 2,48000,20298,15773,38696,118,5086,80123 8 | 2,56000,30316,24342,50220,99,6582,111702 9 | 2,64000,42993,41946,63860,102,8401,157433 10 | 2,72000,61349,62803,78890,136,10489,213827 11 | 2,80000,86100,87121,104851,154,12777,291163 12 | 2,88000,107276,119212,121918,182,15636,364407 13 | 2,96000,80565,88392,142386,122,18594,330248 14 | 2,104000,99502,85378,168434,171,22258,375928 15 | 2,112000,118551,90896,199844,226,27171,436855 16 | 2,120000,147139,121902,210340,158,32281,512020 17 | 2,128000,182831,158954,260799,228,36374,639384 18 | 5,8000,803,907,640,24,17,2468 19 | 
5,16000,4025,6237,1609,37,19,12025 20 | 5,24000,5345,15126,3249,60,26,23952 21 | 5,32000,10404,10845,5548,77,29,27043 22 | 5,40000,22235,27360,7303,84,33,57168 23 | 5,48000,39939,49988,9814,93,39,100041 24 | 5,56000,33073,30384,15212,107,38,78986 25 | 5,64000,52653,53860,17047,127,46,123917 26 | 5,72000,77242,84245,23702,118,36,185555 27 | 5,80000,108770,125721,31750,140,34,266622 28 | 5,88000,140356,170121,33576,149,53,344486 29 | 5,96000,186953,203925,46083,185,40,437405 30 | 5,104000,146302,248246,50781,353,73,445989 31 | 5,112000,159718,150778,54602,175,53,365611 32 | 5,120000,194952,180324,71056,226,69,446893 33 | 5,128000,250133,255315,85176,241,46,591201 34 | 10,8000,1074,1763,373,28,14,3347 35 | 10,16000,3615,3858,1245,50,24,8914 36 | 10,24000,10598,17317,1938,68,30,30144 37 | 10,32000,13761,15707,3413,82,39,33204 38 | 10,40000,29709,39094,8769,101,57,77928 39 | 10,48000,34325,37811,18517,133,49,91048 40 | 10,56000,81323,108916,9915,127,50,200561 41 | 10,64000,64556,74037,18723,139,61,157770 42 | 10,72000,111252,114018,15777,151,72,241532 43 | 10,80000,136247,165805,20230,197,80,322847 44 | 10,88000,122443,204552,15006,203,80,342573 45 | 10,96000,160015,173657,20586,179,73,354798 46 | 10,104000,199701,215870,18923,202,74,435081 47 | 10,112000,244868,279603,22277,265,104,547449 48 | 10,120000,306516,355204,24496,265,115,686922 49 | 10,128000,271135,269269,31109,206,136,572215 50 | 25,8000,1195,1829,350,31,43,3585 51 | 25,16000,6410,10653,924,46,36,18263 52 | 25,24000,13252,17456,1826,63,63,32881 53 | 25,32000,34042,50735,5504,78,76,90680 54 | 25,40000,46438,58907,4457,109,82,110275 55 | 25,48000,64879,74894,13223,124,107,153555 56 | 25,56000,98906,128676,8398,123,138,236582 57 | 25,64000,147256,207237,16032,137,117,371163 58 | 25,72000,175076,226689,10241,156,157,412771 59 | 25,80000,232469,327623,19225,174,227,580180 60 | 25,88000,218678,253468,18846,176,224,491880 61 | 25,96000,287941,362339,14325,197,189,665513 62 | 25,104000,368983,488749,20733,163,135,879320 63 | 25,112000,407948,512364,25227,175,200,946495 64 | 25,120000,550593,778372,24603,205,176,1354579 65 | 25,128000,647824,945765,33639,224,174,1628338 66 | 50,8000,2313,4115,441,28,43,7119 67 | 50,16000,12828,20685,1163,50,88,35048 68 | 50,24000,27541,39661,1988,65,113,69679 69 | 50,32000,56303,85807,3371,73,159,146071 70 | 50,40000,90720,137414,4801,102,197,233684 71 | 50,48000,122137,173262,9205,104,178,305392 72 | 50,56000,168479,242965,10462,117,166,422749 73 | 50,64000,246315,379799,14992,132,229,642115 74 | 50,72000,282092,405504,14290,165,244,703033 75 | 50,80000,352981,512679,17941,177,330,884889 76 | 50,88000,438449,643205,15735,179,353,1098758 77 | 50,96000,504656,724577,19070,207,219,1249631 78 | 50,104000,592942,836562,24831,189,317,1455788 79 | 50,112000,714665,1047422,21969,239,428,1785726 80 | 50,120000,847335,1254820,17559,209,548,2121531 81 | 50,128000,969114,1450654,24317,259,424,2445925 82 | -------------------------------------------------------------------------------- /paper/hdbscan_clustering_result.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/paper/hdbscan_clustering_result.png -------------------------------------------------------------------------------- /paper/hdbscan_condensed_tree.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/scikit-learn-contrib/hdbscan/8ecf239379c75343b8e1350d62ca8e0a73259f9b/paper/hdbscan_condensed_tree.png -------------------------------------------------------------------------------- /paper/paper.bib: -------------------------------------------------------------------------------- 1 | @inproceedings{campello2013density, 2 | title={Density-based clustering based on hierarchical density estimates}, 3 | author={Campello, Ricardo JGB and Moulavi, Davoud and Sander, Joerg}, 4 | booktitle={Pacific-Asia Conference on Knowledge Discovery and Data Mining}, 5 | pages={160--172}, 6 | year={2013}, 7 | organization={Springer}, 8 | doi={10.1007/978-3-642-37456-2_14}, 9 | url={http://dx.doi.org/10.1007/978-3-642-37456-2_14} 10 | } 11 | 12 | @article{campello2015hierarchical, 13 | title={Hierarchical density estimates for data clustering, visualization, and outlier detection}, 14 | author={Campello, Ricardo JGB and Moulavi, Davoud and Zimek, Arthur and Sander, J{\"o}rg}, 15 | journal={ACM Transactions on Knowledge Discovery from Data (TKDD)}, 16 | volume={10}, 17 | number={1}, 18 | pages={5}, 19 | year={2015}, 20 | publisher={ACM}, 21 | url = {http://doi.acm.org/10.1145/2733381}, 22 | doi = {10.1145/2733381} 23 | } 24 | 25 | @article{chaudhuri2014consistent, 26 | title={Consistent procedures for cluster tree estimation and pruning}, 27 | author={Chaudhuri, Kamalika and Dasgupta, Sanjoy and Kpotufe, Samory and von Luxburg, Ulrike}, 28 | journal={IEEE Transactions on Information Theory}, 29 | volume={60}, 30 | number={12}, 31 | pages={7900--7912}, 32 | year={2014}, 33 | publisher={IEEE}, 34 | doi={10.1109/TIT.2014.2361055} 35 | } 36 | 37 | @inproceedings{chaudhuri2010rates, 38 | author = {Chaudhuri, Kamalika and Dasgupta, Sanjoy}, 39 | title = {Rates of Convergence for the Cluster Tree}, 40 | booktitle = {Proceedings of the 23rd International Conference on Neural Information Processing Systems}, 41 | series = {NIPS'10}, 42 | year = {2010}, 43 | location = {Vancouver, British Columbia, Canada}, 44 | pages = {343--351}, 45 | numpages = {9}, 46 | url = {https://papers.nips.cc/paper/4068-rates-of-convergence-for-the-cluster-tree}, 47 | acmid = {2997228}, 48 | publisher = {Curran Associates Inc.}, 49 | address = {USA}, 50 | } 51 | -------------------------------------------------------------------------------- /paper/paper.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: 'hdbscan: Hierarchical density based clustering' 3 | tags: 4 | - clustering 5 | - unsupervised learning 6 | - machine learning 7 | authors: 8 | - name: Leland McInnes 9 | orcid: 0000-0003-2143-6834 10 | affiliation: 1 11 | - name: John Healy 12 | affiliation: 1 13 | - name: Steve Astels 14 | affiliation: 2 15 | affiliations: 16 | - name: Tutte Institute for Mathematics and Computing 17 | index: 1 18 | - name: Shopify 19 | index: 2 20 | date: 26 February 2017 21 | bibliography: paper.bib 22 | --- 23 | 24 | # Summary 25 | 26 | HDBSCAN: Hierarchical Density-Based Spatial Clustering of Applications with Noise 27 | [@campello2013density], [@campello2015hierarchical]. 28 | Performs DBSCAN over varying epsilon values and integrates the result to find a 29 | clustering that gives the best stability over epsilon. This allows HDBSCAN to 30 | find clusters of varying densities (unlike DBSCAN), and be more robust to parameter 31 | selection. 
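
A minimal usage sketch of the scikit-learn style estimator API (the synthetic
dataset and the `min_cluster_size` value here are purely illustrative):

    import hdbscan
    from sklearn.datasets import make_blobs

    data, _ = make_blobs(n_samples=1000, random_state=10)
    clusterer = hdbscan.HDBSCAN(min_cluster_size=10)
    cluster_labels = clusterer.fit_predict(data)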
The library also includes support for Robust Single Linkage clustering 32 | [@chaudhuri2014consistent], [@chaudhuri2010rates], 33 | GLOSH outlier detection [@campello2015hierarchical], and tools for visualizing 34 | and exploring cluster structures. 35 | Finally support for prediction and soft clustering is also available. 36 | 37 | -![Example clustering results.](hdbscan_clustering_result.png) 38 | -![Hierarchical tree structure.](hdbscan_condensed_tree.png) 39 | 40 | # References 41 | 42 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ 3 | "setuptools", 4 | "wheel", 5 | "cython<4", 6 | "numpy<3" 7 | ] 8 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy>=1.20,<3 2 | scipy>= 1.0 3 | scikit-learn>=0.20 4 | joblib>=1.0 5 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | try: 4 | # from Cython.Distutils import build_ext 5 | from Cython.Build import cythonize 6 | from setuptools import setup, Extension 7 | from setuptools.command.build_ext import build_ext 8 | HAVE_CYTHON = True 9 | except ImportError as e: 10 | warnings.warn(e.args[0]) 11 | cythonize = lambda ext: ext 12 | from setuptools import setup, Extension 13 | from setuptools.command.build_ext import build_ext 14 | HAVE_CYTHON = False 15 | 16 | 17 | class CustomBuildExtCommand(build_ext): 18 | """build_ext command for use when numpy headers are needed.""" 19 | 20 | def run(self): 21 | 22 | # Import numpy here, only when headers are needed 23 | import numpy 24 | 25 | # Add numpy headers to include_dirs 26 | self.include_dirs.append(numpy.get_include()) 27 | 28 | # Call original build_ext command 29 | build_ext.run(self) 30 | 31 | 32 | _hdbscan_tree = Extension('hdbscan._hdbscan_tree', 33 | sources=['hdbscan/_hdbscan_tree.pyx']) 34 | _hdbscan_linkage = Extension('hdbscan._hdbscan_linkage', 35 | sources=['hdbscan/_hdbscan_linkage.pyx']) 36 | _hdbscan_boruvka = Extension('hdbscan._hdbscan_boruvka', 37 | sources=['hdbscan/_hdbscan_boruvka.pyx']) 38 | _hdbscan_reachability = Extension('hdbscan._hdbscan_reachability', 39 | sources=['hdbscan/_hdbscan_reachability.pyx']) 40 | _prediction_utils = Extension('hdbscan._prediction_utils', 41 | sources=['hdbscan/_prediction_utils.pyx']) 42 | dist_metrics = Extension('hdbscan.dist_metrics', 43 | sources=['hdbscan/dist_metrics.pyx']) 44 | 45 | 46 | 47 | def readme(): 48 | with open('README.rst') as readme_file: 49 | return readme_file.read() 50 | 51 | def requirements(): 52 | # The dependencies are the same as the contents of requirements.txt 53 | with open('requirements.txt') as f: 54 | return [line.strip() for line in f if line.strip()] 55 | 56 | configuration = { 57 | 'name': 'hdbscan', 58 | 'version': '0.8.40', 59 | 'description': 'Clustering based on density with variable density clusters', 60 | 'long_description': readme(), 61 | 'classifiers': [ 62 | 'Development Status :: 4 - Beta', 63 | 'Intended Audience :: Science/Research', 64 | 'Intended Audience :: Developers', 65 | 'License :: OSI Approved', 66 | 'Programming Language :: C', 67 | 'Programming Language :: Python', 68 | 'Topic :: Software Development', 69 | 'Topic :: 
Scientific/Engineering', 70 | 'Operating System :: Microsoft :: Windows', 71 | 'Operating System :: POSIX', 72 | 'Operating System :: Unix', 73 | 'Operating System :: MacOS', 74 | 'Programming Language :: Python :: 3.9', 75 | 'Programming Language :: Python :: 3.10', 76 | 'Programming Language :: Python :: 3.11', 77 | 'Programming Language :: Python :: 3.12', 78 | ], 79 | 'keywords': 'cluster clustering density hierarchical', 80 | 'url': 'http://github.com/scikit-learn-contrib/hdbscan', 81 | 'maintainer': 'Leland McInnes', 82 | 'maintainer_email': 'leland.mcinnes@gmail.com', 83 | 'license': 'BSD', 84 | 'packages': ['hdbscan', 'hdbscan.tests'], 85 | 'install_requires': requirements(), 86 | 'ext_modules': cythonize([ 87 | _hdbscan_tree, 88 | _hdbscan_linkage, 89 | _hdbscan_boruvka, 90 | _hdbscan_reachability, 91 | _prediction_utils, 92 | dist_metrics]), 93 | 'cmdclass': {'build_ext': CustomBuildExtCommand}, 94 | 'test_suite': 'nose.collector', 95 | 'tests_require': ['nose'], 96 | 'data_files': ('hdbscan/dist_metrics.pxd',) 97 | } 98 | 99 | if not HAVE_CYTHON: 100 | warnings.warn('Due to incompatibilities with Python 3.7 hdbscan now' 101 | 'requires Cython to be installed in order to build it') 102 | raise ImportError('Cython not found! Please install cython and try again') 103 | 104 | setup(**configuration) 105 | --------------------------------------------------------------------------------
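
After installing from source (for example with `pip install .` from the
repository root), a short smoke test can confirm that the compiled Cython
extension modules declared in setup.py above import correctly; this snippet is
illustrative and is not part of the repository:

    # Smoke test: import the package and the compiled extension modules.
    import hdbscan
    from hdbscan import (_hdbscan_tree, _hdbscan_linkage, _hdbscan_boruvka,
                         _hdbscan_reachability, _prediction_utils, dist_metrics)

    print("hdbscan imported with all compiled extensions")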