├── src └── radius_clustering │ ├── utils │ ├── __init__.py │ ├── random_manager.h │ ├── random_manager.cpp │ ├── emos.pyx │ ├── util_heap.h │ ├── mds.pyx │ ├── mds_core.cpp │ └── mds3-util.h │ ├── __init__.py │ ├── algorithms.py │ └── radius_clustering.py ├── tests ├── conftest.py ├── test_structural.py ├── test_regression.py ├── test_unit.py └── test_integration.py ├── .coverage ├── docs ├── source │ ├── images │ │ ├── exec_time.png │ │ ├── logo-lias.jpg │ │ └── exec_time_optimized.png │ ├── api.rst │ ├── _static │ │ └── styles │ │ │ └── custom.css │ ├── scss │ │ └── custom.scss │ ├── conf.py │ ├── index.rst │ ├── details.rst │ ├── usage.rst │ └── installation.rst ├── requirements.txt ├── Makefile └── make.bat ├── examples ├── GALLERY_HEADER.rst ├── plot_benchmark_custom.py └── plot_iris_example.py ├── environment.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .github ├── ISSUE_TEMPLATE │ ├── doc_improvement.yml │ ├── feature_request.yml │ └── bug_report.yml ├── workflows │ ├── lint.yml │ ├── tests.yml │ ├── sphinx.yml │ ├── pr_build.yml │ └── build_wheels.yml └── PULL_REQUEST_TEMPLATE.md ├── scripts └── build_wheel.sh ├── CITATION.cff ├── setup.py ├── CONTRIBUTING.md ├── PRESENTATION.md ├── CHANGELOG.md ├── pyproject.toml ├── CODE_OF_CONDUCT.md ├── EXPERIMENTS.md ├── README.md └── notebooks └── comparison_example.ipynb /src/radius_clustering/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from pathlib import Path 3 | 4 | -------------------------------------------------------------------------------- /.coverage: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/radius_clustering/main/.coverage 
-------------------------------------------------------------------------------- /docs/source/images/exec_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/radius_clustering/main/docs/source/images/exec_time.png -------------------------------------------------------------------------------- /docs/source/images/logo-lias.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/radius_clustering/main/docs/source/images/logo-lias.jpg -------------------------------------------------------------------------------- /docs/source/images/exec_time_optimized.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/radius_clustering/main/docs/source/images/exec_time_optimized.png -------------------------------------------------------------------------------- /examples/GALLERY_HEADER.rst: -------------------------------------------------------------------------------- 1 | .. _general_examples: 2 | 3 | Examples 4 | ======== 5 | 6 | This is the gallery of examples using the Radius Clustering package. 
"""radius_clustering package: clustering under radius constraint via MDS.

Re-exports the :class:`RadiusClustering` estimator as the public API.
"""

# Import the main clustering class
from .radius_clustering import RadiusClustering

__all__ = ["RadiusClustering"]
__version__ = "1.4.2"
from sklearn.utils.estimator_checks import parametrize_with_checks
def test_import():
    # Smoke test: the package itself is importable.
    import radius_clustering as rad


def test_from_import():
    # Smoke test: the public estimator is importable directly.
    from radius_clustering import RadiusClustering


from radius_clustering import RadiusClustering

# Run the full scikit-learn estimator-API compliance suite against a
# default-constructed RadiusClustering instance.
@parametrize_with_checks([RadiusClustering()])
def test_check_estimator_api_consistency(estimator, check, request):

    """Check the API consistency of the RadiusClustering estimator
    """
    check(estimator)
#ifndef RANDOM_MANAGER_H
#define RANDOM_MANAGER_H

// Restored: the header names below were stripped to bare "#include"
// in this copy (HTML-tag-like "<...>" spans were deleted).
#include <random>
#include <vector>

/*
 * Process-wide random number management for the MDS heuristics.
 *
 * Exposes one shared Mersenne Twister engine plus an array of
 * independently seeded engines for parallel sections. All state and
 * methods are static; definitions live in random_manager.cpp.
 */
class RandomManager {
private:
    static std::mt19937 rng;                       // shared global engine
    static std::vector<std::mt19937> parallelRng;  // one engine per parallel worker

public:
    static void setSeed(long seed);                     // reseed the global engine
    static std::mt19937& getRandom();                   // access the global engine
    static void initParallel(int nRandoms, long initSeed);  // build nRandoms engines from initSeed
    static std::mt19937& getRandom(int i);              // engine for worker i (no bounds check)
    static int nextInt(int max);                        // uniform draw in [0, max)
};

#endif // RANDOM_MANAGER_H
12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /scripts/build_wheel.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e -u -x 3 | 4 | function repair_wheel { 5 | wheel="$1" 6 | if ! auditwheel show "$wheel"; then 7 | echo "Skipping non-platform wheel $wheel" 8 | else 9 | auditwheel repair "$wheel" --plat "$PLAT" -w /io/wheelhouse/ 10 | fi 11 | } 12 | 13 | # Compile wheels 14 | for PYBIN in /opt/python/*/bin; do 15 | if [[ $PYBIN != *"cp313t"* ]] && \ 16 | [[ $PYBIN != *"pp"* ]] && \ 17 | [[ $PYBIN != *"cp36"* ]] &&\ 18 | [[ $PYBIN != *"cp37"* ]] &&\ 19 | [[ $PYBIN != *"cp38"* ]]; then 20 | "${PYBIN}/pip" wheel /io/ --no-deps -w wheelhouse/ 21 | fi 22 | done 23 | 24 | # Bundle external shared libraries into the wheels 25 | for whl in wheelhouse/*.whl; do 26 | repair_wheel "$whl" 27 | done -------------------------------------------------------------------------------- /.github/workflows/lint.yml: -------------------------------------------------------------------------------- 1 | name: Lint and Format 2 | 3 | on: 4 | workflow_call: 5 | workflow_dispatch: 6 | 7 | jobs: 8 | lint-and-format: 9 | name: Run Linters and Formatters 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: checkout 13 | uses: actions/checkout@v4 14 | 15 | - name: Set up Python 16 | uses: actions/setup-python@v5 17 | with: 18 | python-version: "3.11" 19 | 20 | - name: Install dependencies 21 | run: | 22 | python -m pip install --upgrade pip 23 | python -m pip install ".[dev]" 24 | 25 | - name: Run ruff linter 26 | run: | 27 | ruff check 
src/radius_clustering tests --fix 28 | 29 | - name: Run black formatter 30 | run: | 31 | black src/radius_clustering tests --check 32 | 33 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: Run Tests 2 | 3 | on: 4 | workflow_call: 5 | workflow_dispatch: 6 | 7 | jobs: 8 | pytest: 9 | name: Run pytest 10 | runs-on: ubuntu-latest 11 | strategy: 12 | fail-fast: true 13 | matrix: 14 | python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"] 15 | steps: 16 | - uses: actions/checkout@v4 17 | 18 | - name: Set up Python 19 | uses: actions/setup-python@v5 20 | with: 21 | python-version: ${{ matrix.python-version }} 22 | 23 | - name: Install dependencies 24 | run: | 25 | python -m pip install --upgrade pip 26 | python -m pip install -e ".[dev]" 27 
| 28 | - name: Run tests with pytest 29 | run: | 30 | pytest -v 31 | 32 | - name: Upload coverage reports to Codecov 33 | uses: codecov/codecov-action@v5.4.3 34 | with: 35 | token: ${{ secrets.CODECOV_TOKEN }} 36 | -------------------------------------------------------------------------------- /src/radius_clustering/utils/random_manager.cpp: -------------------------------------------------------------------------------- 1 | #include "random_manager.h" 2 | #include 3 | #include 4 | 5 | std::mt19937 RandomManager::rng(std::chrono::system_clock::now().time_since_epoch().count()); 6 | std::vector RandomManager::parallelRng; 7 | 8 | void RandomManager::setSeed(long seed) { 9 | rng.seed(seed); 10 | } 11 | 12 | std::mt19937& RandomManager::getRandom() { 13 | return rng; 14 | } 15 | 16 | void RandomManager::initParallel(int nRandoms, long initSeed) { 17 | parallelRng.resize(nRandoms); 18 | std::mt19937 rndStart(initSeed); 19 | for (int i = 0; i < nRandoms; ++i) { 20 | int seed = std::uniform_int_distribution<>(0, std::numeric_limits::max())(rndStart); 21 | parallelRng[i].seed(seed); 22 | } 23 | } 24 | 25 | std::mt19937& RandomManager::getRandom(int i) { 26 | return parallelRng[i]; 27 | } 28 | 29 | int RandomManager::nextInt(int max) { 30 | return std::uniform_int_distribution<>(0, max - 1)(rng); 31 | } 32 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.yml: -------------------------------------------------------------------------------- 1 | name: Feature Request 2 | description: Suggest an idea for this project 3 | title: "[Feature]: " 4 | labels: ["enhancement"] 5 | body: 6 | - type: textarea 7 | attributes: 8 | label: Is your feature request related to a problem? Please describe. 9 | description: A clear and concise description of what the problem is. Ex. "I'm always frustrated when..." 
10 | validations: 11 | required: true 12 | - type: textarea 13 | attributes: 14 | label: Describe the solution you'd like 15 | description: A clear and concise description of what you want to happen. 16 | validations: 17 | required: true 18 | - type: textarea 19 | attributes: 20 | label: Describe alternatives you've considered 21 | description: A clear and concise description of any alternative solutions or features you've considered. 22 | - type: textarea 23 | attributes: 24 | label: Additional context 25 | description: Add any other context or screenshots about the feature request here. 26 | -------------------------------------------------------------------------------- /.github/workflows/sphinx.yml: -------------------------------------------------------------------------------- 1 | name: "Sphinx: Render docs" 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | - docs 8 | paths: 9 | - "docs/**" 10 | - "src/radius_clustering/**" 11 | workflow_dispatch: 12 | 13 | jobs: 14 | build: 15 | runs-on: ubuntu-latest 16 | permissions: 17 | contents: write 18 | steps: 19 | - uses: actions/checkout@v4 20 | - name: Build HTML 21 | shell: bash 22 | run: | 23 | sudo apt-get update 24 | sudo apt-get install build-essential 25 | pip install --upgrade pip 26 | pip install ".[doc]" 27 | pushd docs 28 | make html 29 | popd 30 | - name: Upload artifacts 31 | uses: actions/upload-artifact@v4 32 | with: 33 | name: html-docs 34 | path: docs/build/html/ 35 | - name: Deploy 36 | uses: peaceiris/actions-gh-pages@v3 37 | if: github.ref == 'refs/heads/main' 38 | with: 39 | github_token: ${{ secrets.GITHUB_TOKEN }} 40 | publish_dir: docs/build/html 41 | -------------------------------------------------------------------------------- /docs/source/_static/styles/custom.css: -------------------------------------------------------------------------------- 1 | /* Tabs (sphinx-design) */ 2 | .sd-tab-set { 3 | --tab-caption-width: 0%; 4 | margin-top: 1.5rem; } 5 | .sd-tab-set::before { 6 | width: 
var(--tab-caption-width); 7 | display: flex; 8 | align-items: center; 9 | font-weight: bold; } 10 | .sd-tab-set .sd-tab-content { 11 | padding: 0.5rem 0 0 0 !important; 12 | background-color: transparent !important; 13 | border: none !important; } 14 | .sd-tab-set .sd-tab-content blockquote { 15 | background-color: transparent !important; 16 | border: none !important; } 17 | .sd-tab-set .sd-tab-content > p:first-child { 18 | margin-top: 1rem !important; } 19 | .sd-tab-set > label.sd-tab-label { 20 | margin: 0 3px; 21 | display: flex; 22 | align-items: center; 23 | justify-content: center; 24 | border-radius: 5px !important; } 25 | .sd-tab-set > label.sd-tab-label.tab-4 { 26 | width: calc((100% - var(--tab-caption-width)) / 3 - 6px) !important; } 27 | .sd-tab-set > input:checked + label.sd-tab-label { 28 | transform: unset; 29 | border: 2px solid var(--pst-color-primary); } 30 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | # This CITATION.cff file was generated with cffinit. 2 | # Visit https://bit.ly/cffinit to generate yours today! 3 | 4 | cff-version: 1.2.0 5 | title: Radius Clustering 6 | message: >- 7 | If you use this software, please cite it using the 8 | metadata from this file. 
9 | type: software 10 | authors: 11 | - given-names: Quentin 12 | family-names: Haenn 13 | email: quentin.haenn@ensma.fr 14 | affiliation: LIAS Lab 15 | orcid: 'https://orcid.org/0009-0009-1663-0107' 16 | - given-names: Brice 17 | family-names: Chardin 18 | email: brice.chardin@ensma.fr 19 | affiliation: LIAS Lab 20 | orcid: 'https://orcid.org/0000-0002-9298-9447' 21 | - given-names: Mickael 22 | family-names: Baron 23 | email: mickael.baron@ensma.fr 24 | affiliation: LIAS Lab 25 | orcid: 'https://orcid.org/0000-0002-3356-0835' 26 | - name: LIAS Laboratory 27 | address: 1 Avenue Clément Ader 28 | city: Chasseneuil du Poitou 29 | post-code: '86360' 30 | website: 'https://www.lias-lab.fr' 31 | identifiers: 32 | - type: swh 33 | value: 'swh:1:rev:66f8d295cc5fbc80f356d11be46571bfbb190609' 34 | license: GPL-3.0 35 | -------------------------------------------------------------------------------- /docs/source/scss/custom.scss: -------------------------------------------------------------------------------- 1 | /* Tabs (sphinx-design) */ 2 | 3 | .sd-tab-set { 4 | --tab-caption-width: 0%; // No tab caption by default 5 | margin-top: 1.5rem; 6 | 7 | &::before { 8 | // Set `content` for tab caption 9 | width: var(--tab-caption-width); 10 | display: flex; 11 | align-items: center; 12 | font-weight: bold; 13 | } 14 | 15 | .sd-tab-content { 16 | padding: 0.5rem 0 0 0 !important; 17 | background-color: transparent !important; 18 | border: none !important; 19 | 20 | blockquote { 21 | background-color: transparent !important; 22 | border: none !important; 23 | } 24 | 25 | > p:first-child { 26 | margin-top: 1rem !important; 27 | } 28 | } 29 | 30 | > label.sd-tab-label { 31 | margin: 0 3px; 32 | display: flex; 33 | align-items: center; 34 | justify-content: center; 35 | border-radius: 5px !important; 36 | 37 | 38 | &.tab-4 { 39 | width: calc((100% - var(--tab-caption-width)) / 3 - 6px) !important; 40 | } 41 | } 42 | 43 | > input:checked + label.sd-tab-label { 44 | transform: unset; 
45 | border: 2px solid var(--pst-color-primary); 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /.github/workflows/pr_build.yml: -------------------------------------------------------------------------------- 1 | name: PR Build and Test 2 | 3 | on: 4 | pull_request: 5 | branches: 6 | - main 7 | paths: 8 | - "src/radius_clustering/**" 9 | - "tests/**" 10 | - "pyproject.toml" 11 | workflow_dispatch: 12 | 13 | jobs: 14 | run_pytest: 15 | name: Run pytest 16 | uses: ./.github/workflows/tests.yml 17 | 18 | build_test_sdist: 19 | name: Test source distribution 20 | runs-on: ubuntu-latest 21 | needs: run_pytest 22 | strategy: 23 | fail-fast: true 24 | matrix: 25 | python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"] 26 | steps: 27 | - uses: actions/checkout@v4 28 | 29 | - name: Set up Python 30 | uses: actions/setup-python@v5 31 | with: 32 | python-version: ${{ matrix.python-version }} 33 | 34 | - name: Build sdist 35 | run: | 36 | pip install --upgrade pip 37 | pip install pipx 38 | pipx run build --sdist 39 | 40 | - name: Test sdist 41 | run: | 42 | pip install ./dist/*.tar.gz 43 | python -c "import radius_clustering; print(f'Successfully imported version {radius_clustering.__version__}')" -------------------------------------------------------------------------------- /src/radius_clustering/utils/emos.pyx: -------------------------------------------------------------------------------- 1 | """ 2 | EMOS (Exact Minimum Dominating Set) Solver Module 3 | 4 | This Cython module provides a Python interface to the C implementation 5 | of the Exact Minimum Dominating Set (EMOS) algorithm. It allows for 6 | efficient solving of MDS problems using the exact method. 
cdef extern from "mds3-util.h":
    # Result of the exact MDS solve, allocated on the C side.
    struct Result:
        int* dominating_set   # 1-based vertex ids of the dominating set
        int set_size          # number of vertices in dominating_set
        double exec_time      # solver wall-clock time in seconds

    Result* emos_main(unsigned int* edges, int nb_edge, int n)

    void cleanup()

    void free_results(Result* result)

import numpy as np
cimport numpy as np

def py_emos_main(np.ndarray[unsigned int, ndim=1] edges, int n, int nb_edge):
    """Solve the exact Minimum Dominating Set problem on a flat edge list.

    Parameters
    ----------
    edges : 1-D uint32 array, flattened (u, v) pairs describing the graph.
    n : number of vertices.
    nb_edge : number of edges (``edges`` holds ``2 * nb_edge`` entries).

    Returns
    -------
    (list[int], float) : 0-based dominating-set vertex ids and solver time.
    """
    # NOTE(review): the extern declaration above names its parameters
    # (edges, nb_edge, n) while this call passes (edges, n, nb_edge).
    # C linkage is positional, so this is only a real bug if the actual
    # definition in main-emos.c orders them as declared here -- confirm
    # against main-emos.c before changing either side.
    cdef Result* result = emos_main(&edges[0], n, nb_edge)

    # Solver reports 1-based vertex ids; shift to 0-based for Python callers.
    dominating_set = [result.dominating_set[i] - 1 for i in range(result.set_size)]
    exec_time = result.exec_time

    # Release the C-side result struct and the solver's global state
    # before handing control back to Python.
    free_results(result)
    cleanup()

    return dominating_set, exec_time
"""Build script: compiles the Cython/C/C++ extension modules.

Two extensions are built: the exact EMOS solver (C) and the approximate
MDS solver (C++). Packaging metadata lives in pyproject.toml.
"""
import platform

import numpy as np
from Cython.Build import cythonize
from setuptools import Extension, setup

SYSTEM = platform.system()
CPU = platform.processor()

C_COMPILE_ARGS = ["-std=c99", "-O3", "-ffast-math", "-DREP"]
CXX_COMPILE_ARGS = ["-std=c++11", "-O3", "-ffast-math"]
CXX_LINK_ARGS = ["-std=c++11"]

# platform.processor() may be empty (e.g. some Linux builds); fall back
# to the machine architecture string.
if not CPU:
    CPU = platform.machine()

# Skip -march=native / OpenMP on macOS and Apple Silicon.
# NOTE(review): `CPU not in "arm64"` is a *substring* test ("is CPU a
# substring of 'arm64'?"), not membership in a collection -- it happens
# to exclude "arm64", "arm" and "m6" alike. `"arm" not in CPU` or an
# exact comparison is probably what was meant; confirm on the target
# platforms before changing, as it alters which flags get applied.
if (SYSTEM != "Darwin") and (CPU not in "arm64"):
    C_COMPILE_ARGS.append("-march=native")
    CXX_COMPILE_ARGS.append("-march=native")
    CXX_LINK_ARGS.append("-fopenmp")

extensions = [
    # Exact MDS solver (EMOS), C implementation.
    Extension(
        "radius_clustering.utils._emos",
        [
            "src/radius_clustering/utils/emos.pyx",
            "src/radius_clustering/utils/main-emos.c"
        ],
        include_dirs=[np.get_include(), "src/radius_clustering/utils"],
        extra_compile_args=C_COMPILE_ARGS,
    ),
    # Approximate MDS solver, C++ implementation.
    Extension(
        "radius_clustering.utils._mds_approx",
        [
            "src/radius_clustering/utils/mds.pyx",
            "src/radius_clustering/utils/mds_core.cpp",
            "src/radius_clustering/utils/random_manager.cpp",
        ],
        include_dirs=[np.get_include(), "src/radius_clustering/utils"],
        language="c++",
        extra_compile_args=CXX_COMPILE_ARGS,
        extra_link_args=CXX_LINK_ARGS,
    ),
]

setup(
    ext_modules=cythonize(extensions, language_level=3),
    include_dirs=[np.get_include()],
    package_data={"radius_clustering": ["utils/*.pyx", "utils/*.h"]},
)
38 | 39 | To set up `pre-commit`, run: 40 | 41 | ```bash 42 | pre-commit install 43 | ``` 44 | 45 | This will ensure your contributions match the project's code style. 46 | 47 | ## Submitting a Pull Request 48 | 49 | When you're ready to submit your changes, please write a clear and concise pull request message. Make sure to link any relevant issues. 50 | 51 | Thank you for your contribution! 52 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.yml: -------------------------------------------------------------------------------- 1 | name: Bug Report 2 | description: Create a report to help us improve 3 | title: "[Bug]: " 4 | labels: ["bug", "triage"] 5 | body: 6 | - type: markdown 7 | attributes: 8 | value: | 9 | Thanks for taking the time to fill out this bug report! 10 | 11 | - type: textarea 12 | id: what-happened 13 | attributes: 14 | label: Describe the bug 15 | description: A clear and concise description of what the bug is. 16 | placeholder: Tell us what you see! 17 | validations: 18 | required: true 19 | 20 | - type: textarea 21 | id: reproduce 22 | attributes: 23 | label: To Reproduce 24 | description: "Steps to reproduce the behavior. Please provide a minimal, self-contained code sample." 25 | placeholder: | 26 | ```python 27 | import numpy as np 28 | from radius_clustering import RadiusClustering 29 | 30 | # Your code here that triggers the bug 31 | ``` 32 | validations: 33 | required: true 34 | 35 | - type: textarea 36 | id: expected 37 | attributes: 38 | label: Expected behavior 39 | description: A clear and concise description of what you expected to happen. 40 | validations: 41 | required: true 42 | 43 | - type: dropdown 44 | id: os 45 | attributes: 46 | label: Operating System 47 | description: What operating system are you using? 
48 | options: 49 | - Windows 50 | - macOS 51 | - Linux 52 | validations: 53 | required: true 54 | 55 | - type: input 56 | id: python-version 57 | attributes: 58 | label: Python Version 59 | placeholder: "e.g. 3.11.4" 60 | validations: 61 | required: true 62 | 63 | - type: input 64 | id: package-version 65 | attributes: 66 | label: Package Version 67 | placeholder: "e.g. 1.4.0" 68 | validations: 69 | required: true 70 | 71 | - type: textarea 72 | id: additional-context 73 | attributes: 74 | label: Additional context 75 | description: Add any other context about the problem here. 76 | -------------------------------------------------------------------------------- /PRESENTATION.md: -------------------------------------------------------------------------------- 1 | ## How it works 2 | 3 | ### Clustering under radius constraint 4 | Clustering tasks are globally concerned about grouping data points into clusters based on some similarity measure. Clustering under radius constraints is a specific clustering task where the goal is to group data points such that the minimal maximum distance between any two points in the same cluster is less than or equal to a given radius. Mathematically, given a set of data points $X = \{x_1, x_2, \ldots, x_n\}$ and a radius $r$, the goal is to find a partition $ \mathcal{P}$ of $X$ into clusters $C_1, C_2, \ldots, C_k$ such that : 5 | ```math 6 | \forall C \in \mathcal{P}, \min_{x_i \in C}\max_{x_j \in C} d(x_i, x_j) \leq r 7 | ``` 8 | where $d(x_i, x_j)$ is the dissimilarity between $x_i$ and $x_j$. 9 | 10 | ### Minimum Dominating Set (MDS) problem 11 | 12 | The Radius Clustering package implements a clustering algorithm based on the Minimum Dominating Set (MDS) problem. The MDS problem is a well-known NP-Hard problem in graph theory, and it has been proven to be linked to the clustering under radius constraint problem. 
The MDS problem is defined as follows: 13 | 14 | Given an undirected weighted graph $G = (V,E)$ where $V$ is a set of vertices and $E$ is a set of edges, a dominating set $D$ is a subset of $V$ such that every vertex in $V$ is either in $D$ or adjacent to a vertex in $D$. The goal is to find a dominating set $D$ such that the number of vertices in $D$ is minimized. This problem is known to be NP-Hard. 15 | 16 | However, solving this problem in the context of clustering task can be useful, but we need some adaptations. 17 | 18 | ### Radius Clustering algorithm 19 | 20 | To adapt the MDS problem to the clustering under radius constraint problem, we need to define a graph based on the data points. The vertices of the graph are the data points, and the edges are defined based on the distance between the data points. The weight of the edges is the dissimilarity between the data points. Then, the algorithm operates as follows: 21 | 22 | 1. Construct a graph $G = (V,E)$ based on the data points $X$. 23 | 2. Prune the graph by removing the edges $e_{ij}$ such that $d(x_i,x_j) > r$. 24 | 3. Solve the MDS problem on the pruned graph. 25 | 4. Assign each vertex to the closest vertex in the dominating set. In case of a tie, assign the vertex to the vertex with the smallest index. 26 | 5. Return the cluster labels. 
-------------------------------------------------------------------------------- /src/radius_clustering/utils/util_heap.h: -------------------------------------------------------------------------------- 1 | #include <stdlib.h> /* calloc, realloc */ 2 | #include <assert.h> 3 | 4 | #ifndef V6R_UTIL_HEAP_H 5 | #define V6R_UTIL_HEAP_H 6 | 7 | typedef struct MaxNodeHeap MaxHeap; 8 | typedef struct node Node; 9 | 10 | static const int INF=0x3f3f3f3f; 11 | struct node { 12 | int ID,V; 13 | }; 14 | 15 | struct MaxNodeHeap{ 16 | unsigned size; 17 | unsigned capacity; 18 | Node *array; 19 | int (*comparator)(const Node * ,const Node *); 20 | }; 21 | 22 | 23 | #define size_of_heap(H) (H->size) 24 | #define empty_heap(H) (H->size==0) 25 | 26 | static inline unsigned int left (int i) { return i*2+1; } 27 | static inline unsigned int right (int i) { return (i+1)*2; } 28 | static inline unsigned int parent(int i) { return (i-1) >> 1; } 29 | 30 | static inline void shiftUp(MaxHeap *heap,int i) 31 | { 32 | Node x = heap->array[i]; 33 | int p = parent(i); 34 | while (i != 0 && heap->comparator(&x, &(heap->array[p]))){ 35 | heap->array[i]= heap->array[p]; 36 | i = p; 37 | p = parent(p); 38 | } 39 | heap->array[i] = x; 40 | } 41 | 42 | static inline void shiftDown(MaxHeap *heap,int i) 43 | { 44 | assert(heap->comparator); 45 | 46 | Node x = heap->array[i]; 47 | while (left(i) < heap->size){ 48 | int child = (right(i) < heap->size && heap->comparator(&(heap->array[right(i)]),&(heap->array[left(i)]))) ?
right(i) : left(i); 49 | if (!heap->comparator(&(heap->array[child]), &x)) break; 50 | heap->array[i] = heap->array[child]; 51 | i = child; 52 | } 53 | heap->array[i] = x; 54 | } 55 | 56 | static inline void insertHeap(MaxHeap *heap,Node x){ 57 | assert(heap->size<=heap->capacity); 58 | if(heap->size==heap->capacity) { 59 | int NewSize=2*(heap->capacity); 60 | heap->array=(Node *)(realloc(heap->array,(NewSize+1)*sizeof (Node))); 61 | assert((heap->array)!=NULL); 62 | heap->capacity=NewSize; 63 | } 64 | heap->array[heap->size]=x; 65 | heap->size++; 66 | if(heap->size>1)shiftUp(heap,heap->size-1); 67 | } 68 | 69 | static inline int node_cmp_for_MaxHeap(const Node *A,const Node *B) { 70 | if(A->V==B->V)return A->ID<B->ID; /* equal values: tie-break on smaller ID */ 71 | return A->V > B->V; 72 | } 73 | 74 | static inline void initHeap(MaxHeap *heap,int capacity,int (*cmp)(const Node *, const Node*)){ 75 | heap->array=(struct node *)calloc(capacity+1,sizeof(Node)); 76 | heap->capacity=capacity; 77 | heap->size=0; 78 | heap->comparator=cmp; 79 | } 80 | 81 | static inline Node removeTop(MaxHeap *heap) 82 | { 83 | 84 | Node x = heap->array[0]; 85 | heap->array[0] = heap->array[heap->size-1]; 86 | heap->size--; 87 | if (heap->size > 1) shiftDown(heap, 0); 88 | return x; 89 | } 90 | 91 | static inline void clearHeap(MaxHeap *heap){ 92 | heap->size=0; 93 | } 94 | #endif //V6R_UTIL_HEAP_H 95 | -------------------------------------------------------------------------------- /src/radius_clustering/utils/mds.pyx: -------------------------------------------------------------------------------- 1 | """ 2 | MDS Solver Module 3 | 4 | This Cython module provides the core functionality for solving Minimum Dominating Set (MDS) problems. 5 | It serves as a bridge between Python and the C++ implementation of the MDS algorithms.
6 | 7 | The module includes: 8 | - Wrapper functions for C++ MDS solvers 9 | - Data structure conversions between Python/NumPy and C++ 10 | - Result processing and conversion back to Python objects 11 | """ 12 | 13 | # distutils: language = c++ 14 | # distutils: sources = mds_clustering/utils/mds_core.cpp mds_clustering/utils/random_manager.cpp 15 | 16 | from libcpp.vector cimport vector 17 | from libcpp.unordered_set cimport unordered_set as cpp_unordered_set 18 | from libcpp.string cimport string 19 | from cython.operator cimport dereference as deref 20 | 21 | import numpy as np 22 | cimport numpy as np 23 | 24 | cdef extern from "random_manager.h": 25 | cdef cppclass RandomManager: 26 | @staticmethod 27 | void setSeed(long seed) 28 | 29 | cdef extern from "mds_core.cpp": 30 | cdef cppclass Result: 31 | Result() 32 | Result(string instanceName) 33 | void add(string key, float value) 34 | float get(int pos) 35 | vector[string] getKeys() 36 | string getInstanceName() 37 | cpp_unordered_set[int] getSolutionSet() 38 | void setSolutionSet(cpp_unordered_set[int] solutionSet) 39 | 40 | cdef Result iterated_greedy_wrapper(int numNodes, const vector[int]& edges_list, int nb_edges, long seed) nogil 41 | 42 | def solve_mds(int num_nodes, np.ndarray[int, ndim=1, mode="c"] edges not None, int nb_edges, int seed): 43 | """ 44 | Solve the Minimum Dominating Set problem for a given graph. 45 | 46 | Parameters: 47 | ----------- 48 | num_nodes : int 49 | The number of nodes in the graph. 50 | edges : np.ndarray 51 | A 1D NumPy array representing the edges of the graph. 52 | nb_edges : int 53 | The number of edges in the graph. 54 | seed : int 55 | Seed forwarded to the C++ solver's random number generator (for reproducibility). 56 | 57 | Returns: 58 | -------- 59 | dict 60 | A dictionary containing the solution set and other relevant information.
61 | """ 62 | cdef vector[int] cpp_edge_list 63 | 64 | # Cast the NumPy array to a C++ vector 65 | cpp_edge_list.assign(&edges[0], &edges[0] + edges.shape[0]) 66 | 67 | cdef Result result 68 | with nogil: 69 | result = iterated_greedy_wrapper(num_nodes, cpp_edge_list, nb_edges, seed) 70 | 71 | # Convert the C++ Result to a Python dictionary 72 | py_result = { 73 | "solution_set": set(result.getSolutionSet()), 74 | } 75 | 76 | # Add other key-value pairs 77 | keys = result.getKeys() 78 | for i in range(len(keys)): 79 | py_result[keys[i].decode('utf-8')] = result.get(i) 80 | 81 | return py_result -------------------------------------------------------------------------------- /tests/test_regression.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import numpy as np 3 | from radius_clustering import RadiusClustering 4 | from sklearn.datasets import load_iris 5 | 6 | @pytest.fixture 7 | def iris_data(): 8 | """Fixture to load the Iris dataset.""" 9 | data = load_iris() 10 | return data.data 11 | 12 | @pytest.fixture 13 | def approx_results(): 14 | """Fixture to store results for approximate clustering.""" 15 | results = { 16 | 'labels': [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 17 | 0,0,0,0,0,0,0,0,0,0,0,0,0,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 18 | 1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,2,2,2,2,1,2,2,2,2, 19 | 2,2,1,1,2,2,2,2,1,2,1,2,1,2,2,1,1,2,2,2,2,2,1,2,2,2,2,1,2,2,2,1,2,2,2,1,2, 20 | 2,1], 21 | "centers": [0,96,125], 22 | "time" : 0.0280, 23 | "effective_radius": 1.4282856857085722 24 | } 25 | return results 26 | 27 | @pytest.fixture 28 | def exact_results(): 29 | """Fixture to store results for exact clustering.""" 30 | results = { 31 | 'labels':[ 32 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 33 | 0,0,0,0,0,0,0,0,0,0,0,0,0,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 34 | 
1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,2,2,2,2,1,2,2,2,2, 35 | 2,2,1,1,2,2,2,2,1,2,1,2,1,2,2,1,1,2,2,2,2,2,1,2,2,2,2,1,2,2,2,1,2,2,2,1,2, 36 | 2,1 37 | ], 38 | "centers": [0, 96, 102], 39 | "time": 0.0004, 40 | "effective_radius": 1.4282856857085722 41 | } 42 | return results 43 | 44 | def assert_results_exact(results, expected): 45 | """Helper function to assert clustering results.""" 46 | assert_results(results, expected) 47 | assert set(results.labels_) == set(expected['labels']), "Labels do not match expected" 48 | assert results.centers_ == expected['centers'], "Centers do not match expected" 49 | assert np.sum(results.labels_ - expected['labels']) == 0, "Labels do not match expected" 50 | 51 | def assert_results(results, expected): 52 | assert len(results.labels_) == len(expected['labels']), "Labels length mismatch" 53 | assert abs(results.mds_exec_time_ - expected['time']) < 0.1, "Execution time mismatch by more than 10%" 54 | assert abs(results.effective_radius_ - expected['effective_radius'])/results.effective_radius_ < 0.1, "Effective radius mismatch" 55 | 56 | def test_exact(iris_data, exact_results): 57 | """Test the RadiusClustering with exact""" 58 | clustering = RadiusClustering(radius=1.43, manner='exact').fit(iris_data) 59 | assert_results_exact(clustering, exact_results) 60 | 61 | def test_approx(iris_data, approx_results): 62 | """Test the RadiusClustering with approx.""" 63 | clustering = RadiusClustering(radius=1.43, manner='approx').fit(iris_data) 64 | assert_results(clustering, approx_results) 65 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | All notable changes to this project will be documented in this file. 
4 | 5 | ## [1.4.2] - 2025-07-07 6 | 7 | ### Contributors 8 | 9 | - [@quentinhaenn](Quentin Haenn) - Main developer and maintainer 10 | 11 | ### Changed 12 | 13 | - Project governance changed : now the project is part of scikit-learn-contrib organization. 14 | - Updated the README to reflect the new governance and organization. 15 | - Updated the documentation to reflect the new governance and organization. 16 | - Updated the test cases to ensure compatibility with the new governance and organization. 17 | - Enhanced repository standards with codecov, binder examples and zenodo integration. 18 | - New issue and PR templates 19 | - Code of conduct and contributing guidelines added 20 | 21 | ## [1.4.0] - 2025-06-19 22 | 23 | ### Contributors 24 | 25 | - [@quentinhaenn](Quentin Haenn) - Main developer and maintainer 26 | 27 | ### Added 28 | 29 | - Added support for custom MDS solvers in the `RadiusClustering` class. 30 | - Updated the documentation to include examples of using custom MDS solvers. 31 | - Added more examples and tutorials to the documentation. 32 | 33 | ### Changed 34 | 35 | - Improved documentation and examples for the `RadiusClustering` class. 36 | - Updated the README to reflect the new features and improvements in version 1.4.0 37 | - Updated the test cases to ensure compatibility with the new features. 38 | - Refactored the main codebase to improve readability and maintainability. 39 | - Prepared the codebase for future adds of MDS solvers and/or clustering algorithms. 40 | 41 | ## [1.3.0] - 2025-06-18 42 | 43 | ### Contributors 44 | 45 | - [@quentinhaenn](Quentin Haenn) - Main developer and maintainer 46 | 47 | ### Added 48 | 49 | - Full test coverage for the entire codebase. 50 | - Badge for test coverage in the README. 51 | - Added `radius` parameter to the `RadiusClustering` class, allowing users to specify the radius for clustering. 52 | 53 | ### Deprecated 54 | 55 | - Deprecated the `threshold` parameter in the `RadiusClustering` class. 
Use `radius` instead. 56 | 57 | ### Changed 58 | 59 | - Updated all the attributes in the `RadiusClustering` class to fit `scikit-learn` standards and conventions. 60 | - Updated the tests cases to reflect the changes in the `RadiusClustering` class. 61 | - Updated README and documentation to reflect the new `radius` parameter and the deprecation of `threshold`. 62 | 63 | ## [1.2.0] - 2024-10 64 | 65 | ### Contributors 66 | 67 | - [@quentinhaenn](Quentin Haenn) - Main developer and maintainer 68 | - [@mickaelbaron](Mickaël Baron) - Contributor and maintainer 69 | 70 | ### Added 71 | 72 | - Added CI/CD pipelines with GitHub Actions for automated testing and deployment. 73 | - Added package metadata for better integration with PyPI. 74 | - Added a badge for the GitHub Actions workflow status in the README. 75 | - Added a badge for the Python version supported in the README. 76 | - Added a badge for the code style (Ruff) in the README. 77 | - Added a badge for the license in the README. 78 | - Added CI/CD pipelines for PyPI deployment (including test coverage, compiling extensions and wheels, and uploading to PyPI). 79 | - Resolving issues with compiling Cython extensions on Windows and macOS. 
80 | -------------------------------------------------------------------------------- /.github/workflows/build_wheels.yml: -------------------------------------------------------------------------------- 1 | name: Build and upload to PyPI 2 | 3 | on: 4 | workflow_dispatch: 5 | push: 6 | tags: 7 | - "v*" 8 | 9 | jobs: 10 | run_pytest: 11 | name: Run tests on min and max Python versions 12 | runs-on: ubuntu-latest 13 | strategy: 14 | fail-fast: true 15 | matrix: 16 | python-version: ["3.9", "3.13"] 17 | steps: 18 | - uses: actions/checkout@v4 19 | 20 | - name: Set up Python 21 | uses: actions/setup-python@v5 22 | with: 23 | python-version: ${{ matrix.python-version }} 24 | 25 | - name: Install dependencies 26 | run: | 27 | python -m pip install --upgrade pip 28 | python -m pip install -e ".[dev]" 29 | 30 | - name: Run tests with pytest 31 | run: | 32 | pytest -v 33 | 34 | build_wheels: 35 | name: Build wheels on ${{ matrix.os }} 36 | runs-on: ${{ matrix.os }} 37 | needs: run_pytest 38 | strategy: 39 | fail-fast: false 40 | matrix: 41 | # macos-13 is an intel runner, macos-14 is apple silicon 42 | os: [ubuntu-latest, windows-latest, macos-13, macos-14] 43 | 44 | steps: 45 | - uses: actions/checkout@v4 46 | 47 | - name: Build wheels 48 | uses: pypa/cibuildwheel@v2.22.0 49 | 50 | - uses: actions/upload-artifact@v4 51 | with: 52 | name: cibw-wheels-${{ matrix.os }}-${{ strategy.job-index }} 53 | path: ./wheelhouse/*.whl 54 | 55 | build_sdist: 56 | name: Build source distribution 57 | runs-on: ubuntu-latest 58 | needs: run_pytest 59 | steps: 60 | - uses: actions/checkout@v4 61 | 62 | - name: Set up Python 63 | uses: actions/setup-python@v5 64 | with: 65 | python-version: "3.12" 66 | 67 | - name: Build sdist 68 | run: | 69 | pip install --upgrade pip 70 | pip install --upgrade pipx 71 | pipx run build --sdist 72 | 73 | - uses: actions/upload-artifact@v4 74 | with: 75 | name: cibw-sdist 76 | path: dist/*.tar.gz 77 | 78 | upload_pypi: 79 | needs: [build_wheels, build_sdist] 
80 | runs-on: ubuntu-latest # cannot self host because the next action uses GITHUB_WORKSPACE env variable automatically within the action 81 | environment: 82 | name: pypi 83 | url: https://pypi.org/p/radius-clustering 84 | permissions: 85 | id-token: write 86 | attestations: write 87 | #if: github.event_name == 'release' && github.event.action == 'published' 88 | # or, alternatively, upload to PyPI on every tag starting with 'v' (remove on: release above to use this) 89 | if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') 90 | steps: 91 | - name: Download all dists 92 | uses: actions/download-artifact@v4 93 | with: 94 | # unpacks all CIBW artifacts into dist/ 95 | pattern: cibw-* 96 | path: dist/ 97 | merge-multiple: true 98 | 99 | - name: Generate artifact attestations 100 | uses: actions/attest-build-provenance@v2 101 | with: 102 | subject-path: dist/* 103 | 104 | - name: Publish Distribution to PyPI 105 | uses: pypa/gh-action-pypi-publish@release/v1 106 | #with: 107 | # To test: repository-url: https://test.pypi.org/legacy/ 108 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 
2 | # 3 | # For the full list of built-in configuration values, see the documentation: 4 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 5 | 6 | # -- Project information ----------------------------------------------------- 7 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information 8 | 9 | import os 10 | import sys 11 | from pathlib import Path 12 | 13 | sys.path.insert(0, os.path.abspath(".")) 14 | sys.path.insert(0, os.path.abspath("../..")) 15 | 16 | project = "Radius Clustering" 17 | copyright = "2024, Haenn Quentin, Chardin Brice, Baron Mickaël" 18 | author = "Haenn Quentin, Chardin Brice, Baron Mickaël" 19 | release = "1.3.0" 20 | 21 | # -- General configuration --------------------------------------------------- 22 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration 23 | 24 | extensions = [ 25 | "sphinx.ext.autodoc", 26 | "sphinx.ext.autosummary", 27 | "sphinx.ext.viewcode", 28 | "sphinx_prompt", 29 | "sphinx.ext.napoleon", 30 | "sphinxcontrib.sass", 31 | "sphinx_remove_toctrees", 32 | "sphinxcontrib.email", 33 | "sphinx_gallery.gen_gallery", 34 | "sphinx_copybutton", 35 | "sphinx.ext.intersphinx", 36 | "sphinx_design", 37 | ] 38 | 39 | master_doc = "index" 40 | 41 | # Specify how to identify the prompt when copying code snippets 42 | copybutton_prompt_text = r">>> |\.\.\. 
" 43 | copybutton_prompt_is_regexp = True 44 | copybutton_exclude = "style" 45 | 46 | # Conf of numpydoc 47 | numpydoc_class_members_toctree = False 48 | 49 | templates_path = ["_templates"] 50 | exclude_patterns = [] 51 | 52 | # -- Options for HTML output ------------------------------------------------- 53 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output 54 | 55 | html_theme = "pydata_sphinx_theme" 56 | # html_static_path = ['_static'] 57 | 58 | html_logo = "./images/logo-lias.jpg" 59 | 60 | html_short_title = "Radius Clustering" 61 | 62 | html_sidebars = {"**": []} 63 | 64 | html_theme_options = { 65 | "icon_links_label": "Icon Links", 66 | "icon_links": [ 67 | { 68 | "name": "GitHub", 69 | "url": "https://github.com/quentinhaenn", 70 | "icon": "fa-brands fa-square-github", 71 | "type": "fontawesome", 72 | }, 73 | ], 74 | "show_prev_next": False, 75 | "search_bar_text": "Search the docs ...", 76 | "navigation_with_keys": False, 77 | "navbar_align": "left", 78 | "navbar_start": ["navbar-logo"], 79 | "navbar_center": ["navbar-nav"], 80 | "navbar_end": ["theme-switcher", "navbar-icon-links", "version-switcher"], 81 | "navbar_persistent": ["search-button"], 82 | "article_footer_items": ["prev-next"], 83 | "footer_start": ["copyright"], 84 | "footer_center": [], 85 | "footer_end": [], 86 | } 87 | 88 | # Compile scss files into css files using sphinxcontrib-sass 89 | sass_src_dir, sass_out_dir = "scss", "_static/styles" 90 | sass_targets = { 91 | f"{file.stem}.scss": f"{file.stem}.css" 92 | for file in Path(sass_src_dir).glob("*.scss") 93 | } 94 | 95 | html_static_path = ["_static"] 96 | # Additional CSS files, should be subset of the values of `sass_targets` 97 | html_css_files = ["styles/custom.css"] 98 | 99 | sg_examples_dir = "../../examples" 100 | sg_gallery_dir = "auto_examples" 101 | sphinx_gallery_conf = { 102 | "doc_module": "radius_clustering", 103 | "backreferences_dir": os.path.join("modules", "generated"), 104 | 
"show_memory": False, 105 | "examples_dirs": [sg_examples_dir], 106 | "gallery_dirs": [sg_gallery_dir], 107 | # avoid generating too many cross links 108 | "inspect_global_variables": False, 109 | "remove_config_comments": True, 110 | "plot_gallery": "True", 111 | "recommender": {"enable": True, "n_examples": 4, "min_df": 12}, 112 | "reset_modules": ("matplotlib", "seaborn"), 113 | } 114 | -------------------------------------------------------------------------------- /src/radius_clustering/algorithms.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module contains the implementation of the clustering algorithms. 3 | It provides two main functions: `clustering_approx` and `clustering_exact`. 4 | 5 | These functions can be replaced in the `RadiusClustering` class 6 | to perform clustering using another algorithm. 7 | 8 | .. versionadded:: 1.4.0 9 | Refactoring the structure of the code to separate the clustering algorithms 10 | This allows for easier maintenance and extensibility of the codebase. 11 | 12 | """ 13 | from __future__ import annotations 14 | 15 | import numpy as np 16 | 17 | from .utils._mds_approx import solve_mds 18 | from .utils._emos import py_emos_main 19 | 20 | def clustering_approx( 21 | n: int, edges: np.ndarray, nb_edges: int, 22 | random_state: int | None = None) -> tuple[list, float]: 23 | """ 24 | Perform approximate MDS clustering. 25 | This method uses a pretty trick to set the seed for 26 | the random state of the C++ code of the MDS solver. 27 | 28 | .. tip:: 29 | The random state is used to ensure reproducibility of the results 30 | when using the approximate method. 31 | If `random_state` is None, a default value of 42 is used. 32 | 33 | .. important:: 34 | The trick to set the random state is : 35 | 36 | 1. Use the `check_random_state` function to get a `RandomState`singleton 37 | instance, set up with the provided `random_state`. 38 | 39 | 2.
Use the `randint` method of the `RandomState` instance to generate a 40 | random integer. 41 | 42 | 3. Use this random integer as the seed for the C++ code of the MDS solver. 43 | 44 | 45 | This ensures that the seed passed to the C++ code is always an integer, 46 | which is required by the MDS solver, and allows for 47 | reproducibility of the results. 48 | 49 | .. note:: 50 | This function uses the approximation method to solve the MDS problem. 51 | See [casado]_ for more details. 52 | 53 | Parameters: 54 | ----------- 55 | n : int 56 | The number of points in the dataset. 57 | edges : np.ndarray 58 | The edges of the graph, flattened into a 1D array. 59 | nb_edges : int 60 | The number of edges in the graph. 61 | random_state : int | None 62 | The random state to use for reproducibility. 63 | If None, a default value of 42 is used. 64 | Returns: 65 | -------- 66 | centers : list 67 | A sorted list of the centers of the clusters. 68 | mds_exec_time : float 69 | The execution time of the MDS algorithm in seconds. 70 | """ 71 | result = solve_mds( 72 | n, edges.flatten().astype(np.int32), nb_edges, random_state 73 | ) 74 | centers = sorted([x for x in result["solution_set"]]) 75 | mds_exec_time = result["Time"] 76 | return centers, mds_exec_time 77 | 78 | def clustering_exact(n: int, edges: np.ndarray, nb_edges: int, seed: int | None = None) -> tuple[list, float]: 79 | """ 80 | Perform exact MDS clustering. 81 | 82 | This function uses the EMOs algorithm to solve the MDS problem. 83 | 84 | .. important:: 85 | The EMOS algorithm is an exact algorithm for solving the MDS problem. 86 | It is a branch and bound algorithm that uses graph theory tricks 87 | to efficiently cut the search space. See [jiang]_ for more details. 88 | 89 | Parameters: 90 | ----------- 91 | n : int 92 | The number of points in the dataset. 93 | edges : np.ndarray 94 | The edges of the graph, flattened into a 1D array. 95 | nb_edges : int 96 | The number of edges in the graph.
97 | seed : None 98 | This parameter is not used in the exact method, but it is kept for 99 | compatibility with the approximate method. 100 | 101 | Returns: 102 | -------- 103 | centers : list 104 | A sorted list of the centers of the clusters. 105 | mds_exec_time : float 106 | The execution time of the MDS algorithm in seconds. 107 | """ 108 | centers, mds_exec_time = py_emos_main( 109 | edges.flatten(), n, nb_edges 110 | ) 111 | centers.sort() 112 | return centers, mds_exec_time -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. _index: 2 | 3 | .. toctree:: 4 | :maxdepth: 2 5 | :hidden: 6 | 7 | installation 8 | details 9 | usage 10 | api 11 | auto_examples/index 12 | 13 | Welcome to Radius Clustering's documentation! 14 | ============================================= 15 | 16 | 17 | The Radius Clustering algorithm is a clustering under radius constraint algorithm. It is based on the minimum dominating set problem (MDS) in graph theory. 18 | 19 | The algorithm is designed such that it can be used to cluster data points based on a radius constraint. The goal is to group data points such that the minimal maximum distance between any two points in the same cluster is less than or equal to a given radius. 20 | 21 | The algorithm is based on the equivalence between the minimum dominating set problem and the clustering under radius constraint problem. The latter problem is characterized by a radius parameter :math:`r` and a set of points :math:`X`. The goal is to find a partition of the points into subsets such that each subset is contained in a ball of radius :math:`r`. Plus, the goal is to minimize the number of subsets. 22 | 23 | This problem is proven to be NP-Hard, and the MDS problem is known to be NP-Hard as well. 24 | 25 | We propose an implementation to tackle this specific problem, based upon the MDS problem. 
The idea is to use the MDS algorithm to find the representative points of each cluster, and then to assign each point to the nearest representative point. 26 | 27 | 28 | .. warning:: Considering the NP-Hardness (or NP-Completeness) of the MDS problem, we alert that the overall complexity of any algorithm tackling this problem cannot be polynomial, unless P=NP. That is why we alert the user that the algorithm may take a long time to run on large datasets, especially when using the exact algorithm. 29 | From the experiments conducted, the exact algorithm is not recommended for datasets with more than 1000 points, but the overall complexity of the datasets and or the internal structure of the data may affect this threshold, in either way. For a more complete insight, we recommand the user to refer to the paper `Clustering under radius constraint using minimum dominating sets `_ or reading the :ref:`details` page of the documentation. 30 | 31 | 32 | 33 | Acknowledgments 34 | =============== 35 | 36 | The authors would like to thank the following people for their work that contributed either directly or indirectly to the development of this algorithm: 37 | 38 | Authors & Contributors 39 | ---------------------- 40 | 41 | **Quentin Haenn**, ISAE-ENSMA, LIAS, France. PhD Student, first author of this work. 42 | 43 | .. note:: 44 | - `GitHub `_ 45 | - `Lab page `_ 46 | 47 | **Brice Chardin**, ISAE-ENSMA, LIAS, France. Associate Professor, co-author of this work. 48 | 49 | .. note:: 50 | - `Lab page `_ 51 | 52 | **Mickaël Baron**, ISAE-ENSMA, LIAS, France. Research Engineer, co-author of this work. 53 | 54 | .. note:: 55 | - `GitHub `_ 56 | - `Lab page `_ 57 | 58 | Principal References 59 | -------------------- 60 | 61 | .. [casado] A. Casado, S. Bermudo, A.D. López-Sánchez, J. 
Sánchez-Oro, 62 | An iterated greedy algorithm for finding the minimum dominating set in graphs, 63 | Mathematics and Computers in Simulation, 64 | Volume 207, 65 | 2023 66 | Code available at https://github.com/AlejandraCasado 67 | 68 | *We rewrote the code in C++ to adapt to the need of python interfacing.* 69 | 70 | .. [jiang] Jiang, Hua and Zheng, Zhifei, "An Exact Algorithm for the Minimum Dominating Set Problem", Proceedings of the Thirty-Second International Joint Conference on Artificial Intelligence, 71 | pages 5604--5612 -in- proceedings of the Thirty-Second International Joint Conference on Artificial Intelligence, IJCAI-23, 2023, doi: 10.24963/ijcai.2023/622. 72 | Code available at https://github.com/huajiang-ynu. 73 | 74 | *We adapted the code to the need of python interfacing.* 75 | 76 | 77 | .. [andersen] Jennie Andersen, Brice Chardin, Mohamed Tribak. "Clustering to the Fewest Clusters Under Intra-Cluster Dissimilarity Constraints". Proceedings of the 33rd IEEE International Conference on Tools with Artificial Intelligence, Nov 2021, Athens, Greece. pp.209-216, https://dx.doi.org/10.1109/ICTAI52525.2021.00036 78 | 79 | .. [bien] Bien, J., & Tibshirani, R. (2011). Hierarchical Clustering with Prototypes via Minimax Linkage. 
http://faculty.marshall.usc.edu/Jacob-Bien/papers/jasa2011minimax.pdf 80 | 81 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["numpy>=2.0","cython >=3.0","setuptools >= 61.0"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "radius-clustering" 7 | dynamic = ["version"] 8 | description = "A Clustering under radius constraints algorithm using minimum dominating sets" 9 | readme = "README.md" 10 | authors = [ 11 | {name = "Quentin Haenn"}, 12 | {name = "Lias Laboratory"} 13 | ] 14 | maintainers = [ 15 | {name = "Quentin Haenn", email = "quentin.haenn.pro@gmail.com"} 16 | ] 17 | 18 | dependencies = [ 19 | "matplotlib>=3.6.2", 20 | "numpy>=2.0", 21 | "scikit-learn>=1.2.2", 22 | "scipy>=1.12.0", 23 | ] 24 | 25 | requires-python = ">=3.9" 26 | license = {file = "LICENSE"} 27 | classifiers=[ 28 | "Development Status :: 5 - Production/Stable", 29 | "Intended Audience :: Science/Research", 30 | "Intended Audience :: Developers", 31 | "License :: OSI Approved :: GNU General Public License v3 (GPLv3)", 32 | "Programming Language :: C", 33 | "Programming Language :: C++", 34 | "Programming Language :: Python", 35 | "Topic :: Software Development", 36 | "Topic :: Scientific/Engineering", 37 | "Operating System :: Microsoft :: Windows", 38 | "Operating System :: POSIX", 39 | "Operating System :: Unix", 40 | "Operating System :: MacOS", 41 | "Programming Language :: Python :: 3", 42 | "Programming Language :: Python :: 3.9", 43 | "Programming Language :: Python :: 3.10", 44 | "Programming Language :: Python :: 3.11", 45 | "Programming Language :: Python :: 3.12", 46 | "Programming Language :: Python :: 3.13", 47 | "Programming Language :: Python :: Implementation :: CPython", 48 | ] 49 | keywords = ["Unsupervised learning", "clustering", "minimum dominating sets","clustering under radius 
constraint"] 50 | 51 | [project.urls] 52 | source = "https://github.com/scikit-learn-contrib/radius_clustering" 53 | tracker = "https://github.com/scikit-learn-contrib/radius_clustering/issues" 54 | documentation = "https://contrib.scikit-learn.org/radius_clustering/" 55 | 56 | [project.optional-dependencies] 57 | dev = [ 58 | "pre-commit>=3.8.0", 59 | "pytest>=8.3.3", 60 | "pytest-cov>=5.0.0", 61 | "pandas", 62 | "cython>=3.0", 63 | "setuptools>= 61.0", 64 | "black>=24.3.0", 65 | "ruff>=0.4.8", 66 | ] 67 | 68 | doc = [ 69 | "pandas", 70 | "networkx>=3.3", 71 | "sphinx>=8.1.3", 72 | "sphinx_gallery>=0.18.0", 73 | "sphinx-copybutton>=0.5.2", 74 | "pydata-sphinx-theme>=0.15.3", 75 | "sphinxcontrib-email>=0.3.6", 76 | "sphinx-remove-toctrees>=1.0.0", 77 | "sphinx-prompt>=1.9.0", 78 | "sphinx_design>=0.6.1", 79 | "sphinxcontrib.sass >= 0.3.4", 80 | ] 81 | 82 | [tool.setuptools] 83 | packages.find = {where = ["src"], include = ["radius_clustering", "radius_clustering.*"]} 84 | dynamic.version = {attr = "radius_clustering.__version__"} 85 | 86 | [tool.pytest.ini_options] 87 | pythonpath = "src" 88 | testpaths = ["tests"] 89 | addopts = [ 90 | "--import-mode=importlib", 91 | "--cov=src/radius_clustering", 92 | "--cov-report=term-missing", 93 | "--cov-report=html:coverage_html_report", 94 | ] 95 | 96 | [tool.coverage.run] 97 | source = ["src/radius_clustering"] 98 | branch = true 99 | 100 | [tool.coverage.report] 101 | show_missing = true 102 | 103 | [tool.coverage.html] 104 | directory = "coverage_html_report" 105 | title = "Coverage Report" 106 | 107 | [tool.ruff] 108 | # Exclude a variety of commonly ignored directories. 109 | exclude = [ 110 | ".git", 111 | ".git-rewrite", 112 | ".pytest_cache", 113 | ".ruff_cache", 114 | ".venv", 115 | ".vscode", 116 | "__pypackages__", 117 | "_build", 118 | "buck-out", 119 | "build", 120 | "dist", 121 | "site-packages", 122 | "venv", 123 | "**.egg-info", 124 | "tests", 125 | "examples", 126 | ] 127 | 128 | # Same as Black. 
129 | line-length = 88 130 | target-version = "py310" 131 | 132 | [tool.ruff.lint] 133 | # Enable Pyflakes (`F`) and a subset of the pycodestyle (`E`) codes by default. 134 | # Unlike Flake8, Ruff doesn't enable pycodestyle warnings (`W`) or 135 | # McCabe complexity (`C901`) by default. 136 | select = ["E", "F", "W", "I"] 137 | ignore = ["E203", "E731", "E741"] 138 | 139 | # Allow fix for all enabled rules (when `--fix`) is provided. 140 | fixable = ["ALL"] 141 | unfixable = [] 142 | 143 | # Allow unused variables when underscore-prefixed. 144 | dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$" 145 | 146 | [tool.ruff.format] 147 | # Like Black, use double quotes for strings. 148 | quote-style = "double" 149 | 150 | # Like Black, indent with spaces, rather than tabs. 151 | indent-style = "space" 152 | 153 | # Like Black, respect magic trailing commas. 154 | skip-magic-trailing-comma = false 155 | 156 | # Like Black, automatically detect the appropriate line ending. 157 | line-ending = "auto" 158 | 159 | # Enable auto-formatting of code examples in docstrings. 160 | docstring-code-format = true 161 | 162 | # Set the line length limit used when formatting code snippets in 163 | # docstrings. 164 | docstring-code-line-length = "dynamic" 165 | 166 | 167 | [tool.cibuildwheel] 168 | # Skip building for PyPy, python 3.6/7/8 and 13t, and 32-bit platforms. 
169 | skip = ["pp*", "cp36-*", "cp37-*", "cp38-*", "*-win32", "*linux_i686", "*musllinux*"] 170 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | 2 | # Contributor Covenant Code of Conduct 3 | 4 | ## Our Pledge 5 | 6 | We as members, contributors, and leaders pledge to make participation in our 7 | community a harassment-free experience for everyone, regardless of age, body 8 | size, visible or invisible disability, ethnicity, sex characteristics, gender 9 | identity and expression, level of experience, education, socio-economic status, 10 | nationality, personal appearance, race, caste, color, religion, or sexual 11 | identity and orientation. 12 | 13 | We pledge to act and interact in ways that contribute to an open, welcoming, 14 | diverse, inclusive, and healthy community. 15 | 16 | ## Our Standards 17 | 18 | Examples of behavior that contributes to a positive environment for our 19 | community include: 20 | 21 | - Demonstrating empathy and kindness toward other people 22 | - Being respectful of differing opinions, viewpoints, and experiences 23 | - Giving and gracefully accepting constructive feedback 24 | - Accepting responsibility and apologizing to those affected by our mistakes, 25 | and learning from the experience 26 | - Focusing on what is best not just for us as individuals, but for the overall 27 | community 28 | 29 | Examples of unacceptable behavior include: 30 | 31 | - The use of sexualized language or imagery, and sexual attention or advances of 32 | any kind 33 | - Trolling, insulting or derogatory comments, and personal or political attacks 34 | - Public or private harassment 35 | - Publishing others' private information, such as a physical or email address, 36 | without their explicit permission 37 | - Other conduct which could reasonably be considered inappropriate in a 38 | professional setting 39 | 40 | 
## Enforcement Responsibilities 41 | 42 | Community leaders are responsible for clarifying and enforcing our standards of 43 | acceptable behavior and will take appropriate and fair corrective action in 44 | response to any behavior that they deem inappropriate, threatening, offensive, 45 | or harmful. 46 | 47 | Community leaders have the right and responsibility to remove, edit, or reject 48 | comments, commits, code, wiki edits, issues, and other contributions that are 49 | not aligned to this Code of Conduct, and will communicate reasons for moderation 50 | decisions when appropriate. 51 | 52 | ## Scope 53 | 54 | This Code of Conduct applies within all community spaces, and also applies when 55 | an individual is officially representing the community in public spaces. 56 | Examples of representing our community include using an official email address, 57 | posting via an official social media account, or acting as an appointed 58 | representative at an online or offline event. 59 | 60 | ## Enforcement 61 | 62 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 63 | reported to the community leaders responsible for enforcement : 64 | [Send Report](mailto:quentin.haenn.pro@gmail.com). 65 | All complaints will be reviewed and investigated promptly and fairly. 66 | 67 | All community leaders are obligated to respect the privacy and security of the 68 | reporter of any incident. 69 | 70 | ## Enforcement Guidelines 71 | 72 | Community leaders will follow these Community Impact Guidelines in determining 73 | the consequences for any action they deem in violation of this Code of Conduct: 74 | 75 | ### 1. Correction 76 | 77 | **Community Impact**: Use of inappropriate language or other behavior deemed 78 | unprofessional or unwelcome in the community. 79 | 80 | **Consequence**: A private, written warning from community leaders, providing 81 | clarity around the nature of the violation and an explanation of why the 82 | behavior was inappropriate. 
A public apology may be requested. 83 | 84 | ### 2. Warning 85 | 86 | **Community Impact**: A violation through a single incident or series of 87 | actions. 88 | 89 | **Consequence**: A warning with consequences for continued behavior. No 90 | interaction with the people involved, including unsolicited interaction with 91 | those enforcing the Code of Conduct, for a specified period of time. This 92 | includes avoiding interactions in community spaces as well as external channels 93 | like social media. Violating these terms may lead to a temporary or permanent 94 | ban. 95 | 96 | ### 3. Temporary Ban 97 | 98 | **Community Impact**: A serious violation of community standards, including 99 | sustained inappropriate behavior. 100 | 101 | **Consequence**: A temporary ban from any sort of interaction or public 102 | communication with the community for a specified period of time. No public or 103 | private interaction with the people involved, including unsolicited interaction 104 | with those enforcing the Code of Conduct, is allowed during this period. 105 | Violating these terms may lead to a permanent ban. 106 | 107 | ### 4. Permanent Ban 108 | 109 | **Community Impact**: Demonstrating a pattern of violation of community 110 | standards, including sustained inappropriate behavior, harassment of an 111 | individual, or aggression toward or disparagement of classes of individuals. 112 | 113 | **Consequence**: A permanent ban from any sort of public interaction within the 114 | community. 115 | 116 | ## Attribution 117 | 118 | This Code of Conduct is adapted from the 119 | [Contributor Covenant](https://www.contributor-covenant.org/), version 2.1, 120 | available at 121 | . 122 | 123 | Community Impact Guidelines were inspired by 124 | [Mozilla's code of conduct enforcement ladder](https://github.com/mozilla/inclusion). 125 | 126 | For answers to common questions about this code of conduct, see the FAQ at 127 | . Translations are available at 128 | . 
129 | -------------------------------------------------------------------------------- /EXPERIMENTS.md: -------------------------------------------------------------------------------- 1 | ### Experimental results 2 | 3 | The Radius Clustering package provides two algorithms to solve the MDS problem: an exact algorithm and an approximate algorithm. The approximate algorithm is based on a heuristic that iteratively selects the vertex that dominates the most vertices in the graph. The exact algorithm is based on a branch-and-bound algorithm that finds the minimum dominating set in the graph. Experimentation has been conducted on real-world datasets to compare the performances of these two algorithms, and compare them to state-of-the-art clustering algorithms. The complete results are available in the paper [Clustering under radius constraint using minimum dominating sets](https://hal.science/hal-04533921/). 4 | 5 | The algorithms selected for comparison are: 6 | 7 | 1. Equiwide clustering (EQW-LP), a state-of-the-art exact algorithm using LP formulation of the problem [[3]](https://hal.science/hal-03356000) 8 | 2. ProtoClust [[4](http://faculty.marshall.usc.edu/Jacob-Bien/papers/jasa2011minimax.pdf)] 9 | 10 | Here are some key results from the experiments: 11 | 12 | Table 1: Average running time (in seconds) of the algorithms on real-world datasets. 
13 | 14 | | **Dataset** | **MDS-APPROX** | **MDS-EXACT** | **EQW-LP** | **PROTOCLUST** | 15 | |--------------------------|----------------|---------------|--------------|----------------| 16 | | **Iris** | 0.062 ± 0.01 | 0.009 ± 0.00 | 0.018 ± 0.01 | 0.026 ± 0.00 | 17 | | **Wine** | 0.029 ± 0.00 | 0.010 ± 0.00 | 0.014 ± 0.00 | 0.034 ± 0.00 | 18 | | **Glass Identification** | 0.015 ± 0.00 | 0.020 ± 0.00 | 0.026 ± 0.00 | 0.046 ± 0.00 | 19 | | **Ionosphere** | 0.078 ± 0.01 | 2.640 ± 0.05 | 0.104 ± 0.00 | 0.120 ± 0.00 | 20 | | **WDBC** | 0.315 ± 0.01 | 0.138 ± 0.00 | 0.197 ± 0.01 | 0.402 ± 0.00 | 21 | | **Synthetic Control** | 0.350 ± 0.03 | 0.036 ± 0.00 | 0.143 ± 0.01 | 0.489 ± 0.00 | 22 | | **Vehicle** | 0.955 ± 0.04 | 0.185 ± 0.00 | 0.526 ± 0.01 | 0.830 ± 0.01 | 23 | | **Yeast** | 2.361 ± 0.03 | 738.8 ± 0.30 | 6.718 ± 0.02 | 2.374 ± 0.08 | 24 | | **Ozone** | 49.82 ± 1.18 | 1447 ± 0.54 | 26.86 ± 0.63 | 15.32 ± 0.15 | 25 | | **Waveform** | 48.01 ± 0.39 | 8813 ± 57.80 | 233.9 ± 1.45 | 61.27 ± 0.08 | 26 | 27 | Table 2: Number of clusters obtained on real-world datasets. 28 | 29 | | **Dataset** | **MDS-APPROX** | **MDS-EXACT** | **EQW-LP** | **PROTOCLUST** | 30 | |--------------------------|----------------|---------------|------------|----------------| 31 | | **Iris** | 3 | 3 | 3 | 4 | 32 | | **Wine** | 4 | 3 | 3 | 4 | 33 | | **Glass Identification** | 7 | 6 | 6 | 7 | 34 | | **Ionosphere** | 2 | 2 | 2 | 5 | 35 | | **WDBC** | 2 | 2 | 2 | 3 | 36 | | **Synthetic Control** | 8 | 6 | 6 | 8 | 37 | | **Vehicle** | 5 | 4 | 4 | 6 | 38 | | **Yeast** | 10 | 10 | 10 | 13 | 39 | | **Ozone** | 3 | 2 | 2 | 3 | 40 | | **Waveform** | 3 | 3 | 3 | 6 | 41 | 42 | 43 | Table 3: Compactness of the clusters (maximal radius obtained after clustering) obtained on real-world datasets. 
44 | 45 | | **Dataset** | **MDS-APPROX** | **MDS-EXACT** | **EQW-LP** | **PROTOCLUST** | 46 | |--------------------------|----------------|---------------|------------|----------------| 47 | | **Iris** | 1.43 | 1.43 | 1.43 | 1.24 | 48 | | **Wine** | 220.05 | 232.08 | 232.08 | 181.35 | 49 | | **Glass Identification** | 3.94 | 3.94 | 3.94 | 3.31 | 50 | | **Ionosphere** | 4.45 | 5.45 | 5.45 | 5.35 | 51 | | **WDBC** | 1197.42 | 1197.42 | 1197.42 | 907.10 | 52 | | **Synthetic Control** | 66.59 | 70.11 | 70.11 | 68.27 | 53 | | **Vehicle** | 150.87 | 155.05 | 155.05 | 120.97 | 54 | | **Yeast** | 0.42 | 0.42 | 0.42 | 0.42 | 55 | | **Ozone** | 235.77 | 245.58 | 245.58 | 194.89 | 56 | | **Waveform** | 10.73 | 10.73 | 10.73 | 10.47 | 57 | 58 | 59 | #### Key insights: 60 | 61 | - The approximate algorithm is significantly faster than the exact algorithm, but it may not always provide the optimal solution. 62 | - The exact algorithm is slower but provides the optimal solution. It does not scale well to large datasets, due to the NP-Hard nature of the problem. 63 | - The approximate algorithm is a good trade-off between speed and accuracy for most datasets. 64 | - The MDS-based approaches are both more accurate than Protoclust. However, Protoclust is remarkably faster on most datasets. 65 | 66 | 67 | > :memo: **Note**: The results show that MDS-based clustering algorithms might be a good alternative to state-of-the-art clustering algorithms for clustering under radius constraint problems. 68 | 69 | > :memo: **Note**: Since the publication of the paper, the Radius Clustering package has been improved and optimized. The results presented here are based on the initial version of the package. For the latest results, please refer to the documentation or the source code. 
70 | 71 | 72 | ## References 73 | 74 | - [3] [Clustering to the fewest clusters under intra-cluster dissimilarity constraints](https://hal.science/hal-03356000) 75 | - [4] [Hierarchical Clustering with prototypes via Minimax Linkage](http://faculty.marshall.usc.edu/Jacob-Bien/papers/jasa2011minimax.pdf) 76 | 77 | -------------------------------------------------------------------------------- /docs/source/details.rst: -------------------------------------------------------------------------------- 1 | .. _details: 2 | 3 | How it works 4 | ============ 5 | 6 | This page of the documentation is dedicated to explain the theory behind the algorithm, how it is built and present you 7 | some key results obtained from experiments conducted on real-world datasets. 8 | 9 | First, we'll detail the problem of clustering under radius constraint, then we'll explain the Minimum Dominating Set (MDS) problem and how it is adapted to the clustering problem. Finally, we'll present some key results obtained from experiments conducted on real-world datasets. 10 | 11 | 12 | Clustering under radius constraint 13 | ---------------------------------- 14 | 15 | Clustering tasks are globally concerned about grouping data points into clusters based on some similarity measure. 16 | Clustering under radius constraints is a specific clustering task where the goal is to group data points such that the 17 | minimal maximum distance between any two points in the same cluster is less than or equal to a given radius. 18 | Mathematically, given a set of data points :math:`X = \{x_1, x_2, \ldots, x_n\}` and a radius :math:`r`, 19 | the goal is to find a partition :math:`\mathcal{P}` of :math:`X` into clusters :math:`C_1, C_2, \ldots, C_k` such that : 20 | :math:`\forall C \in \mathcal{P}, \min_{x_i \in C}\max_{x_j \in C}\ d_{ij} \leq r` 21 | where :math:`d_{ij} = d(x_i, x_j)` is the dissimilarity between :math:`x_i` and :math:`x_j`. 
22 | 23 | 24 | Minimum Dominating Set (MDS) problem 25 | ------------------------------------ 26 | 27 | The Radius Clustering package implements a clustering algorithm based on the Minimum Dominating Set (MDS) problem. 28 | The MDS problem is a well-known NP-Hard problem in graph theory, and it has been proven to be linked to the clustering 29 | under radius constraint problem. The MDS problem is defined as follows: 30 | 31 | Given an undirected weighted graph :math:`G = (V,E)` where :math:`V` is a set of vertices and :math:`E` is a set of edges, 32 | a dominating set :math:`D` is a subset of :math:`V` such that every vertex in :math:`V` is either in :math:`D` or 33 | adjacent to a vertex in :math:`D`. The goal is to find a dominating set :math:`D` such that the number of vertices in 34 | :math:`D` is minimized. This problem is known to be NP-Hard. 35 | 36 | However, solving this problem in the context of clustering task can be useful. But it has to be adapted to the needs of a clustering task. 37 | 38 | Presenting the algorithm 39 | ------------------------ 40 | 41 | To adapt the MDS problem to the clustering under radius constraint problem, we need to define a graph based on the data points. The vertices of the graph are the data points, and the edges are defined based on the distance between the data points. The weight of the edges is the dissimilarity between the data points. Then, the algorithm operates as follows: 42 | 43 | 1. Construct a graph :math:`G = (V,E)` based on the data points :math:`X`. 44 | 2. Prune the graph by removing the edges :math:`e_{ij}` such that :math:`d(x_i,x_j) > r`. 45 | 3. Solve the MDS problem on the pruned graph. 46 | 4. Assign each vertex to the closest vertex in the dominating set. In case of a tie, assign the vertex to the vertex with the smallest index. 47 | 5. Return the cluster labels. 
48 | 49 | Experimental results 50 | -------------------- 51 | 52 | The Radius Clustering package provides two algorithms to solve the MDS problem: an exact algorithm and an approximate algorithm. 53 | The approximate algorithm [casado]_ is based on a heuristic that iteratively selects the vertex that dominates the most vertices 54 | in the graph. The exact algorithm [jiang]_ is based on a branch-and-bound algorithm that finds the minimum dominating set in the graph. 55 | Experimentation has been conducted on real-world datasets to compare the performances of these two algorithms, 56 | and compare them to state-of-the-art clustering algorithms. The complete results from first experiments are available in the paper 57 | `Clustering under radius constraint using minimum dominating sets `_. 58 | 59 | The algorithms selected for comparison are: 60 | 61 | 1. Equiwide clustering (EQW-LP), a state-of-the-art exact algorithm using LP formulation of the problem ([andersen]_) 62 | 2. Protoclust ([bien]_), a state-of-the-art approximate algorithm based on the hierarchical agglomerative clustering using MinMax linkage. 63 | 64 | Here are some key results from the experiments: 65 | 66 | .. csv-table:: Number of clusters obtained on real-world datasets. 67 | :header: Dataset, MDS-APPROX, MDS-EXACT, EQW-LP, PROTOCLUST 68 | :widths: 20, 20, 20, 20, 20 69 | 70 | Iris,3,3,3,4 71 | Wine,4,3,3,4 72 | Glass Identification,7,6,6,7 73 | Ionosphere,2,2,2,5 74 | WDBC,2,2,2,3 75 | Synthetic Control,8,6,6,8 76 | Vehicle,5,4,4,6 77 | Yeast,10,10,10,13 78 | Ozone,3,2,2,3 79 | Waveform,3,3,3,6 80 | 81 | 82 | .. csv-table:: Compactness of the clusters (maximal radius obtained after clustering) obtained on real-world datasets. 
83 | :header: Dataset, MDS-APPROX, MDS-EXACT, EQW-LP, PROTOCLUST 84 | :widths: 20, 20, 20, 20, 20 85 | 86 | Iris,1.43,1.43,1.43,1.24 87 | Wine,220.05,232.08,232.08,181.35 88 | Glass Identification,3.94,3.94,3.94,3.31 89 | Ionosphere,4.45,5.45,5.45,5.35 90 | WDBC,1197.42,1197.42,1197.42,907.10 91 | Synthetic Control,66.59,70.11,70.11,68.27 92 | Vehicle,150.87,155.05,155.05,120.97 93 | Yeast,0.42,0.42,0.42,0.42 94 | Ozone,235.77,245.58,245.58,194.89 95 | Waveform,10.73,10.73,10.73,10.47 96 | 97 | 98 | .. image:: ./images/exec_time.png 99 | :width: 800 100 | :align: center 101 | 102 | .. image:: ./images/exec_time_optimized.png 103 | :width: 800 104 | :align: center 105 | 106 | 107 | 108 | Key insights: 109 | +++++++++++++ 110 | 111 | - The approximate algorithm is significantly faster than the exact algorithm, but it may not always provide the optimal solution. 112 | - The exact algorithm is slower but provides the optimal solution. It does not scale well to large datasets, due to the NP-Hard nature of the problem. 113 | - The approximate algorithm is a good trade-off between speed and accuracy for most datasets. 114 | - The MDS-based approaches are both more accurate than Protoclust. However, Protoclust is remarkably faster on most datasets. 115 | 116 | 117 | .. note:: The results show that MDS-based clustering algorithms might be a good alternative to state-of-the-art clustering algorithms for clustering under radius constraint problems. 118 | 119 | .. note:: Since the publication of the paper, the Radius Clustering package has been improved and optimized. The results presented here are based on the initial version of the package. For the latest results, please refer to the documentation or the source code. 
120 | 121 | 122 | -------------------------------------------------------------------------------- /docs/source/usage.rst: -------------------------------------------------------------------------------- 1 | Usage 2 | ===== 3 | 4 | This page provides a quick guide on how to use the `radius_clustering` package for clustering tasks. The package provides a simple interface for performing radius-based clustering on datasets based on the Minimum Dominating Set (MDS) algorithm. 5 | 6 | This page is divided into three main sections: 7 | 1. **Basic Usage**: A quick example of how to use the `RadiusClustering` class and perform clustering with several parameters. 8 | 2. **Custom Dissimilarity Function**: How to use a custom dissimilarity function with the `RadiusClustering` class. 9 | 3. **Custom MDS Solver**: How to implement a custom MDS solver for more advanced clustering tasks, eventually with less guarantees on the results. 10 | 11 | 12 | Basic Usage 13 | ----------------- 14 | 15 | The `RadiusClustering` class provides a straightforward way to perform clustering based on a specified radius. You can choose between an approximate or exact method for clustering, depending on your needs. 16 | 17 | Here's a basic example of how to use Radius Clustering with the `RadiusClustering` class, using the approximate method: 18 | 19 | .. code-block:: python 20 | 21 | from radius_clustering import RadiusClustering 22 | import numpy as np 23 | 24 | # Generate random data 25 | X = np.random.rand(100, 2) 26 | 27 | # Create an instance of MdsClustering 28 | rad = RadiusClustering(manner="approx", radius=0.5) 29 | # Attention: the 'threshold' parameter is deprecated by version 1.3.0 30 | # and will be removed in a future version. Use 'radius' instead. 
31 | 32 | # Fit the model to the data 33 | rad.fit(X) 34 | 35 | # Get cluster labels 36 | labels = rad.labels_ 37 | 38 | print(labels) 39 | 40 | Similarly, you can use the exact method by changing the `manner` parameter to `"exact"`: 41 | .. code-block:: python 42 | # [...] Exact same code as above 43 | rad = RadiusClustering(manner="exact", radius=0.5) # change this parameter 44 | # [...] Exact same code as above 45 | 46 | Custom Dissimilarity Function 47 | ----------------------------- 48 | 49 | The main reason behind the `radius_clustering` package is that users eventually need to use a dissimilarity function that is not a metric (or distance) function. Plus, sometimes context requires a domain-specific dissimilarity function that is not provided by default, and needs to be implemented by the user. 50 | 51 | To use a custom dissimilarity function, you can pass it as a parameter to the `RadiusClustering` class. Here's an example of how to do this: 52 | .. code-block:: python 53 | 54 | from radius_clustering import RadiusClustering 55 | import numpy as np 56 | 57 | # Generate random data 58 | X = np.random.rand(100, 2) 59 | 60 | # Define a custom dissimilarity function 61 | def dummy_dissimilarity(x, y): 62 | return np.linalg.norm(x - y) + 0.1 # Example: add a constant to the distance 63 | 64 | # Create an instance of MdsClustering with the custom dissimilarity function 65 | rad = RadiusClustering(manner="approx", radius=0.5, metric=dummy_dissimilarity) 66 | 67 | # Fit the model to the data 68 | rad.fit(X) 69 | 70 | # Get cluster labels 71 | labels = rad.labels_ 72 | 73 | print(labels) 74 | 75 | 76 | .. note:: 77 | The custom dissimilarity function will be passed to scikit-learn's `pairwise_distances` function, so it should be compatible with the expected input format and return type. See the scikit-learn documentation for more details on how to implement custom metrics. 
78 | 79 | Custom MDS Solver 80 | ----------------- 81 | 82 | The two default solvers provided by the actual implementation of the `radius_clustering` package are focused on exactness (or proximity to exactness) of the results of a NP-hard problem. So, they may not be suitable for all use cases, especially when performance is a concern. 83 | If you have your own implementation of a Minimum Dominating Set (MDS) solver, you can use it with the `RadiusClustering` class by using the :py:func:`RadiusClustering.set_solver` method. It will check that the solver is compatible with the expected input format and return type, and will use it to perform clustering. 84 | 85 | .. versionadded:: 1.4.0 86 | The :py:func:`RadiusClustering.set_solver` method was added to allow users to set a custom MDS solver. 87 | It is *NOT* backward compatible with previous versions of the package, as it comes with new structure and methods to handle custom solvers. 88 | 89 | Here's an example of how to implement a custom MDS solver and use it with the `RadiusClustering` class, using the NetworkX implementation of the dominating set problem: 90 | 91 | .. code-block:: python 92 | 93 | from radius_clustering import RadiusClustering 94 | import time 95 | import numpy as np 96 | import networkx as nx 97 | 98 | # Generate random data 99 | X = np.random.rand(100, 2) 100 | 101 | # Define a custom MDS solver using NetworkX 102 | def custom_mds_solver(n, edges, nb_edges, random_state=None): 103 | start = time.time() 104 | graph = nx.Graph(edges) 105 | centers = list(nx.algorithms.dominating_set(graph)) 106 | centers.sort() 107 | end = time.time() 108 | return centers, end - start 109 | 110 | # Create an instance of MdsClustering with the custom MDS solver 111 | rad = RadiusClustering(manner="approx", radius=0.5) 112 | rad.set_solver(custom_mds_solver) 113 | 114 | # Fit the model to the data 115 | rad.fit(X) 116 | 117 | # Get cluster labels 118 | labels = rad.labels_ 119 | 120 | print(labels) 121 | 122 | ..
note:: 123 | The custom MDS solver should accept the same parameters as the default solvers, including the number of points `n`, the edges of the graph `edges`, the number of edges `nb_edges`, and an optional `random_state` parameter for reproducibility. It should return a list of centers and the time taken to compute them. 124 | The `set_solver` method will check that the custom solver is compatible with the expected input format and return type, and will use it to perform clustering. 125 | If the custom solver is not compatible, it will raise a `ValueError` with a descriptive message. 126 | 127 | .. attention:: 128 | We cannot guarantee that the custom MDS solver will produce the same results as the default solvers, especially if it is not purposely designed to solve the Minimum Dominating Set problem but rather just finds a dominating set. The results may vary depending on the implementation and the specific characteristics of the dataset. 129 | As an example, a benchmark of our solutions and a custom one using NetworkX is available in the `Example Gallery` section of the documentation, which shows that the custom solver may produce different results than the default solvers, especially in terms of the number of clusters and the time taken to compute them (see :ref:`sphx_glr_auto_examples_plot_benchmark_custom.py`). 130 | However, it can be useful for specific use cases where performance is a concern or when you have a custom implementation that fits your needs better. 131 | 132 | -------------------------------------------------------------------------------- /tests/test_unit.py: -------------------------------------------------------------------------------- 1 | from radius_clustering import RadiusClustering 2 | import pytest 3 | import numpy as np 4 | 5 | def test_symmetric(): 6 | """ 7 | Test that the RadiusClustering class can handle symmetric distance matrices. 
8 | """ 9 | 10 | # Check 1D array input 11 | 12 | X = np.array([0,1]) 13 | with pytest.raises(ValueError): 14 | RadiusClustering(manner="exact", radius=1.5)._check_symmetric(X) 15 | 16 | # Check a symmetric distance matrix 17 | X = np.array([[0, 1, 2], 18 | [1, 0, 1], 19 | [2, 1, 0]]) 20 | 21 | clustering = RadiusClustering(manner="exact", radius=1.5) 22 | assert clustering._check_symmetric(X), "The matrix should be symmetric." 23 | 24 | # Check a non-symmetric distance matrix 25 | X_assym = np.array([[0, 1, 2], 26 | [1, 0, 1], 27 | [2, 2, 3]]) # This is not symmetric 28 | assert not clustering._check_symmetric(X_assym), "The matrix should not be symmetric." 29 | 30 | # check a non-square matrix 31 | X_non_square = np.array([[0, 1], 32 | [1, 0], 33 | [2, 1]]) # This is not square 34 | 35 | assert not clustering._check_symmetric(X_non_square), "The matrix should not be symmetric." 36 | 37 | 38 | def test_fit_distance_matrix(): 39 | """ 40 | Test that the RadiusClustering class can fit to a distance matrix. 41 | This test checks both the exact and approximate methods of clustering. 42 | """ 43 | 44 | # Create a symmetric distance matrix 45 | X = np.array([[0, 1, 2], 46 | [1, 0, 1], 47 | [2, 1, 0]]) 48 | 49 | clustering = RadiusClustering(manner="exact", radius=1.5) 50 | clustering.fit(X) 51 | 52 | # Check that the labels are assigned correctly 53 | assert len(clustering.labels_) == X.shape[0], "Labels length should match number of samples." 54 | assert clustering.nb_edges_ > 0, "There should be edges in the graph." 55 | assert np.array_equal(clustering.X_checked_, clustering.dist_mat_), "X_checked_ should be equal to dist_mat_ because X is a distance matrix." 56 | 57 | @pytest.mark.parametrize( 58 | "test_data", [ 59 | ("euclidean",1.5), 60 | ("manhattan", 2.1), 61 | ("cosine", 1.0) 62 | ] 63 | ) 64 | def test_fit_features(test_data): 65 | """ 66 | Test that the RadiusClustering class can fit to feature data. 
67 | This test checks both the exact and approximate methods of clustering 68 | and multiple metrics methods. 69 | """ 70 | # Create a feature matrix 71 | X_features = np.array([[0, 1], 72 | [1, 0], 73 | [2, 1]]) 74 | metric, radius = test_data 75 | 76 | clustering = RadiusClustering(manner="approx", radius=radius) 77 | clustering.fit(X_features, metric=metric) 78 | # Check that the labels are assigned correctly 79 | assert len(clustering.labels_) == X_features.shape[0], "Labels length should match number of samples." 80 | assert clustering.nb_edges_ > 0, "There should be edges in the graph." 81 | assert clustering._check_symmetric(clustering.dist_mat_), "Distance matrix should be symmetric after computed from features." 82 | 83 | def test_radius_clustering_invalid_manner(): 84 | """ 85 | Test that an error is raised when an invalid manner is provided. 86 | """ 87 | with pytest.raises(ValueError): 88 | RadiusClustering(manner="invalid", radius=1.43).fit([[0, 1], [1, 0], [2, 1]]) 89 | 90 | with pytest.raises(ValueError): 91 | RadiusClustering(manner="", radius=1.43).fit([[0, 1], [1, 0], [2, 1]]) 92 | 93 | 94 | def test_radius_clustering_invalid_radius(): 95 | """ 96 | Test that an error is raised when an invalid radius is provided. 97 | """ 98 | with pytest.raises(ValueError, match="Radius must be a positive float."): 99 | RadiusClustering(manner="exact", radius=-1.0).fit([[0, 1], [1, 0], [2, 1]]) 100 | 101 | with pytest.raises(ValueError, match="Radius must be a positive float."): 102 | RadiusClustering(manner="approx", radius=0.0).fit([[0, 1], [1, 0], [2, 1]]) 103 | 104 | with pytest.raises(ValueError, match="Radius must be a positive float."): 105 | RadiusClustering(manner="exact", radius="invalid").fit([[0, 1], [1, 0], [2, 1]]) 106 | 107 | def test_radius_clustering_fit_without_data(): 108 | """ 109 | Test that an error is raised when fitting without data. 
110 | """ 111 | clustering = RadiusClustering(manner="exact", radius=1.5) 112 | with pytest.raises(ValueError): 113 | clustering.fit(None) 114 | 115 | def test_radius_clustering_new_clusterer(): 116 | """ 117 | Test that a custom clusterer can be set within the RadiusClustering class. 118 | """ 119 | def custom_clusterer(n, edges, nb_edges, random_state=None): 120 | # A mock custom clusterer that returns a fixed set of centers 121 | # and a fixed execution time 122 | return [0, 1], 0.1 123 | clustering = RadiusClustering(manner="exact", radius=1.5) 124 | # Set the custom clusterer 125 | assert hasattr(clustering, 'set_solver'), "RadiusClustering should have a set_solver method." 126 | assert callable(clustering.set_solver), "set_solver should be callable." 127 | clustering.set_solver(custom_clusterer) 128 | # Fit the clustering with the custom clusterer 129 | X = np.array([[0, 1], 130 | [1, 0], 131 | [2, 1]]) 132 | clustering.fit(X) 133 | assert clustering.clusterer_ == custom_clusterer, "The custom clusterer should be set correctly." 134 | # Check that the labels are assigned correctly 135 | assert len(clustering.labels_) == X.shape[0], "Labels length should match number of samples." 136 | assert clustering.nb_edges_ > 0, "There should be edges in the graph." 137 | assert clustering.centers_ == [0, 1], "The centers should match the custom clusterer's output." 138 | assert clustering.mds_exec_time_ == 0.1, "The MDS execution time should match the custom clusterer's output." 139 | 140 | def test_invalid_clusterer(): 141 | """ 142 | Test that an error is raised when an invalid clusterer is set. 
143 | """ 144 | clustering = RadiusClustering(manner="exact", radius=1.5) 145 | with pytest.raises(ValueError, match="The provided solver must be callable."): 146 | clustering.set_solver("not_a_callable") 147 | 148 | with pytest.raises(ValueError, match="The provided solver must be callable."): 149 | clustering.set_solver(12345) # Not a callable 150 | with pytest.raises(ValueError, match="The provided solver must be callable."): 151 | clustering.set_solver(None) 152 | 153 | def invalid_signature(): 154 | return [0, 1], 0.1 155 | 156 | with pytest.raises(ValueError): 157 | clustering.set_solver(invalid_signature) 158 | def invalid_clusterer(n, edges, nb_edges): 159 | return [0, 1], 0.1 160 | with pytest.raises(ValueError): 161 | clustering.set_solver(invalid_clusterer) -------------------------------------------------------------------------------- /tests/test_integration.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from radius_clustering import RadiusClustering 4 | from sklearn import datasets 5 | 6 | X = datasets.fetch_openml(name="iris", version=1, parser="auto")["data"] 7 | 8 | def test_radius_clustering_approx(): 9 | """ 10 | Test the approximate method of the RadiusClustering class. 11 | """ 12 | clusterer = RadiusClustering(manner="approx", radius=1.43) 13 | 14 | assert clusterer.manner == "approx", "The manner should be 'approx'." 15 | assert clusterer.radius == 1.43, "The radius should be 1.43." 16 | assert clusterer.random_state is None, "The random state should be None by default." 17 | assert clusterer._estimator_type == "clusterer", "The estimator type should be 'clusterer'." 18 | assert clusterer._check_symmetric(X) is False, "The input should not be a symmetric distance matrix." 19 | 20 | clusterer.fit(X) 21 | 22 | assert clusterer.X_checked_ is not None, "X_checked_ should not be None after fitting." 
23 | assert clusterer.dist_mat_ is not None, "dist_mat_ should not be None after fitting." 24 | assert clusterer.nb_edges_ > 0, "There should be edges in the graph." 25 | assert clusterer.labels_ is not None, "Labels should not be None after fitting." 26 | assert clusterer.centers_ is not None, "Centers should not be None after fitting." 27 | assert clusterer.effective_radius_ > 0, "Effective radius should be greater than 0." 28 | assert clusterer.mds_exec_time_ >= 0, "MDS execution time should be non-negative." 29 | assert clusterer.edges_ is not None, "Edges should not be None after fitting." 30 | assert clusterer.random_state == 42, "Random state should be set to 42 after fitting." 31 | 32 | results = clusterer.labels_ 33 | assert len(results) == X.shape[0], "The number of labels should match the number of samples." 34 | assert len(set(results)) <= X.shape[0], "The number of unique labels should not exceed the number of samples." 35 | 36 | 37 | def test_radius_clustering_exact(): 38 | """ 39 | Test the exact method of the RadiusClustering class. 40 | """ 41 | clusterer = RadiusClustering(manner="exact", radius=1.43) 42 | 43 | assert clusterer.manner == "exact", "The manner should be 'exact'." 44 | assert clusterer.radius == 1.43, "The radius should be 1.43." 45 | assert clusterer.random_state is None, "The random state should be None by default." 46 | assert clusterer._estimator_type == "clusterer", "The estimator type should be 'clusterer'." 47 | assert clusterer._check_symmetric(X) is False, "The input should not be a symmetric distance matrix." 48 | 49 | clusterer.fit(X) 50 | 51 | assert clusterer.X_checked_ is not None, "X_checked_ should not be None after fitting." 52 | assert clusterer.dist_mat_ is not None, "dist_mat_ should not be None after fitting." 53 | assert clusterer.nb_edges_ > 0, "There should be edges in the graph." 54 | assert clusterer.labels_ is not None, "Labels should not be None after fitting." 
55 | assert clusterer.centers_ is not None, "Centers should not be None after fitting." 56 | assert clusterer.effective_radius_ > 0, "Effective radius should be greater than 0." 57 | assert clusterer.mds_exec_time_ >= 0, "MDS execution time should be non-negative." 58 | assert clusterer.edges_ is not None, "Edges should not be None after fitting." 59 | assert clusterer.random_state is None, "Random state should remain None." 60 | 61 | results = clusterer.labels_ 62 | assert len(results) == X.shape[0], "The number of labels should match the number of samples." 63 | assert len(set(results)) <= X.shape[0], "The number of unique labels should not exceed the number of samples." 64 | 65 | def test_radius_clustering_fit_predict(): 66 | """ 67 | Test the fit_predict method of the RadiusClustering class. 68 | """ 69 | clusterer = RadiusClustering(manner="approx", radius=1.43) 70 | 71 | assert clusterer.manner == "approx", "The manner should be 'approx'." 72 | assert clusterer.radius == 1.43, "The radius should be 1.43." 73 | assert clusterer.random_state is None, "The random state should be None by default." 74 | assert clusterer._estimator_type == "clusterer", "The estimator type should be 'clusterer'." 75 | 76 | labels = clusterer.fit_predict(X) 77 | 78 | assert labels is not None, "Labels should not be None after fit_predict." 79 | assert len(labels) == X.shape[0], "The number of labels should match the number of samples." 80 | assert len(set(labels)) <= X.shape[0], "The number of unique labels should not exceed the number of samples." 81 | 82 | def test_radius_clustering_fit_predict_exact(): 83 | """ 84 | Test the fit_predict method of the RadiusClustering class with exact method. 85 | """ 86 | clusterer = RadiusClustering(manner="exact", radius=1.43) 87 | 88 | assert clusterer.manner == "exact", "The manner should be 'exact'." 89 | assert clusterer.radius == 1.43, "The radius should be 1.43." 
90 | assert clusterer.random_state is None, "The random state should be None by default." 91 | assert clusterer._estimator_type == "clusterer", "The estimator type should be 'clusterer'." 92 | 93 | labels = clusterer.fit_predict(X) 94 | 95 | assert labels is not None, "Labels should not be None after fit_predict." 96 | assert len(labels) == X.shape[0], "The number of labels should match the number of samples." 97 | assert len(set(labels)) <= X.shape[0], "The number of unique labels should not exceed the number of samples." 98 | 99 | def test_radius_clustering_random_state(): 100 | """ 101 | Test the random state functionality of the RadiusClustering class. 102 | """ 103 | clusterer = RadiusClustering(manner="approx", radius=1.43, random_state=123) 104 | 105 | assert clusterer.random_state == 123, "The random state should be set to 123." 106 | 107 | # Fit the model 108 | clusterer.fit(X) 109 | 110 | # Check that the random state is preserved 111 | assert clusterer.random_state == 123, "The random state should remain 123 after fitting." 112 | 113 | # Check that the results are consistent with the random state 114 | labels1 = clusterer.labels_ 115 | 116 | # Re-initialize and fit again with the same random state 117 | clusterer2 = RadiusClustering(manner="approx", radius=1.43, random_state=123) 118 | clusterer2.fit(X) 119 | 120 | labels2 = clusterer2.labels_ 121 | 122 | assert (labels1 == labels2).all(), "Labels should be consistent across runs with the same random state." 123 | 124 | def test_deterministic_behavior(): 125 | """ 126 | Test the deterministic behavior of the RadiusClustering class with a fixed random state. 
127 | """ 128 | clusterer1 = RadiusClustering(manner="approx", radius=1.43, random_state=42) 129 | clusterer2 = RadiusClustering(manner="approx", radius=1.43, random_state=42) 130 | 131 | labels1 = clusterer1.fit_predict(X) 132 | labels2 = clusterer2.fit_predict(X) 133 | 134 | assert (labels1 == labels2).all(), "Labels should be the same for two instances with the same random state." 135 | 136 | clusterer1 = RadiusClustering(manner="exact", radius=1.43) 137 | clusterer2 = RadiusClustering(manner="exact", radius=1.43) 138 | labels1 = clusterer1.fit_predict(X) 139 | labels2 = clusterer2.fit_predict(X) 140 | assert (labels1 == labels2).all(), "Labels should be the same for two exact instances." 141 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | License: GPLv3 3 | PyPI 4 | Code style: Ruff 5 | GitHub Actions Workflow Status 6 | Python version supported 7 | Codecov 8 | Binder 9 | DOI 10 | 11 |

12 | 13 | # Radius Clustering 14 | 15 | Radius clustering is a Python package that implements clustering under radius constraint based on the Minimum Dominating Set (MDS) problem. This problem is NP-Hard but has been studied in the literature and proven to be linked to the clustering under radius constraint problem (see [references](#references) for more details). 16 | 17 | ## Features 18 | 19 | - Implements both exact and approximate MDS-based clustering algorithms 20 | - Compatible with scikit-learn's API for clustering algorithms 21 | - Supports radius-constrained clustering 22 | - Provides options for exact and approximate solutions 23 | - Easy to use and integrate with existing Python data science workflows 24 | - Includes comprehensive documentation and examples 25 | - Full test coverage to ensure reliability and correctness 26 | - Supports custom MDS solvers for flexibility in clustering approaches 27 | - Provides a user-friendly interface for clustering tasks 28 | 29 | > [!CAUTION] 30 | > **Deprecation Notice**: The `threshold` parameter in the `RadiusClustering` class has been deprecated. Please use the `radius` parameter instead for specifying the radius for clustering. It is planned to be completely removed in version 2.0.0. The `radius` parameter is now the standard way to define the radius for clustering, aligning with our objective of making the parameters' name more intuitive and user-friendly. 31 | 32 | > [!NOTE] 33 | > **NEW VERSIONS**: The package is currently under active development for new features and improvements, including some refactoring and enhancements to the existing codebase. Backwards compatibility is not guaranteed, so please check the [CHANGELOG](CHANGELOG.md) for details on changes and updates. 
34 | 35 | ## Roadmap 36 | 37 | - [x] Version 1.4.0: 38 | - [x] Add support for custom MDS solvers 39 | - [x] Improve documentation and examples 40 | - [x] Add more examples and tutorials 41 | 42 | ## Installation 43 | 44 | You can install Radius Clustering using pip: 45 | 46 | ```bash 47 | pip install radius-clustering 48 | ``` 49 | 50 | ## Usage 51 | 52 | Here's a basic example of how to use Radius Clustering: 53 | 54 | ```python 55 | import numpy as np 56 | from radius_clustering import RadiusClustering 57 | 58 | # Example usage 59 | X = np.random.rand(100, 2) # Generate random data 60 | 61 | # Create an instance of MdsClustering 62 | rad_clustering = RadiusClustering(manner="approx", radius=0.5) 63 | 64 | # Fit the model to the data 65 | rad_clustering.fit(X) 66 | 67 | # Get cluster labels 68 | labels = rad_clustering.labels_ 69 | 70 | print(labels) 71 | ``` 72 | 73 | ## Documentation 74 | 75 | See the [full documentation for Radius Clustering](https://contrib.scikit-learn.org/radius_clustering/). 76 | 77 | ### Building the documentation 78 | 79 | To build the documentation, you can run the following command, assuming you have all dependencies needed installed: 80 | 81 | ```bash 82 | cd docs 83 | make html 84 | ``` 85 | 86 | Then you can open the `index.html` file in the `build` directory to view the full documentation. 87 | 88 | ## More information 89 | 90 | For more information please refer to the official documentation. 91 | 92 | If you want insights on how the algorithm works, please refer to the [presentation](PRESENTATION.md). 93 | 94 | If you want to know more about the experiments conducted with the package, please refer to the [experiments](EXPERIMENTS.md). 95 | 96 | 97 | ## Contributing 98 | 99 | Contributions to Radius Clustering are welcome! 100 | 101 | Please read the [CONTRIBUTING.md](CONTRIBUTING.md) file for details on how to contribute to the project. 
102 | Please note that the project is released with a [Code of Conduct](CODE_OF_CONDUCT.md), and we expect all contributors to adhere to it. 103 | 104 | ## License 105 | 106 | This project is licensed under the GNU General Public License v3.0 - see the LICENSE file for details. 107 | 108 | ## How to cite this work 109 | 110 | If you use Radius Clustering in your research, please cite the following paper and the software itself: 111 | 112 | ```bibtex 113 | @inproceedings{haenn_clustering2024, 114 | TITLE = {{Clustering Under Radius Constraints Using Minimum Dominating Sets}}, 115 | AUTHOR = {Haenn, Quentin and Chardin, Brice and Baron, Micka{\"e}l}, 116 | URL = {https://hal.science/hal-04533921}, 117 | BOOKTITLE = {{Lecture Notes in Artificial Intelligence}}, 118 | ADDRESS = {Poitiers, France}, 119 | PUBLISHER = {{Springer}}, 120 | YEAR = {2024}, 121 | MONTH = Jun, 122 | KEYWORDS = {Constrained Clustering ; Radius Based Clustering ; Minimum Dominating Set ; Constrained Clustering Radius Based Clustering Minimum Dominating Set}, 123 | PDF = {https://hal.science/hal-04533921v1/file/clustering_under_radius_using_mds.pdf}, 124 | HAL_ID = {hal-04533921}, 125 | HAL_VERSION = {v1}, 126 | } 127 | ``` 128 | 129 | ## Acknowledgments 130 | 131 | ### MDS Algorithms 132 | 133 | The two MDS algorithms implemented are forked and modified (or rewritten) from the following authors: 134 | 135 | - [Alejandra Casado](https://github.com/AlejandraCasado) for the minimum dominating set heuristic code [[1](https://www.sciencedirect.com/science/article/pii/S0378475422005055)]. We rewrote the code in C++ to adapt to the need of python interfacing. 136 | - [Hua Jiang](https://github.com/huajiang-ynu) for the minimum dominating set exact algorithm code [[2](https://dl.acm.org/doi/abs/10.24963/ijcai.2023/622)]. The code has been adapted to the need of python interfacing. 
137 | 138 | ### Funders 139 | 140 | The Radius Clustering work has been funded by: 141 | 142 | - [LIAS, ISAE-ENSMA](https://www.lias-lab.fr/) 143 | - [LabCom @lienor](https://labcom-alienor.ensma.fr/) and the [French National Research Agency](https://anr.fr/) 144 | 145 | ### Contributors 146 | 147 | - [Quentin Haenn (core developer)](https://www.lias-lab.fr/members/quentinhaenn/), LIAS, ISAE-ENSMA 148 | - [Brice Chardin](https://www.lias-lab.fr/members/bricechardin/), LIAS, ISAE-ENSMA 149 | - [Mickaël Baron](https://www.lias-lab.fr/members/mickaelbaron/), LIAS, ISAE-ENSMA 150 | 151 | 152 | ## References 153 | 154 | - [1] [An iterated greedy algorithm for finding the minimum dominating set in graphs](https://www.sciencedirect.com/science/article/pii/S0378475422005055) 155 | - [2] [An exact algorithm for the minimum dominating set problem](https://dl.acm.org/doi/abs/10.24963/ijcai.2023/622) 156 | - [3] [Clustering under radius constraint using minimum dominating set](https://link.springer.com/chapter/10.1007/978-3-031-62700-2_2) 157 | -------------------------------------------------------------------------------- /examples/plot_benchmark_custom.py: -------------------------------------------------------------------------------- 1 | """ 2 | ===================================================================================== 3 | Benchmark of Radius Clustering using multiple datasets and comparison with custom MDS 4 | ===================================================================================== 5 | 6 | This example demonstrates how to implement a custom solver for the MDS problem 7 | and use it within the Radius Clustering framework. 8 | Plus, it compares the results of a naive implementation using the 9 | `NetworkX` library with the Radius Clustering implementation. 10 | 11 | The example includes: 12 | 1. Defining the custom MDS solver. 13 | 2. Defining datasets to test the clustering. 14 | 3. 
Applying Radius clustering on the datasets using the custom MDS solver. 15 | 4. Ensure this solution works. 16 | 5. Establish a benchmark procedure to compare the Radius clustering with a naive implementation using `NetworkX`. 17 | 6. Comparing the results in terms of : 18 | - Execution time 19 | - Number of cluster found 20 | 7. Visualizing the benchmark results. 21 | 8. Visualizing the clustering results. 22 | 23 | This example is useful for understanding how to implement a custom MDS solver 24 | and how to perform an advanced usage of the package. 25 | """ 26 | # Author: Haenn Quentin 27 | # SPDX-License-Identifier: MIT 28 | 29 | # %% 30 | # Import necessary libraries 31 | # -------------------------- 32 | # 33 | # Since this example is a benchmark, we need to import the necessary libraries 34 | # to perform the benchmark, including `NetworkX` for the naive implementation, 35 | # `matplotlib` for visualization, and `sklearn` for the datasets. 36 | 37 | 38 | import networkx as nx 39 | import numpy as np 40 | import matplotlib.pyplot as plt 41 | import time 42 | import warnings 43 | 44 | from sklearn.datasets import fetch_openml 45 | from radius_clustering import RadiusClustering 46 | from sklearn.metrics import pairwise_distances_argmin 47 | 48 | warnings.filterwarnings("ignore", category=RuntimeWarning, module="sklearn") 49 | # %% 50 | # Define a custom MDS solver 51 | # -------------------------- 52 | # 53 | # We define a custom MDS solver that uses the `NetworkX` library to compute the MDS. 54 | # Note the signature of the function is identical to the one used in the `RadiusClustering` class. 55 | 56 | 57 | def custom_solver(n: int, edges: np.ndarray, nb_edges: int, random_state=None): 58 | """ 59 | Custom MDS solver using NetworkX to compute the MDS problem. 60 | 61 | Parameters: 62 | ----------- 63 | n : int 64 | The number of points in the dataset. 65 | edges : np.ndarray 66 | The edges of the graph, given as an array of (u, v) pairs (shape ``(nb_edges, 2)``), as expected by ``G.add_edges_from``.
67 | nb_edges : int 68 | The number of edges in the graph. 69 | random_state : int | None 70 | The random state to use for reproducibility. 71 | 72 | Returns: 73 | -------- 74 | centers : list 75 | A sorted list of the centers of the clusters. 76 | mds_exec_time : float 77 | The execution time of the MDS algorithm in seconds. 78 | """ 79 | G = nx.Graph() 80 | G.add_edges_from(edges) 81 | 82 | start_time = time.time() 83 | centers = list(nx.algorithms.dominating.dominating_set(G)) 84 | mds_exec_time = time.time() - start_time 85 | 86 | centers = sorted(centers) 87 | 88 | return centers, mds_exec_time 89 | 90 | 91 | # %% 92 | # Define datasets to test the clustering 93 | # -------------------------------------- 94 | # 95 | # We will use 4 datasets to test the clustering: 96 | # 1. Iris dataset 97 | # 2. Wine dataset 98 | # 3. Breast Cancer dataset (WDBC) 99 | # 4. Vehicle dataset 100 | # These are common datasets used in machine learning and lead to pretty fast results. 101 | # Structure of the variable `DATASETS`: 102 | # - The key is the name of the dataset. 103 | # - The value is a tuple containing: 104 | # - The dataset fetched from OpenML. 105 | # - The radius to use for the Radius clustering. 
(determined in literature, see references on home page) 106 | # 107 | 108 | 109 | DATASETS = { 110 | "iris": (fetch_openml(name="iris", version=1, as_frame=False), 1.43), 111 | "wine": (fetch_openml(name="wine", version=1, as_frame=False), 232.09), 112 | "glass": (fetch_openml(name="glass", version=1, as_frame=False), 3.94), 113 | "ionosphere": (fetch_openml(name="ionosphere", version=1, as_frame=False), 5.46), 114 | "breast_cancer": (fetch_openml(name="wdbc", version=1, as_frame=False), 1197.42), 115 | "synthetic": (fetch_openml(name="synthetic_control", version=1, as_frame=False), 70.12), 116 | "vehicle": (fetch_openml(name="vehicle", version=1, as_frame=False), 155.05), 117 | "yeast": (fetch_openml(name="yeast", version=1, as_frame=False), 0.4235), 118 | } 119 | 120 | # %% 121 | # Define the benchmark procedure 122 | # -------------------------------------- 123 | # 124 | # We define a function to perform the benchmark on the datasets. 125 | # The procedure is as follows: 126 | # 1. Creates an instance of RadiusClustering for each solver. 127 | # 2. For each instance, fit the algorithm on each dataset. 128 | # 3. Store the execution time and the number of clusters found for each dataset. 129 | # 4. Return the results as a dictionary. 
def benchmark_radius_clustering():
    """Benchmark the exact, approximate and custom MDS solvers.

    For every dataset in ``DATASETS``, each solver is fitted with the
    dataset-specific radius, and the wall-clock fitting time and the
    number of cluster centers found are recorded.

    Returns
    -------
    dict
        Mapping ``manner -> {"time": [...], "clusters": [...]}``, where
        the lists follow the iteration order of ``DATASETS``.
    """
    exact = RadiusClustering(manner="exact", radius=1.43)
    approx = RadiusClustering(manner="approx", radius=1.43)
    custom = RadiusClustering(manner="custom", radius=1.43)
    custom.set_solver(custom_solver)  # plug in the NetworkX-based solver

    results = {}
    # Loop through each algorithm and dataset
    for algo in (exact, approx, custom):
        times = []
        clusters = []
        for name, (dataset, radius) in DATASETS.items():
            X = dataset.data
            # Use the radius recommended in the literature for this dataset.
            # Plain attribute assignment is the idiom here; setattr with a
            # constant attribute name is unnecessary indirection.
            algo.radius = radius
            t0 = time.time()
            algo.fit(X)
            times.append(time.time() - t0)
            clusters.append(len(algo.centers_))
        results[algo.manner] = {"time": times, "clusters": clusters}

    return results


# %%
# Run the benchmark and plot the results
# --------------------------------------
# We run the benchmark and plot the results for each dataset.
170 | 171 | 172 | results = benchmark_radius_clustering() 173 | 174 | # Plot the results 175 | fig, axs = plt.subplot_mosaic( 176 | [ 177 | ["time", "time", "time", "time"], 178 | ["iris", "wine", "breast_cancer", "vehicle"], 179 | ["glass", "ionosphere", "synthetic", "yeast"], 180 | ], 181 | layout="constrained", 182 | figsize=(12, 8), 183 | ) 184 | fig.suptitle("Benchmark of Radius Clustering Solvers", fontsize=16) 185 | 186 | axs['time'].set_yscale('log') # Use logarithmic scale for better visibility 187 | 188 | algorithms = list(results.keys()) 189 | dataset_names = list(DATASETS.keys()) 190 | n_algos = len(algorithms) 191 | x_indices = np.arange(len(dataset_names)) # the label locations 192 | bar_width = 0.8 / n_algos # the width of the bars, with some padding 193 | 194 | for i, algo in enumerate(algorithms): 195 | times = results[algo]["time"] 196 | # Calculate position for each bar in the group to center them 197 | position = x_indices - (n_algos * bar_width / 2) + (i * bar_width) + bar_width / 2 198 | axs['time'].bar(position, times, bar_width, label=algo) 199 | 200 | for i, (name, (dataset, _)) in enumerate(DATASETS.items()): 201 | axs[name].bar( 202 | results.keys(), 203 | [results[algo]["clusters"][i] for algo in results.keys()], 204 | label=name, 205 | ) 206 | axs[name].axhline( 207 | y=len(set(dataset.target)), # Number of unique classes in the dataset 208 | label="True number of clusters", 209 | color='r', 210 | linestyle='--', 211 | ) 212 | axs[name].set_title(name) 213 | 214 | axs["iris"].set_ylabel("Number of clusters") 215 | axs["glass"].set_ylabel("Number of clusters") 216 | 217 | axs['time'].set_title("Execution Time (log scale)") 218 | axs['time'].set_xlabel("Datasets") 219 | axs['time'].set_ylabel("Time (seconds)") 220 | axs['time'].set_xticks(x_indices) 221 | axs['time'].set_xticklabels(dataset_names) 222 | axs['time'].legend(title="Algorithms") 223 | plt.tight_layout() 224 | plt.show() 225 | 226 | 227 | # %% 228 | # Conclusion 229 | # 
---------- 230 | # 231 | # In this example, we implemented a custom MDS solver with `NetworkX` and benchmarked it against the built-in exact and approximate solvers of Radius Clustering on several datasets. 232 | # We compared the solvers in terms of execution time and number of clusters found, and visualized both in the final figure. 233 | # The bar plots make it easy to see how each solver trades running time against the number of clusters it finds on each dataset. 234 | # The dashed line in each per-dataset plot marks the true number of classes, which is a useful reference when comparing the solvers. 235 | -------------------------------------------------------------------------------- /docs/source/installation.rst: -------------------------------------------------------------------------------- 1 | .. _installation: 2 | 3 | ============ 4 | Installation 5 | ============ 6 | 7 | There are different ways to install Radius Clustering: 8 | 9 | * :ref:`From PyPI `. This is the recommended way to install Radius Clustering. It will provide a stable version and pre-built packages are available for most platforms. 10 | 11 | * :ref:`From the source `. This is best for users who want the latest features and are comfortable building from source. This is also needed if you want to contribute to the project. 12 | 13 | .. warning:: 14 | 15 | Radius Clustering is currently not available on PyPI, pending the organization acceptance on PyPI. You can install the package from the source by following the :ref:`instructions `. 16 | Please notice that the compilation stage requires a C and C++ compiler toolchain to be installed on your system. 17 | 18 | 19 | .. _installation-pypi: 20 | 21 | Installing from PyPI 22 | -------------------- 23 | 24 | .. raw:: html 25 | 26 | 38 | 39 | .. div:: install-instructions 40 | 41 | .. tab-set:: 42 | :class: tabs-os 43 | :sync-group: os 44 | 45 | .. tab-item:: Windows 46 | :class-label: tab-4 47 | :sync: windows 48 | 49 | Install the 64-bit version of Python 3, for instance from the 50 | `official website `__. 
51 | 52 | Now create a `virtual environment (venv) 53 | `_ and install Radius Clustering. 54 | Note that the virtual environment is optional but strongly recommended, in 55 | order to avoid potential conflicts with other packages. 56 | 57 | .. prompt:: powershell 58 | 59 | python -m venv rad-env 60 | rad-env\Scripts\activate # activate 61 | pip install -U radius-clustering 62 | 63 | In order to check your installation, you can use: 64 | 65 | .. prompt:: powershell 66 | 67 | python -m pip show radius-clustering # show radius-clustering version and location 68 | python -m pip freeze # show all installed packages in the environment 69 | 70 | .. tab-item:: macOS 71 | :class-label: tab-4 72 | :sync: macos 73 | 74 | Install Python 3 using `homebrew `_ (`brew install python`) 75 | or by manually installing the package from the `official website 76 | `__. 77 | 78 | Now create a `virtual environment (venv) 79 | `_ and install Radius Clustering. 80 | Note that the virtual environment is optional but strongly recommended, in 81 | order to avoid potential conflicts with other packages. 82 | 83 | .. prompt:: bash 84 | 85 | python -m venv rad-env 86 | source rad-env/bin/activate # activate 87 | pip install -U radius-clustering 88 | 89 | In order to check your installation, you can use: 90 | 91 | .. prompt:: bash 92 | 93 | python -m pip show radius-clustering # show radius-clustering version and location 94 | python -m pip freeze # show all installed packages in the environment 95 | 96 | .. tab-item:: Linux 97 | :class-label: tab-4 98 | :sync: linux 99 | 100 | Python 3 is usually installed by default on most Linux distributions. To 101 | check if you have it installed, try: 102 | 103 | .. prompt:: bash 104 | 105 | python3 --version 106 | pip3 --version 107 | 108 | If you don't have Python 3 installed, please install `python3` and 109 | `python3-pip` from your distribution's package manager. 
110 | 111 | Now create a `virtual environment (venv) 112 | `_ and install Radius Clustering. 113 | Note that the virtual environment is optional but strongly recommended, in 114 | order to avoid potential conflicts with other packages. 115 | 116 | .. prompt:: bash 117 | 118 | python3 -m venv rad-env 119 | source rad-env/bin/activate # activate 120 | pip3 install -U radius-clustering 121 | 122 | In order to check your installation, you can use: 123 | 124 | .. prompt:: bash 125 | 126 | python3 -m pip show radius-clustering # show radius-clustering version and location 127 | python3 -m pip freeze # show all installed packages in the environment 128 | 129 | 130 | Using an isolated environment such as pip venv or conda makes it possible to 131 | install a specific version of radius-clustering with pip or conda and its dependencies 132 | independently of any previously installed Python packages. In particular under Linux 133 | it is discouraged to install pip packages alongside the packages managed by the 134 | package manager of the distribution (apt, dnf, pacman...). 135 | 136 | Note that you should always remember to activate the environment of your choice 137 | prior to running any Python command whenever you start a new terminal session. 138 | 139 | If you have not installed NumPy or SciPy yet, you can also install these using 140 | conda or pip. When using pip, please ensure that *binary wheels* are used, 141 | and NumPy and SciPy are not recompiled from source, which can happen when using 142 | particular configurations of operating system and hardware (such as Linux on 143 | a Raspberry Pi). 144 | 145 | 146 | .. _installation-source: 147 | 148 | Installing from the source 149 | -------------------------- 150 | 151 | Compiler Requirements 152 | ~~~~~~~~~~~~~~~~~~~~~ 153 | 154 | To install Radius Clustering from the source, you need to have a C and C++ compiler and their respective toolchains installed on your system, depending on your operating system. 155 | 156 | .. 
raw:: html 157 | 158 | 170 | 171 | .. div:: install-instructions 172 | 173 | .. tab-set:: 174 | :class: tabs-os 175 | :sync-group: os 176 | 177 | .. tab-item:: Windows 178 | :class-label: tab-4 179 | :sync: windows 180 | 181 | Install the correct version of Microsoft Visual C++ Build Tools for your Python version from the `official website `__. 182 | 183 | In Build Tools, install C++ toolchain. Ensure that it is added to the system PATH. 184 | You are now ready to install Radius Clustering from source. 185 | 186 | .. tab-item:: macOS 187 | :class-label: tab-4 188 | :sync: macos 189 | 190 | Normally, you should have the necessary tools installed on your system as it comes with Xcode Command Line Tools, which is included when you first install Homebrew or Xcode. 191 | To check if you have the necessary tools installed, try: 192 | 193 | .. prompt:: bash 194 | 195 | gcc --version 196 | g++ --version 197 | 198 | If you don't have the necessary tools installed, you can install them directly from the App Store by getting Xcode. You may also be interested in installing Homebrew. See this `tutorial `__ for more information. 199 | 200 | .. tab-item:: Linux 201 | :class-label: tab-4 202 | :sync: linux 203 | 204 | Normally, you should have the necessary tools installed on your system. To check if you have the necessary tools installed, try: 205 | 206 | .. prompt:: bash 207 | 208 | gcc --version 209 | g++ --version 210 | 211 | If you don't have the necessary tools installed, you can install them using your distribution's package manager. For instance, on Ubuntu, you can install them by running: 212 | 213 | .. prompt:: bash 214 | 215 | sudo apt-get update 216 | sudo apt-get install build-essential 217 | 218 | 219 | Installing Radius Clustering 220 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 221 | 222 | Now you have installed compilers toolchains requirements, you can build and install `radius-clustering` from the sources. 
You need to clone the repository and 223 | install the package using the following commands: 224 | 225 | .. prompt:: bash 226 | 227 | git clone git@github.com:scikit-learn-contrib/radius_clustering.git # clone the repository 228 | cd radius_clustering 229 | python -m venv rad-env 230 | source rad-env/bin/activate # activate 231 | python -m pip install . 232 | 233 | To check your installation, you can use: 234 | 235 | .. prompt:: bash 236 | 237 | python -m pip show radius-clustering # show radius-clustering version and location 238 | python -m pip freeze # show all installed packages in the environment 239 | python -c "from radius_clustering import *; rad = RadiusClustering(); print(rad)" 240 | 241 | If you want to contribute to the project, you will need to install the development 242 | dependencies. You can do this by running: 243 | 244 | .. prompt:: bash 245 | 246 | python -m pip install -e .[dev] 247 | 248 | Alternatively, if you want to contribute only to the documentation, you can install 249 | the documentation dependencies by running: 250 | 251 | .. prompt:: bash 252 | 253 | python -m pip install -e .[docs] 254 | 255 | Dependencies 256 | ++++++++++++ 257 | 258 | 259 | The minimum version of radius-clustering dependencies are listed below along with its 260 | purpose. 261 | 262 | .. 
list-table:: 263 | :header-rows: 1 264 | 265 | * - Dependency 266 | - Minimum version 267 | - Purpose 268 | * - numpy 269 | - 1.23.4 270 | - Build, Install 271 | * - scipy 272 | - 1.12.0 273 | - Build, Install 274 | * - scikit-learn 275 | - 1.2.2 276 | - Build, Install 277 | * - cython 278 | - 3.0.10 279 | - Build 280 | * - setuptools 281 | - 61.0.0 282 | - Build 283 | * - pytest 284 | - 8.3.3 285 | - Tests 286 | * - ruff 287 | - 0.2.1 288 | - Tests 289 | * - black 290 | - 24.3.0 291 | - Tests 292 | * - matplotlib 293 | - 3.6.2 294 | - Docs, Examples 295 | * - sphinx 296 | - 8.1.3 297 | - Docs 298 | * - sphinx-copybutton 299 | - 0.5.2 300 | - Docs 301 | * - sphinx-rtd-theme 302 | - 3.0.0 303 | - Docs 304 | * - sphinx_design 305 | - 0.6.1 306 | - Docs 307 | * - sphinx_gallery 308 | - 0.18.0 309 | - Docs 310 | * - sphinx-prompt 311 | - 1.9.0 312 | - Docs 313 | -------------------------------------------------------------------------------- /examples/plot_iris_example.py: -------------------------------------------------------------------------------- 1 | """ 2 | =============================== 3 | Iris Dataset Clustering Example 4 | =============================== 5 | 6 | This example is meant to illustrate the use of the Radius clustering library on the Iris dataset. 7 | It comes with a simple example of how to use the library to cluster the Iris dataset and a comparison with 8 | kmeans clustering algorithms. 9 | 10 | The example includes: 11 | 1. Loading the Iris dataset 12 | 2. Applying Radius clustering and k-means clustering 13 | 3. Visualizing the clustering results 14 | 15 | This example serves as a simple introduction to using the Radius clustering library 16 | on a well-known dataset. 17 | """ 18 | # Author: Haenn Quentin 19 | # SPDX-License-Identifier: MIT 20 | 21 | 22 | # %% 23 | # Load the Iris dataset 24 | # --------------------- 25 | # 26 | # We start by loading the Iris dataset using the `fetch_openml` function from `sklearn.datasets`. 
# The Iris dataset is a well-known dataset that contains 150 samples of iris flowers.
# Each sample has 4 features: sepal length, sepal width, petal length, and petal width.
# The dataset is labeled with 3 classes: setosa, versicolor, and virginica.

import numpy as np
from sklearn import datasets
from radius_clustering import RadiusClustering

# Load the Iris dataset
iris = datasets.load_iris()
X = iris["data"]
y = iris.target


# %%
# Visualize the Iris dataset
# --------------------------
#
# Project the samples onto their first three principal components and draw a
# 3D scatter plot, colouring each point by its true class label.
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import mpl_toolkits.mplot3d

# Reduce the dimensionality of the dataset to 3D using PCA
pca = PCA(n_components=3)
iris_reduced = pca.fit_transform(X)
fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(111, projection="3d", elev=48, azim=134)
ax.scatter(
    iris_reduced[:, 0],
    iris_reduced[:, 1],
    iris_reduced[:, 2],
    c=y,
    cmap="Dark2",
    s=40,
)
# Set plot labels
ax.set_title("Iris dataset in first 3 PCA components")
ax.set_xlabel("1st eigenvector")
ax.set_ylabel("2nd eigenvector")
ax.set_zlabel("3rd eigenvector")

# Hide tick labels
ax.xaxis.set_ticklabels([])
ax.yaxis.set_ticklabels([])
ax.zaxis.set_ticklabels([])

plt.show()

# %%
# Compute Clustering with Radius Clustering
# -----------------------------------------
#
# Fit a `RadiusClustering` instance on the raw features and time the run.
import time

rad = RadiusClustering(manner="exact", radius=1.43)
t0 = time.time()
rad.fit(X)
t_rad = time.time() - t0

# %%
# Compute KMeans Clustering for Comparison
# ----------------------------------------
#
# Fit a KMeans model with the known number of classes, timing it as well.

from sklearn.cluster import KMeans

k_means = KMeans(n_clusters=3, n_init=10)
t0 = time.time()
k_means.fit(X)
t_kmeans = time.time() - t0

# %% Establishing parity between clusters
# --------------------------------------
#
# Both algorithms number their clusters arbitrarily, so the same cluster could
# receive a different colour in each plot. To keep colours comparable, we
# greedily match each KMeans centroid with its nearest Radius-clustering
# center and reorder the Radius labels accordingly.


def get_order_labels(kmeans, rad, data):
    """Greedily pair KMeans centroids with Radius-clustering centers.

    Returns the list of Radius cluster labels reordered so that cluster *i*
    corresponds to KMeans cluster *i*.
    """
    km_centers_left = kmeans.cluster_centers_.copy()
    rad_centers_left = data[rad.centers_].copy()
    order = []
    # For each KMeans centroid, find the closest remaining Radius center.
    for km_center in km_centers_left:
        idx = pairwise_distances_argmin([km_center], rad_centers_left)
        # With a single Radius center left, give it the only label not yet used.
        if len(rad_centers_left) == 1:
            for i in range(len(km_centers_left)):
                if i not in order:
                    order.append(i)
                    break
            break
        # Coordinates of the matched center in the Radius clustering.
        coords = rad_centers_left[idx]
        # The data point closest to that center carries its cluster label.
        nearest = pairwise_distances_argmin(coords, data)
        matched_label = rad.labels_[nearest]
        # This center is taken: drop it from the candidate pool.
        rad_centers_left = np.delete(rad_centers_left, idx, axis=0)
        # Record the matched cluster label.
        order.append(int(matched_label[0]))
    return order


from sklearn.metrics.pairwise import pairwise_distances_argmin

rad_centers_index = np.array(rad.centers_)
order = get_order_labels(k_means, rad, X)

kmeans_centers = k_means.cluster_centers_
rad_centers = rad_centers_index[order]
rad_centers_coordinates = X[rad_centers]

# Pair the cluster labels
kmeans_labels = pairwise_distances_argmin(X, kmeans_centers)
rad_labels = pairwise_distances_argmin(X, rad_centers_coordinates)

# %%
# Plotting the results and the difference
# ---------------------------------------

fig = plt.figure(figsize=(12, 6))
fig.subplots_adjust(left=0.02, right=0.98, bottom=0.05, top=0.9)
colors = ["#4EACC5", "#FF9C34", "#4E9A06"]

# KMeans
ax = fig.add_subplot(1, 3, 1, projection="3d", elev=48, azim=134, roll=0)

ax.scatter(
    iris_reduced[:, 0],
    iris_reduced[:, 1],
    iris_reduced[:, 2],
    c=kmeans_labels,
    cmap="Dark2",
    s=40,
)
# adapting center coordinates to the 3D plot
kmeans_centers = pca.transform(kmeans_centers)
ax.scatter(
    kmeans_centers[:, 0],
    kmeans_centers[:, 1],
    kmeans_centers[:, 2],
    c="r",
    s=200,
)
ax.set_title("KMeans")
ax.set_xticks(())
ax.set_yticks(())
ax.set_zticks(())

ax.text3D(-3.5, 3, 1.0, "train time: %.2fs\ninertia: %f" % (t_kmeans, k_means.inertia_))

# MDS
ax = fig.add_subplot(1, 3, 2, projection="3d", elev=48, azim=134, roll=0)
ax.scatter(
    iris_reduced[:, 0],
    iris_reduced[:, 1],
    iris_reduced[:, 2],
    c=rad_labels,
    cmap="Dark2",
    s=40,
)
# adapting center coordinates to the 3D plot
rad_centers_coordinates = pca.transform(rad_centers_coordinates)
ax.scatter(
    rad_centers_coordinates[:, 0],
    rad_centers_coordinates[:, 1],
    rad_centers_coordinates[:, 2],
    c="r",
    s=200,
)
ax.set_title("MDS Clustering")
ax.set_xticks(())
ax.set_yticks(())
ax.set_zticks(())
ax.text3D(-3.5, 3, 0.0, "train time: %.2fs" % t_rad)

# Initialize the different array to all False (no label ever equals 4)
different = rad_labels == 4
ax = fig.add_subplot(1, 3, 3, projection="3d", elev=48, azim=134, roll=0)

for k in range(3):
    different += (kmeans_labels == k) != (rad_labels == k)

identical = np.logical_not(different)
ax.scatter(
    iris_reduced[identical, 0], iris_reduced[identical, 1], color="#bbbbbb", marker="."
)
ax.scatter(iris_reduced[different, 0], iris_reduced[different, 1], color="m")
ax.set_title("Difference")
ax.set_xticks(())
ax.set_yticks(())
ax.set_zticks(())

plt.show()

# %%
# Another difference plot
# -----------------------
#
# As we saw, the difference plot is not very informative using Iris.
# We'll use a different dataset to show the difference plot.

wine = datasets.load_wine()
X = wine.data
y = wine.target
pca = PCA(n_components=3)
wine_reduced = pca.fit_transform(X)

# Compute clustering with MDS

rad = RadiusClustering(manner="exact", radius=232.09)
t0 = time.time()
rad.fit(X)
t_rad = time.time() - t0

# Compute KMeans clustering for comparison

k_means = KMeans(n_clusters=3, n_init=10)
t0 = time.time()
k_means.fit(X)
t_kmeans = time.time() - t0

# %%
# Reapplying the same process as before
# --------------------------------------

rad_centers_index = np.array(rad.centers_)
order = get_order_labels(k_means, rad, X)

kmeans_centers = k_means.cluster_centers_
rad_centers = rad_centers_index[order]
rad_centers_coordinates = X[rad_centers]

# Pair the cluster labels
kmeans_labels = pairwise_distances_argmin(X, kmeans_centers)
rad_labels = pairwise_distances_argmin(X, rad_centers_coordinates)

# %%
# Plotting the results and the difference
# ---------------------------------------

fig = plt.figure(figsize=(12, 6))
fig.subplots_adjust(left=0.02, right=0.98, bottom=0.05, top=0.9)
colors = ["#4EACC5", "#FF9C34", "#4E9A06"]

# KMeans
ax = fig.add_subplot(1, 3, 1, projection="3d", elev=48, azim=134, roll=0)

ax.scatter(
    wine_reduced[:, 0],
    wine_reduced[:, 1],
    wine_reduced[:, 2],
    c=kmeans_labels,
    cmap="Dark2",
    s=40,
)
# adapting center coordinates to the 3D plot
kmeans_centers = pca.transform(kmeans_centers)
ax.scatter(
    kmeans_centers[:, 0],
    kmeans_centers[:, 1],
    kmeans_centers[:, 2],
    c="r",
    s=200,
)
ax.set_title("KMeans")
ax.set_xticks(())
ax.set_yticks(())
ax.set_zticks(())

ax.text3D(
    60.0, 80.0, 0.0, "train time: %.2fs\ninertia: %f" % (t_kmeans, k_means.inertia_)
)

# MDS
ax = fig.add_subplot(1, 3, 2, projection="3d", elev=48, azim=134, roll=0)
ax.scatter(
    wine_reduced[:, 0],
    wine_reduced[:, 1],
    wine_reduced[:, 2],
    c=rad_labels,
    cmap="Dark2",
    s=40,
)
# adapting center coordinates to the 3D plot
rad_centers_coordinates = pca.transform(rad_centers_coordinates)
ax.scatter(
    rad_centers_coordinates[:, 0],
    rad_centers_coordinates[:, 1],
    rad_centers_coordinates[:, 2],
    c="r",
    s=200,
)
ax.set_title("MDS Clustering")
ax.set_xticks(())
ax.set_yticks(())
ax.set_zticks(())
ax.text3D(60.0, 80.0, 0.0, "train time: %.2fs" % t_rad)

# Initialize the different array to all False (no label ever equals 4)
different = rad_labels == 4
ax = fig.add_subplot(1, 3, 3, projection="3d", elev=48, azim=134, roll=0)

for k in range(3):
    different += (kmeans_labels == k) != (rad_labels == k)

identical = np.logical_not(different)
ax.scatter(
    wine_reduced[identical, 0], wine_reduced[identical, 1], color="#bbbbbb", marker="."
)
ax.scatter(wine_reduced[different, 0], wine_reduced[different, 1], color="m")
ax.set_title("Difference")
ax.set_xticks(())
ax.set_yticks(())
ax.set_zticks(())

plt.show()

# %%
# Conclusion
# ----------
#
# In this example, we applied Radius clustering to the Iris and Wine datasets and compared it with KMeans clustering.
# We visualized the clustering results and the difference between the two clustering algorithms.
# We saw that Radius Clustering can lead to smaller clusters than kmeans, which produces much more balanced clusters.
# The difference plot can be very useful to see where the two clustering algorithms differ.
361 | -------------------------------------------------------------------------------- /src/radius_clustering/radius_clustering.py: -------------------------------------------------------------------------------- 1 | """ 2 | Radius Clustering 3 | 4 | This module provides functionality for Minimum Dominating Set (MDS) based clustering. 5 | It includes methods for solving MDS problems and applying the solutions to 6 | clustering tasks. 7 | 8 | This module serves as the main interface for the Radius clustering library. 9 | """ 10 | 11 | from __future__ import annotations 12 | 13 | import os 14 | import warnings 15 | 16 | import numpy as np 17 | from sklearn.base import BaseEstimator, ClusterMixin 18 | from sklearn.metrics import pairwise_distances 19 | from sklearn.utils.validation import check_random_state, validate_data 20 | 21 | from .algorithms import clustering_approx, clustering_exact 22 | 23 | DIR_PATH = os.path.dirname(os.path.realpath(__file__)) 24 | 25 | 26 | class RadiusClustering(ClusterMixin, BaseEstimator): 27 | r""" 28 | Radius Clustering algorithm. 29 | 30 | This class implements clustering based on the Minimum Dominating Set (MDS) problem. 31 | It can use either an exact or approximate method for solving the MDS problem. 32 | 33 | Parameters: 34 | ----------- 35 | manner : str, optional (default="approx") 36 | The method to use for solving the MDS problem. Can be "exact" or "approx". 37 | radius : float, optional (default=0.5) 38 | The dissimilarity threshold to act as radius constraint for the clustering. 39 | 40 | Attributes: 41 | ----------- 42 | X : array-like, shape (n_samples, n_features) 43 | The input data. 44 | centers\_ : list 45 | The indices of the cluster centers. 46 | labels\_ : array-like, shape (n_samples,) 47 | The cluster labels for each point in the input data. 48 | effective_radius\_ : float 49 | The maximum distance between any point and its assigned cluster center. 
50 | random_state\_ : int | None 51 | The random state used for reproducibility. If None, no random state is set. 52 | 53 | .. note:: 54 | The `random_state_` attribute is not used when the `manner` is set to "exact". 55 | 56 | .. versionchanged:: 1.4.0 57 | The `RadiusClustering` class has been refactored. 58 | Clustering algorithms are now separated into their own module 59 | (`algorithms.py`) to improve maintainability and extensibility. 60 | 61 | .. versionadded:: 1.4.0 62 | The `set_solver` method was added to allow users to set a custom solver 63 | for the MDS problem. This allows for flexibility in how the MDS problem is solved 64 | and enables users to use their own implementations of MDS clustering algorithms. 65 | 66 | .. versionadded:: 1.3.0 67 | 68 | - The *random_state* parameter was added to allow reproducibility in the approximate method. 69 | 70 | - The `radius` parameter replaces the `threshold` parameter for setting the dissimilarity threshold for better clarity and consistency. 71 | 72 | .. versionchanged:: 1.3.0 73 | All publicly accessible attributes are now suffixed with an underscore 74 | (e.g., `centers_`, `labels_`). 75 | This is particularly useful for compatibility with scikit-learn's API. 76 | 77 | .. deprecated:: 1.3.0 78 | The `threshold` parameter is deprecated. Use `radius` instead. 79 | Will be removed in a future version. 80 | """ 81 | 82 | _estimator_type = "clusterer" 83 | _algorithms = { 84 | "exact": clustering_exact, 85 | "approx": clustering_approx, 86 | } 87 | 88 | def __init__( 89 | self, 90 | manner: str = "approx", 91 | radius: float = 0.5, 92 | threshold=None, 93 | random_state: int | None = None, 94 | ) -> None: 95 | if threshold is not None: 96 | warnings.warn( 97 | "The 'threshold' parameter is deprecated and" 98 | " will be removed in a future version." 
99 | "Please use 'radius' instead.", 100 | DeprecationWarning, 101 | stacklevel=2, 102 | ) 103 | radius = threshold 104 | self.threshold = threshold # For backward compatibility 105 | self.manner = manner 106 | self.radius = radius 107 | self.random_state = random_state 108 | 109 | def _check_symmetric(self, a: np.ndarray, tol: float = 1e-8) -> bool: 110 | if a.ndim != 2: 111 | raise ValueError("Input must be a 2D array.") 112 | if a.shape[0] != a.shape[1]: 113 | return False 114 | return np.allclose(a, a.T, atol=tol) 115 | 116 | def fit(self, X: np.ndarray, y: None = None, metric: str | callable = "euclidean") -> "RadiusClustering": 117 | """ 118 | Fit the MDS clustering model to the input data. 119 | 120 | This method computes the distance matrix if the input is a feature matrix, 121 | or uses the provided distance matrix directly if the input is already 122 | a distance matrix. 123 | 124 | .. note:: 125 | If the input is a distance matrix, it should be symmetric and square. 126 | If the input is a feature matrix, the distance matrix 127 | will be computed using Euclidean distance. 128 | 129 | .. tip:: 130 | Next version will support providing different metrics or 131 | even custom callables to compute the distance matrix. 132 | 133 | Parameters: 134 | ----------- 135 | X : array-like, shape (n_samples, n_features) 136 | The input data to cluster. X should be a 2D array-like structure. 137 | It can either be : 138 | - A distance matrix (symmetric, square) with shape (n_samples, n_samples). 139 | - A feature matrix with shape (n_samples, n_features) 140 | where the distance matrix will be computed. 141 | y : Ignored 142 | Not used, present here for API consistency by convention. 143 | 144 | metric : str | callable, optional (default="euclidean") 145 | The metric to use when computing the distance matrix. 146 | The default is "euclidean". 
147 | This should be a valid metric string from 148 | `sklearn.metrics.pairwise_distances` or a callable that computes 149 | the distance between two points. 150 | 151 | .. note:: 152 | The metric parameter *MUST* be a valid metric string from 153 | `sklearn.metrics.pairwise_distances` or a callable that computes 154 | the distance between two points. 155 | Valid metric strings include : 156 | - "euclidean" 157 | - "manhattan" 158 | - "cosine" 159 | - "minkowski" 160 | - and many more supported by scikit-learn. 161 | please refer to the 162 | `sklearn.metrics.pairwise_distances` documentation for a full list. 163 | 164 | .. attention:: 165 | If the input is a distance matrix, the metric parameter is ignored. 166 | The distance matrix should be symmetric and square. 167 | 168 | .. warning:: 169 | If the parameter is a callable, it should : 170 | - Accept two 1D arrays as input. 171 | - Return a single float value representing the distance between the two points. 172 | 173 | Returns: 174 | -------- 175 | self : object 176 | Returns self. 177 | 178 | Examples : 179 | ---------- 180 | 181 | >>> from radius_clustering import RadiusClustering 182 | >>> from sklearn import datasets 183 | >>> # Load the Iris dataset 184 | >>> iris = datasets.fetch_openml(name="iris", version=1, parser="auto") 185 | >>> X = iris["data"] # Use dictionary-style access instead of attribute access 186 | >>> rad = RadiusClustering(manner="exact", threshold=1.43).fit( 187 | ... X 188 | ... ) # Threshold set to 1.43 because it is the optimal 189 | ... 
# threshold for the Iris dataset 190 | >>> rad.centers_ 191 | [96, 49, 102] 192 | 193 | For examples on common datasets and differences with kmeans, 194 | see :ref:`sphx_glr_auto_examples_plot_iris_example.py` 195 | """ 196 | self.X_checked_ = validate_data(self, X) 197 | 198 | # Create dist and adj matrices 199 | if not self._check_symmetric(self.X_checked_): 200 | dist_mat = pairwise_distances(self.X_checked_, metric=metric) 201 | else: 202 | dist_mat = self.X_checked_ 203 | 204 | if not self._check_symmetric(dist_mat): 205 | raise ValueError("Input distance matrix must be symmetric. Got a non-symmetric matrix.") 206 | self.dist_mat_ = dist_mat 207 | if not isinstance(self.radius, (float, int)): 208 | raise ValueError("Radius must be a positive float.") 209 | if self.radius <= 0: 210 | raise ValueError("Radius must be a positive float.") 211 | adj_mask = np.triu((dist_mat <= self.radius), k=1) 212 | self.nb_edges_ = np.sum(adj_mask) 213 | if self.nb_edges_ == 0: 214 | self.centers_ = list(range(self.X_checked_.shape[0])) 215 | self.labels_ = np.array(self.centers_) 216 | self.effective_radius_ = 0 217 | self.mds_exec_time_ = 0 218 | return self 219 | self.edges_ = np.argwhere(adj_mask).astype( 220 | np.uint32 221 | ) # Edges in the adjacency matrix 222 | # uint32 is used to use less memory. Max number of features is 2^32-1 223 | self.clusterer_ = self._algorithms.get(self.manner, self._algorithms["approx"]) 224 | self._clustering() 225 | self._compute_effective_radius() 226 | self._compute_labels() 227 | 228 | return self 229 | 230 | def fit_predict(self, X: np.ndarray, y: None = None, metric: str | callable = "euclidean") -> np.ndarray: 231 | """ 232 | Fit the model and return the cluster labels. 233 | 234 | This method is a convenience function that combines `fit` and `predict`. 235 | 236 | Parameters: 237 | ----------- 238 | X : array-like, shape (n_samples, n_features) 239 | The input data to cluster. X should be a 2D array-like structure. 
240 | It can either be : 241 | - A distance matrix (symmetric, square) with shape (n_samples, n_samples). 242 | - A feature matrix with shape (n_samples, n_features) where 243 | the distance matrix will be computed. 244 | y : Ignored 245 | Not used, present here for API consistency by convention. 246 | 247 | metric : str | callable, optional (default="euclidean") 248 | The metric to use when computing the distance matrix. 249 | The default is "euclidean". 250 | Refer to the `fit` method for more details on valid metrics. 251 | 252 | Returns: 253 | -------- 254 | labels : array, shape (n_samples,) 255 | The cluster labels for each point in X. 256 | """ 257 | self.fit(X, metric=metric) 258 | return self.labels_ 259 | 260 | def _clustering(self): 261 | """ 262 | Perform the clustering using either the exact or approximate MDS method. 263 | """ 264 | n = self.X_checked_.shape[0] 265 | if self.manner not in self._algorithms: 266 | raise ValueError(f"Invalid manner. Please choose in {list(self._algorithms.keys())}.") 267 | if self.clusterer_ == clustering_approx: 268 | if self.random_state is None: 269 | self.random_state = 42 270 | self.random_state_ = check_random_state(self.random_state) 271 | seed = self.random_state_.randint(np.iinfo(np.int32).max) 272 | else: 273 | seed = None 274 | self.centers_, self.mds_exec_time_ = self.clusterer_(n, self.edges_, self.nb_edges_, seed) 275 | 276 | def _compute_effective_radius(self): 277 | """ 278 | Compute the effective radius of the clustering. 279 | 280 | The effective radius is the maximum radius among all clusters. 281 | That means EffRad = max(R(C_i)) for all i. 282 | """ 283 | self.effective_radius_ = np.min(self.dist_mat_[:, self.centers_], axis=1).max() 284 | 285 | def _compute_labels(self): 286 | """ 287 | Compute the cluster labels for each point in the dataset. 
288 | """ 289 | distances = self.dist_mat_[:, self.centers_] 290 | self.labels_ = np.argmin(distances, axis=1) 291 | 292 | min_dist = np.min(distances, axis=1) 293 | self.labels_[min_dist > self.radius] = -1 294 | 295 | def set_solver(self, solver: callable) -> None: 296 | """ 297 | Set a custom solver for resolving the MDS problem. 298 | This method allows users to replace the default MDS solver with a custom one. 299 | 300 | An example is provided below and in the example gallery : 301 | :ref:`sphx_glr_auto_examples_plot_benchmark_custom.py` 302 | 303 | .. important:: 304 | The custom solver must accept the same parameters as the default solvers 305 | and return a tuple containing the cluster centers and the execution time. 306 | e.g., it should have the signature: 307 | 308 | >>> def custom_solver( 309 | >>> n: int, 310 | >>> edges: np.ndarray, 311 | >>> nb_edges: int, 312 | >>> random_state: int | None = None 313 | >>> ) -> tuple[list, float]: 314 | >>> # Custom implementation details 315 | >>> centers = [...] 316 | >>> exec_time = ... 317 | >>> # Return the centers and execution time 318 | >>> return centers, exec_time 319 | 320 | This allows for flexibility in how the MDS problem is solved. 321 | 322 | Parameters: 323 | ----------- 324 | solver : callable 325 | The custom solver function to use for MDS clustering. 326 | It should accept the same parameters as the default solvers 327 | and return a tuple containing the cluster centers and the execution time. 328 | 329 | Raises: 330 | ------- 331 | ValueError 332 | If the provided solver does not have the correct signature. 
333 | 334 | """ 335 | if not callable(solver): 336 | raise ValueError("The provided solver must be callable.") 337 | 338 | # Check if the solver has the correct signature 339 | try: 340 | n = 3 341 | edges = np.array([[0, 1], [1, 2], [2, 0]]) 342 | nb_edges = edges.shape[0] 343 | solver(n, edges, nb_edges, random_state=None) 344 | except Exception as e: 345 | raise ValueError(f"The provided solver does not have the correct signature: {e}") from e 346 | self.manner = "custom" 347 | self._algorithms["custom"] = solver -------------------------------------------------------------------------------- /src/radius_clustering/utils/mds_core.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @file mds_core.cpp 3 | * @brief Core implementation of the Minimum Dominating Set (MDS) algorithm. 4 | * 5 | * This file contains the C++ implementation of the MDS algorithm, 6 | * including the iterated greedy approach and supporting data structures. 7 | * It provides the main computational logic for solving MDS problems. 
8 | */ 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include "random_manager.h" 18 | 19 | 20 | class Result { 21 | public: 22 | Result() {} // Add this line 23 | Result(std::string instanceName) : instanceName(instanceName) {} 24 | 25 | void add(std::string key, float value) { 26 | map.push_back(Tuple(key, value)); 27 | } 28 | 29 | float get(int pos) { 30 | return map[pos].value; 31 | } 32 | 33 | std::vector getKeys() { 34 | std::vector keys; 35 | for (auto& tuple : map) { 36 | keys.push_back(tuple.name); 37 | } 38 | return keys; 39 | } 40 | 41 | std::string getInstanceName() { 42 | return instanceName; 43 | } 44 | 45 | std::unordered_set getSolutionSet() { 46 | return solutionSet; 47 | } 48 | 49 | void setSolutionSet(std::unordered_set solutionSet) { 50 | this->solutionSet = solutionSet; 51 | } 52 | 53 | 54 | private: 55 | class Tuple { 56 | public: 57 | 58 | std::string name; 59 | float value; 60 | 61 | Tuple(std::string name, float value) : name(name), value(value) {} 62 | }; 63 | std::string instanceName; 64 | std::vector map; 65 | std::unordered_set solutionSet; 66 | }; 67 | 68 | class Instance { 69 | public: 70 | Instance(int n, const std::vector& edges_list, int nb_edges, std::string name) 71 | : name(name), numNodes(n), adjacencyList(n) { 72 | for (int i = 0; i < numNodes; ++i) { 73 | unSelectedNodes.insert(i); 74 | } 75 | constructAdjacencyList(edges_list, nb_edges); 76 | setSupportNodes(); 77 | } 78 | 79 | const std::vector>& getAdjacencyList() const { return adjacencyList; } 80 | const std::unordered_set& getSupportNodes() const { return supportNodes; } 81 | const std::unordered_set& getLeavesNodes() const { return leavesNodes; } 82 | const std::unordered_set& getUnSelectedNodes() const { return unSelectedNodes; } 83 | int getNumNodes() const { return numNodes; } 84 | std::string getName() const { return name; } 85 | 86 | private: 87 | std::string name; 88 | int numNodes; 89 | 
std::vector> adjacencyList; 90 | std::unordered_set supportNodes; 91 | std::unordered_set leavesNodes; 92 | std::unordered_set unSelectedNodes; 93 | const bool supportAndLeafNodes = true; 94 | 95 | void constructAdjacencyList(const std::vector& edge_list, int nb_edges) { 96 | for (int i = 0; i < 2 * nb_edges; i+=2) { 97 | int u = edge_list[i]; 98 | int v = edge_list[i+1]; 99 | adjacencyList[u].push_back(v); 100 | adjacencyList[v].push_back(u); 101 | } 102 | } 103 | 104 | void setSupportNodes() { 105 | for (int i = 0; i < numNodes; ++i) { 106 | if (adjacencyList[i].size() == 1 && supportAndLeafNodes) { 107 | int neighbour = adjacencyList[i][0]; 108 | if (leavesNodes.find(neighbour) == leavesNodes.end()) { 109 | leavesNodes.insert(i); 110 | supportNodes.insert(neighbour); 111 | } 112 | unSelectedNodes.erase(neighbour); 113 | unSelectedNodes.erase(i); 114 | } else if (adjacencyList[i].empty() && supportAndLeafNodes) { 115 | supportNodes.insert(i); 116 | } 117 | } 118 | } 119 | }; 120 | 121 | class Solution { 122 | public: 123 | Solution(const Instance& inst) 124 | : instance(&inst), numCovered(0), watchers(inst.getNumNodes()) { 125 | unSelectedNodes = inst.getUnSelectedNodes(); 126 | } 127 | 128 | Solution(const Solution& other) = default; 129 | Solution& operator=(const Solution& other) = default; 130 | 131 | bool isFeasible() const { return numCovered == instance->getNumNodes(); } 132 | bool checking() { 133 | bool removed = false; 134 | std::vector selectedList(selectedNotSupportNodes.begin(), selectedNotSupportNodes.end()); 135 | for (int select : selectedList) { 136 | if (watchers[select].size() > 1) { 137 | bool remove = true; 138 | for (int elem : instance->getAdjacencyList()[select]) { 139 | if (watchers[elem].size() == 1) { 140 | remove = false; 141 | break; 142 | } 143 | } 144 | if (remove) { 145 | removed = true; 146 | removeNode(select); 147 | } 148 | } 149 | } 150 | return removed; 151 | } 152 | 153 | void addNode(int node) { 154 | 
selectedNodes.insert(node); 155 | unSelectedNodes.erase(node); 156 | addWatcher(node); 157 | if (instance->getSupportNodes().find(node) == instance->getSupportNodes().end()) { 158 | selectedNotSupportNodes.insert(node); 159 | } 160 | } 161 | 162 | void removeNode(int node) { 163 | selectedNodes.erase(node); 164 | unSelectedNodes.insert(node); 165 | removeWatcher(node); 166 | selectedNotSupportNodes.erase(node); 167 | } 168 | 169 | int getBestNextNode() const { 170 | int bestCount = -1; 171 | int bestNode = -1; 172 | 173 | for (int i : unSelectedNodes) { 174 | int count = 0; 175 | for (int neighbour : instance->getAdjacencyList()[i]) { 176 | if (watchers[neighbour].empty()) { 177 | count++; 178 | } 179 | } 180 | if (bestCount < count && instance->getLeavesNodes().find(i) == instance->getLeavesNodes().end()) { 181 | bestCount = count; 182 | bestNode = i; 183 | } 184 | } 185 | return bestNode; 186 | } 187 | 188 | int getWorstNodeNew() const { 189 | int worstNode = -1; 190 | int totalMaxWatchers = 0; 191 | 192 | for (int i : selectedNotSupportNodes) { 193 | int minWatchers = std::numeric_limits::max(); 194 | for (int neighbour : instance->getAdjacencyList()[i]) { 195 | if (minWatchers > static_cast(watchers[neighbour].size())) { 196 | minWatchers = watchers[neighbour].size(); 197 | } 198 | } 199 | if (totalMaxWatchers < minWatchers) { 200 | worstNode = i; 201 | totalMaxWatchers = minWatchers; 202 | } 203 | } 204 | 205 | return worstNode; 206 | } 207 | 208 | int evaluate() const { return selectedNodes.size(); } 209 | const std::unordered_set& getSelectedNodes() const { return selectedNodes; } 210 | const std::unordered_set& getSelectedNotSupportNodes() const { return selectedNotSupportNodes; } 211 | const std::unordered_set& getUnSelectedNodes() const { return unSelectedNodes; } 212 | const std::vector>& getWatchers() const { return watchers; } 213 | int getNumNodes() const { return instance->getNumNodes(); } 214 | 215 | private: 216 | const Instance* instance; 217 | 
std::unordered_set selectedNodes; 218 | std::unordered_set selectedNotSupportNodes; 219 | std::unordered_set unSelectedNodes; 220 | int numCovered; 221 | std::vector> watchers; 222 | 223 | void addWatcher(int selectedNode) { 224 | if (watchers[selectedNode].empty()) { 225 | numCovered++; 226 | } 227 | watchers[selectedNode].insert(selectedNode); 228 | 229 | for (int neighbour : instance->getAdjacencyList()[selectedNode]) { 230 | if (watchers[neighbour].empty()) { 231 | numCovered++; 232 | } 233 | watchers[neighbour].insert(selectedNode); 234 | } 235 | } 236 | 237 | void removeWatcher(int selectedNode) { 238 | watchers[selectedNode].erase(selectedNode); 239 | if (watchers[selectedNode].empty()) { 240 | numCovered--; 241 | } 242 | 243 | for (int neighbour : instance->getAdjacencyList()[selectedNode]) { 244 | watchers[neighbour].erase(selectedNode); 245 | if (watchers[neighbour].empty()) { 246 | numCovered--; 247 | } 248 | } 249 | } 250 | }; 251 | 252 | class GIP { 253 | public: 254 | Solution construct(const Instance& instance) { 255 | Solution solution(instance); 256 | for (int supportNode : instance.getSupportNodes()) { 257 | solution.addNode(supportNode); 258 | } 259 | while (!solution.isFeasible()) { 260 | int selectedNode = solution.getBestNextNode(); 261 | solution.addNode(selectedNode); 262 | } 263 | return solution; 264 | } 265 | }; 266 | 267 | class LocalSearch { 268 | public: 269 | static Solution execute(Solution& sol, const Instance& instance) { 270 | bool improve = true; 271 | while (improve) { 272 | improve = checkImprove(sol, instance); 273 | } 274 | return sol; 275 | } 276 | 277 | private: 278 | static bool checkImprove(Solution& sol, const Instance& instance) { 279 | std::vector copySelected(sol.getSelectedNotSupportNodes().begin(), sol.getSelectedNotSupportNodes().end()); 280 | std::shuffle(copySelected.begin(), copySelected.end(), RandomManager::getRandom()); 281 | 282 | for (int nodeRem : copySelected) { 283 | int nodeNew = 
selectElemToAdd(nodeRem, instance, sol); 284 | if (nodeNew != -1) { 285 | int of = sol.evaluate(); 286 | sol.removeNode(nodeRem); 287 | sol.addNode(nodeNew); 288 | sol.checking(); 289 | if (sol.evaluate() < of) { 290 | return true; 291 | } 292 | } 293 | } 294 | return false; 295 | } 296 | 297 | static int selectElemToAdd(int node, const Instance& instance, const Solution& solution) { 298 | std::unordered_set neighbours; 299 | bool neighboursInitialized = false; 300 | 301 | if (solution.getWatchers()[node].size() == 1) { 302 | neighbours = std::unordered_set(instance.getAdjacencyList()[node].begin(), instance.getAdjacencyList()[node].end()); 303 | neighboursInitialized = true; 304 | } 305 | 306 | for (int neighbour : instance.getAdjacencyList()[node]) { 307 | if (solution.getWatchers()[neighbour].size() == 1) { 308 | if (!neighboursInitialized) { 309 | neighbours = std::unordered_set(instance.getAdjacencyList()[neighbour].begin(), instance.getAdjacencyList()[neighbour].end()); 310 | neighboursInitialized = true; 311 | } else { 312 | std::unordered_set temp; 313 | for (int n : instance.getAdjacencyList()[neighbour]) { 314 | if (neighbours.find(n) != neighbours.end()) { 315 | temp.insert(n); 316 | } 317 | } 318 | neighbours = std::move(temp); 319 | } 320 | } 321 | } 322 | 323 | if (neighboursInitialized) { 324 | neighbours.erase(node); 325 | } 326 | 327 | return !neighboursInitialized || neighbours.empty() ? 
-1 : *neighbours.begin(); 328 | } 329 | }; 330 | 331 | class IG { 332 | private: 333 | GIP constructive; 334 | LocalSearch localSearch; 335 | int maxItersWithoutImprove = 200; 336 | float beta = 0.2f; 337 | bool randomDestruct = true; 338 | bool randomConstruct = false; 339 | 340 | public: 341 | IG(GIP& constructive, LocalSearch& localSearch) 342 | : constructive(constructive), localSearch(localSearch) {} 343 | 344 | Result execute(const Instance& instance) { 345 | long initialTime = std::chrono::duration_cast( 346 | std::chrono::system_clock::now().time_since_epoch() 347 | ).count(); 348 | long totalTime = 0; 349 | float secs = 0.0f; 350 | Result result(instance.getName()); 351 | 352 | Solution solution = firstSol(instance); 353 | int numElemsToDestruct = std::ceil(beta * solution.getSelectedNotSupportNodes().size()); 354 | 355 | int numItersWithoutImprove = 0; 356 | int bestOF = solution.evaluate(); 357 | while (numItersWithoutImprove < maxItersWithoutImprove && secs <= 600) { 358 | Solution current_solution = solution; 359 | destruct(current_solution, numElemsToDestruct); 360 | construct(current_solution); 361 | executeLocalSearch(current_solution, instance); 362 | if (current_solution.evaluate() >= bestOF) { 363 | numItersWithoutImprove++; 364 | } else { 365 | numItersWithoutImprove = 0; 366 | bestOF = current_solution.evaluate(); 367 | solution = std::move(current_solution); 368 | } 369 | 370 | totalTime = std::chrono::duration_cast( 371 | std::chrono::system_clock::now().time_since_epoch() 372 | ).count() - initialTime; 373 | secs = totalTime / 1000.0f; 374 | } 375 | 376 | result.setSolutionSet(solution.getSelectedNodes()); 377 | result.add("Time", secs); 378 | result.add("OF", static_cast(bestOF)); 379 | return result; 380 | } 381 | 382 | private: 383 | Solution firstSol(const Instance& instance) { 384 | Solution solution = constructive.construct(instance); 385 | executeLocalSearch(solution, instance); 386 | return solution; 387 | } 388 | 389 | void 
destruct(Solution& solution, int numElemsToDestruct) { 390 | if (randomDestruct) { 391 | destructRandom(solution, numElemsToDestruct); 392 | } else { 393 | destructGreedy(solution, numElemsToDestruct); 394 | } 395 | } 396 | 397 | void construct(Solution& solution) { 398 | if (randomConstruct) { 399 | constructRandom(solution); 400 | } else { 401 | constructGreedy(solution); 402 | } 403 | } 404 | 405 | void destructRandom(Solution& solution, int numElemsToDestruct) { 406 | std::vector selectedList(solution.getSelectedNotSupportNodes().begin(), solution.getSelectedNotSupportNodes().end()); 407 | std::shuffle(selectedList.begin(), selectedList.end(), RandomManager::getRandom()); 408 | for (int i = 0; i < numElemsToDestruct; i++) { 409 | solution.removeNode(selectedList[i]); 410 | } 411 | } 412 | 413 | void destructGreedy(Solution& solution, int numElemsToDestruct) { 414 | for (int i = 0; i < numElemsToDestruct; i++) { 415 | int worstNode = solution.getWorstNodeNew(); 416 | solution.removeNode(worstNode); 417 | } 418 | } 419 | 420 | void constructRandom(Solution& solution) { 421 | while (!solution.isFeasible()) { 422 | int randomNode = RandomManager::nextInt(solution.getNumNodes()); 423 | solution.addNode(randomNode); 424 | } 425 | } 426 | 427 | void constructGreedy(Solution& solution) { 428 | while (!solution.isFeasible() && !solution.getUnSelectedNodes().empty()) { 429 | int bestNode = solution.getBestNextNode(); 430 | solution.addNode(bestNode); 431 | } 432 | } 433 | 434 | void executeLocalSearch(Solution& solution, const Instance& instance) { 435 | solution = localSearch.execute(solution, instance); 436 | } 437 | }; 438 | 439 | class Main { 440 | private: 441 | GIP constructive; 442 | LocalSearch localSearch; 443 | IG algorithm; 444 | 445 | static void signal_handler(int sig) { 446 | exit(sig); 447 | } 448 | 449 | public: 450 | Main() : algorithm(constructive, localSearch) {} 451 | 452 | Result execute(int numNodes, const std::vector& edges_list, int nb_edges, long 
seed) { 453 | Instance instance(numNodes, edges_list, nb_edges, "name"); 454 | RandomManager::setSeed(seed); 455 | signal(SIGINT, signal_handler); 456 | return algorithm.execute(instance); 457 | } 458 | }; 459 | 460 | extern "C" { 461 | inline Result iterated_greedy_wrapper(int numNodes, const std::vector& edges_list, int nb_edges, long seed) { 462 | static Main main; // Create a single static instance 463 | 464 | return main.execute(numNodes, edges_list, nb_edges, seed); 465 | } 466 | } -------------------------------------------------------------------------------- /notebooks/comparison_example.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "4acb9df3", 6 | "metadata": {}, 7 | "source": [ 8 | "# Comparison of Radius Clustering with KMeans on the samples Dataset\n", 9 | "\n", 10 | "\n", 11 | "This example is meant to illustrate the use of the Radius clustering library on several datasets.\n", 12 | "\n", 13 | "The example includes:\n", 14 | "1. Loading the datasets\n", 15 | "2. Applying Radius clustering and k-means clustering\n", 16 | "3. Visualizing the clustering results\n", 17 | "\n", 18 | "This example serves as a simple introduction to using the Radius clustering library on well-known datasets.\n", 19 | "\n", 20 | "**Author: Haenn Quentin**\n", 21 | "\n", 22 | "**@SPDX-License-Identifier: MIT**\n", 23 | "\n", 24 | "\n", 25 | "\n", 26 | "## 1. Load the Iris dataset\n", 27 | "\n", 28 | "We start by loading the Iris dataset using the `fetch_openml` function from `sklearn.datasets`.\n", 29 | "The Iris dataset is a well-known dataset that contains 150 samples of iris flowers.\n", 30 | "Each sample has 4 features: sepal length, sepal width, petal length, and petal width.\n", 31 | "The dataset is labeled with 3 classes: setosa, versicolor, and virginica." 
32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "id": "e28a516b", 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "import numpy as np\n", 42 | "from sklearn import datasets\n", 43 | "from radius_clustering import RadiusClustering\n", 44 | "\n", 45 | "# Load the Iris dataset\n", 46 | "iris = datasets.load_iris()\n", 47 | "X = iris[\"data\"]\n", 48 | "y = iris.target" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "id": "b84938fd", 54 | "metadata": {}, 55 | "source": [ 56 | "\n", 57 | "## 2. Visualize the Iris dataset\n", 58 | "\n", 59 | "\n", 60 | "We can visualize the Iris dataset by plotting the dataset. We use PCA to reduce the dimensionality to 3D and plot the dataset in a 3D scatter plot." 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "id": "28f37b15", 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "import matplotlib.pyplot as plt\n", 71 | "from sklearn.decomposition import PCA\n", 72 | "import mpl_toolkits.mplot3d\n", 73 | "\n", 74 | "# Reduce the dimensionality of the dataset to 3D using PCA\n", 75 | "pca = PCA(n_components=3)\n", 76 | "iris_reduced = pca.fit_transform(X)\n", 77 | "fig = plt.figure(figsize=(8, 6))\n", 78 | "ax = fig.add_subplot(111, projection=\"3d\", elev=48, azim=134)\n", 79 | "ax.scatter(\n", 80 | " iris_reduced[:, 0],\n", 81 | " iris_reduced[:, 1],\n", 82 | " iris_reduced[:, 2],\n", 83 | " c=y,\n", 84 | " cmap=\"Dark2\",\n", 85 | " s=40,\n", 86 | ")\n", 87 | "# Set plot labels\n", 88 | "ax.set_title(\"Iris dataset in first 3 PCA components\")\n", 89 | "ax.set_xlabel(\"1st eigenvector\")\n", 90 | "ax.set_ylabel(\"2nd eigenvector\")\n", 91 | "ax.set_zlabel(\"3rd eigenvector\")\n", 92 | "\n", 93 | "# Hide tick labels\n", 94 | "ax.xaxis.set_ticklabels([])\n", 95 | "ax.yaxis.set_ticklabels([])\n", 96 | "ax.zaxis.set_ticklabels([])\n", 97 | "\n", 98 | "plt.show()" 99 | ] 100 | }, 101 | { 102 | "cell_type": "markdown", 103 | "id": 
"cd38d50b", 104 | "metadata": {}, 105 | "source": [ 106 | "\n", 107 | "## 3. Compute Clustering with Radius Clustering\n", 108 | "\n", 109 | "We can now apply Radius clustering to the Iris dataset.\n", 110 | "We create an instance of the `RadiusClustering` class and fit it to the Iris dataset." 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "id": "9282ec34", 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "import time\n", 121 | "\n", 122 | "rad = RadiusClustering(manner=\"exact\", radius=1.43)\n", 123 | "t0 = time.time()\n", 124 | "rad.fit(X)\n", 125 | "t_rad = time.time() - t0" 126 | ] 127 | }, 128 | { 129 | "cell_type": "markdown", 130 | "id": "2653845e", 131 | "metadata": {}, 132 | "source": [ 133 | "\n", 134 | "## 4. Compute KMeans Clustering for Comparison\n", 135 | "\n", 136 | "We also apply KMeans clustering to the Iris dataset for comparison.\n" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "id": "e7e993f5", 143 | "metadata": {}, 144 | "outputs": [], 145 | "source": [ 146 | "\n", 147 | "from sklearn.cluster import KMeans\n", 148 | "\n", 149 | "k_means = KMeans(n_clusters=3, n_init=10)\n", 150 | "t0 = time.time()\n", 151 | "k_means.fit(X)\n", 152 | "t_kmeans = time.time() - t0" 153 | ] 154 | }, 155 | { 156 | "cell_type": "markdown", 157 | "id": "d1072a7f", 158 | "metadata": {}, 159 | "source": [ 160 | "## 5. Establishing parity between clusters\n", 161 | "\n", 162 | "We want to have the same color for the same cluster in both plots.\n", 163 | "We can achieve this by matching the cluster labels of the Radius clustering and the KMeans clustering.\n", 164 | "First we define a function to retrieve the cluster centers from the Radius clustering and KMeans clustering and\n", 165 | "match them pairwise." 
166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": null, 171 | "id": "3ac48cdf", 172 | "metadata": {}, 173 | "outputs": [], 174 | "source": [ 175 | "\n", 176 | "def get_order_labels(kmeans, rad, data):\n", 177 | " centers1_cpy = kmeans.cluster_centers_.copy()\n", 178 | " centers2_cpy = data[rad.centers_].copy()\n", 179 | " order = []\n", 180 | " # For each center in the first clustering, find the closest center in the second clustering\n", 181 | " for center in centers1_cpy:\n", 182 | " match = pairwise_distances_argmin([center], centers2_cpy)\n", 183 | " # if there is only one center left, assign it to the last cluster label not yet assigned\n", 184 | " if len(centers2_cpy) == 1:\n", 185 | " for i in range(len(centers1_cpy)):\n", 186 | " if i not in order:\n", 187 | " order.append(i)\n", 188 | " break\n", 189 | " break\n", 190 | " # get coordinates of the center in the second clustering\n", 191 | " coordinates = centers2_cpy[match]\n", 192 | " # find the closest point in the data to the center to get the cluster label\n", 193 | " closest_point = pairwise_distances_argmin(coordinates, data)\n", 194 | " match_label = rad.labels_[closest_point]\n", 195 | " # remove the center from the second clustering\n", 196 | " centers2_cpy = np.delete(centers2_cpy, match, axis=0)\n", 197 | " # add the cluster label to the order\n", 198 | " order.append(int(match_label[0]))\n", 199 | " return order\n", 200 | "\n", 201 | "\n", 202 | "from sklearn.metrics.pairwise import pairwise_distances_argmin\n", 203 | "\n", 204 | "rad_centers_index = np.array(rad.centers_)\n", 205 | "order = get_order_labels(k_means, rad, X)\n", 206 | "\n", 207 | "kmeans_centers = k_means.cluster_centers_\n", 208 | "rad_centers = rad_centers_index[order]\n", 209 | "rad_centers_coordinates = X[rad_centers]\n", 210 | "\n", 211 | "# Pair the cluster labels\n", 212 | "kmeans_labels = pairwise_distances_argmin(X, kmeans_centers)\n", 213 | "rad_labels = pairwise_distances_argmin(X, 
rad_centers_coordinates)" 214 | ] 215 | }, 216 | { 217 | "cell_type": "markdown", 218 | "id": "b428447c", 219 | "metadata": {}, 220 | "source": [ 221 | "### Plotting the results and the difference" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": null, 227 | "id": "69c095ee", 228 | "metadata": {}, 229 | "outputs": [], 230 | "source": [ 231 | "fig = plt.figure(figsize=(12, 6))\n", 232 | "fig.subplots_adjust(left=0.02, right=0.98, bottom=0.05, top=0.9)\n", 233 | "colors = [\"#4EACC5\", \"#FF9C34\", \"#4E9A06\"]\n", 234 | "\n", 235 | "# KMeans\n", 236 | "ax = fig.add_subplot(1, 3, 1, projection=\"3d\", elev=48, azim=134, roll=0)\n", 237 | "\n", 238 | "ax.scatter(\n", 239 | " iris_reduced[:, 0],\n", 240 | " iris_reduced[:, 1],\n", 241 | " iris_reduced[:, 2],\n", 242 | " c=kmeans_labels,\n", 243 | " cmap=\"Dark2\",\n", 244 | " s=40,\n", 245 | ")\n", 246 | "# adapting center coordinates to the 3D plot\n", 247 | "kmeans_centers = pca.transform(kmeans_centers)\n", 248 | "ax.scatter(\n", 249 | " kmeans_centers[:, 0],\n", 250 | " kmeans_centers[:, 1],\n", 251 | " kmeans_centers[:, 2],\n", 252 | " c=\"r\",\n", 253 | " s=200,\n", 254 | ")\n", 255 | "ax.set_title(\"KMeans\")\n", 256 | "ax.set_xticks(())\n", 257 | "ax.set_yticks(())\n", 258 | "ax.set_zticks(())\n", 259 | "\n", 260 | "ax.text3D(-3.5, 3, 1.0, \"train time: %.2fs\\ninertia: %f\" % (t_kmeans, k_means.inertia_))\n", 261 | "\n", 262 | "# MDS\n", 263 | "ax = fig.add_subplot(1, 3, 2, projection=\"3d\", elev=48, azim=134, roll=0)\n", 264 | "ax.scatter(\n", 265 | " iris_reduced[:, 0],\n", 266 | " iris_reduced[:, 1],\n", 267 | " iris_reduced[:, 2],\n", 268 | " c=rad_labels,\n", 269 | " cmap=\"Dark2\",\n", 270 | " s=40,\n", 271 | ")\n", 272 | "# adapting center coordinates to the 3D plot\n", 273 | "rad_centers_coordinates = pca.transform(rad_centers_coordinates)\n", 274 | "ax.scatter(\n", 275 | " rad_centers_coordinates[:, 0],\n", 276 | " rad_centers_coordinates[:, 1],\n", 277 | " 
rad_centers_coordinates[:, 2],\n", 278 | " c=\"r\",\n", 279 | " s=200,\n", 280 | ")\n", 281 | "ax.set_title(\"MDS Clustering\")\n", 282 | "ax.set_xticks(())\n", 283 | "ax.set_yticks(())\n", 284 | "ax.set_zticks(())\n", 285 | "ax.text3D(-3.5, 3, 0.0, \"train time: %.2fs\" % t_rad)\n", 286 | "\n", 287 | "# Initialize the different array to all False\n", 288 | "different = rad_labels == 4\n", 289 | "ax = fig.add_subplot(1, 3, 3, projection=\"3d\", elev=48, azim=134, roll=0)\n", 290 | "\n", 291 | "for k in range(3):\n", 292 | " different += (kmeans_labels == k) != (rad_labels == k)\n", 293 | "\n", 294 | "identical = np.logical_not(different)\n", 295 | "ax.scatter(\n", 296 | " iris_reduced[identical, 0], iris_reduced[identical, 1], color=\"#bbbbbb\", marker=\".\"\n", 297 | ")\n", 298 | "ax.scatter(iris_reduced[different, 0], iris_reduced[different, 1], color=\"m\")\n", 299 | "ax.set_title(\"Difference\")\n", 300 | "ax.set_xticks(())\n", 301 | "ax.set_yticks(())\n", 302 | "ax.set_zticks(())\n", 303 | "\n", 304 | "plt.show()" 305 | ] 306 | }, 307 | { 308 | "cell_type": "markdown", 309 | "id": "3d1c4fcf", 310 | "metadata": {}, 311 | "source": [ 312 | "## Another difference plot\n", 313 | "\n", 314 | "As we saw, the difference plot is not very informative using Iris.\n", 315 | "We'll use a different dataset to show the difference plot." 
316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": null, 321 | "id": "ea3d0438", 322 | "metadata": {}, 323 | "outputs": [], 324 | "source": [ 325 | "wine = datasets.load_wine()\n", 326 | "X = wine.data\n", 327 | "y = wine.target\n", 328 | "pca = PCA(n_components=3)\n", 329 | "wine_reduced = pca.fit_transform(X)\n", 330 | "\n", 331 | "# Compute clustering with MDS\n", 332 | "\n", 333 | "rad = RadiusClustering(manner=\"exact\", radius=232.09)\n", 334 | "t0 = time.time()\n", 335 | "rad.fit(X)\n", 336 | "t_rad = time.time() - t0\n", 337 | "\n", 338 | "# Compute KMeans clustering for comparison\n", 339 | "\n", 340 | "k_means = KMeans(n_clusters=3, n_init=10)\n", 341 | "t0 = time.time()\n", 342 | "k_means.fit(X)\n", 343 | "t_kmeans = time.time() - t0" 344 | ] 345 | }, 346 | { 347 | "cell_type": "markdown", 348 | "id": "3929dee4", 349 | "metadata": {}, 350 | "source": [ 351 | "## Reapplying the same process as before" 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": null, 357 | "id": "24449b3a", 358 | "metadata": {}, 359 | "outputs": [], 360 | "source": [ 361 | "rad_centers_index = np.array(rad.centers_)\n", 362 | "order = get_order_labels(k_means, rad, X)\n", 363 | "\n", 364 | "kmeans_centers = k_means.cluster_centers_\n", 365 | "rad_centers = rad_centers_index[order]\n", 366 | "rad_centers_coordinates = X[rad_centers]\n", 367 | "\n", 368 | "# Pair the cluster labels\n", 369 | "kmeans_labels = pairwise_distances_argmin(X, kmeans_centers)\n", 370 | "rad_labels = pairwise_distances_argmin(X, rad_centers_coordinates)" 371 | ] 372 | }, 373 | { 374 | "cell_type": "markdown", 375 | "id": "3accac5b", 376 | "metadata": {}, 377 | "source": [ 378 | "## Plotting the results and the difference" 379 | ] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "execution_count": null, 384 | "id": "39235d3c", 385 | "metadata": {}, 386 | "outputs": [], 387 | "source": [ 388 | "fig = plt.figure(figsize=(12, 6))\n", 389 | 
"fig.subplots_adjust(left=0.02, right=0.98, bottom=0.05, top=0.9)\n", 390 | "colors = [\"#4EACC5\", \"#FF9C34\", \"#4E9A06\"]\n", 391 | "\n", 392 | "# KMeans\n", 393 | "ax = fig.add_subplot(1, 3, 1, projection=\"3d\", elev=48, azim=134, roll=0)\n", 394 | "\n", 395 | "ax.scatter(\n", 396 | " wine_reduced[:, 0],\n", 397 | " wine_reduced[:, 1],\n", 398 | " wine_reduced[:, 2],\n", 399 | " c=kmeans_labels,\n", 400 | " cmap=\"Dark2\",\n", 401 | " s=40,\n", 402 | ")\n", 403 | "# adapting center coordinates to the 3D plot\n", 404 | "kmeans_centers = pca.transform(kmeans_centers)\n", 405 | "ax.scatter(\n", 406 | " kmeans_centers[:, 0],\n", 407 | " kmeans_centers[:, 1],\n", 408 | " kmeans_centers[:, 2],\n", 409 | " c=\"r\",\n", 410 | " s=200,\n", 411 | ")\n", 412 | "ax.set_title(\"KMeans\")\n", 413 | "ax.set_xticks(())\n", 414 | "ax.set_yticks(())\n", 415 | "ax.set_zticks(())\n", 416 | "\n", 417 | "ax.text3D(\n", 418 | " 60.0, 80.0, 0.0, \"train time: %.2fs\\ninertia: %f\" % (t_kmeans, k_means.inertia_)\n", 419 | ")\n", 420 | "\n", 421 | "# MDS\n", 422 | "ax = fig.add_subplot(1, 3, 2, projection=\"3d\", elev=48, azim=134, roll=0)\n", 423 | "ax.scatter(\n", 424 | " wine_reduced[:, 0],\n", 425 | " wine_reduced[:, 1],\n", 426 | " wine_reduced[:, 2],\n", 427 | " c=rad_labels,\n", 428 | " cmap=\"Dark2\",\n", 429 | " s=40,\n", 430 | ")\n", 431 | "# adapting center coordinates to the 3D plot\n", 432 | "rad_centers_coordinates = pca.transform(rad_centers_coordinates)\n", 433 | "ax.scatter(\n", 434 | " rad_centers_coordinates[:, 0],\n", 435 | " rad_centers_coordinates[:, 1],\n", 436 | " rad_centers_coordinates[:, 2],\n", 437 | " c=\"r\",\n", 438 | " s=200,\n", 439 | ")\n", 440 | "ax.set_title(\"MDS Clustering\")\n", 441 | "ax.set_xticks(())\n", 442 | "ax.set_yticks(())\n", 443 | "ax.set_zticks(())\n", 444 | "ax.text3D(60.0, 80.0, 0.0, \"train time: %.2fs\" % t_rad)\n", 445 | "\n", 446 | "# Initialize the different array to all False\n", 447 | "different = rad_labels == 4\n", 448 | 
"ax = fig.add_subplot(1, 3, 3, projection=\"3d\", elev=48, azim=134, roll=0)\n", 449 | "\n", 450 | "for k in range(3):\n", 451 | " different += (kmeans_labels == k) != (rad_labels == k)\n", 452 | "\n", 453 | "identical = np.logical_not(different)\n", 454 | "ax.scatter(\n", 455 | " wine_reduced[identical, 0], wine_reduced[identical, 1], color=\"#bbbbbb\", marker=\".\"\n", 456 | ")\n", 457 | "ax.scatter(wine_reduced[different, 0], wine_reduced[different, 1], color=\"m\")\n", 458 | "ax.set_title(\"Difference\")\n", 459 | "ax.set_xticks(())\n", 460 | "ax.set_yticks(())\n", 461 | "ax.set_zticks(())\n", 462 | "\n", 463 | "plt.show()" 464 | ] 465 | }, 466 | { 467 | "cell_type": "markdown", 468 | "id": "c1172f38", 469 | "metadata": {}, 470 | "source": [ 471 | "## Conclusion\n", 472 | "\n", 473 | "In this example, we applied Radius clustering to the Iris and Wine datasets and compared it with KMeans clustering.\n", 474 | "We visualized the clustering results and the difference between the two clustering algorithms.\n", 475 | "We saw that Radius Clustering can lead to smaller clusters than kmeans, which produces much more equilibrate clusters.\n", 476 | "The difference plot can be very useful to see where the two clustering algorithms differ." 477 | ] 478 | } 479 | ], 480 | "metadata": { 481 | "language_info": { 482 | "name": "python" 483 | } 484 | }, 485 | "nbformat": 4, 486 | "nbformat_minor": 5 487 | } 488 | -------------------------------------------------------------------------------- /src/radius_clustering/utils/mds3-util.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file mds3-util.h 3 | * @brief Utility functions and data structures for MDS algorithms. 4 | * 5 | * This header file defines various utility functions, data structures, 6 | * and constants used in the implementation of MDS algorithms. 
7 | * It includes helper functions for graph manipulation, random number generation, 8 | * and other common operations used across the MDS solver. 9 | */ 10 | 11 | #ifndef MDS3_UTIL_H 12 | #define MDS3_UTIL_H 13 | 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | #ifdef _WIN32 21 | #include 22 | #elif defined(__APPLE__) || defined(__linux__) 23 | #include 24 | #include 25 | #else 26 | #error "Unsupported platform" 27 | #endif 28 | 29 | #define WORD_LENGTH 100 30 | #define TRUE 1 31 | #define FALSE 0 32 | #define NONE 0 33 | #define DELIMITER 0 34 | #define PASSIVE 0 35 | #define ACTIVE 1 36 | #define MAX_NODE 10000000 37 | #define max_expand_depth 100000 38 | #define MAXIS 16 39 | 40 | #define for_each_vertex(node) for(int node=1;node<=NB_NODE;node++) 41 | #define for_each_neighbor(__vertex,__neighbor) for(int * __ptr=Node_Neighbors[__vertex],__neighbor=*__ptr;__neighbor!=NONE;__neighbor=*(++__ptr)) 42 | 43 | #define domed(node) (STATUS[node].dominated) 44 | #define clr_domed_status(node) (STATUS[node].dominated=0) 45 | #define set_domed_status(node) (STATUS[node].dominated=1) 46 | 47 | #define fixed(node) (STATUS[node].fixed) 48 | #define deleted(node) (STATUS[node].deleted) 49 | 50 | #define removed(node) (STATUS[node].removed) 51 | #define set_removed_status(node) (STATUS[node].removed=1) 52 | #define clr_removed_status(node) (STATUS[node].removed=0) 53 | 54 | #define branched(node) (STATUS[node].branched) 55 | #define set_branched_status(node) (STATUS[node].branched=1) 56 | #define clr_branched_status(node) (STATUS[node].branched=0) 57 | 58 | #define active(node) (STATUS[node].active) 59 | #define set_active(node) (STATUS[node].active=1) 60 | #define clr_active(node) (STATUS[node].active=0) 61 | 62 | #define included(node) (STATUS[node].included) 63 | #define set_included_status(node) (STATUS[node].included=1) 64 | #define clr_included_status(node) (STATUS[node].included=0) 65 | 66 | #define bit_set(vec,idx) ((*(vec+(idx>>5)))|= 
(1<<(idx&31))) 67 | #define bit_clr(vec,idx) ((*(vec+(idx>>5)))&= (~(1<<(idx&31)))) 68 | #define bit_val(vec,idx) ((*(vec+(idx>>5)))&(1<<(idx&31))) 69 | #define CUR_BRA_IDX BRAIDX[CUR_LEVEL] 70 | #define CUR_BRA_NODE ITEM(BRA_STK,CUR_BRA_IDX) 71 | #define CUR_LEVEL_UND_IDX UNDIDX[CUR_LEVEL] 72 | #define adjlen(node) ((node)/32+1) 73 | 74 | #define marked(node) (PID[node].marked) 75 | #define set_marked_status(node) (PID[node].marked=1) 76 | #define clr_marked_status(node) (PID[node].marked=0) 77 | 78 | #define involved(node) (PID[node].involved) 79 | #define set_involved_status(node) (PID[node].involved=1) 80 | #define clr_involved_status(node) (PID[node].involved=0) 81 | #define set_newid(node,id) (PID[node].newid=id) 82 | #define set_isno(node,no) (PID[node].isno=no) 83 | 84 | 85 | #define newid(node) (PID[node].newid) 86 | #define isno(node) (PID[node].isno) 87 | 88 | #define branch_node_at_level(i) ITEM(BRA_STK,BRAIDX[i]) 89 | 90 | // Macro for vector 91 | #define VEC_DECLARE(T,tName) \ 92 | typedef struct { \ 93 | T *addr; \ 94 | unsigned used;\ 95 | unsigned capacity;\ 96 | }tName 97 | 98 | #define push_back(Vec,T,Val) \ 99 | do{ \ 100 | assert(Vec->used<=Vec->capacity);\ 101 | if(Vec->used==Vec->capacity){ \ 102 | int size=Vec->capacity*2; \ 103 | Vec->addr=(T *)realloc(Vec->addr,(size+1)*sizeof(T));\ 104 | assert(Vec->addr!=NULL); \ 105 | Vec->capacity=size;\ 106 | }\ 107 | *(Vec->addr+Vec->used)=(Val);\ 108 | (Vec->used)++;\ 109 | }while(0) 110 | 111 | #define new_stack(Vec,VEC_TYPE,ITEM_TYPE,len) \ 112 | do{ \ 113 | assert(len>0);\ 114 | unsigned size=(len);\ 115 | Vec=(VEC_TYPE *)calloc(1,sizeof(VEC_TYPE));\ 116 | assert(Vec!=NULL);\ 117 | (Vec)->addr=(ITEM_TYPE *)malloc((size+1)*sizeof(ITEM_TYPE)); \ 118 | assert((Vec)->addr); \ 119 | (Vec)->capacity=size;\ 120 | (Vec)->used=0;\ 121 | }while(0) 122 | 123 | #define free_stack(Vec) \ 124 | do{ \ 125 | if(Vec!=NULL){\ 126 | free(Vec->addr);\ 127 | free(Vec);\ 128 | }\ 129 | }while(0) 130 | 131 | #define 
for_each_vec_item(Vec,T,It) for(T *It=Vec->addr, *__end=Vec->addr+Vec->used;It != __end;It++) 132 | 133 | #define remove_value_from_vector(Vec,T,Val) do{ \ 134 | for(T *It=Vec->addr, *__end=Vec->addr+Vec->used;It != __end;)\ 135 | if(*It==Val){\ 136 | Vec->used--;*It=*(Vec->addr+Vec->used);\ 137 | __end--;\ 138 | }else \ 139 | It++;\ 140 | }while(0) 141 | 142 | #define ITEM(VEC,IDX) (VEC->addr[(IDX)]) 143 | #define USED(VEC) (VEC->used) 144 | 145 | VEC_DECLARE(int,VEC_INT); 146 | VEC_DECLARE(unsigned,VEC_UINT); 147 | 148 | /* end of vector */ 149 | /* 150 | typedef struct{ 151 | unsigned fixed:1; 152 | unsigned active:1; 153 | unsigned deleted:1; 154 | unsigned removed:1; 155 | unsigned included:1; 156 | unsigned branched:1; 157 | unsigned dominated:1; 158 | unsigned undidx:25; 159 | }VSTATUS; 160 | */ 161 | typedef struct{ 162 | char fixed:1; 163 | char active:1; 164 | char deleted:1; 165 | char removed:1; 166 | char included:1; 167 | char branched:1; 168 | char dominated:1; 169 | char future:1; 170 | }VSTATUS; 171 | 172 | typedef struct{ 173 | unsigned involved:1; 174 | unsigned marked:1; 175 | unsigned newid:20; 176 | unsigned isno:10; 177 | }PSTATUS; 178 | 179 | 180 | static int * Init_Adj_List; 181 | static int BLOCK_COUNT = 0; 182 | static int *BLOCK_LIST[100]; 183 | static int **Node_Neighbors; 184 | 185 | static unsigned Node_Degree[MAX_NODE]; 186 | 187 | static int NB_NODE,NB_EDGE, Max_Degree = 0, Max_Degree_Node,SUB_PROBLEM_SIZE; 188 | static int FORMAT = 1, NB_NODE_O, NB_EDGE_O; 189 | static double READ_TIME, INIT_TIME, SEARCH_TIME; 190 | static double D0 = 0, D1 = 0, D2 = 0, Dt = 0; 191 | static int INIT_BRANCHING_NODE=0,INIT_UPPER_BOUND=0; 192 | static unsigned long long NB_TREE=0; 193 | 194 | static int * CFG; 195 | static int * LOC; 196 | static int * BRAIDX; 197 | static int * UNDIDX; 198 | static VEC_INT * BRA_STK; 199 | static int BEST_LEVEL,CUR_LEVEL,CUR_UND_IDX; 200 | static VSTATUS * STATUS; 201 | static PSTATUS * PID; 202 | static int * 
ADJIDX; 203 | static VEC_UINT * ADJ_STK; 204 | static int TIME_OUT, CUT_OFF=0; 205 | static double BEST_SOL_TIME; 206 | static char instance[1024]={'\0'}; 207 | static VEC_INT *iSET[MAXIS+1]; 208 | static int iSET_Counter[MAXIS+1]; 209 | static int iSET_Status[MAXIS]; 210 | static float *Node_Score; 211 | 212 | struct Result { 213 | int* dominating_set; 214 | int set_size; 215 | double exec_time; 216 | }; 217 | 218 | static double get_utime() { 219 | #ifdef _WIN32 220 | FILETIME createTime; 221 | FILETIME exitTime; 222 | FILETIME kernelTime; 223 | FILETIME userTime; 224 | if (GetProcessTimes(GetCurrentProcess(), 225 | &createTime, &exitTime, 226 | &kernelTime, &userTime) != 0) { 227 | ULARGE_INTEGER li = {{userTime.dwLowDateTime, userTime.dwHighDateTime}}; 228 | return li.QuadPart * 1e-7; 229 | } 230 | return 0.0; 231 | #elif defined(__APPLE__) || defined(__linux__) 232 | struct rusage utime; 233 | if (getrusage(RUSAGE_SELF, &utime) == 0) { 234 | return (double)utime.ru_utime.tv_sec + (double)utime.ru_utime.tv_usec * 1e-6; 235 | } 236 | return 0.0; 237 | #else 238 | return (double)clock() / CLOCKS_PER_SEC; 239 | #endif 240 | } 241 | 242 | static int cmp_branching_vertex_score(const void * a, const void *b){ 243 | return Node_Degree[*((int *) b)] - Node_Degree[*((int *) a)]; 244 | } 245 | 246 | static int int_cmp_desc(const void * a, const void * b) { 247 | return *((int *) b) - *((int *) a); 248 | } 249 | 250 | static int int_cmp_asc(const void * a, const void * b) { 251 | return *((int *) a) - *((int *) b); 252 | } 253 | static VEC_INT * FIX_STK,* TMP_STK; 254 | static VEC_INT * VEC_SUBGRAPHS; 255 | static VEC_INT * VEC_SOLUTION; 256 | static int NB_FIXED=0,NEW_IDX=0,NB_UNFIXED=0; 257 | 258 | 259 | static void allocate_memory_for_adjacency_list(int nb_node, int nb_edge,int offset) { 260 | int i, block_size = 40960000; 261 | unsigned int free_size = 0; 262 | Init_Adj_List = (int *) malloc((2 * nb_edge + nb_node) * sizeof(int)); 263 | if (Init_Adj_List == NULL ) { 
	/* Tail of allocate_memory_for_adjacency_list: lay each node's neighbor
	 * array out consecutively inside pre-allocated blocks; whenever the
	 * current block cannot hold the next node's list (+1 slot for the NONE
	 * terminator), a fresh block is malloc'd and recorded in BLOCK_LIST so
	 * free_block() can release it later. */
	for (i = 1; i <= NB_NODE; i++) {
		if (Node_Degree[i - offset] + 1 > free_size) {
			/* current block exhausted: start a new one */
			Node_Neighbors[i] = (int *) malloc(block_size * sizeof(int));
			BLOCK_LIST[BLOCK_COUNT++] = Node_Neighbors[i];
			free_size = block_size - (Node_Degree[i - offset] + 1);
		} else {
			/* carve this node's list right after the previous node's */
			Node_Neighbors[i] = Node_Neighbors[i - 1]
					+ Node_Degree[i - 1 - offset] + 1;
			free_size = free_size - (Node_Degree[i - offset] + 1);
		}
	}
	} else {
		/* Everything fits in the single initial block Init_Adj_List:
		 * all neighbor lists are consecutive sub-arrays of it. */
		BLOCK_COUNT = 1;
		BLOCK_LIST[BLOCK_COUNT - 1] = Init_Adj_List;
		Node_Neighbors[1] = Init_Adj_List;
		for (i = 2; i <= NB_NODE; i++) {
			Node_Neighbors[i] = Node_Neighbors[i - 1] + Node_Degree[i - 1 - offset]
					+ 1;
		}
	}
}


/*
 * Build the global adjacency structure (Node_Neighbors / Node_Degree /
 * NB_NODE / NB_EDGE / Max_Degree) from a flat edge array.
 *
 * edges    - 2*nb_edges unsigned ints, laid out as pairs (u0,v0,u1,v1,...).
 * n        - lower bound for the number of nodes (initial max_node).
 * nb_edges - number of edge pairs in `edges`.
 *
 * Two passes over the edge list:
 *   pass 1: count degrees, find the maximum node id, and detect whether the
 *           input is 0-indexed (offset=1 shifts ids to the internal
 *           1-indexed convention);
 *   pass 2: fill the adjacency lists, skipping self-loops and duplicate
 *           edges (linear scan of the list built so far).
 * Returns TRUE (no failure path is reported to the caller).
 *
 * NOTE(review): in pass 1, Node_Degree is incremented for every pair —
 * including self-loops and pairs that fail the validity test — and is
 * indexed by the *unshifted* ids, while allocate_memory_for_adjacency_list
 * is called with a hard-coded offset of 1. Degrees are therefore upper
 * bounds on the final list sizes; presumably intentional over-allocation,
 * but worth confirming. Also, when offset==1, the largest shifted id is
 * max_node+1 while NB_NODE=max_node — verify against callers that node ids
 * are dense and that this off-by-one cannot occur in practice.
 */
static int _read_graph_from_edge_list(unsigned int* edges, int n, int nb_edges) {
	int i, j, l_node, r_node, nb_edge = 0, max_node = n, offset = 0;
	int node = 1;

	memset(Node_Degree, 0, (MAX_NODE) * sizeof(int));

	/* Pass 1: degrees, max node id, 0-indexed detection. */
	for (j =0; j < 2 * nb_edges; j+=2) {
		l_node = edges[j];
		r_node = edges[j+1];

		if (l_node >= 0 && r_node >= 0 && l_node != r_node) {
			nb_edge++;
			if (l_node > max_node) max_node = l_node;
			if (r_node > max_node) max_node = r_node;

		}
		/* A node id of 0 means the input is 0-indexed; shift by 1 later. */
		if (offset ==0 && (l_node == 0 || r_node == 0)){
			offset = 1;
		}
		/* Counted unconditionally (see NOTE above): over-estimates degree
		 * for invalid/self-loop pairs. */
		Node_Degree[l_node]++;
		Node_Degree[r_node]++;
	}
	NB_NODE = max_node;

	Node_Neighbors = (int **)malloc((NB_NODE + 1) * sizeof(int *));
	allocate_memory_for_adjacency_list(NB_NODE, nb_edge, 1);
	/* Reset degrees; pass 2 recomputes them as lists are filled. */
	memset(Node_Degree, 0, (NB_NODE + 1) * sizeof(int));

	nb_edge = 0;
	/* Pass 2: populate adjacency lists, de-duplicating edges. */
	for (j = 0; j < 2 * nb_edges; j+=2) {
		l_node = edges[j];
		r_node = edges[j+1];
		if (l_node >= 0 && r_node >= 0 && l_node != r_node) {
			if (offset) {
				/* shift 0-indexed input to the internal 1-indexed ids */
				l_node += offset;
				r_node += offset;
			}
			/* Linear duplicate check against the list built so far. */
			for (i = 0; i < Node_Degree[l_node]; i++) {
				if (Node_Neighbors[l_node][i] == r_node)
					break;

			}
			if (i == Node_Degree[l_node]) {
				/* New undirected edge: record it in both lists. */
				Node_Neighbors[l_node][Node_Degree[l_node]] = r_node;
				Node_Neighbors[r_node][Node_Degree[r_node]] = l_node;
				Node_Degree[l_node]++;
				Node_Degree[r_node]++;
				nb_edge++;
			}

		}
	}
	NB_EDGE = nb_edge;
	/* Terminate every list with NONE and track the max-degree node. */
	Max_Degree = 0;
	for (node = 1; node <= NB_NODE; node++) {
		Node_Neighbors[node][Node_Degree[node]] = NONE;
		if (Node_Degree[node] > Max_Degree) {
			Max_Degree = Node_Degree[node];
			Max_Degree_Node = node;
		}
	}
	return TRUE;
}

/* Release every adjacency-list block recorded in BLOCK_LIST by
 * allocate_memory_for_adjacency_list. */
static void free_block() {
	int i = 0;
	for (i = 0; i < BLOCK_COUNT; i++)
		free(BLOCK_LIST[i]);
}

/* Tarjan-style DFS state (dfn/low numbering, explicit stacks) used by the
 * cut-vertex / v-DCC decomposition elsewhere in this file. */
static int *dfn,*low,*TarStack,TarTop,CNT=0,*SonNum,*RecSta,RecTop,*LasSon,*LasNodeIndex;

//After preprocess, following variables might still be useful:
static int *SubGraph_size,NB_DCC=0,*InDcc,NB_cut=0;
static double REDUCE_TIME=0;
//NB_DCC indicates the number of v-DCCs(sub-graphs)
//NB_cut indicates the number of cut-vertexes
//SubGraph_size,0-indexed,"SubGraph_size[i]=j" indicates that there are j nodes in the i-th subgraph;
//Cut[x]=1 indicates that x is a cut-vertex
//InDcc[x]=y indicates that vertex x is in the y-th v-DCC; especially, if Cut[x]==1, InDcc[x]=-1, since a cut-vertex might be involved by several v-DCCs



/* Emit the whole remaining graph as a single subproblem: push every
 * non-deleted node onto the VEC_SUBGRAPHS stack, terminated by NONE. */
static inline void partition_oneproblem(){
#ifdef NOR
//	Branching_Queue=(int *)malloc(sizeof(int)*(NB_NODE+1));
	NB_UNFIXED=NB_NODE;
#endif
	new_stack(VEC_SUBGRAPHS,VEC_INT,int,NB_NODE);
	for(int i=1;i<=NB_NODE;++i){
		if(!deleted(i)){
			push_back(VEC_SUBGRAPHS,int,i);
		}
	}
	push_back(VEC_SUBGRAPHS,int,NONE);
}

/* Flag-based graph reduction: for each surviving node, classify its
 * neighbors via the marked/branched/involved status bits, possibly fix the
 * node and delete the non-branched neighbors, then compact the adjacency
 * lists. (Body continues below.) */
static inline void reduce_graph2(){

	for(int node=1;node<=NB_NODE;node++){
		if(deleted(node))continue;

set_marked_status(node); 390 | for_each_neighbor(node,neighbor){ 391 | set_marked_status(neighbor); 392 | } 393 | 394 | for_each_neighbor(node,neighbor){ 395 | if(fixed(neighbor)){ 396 | set_branched_status(neighbor); 397 | continue; 398 | } 399 | for_each_neighbor(neighbor,neighbor2){ 400 | if(!marked(neighbor2)){ 401 | set_branched_status(neighbor); 402 | break; 403 | } 404 | } 405 | } 406 | 407 | for_each_neighbor(node,neighbor){ 408 | if(branched(neighbor)) 409 | continue; 410 | for_each_neighbor(neighbor,neighbor2){ 411 | if(branched(neighbor2)){ 412 | set_involved_status(neighbor); 413 | break; 414 | } 415 | } 416 | if(!involved(neighbor)){ 417 | fixed(node)=1; 418 | break; 419 | } 420 | } 421 | 422 | if(fixed(node)){ 423 | for_each_neighbor(node,neighbor){ 424 | if(!branched(neighbor)) 425 | deleted(neighbor)=1; 426 | } 427 | } 428 | 429 | clr_marked_status(node); 430 | for_each_neighbor(node,neighbor){ 431 | clr_marked_status(neighbor); 432 | clr_involved_status(neighbor); 433 | clr_branched_status(neighbor); 434 | } 435 | } 436 | 437 | //reduce adjlist 438 | for(int node=1;node<=NB_NODE;node++){ 439 | if(fixed(node)) 440 | NB_FIXED++; 441 | if(deleted(node)) 442 | continue; 443 | 444 | int *ptr=Node_Neighbors[node],count=0; 445 | for_each_neighbor(node,neighbor){ 446 | if(!deleted(neighbor)){ 447 | *ptr++=neighbor;count++; 448 | } 449 | } 450 | *ptr=NONE; 451 | Node_Degree[node]=count; 452 | } 453 | } 454 | 455 | static inline void reduce_graph(){ 456 | int *Que,*Col,*Dis,*InQue,Ql,Qr; 457 | int *Fixed,*Deleted; 458 | Que=(int *)malloc((NB_NODE+1)*sizeof (int)); 459 | Fixed=(int *) malloc((NB_NODE+1)*sizeof(int)); 460 | Deleted=(int *) malloc((NB_NODE + 1) * sizeof(int)); 461 | Col=(int *) malloc((NB_NODE+1)*sizeof (int)); 462 | Dis=(int *) malloc((NB_NODE+1)*sizeof (int)); 463 | InQue=(int *) malloc((NB_NODE+1)*sizeof (int)); 464 | memset(Fixed,0,(NB_NODE+1)*sizeof (int)); 465 | memset(Deleted, 0, (NB_NODE + 1) * sizeof (int)); 466 | 
memset(Col,0,(NB_NODE+1)*sizeof (int)); 467 | memset(InQue,0,(NB_NODE+1)*sizeof (int)); 468 | memset(Dis,0x3f,(NB_NODE+1)*sizeof (int)); 469 | const int INF=0x3f3f3f3f; 470 | for(int i=1;i<=NB_NODE;i++){ 471 | if(Deleted[i])continue; 472 | Ql=0;Qr=0; 473 | Que[Qr++]=i;Dis[i]=0;InQue[i]=1; 474 | int Me,tt,ColCnt=0; 475 | while(Ql!=Qr) { 476 | Me = Que[Ql++]; 477 | if (Dis[Me] == 2)break; 478 | for (int j = 0; j < Node_Degree[Me]; j++) { 479 | tt = Node_Neighbors[Me][j]; 480 | //if(Deleted[tt])continue; 481 | //if(Me==1)printf("**%d\n",tt); 482 | if (Dis[tt] <= 1)continue; 483 | if(Dis[tt]>Dis[Me]+1)Dis[tt] = Dis[Me] + 1; 484 | if(!InQue[tt]){ 485 | Que[Qr++] = tt; 486 | InQue[tt]=1; 487 | } 488 | if (Dis[tt] == 2 && Dis[Me]==1 && Col[Me] == 0) { 489 | Col[Me] = 1; 490 | ColCnt++; 491 | } 492 | } 493 | } 494 | int NeiNum=0; 495 | for(int j=1;j